ICU-7611 delete old copy of Mark's Java unicodetools from before he moved them to the unicode.org repository

X-SVN-Rev: 27924
This commit is contained in:
Markus Scherer 2010-04-09 23:40:43 +00:00
parent baed720ac1
commit a7c0c94a15
167 changed files with 0 additions and 461201 deletions

View File

@ -1,392 +0,0 @@
package com.ibm.text;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.lang.UCharacter;
import java.util.BitSet;
import java.util.Set;
import java.util.HashSet;
import java.util.TreeSet;
import java.util.Iterator;
import java.text.NumberFormat;
import com.ibm.text.utility.FastIntBinarySearch;
public class TestICU4J {
/**
 * Entry point: prints the NFC normalization of "a" followed by U+0308, then
 * runs the set benchmarks for the TITLECASE_LETTER and UNASSIGNED categories.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    // The converted string is never read; the call is kept so it is still executed.
    String supplementary = UTF16.valueOf(0x10000);
    String composed = Normalizer.normalize("a\u0308", Normalizer.NFC);
    System.out.println(composed);

    /* Disabled spot checks:
    System.out.println(UCharacter.getType(0x10FFFF));
    System.out.println(UCharacter.getName(0x61));
    */

    // The small category gets many iterations, the huge one a single pass.
    testUnicodeSetSpeed(Character.TITLECASE_LETTER, 100);
    testUnicodeSetSpeed(Character.UNASSIGNED, 1);
}
/** When true, the contains benchmarks print every code point whose answer disagrees with the reference BitSet. */
static final boolean SHOW_ERRORS = false;
/** When true, optimizedAdd coalesces consecutive code points into range adds instead of adding one at a time. */
static boolean OPTIMIZATION = true;
/**
 * Benchmarks add, contains and iteration on a TreeSet of Integer versus a UnicodeSet
 * (plus a FastIntBinarySearch lookup), using the code points whose general category
 * equals {@code prop} as test data. All results are printed to System.out; each
 * timing is the elapsed milliseconds divided by {@code ITERATIONS}.
 *
 * @param prop general category value, compared against UCharacter.getType(cp)
 * @param ITERATIONS number of times each timed loop is repeated
 */
static void testUnicodeSetSpeed(int prop, int ITERATIONS) {
    NumberFormat numb = NumberFormat.getNumberInstance();
    NumberFormat percent = NumberFormat.getPercentInstance();
    double start, delta, oldDelta;
    int temp = 0; // accumulates iterated values so the iteration loops do real work; never read
    Set s;
    UnicodeSet us;
    Iterator it;
    UnicodeSetIterator uit;
    BitSet bs = new BitSet();

    System.out.println();
    System.out.println("Getting characters for property " + prop);
    int total = 0;
    // Inclusive bound: U+10FFFF is the last code point and must be examined like
    // every other loop in this method does.
    for (int cp = 0; cp <= 0x10FFFF; ++cp) {
        if (UCharacter.getType(cp) == prop) {
            bs.set(cp);
            ++total;
        }
    }
    System.out.println("Total characters: " + numb.format(total));
    System.out.println("Loop Iterations: " + numb.format(ITERATIONS));
    System.out.println();

    System.out.println("Testing Add speed");
    s = new TreeSet();
    start = System.currentTimeMillis();
    for (int i = 0; i < ITERATIONS; ++i) {
        s.clear();
        for (int cp = 0; cp <= 0x10FFFF; ++cp) {
            if (bs.get(cp)) {
                s.add(new Integer(cp));
            }
        }
    }
    // start is a double, so this is floating-point division.
    oldDelta = delta = (System.currentTimeMillis() - start)/ITERATIONS;
    System.out.println("Set add time: " + numb.format(delta));
    System.out.println("Total characters: " + numb.format(s.size()));

    us = new UnicodeSet();
    start = System.currentTimeMillis();
    for (int i = 0; i < ITERATIONS; ++i) {
        us.clear();
        for (int cp = 0; cp <= 0x10FFFF; ++cp) {
            if (bs.get(cp)) {
                optimizedAdd(us,cp);
            }
        }
    }
    // Flush the range that optimizedAdd is still holding back.
    optimizedDone(us);
    delta = (System.currentTimeMillis() - start)/ITERATIONS;
    System.out.println("UnicodeSet add time: " + numb.format(delta) + ", " + percent.format(delta/oldDelta));
    System.out.println("Total characters: " + numb.format(us.size()) + ", ranges: " + us.getRangeCount());
    System.out.println();

    System.out.println("Testing Contains speed");
    start = System.currentTimeMillis();
    for (int i = 0; i < ITERATIONS; ++i) {
        for (int cp = 0; cp <= 0x10FFFF; ++cp) {
            if (s.contains(new Integer(cp)) != bs.get(cp)) {
                if (SHOW_ERRORS) System.out.println("Error at: " + info(cp));
            }
        }
    }
    oldDelta = delta = (System.currentTimeMillis() - start)/ITERATIONS;
    System.out.println("Set contains time: " + numb.format(delta));

    start = System.currentTimeMillis();
    for (int i = 0; i < ITERATIONS; ++i) {
        for (int cp = 0; cp <= 0x10FFFF; ++cp) {
            if (us.contains(cp) != bs.get(cp)) {
                if (SHOW_ERRORS) System.out.println("Error at: " + info(cp));
            }
        }
    }
    delta = (System.currentTimeMillis() - start)/ITERATIONS;
    System.out.println("UnicodeSet contains time: " + numb.format(delta) + ", " + percent.format(delta/oldDelta));

    setupBinary(us);
    start = System.currentTimeMillis();
    for (int i = 0; i < ITERATIONS; ++i) {
        for (int cp = 0; cp <= 0x10FFFF; ++cp) {
            if (binaryContains(cp) != bs.get(cp)) {
                if (SHOW_ERRORS) System.out.println("Error at: " + info(cp));
            }
        }
    }
    delta = (System.currentTimeMillis() - start)/ITERATIONS;
    System.out.println("BINARY UnicodeSet contains time: " + numb.format(delta) + ", " + percent.format(delta/oldDelta));

    System.out.println("Testing Iteration speed");
    start = System.currentTimeMillis();
    for (int i = 0; i < ITERATIONS; ++i) {
        it = s.iterator();
        while (it.hasNext()) {
            temp += ((Integer)it.next()).intValue();
        }
    }
    oldDelta = delta = (System.currentTimeMillis() - start)/ITERATIONS;
    System.out.println("Set iteration time: " + numb.format(delta));

    uit = new UnicodeSetIterator(us);
    start = System.currentTimeMillis();
    for (int i = 0; i < ITERATIONS; ++i) {
        uit.reset();
        while (uit.next()) {
            temp += uit.codepoint;
        }
    }
    delta = (System.currentTimeMillis() - start)/ITERATIONS;
    System.out.println("UnicodeSet iteration time: " + numb.format(delta) + ", " + percent.format(delta/oldDelta));

    // Untimed: dump every range of the UnicodeSet.
    uit.reset();
    while (uit.nextRange()) {
        System.out.println(info(uit.codepoint, uit.codepointEnd));
    }
}
/** Search structure built by setupBinary and queried by binaryContains. */
static FastIntBinarySearch fibs;
/**
 * Flattens the ranges of {@code us} into an int array of boundary pairs
 * (range start, range end + 1) and stores a FastIntBinarySearch over it in fibs.
 *
 * @param us the set whose ranges are captured
 */
static void setupBinary(UnicodeSet us) {
    final int[] boundaries = new int[2 * us.getRangeCount()];
    int next = 0;
    for (UnicodeSetIterator ranges = new UnicodeSetIterator(us); ranges.nextRange(); ) {
        boundaries[next] = ranges.codepoint;
        // Store an exclusive upper bound so each range contributes exactly two entries.
        boundaries[next + 1] = ranges.codepointEnd + 1;
        next += 2;
    }
    fibs = new FastIntBinarySearch(boundaries);
}
/**
 * Membership test against the table prepared by setupBinary.
 *
 * @param cp code point to look up
 * @return true when the index reported by fibs.findIndex(cp) is odd
 */
static boolean binaryContains(int cp) {
    final int index = fibs.findIndex(cp);
    // Lowest bit set means an odd index.
    return (index & 1) == 1;
}
/**
 * Describes a code point as its upper-case hex value, a space, and the name
 * returned by UCharacter.getName.
 *
 * @param cp code point to describe
 */
static String info(int cp) {
    final String hex = Integer.toString(cp, 16).toUpperCase();
    final String name = UCharacter.getName(cp);
    return hex + " " + name;
}
/**
 * Describes a range of code points. A single-element range is rendered as
 * "HEX name"; otherwise as "HEX..HEX name..name" using both end points.
 *
 * @param cpStart first code point of the range
 * @param cpEnd last code point of the range
 */
static String info(int cpStart, int cpEnd) {
    final String startHex = Integer.toString(cpStart, 16).toUpperCase();
    if (cpStart == cpEnd) {
        return startHex + " " + UCharacter.getName(cpStart);
    }
    final String endHex = Integer.toString(cpEnd, 16).toUpperCase();
    return startHex + ".." + endHex
        + " " + UCharacter.getName(cpStart) + ".." + UCharacter.getName(cpEnd);
}
/** First code point of the range currently being accumulated by optimizedAdd. */
static int first;
/** One past the last accumulated code point; -2 marks "no pending range". */
static int limit = -2;
/**
 * Adds {@code cp} to {@code us}. With OPTIMIZATION on, consecutive code points are
 * buffered in first/limit and written as one range when a non-consecutive code
 * point arrives; the last range stays pending until optimizedDone is called.
 *
 * @param us set receiving the code points
 * @param cp code point to add
 */
static void optimizedAdd(UnicodeSet us, int cp) {
    if (OPTIMIZATION) {
        if (cp != limit) {
            // Gap: write out whatever was buffered, then start a new range at cp.
            if (limit > 0) {
                us.add(first, limit - 1);
                // System.out.println(info(first, limit-1));
            }
            first = cp;
            limit = cp + 1;
        } else {
            // cp extends the buffered range by one.
            limit++;
        }
    } else {
        us.add(cp);
    }
}
/**
 * Writes the range still buffered by optimizedAdd (if any) to {@code us} and
 * resets the buffer. Does nothing when OPTIMIZATION is off.
 *
 * @param us set receiving the pending range
 */
static void optimizedDone(UnicodeSet us) {
    if (OPTIMIZATION) {
        if (limit > 0) {
            us.add(first, limit - 1);
            //System.out.println(info(first, limit-1));
        }
        // Back to the "no pending range" sentinel.
        limit = -2;
    }
}
/**
 * Sketch of a lookup API for the properties listed in
 * http://www.unicode.org/Public/UNIDATA/PropertyAliases.txt
 * and their values listed in
 * http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
 * <p>
 * Every method is a placeholder: lookups return false, 0 or "", and the
 * index-based getSet overloads do nothing. The usage samples are written against
 * UCharacter, presumably the intended home of this API.
 */
public static class UXCharacter {

    /**
     * Tests whether the named property has the named value for a code point.
     * Placeholder: always returns false.
     *
     * Sample: the following are meant to be equivalent
     * <pre>
     * if (UCharacter.test("LB", "AL", cp)) ...
     * if (UCharacter.test("line break", "alphabetic", cp)) ...
     * </pre>
     *
     * @param propertyName name or alias of the property
     * @param propertyValue name or alias of the value
     * @param codePoint the code point to test
     */
    public static boolean test(String propertyName, String propertyValue, int codePoint) {
        return false;
    }

    /**
     * Fills a UnicodeSet with the code points that have the given value for the
     * given property. The set is cleared first, then every code point from 0 to
     * 0x10FFFF for which test(propertyName, propertyValue, cp) is true is added.
     *
     * @param propertyName name or alias of the property
     * @param propertyValue name or alias of the value
     * @param set receives the result; its previous contents are discarded
     */
    public static void getSet(String propertyName, String propertyValue, UnicodeSet set) {
        // Reference implementation by brute force; a real one would be far faster.
        set.clear();
        int cp = 0;
        while (cp <= 0x10FFFF) {
            if (test(propertyName, propertyValue, cp)) {
                set.add(cp);
            }
            ++cp;
        }
    }

    // ======================================================
    // POSSIBLE ADDITIONAL UTILITIES FOR CONVENIENCE OR SPEED
    // ======================================================

    /**
     * Tests whether a boolean property is true for a code point; shorthand for
     * test(booleanPropertyName, "true", codePoint).
     *
     * Sample: the following are meant to be equivalent
     * <pre>
     * if (UCharacter.test("WSpace", cp)) ...
     * if (UCharacter.test("White_Space", cp)) ...
     * if (UCharacter.test("White_Space", "true", cp)) ...
     * if (!UCharacter.test("White_Space", "false", cp)) ...
     * </pre>
     *
     * @param booleanPropertyName name or alias of the boolean property
     * @param codePoint the code point to test
     * @return the result of the three-argument test with value "true"
     */
    public static boolean test(String booleanPropertyName, int codePoint) {
        return test(booleanPropertyName, "true", codePoint);
    }

    // ===============================================
    // The following allow access to properties by number, saving a string lookup
    // on each call.
    // ===============================================

    /**
     * Gets an index for higher-speed access to a property.
     * Placeholder: always returns 0.
     *
     * Sample:
     * <pre>
     * int prop = UCharacter.getPropertyIndex("LB");
     * int value = UCharacter.getValueIndex("LB", "AL");
     * while (true) {
     *   ...
     *   if (test(prop, value, codePoint)) ...
     * </pre>
     *
     * @param propertyName name or alias of the property
     */
    public static int getPropertyIndex(String propertyName) {
        return 0;
    }

    /**
     * Gets the maximum property index, for iterating through properties.
     * Placeholder: always returns 0.
     */
    public static int getMaxPropertyIndex() {
        return 0;
    }

    // NAME_STYLE selectors accepted by getPropertyName and getValueName.
    static final byte SHORT = 0;
    static final byte DEFAULT = 1;
    static final byte LONG = 2;

    /**
     * Gets the name of a property in the requested name style.
     * Placeholder: always returns the empty string.
     *
     * @param propertyIndex index as returned by getPropertyIndex
     * @param namestyle one of SHORT, DEFAULT, LONG
     */
    public static String getPropertyName(int propertyIndex, byte namestyle) {
        return "";
    }

    /**
     * Index-based variant of test: checks whether the indexed property has the
     * named value for a code point. Placeholder: always returns false.
     */
    public static boolean test(int propertyIndex, String propertyValue, int codePoint) {
        return false;
    }

    /**
     * Index-based variant of getSet for a named value.
     * Placeholder: leaves the set untouched.
     */
    public static void getSet(int propertyIndex, String propertyValue, UnicodeSet set) {
    }

    // ===============================================
    // The following allow access to enumerated property values by number,
    // saving a string lookup on each call.
    // They are only valid for enumerated properties
    // including the combining character class (0..255).
    // ===============================================

    /**
     * Gets an index for higher-speed access to a property value.
     * Only valid for enumerated properties. Placeholder: always returns 0.
     */
    public static int getValueIndex(String propertyName, String propertyValue) {
        return 0;
    }

    /**
     * Gets the maximum value index for a property, for iterating through its values.
     * Only valid for enumerated properties. Placeholder: always returns 0.
     */
    public static int getMaxValueIndex(int propertyIndex) {
        return 0;
    }

    /**
     * Gets the name of a property value in the requested name style.
     * Placeholder: always returns the empty string.
     *
     * @param propertyIndex index as returned by getPropertyIndex
     * @param valueIndex index as returned by getValueIndex
     * @param namestyle one of SHORT, DEFAULT, LONG
     */
    public static String getValueName(int propertyIndex, int valueIndex, byte namestyle) {
        return "";
    }

    /**
     * Fully index-based variant of test. Placeholder: always returns false.
     */
    public static boolean test(int propertyIndex, int valueIndex, int codePoint) {
        return false;
    }

    /**
     * Fully index-based variant of getSet. Placeholder: leaves the set untouched.
     */
    public static void getSet(int propertyIndex, int valueIndex, UnicodeSet set) {
    }

    /* OPEN ISSUES:
    - Don't like the names of the functions. Any better options? test => hasValue? hasPropertyValue?
    - Should getSet really ADD to the set (avoiding the clear?) and be called addProperties?
    Maybe faster sometimes, but might also be more errorprone.
    */
}
}

View File

@ -1,66 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/AbbreviatedUnicodeSetIterator.java,v $
* $Date: 2004/02/06 18:32:04 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCA;
import java.util.*;
import java.io.BufferedReader;
import java.io.Reader;
import java.io.PrintWriter;
import java.io.FileReader;
import java.text.MessageFormat;
import java.io.IOException;
import com.ibm.text.UCD.Normalizer;
import com.ibm.text.UCD.UCD;
import com.ibm.text.utility.*;
import com.ibm.text.UCD.UnifiedBinaryProperty;
import com.ibm.text.UCD.UCDProperty;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
/**
 * A UnicodeSetIterator that can optionally cut every range short, so that a large
 * set can be sampled instead of enumerated in full. When abbreviation is on, each
 * loaded range ends at most {@code density / rangeCount} elements after its start.
 */
public class AbbreviatedUnicodeSetIterator extends UnicodeSetIterator {
    /** Whether loadRange truncates the ranges it loads. */
    private boolean abbreviated;
    /** Largest allowed distance between a range's first and last visited element. */
    private int perRange;

    /** Creates an iterator with abbreviation switched off. */
    public AbbreviatedUnicodeSetIterator() {
        super();
        abbreviated = false;
    }

    /**
     * Resets to iterate over {@code newSet} without abbreviation.
     *
     * @param newSet the set to iterate over
     */
    public void reset(UnicodeSet newSet) {
        reset(newSet, false);
    }

    /**
     * Resets to iterate over {@code newSet} with a density of 100.
     *
     * @param newSet the set to iterate over
     * @param abb whether ranges are cut short
     */
    public void reset(UnicodeSet newSet, boolean abb) {
        reset(newSet, abb, 100);
    }

    /**
     * Resets to iterate over {@code newSet}.
     *
     * @param newSet the set to iterate over
     * @param abb whether ranges are cut short
     * @param density budget that is divided (integer division) by the number of
     *        ranges in {@code newSet} to get the per-range limit; an empty set
     *        yields a limit of 0
     */
    public void reset(UnicodeSet newSet, boolean abb, int density) {
        // Configure truncation BEFORE delegating: the superclass reset loads the
        // first range through loadRange, which must already see the new settings.
        // Setting them afterwards left the first range governed by stale values.
        abbreviated = abb;
        int ranges = newSet.getRangeCount();
        perRange = (ranges != 0) ? density / ranges : 0;
        super.reset(newSet);
    }

    /**
     * Loads a range as the superclass does, then, when abbreviating, pulls the
     * end of the range in to at most perRange elements past its start.
     *
     * @param myRange index of the range to load
     */
    protected void loadRange(int myRange) {
        super.loadRange(myRange);
        if (abbreviated && (endElement > nextElement + perRange)) {
            endElement = nextElement + perRange;
        }
    }
}

View File

@ -1,256 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/CEList.java,v $
* $Date: 2002/07/03 02:15:47 $
* $Revision: 1.6 $
*
*******************************************************************************
*/
package com.ibm.text.UCA;
import com.ibm.text.UCD.*;
import com.ibm.text.utility.*;
public final class CEList implements java.lang.Comparable, UCD_Types {
/** Backing array; lists created by sub/start/end share it with their source list. */
int[] contents;
/** Index in contents of the first element of this list. */
int startOffset;
/** Index in contents one past the last element of this list. */
int endOffset;
/** Number of elements; set to end - start by every constructor. */
int count;

/**
 * Creates a list holding a private copy of source[start] .. source[end - 1].
 *
 * @param source array to copy from
 * @param start index of the first element to copy
 * @param end index one past the last element to copy
 */
public CEList (int[] source, int start, int end) {
    final int length = end - start;
    final int[] copy = new int[length];
    System.arraycopy(source, start, copy, 0, length);
    contents = copy;
    count = length;
    // The copy starts at index 0, whatever the offsets in source were.
    startOffset = 0;
    endOffset = length;
}

/**
 * Creates a list holding a private copy of the whole array.
 *
 * @param source array to copy
 */
public CEList(int[] source) {
    this(source, 0, source.length);
}

/**
 * Creates a view onto source[start] .. source[end - 1] WITHOUT copying.
 *
 * @param source array to share
 * @param start index of the first element
 * @param end index one past the last element
 * @param spare unused; only distinguishes this signature from the copying constructor
 */
private CEList(int[] source, int start, int end, boolean spare) {
    contents = source;
    count = end - start;
    startOffset = start;
    endOffset = end;
}
/**
 * Returns a new list made of this list's elements followed by those of {@code that}.
 * Neither operand is modified; the result has its own backing array.
 *
 * @param that list to append
 */
public CEList append(CEList that) {
    final int total = count + that.count;
    final int[] merged = new int[total];
    System.arraycopy(contents, startOffset, merged, 0, count);
    System.arraycopy(that.contents, that.startOffset, merged, count, that.count);
    // Hand the fresh array over without another copy.
    return new CEList(merged, 0, total, true);
}
/**
 * Returns the elements from position start (inclusive) to end (exclusive) of this
 * list as a new list that shares this list's backing array. Positions are relative
 * to this list, so slicing a list that is itself a slice works as expected.
 * The bounds are not validated.
 *
 * @param start position of the first element, 0 being the first element of this list
 * @param end position one past the last element
 */
public CEList sub(int start, int end) {
    // Translate list-relative positions into backing-array indices.
    return new CEList(contents, startOffset + start, startOffset + end, true);
}
/**
 * Returns the first {@code end} elements of this list as a new list that shares
 * this list's backing array. The bound is not validated.
 *
 * @param end number of leading elements to keep
 */
public CEList start(int end) {
    // Begin at this list's own first element, which is not index 0 for a slice.
    return new CEList(contents, startOffset, startOffset + end, true);
}
/**
 * Returns the elements of this list from position {@code start} to its end as a
 * new list that shares this list's backing array. The bound is not validated.
 *
 * @param start position of the first element to keep, 0 being the first element of this list
 */
public CEList end(int start) {
    // Stop at this list's own end, which may lie before the end of the backing array.
    return new CEList(contents, startOffset + start, endOffset, true);
}
/**
 * Returns the number of elements in this list (the count field).
 */
public int length() {
return count;
}
/**
 * Returns the element at position {@code i} of this list, where position 0 is the
 * list's first element regardless of where it sits in the backing array.
 *
 * @param i position within this list
 * @return the element at that position
 * @throws ArrayIndexOutOfBoundsException if i is negative or not less than length()
 */
public int at(int i) {
    if (i < 0 || i >= count) throw new ArrayIndexOutOfBoundsException(i);
    // Translate the list-relative position to a backing-array index. The previous
    // code subtracted startOffset instead, which broke every list with a non-zero start.
    return contents[startOffset + i];
}
/**
 * Computes a hash from the element count and every element of this list:
 * starting with count, each element is folded in as hash * 37 + element.
 */
public int hashCode() {
    int hash = count;
    int index = startOffset;
    while (index < endOffset) {
        hash = hash * 37 + contents[index];
        index++;
    }
    return hash;
}
/**
 * Compares this list with another object. Two lists are equal when they have the
 * same number of elements and the same element at every position; where the
 * elements sit in their backing arrays does not matter.
 *
 * @param other object to compare with; null or a non-CEList yields false
 */
public boolean equals(Object other) {
    if (this == other) return true;
    // Explicit type check (also rejects null) instead of catching the failed cast.
    if (!(other instanceof CEList)) return false;
    CEList that = (CEList)other;
    if (count != that.count) return false;
    // Distance between the two lists' positions in their backing arrays.
    int delta = that.startOffset - startOffset;
    for (int i = startOffset; i < endOffset; ++i) {
        if (contents[i] != that.contents[i + delta]) return false;
    }
    return true;
}
/**
 * Orders lists element by element, treating each element as an unsigned 32-bit
 * value; when one list is a prefix of the other, the shorter list comes first.
 * If a RuntimeException occurs during the comparison, both lists and their
 * offsets are printed to System.out before the exception is rethrown.
 *
 * @param other the CEList to compare with
 * @return -1, 0 or 1
 */
public int compareTo(Object other) {
    CEList that = (CEList)other;
    try {
        // Distance between the two lists' positions in their backing arrays.
        final int shift = that.startOffset - startOffset;
        // Walk only the positions both lists have.
        final int stop = Math.min(endOffset, that.endOffset - shift);
        for (int index = startOffset; index < stop; ++index) {
            final int mine = contents[index];
            final int theirs = that.contents[index + shift];
            if (mine == theirs) continue;
            // Mask to long so the comparison is unsigned.
            return ((mine & 0xFFFFFFFFL) < (theirs & 0xFFFFFFFFL)) ? -1 : 1;
        }
        if (count != that.count) {
            return (count < that.count) ? -1 : 1;
        }
        return 0;
    } catch (RuntimeException e) {
        System.out.println("This: " + this + ", that: " + other);
        System.out.println(startOffset + ", " + endOffset
            + ", " + count + ", " + contents.length);
        System.out.println(that.startOffset + ", " + that.endOffset
            + ", " + that.count + ", " + that.contents.length);
        throw e;
    }
}
/**
 * Maps a tertiary weight to the value used for a given decomposition type.
 * For non-canonical types the incoming weight is first replaced by 0xE for code
 * points 0x3041..0x3094 (hiragana) and 0x11 for 0x30A1..0x30FA (katakana); the
 * decomposition type then selects the result. Types not listed in the switch
 * return the (possibly kana-adjusted) weight unchanged.
 *
 * @param ch the code point being remapped
 * @param type decomposition type constant
 * @param t the incoming tertiary weight
 * @return the remapped weight, narrowed to byte
 */
public static byte remap(int ch, byte type, int t) {
    int base = t;
    if (type != CANONICAL) {
        if (ch >= 0x3041 && ch <= 0x3094) {
            base = 0xE; // hiragana
        } else if (ch >= 0x30A1 && ch <= 0x30FA) {
            base = 0x11; // katakana
        }
    }
    // Weight 8 selects the alternate value in several cases below.
    final boolean isEight = (base == 8);
    final int remapped;
    switch (type) {
        case COMPATIBILITY:   remapped = isEight ? 0xA : 4; break;
        case COMPAT_FONT:     remapped = isEight ? 0xB : 5; break;
        case COMPAT_NOBREAK:  remapped = 0x1B; break;
        case COMPAT_INITIAL:  remapped = 0x17; break;
        case COMPAT_MEDIAL:   remapped = 0x18; break;
        case COMPAT_FINAL:    remapped = 0x19; break;
        case COMPAT_ISOLATED: remapped = 0x1A; break;
        case COMPAT_CIRCLE:
            if (base == 0x11) {
                remapped = 0x13;
            } else {
                remapped = isEight ? 0xC : 6;
            }
            break;
        case COMPAT_SUPER:    remapped = 0x14; break;
        case COMPAT_SUB:      remapped = 0x15; break;
        case COMPAT_VERTICAL: remapped = 0x16; break;
        case COMPAT_WIDE:     remapped = isEight ? 9 : 3; break;
        case COMPAT_NARROW:   remapped = (ch >= 0xFF67 && ch <= 0xFF6F) ? 0x10 : 0x12; break;
        case COMPAT_SMALL:    remapped = (base == 0xE) ? 0xE : 0xF; break;
        case COMPAT_SQUARE:   remapped = isEight ? 0x1D : 0x1C; break;
        case COMPAT_FRACTION: remapped = 0x1E; break;
        default:              remapped = base; break;
    }
    return (byte) remapped;
}
/**
 * Formats every element with toString(int), separated by single spaces.
 * An empty list is rendered as the formatting of the value 0.
 */
public String toString() {
    if (startOffset >= endOffset) return toString(0);
    StringBuffer text = new StringBuffer();
    String separator = "";
    for (int index = startOffset; index < endOffset; ++index) {
        text.append(separator);
        text.append(toString(contents[index]));
        separator = " ";
    }
    return text.toString();
}
/**
 * Formats the first {@code len} entries of an array with toString(int), separated
 * by single spaces. A non-positive length is rendered as the formatting of 0.
 *
 * @param ces array of collation elements
 * @param len number of leading entries to format
 */
public static String toString(int[] ces, int len) {
    if (len <= 0) return toString(0);
    StringBuffer text = new StringBuffer();
    String separator = "";
    for (int index = 0; index < len; ++index) {
        text.append(separator);
        text.append(toString(ces[index]));
        separator = " ";
    }
    return text.toString();
}
/**
 * Formats every entry of an IntStack, separated by single spaces.
 * A stack whose length is not positive is rendered as the formatting of 0.
 *
 * @param ces stack of collation elements
 */
public static String toString(IntStack ces) {
    if (ces.length() <= 0) return toString(0);
    StringBuffer text = new StringBuffer();
    String separator = "";
    // length() is re-read on every pass, exactly as before.
    for (int index = 0; index < ces.length(); ++index) {
        text.append(separator);
        text.append(toString(ces.get(index)));
        separator = " ";
    }
    return text.toString();
}
/**
 * Formats one collation element as "[primary.secondary.tertiary]", each weight
 * rendered through Utility.hex.
 *
 * @param ce the collation element
 */
public static String toString(int ce) {
    StringBuffer text = new StringBuffer();
    text.append("[").append(Utility.hex(UCA.getPrimary(ce)));
    text.append(".").append(Utility.hex(UCA.getSecondary(ce)));
    text.append(".").append(Utility.hex(UCA.getTertiary(ce)));
    text.append("]");
    // Disabled: + "(" + NAME3[UCA.getTertiary(ce)] + ")"
    return text.toString();
}
/**
 * Short labels for tertiary weights, indexed by weight value (0x00..0x1F).
 * The indices line up with the values assigned in remap. Within this class the
 * table is only referenced from commented-out code in toString(int).
 */
static final String[] NAME3 = {
"IGNORE", // 0x00
"BLK", // 0x01 Unused?
"MIN", // 0x02
"WIDE", // 0x03
"COMPAT", // 0x04
"FONT", // 0x05
"CIRCLE", // 0x06
"RES-2", // 0x07
"CAP", // 0x08
"WIDECAP", // 0x09
"COMPATCAP", // 0x0A
"FONTCAP", // 0x0B
"CIRCLECAP", // 0x0C
"HIRA-SMALL", // 0x0D
"HIRA", // 0x0E
"SMALL", // 0x0F
"SMALL-NARROW", // 0x10
"KATA", // 0x11
"NARROW", // 0x12
"CIRCLE-KATA", // 0x13
"SUP-MNN", // 0x14
"SUB-MNS", // 0x15
"VERT", // 0x16 Missing??
"AINI", // 0x17
"AMED", // 0x18
"AFIN", // 0x19
"AISO", // 0x1A
"NOBREAK", // 0x1B Missing?
"SQUARED", // 0x1C
"SQUAREDCAP", // 0x1D
"FRACTION", // 0x1E
"MAX" // 0x1F
};
// testing
/**
 * Ad-hoc smoke test: prints the results of a handful of compareTo, slicing,
 * equals and hashCode calls to System.out.
 *
 * @param args ignored
 */
public static void main(String args[]) throws Exception {
    /* Reproduces a comparison that once failed with this diagnostic output:
    This: [0241.0020.0004], that: [0F6B.0020.0002]
    1, 2, 1, 2
    0, 1, 1, 1
    */
    CEList offsetList = new CEList(new int[] {0, 0x02412004});
    offsetList = offsetList.sub(1, 2);
    CEList single = new CEList(new int[] {0x0F6B2002});
    System.out.println(offsetList.compareTo(single));

    // Non-empty versus empty.
    CEList five = new CEList(new int[] {0, 1, 2, 3, 4});
    CEList empty = new CEList(new int[0]);
    int order = five.compareTo(empty);
    System.out.println(order);

    // Slicing.
    System.out.println(five);
    System.out.println(five.start(2));
    System.out.println(five.end(1));

    // A slice must behave like a freshly built list with the same elements.
    CEList pair = new CEList(new int[] {2, 3});
    CEList slice = five.sub(2, 4);
    System.out.println(pair.equals(slice));
    System.out.println(pair.compareTo(slice));
    System.out.println(pair.compareTo(five));
    System.out.println(pair.hashCode() == slice.hashCode());
}
}

View File

@ -1,826 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Case.java,v $
* $Date: 2001/08/31 00:20:40 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCA;
public final class Case {
static StringBuffer out = new StringBuffer();
/**
 * Case-folds a single char by delegating to fold(String).
 *
 * @param c the char to fold
 * @return the folded form, which may be longer than one char
 */
static String fold(char c) {
    final String single = String.valueOf(c);
    return fold(single);
}
/**
 * Case-folds a string one char at a time: each char that has an entry in the CF
 * table is replaced by that entry, every other char is copied unchanged.
 * The result is assembled in the shared buffer {@code out}, guarded by that
 * buffer's own monitor.
 *
 * @param in the string to fold
 * @return the folded string
 */
static String fold(String in) {
    synchronized (out) {
        out.setLength(0);
        final int length = in.length();
        for (int index = 0; index < length; index++) {
            final char ch = in.charAt(index);
            final String folded = CF[ch];
            if (folded != null) {
                out.append(folded);
            } else {
                out.append(ch);
            }
        }
        return out.toString();
    }
}
static String[] CF = new String[65536];
static {
CF[0x0041]="\u0061";
CF[0x0042]="\u0062";
CF[0x0043]="\u0063";
CF[0x0044]="\u0064";
CF[0x0045]="\u0065";
CF[0x0046]="\u0066";
CF[0x0047]="\u0067";
CF[0x0048]="\u0068";
CF[0x0049]="\u0069";
CF[0x004A]="\u006A";
CF[0x004B]="\u006B";
CF[0x004C]="\u006C";
CF[0x004D]="\u006D";
CF[0x004E]="\u006E";
CF[0x004F]="\u006F";
CF[0x0050]="\u0070";
CF[0x0051]="\u0071";
CF[0x0052]="\u0072";
CF[0x0053]="\u0073";
CF[0x0054]="\u0074";
CF[0x0055]="\u0075";
CF[0x0056]="\u0076";
CF[0x0057]="\u0077";
CF[0x0058]="\u0078";
CF[0x0059]="\u0079";
CF[0x005A]="\u007A";
CF[0x00B5]="\u03BC";
CF[0x00C0]="\u00E0";
CF[0x00C1]="\u00E1";
CF[0x00C2]="\u00E2";
CF[0x00C3]="\u00E3";
CF[0x00C4]="\u00E4";
CF[0x00C5]="\u00E5";
CF[0x00C6]="\u00E6";
CF[0x00C7]="\u00E7";
CF[0x00C8]="\u00E8";
CF[0x00C9]="\u00E9";
CF[0x00CA]="\u00EA";
CF[0x00CB]="\u00EB";
CF[0x00CC]="\u00EC";
CF[0x00CD]="\u00ED";
CF[0x00CE]="\u00EE";
CF[0x00CF]="\u00EF";
CF[0x00D0]="\u00F0";
CF[0x00D1]="\u00F1";
CF[0x00D2]="\u00F2";
CF[0x00D3]="\u00F3";
CF[0x00D4]="\u00F4";
CF[0x00D5]="\u00F5";
CF[0x00D6]="\u00F6";
CF[0x00D8]="\u00F8";
CF[0x00D9]="\u00F9";
CF[0x00DA]="\u00FA";
CF[0x00DB]="\u00FB";
CF[0x00DC]="\u00FC";
CF[0x00DD]="\u00FD";
CF[0x00DE]="\u00FE";
CF[0x00DF]="\u0073\u0073";
CF[0x0100]="\u0101";
CF[0x0102]="\u0103";
CF[0x0104]="\u0105";
CF[0x0106]="\u0107";
CF[0x0108]="\u0109";
CF[0x010A]="\u010B";
CF[0x010C]="\u010D";
CF[0x010E]="\u010F";
CF[0x0110]="\u0111";
CF[0x0112]="\u0113";
CF[0x0114]="\u0115";
CF[0x0116]="\u0117";
CF[0x0118]="\u0119";
CF[0x011A]="\u011B";
CF[0x011C]="\u011D";
CF[0x011E]="\u011F";
CF[0x0120]="\u0121";
CF[0x0122]="\u0123";
CF[0x0124]="\u0125";
CF[0x0126]="\u0127";
CF[0x0128]="\u0129";
CF[0x012A]="\u012B";
CF[0x012C]="\u012D";
CF[0x012E]="\u012F";
CF[0x0130]="\u0069";
CF[0x0131]="\u0069";
CF[0x0132]="\u0133";
CF[0x0134]="\u0135";
CF[0x0136]="\u0137";
CF[0x0139]="\u013A";
CF[0x013B]="\u013C";
CF[0x013D]="\u013E";
CF[0x013F]="\u0140";
CF[0x0141]="\u0142";
CF[0x0143]="\u0144";
CF[0x0145]="\u0146";
CF[0x0147]="\u0148";
CF[0x0149]="\u02BC\u006E";
CF[0x014A]="\u014B";
CF[0x014C]="\u014D";
CF[0x014E]="\u014F";
CF[0x0150]="\u0151";
CF[0x0152]="\u0153";
CF[0x0154]="\u0155";
CF[0x0156]="\u0157";
CF[0x0158]="\u0159";
CF[0x015A]="\u015B";
CF[0x015C]="\u015D";
CF[0x015E]="\u015F";
CF[0x0160]="\u0161";
CF[0x0162]="\u0163";
CF[0x0164]="\u0165";
CF[0x0166]="\u0167";
CF[0x0168]="\u0169";
CF[0x016A]="\u016B";
CF[0x016C]="\u016D";
CF[0x016E]="\u016F";
CF[0x0170]="\u0171";
CF[0x0172]="\u0173";
CF[0x0174]="\u0175";
CF[0x0176]="\u0177";
CF[0x0178]="\u00FF";
CF[0x0179]="\u017A";
CF[0x017B]="\u017C";
CF[0x017D]="\u017E";
CF[0x017F]="\u0073";
CF[0x0181]="\u0253";
CF[0x0182]="\u0183";
CF[0x0184]="\u0185";
CF[0x0186]="\u0254";
CF[0x0187]="\u0188";
CF[0x0189]="\u0256";
CF[0x018A]="\u0257";
CF[0x018B]="\u018C";
CF[0x018E]="\u01DD";
CF[0x018F]="\u0259";
CF[0x0190]="\u025B";
CF[0x0191]="\u0192";
CF[0x0193]="\u0260";
CF[0x0194]="\u0263";
CF[0x0196]="\u0269";
CF[0x0197]="\u0268";
CF[0x0198]="\u0199";
CF[0x019C]="\u026F";
CF[0x019D]="\u0272";
CF[0x019F]="\u0275";
CF[0x01A0]="\u01A1";
CF[0x01A2]="\u01A3";
CF[0x01A4]="\u01A5";
CF[0x01A6]="\u0280";
CF[0x01A7]="\u01A8";
CF[0x01A9]="\u0283";
CF[0x01AC]="\u01AD";
CF[0x01AE]="\u0288";
CF[0x01AF]="\u01B0";
CF[0x01B1]="\u028A";
CF[0x01B2]="\u028B";
CF[0x01B3]="\u01B4";
CF[0x01B5]="\u01B6";
CF[0x01B7]="\u0292";
CF[0x01B8]="\u01B9";
CF[0x01BC]="\u01BD";
CF[0x01C4]="\u01C6";
CF[0x01C5]="\u01C6";
CF[0x01C7]="\u01C9";
CF[0x01C8]="\u01C9";
CF[0x01CA]="\u01CC";
CF[0x01CB]="\u01CC";
CF[0x01CD]="\u01CE";
CF[0x01CF]="\u01D0";
CF[0x01D1]="\u01D2";
CF[0x01D3]="\u01D4";
CF[0x01D5]="\u01D6";
CF[0x01D7]="\u01D8";
CF[0x01D9]="\u01DA";
CF[0x01DB]="\u01DC";
CF[0x01DE]="\u01DF";
CF[0x01E0]="\u01E1";
CF[0x01E2]="\u01E3";
CF[0x01E4]="\u01E5";
CF[0x01E6]="\u01E7";
CF[0x01E8]="\u01E9";
CF[0x01EA]="\u01EB";
CF[0x01EC]="\u01ED";
CF[0x01EE]="\u01EF";
CF[0x01F0]="\u006A\u030C";
CF[0x01F1]="\u01F3";
CF[0x01F2]="\u01F3";
CF[0x01F4]="\u01F5";
CF[0x01F6]="\u0195";
CF[0x01F7]="\u01BF";
CF[0x01F8]="\u01F9";
CF[0x01FA]="\u01FB";
CF[0x01FC]="\u01FD";
CF[0x01FE]="\u01FF";
CF[0x0200]="\u0201";
CF[0x0202]="\u0203";
CF[0x0204]="\u0205";
CF[0x0206]="\u0207";
CF[0x0208]="\u0209";
CF[0x020A]="\u020B";
CF[0x020C]="\u020D";
CF[0x020E]="\u020F";
CF[0x0210]="\u0211";
CF[0x0212]="\u0213";
CF[0x0214]="\u0215";
CF[0x0216]="\u0217";
CF[0x0218]="\u0219";
CF[0x021A]="\u021B";
CF[0x021C]="\u021D";
CF[0x021E]="\u021F";
CF[0x0222]="\u0223";
CF[0x0224]="\u0225";
CF[0x0226]="\u0227";
CF[0x0228]="\u0229";
CF[0x022A]="\u022B";
CF[0x022C]="\u022D";
CF[0x022E]="\u022F";
CF[0x0230]="\u0231";
CF[0x0232]="\u0233";
CF[0x0345]="\u03B9";
CF[0x0386]="\u03AC";
CF[0x0388]="\u03AD";
CF[0x0389]="\u03AE";
CF[0x038A]="\u03AF";
CF[0x038C]="\u03CC";
CF[0x038E]="\u03CD";
CF[0x038F]="\u03CE";
CF[0x0390]="\u03B9\u0308\u0301";
CF[0x0391]="\u03B1";
CF[0x0392]="\u03B2";
CF[0x0393]="\u03B3";
CF[0x0394]="\u03B4";
CF[0x0395]="\u03B5";
CF[0x0396]="\u03B6";
CF[0x0397]="\u03B7";
CF[0x0398]="\u03B8";
CF[0x0399]="\u03B9";
CF[0x039A]="\u03BA";
CF[0x039B]="\u03BB";
CF[0x039C]="\u03BC";
CF[0x039D]="\u03BD";
CF[0x039E]="\u03BE";
CF[0x039F]="\u03BF";
CF[0x03A0]="\u03C0";
CF[0x03A1]="\u03C1";
CF[0x03A3]="\u03C2";
CF[0x03A4]="\u03C4";
CF[0x03A5]="\u03C5";
CF[0x03A6]="\u03C6";
CF[0x03A7]="\u03C7";
CF[0x03A8]="\u03C8";
CF[0x03A9]="\u03C9";
CF[0x03AA]="\u03CA";
CF[0x03AB]="\u03CB";
CF[0x03B0]="\u03C5\u0308\u0301";
CF[0x03C3]="\u03C2";
CF[0x03D0]="\u03B2";
CF[0x03D1]="\u03B8";
CF[0x03D5]="\u03C6";
CF[0x03D6]="\u03C0";
CF[0x03DA]="\u03DB";
CF[0x03DC]="\u03DD";
CF[0x03DE]="\u03DF";
CF[0x03E0]="\u03E1";
CF[0x03E2]="\u03E3";
CF[0x03E4]="\u03E5";
CF[0x03E6]="\u03E7";
CF[0x03E8]="\u03E9";
CF[0x03EA]="\u03EB";
CF[0x03EC]="\u03ED";
CF[0x03EE]="\u03EF";
CF[0x03F0]="\u03BA";
CF[0x03F1]="\u03C1";
CF[0x03F2]="\u03C2";
CF[0x0400]="\u0450";
CF[0x0401]="\u0451";
CF[0x0402]="\u0452";
CF[0x0403]="\u0453";
CF[0x0404]="\u0454";
CF[0x0405]="\u0455";
CF[0x0406]="\u0456";
CF[0x0407]="\u0457";
CF[0x0408]="\u0458";
CF[0x0409]="\u0459";
CF[0x040A]="\u045A";
CF[0x040B]="\u045B";
CF[0x040C]="\u045C";
CF[0x040D]="\u045D";
CF[0x040E]="\u045E";
CF[0x040F]="\u045F";
CF[0x0410]="\u0430";
CF[0x0411]="\u0431";
CF[0x0412]="\u0432";
CF[0x0413]="\u0433";
CF[0x0414]="\u0434";
CF[0x0415]="\u0435";
CF[0x0416]="\u0436";
CF[0x0417]="\u0437";
CF[0x0418]="\u0438";
CF[0x0419]="\u0439";
CF[0x041A]="\u043A";
CF[0x041B]="\u043B";
CF[0x041C]="\u043C";
CF[0x041D]="\u043D";
CF[0x041E]="\u043E";
CF[0x041F]="\u043F";
CF[0x0420]="\u0440";
CF[0x0421]="\u0441";
CF[0x0422]="\u0442";
CF[0x0423]="\u0443";
CF[0x0424]="\u0444";
CF[0x0425]="\u0445";
CF[0x0426]="\u0446";
CF[0x0427]="\u0447";
CF[0x0428]="\u0448";
CF[0x0429]="\u0449";
CF[0x042A]="\u044A";
CF[0x042B]="\u044B";
CF[0x042C]="\u044C";
CF[0x042D]="\u044D";
CF[0x042E]="\u044E";
CF[0x042F]="\u044F";
CF[0x0460]="\u0461";
CF[0x0462]="\u0463";
CF[0x0464]="\u0465";
CF[0x0466]="\u0467";
CF[0x0468]="\u0469";
CF[0x046A]="\u046B";
CF[0x046C]="\u046D";
CF[0x046E]="\u046F";
CF[0x0470]="\u0471";
CF[0x0472]="\u0473";
CF[0x0474]="\u0475";
CF[0x0476]="\u0477";
CF[0x0478]="\u0479";
CF[0x047A]="\u047B";
CF[0x047C]="\u047D";
CF[0x047E]="\u047F";
CF[0x0480]="\u0481";
CF[0x048C]="\u048D";
CF[0x048E]="\u048F";
CF[0x0490]="\u0491";
CF[0x0492]="\u0493";
CF[0x0494]="\u0495";
CF[0x0496]="\u0497";
CF[0x0498]="\u0499";
CF[0x049A]="\u049B";
CF[0x049C]="\u049D";
CF[0x049E]="\u049F";
CF[0x04A0]="\u04A1";
CF[0x04A2]="\u04A3";
CF[0x04A4]="\u04A5";
CF[0x04A6]="\u04A7";
CF[0x04A8]="\u04A9";
CF[0x04AA]="\u04AB";
CF[0x04AC]="\u04AD";
CF[0x04AE]="\u04AF";
CF[0x04B0]="\u04B1";
CF[0x04B2]="\u04B3";
CF[0x04B4]="\u04B5";
CF[0x04B6]="\u04B7";
CF[0x04B8]="\u04B9";
CF[0x04BA]="\u04BB";
CF[0x04BC]="\u04BD";
CF[0x04BE]="\u04BF";
CF[0x04C1]="\u04C2";
CF[0x04C3]="\u04C4";
CF[0x04C7]="\u04C8";
CF[0x04CB]="\u04CC";
CF[0x04D0]="\u04D1";
CF[0x04D2]="\u04D3";
CF[0x04D4]="\u04D5";
CF[0x04D6]="\u04D7";
CF[0x04D8]="\u04D9";
CF[0x04DA]="\u04DB";
CF[0x04DC]="\u04DD";
CF[0x04DE]="\u04DF";
CF[0x04E0]="\u04E1";
CF[0x04E2]="\u04E3";
CF[0x04E4]="\u04E5";
CF[0x04E6]="\u04E7";
CF[0x04E8]="\u04E9";
CF[0x04EA]="\u04EB";
CF[0x04EC]="\u04ED";
CF[0x04EE]="\u04EF";
CF[0x04F0]="\u04F1";
CF[0x04F2]="\u04F3";
CF[0x04F4]="\u04F5";
CF[0x04F8]="\u04F9";
CF[0x0531]="\u0561";
CF[0x0532]="\u0562";
CF[0x0533]="\u0563";
CF[0x0534]="\u0564";
CF[0x0535]="\u0565";
CF[0x0536]="\u0566";
CF[0x0537]="\u0567";
CF[0x0538]="\u0568";
CF[0x0539]="\u0569";
CF[0x053A]="\u056A";
CF[0x053B]="\u056B";
CF[0x053C]="\u056C";
CF[0x053D]="\u056D";
CF[0x053E]="\u056E";
CF[0x053F]="\u056F";
CF[0x0540]="\u0570";
CF[0x0541]="\u0571";
CF[0x0542]="\u0572";
CF[0x0543]="\u0573";
CF[0x0544]="\u0574";
CF[0x0545]="\u0575";
CF[0x0546]="\u0576";
CF[0x0547]="\u0577";
CF[0x0548]="\u0578";
CF[0x0549]="\u0579";
CF[0x054A]="\u057A";
CF[0x054B]="\u057B";
CF[0x054C]="\u057C";
CF[0x054D]="\u057D";
CF[0x054E]="\u057E";
CF[0x054F]="\u057F";
CF[0x0550]="\u0580";
CF[0x0551]="\u0581";
CF[0x0552]="\u0582";
CF[0x0553]="\u0583";
CF[0x0554]="\u0584";
CF[0x0555]="\u0585";
CF[0x0556]="\u0586";
CF[0x0587]="\u0565\u0582";
CF[0x1E00]="\u1E01";
CF[0x1E02]="\u1E03";
CF[0x1E04]="\u1E05";
CF[0x1E06]="\u1E07";
CF[0x1E08]="\u1E09";
CF[0x1E0A]="\u1E0B";
CF[0x1E0C]="\u1E0D";
CF[0x1E0E]="\u1E0F";
CF[0x1E10]="\u1E11";
CF[0x1E12]="\u1E13";
CF[0x1E14]="\u1E15";
CF[0x1E16]="\u1E17";
CF[0x1E18]="\u1E19";
CF[0x1E1A]="\u1E1B";
CF[0x1E1C]="\u1E1D";
CF[0x1E1E]="\u1E1F";
CF[0x1E20]="\u1E21";
CF[0x1E22]="\u1E23";
CF[0x1E24]="\u1E25";
CF[0x1E26]="\u1E27";
CF[0x1E28]="\u1E29";
CF[0x1E2A]="\u1E2B";
CF[0x1E2C]="\u1E2D";
CF[0x1E2E]="\u1E2F";
CF[0x1E30]="\u1E31";
CF[0x1E32]="\u1E33";
CF[0x1E34]="\u1E35";
CF[0x1E36]="\u1E37";
CF[0x1E38]="\u1E39";
CF[0x1E3A]="\u1E3B";
CF[0x1E3C]="\u1E3D";
CF[0x1E3E]="\u1E3F";
CF[0x1E40]="\u1E41";
CF[0x1E42]="\u1E43";
CF[0x1E44]="\u1E45";
CF[0x1E46]="\u1E47";
CF[0x1E48]="\u1E49";
CF[0x1E4A]="\u1E4B";
CF[0x1E4C]="\u1E4D";
CF[0x1E4E]="\u1E4F";
CF[0x1E50]="\u1E51";
CF[0x1E52]="\u1E53";
CF[0x1E54]="\u1E55";
CF[0x1E56]="\u1E57";
CF[0x1E58]="\u1E59";
CF[0x1E5A]="\u1E5B";
CF[0x1E5C]="\u1E5D";
CF[0x1E5E]="\u1E5F";
CF[0x1E60]="\u1E61";
CF[0x1E62]="\u1E63";
CF[0x1E64]="\u1E65";
CF[0x1E66]="\u1E67";
CF[0x1E68]="\u1E69";
CF[0x1E6A]="\u1E6B";
CF[0x1E6C]="\u1E6D";
CF[0x1E6E]="\u1E6F";
CF[0x1E70]="\u1E71";
CF[0x1E72]="\u1E73";
CF[0x1E74]="\u1E75";
CF[0x1E76]="\u1E77";
CF[0x1E78]="\u1E79";
CF[0x1E7A]="\u1E7B";
CF[0x1E7C]="\u1E7D";
CF[0x1E7E]="\u1E7F";
CF[0x1E80]="\u1E81";
CF[0x1E82]="\u1E83";
CF[0x1E84]="\u1E85";
CF[0x1E86]="\u1E87";
CF[0x1E88]="\u1E89";
CF[0x1E8A]="\u1E8B";
CF[0x1E8C]="\u1E8D";
CF[0x1E8E]="\u1E8F";
CF[0x1E90]="\u1E91";
CF[0x1E92]="\u1E93";
CF[0x1E94]="\u1E95";
CF[0x1E96]="\u0068\u0331";
CF[0x1E97]="\u0074\u0308";
CF[0x1E98]="\u0077\u030A";
CF[0x1E99]="\u0079\u030A";
CF[0x1E9A]="\u0061\u02BE";
CF[0x1E9B]="\u1E61";
CF[0x1EA0]="\u1EA1";
CF[0x1EA2]="\u1EA3";
CF[0x1EA4]="\u1EA5";
CF[0x1EA6]="\u1EA7";
CF[0x1EA8]="\u1EA9";
CF[0x1EAA]="\u1EAB";
CF[0x1EAC]="\u1EAD";
CF[0x1EAE]="\u1EAF";
CF[0x1EB0]="\u1EB1";
CF[0x1EB2]="\u1EB3";
CF[0x1EB4]="\u1EB5";
CF[0x1EB6]="\u1EB7";
CF[0x1EB8]="\u1EB9";
CF[0x1EBA]="\u1EBB";
CF[0x1EBC]="\u1EBD";
CF[0x1EBE]="\u1EBF";
CF[0x1EC0]="\u1EC1";
CF[0x1EC2]="\u1EC3";
CF[0x1EC4]="\u1EC5";
CF[0x1EC6]="\u1EC7";
CF[0x1EC8]="\u1EC9";
CF[0x1ECA]="\u1ECB";
CF[0x1ECC]="\u1ECD";
CF[0x1ECE]="\u1ECF";
CF[0x1ED0]="\u1ED1";
CF[0x1ED2]="\u1ED3";
CF[0x1ED4]="\u1ED5";
CF[0x1ED6]="\u1ED7";
CF[0x1ED8]="\u1ED9";
CF[0x1EDA]="\u1EDB";
CF[0x1EDC]="\u1EDD";
CF[0x1EDE]="\u1EDF";
CF[0x1EE0]="\u1EE1";
CF[0x1EE2]="\u1EE3";
CF[0x1EE4]="\u1EE5";
CF[0x1EE6]="\u1EE7";
CF[0x1EE8]="\u1EE9";
CF[0x1EEA]="\u1EEB";
CF[0x1EEC]="\u1EED";
CF[0x1EEE]="\u1EEF";
CF[0x1EF0]="\u1EF1";
CF[0x1EF2]="\u1EF3";
CF[0x1EF4]="\u1EF5";
CF[0x1EF6]="\u1EF7";
CF[0x1EF8]="\u1EF9";
CF[0x1F08]="\u1F00";
CF[0x1F09]="\u1F01";
CF[0x1F0A]="\u1F02";
CF[0x1F0B]="\u1F03";
CF[0x1F0C]="\u1F04";
CF[0x1F0D]="\u1F05";
CF[0x1F0E]="\u1F06";
CF[0x1F0F]="\u1F07";
CF[0x1F18]="\u1F10";
CF[0x1F19]="\u1F11";
CF[0x1F1A]="\u1F12";
CF[0x1F1B]="\u1F13";
CF[0x1F1C]="\u1F14";
CF[0x1F1D]="\u1F15";
CF[0x1F28]="\u1F20";
CF[0x1F29]="\u1F21";
CF[0x1F2A]="\u1F22";
CF[0x1F2B]="\u1F23";
CF[0x1F2C]="\u1F24";
CF[0x1F2D]="\u1F25";
CF[0x1F2E]="\u1F26";
CF[0x1F2F]="\u1F27";
CF[0x1F38]="\u1F30";
CF[0x1F39]="\u1F31";
CF[0x1F3A]="\u1F32";
CF[0x1F3B]="\u1F33";
CF[0x1F3C]="\u1F34";
CF[0x1F3D]="\u1F35";
CF[0x1F3E]="\u1F36";
CF[0x1F3F]="\u1F37";
CF[0x1F48]="\u1F40";
CF[0x1F49]="\u1F41";
CF[0x1F4A]="\u1F42";
CF[0x1F4B]="\u1F43";
CF[0x1F4C]="\u1F44";
CF[0x1F4D]="\u1F45";
CF[0x1F50]="\u03C5\u0313";
CF[0x1F52]="\u03C5\u0313\u0300";
CF[0x1F54]="\u03C5\u0313\u0301";
CF[0x1F56]="\u03C5\u0313\u0342";
CF[0x1F59]="\u1F51";
CF[0x1F5B]="\u1F53";
CF[0x1F5D]="\u1F55";
CF[0x1F5F]="\u1F57";
CF[0x1F68]="\u1F60";
CF[0x1F69]="\u1F61";
CF[0x1F6A]="\u1F62";
CF[0x1F6B]="\u1F63";
CF[0x1F6C]="\u1F64";
CF[0x1F6D]="\u1F65";
CF[0x1F6E]="\u1F66";
CF[0x1F6F]="\u1F67";
CF[0x1F80]="\u1F00\u03B9";
CF[0x1F81]="\u1F01\u03B9";
CF[0x1F82]="\u1F02\u03B9";
CF[0x1F83]="\u1F03\u03B9";
CF[0x1F84]="\u1F04\u03B9";
CF[0x1F85]="\u1F05\u03B9";
CF[0x1F86]="\u1F06\u03B9";
CF[0x1F87]="\u1F07\u03B9";
CF[0x1F88]="\u1F00\u03B9";
CF[0x1F89]="\u1F01\u03B9";
CF[0x1F8A]="\u1F02\u03B9";
CF[0x1F8B]="\u1F03\u03B9";
CF[0x1F8C]="\u1F04\u03B9";
CF[0x1F8D]="\u1F05\u03B9";
CF[0x1F8E]="\u1F06\u03B9";
CF[0x1F8F]="\u1F07\u03B9";
CF[0x1F90]="\u1F20\u03B9";
CF[0x1F91]="\u1F21\u03B9";
CF[0x1F92]="\u1F22\u03B9";
CF[0x1F93]="\u1F23\u03B9";
CF[0x1F94]="\u1F24\u03B9";
CF[0x1F95]="\u1F25\u03B9";
CF[0x1F96]="\u1F26\u03B9";
CF[0x1F97]="\u1F27\u03B9";
CF[0x1F98]="\u1F20\u03B9";
CF[0x1F99]="\u1F21\u03B9";
CF[0x1F9A]="\u1F22\u03B9";
CF[0x1F9B]="\u1F23\u03B9";
CF[0x1F9C]="\u1F24\u03B9";
CF[0x1F9D]="\u1F25\u03B9";
CF[0x1F9E]="\u1F26\u03B9";
CF[0x1F9F]="\u1F27\u03B9";
CF[0x1FA0]="\u1F60\u03B9";
CF[0x1FA1]="\u1F61\u03B9";
CF[0x1FA2]="\u1F62\u03B9";
CF[0x1FA3]="\u1F63\u03B9";
CF[0x1FA4]="\u1F64\u03B9";
CF[0x1FA5]="\u1F65\u03B9";
CF[0x1FA6]="\u1F66\u03B9";
CF[0x1FA7]="\u1F67\u03B9";
CF[0x1FA8]="\u1F60\u03B9";
CF[0x1FA9]="\u1F61\u03B9";
CF[0x1FAA]="\u1F62\u03B9";
CF[0x1FAB]="\u1F63\u03B9";
CF[0x1FAC]="\u1F64\u03B9";
CF[0x1FAD]="\u1F65\u03B9";
CF[0x1FAE]="\u1F66\u03B9";
CF[0x1FAF]="\u1F67\u03B9";
CF[0x1FB2]="\u1F70\u03B9";
CF[0x1FB3]="\u03B1\u03B9";
CF[0x1FB4]="\u03AC\u03B9";
CF[0x1FB6]="\u03B1\u0342";
CF[0x1FB7]="\u03B1\u0342\u03B9";
CF[0x1FB8]="\u1FB0";
CF[0x1FB9]="\u1FB1";
CF[0x1FBA]="\u1F70";
CF[0x1FBB]="\u1F71";
CF[0x1FBC]="\u03B1\u03B9";
CF[0x1FBE]="\u03B9";
CF[0x1FC2]="\u1F74\u03B9";
CF[0x1FC3]="\u03B7\u03B9";
CF[0x1FC4]="\u03AE\u03B9";
CF[0x1FC6]="\u03B7\u0342";
CF[0x1FC7]="\u03B7\u0342\u03B9";
CF[0x1FC8]="\u1F72";
CF[0x1FC9]="\u1F73";
CF[0x1FCA]="\u1F74";
CF[0x1FCB]="\u1F75";
CF[0x1FCC]="\u03B7\u03B9";
CF[0x1FD2]="\u03B9\u0308\u0300";
CF[0x1FD3]="\u03B9\u0308\u0301";
CF[0x1FD6]="\u03B9\u0342";
CF[0x1FD7]="\u03B9\u0308\u0342";
CF[0x1FD8]="\u1FD0";
CF[0x1FD9]="\u1FD1";
CF[0x1FDA]="\u1F76";
CF[0x1FDB]="\u1F77";
CF[0x1FE2]="\u03C5\u0308\u0300";
CF[0x1FE3]="\u03C5\u0308\u0301";
CF[0x1FE4]="\u03C1\u0313";
CF[0x1FE6]="\u03C5\u0342";
CF[0x1FE7]="\u03C5\u0308\u0342";
CF[0x1FE8]="\u1FE0";
CF[0x1FE9]="\u1FE1";
CF[0x1FEA]="\u1F7A";
CF[0x1FEB]="\u1F7B";
CF[0x1FEC]="\u1FE5";
CF[0x1FF2]="\u1F7C\u03B9";
CF[0x1FF3]="\u03C9\u03B9";
CF[0x1FF4]="\u03CE\u03B9";
CF[0x1FF6]="\u03C9\u0342";
CF[0x1FF7]="\u03C9\u0342\u03B9";
CF[0x1FF8]="\u1F78";
CF[0x1FF9]="\u1F79";
CF[0x1FFA]="\u1F7C";
CF[0x1FFB]="\u1F7D";
CF[0x1FFC]="\u03C9\u03B9";
CF[0x2126]="\u03C9";
CF[0x212A]="\u006B";
CF[0x212B]="\u00E5";
CF[0x2160]="\u2170";
CF[0x2161]="\u2171";
CF[0x2162]="\u2172";
CF[0x2163]="\u2173";
CF[0x2164]="\u2174";
CF[0x2165]="\u2175";
CF[0x2166]="\u2176";
CF[0x2167]="\u2177";
CF[0x2168]="\u2178";
CF[0x2169]="\u2179";
CF[0x216A]="\u217A";
CF[0x216B]="\u217B";
CF[0x216C]="\u217C";
CF[0x216D]="\u217D";
CF[0x216E]="\u217E";
CF[0x216F]="\u217F";
CF[0x24B6]="\u24D0";
CF[0x24B7]="\u24D1";
CF[0x24B8]="\u24D2";
CF[0x24B9]="\u24D3";
CF[0x24BA]="\u24D4";
CF[0x24BB]="\u24D5";
CF[0x24BC]="\u24D6";
CF[0x24BD]="\u24D7";
CF[0x24BE]="\u24D8";
CF[0x24BF]="\u24D9";
CF[0x24C0]="\u24DA";
CF[0x24C1]="\u24DB";
CF[0x24C2]="\u24DC";
CF[0x24C3]="\u24DD";
CF[0x24C4]="\u24DE";
CF[0x24C5]="\u24DF";
CF[0x24C6]="\u24E0";
CF[0x24C7]="\u24E1";
CF[0x24C8]="\u24E2";
CF[0x24C9]="\u24E3";
CF[0x24CA]="\u24E4";
CF[0x24CB]="\u24E5";
CF[0x24CC]="\u24E6";
CF[0x24CD]="\u24E7";
CF[0x24CE]="\u24E8";
CF[0x24CF]="\u24E9";
CF[0xFB00]="\u0066\u0066";
CF[0xFB01]="\u0066\u0069";
CF[0xFB02]="\u0066\u006C";
CF[0xFB03]="\u0066\u0066\u0069";
CF[0xFB04]="\u0066\u0066\u006C";
CF[0xFB05]="\u0073\u0074";
CF[0xFB06]="\u0073\u0074";
CF[0xFB13]="\u0574\u0576";
CF[0xFB14]="\u0574\u0565";
CF[0xFB15]="\u0574\u056B";
CF[0xFB16]="\u057E\u0576";
CF[0xFB17]="\u0574\u056D";
CF[0xFF21]="\uFF41";
CF[0xFF22]="\uFF42";
CF[0xFF23]="\uFF43";
CF[0xFF24]="\uFF44";
CF[0xFF25]="\uFF45";
CF[0xFF26]="\uFF46";
CF[0xFF27]="\uFF47";
CF[0xFF28]="\uFF48";
CF[0xFF29]="\uFF49";
CF[0xFF2A]="\uFF4A";
CF[0xFF2B]="\uFF4B";
CF[0xFF2C]="\uFF4C";
CF[0xFF2D]="\uFF4D";
CF[0xFF2E]="\uFF4E";
CF[0xFF2F]="\uFF4F";
CF[0xFF30]="\uFF50";
CF[0xFF31]="\uFF51";
CF[0xFF32]="\uFF52";
CF[0xFF33]="\uFF53";
CF[0xFF34]="\uFF54";
CF[0xFF35]="\uFF55";
CF[0xFF36]="\uFF56";
CF[0xFF37]="\uFF57";
CF[0xFF38]="\uFF58";
CF[0xFF39]="\uFF59";
CF[0xFF3A]="\uFF5A";
// 785 case foldings total
}
}

View File

@ -1,369 +0,0 @@
<html><body>
<h1
>1. Mismatches when NFD is OFF</h1><h2
>Date:Mon Jun 03 08:45:38 PDT 2002</h2><h2
>File Version:-3.1.1d1</h2><p
>Alternate Handling = NON_IGNORABLE</p><table border="1"
><caption
>Mismatches in UCA-NOD: Plain vs NFC: 4</caption><tr
><th
>Code</th><th
>Type</th><th
>CC?</th><th
>Key</th></tr><tr
><th rowSpan="2" align="right"
>F951 CJK COMPATIBILITY IDEOGRAPH-F951<br
></br>NFC=964B</th><th
>Plain</th><th
>n</th><td
>[FF41 96FB | 0020 0020 | 0002 0002]</td></tr><tr
><th
>NFC</th><th
>ERROR</th><td
>[FF41 964B | 0020 0020 | 0002 0002]</td></tr><tr
><th rowSpan="2" align="right"
>FB1F HEBREW LIGATURE YIDDISH YOD YOD PATAH<br
></br>NFC=05F2 05B7</th><th
>Plain</th><th
>n</th><td
>[0EC0 0EC0 | 0020 0020 00B2 | 0004 0004 001F]</td></tr><tr
><th
>NFC</th><th
>Y</th><td
>[0EC0 0EC0 | 0020 0020 00B2 | 0004 0004 0002]</td></tr><tr
><th rowSpan="2" align="right"
>FB3A HEBREW LETTER FINAL KAF WITH DAGESH<br
></br>NFC=05DA 05BC</th><th
>Plain</th><th
>n</th><td
>[0EC1 | 0020 00B6 | 0019 0019]</td></tr><tr
><th
>NFC</th><th
>Y</th><td
>[0EC1 | 0020 00B6 | 0019 0002]</td></tr><tr
><th rowSpan="2" align="right"
>FB43 HEBREW LETTER FINAL PE WITH DAGESH<br
></br>NFC=05E3 05BC</th><th
>Plain</th><th
>n</th><td
>[0EC7 | 0020 00B6 | 0019 0019]</td></tr><tr
><th
>NFC</th><th
>Y</th><td
>[0EC7 | 0020 00B6 | 0019 0002]</td></tr></table><br>
<h1>2. Differences in Ordering</h1>
<p>Codes and names are in the white rows: bold means that the NO-NFD sort key differs from UCA key.</p>
<p>Keys are in the light blue rows: green is the bad key, blue is UCA, black is where they are equal.</p>
<table border='1'>
<tr><th>File Order</th><th>Code and Decomp</th><th>Key and Decomp-Key</th></tr>
<tr><td colspan='3'></td></tr>
<tr><td>12573</td><td>F951 CJK COMPATIBILITY IDEOGRAPH-F951<br>&lt;964B&gt; </td><td>
<font color='#009900'>[FF41 96FB | 0020 0020 | 0002 0002 | |]</font><br><font color='#000099'>[FF41 964B | 0020 0020 | 0002 0002 | |]</font>
</td></tr>
<tr><td>12574</td><td>FA09 CJK COMPATIBILITY IDEOGRAPH-FA09<br>&lt;964D&gt; </td><td>
[FF41 964D | 0020 0020 | 0002 0002 | |]
</td></tr>
</table>
<h2>3. Primaries Incompatible with Decompositions</h2><table border='1'>
<tr><th>Code</th><th>Sort Key</th><th>Decomposed Sort Key</th><th>Name</th></tr>
<tr><td>00A8</td><td>[0214]</td><td>[0209]</td><td>DIAERESIS</td></tr>
<tr><td>00AF</td><td>[0210]</td><td>[0209]</td><td>MACRON</td></tr>
<tr><td>00B4</td><td>[020D]</td><td>[0209]</td><td>ACUTE ACCENT</td></tr>
<tr><td>00B8</td><td>[0219]</td><td>[0209]</td><td>CEDILLA</td></tr>
<tr><td>02D8</td><td>[0212]</td><td>[0209]</td><td>BREVE</td></tr>
<tr><td>02D9</td><td>[0213]</td><td>[0209]</td><td>DOT ABOVE</td></tr>
<tr><td>02DA</td><td>[0215]</td><td>[0209]</td><td>RING ABOVE</td></tr>
<tr><td>02DB</td><td>[021A]</td><td>[0209]</td><td>OGONEK</td></tr>
<tr><td>02DC</td><td>[020E]</td><td>[0209]</td><td>SMALL TILDE</td></tr>
<tr><td>02DD</td><td>[0216]</td><td>[0209]</td><td>DOUBLE ACUTE ACCENT</td></tr>
<tr><td>037A</td><td>[0C9B]</td><td>[0209]</td><td>GREEK YPOGEGRAMMENI</td></tr>
<tr><td>0384</td><td>[020D]</td><td>[0209]</td><td>GREEK TONOS</td></tr>
<tr><td>0385</td><td>[0214]</td><td>[0209]</td><td>GREEK DIALYTIKA TONOS</td></tr>
<tr><td>0CCB</td><td>[12C4]</td><td>[12C3 12C7]</td><td>KANNADA VOWEL SIGN OO</td></tr>
<tr><td>0DDD</td><td>[1353]</td><td>[1352 1346]</td><td>SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA</td></tr>
<tr><td>1FBD</td><td>[0217]</td><td>[0209]</td><td>GREEK KORONIS</td></tr>
<tr><td>1FBF</td><td>[0217]</td><td>[0209]</td><td>GREEK PSILI</td></tr>
<tr><td>1FC0</td><td>[021D]</td><td>[0209]</td><td>GREEK PERISPOMENI</td></tr>
<tr><td>1FC1</td><td>[0214]</td><td>[0209]</td><td>GREEK DIALYTIKA AND PERISPOMENI</td></tr>
<tr><td>1FCD</td><td>[0217]</td><td>[0209]</td><td>GREEK PSILI AND VARIA</td></tr>
<tr><td>1FCE</td><td>[0217]</td><td>[0209]</td><td>GREEK PSILI AND OXIA</td></tr>
<tr><td>1FCF</td><td>[0217]</td><td>[0209]</td><td>GREEK PSILI AND PERISPOMENI</td></tr>
<tr><td>1FDD</td><td>[0218]</td><td>[0209]</td><td>GREEK DASIA AND VARIA</td></tr>
<tr><td>1FDE</td><td>[0218]</td><td>[0209]</td><td>GREEK DASIA AND OXIA</td></tr>
<tr><td>1FDF</td><td>[0218]</td><td>[0209]</td><td>GREEK DASIA AND PERISPOMENI</td></tr>
<tr><td>1FED</td><td>[0214]</td><td>[0209]</td><td>GREEK DIALYTIKA AND VARIA</td></tr>
<tr><td>1FEE</td><td>[0214]</td><td>[0209]</td><td>GREEK DIALYTIKA AND OXIA</td></tr>
<tr><td>1FFD</td><td>[020D]</td><td>[0209]</td><td>GREEK OXIA</td></tr>
<tr><td>1FFE</td><td>[0218]</td><td>[0209]</td><td>GREEK DASIA</td></tr>
<tr><td>2017</td><td>[021C]</td><td>[0209]</td><td>DOUBLE LOW LINE</td></tr>
<tr><td>203E</td><td>[0211]</td><td>[0209]</td><td>OVERLINE</td></tr>
<tr><td>2047</td><td>[FFC0 A047]</td><td>[024E 024E]</td><td>DOUBLE QUESTION MARK</td></tr>
<tr><td>2057</td><td>[FFC0 A057]</td><td>[02B6 02B6 02B6 02B6]</td><td>QUADRUPLE PRIME</td></tr>
<tr><td>205F</td><td>[FFC0 A05F]</td><td>[0209]</td><td>MEDIUM MATHEMATICAL SPACE</td></tr>
<tr><td>2071</td><td>[FFC0 A071]</td><td>[0AD3]</td><td>SUPERSCRIPT LATIN SMALL LETTER I</td></tr>
<tr><td>213D</td><td>[FFC0 A13D]</td><td>[0C93]</td><td>DOUBLE-STRUCK SMALL GAMMA</td></tr>
<tr><td>213E</td><td>[FFC0 A13E]</td><td>[0C93]</td><td>DOUBLE-STRUCK CAPITAL GAMMA</td></tr>
<tr><td>213F</td><td>[FFC0 A13F]</td><td>[0CA3]</td><td>DOUBLE-STRUCK CAPITAL PI</td></tr>
<tr><td>2140</td><td>[FFC0 A140]</td><td>[039E]</td><td>DOUBLE-STRUCK N-ARY SUMMATION</td></tr>
<tr><td>2145</td><td>[FFC0 A145]</td><td>[0A49]</td><td>DOUBLE-STRUCK ITALIC CAPITAL D</td></tr>
<tr><td>2146</td><td>[FFC0 A146]</td><td>[0A49]</td><td>DOUBLE-STRUCK ITALIC SMALL D</td></tr>
<tr><td>2147</td><td>[FFC0 A147]</td><td>[0A65]</td><td>DOUBLE-STRUCK ITALIC SMALL E</td></tr>
<tr><td>2148</td><td>[FFC0 A148]</td><td>[0AD3]</td><td>DOUBLE-STRUCK ITALIC SMALL I</td></tr>
<tr><td>2149</td><td>[FFC0 A149]</td><td>[0AE7]</td><td>DOUBLE-STRUCK ITALIC SMALL J</td></tr>
<tr><td>2A0C</td><td>[FFC0 AA0C]</td><td>[03C2 03C2 03C2 03C2]</td><td>QUADRUPLE INTEGRAL OPERATOR</td></tr>
<tr><td>2A74</td><td>[FFC0 AA74]</td><td>[0237 0237 03A4]</td><td>DOUBLE COLON EQUAL</td></tr>
<tr><td>2A75</td><td>[FFC0 AA75]</td><td>[03A4 03A4]</td><td>TWO CONSECUTIVE EQUALS SIGNS</td></tr>
<tr><td>2A76</td><td>[FFC0 AA76]</td><td>[03A4 03A4 03A4]</td><td>THREE CONSECUTIVE EQUALS SIGNS</td></tr>
<tr><td>2ADC</td><td>[FFC0 AADC]</td><td>[FFC0 AADD]</td><td>FORKING</td></tr>
<tr><td>309B</td><td>[021E]</td><td>[0209]</td><td>KATAKANA-HIRAGANA VOICED SOUND MARK</td></tr>
<tr><td>309C</td><td>[021F]</td><td>[0209]</td><td>KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK</td></tr>
<tr><td>309F</td><td>[FFC0 B09F]</td><td>[1946 1948]</td><td>HIRAGANA DIGRAPH YORI</td></tr>
<tr><td>30FF</td><td>[FFC0 B0FF]</td><td>[192A 1934]</td><td>KATAKANA DIGRAPH KOTO</td></tr>
<tr><td>3251</td><td>[FFC0 B251]</td><td>[0A0D 0A0C]</td><td>CIRCLED NUMBER TWENTY ONE</td></tr>
<tr><td>3252</td><td>[FFC0 B252]</td><td>[0A0D 0A0D]</td><td>CIRCLED NUMBER TWENTY TWO</td></tr>
<tr><td>3253</td><td>[FFC0 B253]</td><td>[0A0D 0A0E]</td><td>CIRCLED NUMBER TWENTY THREE</td></tr>
<tr><td>3254</td><td>[FFC0 B254]</td><td>[0A0D 0A0F]</td><td>CIRCLED NUMBER TWENTY FOUR</td></tr>
<tr><td>3255</td><td>[FFC0 B255]</td><td>[0A0D 0A10]</td><td>CIRCLED NUMBER TWENTY FIVE</td></tr>
<tr><td>3256</td><td>[FFC0 B256]</td><td>[0A0D 0A11]</td><td>CIRCLED NUMBER TWENTY SIX</td></tr>
<tr><td>3257</td><td>[FFC0 B257]</td><td>[0A0D 0A12]</td><td>CIRCLED NUMBER TWENTY SEVEN</td></tr>
<tr><td>3258</td><td>[FFC0 B258]</td><td>[0A0D 0A13]</td><td>CIRCLED NUMBER TWENTY EIGHT</td></tr>
<tr><td>3259</td><td>[FFC0 B259]</td><td>[0A0D 0A14]</td><td>CIRCLED NUMBER TWENTY NINE</td></tr>
<tr><td>325A</td><td>[FFC0 B25A]</td><td>[0A0E 0A0B]</td><td>CIRCLED NUMBER THIRTY</td></tr>
<tr><td>325B</td><td>[FFC0 B25B]</td><td>[0A0E 0A0C]</td><td>CIRCLED NUMBER THIRTY ONE</td></tr>
<tr><td>325C</td><td>[FFC0 B25C]</td><td>[0A0E 0A0D]</td><td>CIRCLED NUMBER THIRTY TWO</td></tr>
<tr><td>325D</td><td>[FFC0 B25D]</td><td>[0A0E 0A0E]</td><td>CIRCLED NUMBER THIRTY THREE</td></tr>
<tr><td>325E</td><td>[FFC0 B25E]</td><td>[0A0E 0A0F]</td><td>CIRCLED NUMBER THIRTY FOUR</td></tr>
<tr><td>325F</td><td>[FFC0 B25F]</td><td>[0A0E 0A10]</td><td>CIRCLED NUMBER THIRTY FIVE</td></tr>
<tr><td>32B1</td><td>[FFC0 B2B1]</td><td>[0A0E 0A11]</td><td>CIRCLED NUMBER THIRTY SIX</td></tr>
<tr><td>32B2</td><td>[FFC0 B2B2]</td><td>[0A0E 0A12]</td><td>CIRCLED NUMBER THIRTY SEVEN</td></tr>
<tr><td>32B3</td><td>[FFC0 B2B3]</td><td>[0A0E 0A13]</td><td>CIRCLED NUMBER THIRTY EIGHT</td></tr>
<tr><td>32B4</td><td>[FFC0 B2B4]</td><td>[0A0E 0A14]</td><td>CIRCLED NUMBER THIRTY NINE</td></tr>
<tr><td>32B5</td><td>[FFC0 B2B5]</td><td>[0A0F 0A0B]</td><td>CIRCLED NUMBER FORTY</td></tr>
<tr><td>32B6</td><td>[FFC0 B2B6]</td><td>[0A0F 0A0C]</td><td>CIRCLED NUMBER FORTY ONE</td></tr>
<tr><td>32B7</td><td>[FFC0 B2B7]</td><td>[0A0F 0A0D]</td><td>CIRCLED NUMBER FORTY TWO</td></tr>
<tr><td>32B8</td><td>[FFC0 B2B8]</td><td>[0A0F 0A0E]</td><td>CIRCLED NUMBER FORTY THREE</td></tr>
<tr><td>32B9</td><td>[FFC0 B2B9]</td><td>[0A0F 0A0F]</td><td>CIRCLED NUMBER FORTY FOUR</td></tr>
<tr><td>32BA</td><td>[FFC0 B2BA]</td><td>[0A0F 0A10]</td><td>CIRCLED NUMBER FORTY FIVE</td></tr>
<tr><td>32BB</td><td>[FFC0 B2BB]</td><td>[0A0F 0A11]</td><td>CIRCLED NUMBER FORTY SIX</td></tr>
<tr><td>32BC</td><td>[FFC0 B2BC]</td><td>[0A0F 0A12]</td><td>CIRCLED NUMBER FORTY SEVEN</td></tr>
<tr><td>32BD</td><td>[FFC0 B2BD]</td><td>[0A0F 0A13]</td><td>CIRCLED NUMBER FORTY EIGHT</td></tr>
<tr><td>32BE</td><td>[FFC0 B2BE]</td><td>[0A0F 0A14]</td><td>CIRCLED NUMBER FORTY NINE</td></tr>
<tr><td>32BF</td><td>[FFC0 B2BF]</td><td>[0A10 0A0B]</td><td>CIRCLED NUMBER FIFTY</td></tr>
<tr><td>F951</td><td>[FF41 96FB]</td><td>[FF41 964B]</td><td>CJK COMPATIBILITY IDEOGRAPH-F951</td></tr>
<tr><td>FA30</td><td>[FFC1 FA30]</td><td>[FF40 CFAE]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA30</td></tr>
<tr><td>FA31</td><td>[FFC1 FA31]</td><td>[FF40 D0E7]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA31</td></tr>
<tr><td>FA32</td><td>[FFC1 FA32]</td><td>[FF40 D14D]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA32</td></tr>
<tr><td>FA33</td><td>[FFC1 FA33]</td><td>[FF40 D2C9]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA33</td></tr>
<tr><td>FA34</td><td>[FFC1 FA34]</td><td>[FF40 D2E4]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA34</td></tr>
<tr><td>FA35</td><td>[FFC1 FA35]</td><td>[FF40 D351]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA35</td></tr>
<tr><td>FA36</td><td>[FFC1 FA36]</td><td>[FF40 D59D]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA36</td></tr>
<tr><td>FA37</td><td>[FFC1 FA37]</td><td>[FF40 D606]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA37</td></tr>
<tr><td>FA38</td><td>[FFC1 FA38]</td><td>[FF40 D668]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA38</td></tr>
<tr><td>FA39</td><td>[FFC1 FA39]</td><td>[FF40 D840]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA39</td></tr>
<tr><td>FA3A</td><td>[FFC1 FA3A]</td><td>[FF40 D8A8]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA3A</td></tr>
<tr><td>FA3B</td><td>[FFC1 FA3B]</td><td>[FF40 DC64]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA3B</td></tr>
<tr><td>FA3C</td><td>[FFC1 FA3C]</td><td>[FF40 DC6E]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA3C</td></tr>
<tr><td>FA3D</td><td>[FFC1 FA3D]</td><td>[FF40 E094]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA3D</td></tr>
<tr><td>FA3E</td><td>[FFC1 FA3E]</td><td>[FF40 E168]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA3E</td></tr>
<tr><td>FA3F</td><td>[FFC1 FA3F]</td><td>[FF40 E18E]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA3F</td></tr>
<tr><td>FA40</td><td>[FFC1 FA40]</td><td>[FF40 E1F2]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA40</td></tr>
<tr><td>FA41</td><td>[FFC1 FA41]</td><td>[FF40 E54F]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA41</td></tr>
<tr><td>FA42</td><td>[FFC1 FA42]</td><td>[FF40 E5E2]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA42</td></tr>
<tr><td>FA43</td><td>[FFC1 FA43]</td><td>[FF40 E691]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA43</td></tr>
<tr><td>FA44</td><td>[FFC1 FA44]</td><td>[FF40 E885]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA44</td></tr>
<tr><td>FA45</td><td>[FFC1 FA45]</td><td>[FF40 ED77]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA45</td></tr>
<tr><td>FA46</td><td>[FFC1 FA46]</td><td>[FF40 EE1A]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA46</td></tr>
<tr><td>FA47</td><td>[FFC1 FA47]</td><td>[FF40 EF22]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA47</td></tr>
<tr><td>FA48</td><td>[FFC1 FA48]</td><td>[FF40 F16E]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA48</td></tr>
<tr><td>FA49</td><td>[FFC1 FA49]</td><td>[FF40 F22B]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA49</td></tr>
<tr><td>FA4A</td><td>[FFC1 FA4A]</td><td>[FF40 F422]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA4A</td></tr>
<tr><td>FA4B</td><td>[FFC1 FA4B]</td><td>[FF40 F891]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA4B</td></tr>
<tr><td>FA4C</td><td>[FFC1 FA4C]</td><td>[FF40 F93E]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA4C</td></tr>
<tr><td>FA4D</td><td>[FFC1 FA4D]</td><td>[FF40 F949]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA4D</td></tr>
<tr><td>FA4E</td><td>[FFC1 FA4E]</td><td>[FF40 F948]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA4E</td></tr>
<tr><td>FA4F</td><td>[FFC1 FA4F]</td><td>[FF40 F950]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA4F</td></tr>
<tr><td>FA50</td><td>[FFC1 FA50]</td><td>[FF40 F956]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA50</td></tr>
<tr><td>FA51</td><td>[FFC1 FA51]</td><td>[FF40 F95D]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA51</td></tr>
<tr><td>FA52</td><td>[FFC1 FA52]</td><td>[FF40 F98D]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA52</td></tr>
<tr><td>FA53</td><td>[FFC1 FA53]</td><td>[FF40 F98E]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA53</td></tr>
<tr><td>FA54</td><td>[FFC1 FA54]</td><td>[FF40 FA40]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA54</td></tr>
<tr><td>FA55</td><td>[FFC1 FA55]</td><td>[FF40 FA81]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA55</td></tr>
<tr><td>FA56</td><td>[FFC1 FA56]</td><td>[FF40 FBC0]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA56</td></tr>
<tr><td>FA57</td><td>[FFC1 FA57]</td><td>[FF40 FDF4]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA57</td></tr>
<tr><td>FA58</td><td>[FFC1 FA58]</td><td>[FF40 FE09]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA58</td></tr>
<tr><td>FA59</td><td>[FFC1 FA59]</td><td>[FF40 FE41]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA59</td></tr>
<tr><td>FA5A</td><td>[FFC1 FA5A]</td><td>[FF40 FF72]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA5A</td></tr>
<tr><td>FA5B</td><td>[FFC1 FA5B]</td><td>[FF41 8005]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA5B</td></tr>
<tr><td>FA5C</td><td>[FFC1 FA5C]</td><td>[FF41 81ED]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA5C</td></tr>
<tr><td>FA5D</td><td>[FFC1 FA5D]</td><td>[FF41 8279]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA5D</td></tr>
<tr><td>FA5E</td><td>[FFC1 FA5E]</td><td>[FF41 8279]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA5E</td></tr>
<tr><td>FA5F</td><td>[FFC1 FA5F]</td><td>[FF41 8457]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA5F</td></tr>
<tr><td>FA60</td><td>[FFC1 FA60]</td><td>[FF41 8910]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA60</td></tr>
<tr><td>FA61</td><td>[FFC1 FA61]</td><td>[FF41 8996]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA61</td></tr>
<tr><td>FA62</td><td>[FFC1 FA62]</td><td>[FF41 8B01]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA62</td></tr>
<tr><td>FA63</td><td>[FFC1 FA63]</td><td>[FF41 8B39]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA63</td></tr>
<tr><td>FA64</td><td>[FFC1 FA64]</td><td>[FF41 8CD3]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA64</td></tr>
<tr><td>FA65</td><td>[FFC1 FA65]</td><td>[FF41 8D08]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA65</td></tr>
<tr><td>FA66</td><td>[FFC1 FA66]</td><td>[FF41 8FB6]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA66</td></tr>
<tr><td>FA67</td><td>[FFC1 FA67]</td><td>[FF41 9038]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA67</td></tr>
<tr><td>FA68</td><td>[FFC1 FA68]</td><td>[FF41 96E3]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA68</td></tr>
<tr><td>FA69</td><td>[FFC1 FA69]</td><td>[FF41 97FF]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA69</td></tr>
<tr><td>FA6A</td><td>[FFC1 FA6A]</td><td>[FF41 983B]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA6A</td></tr>
<tr><td>FC5E</td><td>[]</td><td>[0209]</td><td>ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM</td></tr>
<tr><td>FC5F</td><td>[]</td><td>[0209]</td><td>ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM</td></tr>
<tr><td>FC60</td><td>[]</td><td>[0209]</td><td>ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM</td></tr>
<tr><td>FC61</td><td>[]</td><td>[0209]</td><td>ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM</td></tr>
<tr><td>FC62</td><td>[]</td><td>[0209]</td><td>ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM</td></tr>
<tr><td>FC63</td><td>[]</td><td>[0209]</td><td>ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM</td></tr>
<tr><td>FCF2</td><td>[]</td><td>[020B]</td><td>ARABIC LIGATURE SHADDA WITH FATHA MEDIAL FORM</td></tr>
<tr><td>FCF3</td><td>[]</td><td>[020B]</td><td>ARABIC LIGATURE SHADDA WITH DAMMA MEDIAL FORM</td></tr>
<tr><td>FCF4</td><td>[]</td><td>[020B]</td><td>ARABIC LIGATURE SHADDA WITH KASRA MEDIAL FORM</td></tr>
<tr><td>FDFC</td><td>[FFC1 FDFC]</td><td>[0EF9 0F4A 0ED6 0F2D]</td><td>RIAL SIGN</td></tr>
<tr><td>FE49</td><td>[0211]</td><td>[0209]</td><td>DASHED OVERLINE</td></tr>
<tr><td>FE4A</td><td>[0211]</td><td>[0209]</td><td>CENTRELINE OVERLINE</td></tr>
<tr><td>FE4B</td><td>[0211]</td><td>[0209]</td><td>WAVY OVERLINE</td></tr>
<tr><td>FE4C</td><td>[0211]</td><td>[0209]</td><td>DOUBLE WAVY OVERLINE</td></tr>
<tr><td>FE70</td><td>[]</td><td>[0209]</td><td>ARABIC FATHATAN ISOLATED FORM</td></tr>
<tr><td>FE71</td><td>[]</td><td>[020B]</td><td>ARABIC TATWEEL WITH FATHATAN ABOVE</td></tr>
<tr><td>FE72</td><td>[]</td><td>[0209]</td><td>ARABIC DAMMATAN ISOLATED FORM</td></tr>
<tr><td>FE74</td><td>[]</td><td>[0209]</td><td>ARABIC KASRATAN ISOLATED FORM</td></tr>
<tr><td>FE76</td><td>[]</td><td>[0209]</td><td>ARABIC FATHA ISOLATED FORM</td></tr>
<tr><td>FE77</td><td>[]</td><td>[020B]</td><td>ARABIC FATHA MEDIAL FORM</td></tr>
<tr><td>FE78</td><td>[]</td><td>[0209]</td><td>ARABIC DAMMA ISOLATED FORM</td></tr>
<tr><td>FE79</td><td>[]</td><td>[020B]</td><td>ARABIC DAMMA MEDIAL FORM</td></tr>
<tr><td>FE7A</td><td>[]</td><td>[0209]</td><td>ARABIC KASRA ISOLATED FORM</td></tr>
<tr><td>FE7B</td><td>[]</td><td>[020B]</td><td>ARABIC KASRA MEDIAL FORM</td></tr>
<tr><td>FE7C</td><td>[]</td><td>[0209]</td><td>ARABIC SHADDA ISOLATED FORM</td></tr>
<tr><td>FE7D</td><td>[]</td><td>[020B]</td><td>ARABIC SHADDA MEDIAL FORM</td></tr>
<tr><td>FE7E</td><td>[]</td><td>[0209]</td><td>ARABIC SUKUN ISOLATED FORM</td></tr>
<tr><td>FE7F</td><td>[]</td><td>[020B]</td><td>ARABIC SUKUN MEDIAL FORM</td></tr>
<tr><td>FF5F</td><td>[FFC1 FF5F]</td><td>[FFC0 A985]</td><td>FULLWIDTH LEFT WHITE PARENTHESIS</td></tr>
<tr><td>FF60</td><td>[FFC1 FF60]</td><td>[FFC0 A986]</td><td>FULLWIDTH RIGHT WHITE PARENTHESIS</td></tr>
<tr><td>FFE3</td><td>[0210]</td><td>[0209]</td><td>FULLWIDTH MACRON</td></tr>
</table>
<h2>4. Secondaries Incompatible with Decompositions</h2><table border='1'>
<tr><th>Code</th><th>Sort Key</th><th>Decomposed Sort Key</th><th>Name</th></tr>
<tr><td>00A8</td><td>[0214 | 0020]</td><td>[0209 | 0020 0047]</td><td>DIAERESIS</td></tr>
<tr><td>00AF</td><td>[0210 | 0020]</td><td>[0209 | 0020 005A]</td><td>MACRON</td></tr>
<tr><td>00B4</td><td>[020D | 0020]</td><td>[0209 | 0020 0032]</td><td>ACUTE ACCENT</td></tr>
<tr><td>00B8</td><td>[0219 | 0020]</td><td>[0209 | 0020 0055]</td><td>CEDILLA</td></tr>
<tr><td>017F</td><td>[0BA7 | 0020 0154]</td><td>[0BA7 | 0020]</td><td>LATIN SMALL LETTER LONG S</td></tr>
<tr><td>02D8</td><td>[0212 | 0020]</td><td>[0209 | 0020 0037]</td><td>BREVE</td></tr>
<tr><td>02D9</td><td>[0213 | 0020]</td><td>[0209 | 0020 0052]</td><td>DOT ABOVE</td></tr>
<tr><td>02DA</td><td>[0215 | 0020]</td><td>[0209 | 0020 0043]</td><td>RING ABOVE</td></tr>
<tr><td>02DB</td><td>[021A | 0020]</td><td>[0209 | 0020 0058]</td><td>OGONEK</td></tr>
<tr><td>02DC</td><td>[020E | 0020]</td><td>[0209 | 0020 004E]</td><td>SMALL TILDE</td></tr>
<tr><td>02DD</td><td>[0216 | 0020]</td><td>[0209 | 0020 004D]</td><td>DOUBLE ACUTE ACCENT</td></tr>
<tr><td>037A</td><td>[0C9B | 0020]</td><td>[0209 | 0020 0096]</td><td>GREEK YPOGEGRAMMENI</td></tr>
<tr><td>0384</td><td>[020D | 0020]</td><td>[0209 | 0020 0032]</td><td>GREEK TONOS</td></tr>
<tr><td>0385</td><td>[0214 | 0020 0032]</td><td>[0209 | 0020 0047 0032]</td><td>GREEK DIALYTIKA TONOS</td></tr>
<tr><td>1E9B</td><td>[0BA7 | 0020 0154 0052]</td><td>[0BA7 | 0020 0052]</td><td>LATIN SMALL LETTER LONG S WITH DOT ABOVE</td></tr>
<tr><td>1FBD</td><td>[0217 | 0020]</td><td>[0209 | 0020 0022]</td><td>GREEK KORONIS</td></tr>
<tr><td>1FBF</td><td>[0217 | 0020]</td><td>[0209 | 0020 0022]</td><td>GREEK PSILI</td></tr>
<tr><td>1FC0</td><td>[021D | 0020]</td><td>[0209 | 0020 0045]</td><td>GREEK PERISPOMENI</td></tr>
<tr><td>1FC1</td><td>[0214 | 0020 0045]</td><td>[0209 | 0020 0047 0045]</td><td>GREEK DIALYTIKA AND PERISPOMENI</td></tr>
<tr><td>1FCD</td><td>[0217 | 0020 0035]</td><td>[0209 | 0020 0022 0035]</td><td>GREEK PSILI AND VARIA</td></tr>
<tr><td>1FCE</td><td>[0217 | 0020 0032]</td><td>[0209 | 0020 0022 0032]</td><td>GREEK PSILI AND OXIA</td></tr>
<tr><td>1FCF</td><td>[0217 | 0020 0045]</td><td>[0209 | 0020 0022 0045]</td><td>GREEK PSILI AND PERISPOMENI</td></tr>
<tr><td>1FDD</td><td>[0218 | 0020 0035]</td><td>[0209 | 0020 002A 0035]</td><td>GREEK DASIA AND VARIA</td></tr>
<tr><td>1FDE</td><td>[0218 | 0020 0032]</td><td>[0209 | 0020 002A 0032]</td><td>GREEK DASIA AND OXIA</td></tr>
<tr><td>1FDF</td><td>[0218 | 0020 0045]</td><td>[0209 | 0020 002A 0045]</td><td>GREEK DASIA AND PERISPOMENI</td></tr>
<tr><td>1FED</td><td>[0214 | 0020 0035]</td><td>[0209 | 0020 0047 0035]</td><td>GREEK DIALYTIKA AND VARIA</td></tr>
<tr><td>1FEE</td><td>[0214 | 0020 0032]</td><td>[0209 | 0020 0047 0032]</td><td>GREEK DIALYTIKA AND OXIA</td></tr>
<tr><td>1FFD</td><td>[020D | 0020]</td><td>[0209 | 0020 0032]</td><td>GREEK OXIA</td></tr>
<tr><td>1FFE</td><td>[0218 | 0020]</td><td>[0209 | 0020 002A]</td><td>GREEK DASIA</td></tr>
<tr><td>2017</td><td>[021C | 0020]</td><td>[0209 | 0020 008A]</td><td>DOUBLE LOW LINE</td></tr>
<tr><td>203E</td><td>[0211 | 0020]</td><td>[0209 | 0020 005E]</td><td>OVERLINE</td></tr>
<tr><td>2047</td><td>[FFC0 A047 | 0020 0020]</td><td>[024E 024E | 0020 0020]</td><td>DOUBLE QUESTION MARK</td></tr>
<tr><td>2057</td><td>[FFC0 A057 | 0020 0020]</td><td>[02B6 02B6 02B6 02B6 | 0020 0020 0020 0020]</td><td>QUADRUPLE PRIME</td></tr>
<tr><td>205F</td><td>[FFC0 A05F | 0020 0020]</td><td>[0209 | 0020]</td><td>MEDIUM MATHEMATICAL SPACE</td></tr>
<tr><td>2071</td><td>[FFC0 A071 | 0020 0020]</td><td>[0AD3 | 0020]</td><td>SUPERSCRIPT LATIN SMALL LETTER I</td></tr>
<tr><td>213D</td><td>[FFC0 A13D | 0020 0020]</td><td>[0C93 | 0020]</td><td>DOUBLE-STRUCK SMALL GAMMA</td></tr>
<tr><td>213E</td><td>[FFC0 A13E | 0020 0020]</td><td>[0C93 | 0020]</td><td>DOUBLE-STRUCK CAPITAL GAMMA</td></tr>
<tr><td>213F</td><td>[FFC0 A13F | 0020 0020]</td><td>[0CA3 | 0020]</td><td>DOUBLE-STRUCK CAPITAL PI</td></tr>
<tr><td>2140</td><td>[FFC0 A140 | 0020 0020]</td><td>[039E | 0020]</td><td>DOUBLE-STRUCK N-ARY SUMMATION</td></tr>
<tr><td>2145</td><td>[FFC0 A145 | 0020 0020]</td><td>[0A49 | 0020]</td><td>DOUBLE-STRUCK ITALIC CAPITAL D</td></tr>
<tr><td>2146</td><td>[FFC0 A146 | 0020 0020]</td><td>[0A49 | 0020]</td><td>DOUBLE-STRUCK ITALIC SMALL D</td></tr>
<tr><td>2147</td><td>[FFC0 A147 | 0020 0020]</td><td>[0A65 | 0020]</td><td>DOUBLE-STRUCK ITALIC SMALL E</td></tr>
<tr><td>2148</td><td>[FFC0 A148 | 0020 0020]</td><td>[0AD3 | 0020]</td><td>DOUBLE-STRUCK ITALIC SMALL I</td></tr>
<tr><td>2149</td><td>[FFC0 A149 | 0020 0020]</td><td>[0AE7 | 0020]</td><td>DOUBLE-STRUCK ITALIC SMALL J</td></tr>
<tr><td>2A0C</td><td>[FFC0 AA0C | 0020 0020]</td><td>[03C2 03C2 03C2 03C2 | 0020 0020 0020 0020]</td><td>QUADRUPLE INTEGRAL OPERATOR</td></tr>
<tr><td>2A74</td><td>[FFC0 AA74 | 0020 0020]</td><td>[0237 0237 03A4 | 0020 0020 0020]</td><td>DOUBLE COLON EQUAL</td></tr>
<tr><td>2A75</td><td>[FFC0 AA75 | 0020 0020]</td><td>[03A4 03A4 | 0020 0020]</td><td>TWO CONSECUTIVE EQUALS SIGNS</td></tr>
<tr><td>2A76</td><td>[FFC0 AA76 | 0020 0020]</td><td>[03A4 03A4 03A4 | 0020 0020 0020]</td><td>THREE CONSECUTIVE EQUALS SIGNS</td></tr>
<tr><td>309B</td><td>[021E | 0020]</td><td>[0209 | 0020 013D]</td><td>KATAKANA-HIRAGANA VOICED SOUND MARK</td></tr>
<tr><td>309C</td><td>[021F | 0020]</td><td>[0209 | 0020 013E]</td><td>KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK</td></tr>
<tr><td>309F</td><td>[FFC0 B09F | 0020 0020]</td><td>[1946 1948 | 0020 0020]</td><td>HIRAGANA DIGRAPH YORI</td></tr>
<tr><td>30FF</td><td>[FFC0 B0FF | 0020 0020]</td><td>[192A 1934 | 0020 0020]</td><td>KATAKANA DIGRAPH KOTO</td></tr>
<tr><td>3251</td><td>[FFC0 B251 | 0020 0020]</td><td>[0A0D 0A0C | 0020 0020]</td><td>CIRCLED NUMBER TWENTY ONE</td></tr>
<tr><td>3252</td><td>[FFC0 B252 | 0020 0020]</td><td>[0A0D 0A0D | 0020 0020]</td><td>CIRCLED NUMBER TWENTY TWO</td></tr>
<tr><td>3253</td><td>[FFC0 B253 | 0020 0020]</td><td>[0A0D 0A0E | 0020 0020]</td><td>CIRCLED NUMBER TWENTY THREE</td></tr>
<tr><td>3254</td><td>[FFC0 B254 | 0020 0020]</td><td>[0A0D 0A0F | 0020 0020]</td><td>CIRCLED NUMBER TWENTY FOUR</td></tr>
<tr><td>3255</td><td>[FFC0 B255 | 0020 0020]</td><td>[0A0D 0A10 | 0020 0020]</td><td>CIRCLED NUMBER TWENTY FIVE</td></tr>
<tr><td>3256</td><td>[FFC0 B256 | 0020 0020]</td><td>[0A0D 0A11 | 0020 0020]</td><td>CIRCLED NUMBER TWENTY SIX</td></tr>
<tr><td>3257</td><td>[FFC0 B257 | 0020 0020]</td><td>[0A0D 0A12 | 0020 0020]</td><td>CIRCLED NUMBER TWENTY SEVEN</td></tr>
<tr><td>3258</td><td>[FFC0 B258 | 0020 0020]</td><td>[0A0D 0A13 | 0020 0020]</td><td>CIRCLED NUMBER TWENTY EIGHT</td></tr>
<tr><td>3259</td><td>[FFC0 B259 | 0020 0020]</td><td>[0A0D 0A14 | 0020 0020]</td><td>CIRCLED NUMBER TWENTY NINE</td></tr>
<tr><td>325A</td><td>[FFC0 B25A | 0020 0020]</td><td>[0A0E 0A0B | 0020 0020]</td><td>CIRCLED NUMBER THIRTY</td></tr>
<tr><td>325B</td><td>[FFC0 B25B | 0020 0020]</td><td>[0A0E 0A0C | 0020 0020]</td><td>CIRCLED NUMBER THIRTY ONE</td></tr>
<tr><td>325C</td><td>[FFC0 B25C | 0020 0020]</td><td>[0A0E 0A0D | 0020 0020]</td><td>CIRCLED NUMBER THIRTY TWO</td></tr>
<tr><td>325D</td><td>[FFC0 B25D | 0020 0020]</td><td>[0A0E 0A0E | 0020 0020]</td><td>CIRCLED NUMBER THIRTY THREE</td></tr>
<tr><td>325E</td><td>[FFC0 B25E | 0020 0020]</td><td>[0A0E 0A0F | 0020 0020]</td><td>CIRCLED NUMBER THIRTY FOUR</td></tr>
<tr><td>325F</td><td>[FFC0 B25F | 0020 0020]</td><td>[0A0E 0A10 | 0020 0020]</td><td>CIRCLED NUMBER THIRTY FIVE</td></tr>
<tr><td>32B1</td><td>[FFC0 B2B1 | 0020 0020]</td><td>[0A0E 0A11 | 0020 0020]</td><td>CIRCLED NUMBER THIRTY SIX</td></tr>
<tr><td>32B2</td><td>[FFC0 B2B2 | 0020 0020]</td><td>[0A0E 0A12 | 0020 0020]</td><td>CIRCLED NUMBER THIRTY SEVEN</td></tr>
<tr><td>32B3</td><td>[FFC0 B2B3 | 0020 0020]</td><td>[0A0E 0A13 | 0020 0020]</td><td>CIRCLED NUMBER THIRTY EIGHT</td></tr>
<tr><td>32B4</td><td>[FFC0 B2B4 | 0020 0020]</td><td>[0A0E 0A14 | 0020 0020]</td><td>CIRCLED NUMBER THIRTY NINE</td></tr>
<tr><td>32B5</td><td>[FFC0 B2B5 | 0020 0020]</td><td>[0A0F 0A0B | 0020 0020]</td><td>CIRCLED NUMBER FORTY</td></tr>
<tr><td>32B6</td><td>[FFC0 B2B6 | 0020 0020]</td><td>[0A0F 0A0C | 0020 0020]</td><td>CIRCLED NUMBER FORTY ONE</td></tr>
<tr><td>32B7</td><td>[FFC0 B2B7 | 0020 0020]</td><td>[0A0F 0A0D | 0020 0020]</td><td>CIRCLED NUMBER FORTY TWO</td></tr>
<tr><td>32B8</td><td>[FFC0 B2B8 | 0020 0020]</td><td>[0A0F 0A0E | 0020 0020]</td><td>CIRCLED NUMBER FORTY THREE</td></tr>
<tr><td>32B9</td><td>[FFC0 B2B9 | 0020 0020]</td><td>[0A0F 0A0F | 0020 0020]</td><td>CIRCLED NUMBER FORTY FOUR</td></tr>
<tr><td>32BA</td><td>[FFC0 B2BA | 0020 0020]</td><td>[0A0F 0A10 | 0020 0020]</td><td>CIRCLED NUMBER FORTY FIVE</td></tr>
<tr><td>32BB</td><td>[FFC0 B2BB | 0020 0020]</td><td>[0A0F 0A11 | 0020 0020]</td><td>CIRCLED NUMBER FORTY SIX</td></tr>
<tr><td>32BC</td><td>[FFC0 B2BC | 0020 0020]</td><td>[0A0F 0A12 | 0020 0020]</td><td>CIRCLED NUMBER FORTY SEVEN</td></tr>
<tr><td>32BD</td><td>[FFC0 B2BD | 0020 0020]</td><td>[0A0F 0A13 | 0020 0020]</td><td>CIRCLED NUMBER FORTY EIGHT</td></tr>
<tr><td>32BE</td><td>[FFC0 B2BE | 0020 0020]</td><td>[0A0F 0A14 | 0020 0020]</td><td>CIRCLED NUMBER FORTY NINE</td></tr>
<tr><td>32BF</td><td>[FFC0 B2BF | 0020 0020]</td><td>[0A10 0A0B | 0020 0020]</td><td>CIRCLED NUMBER FIFTY</td></tr>
<tr><td>FB05</td><td>[0BA7 0BBF | 0020 0154 0020]</td><td>[0BA7 0BBF | 0020 0020]</td><td>LATIN SMALL LIGATURE LONG S T</td></tr>
<tr><td>FBA4</td><td>[0F3D | 00CC]</td><td>[0F3D | 0020 00CC]</td><td>ARABIC LETTER HEH WITH YEH ABOVE ISOLATED FORM</td></tr>
<tr><td>FBA5</td><td>[0F3D | 00CC]</td><td>[0F3D | 0020 00CC]</td><td>ARABIC LETTER HEH WITH YEH ABOVE FINAL FORM</td></tr>
<tr><td>FBB0</td><td>[0F4F | 00CC]</td><td>[0F4F | 0020 00CC]</td><td>ARABIC LETTER YEH BARREE WITH HAMZA ABOVE ISOLATED FORM</td></tr>
<tr><td>FBB1</td><td>[0F4F | 00CC]</td><td>[0F4F | 0020 00CC]</td><td>ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM</td></tr>
<tr><td>FC5E</td><td>[| 00C8]</td><td>[0209 | 0020 00BE 00C8]</td><td>ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM</td></tr>
<tr><td>FC5F</td><td>[| 00C8]</td><td>[0209 | 0020 00C0 00C8]</td><td>ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM</td></tr>
<tr><td>FC60</td><td>[| 00C8]</td><td>[0209 | 0020 00C2 00C8]</td><td>ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM</td></tr>
<tr><td>FC61</td><td>[| 00C8]</td><td>[0209 | 0020 00C4 00C8]</td><td>ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM</td></tr>
<tr><td>FC62</td><td>[| 00C8]</td><td>[0209 | 0020 00C6 00C8]</td><td>ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM</td></tr>
<tr><td>FC63</td><td>[| 00C8 00CE]</td><td>[0209 | 0020 00C8 00CE]</td><td>ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM</td></tr>
<tr><td>FCF2</td><td>[| 00C8]</td><td>[020B | 0020 00C2 00C8]</td><td>ARABIC LIGATURE SHADDA WITH FATHA MEDIAL FORM</td></tr>
<tr><td>FCF3</td><td>[| 00C8]</td><td>[020B | 0020 00C4 00C8]</td><td>ARABIC LIGATURE SHADDA WITH DAMMA MEDIAL FORM</td></tr>
<tr><td>FCF4</td><td>[| 00C8]</td><td>[020B | 0020 00C6 00C8]</td><td>ARABIC LIGATURE SHADDA WITH KASRA MEDIAL FORM</td></tr>
<tr><td>FD3C</td><td>[0ED6 | 00BD]</td><td>[0ED6 | 0020 00BD]</td><td>ARABIC LIGATURE ALEF WITH FATHATAN FINAL FORM</td></tr>
<tr><td>FD3D</td><td>[0ED6 | 00BD]</td><td>[0ED6 | 0020 00BD]</td><td>ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM</td></tr>
<tr><td>FDFC</td><td>[FFC1 FDFC | 0020 0020]</td><td>[0EF9 0F4A 0ED6 0F2D | 0020 0020 0020 0020]</td><td>RIAL SIGN</td></tr>
<tr><td>FE49</td><td>[0211 | 0020]</td><td>[0209 | 0020 005E]</td><td>DASHED OVERLINE</td></tr>
<tr><td>FE4A</td><td>[0211 | 0020]</td><td>[0209 | 0020 005E]</td><td>CENTRELINE OVERLINE</td></tr>
<tr><td>FE4B</td><td>[0211 | 0020]</td><td>[0209 | 0020 005E]</td><td>WAVY OVERLINE</td></tr>
<tr><td>FE4C</td><td>[0211 | 0020]</td><td>[0209 | 0020 005E]</td><td>DOUBLE WAVY OVERLINE</td></tr>
<tr><td>FE70</td><td>[| 00BD]</td><td>[0209 | 0020 00BD]</td><td>ARABIC FATHATAN ISOLATED FORM</td></tr>
<tr><td>FE71</td><td>[| 00BD]</td><td>[020B | 0020 00BD]</td><td>ARABIC TATWEEL WITH FATHATAN ABOVE</td></tr>
<tr><td>FE72</td><td>[| 00BE]</td><td>[0209 | 0020 00BE]</td><td>ARABIC DAMMATAN ISOLATED FORM</td></tr>
<tr><td>FE74</td><td>[| 00C0]</td><td>[0209 | 0020 00C0]</td><td>ARABIC KASRATAN ISOLATED FORM</td></tr>
<tr><td>FE76</td><td>[| 00C2]</td><td>[0209 | 0020 00C2]</td><td>ARABIC FATHA ISOLATED FORM</td></tr>
<tr><td>FE77</td><td>[| 00C2]</td><td>[020B | 0020 00C2]</td><td>ARABIC FATHA MEDIAL FORM</td></tr>
<tr><td>FE78</td><td>[| 00C4]</td><td>[0209 | 0020 00C4]</td><td>ARABIC DAMMA ISOLATED FORM</td></tr>
<tr><td>FE79</td><td>[| 00C4]</td><td>[020B | 0020 00C4]</td><td>ARABIC DAMMA MEDIAL FORM</td></tr>
<tr><td>FE7A</td><td>[| 00C6]</td><td>[0209 | 0020 00C6]</td><td>ARABIC KASRA ISOLATED FORM</td></tr>
<tr><td>FE7B</td><td>[| 00C6]</td><td>[020B | 0020 00C6]</td><td>ARABIC KASRA MEDIAL FORM</td></tr>
<tr><td>FE7C</td><td>[| 00C8]</td><td>[0209 | 0020 00C8]</td><td>ARABIC SHADDA ISOLATED FORM</td></tr>
<tr><td>FE7D</td><td>[| 00C8]</td><td>[020B | 0020 00C8]</td><td>ARABIC SHADDA MEDIAL FORM</td></tr>
<tr><td>FE7E</td><td>[| 00CA]</td><td>[0209 | 0020 00CA]</td><td>ARABIC SUKUN ISOLATED FORM</td></tr>
<tr><td>FE7F</td><td>[| 00CA]</td><td>[020B | 0020 00CA]</td><td>ARABIC SUKUN MEDIAL FORM</td></tr>
<tr><td>FF5F</td><td>[FFC1 FF5F | 0020 0020]</td><td>[FFC0 A985 | 0020 0020]</td><td>FULLWIDTH LEFT WHITE PARENTHESIS</td></tr>
<tr><td>FF60</td><td>[FFC1 FF60 | 0020 0020]</td><td>[FFC0 A986 | 0020 0020]</td><td>FULLWIDTH RIGHT WHITE PARENTHESIS</td></tr>
<tr><td>FFE3</td><td>[0210 | 0020]</td><td>[0209 | 0020 005A]</td><td>FULLWIDTH MACRON</td></tr>
</table>
</body></html>

View File

@ -1,742 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/GenOverlap.java,v $
* $Date: 2005/04/06 08:48:16 $
* $Revision: 1.13 $
*
*******************************************************************************
*/
package com.ibm.text.UCA;
import java.util.*;
import java.io.*;
import com.ibm.text.UCD.*;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UTF16;
public class GenOverlap implements UCD_Types, UCA_Types {
// completes: CEList -> the string whose full collation-element list it is (filled by addString).
static Map completes = new TreeMap();
// back: string -> its full CEList (filled by addString).
static Map back = new HashMap();
// initials: leading sub-list currCEList.start(i), 1 <= i < length -> Set of strings whose CEList begins with it.
static Map initials = new HashMap();
// Shared scratch buffer that collation elements are written into (collator.getCEs, UCAContents.next).
static int[] ces = new int[50];
// Collator being examined; assigned from the collatorIn argument of validateUCA, test, generateRevision and checkHash.
static UCA collator;
// Character database accessor obtained from UCD.make().
static UCD ucd;
// Normalizers constructed with Normalizer.NFD / Normalizer.NFKD and the collator's UCD version.
static Normalizer nfd;
static Normalizer nfkd;
/**
 * Cross-checks the collator's table against compatibility decompositions.
 *
 * For every represented code point whose decomposition type is at least
 * UCD.COMPATIBILITY, the CE list obtained for the character itself (without
 * decomposition) is compared with the CE list of its NFKD decomposition after
 * tertiary remapping. Each mismatch is printed to System.out as the code point
 * name followed by the two lists.
 *
 * @param collatorIn collator to validate; also stored in the static collator
 * @throws Exception propagated from the UCD/UCA helpers
 */
public static void validateUCA(UCA collatorIn) throws Exception {
    collator = collatorIn;
    ucd = UCD.make();
    nfd = new Normalizer(Normalizer.NFD, collatorIn.getUCDVersion());
    nfkd = new Normalizer(Normalizer.NFKD, collatorIn.getUCDVersion());

    for (int cp = 0x0; cp <= 0x10FFFF; ++cp) {
        Utility.dot(cp);
        if (ucd.isRepresented(cp)) {
            final byte decompType = ucd.getDecompositionType(cp);
            if (decompType >= UCD.COMPATIBILITY) {
                // The decomposed list is computed first, then the direct one.
                final CEList viaDecomposition = getCEList(cp, nfkd.normalize(cp), true, decompType);
                final CEList direct = getCEList(UTF16.valueOf(cp), false);
                if (direct.equals(viaDecomposition)) {
                    continue;
                }
                Utility.fixDot();
                System.out.println();
                System.out.println(ucd.getCodeAndName(cp));
                System.out.println(direct);
                System.out.println(viaDecomposition);
            }
        }
    }
}
/**
 * Entry point for the overlap report: loads every (string, CE list) pair of
 * the collator into the static lookup tables via addString, prints the table
 * sizes, and then runs fullCheck().
 *
 * @param collatorIn collator to analyse; also stored in the static collator
 * @throws Exception propagated from the helpers it calls
 */
public static void test(UCA collatorIn) throws Exception {
    collator = collatorIn;
    CEList.main(null);
    System.out.println("# Overlap");
    System.out.println("# Generated " + Default.getDate());
    ucd = UCD.make();
    nfd = new Normalizer(Normalizer.NFD, collatorIn.getUCDVersion());
    nfkd = new Normalizer(Normalizer.NFKD, collatorIn.getUCDVersion());
    UCA.UCAContents contents = collator.getContents(UCA.FIXED_CE, nfd);

    // store data for faster lookup
    System.out.println("# Gathering Data");
    int[] lengthHolder = new int[1];
    for (int progress = 0; ; ++progress) {
        // The progress dot is emitted before each fetch, including the final null one.
        Utility.dot(progress);
        String entry = contents.next(ces, lengthHolder);
        if (entry == null) {
            break;
        }
        addString(entry, new CEList(ces, 0, lengthHolder[0]));
    }

    /*
    for (int cp = 0x10000; cp <= 0x10FFFF; ++cp) {
    if (!ucd.isRepresented(cp)) continue;
    byte decompType = ucd.getDecompositionType(cp);
    if (decompType >= UCD.COMPATIBILITY) {
    String decomp = nfkd.normalize(cp);
    CEList celist = getCEList(cp, decomp, true, decompType);
    addString(decomp, celist);
    System.out.println("Adding: " + ucd.getCodeAndName(cp) + "\t" + celist);
    }
    }
    */

    Utility.fixDot();
    System.out.println("# Completes Count: " + completes.size());
    System.out.println("# Initials Count: " + initials.size());
    System.out.println("# Writing Overlaps");
    // simpleList();
    fullCheck();
}
/**
 * Registers a string and its CE list in the three static lookup tables.
 *
 * back maps the string to its list, completes maps the list to the string,
 * and for every leading sub-list of length 1 .. length-1 the string is added
 * to that sub-list's bag in initials.
 *
 * @param s the string being registered
 * @param currCEList the complete CE list of s
 */
public static void addString(String s, CEList currCEList) {
    back.put(s, currCEList);
    completes.put(currCEList, s);

    final int fullLength = currCEList.length();
    for (int prefixLength = 1; prefixLength < fullLength; ++prefixLength) {
        CEList prefix = currCEList.start(prefixLength);
        Object existing = initials.get(prefix);
        Set owners;
        if (existing != null) {
            owners = (Set) existing;
        } else {
            owners = new TreeSet();
            initials.put(prefix, owners);
        }
        owners.add(s);
    }
}
/**
 * Prints, for every complete CE list that is also a leading sub-list of some
 * other string's CE list, the owning string followed by a numbered list of
 * the longer strings; finishes with the number of such hits.
 */
static void simpleList() {
    int progress = 0;
    int hits = 0;
    for (Iterator entries = completes.entrySet().iterator(); entries.hasNext();) {
        Utility.dot(progress++);
        Map.Entry entry = (Map.Entry) entries.next();
        CEList key = (CEList) entry.getKey();
        String val = (String) entry.getValue();

        // see if the ces for the current element are the start of something else
        Set longer = (Set) initials.get(key);
        if (longer == null) {
            continue;
        }
        Utility.fixDot();
        hits++;
        System.out.println("Possible Overlap: ");
        System.out.println(" " + ucd.getCodeAndName(val));
        System.out.println("\t" + key);

        int ordinal = 0;
        for (Iterator candidates = longer.iterator(); candidates.hasNext(); ++ordinal) {
            String match = (String) candidates.next();
            CEList matchList = (CEList) back.get(match);
            System.out.println(ordinal + ". " + ucd.getCodeAndName(match));
            System.out.println("\t" + matchList);
        }
    }
    System.out.println("# Found Count: " + hits);
}
// When true, matchWhole prints a depth-indented trace of every match attempt to System.out.
static boolean PROGRESS = false;
/**
 * Writes the overlap report for everything registered through addString.
 *
 * For each complete CE list of length two or more, matchWhole is asked
 * whether the same CE sequence can be produced by a different string
 * sequence. Hits are written as table rows to "Overlap.html" and as plain
 * names to "Overlap.txt", both in the collator's generation directory.
 *
 * Both writers are closed in a finally clause so that a failure in the middle
 * of the report does not leak the file handles, and the table is terminated
 * with a plain "</table>" because every row already closes its own "<tr>".
 *
 * @throws IOException if an output file cannot be opened
 */
static void fullCheck() throws IOException {
    PrintWriter log = null;
    PrintWriter simpleList = null;
    try {
        log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "Overlap.html", Utility.UTF8_WINDOWS);
        simpleList = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "Overlap.txt", Utility.UTF8_WINDOWS);

        Iterator it = completes.keySet().iterator();
        int counter = 0;
        int foundCount = 0;
        // One-element arrays act as in/out parameters for matchWhole.
        String [] goalChars = new String[1];
        String [] matchChars = new String[1];

        Utility.writeHtmlHeader(log, "Overlaps");
        log.print("<table>");
        while (it.hasNext()) {
            Utility.dot(counter++);
            CEList key = (CEList) it.next();
            // A single CE cannot be split, so it cannot overlap anything.
            if (key.length() < 2) continue;
            String val = (String) completes.get(key);
            goalChars[0] = "";
            matchChars[0] = "";
            if (matchWhole(val, key, 0, goalChars, matchChars)) {
                simpleList.println(ucd.getCodeAndName(val));
                goalChars[0] = val + goalChars[0]; // fix first char
                // Sanity check: both sequences must really yield the same CEs.
                if (!getCEList(goalChars[0]).equals(getCEList(matchChars[0]))) {
                    log.println("<tr><td colspan='6'>WARNING:" + getCEList(matchChars[0]) + "</td></tr>");
                }
                foundCount++;
                log.println("<tr><td>" + val + "</td>");
                log.println("<td>" + goalChars[0] + "</td>");
                log.println("<td>" + matchChars[0] + "</td>");
                log.println("<td>" + ucd.getCodeAndName(goalChars[0]) + "</td>");
                log.println("<td>" + ucd.getCodeAndName(matchChars[0]) + "</td>");
                log.println("<td>" + getCEList(goalChars[0]) + "</td></tr>");
            }
        }
        log.println("</table>Number of Overlapping characters: " + foundCount + "</body>");
    } finally {
        if (log != null) log.close();
        if (simpleList != null) simpleList.close();
    }
}
/**
 * Returns the CE list of s as computed by the collator with decomposition on.
 */
static private CEList getCEList(String s) {
    final boolean decompose = true;
    return getCEList(s, decompose);
}

/**
 * Returns the CE list of s; decomp is handed to collator.getCEs unchanged.
 * The shared ces buffer is used as scratch space.
 */
static private CEList getCEList(String s, boolean decomp) {
    final int count = collator.getCEs(s, decomp, ces);
    return new CEList(ces, 0, count);
}

/**
 * Returns the CE list of s and, when decomp is true, rewrites the tertiary
 * weight of every element through CEList.remap(originalChar, type, tertiary)
 * while keeping primary and secondary weights.
 *
 * @param originalChar code point handed to CEList.remap
 * @param s string whose collation elements are fetched
 * @param decomp passed to collator.getCEs; also enables the tertiary remap
 * @param type decomposition type handed to CEList.remap
 */
static private CEList getCEList(int originalChar, String s, boolean decomp, byte type) {
    final int count = collator.getCEs(s, decomp, ces);
    if (!decomp) {
        return new CEList(ces, 0, count);
    }
    for (int k = 0; k < count; ++k) {
        int primary = UCA.getPrimary(ces[k]);
        int secondary = UCA.getSecondary(ces[k]);
        int tertiary = CEList.remap(originalChar, type, UCA.getTertiary(ces[k]));
        ces[k] = UCA.makeKey(primary, secondary, tertiary);
    }
    return new CEList(ces, 0, count);
}
/**
* Recursively searches for a sequence of registered strings whose CE lists,
* concatenated, reproduce the CE list <code>goal</code>.
*
* On success the matched strings are prepended to otherChars[0]; goalChars[0]
* is only modified through the recursive call in condition 2, where the two
* arrays swap roles.
*
* @param goalStr string associated with goal; only used in the PROGRESS trace
* @param goal CE list that still has to be covered
* @param depth recursion depth; the search gives up once depth exceeds 5
* @param goalChars one-element in/out accumulator for the "goal" side
* @param otherChars one-element in/out accumulator for the "other" side
* @return true if a covering sequence was found, false otherwise
*/
static boolean matchWhole(String goalStr, CEList goal, int depth, String[] goalChars, String[] otherChars) {
if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "Trying: " + ucd.getCodeAndName(goalStr) + ", " + goal);
// to stop infinite loops, we limit the depth to 5
if (depth > 5) {
if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "stack exhausted");
return false;
}
String match;
// There are 3 possible conditions. Any of which work.
// To eliminate double matches at the top level, we test depth > 0
if (depth > 0) {
// Condition 1.
// we have an exact match
match = (String) completes.get(goal);
if (match != null) {
if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "Matches Exactly: " + ucd.getCodeAndName(match));
otherChars[0] = match + otherChars[0];
if (PROGRESS) System.out.println(Utility.repeat(". ", depth)
+ ucd.getCode(goalChars[0])
+ " / " + ucd.getCode(otherChars[0])
);
return true;
}
// Condition 2
// this whole string matches some initial portion of another string
// AND the remainder of that other string also does a matchWhole.
// Example: if we get the following, we search for a match to "de"
// abc...
// abcde
// If we find a match, we append to the strings, the string for abc
// and the one for abcde
Set probe = (Set) initials.get(goal);
if (probe != null) {
Iterator it2 = probe.iterator();
while (it2.hasNext()) {
match = (String) it2.next();
if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "Matches Longer: " + ucd.getCodeAndName(match)
+ "\t\tswitching");
// trail = the part of match's CE list that follows the goal-length prefix
CEList trail = ((CEList) back.get(match)).end(goal.length());
// Roles are swapped: the leftover now has to be covered from the goal side,
// so the recursive call receives otherChars as its goalChars and vice versa.
boolean doesMatch = matchWhole(match, trail, depth+1, otherChars, goalChars);
if (doesMatch) {
otherChars[0] = match + otherChars[0];
if (PROGRESS) System.out.println(Utility.repeat(". ", depth)
+ ucd.getCode(goalChars[0])
+ " / " + ucd.getCode(otherChars[0])
);
return true;
}
}
}
}
// Condition 3
// the first part of this string matches a whole other string
// and the remainder of this string also does a matchWhole
// Example: if we get the following, we search for a match to "de"
// abcde..
// abc..
// if we find a match
// Longest leading sub-list is tried first (i counts down from length - 1 to 1).
for (int i = goal.length() - 1; i > 0; --i) {
CEList first = goal.start(i);
match = (String) completes.get(first);
if (match != null) {
if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "Matches Shorter: " + ucd.getCodeAndName(match));
// The remainder goal.end(i) has no string of its own, hence the empty goalStr.
boolean doesMatch = matchWhole("", goal.end(i), depth+1, goalChars, otherChars);
if (doesMatch) {
otherChars[0] = match + otherChars[0];
if (PROGRESS) System.out.println(Utility.repeat(". ", depth)
+ ucd.getCode(goalChars[0])
+ " / " + ucd.getCode(otherChars[0])
);
return true;
}
}
}
// if we get this far, we failed.
return false;
}
/**
 * Convenience entry point: runs generateRevision(collatorIn, true).
 * The doMax == false variant is deliberately not run.
 *
 * @param collatorIn collator to analyse
 * @throws Exception propagated from generateRevision(UCA, boolean)
 */
public static void generateRevision (UCA collatorIn) throws Exception {
    final boolean doMax = true;
    generateRevision(collatorIn, doMax);
}
/**
* Compares the collator's existing CE list of every table entry ("old") with
* a list rebuilt code point by code point from NFKD decompositions ("new"),
* and writes the differences plus the resulting collisions to
* "UCA-old-vs-new.txt" (or "UCA-old-vs-new-MAX.txt" when doMax is true) in
* the collator's generation directory.
*
* @param collatorIn collator to analyse; also stored in the static collator
* @param doMax when true, every rebuilt CE after the first of a compatibility
* decomposition gets tertiary 0x1F instead of the CEList.remap value
* @throws Exception propagated from the helpers it calls
*/
public static void generateRevision (UCA collatorIn, boolean doMax) throws Exception {
collator = collatorIn;
CEList.main(null);
System.out.println("# Generate");
System.out.println("# Generated " + Default.getDate());
ucd = UCD.make();
nfd = new Normalizer(Normalizer.NFD, collatorIn.getUCDVersion());
nfkd = new Normalizer(Normalizer.NFKD, collatorIn.getUCDVersion());
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
// store data for faster lookup
System.out.println("# Gathering Data");
int counter = 0;
int[] lenArray = new int[1];
// list: Pair(newList, Pair(str, oldList)) for every entry whose lists differ
Set list = new TreeSet();
// *Collisions: CE list -> first string seen with it; *Problems: later string -> Pair(first string, CE list)
Map newCollisions = new HashMap();
Map oldCollisions = new HashMap();
Map newProblems = new TreeMap();
Map oldProblems = new TreeMap();
// Stand-in used whenever a list is empty, so that at(0) further down always has an element.
CEList nullCEList = new CEList(new int[1]);
while (true) {
Utility.dot(counter++);
String str = cc.next(ces, lenArray);
if (str == null) break;
int len = lenArray[0];
CEList oldList = new CEList(ces, 0, len);
CEList newList = new CEList(ces,0,0);
int cp;
// Walk str by code point; the shared ces buffer is reused for each one.
for (int i = 0; i < str.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(str, i);
// debugging hook
if (0xFF3F == cp) {
System.out.println("debug");
}
boolean mashLast = false;
if (!nfkd.isNormalized(cp)) {
String decomp = nfkd.normalize(cp);
String canon = nfd.normalize(cp);
len = collator.getCEs(decomp, true, ces);
// Only compatibility decompositions (NFKD differs from NFD) are rewritten.
if (!decomp.equals(canon)) {
byte type = ucd.getDecompositionType(cp);
for (int j = 0; j < len; ++j) {
// First code point of str whose decomposition starts with a space gets primary 0x20A.
int p = (i == 0 && decomp.length() > 1 && decomp.charAt(0) == ' ' ? 0x20A : UCA.getPrimary(ces[j]));
int s = UCA.getSecondary(ces[j]);
// A CE with both a primary and a non-0x20 secondary is split in two:
// (p, 0x20, t) followed by (0, s, t).
boolean needsFix = (s != 0x20 && p != 0);
if (needsFix) ++len;
int t = (doMax && j > 0 ? 0x1F : CEList.remap(cp, type, UCA.getTertiary(ces[j])));
if (needsFix) {
ces[j++] = UCA.makeKey(p, 0x20, t); // Set Extra
// NOTE(review): len was already incremented, so this shifts len - j elements and
// writes up to index len; it relies on ces having spare capacity — confirm.
System.arraycopy(ces, j, ces, j+1, len - j); // Insert HOLE!
p = 0;
}
ces[j] = UCA.makeKey(p, s, t);
}
}
} else {
len = collator.getCEs(UTF16.valueOf(cp), true, ces);
}
CEList inc = new CEList(ces, 0, len);
// debugging output for two specific code points
if (cp == 0xFF71 || cp == 0xFF67) {
System.out.println(" String: " + ucd.getCodeAndName(cp));
System.out.println(" Type: " + ucd.getDecompositionTypeID(cp));
System.out.println(" xxx: " + inc);
}
newList = newList.append(inc);
}
if (newList.length() == 0) newList = nullCEList;
if (oldList.length() == 0) oldList = nullCEList;
if (!newList.equals(oldList)) {
/*
System.out.println("String: " + ucd.getCodeAndName(str));
System.out.println("\tOld: " + oldList);
System.out.println("\tNew: " + newList);
*/
list.add(new Pair(newList, new Pair(str, oldList)));
}
// check for collisions
if (str.equals("\u206F")) {
System.out.println("debug");
}
Object probe = newCollisions.get(newList);
if (probe == null) {
newCollisions.put(newList, str);
} else {
newProblems.put(str, new Pair((String)probe, newList));
}
probe = oldCollisions.get(oldList);
if (probe == null) {
oldCollisions.put(oldList, str);
} else {
oldProblems.put(str, new Pair((String)probe, oldList));
}
}
// Partition the colliding strings into new-only, old-only and present in both.
Set newKeys = new TreeSet(newProblems.keySet());
Set oldKeys = new TreeSet(oldProblems.keySet());
Set joint = new TreeSet(newKeys);
joint.retainAll(oldKeys);
newKeys.removeAll(joint);
oldKeys.removeAll(joint);
// NOTE(review): log is not closed if anything below throws.
PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt"), Utility.UTF8_WINDOWS);
Iterator it = list.iterator();
int last = -1;
while (it.hasNext()) {
Utility.dot(counter++);
Pair value = (Pair) it.next();
CEList newList = (CEList)value.first;
// A blank line separates groups that start with a different primary weight.
int cur = UCA.getPrimary(newList.at(0));
if (cur != last) {
log.println();
last = cur;
}
Pair v2 = (Pair) value.second;
String ss = (String)v2.first;
// NOTE(review): charAt(0) yields a UTF-16 code unit, not a code point — confirm this is intended for supplementary characters.
log.println(ucd.getCodeAndName(ss) + "\t\t" + ucd.getDecompositionTypeID(ss.charAt(0)));
log.println("\tnew:\t" + value.first);
log.println("\told:\t" + v2.second);
}
/*
log.println();
log.println("New Collisions: " + newKeys.size());
it = newKeys.iterator();
while (it.hasNext()) {
String key = (String) it.next();
CEList cel = (CEList) newProblems.get(key);
String other = (String) newCollisions.get(cel);
log.println(ucd.getCodeAndName(key) + " collides with " + ucd.getCodeAndName(other));
log.println("\t" + cel);
}
log.println("Removed Collisions: " + oldKeys.size());
it = oldKeys.iterator();
while (it.hasNext()) {
String key = (String) it.next();
CEList cel = (CEList) oldProblems.get(key);
String other = (String) oldCollisions.get(cel);
log.println(ucd.getCodeAndName(key) + " collides with " + ucd.getCodeAndName(other));
log.println("\t" + cel);
}
*/
showCollisions(log, "New Collisions:", newKeys, newProblems);
showCollisions(log, "Old Collisions:", oldKeys, oldProblems);
showCollisions(log, "In Both:", joint, oldProblems);
log.close();
}
/**
 * Writes one collision section to the log: a blank line, the title directly
 * followed by the number of entries, then every collision ordered by its
 * Pair(CE list, description), with a blank line whenever the primary weight
 * of the first collation element changes. The log is flushed at the end.
 *
 * @param log destination writer
 * @param title section heading; the entry count is appended without a separator
 * @param bad strings to report
 * @param probs maps each string in bad to Pair(colliding string, CE list)
 */
static void showCollisions(PrintWriter log, String title, Set bad, Map probs) {
    log.println();
    log.println(title + bad.size());

    // Re-key by CE list so that the output is grouped by collation order.
    Set ordered = new TreeSet();
    for (Iterator keys = bad.iterator(); keys.hasNext();) {
        String key = (String) keys.next();
        Pair problem = (Pair) probs.get(key);
        String other = (String) problem.first;
        CEList cel = (CEList) problem.second;
        if (key.equals("\u0001")) {
            System.out.println("debug");
        }
        String description = ucd.getCodeAndName(key) + ",\t" + ucd.getCodeAndName(other);
        ordered.add(new Pair(cel, description));
    }

    int previousPrimary = -1;
    for (Iterator rows = ordered.iterator(); rows.hasNext();) {
        Pair row = (Pair) rows.next();
        CEList cel = (CEList) row.first;
        int primary = UCA.getPrimary(cel.at(0));
        if (primary != previousPrimary) {
            previousPrimary = primary;
            log.println();
        }
        log.println("Collision between: " + row.second);
        log.println("\t" + row.first);
    }
    log.flush();
}
/**
 * Measures how primary weights collide when hashed modulo a fixed table size
 * and writes the result to "checkstringsearchhash.html" in the collator's
 * generation directory.
 *
 * For every allocated, NFKD-normalized, non-uppercase code point the distinct
 * primaries of its collation elements are counted per script and overall
 * (the UNUSED_SCRIPT slot is used for the overall totals). The report has one
 * summary row per script and a detail table for every Latin bucket that
 * received more than one UTF-16 code unit.
 *
 * The normalizers are now created before nfd is handed to
 * collator.getContents (previously it was passed while still unassigned or
 * left over from an earlier call), and the report writer is closed in a
 * finally clause.
 *
 * @param collatorIn collator to analyse; also stored in the static collator
 * @throws Exception propagated from the helpers it calls
 */
public static void checkHash(UCA collatorIn) throws Exception {
    collator = collatorIn;
    System.out.println("# Check Hash");
    System.out.println("# Generated " + Default.getDate());
    ucd = UCD.make();
    // Must precede getContents(), which receives nfd.
    nfd = new Normalizer(Normalizer.NFD, collatorIn.getUCDVersion());
    nfkd = new Normalizer(Normalizer.NFKD, collatorIn.getUCDVersion());
    UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
    int tableLength = 257;
    /*
    Candidate table sizes:
    257 263 269 271 277 281 283 293 307 311 313 317
    331 337 347 349 353 359 367 373 379 383 389 397
    401 409 419 421 431 433 439 443 449 457 461 463
    467 479 487 491 499 503 509 521 523 541 547 557
    563 569 571 577 587 593 599 601 607 613 617 619
    631 641 643 647 653 659 661 673 677 683 691 701
    709 719 727 733 739 743 751 757 761 769 773 787
    797 809 811 821 823 827 829 839 853 857 859 863
    877 881 883 887 907 911 919 929 937 941 947 953
    967 971 977 983 991 997
    */
    // collisions[script][bucket] = number of distinct primaries hashing to bucket;
    // repeats[script] remembers which primaries were already counted.
    int [][] collisions = new int[LIMIT_SCRIPT][];
    BitSet[] repeats = new BitSet[LIMIT_SCRIPT];
    for (int i = 0; i < collisions.length; ++i) {
        collisions[i] = new int[tableLength];
        repeats[i] = new BitSet();
    }
    int counter = 0;
    int[] lenArray = new int[1];
    // Disabled alternative that gathers the data from the collator's own table.
    if (false) while (true) {
        Utility.dot(counter++);
        String s = cc.next(ces, lenArray);
        if (s == null) break;
        if (UTF16.countCodePoint(s) != 1) continue; // skip ligatures
        int cp = UTF16.charAt(s, 0);
        if (!nfkd.isNormalized(cp)) continue;
        int script = ucd.getScript(cp);
        int len = lenArray[0];
        for (int i = 0; i < len; ++i) {
            int prim = UCA.getPrimary(ces[i]);
            int hash = prim % tableLength;
            if (!repeats[script].get(prim)) {
                ++collisions[script][hash];
                repeats[script].set(prim);
            } else {
                System.out.println("Skipping: " + prim + " in " + ucd.getCodeAndName(cp));
            }
            if (!repeats[UNUSED_SCRIPT].get(prim)) {
                ++collisions[UNUSED_SCRIPT][hash];
                repeats[UNUSED_SCRIPT].set(prim);
            }
        }
    }
    // latin[bucket] collects the Latin characters that contributed a new primary to the bucket.
    String [] latin = new String[tableLength];
    for (int i = 0; i < latin.length; ++i) {
        latin[i] = "";
    }
    // NOTE(review): the bound excludes U+10FFFF itself — confirm that is intended.
    for (int cp = 0; cp < 0x10FFFF; ++cp) {
        Utility.dot(counter++);
        if (!ucd.isAllocated(cp)) continue;
        if (!nfkd.isNormalized(cp)) continue;
        if (ucd.getCategory(cp) == Lu) continue; // don't count case
        String scp = UTF16.valueOf(cp);
        int len = collator.getCEs(scp, true, ces);
        int script = ucd.getScript(cp);
        for (int i = 0; i < len; ++i) {
            int prim = UCA.getPrimary(ces[i]);
            int hash = prim % tableLength;
            if (!repeats[script].get(prim)) {
                ++collisions[script][hash];
                repeats[script].set(prim);
                if (script == LATIN_SCRIPT) latin[hash] += scp;
            }
            if (!repeats[UNUSED_SCRIPT].get(prim)) {
                ++collisions[UNUSED_SCRIPT][hash];
                repeats[UNUSED_SCRIPT].set(prim);
            }
        }
    }
    System.out.println("Data Gathered");
    PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "checkstringsearchhash.html", Utility.UTF8_WINDOWS);
    try {
        Utility.writeHtmlHeader(log, "Check Hash");
        log.println("<h1>Collisions</h1>");
        log.println("<p>Shows collisions among primary values when hashed to table size = " + tableLength + ".");
        log.println("Note: All duplicate primarys are removed: all non-colliding values are removed.</p>");
        log.println("<table><tr><th>Script</th><th>Sum</th><th>Average</th><th>Std Dev.</th></tr>");
        for (byte i = 0; i < collisions.length; ++i) {
            if (i == UNUSED_SCRIPT) continue;
            showCollisions(log, ucd.getScriptID_fromIndex(i), collisions[i]);
        }
        showCollisions(log, "All", collisions[UNUSED_SCRIPT]);
        log.println("</table>");
        log.println("<p>Details of collisions for Latin</p>");
        for (int i = 0; i < latin.length; ++i) {
            if (latin[i].length() < 2) continue;
            //if (UTF16.countCodePoint(latin[i]) < 2) continue;
            int cp2;
            log.println("<table>");
            for (int j = 0; j < latin[i].length(); j += UTF16.getCharCount(cp2)) {
                cp2 = UTF16.charAt(latin[i], j);
                String scp2 = UTF16.valueOf(cp2);
                CEList clist = collator.getCEList(scp2, true);
                log.println("<tr><td>" + scp2 + "</td><td>" + clist + "</td><td>" + ucd.getCodeAndName(cp2) + "</td></tr>");
            }
            log.println("</table><br>");
        }
    } finally {
        log.close();
    }
}
// Formatters for the statistics table: two decimals for averages/deviations, none for sums.
static java.text.NumberFormat nf = new java.text.DecimalFormat("#,##0.00");
static java.text.NumberFormat nf0 = new java.text.DecimalFormat("#,##0");

/**
 * Writes one HTML table row with the sum, average and (population) standard
 * deviation of the non-zero entries of curr.
 *
 * Zero entries are ignored entirely. When there is no non-zero entry the
 * average and deviation are reported as 0 rather than the NaN that a 0/0
 * division would produce.
 *
 * @param log destination writer
 * @param title text of the first cell
 * @param curr per-bucket counts
 */
static void showCollisions(PrintWriter log, String title, int[] curr) {
    double sum = 0;
    int count = 0;
    for (int j = 0; j < curr.length; ++j) {
        if (curr[j] == 0) continue;
        sum += curr[j];
        ++count;
    }
    // Guard against 0/0 when no bucket is occupied.
    double average = (count == 0) ? 0 : sum / count;
    double sd = 0;
    for (int j = 0; j < curr.length; ++j) {
        if (curr[j] == 0) continue;
        double deviation = curr[j] - average;
        sd += deviation * deviation;
    }
    sd = (count == 0) ? 0 : Math.sqrt(sd / count);
    log.println("<tr><td>" + title
        + "</td><td align='right'>" + nf0.format(sum)
        + "</td><td align='right'>" + nf.format(average)
        + "</td><td align='right'>" + nf.format(sd)
        + "</td></tr>");
}
/**
 * Writes "ListCyrillic.txt" into the generation directory of collatorIn.
 *
 * Every represented BMP character in the Cyrillic script is canonically
 * decomposed and all nonspacing marks (category Mn) are stripped. The
 * stripped forms are listed with a "# " prefix; the unstripped decompositions
 * that differed from their stripped form are listed with a "### " prefix.
 * Both lists are ordered by collatorIn, and each line ends with the name of
 * the first character with the "CYRILLIC " prefix removed.
 *
 * Fixes relative to the earlier version: the directory comes from collatorIn
 * instead of the possibly-unset static collator; every Mn is removed (the old
 * loop skipped the character following each removal); the result of
 * Utility.replace is actually used; the writer is closed even on failure.
 *
 * @param collatorIn collator used for ordering and for the output directory
 * @throws IOException if the output file cannot be opened
 */
public static void listCyrillic(UCA collatorIn) throws IOException {
    PrintWriter log = Utility.openPrintWriter(collatorIn.getUCA_GEN_DIR(), "ListCyrillic.txt", Utility.UTF8_WINDOWS);
    try {
        Set set = new TreeSet(collatorIn);
        Set set2 = new TreeSet(collatorIn);
        ucd = UCD.make();
        nfd = new Normalizer(Normalizer.NFD, collatorIn.getUCDVersion());
        // char loop variable: the bound must stay below 0xFFFF or the counter would wrap.
        for (char i = 0; i < 0xFFFF; ++i) {
            Utility.dot(i);
            if (!ucd.isRepresented(i)) continue;
            if (ucd.getScript(i) != CYRILLIC_SCRIPT) continue;
            String oldDecomp = nfd.normalize(String.valueOf(i));
            // Copy everything except nonspacing marks; building a new buffer avoids
            // the index skew of deleting from the string being iterated.
            StringBuffer stripped = new StringBuffer(oldDecomp.length());
            for (int j = 0; j < oldDecomp.length(); ++j) {
                char ch = oldDecomp.charAt(j);
                if (ucd.getCategory(ch) != Mn) {
                    stripped.append(ch);
                }
            }
            String decomp = stripped.toString();
            if (decomp.length() == 0) continue;
            set.add(decomp);
            if (!decomp.equals(oldDecomp)) set2.add(oldDecomp);
        }
        Iterator it = set.iterator();
        while (it.hasNext()) {
            String s = (String) it.next();
            String name = ucd.getName(s.charAt(0));
            // Strings are immutable: the stripped name is the return value.
            name = Utility.replace(name, "CYRILLIC ", "");
            log.println("# " + s + " <> XXX ; # " + name);
        }
        it = set2.iterator();
        while (it.hasNext()) {
            String s = (String) it.next();
            String name = ucd.getName(s.charAt(0));
            name = Utility.replace(name, "CYRILLIC ", "");
            log.println("### " + s + " <> XXX ; # " + name);
        }
    } finally {
        log.close();
    }
}
}

View File

@ -1,46 +0,0 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta http-equiv="Content-Language" content="en-us">
<meta name="VI60_defaultClientScript" content="JavaScript">
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
<meta name="keywords" content="Unicode Standard, technical reports">
<meta name="ProgId" content="FrontPage.Editor.Document">
<title>Technical Reports</title>
<link rel="stylesheet" type="text/css"
href="http://www.unicode.org/webscripts/standard_styles.css">
<script language="Javascript" src="http://www.unicode.org/webscripts/commonHeader.js"></script>
</head>
<body text="#330000" topmargin="0" leftmargin="0" marginwidth="0"
marginheight="0">
<form action="http://www.unicode.org/webscripts/POST">
<table width="100%" cellpadding="0" cellspacing="0" border="0">
<tr>
<td colspan="2">
<table width="100%" border="0" cellpadding="0" cellspacing="0">
<tr>
<td class="icon"><img border="0"
src="http://www.unicode.org/webscripts/logo60s2.gif"
align="middle" alt="[Unicode]" width="34" height="33">&nbsp;&nbsp;Charts</td>
<td class="bar"><a href="http://www.unicode.org" class="bar">Home</a>
| <a href="http://www.unicode.org/sitemap/" class="bar">Site Map</a>
| <a href="http://www.unicode.org/search" class="bar">Search </a><script language="Javascript" src="http://www.unicode.org/webscripts/commonSearch.js"></script><noscript><a
href="http://www.unicode.org/webscripts/quick_links.html"
class="bar" target="_blank">Goto</a></noscript></td>
</tr>
</table>
</td>
</tr>
<tr>
<td colspan="2" class="gray">&nbsp;</td>
</tr>
<tr>
<td>
<h1>Collation Charts</h1>
</td>
</tr>
<tr><td valign="top" class="navCol">

View File

@ -1,8 +0,0 @@
<hr width="50%">
<p align="center"><script language="Javascript" src="http://www.unicode.org/webscripts/lastModified.js"></script>
</blockquote>
</td>
</table>
</form>
</body>
</html>

View File

@ -1,438 +0,0 @@
package com.ibm.text.UCA;
import com.ibm.text.UCD.UCD_Types;
import com.ibm.text.utility.Utility;
/**
* For generation of Implicit CEs
* @author Davis
*
* Cleaned up so that changes can be made more easily.
* Old values:
# First Implicit: E26A792D
# Last Implicit: E3DC70C0
# First CJK: E0030300
# Last CJK: E0A9DD00
# First CJK_A: E0A9DF00
# Last CJK_A: E0DE3100
*/
public class Implicit implements UCD_Types {
/**
* constants
*/
// Enables the parameter dump at the start of the six-argument constructor.
static final boolean DEBUG = false;
// Masks selecting the most significant and least significant byte of a 32-bit value held in a long.
static final long topByte = 0xFF000000L;
static final long bottomByte = 0xFFL;
// Mask reducing a long to its low 32 bits (used to view an int implicit CE as unsigned).
static final long fourBytes = 0xFFFFFFFFL;
// Largest raw value exercised by main() and accepted by show().
// NOTE(review): 0x220001 equals 2 * 0x110000 + 1; confirm how that matches the "+ 2" in the note below.
static final int MAX_INPUT = 0x220001; // 2 * Unicode range + 2
/**
 * Self-test driver. Builds an Implicit for lead bytes 0xE0..0xE4 and, for
 * every raw value 0..MAX_INPUT, checks that raw -> implicit -> raw
 * round-trips and that every value strictly between two consecutive
 * implicits is rejected (maps to -1). Sample values around each change of
 * the top byte, and around 0x10000 and 0x110000, are printed for
 * spot-checking. Any exception is printed as a stack trace.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    System.out.println("Start");
    try {
        final Implicit implicit = new Implicit(0xE0, 0xE4);
        //int x = implicit.getRawImplicit(0xF810);
        implicit.getRawFromImplicit(0xE20303E7);
        final int gap4 = implicit.getGap4();
        System.out.println("Gap4: " + gap4);
        // Only referenced by the disabled gap checks below.
        int gap3 = implicit.getGap3();
        int minTrail = implicit.getMinTrail();
        int maxTrail = implicit.getMaxTrail();

        long previous = 0;
        for (int raw = 0; raw <= MAX_INPUT; ++raw) {
            final long current = implicit.getImplicitFromRaw(raw) & fourBytes;

            // check that it round-trips AND that all intervening ones are illegal
            if (implicit.getRawFromImplicit((int) current) != raw) {
                implicit.throwError("No roundtrip", raw);
            }
            if (previous != 0) {
                long between = previous + 1;
                while (between < current) {
                    // raise an error if it *doesn't* find an error
                    if (implicit.getRawFromImplicit((int) between) != -1) {
                        implicit.throwError("Fails to recognize illegal", between);
                    }
                    ++between;
                }
            }

            // now do other consistency checks
            final long previousBottom = previous & bottomByte;
            final long currentBottom = current & bottomByte;
            final boolean topChanged = (previous & topByte) != (current & topByte);
            /*
            long gap = current - previous;
            if (currentBottom != 0) { // if we are a 4-byte
            // gap has to be at least gap4
            // and gap from minTrail, maxTrail has to be at least gap4
            if (gap <= gap4) implicit.throwError("Failed gap4 between", raw);
            if (currentBottom < minTrail + gap4) implicit.throwError("Failed gap4 before", raw);
            if (currentBottom > maxTrail - gap4) implicit.throwError("Failed gap4 after", raw);
            } else { // we are a three-byte
            gap = gap >> 8; // move gap down for comparison.
            long current3Bottom = (current >> 8) & bottomByte;
            if (gap <= gap3) implicit.throwError("Failed gap3 between ", raw);
            if (current3Bottom < minTrail + gap3) implicit.throwError("Failed gap3 before", raw);
            if (current3Bottom > maxTrail - gap3) implicit.throwError("Failed gap3 after", raw);
            }
            */

            // print out some values for spot-checking
            if (topChanged || raw == 0x10000 || raw == 0x110000) {
                for (int k = raw - 3; k < raw; ++k) {
                    implicit.show(k);
                }
                if (raw != 0) {
                    if (previousBottom == 0 && currentBottom != 0) {
                        System.out.println("+ primary boundary, 4-byte CE's below");
                    } else if (topChanged) {
                        System.out.println("+ primary boundary");
                    }
                }
                for (int k = raw; k <= raw + 2; ++k) {
                    implicit.show(k);
                }
                System.out.println("...");
            }
            previous = current;
        }
        for (int k = MAX_INPUT - 2; k <= MAX_INPUT; ++k) {
            implicit.show(k);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        System.out.println("End");
    }
}
/**
 * Always throws IllegalArgumentException with the message
 * title TAB hex(cp) TAB hex(implicit of cp as unsigned 32 bits).
 */
private void throwError(String title, int cp) {
    StringBuffer message = new StringBuffer();
    message.append(title);
    message.append("\t").append(Utility.hex(cp));
    message.append("\t").append(Utility.hex(getImplicitFromRaw(cp) & fourBytes));
    throw new IllegalArgumentException(message.toString());
}

/**
 * Always throws IllegalArgumentException with the message
 * title TAB hex(ce masked to 32 bits).
 */
private void throwError(String title, long ce) {
    StringBuffer message = new StringBuffer();
    message.append(title);
    message.append("\t").append(Utility.hex(ce & fourBytes));
    throw new IllegalArgumentException(message.toString());
}
/**
 * Prints one spot-check line: the raw value in hex, a tab, and its implicit CE
 * (masked with fourBytes) in hex. Values outside 0..MAX_INPUT are silently ignored.
 *
 * @param i raw value to display
 */
private void show(int i) {
    if (i < 0 || i > MAX_INPUT) {
        return;
    }
    System.out.println(Utility.hex(i) + "\t" + Utility.hex(getImplicitFromRaw(i) & fourBytes));
}
/**
* Layout constants, precomputed by the six-argument constructor and read by
* the raw/implicit conversion methods.
*/
// Step between consecutive final-byte values of 3-byte forms (gap3 + 1).
int final3Multiplier;
// Step between consecutive final-byte values of 4-byte forms (computed gap4 + 1).
int final4Multiplier;
// Number of distinct final-byte values used by 3-byte forms.
int final3Count;
// Number of distinct final-byte values used by 4-byte forms.
int final4Count;
// Number of values a non-final trailing byte can take (maxTrail - minTrail + 1).
int medialCount;
// Lowest lead byte; lead bytes from here up to min4Primary - 1 carry 3-byte forms.
int min3Primary;
// First lead byte that carries 4-byte forms (minPrimary + primaries3count).
int min4Primary;
// Highest lead byte (maxPrimary).
int max4Primary;
// Lowest value allowed in a trailing byte.
int minTrail;
// Highest value allowed in a trailing byte.
int maxTrail;
// Largest final-byte value actually produced for 3-byte forms.
int max3Trail;
// Largest final-byte value actually produced for 4-byte forms.
int max4Trail;
// First raw value that needs a 4-byte form; raw values below it get 3 bytes.
int min4Boundary;
/**
 * Returns the tailoring gap between adjacent final bytes of 4-byte forms.
 *
 * @return final4Multiplier minus one
 */
public int getGap4() {
    int step = final4Multiplier;
    return step - 1;
}
/**
 * Returns the tailoring gap between adjacent final bytes of 3-byte forms.
 *
 * @return final3Multiplier minus one
 */
public int getGap3() {
    int step = final3Multiplier;
    return step - 1;
}
// old comment
// we must skip all 00, 01, 02, FF bytes, so most bytes have 252 values
// we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
// we shift so that HAN all has the same first primary, for compression.
// for the 4 byte case, we make the gap as large as we can fit.
/**
* Supply parameters for generating implicit CEs, using default trail settings:
* trailing bytes range over 0x04..0xFE, the 3-byte gap is 1, and a single
* lead byte is reserved for 3-byte forms.
* @param minPrimary lowest lead byte to use
* @param maxPrimary highest lead byte to use
*/
public Implicit(int minPrimary, int maxPrimary) {
// NOTE(review): an earlier comment here said "13 is the largest 4-byte gap we can use
// without getting 2 four-byte forms", but no 13 is passed below and the 4-byte gap is
// computed by the six-argument constructor -- confirm that remark is obsolete.
this(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1);
}
/**
 * Set up to generate implicits.
 * Derives every layout constant used by getImplicitFromRaw and getRawFromImplicit:
 * the first primaries3count lead bytes carry 3-byte forms, the remaining lead bytes
 * carry 4-byte forms, and the 4-byte final-byte gap is made as large as will fit.
 *
 * @param minPrimary lowest lead byte; must be at least 0 and below maxPrimary
 * @param maxPrimary highest lead byte; must be at most 0xFF
 * @param minTrail lowest value of a trailing byte; must be at least 0 and below maxTrail
 * @param maxTrail highest value of a trailing byte; must be at most 0xFF
 * @param gap3 the gap we leave for tailoring for 3-byte forms; must not be negative
 * @param primaries3count number of 3-byte primaries we can use (normally 1); must be
 *        at least 1 and must leave at least one lead byte for the 4-byte forms
 * @throws IllegalArgumentException if a parameter is out of range or the requested
 *         layout cannot cover all raw values up to MAX_INPUT
 */
public Implicit(int minPrimary, int maxPrimary, int minTrail, int maxTrail, int gap3, int primaries3count) {
    if (DEBUG) {
        System.out.println("minPrimary: " + Utility.hex(minPrimary));
        System.out.println("maxPrimary: " + Utility.hex(maxPrimary));
        System.out.println("minTrail: " + Utility.hex(minTrail));
        System.out.println("maxTrail: " + Utility.hex(maxTrail));
        System.out.println("gap3: " + Utility.hex(gap3));
        System.out.println("primaries3count: " + primaries3count);
    }

    // some simple parameter checks
    if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) throw new IllegalArgumentException("bad lead bytes");
    if (minTrail < 0 || minTrail >= maxTrail || maxTrail > 0xFF) throw new IllegalArgumentException("bad trail bytes");
    if (primaries3count < 1) throw new IllegalArgumentException("bad three-byte primaries");
    // a negative gap makes final3Multiplier zero or negative (division by zero or nonsense counts)
    if (gap3 < 0) throw new IllegalArgumentException("bad gap3: " + gap3);
    // at least one lead byte must remain for 4-byte forms, otherwise the
    // per-primary division below fails with an ArithmeticException
    int primariesAvailable = maxPrimary - minPrimary + 1;
    if (primaries3count >= primariesAvailable) {
        throw new IllegalArgumentException("bad three-byte primaries: none left for four-byte forms");
    }

    this.minTrail = minTrail;
    this.maxTrail = maxTrail;

    min3Primary = minPrimary;
    max4Primary = maxPrimary;

    // compute constants for use later.
    // number of values we can use in trailing bytes
    // leave room for empty values between AND above, e.g. if gap = 2
    // range 3..7 => +3 -4 -5 -6 -7: so 1 value
    // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
    // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
    final3Multiplier = gap3 + 1;
    final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
    max3Trail = minTrail + (final3Count - 1) * final3Multiplier;

    // medials can use full range
    medialCount = (maxTrail - minTrail + 1);

    // find out how many values fit in each form
    int threeByteCount = medialCount * final3Count;

    // now determine where the 3/4 boundary is.
    // we use 3 bytes below the boundary, and 4 above
    int primaries4count = primariesAvailable - primaries3count;
    int min3ByteCoverage = primaries3count * threeByteCount;
    min4Primary = minPrimary + primaries3count;
    min4Boundary = min3ByteCoverage;

    // Now expand out the multiplier for the 4 bytes, and redo.
    int totalNeeded = MAX_INPUT - min4Boundary;
    int neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
    if (DEBUG) System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte);
    int neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
    if (DEBUG) System.out.println("neededPerFinalByte: " + neededPerFinalByte);
    int gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
    if (DEBUG) System.out.println("expandedGap: " + gap4);
    if (gap4 < 1) throw new IllegalArgumentException("must have larger gap4s");
    final4Multiplier = gap4 + 1;
    final4Count = neededPerFinalByte;
    max4Trail = minTrail + (final4Count - 1) * final4Multiplier;

    // sanity check: the 4-byte space must be able to hold every raw value
    if (primaries4count * medialCount * medialCount * final4Count < MAX_INPUT) {
        throw new IllegalArgumentException("internal error");
    }
    if (DEBUG) {
        System.out.println("final4Count: " + final4Count);
        // NOTE(review): this listing starts one step above minTrail, whereas max4Trail
        // above is derived from values starting at minTrail itself -- confirm which is intended.
        for (int counter = 0; counter < final4Count; ++counter) {
            int value = minTrail + (1 + counter)*final4Multiplier;
            System.out.println(counter + "\t" + value + "\t" + Utility.hex(value));
        }
    }
}
/**
 * Divides a by b and rounds the quotient up, for positive a and b.
 * Computed as 1 + (a - 1) / b with truncating integer division, so a == 0
 * with a positive b yields 1 rather than 0.
 *
 * @param a dividend
 * @param b divisor; zero causes an ArithmeticException
 * @return the rounded-up quotient
 */
static public int divideAndRoundUp(int a, int b) {
    int truncated = (a - 1) / b;
    return truncated + 1;
}
/**
 * Converts an implicit CE back into the raw integer it was generated from.
 * The CE is read as four bytes, lead byte in the top 8 bits. Lead bytes below
 * min4Primary are decoded as 3-byte forms (the fourth byte must be zero);
 * all others are decoded as 4-byte forms.
 *
 * @param implicit implicit CE, first byte at the top of the int
 * @return the raw value, or -1 if the CE is not a well-formed implicit
 */
public int getRawFromImplicit(int implicit) {
    // split into bytes, lead byte first
    int lead = (implicit >> 24) & 0xFF;
    int second = (implicit >> 16) & 0xFF;
    int third = (implicit >> 8) & 0xFF;
    int fourth = implicit & 0xFF;

    // lead and second byte have the same limits in both forms
    if (lead < min3Primary || lead > max4Primary) return -1;
    if (second < minTrail || second > maxTrail) return -1;
    int secondIndex = second - minTrail;

    int raw;
    if (lead < min4Primary) {
        // 3-byte form: third byte is the final byte, spaced by final3Multiplier
        if (fourth != 0 || third < minTrail || third > max3Trail) return -1;
        int finalOffset = third - minTrail;
        if (finalOffset % final3Multiplier != 0) return -1;
        int leadIndex = lead - min3Primary;
        raw = ((leadIndex * medialCount) + secondIndex) * final3Count
                + finalOffset / final3Multiplier;
    } else {
        // 4-byte form: third byte is a medial, fourth is the final byte
        if (third < minTrail || third > maxTrail) return -1;
        if (fourth < minTrail || fourth > max4Trail) return -1;
        int thirdIndex = third - minTrail;
        int finalOffset = fourth - minTrail;
        if (finalOffset % final4Multiplier != 0) return -1;
        int leadIndex = lead - min4Primary;
        raw = (((leadIndex * medialCount) + secondIndex) * medialCount + thirdIndex) * final4Count
                + finalOffset / final4Multiplier + min4Boundary;
    }

    // final check
    return (raw < 0 || raw > MAX_INPUT) ? -1 : raw;
}
/**
 * Generates the implicit CE for a raw integer.
 * The result is left-aligned: the lead byte sits in the top 8 bits of the int.
 * Raw values below min4Boundary get a 3-byte form (lowest byte zero); all
 * others get a 4-byte form.
 *
 * @param cp raw value, 0..MAX_INPUT
 * @return the implicit CE packed into an int
 * @throws IllegalArgumentException if cp is outside 0..MAX_INPUT or the computed
 *         lead byte leaves the range reserved for its form
 */
public int getImplicitFromRaw(int cp) {
    if (cp < 0 || cp > MAX_INPUT) {
        throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
    }
    int offset4 = cp - min4Boundary;
    if (offset4 >= 0) {
        // 4-byte form: peel off final, then two medial digits, then the lead
        int finalIndex = offset4 % final4Count;
        int rest = offset4 / final4Count;
        int thirdIndex = rest % medialCount;
        rest = rest / medialCount;
        int secondIndex = rest % medialCount;
        int leadByte = min4Primary + rest / medialCount;
        if (leadByte > max4Primary) {
            throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(leadByte));
        }
        int secondByte = minTrail + secondIndex;
        int thirdByte = minTrail + thirdIndex;
        int finalByte = minTrail + finalIndex * final4Multiplier; // spread out by the 4-byte step
        return (leadByte << 24) + (secondByte << 16) + (thirdByte << 8) + finalByte;
    }

    // 3-byte form: peel off final, then one medial digit, then the lead
    int finalIndex = cp % final3Count;
    int rest = cp / final3Count;
    int secondIndex = rest % medialCount;
    int leadByte = min3Primary + rest / medialCount;
    if (leadByte >= min4Primary) {
        throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(leadByte));
    }
    int secondByte = minTrail + secondIndex;
    int finalByte = minTrail + finalIndex * final3Multiplier; // spread out by the 3-byte step
    return (leadByte << 24) + (secondByte << 16) + (finalByte << 8);
}
/**
 * Gets an implicit CE from a code point. The code point is first remapped by
 * swapCJK and incremented by one (so that raw value 0 is never used), then the
 * resulting raw value is converted by getImplicitFromRaw.
 *
 * @param cp code point
 * @return the implicit CE, first byte at the top of the int
 */
public int getSwappedImplicit(int cp) {
    if (DEBUG) {
        System.out.println("Incoming: " + Utility.hex(cp));
    }
    // note, we add 1 so that the first value is always empty!!
    int raw = Implicit.swapCJK(cp) + 1;
    if (DEBUG) {
        System.out.println("CJK swapped: " + Utility.hex(raw));
    }
    return getImplicitFromRaw(raw);
}
/**
 * Amount added to every code point that swapCJK does not treat as CJK.
 */
static int NON_CJK_OFFSET = 0x110000;

/**
 * Function used to:
 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
 * b) bump any non-CJK characters by NON_CJK_OFFSET (0x110000).
 * The relevant blocks are:
 * A: 4E00..9FFF; CJK Unified Ideographs
 *    F900..FAFF; CJK Compatibility Ideographs
 * B: 3400..4DBF; CJK Unified Ideographs Extension A
 *    20000..XX; CJK Unified Ideographs Extension B (and others later on)
 * As long as
 *   no new B characters are allocated between 4E00 and FAFF, and
 *   no new A characters are outside of this range,
 * (very high probability) this simple code will work.
 * The reordered blocks are:
 *   Block1 is CJK
 *   Block2 is CJK_COMPAT_USED
 *   Block3 is CJK_A
 * (all contiguous, starting at 0)
 * Any other CJK (CJK_B_BASE up to CJK_B_LIMIT) keeps its normal code point.
 * Any non-CJK gets +NON_CJK_OFFSET.
 * Block1 is placed at the very start so that it will use a 3-byte form.
 * Warning: we only pick up the compatibility characters that are
 * NOT decomposed, so that block is smaller!
 *
 * @param i code point
 * @return the reordered value
 */
static int swapCJK(int i) {
    if (i < CJK_BASE) {
        // below the main block: only Extension A is remapped, to follow blocks 1 and 2
        if (i >= CJK_A_BASE && i < CJK_A_LIMIT) {
            return i - CJK_A_BASE
                    + (CJK_LIMIT - CJK_BASE)
                    + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
        }
        return i + NON_CJK_OFFSET; // non-CJK
    }

    int swapped;
    if (i < CJK_LIMIT) {
        swapped = i - CJK_BASE; // block 1 starts at zero
    } else if (i < CJK_COMPAT_USED_BASE) {
        swapped = i + NON_CJK_OFFSET;
    } else if (i < CJK_COMPAT_USED_LIMIT) {
        swapped = i - CJK_COMPAT_USED_BASE + (CJK_LIMIT - CJK_BASE); // block 2 follows block 1
    } else if (i < CJK_B_BASE) {
        swapped = i + NON_CJK_OFFSET;
    } else if (i < CJK_B_LIMIT) {
        swapped = i; // non-BMP-CJK
    } else {
        swapped = i + NON_CJK_OFFSET; // non-CJK
    }
    return swapped;
}
/**
 * Returns the lowest value used for trailing bytes.
 *
 * @return the minTrail value supplied to the constructor
 */
public int getMinTrail() {
    return this.minTrail;
}
/**
 * Returns the highest value used for trailing bytes.
 *
 * @return the maxTrail value supplied to the constructor
 */
public int getMaxTrail() {
    return this.maxTrail;
}
}

View File

@ -1,175 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $
* $Date: 2005/04/06 15:15:43 $
* $Revision: 1.20 $
*
*******************************************************************************
*/
package com.ibm.text.UCA;
import java.io.File;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.CanonicalIterator;
import com.ibm.icu.text.UTF16;
import com.ibm.text.UCD.*;
import com.ibm.text.utility.*;
/**
 * Command-line driver for the UCA tools. Each argument names one action; the
 * arguments are processed in order, and the UCA/UCD data is built lazily before
 * the first action that needs it.
 */
public class Main {
    //static final String UCDVersion = "4.0.0";

    /**
     * Option list substituted for the "ICU" argument. "short" and "noCE" are
     * toggles, so the same writers appear several times to be run under
     * different flag settings.
     */
    static final String[] ICU_FILES = {"writeCollationValidityLog", "writeFractionalUCA",
        "WriteRules", "WriteRulesXML", "writeconformance", "writeconformanceshifted",
        "short",
        "WriteRules", "WriteRulesXML", "writeconformance", "writeconformanceshifted",
        "noCE", "short",
        "WriteRules",
        "collationChart"
    };

    /**
     * Runs the requested actions.
     * Special arguments: one starting with '#' stops processing, "ICU" is replaced
     * by ICU_FILES, and "version" consumes the following argument as the UCD version.
     * An unknown argument prints a help message. "Done" is always printed at the end.
     *
     * @param args option names (case-insensitive)
     * @throws IllegalArgumentException if "version" is the last argument
     * @throws Exception whatever the invoked action throws
     */
    public static void main(String args[]) throws Exception {
        // NOTE: so far, we don't need to build the UCA with anything but the latest versions.
        // A few changes would need to be made to the code to do older versions.
        try {
            if (args.length == 0) args = new String[] {"?"}; // force the help comment
            boolean shortPrint = false;
            boolean noCE = false;

            for (int i = 0; i < args.length; ++i) {
                String arg = args[i];
                System.out.println("OPTION: " + arg);
                if (arg.length() == 0) continue; // nothing to dispatch; charAt(0) below would throw
                if (arg.charAt(0) == '#') return; // skip rest of line

                if (arg.equalsIgnoreCase("ICU")) {
                    // splice the ICU option list in front of the remaining arguments and restart
                    args = Utility.append(ICU_FILES, Utility.subarray(args, i+1));
                    i = -1;
                    continue;
                }
                if (arg.equalsIgnoreCase("version")) {
                    if (i + 1 >= args.length) {
                        throw new IllegalArgumentException("Missing value after option: " + arg);
                    }
                    Default.setUCD(args[++i]); // get next arg
                    continue;
                }
                if (WriteCollationData.collator == null) {
                    System.out.println("Building UCA");
                    String file = Utility.searchDirectory(new File(UCD_Types.BASE_DIR + "UCA\\" + Default.ucdVersion() + "\\"), "allkeys", true, ".txt");
                    WriteCollationData.collator = new UCA(file, Default.ucdVersion());
                    System.out.println("Built version " + WriteCollationData.collator.getDataVersion()
                        + "/ucd: " + WriteCollationData.collator.getUCDVersion());

                    System.out.println("Building UCD data");
                    WriteCollationData.ucd = UCD.make(WriteCollationData.collator.getUCDVersion());
                }
                if (arg.equalsIgnoreCase("GenOverlap")) GenOverlap.test(WriteCollationData.collator);
                else if (arg.equalsIgnoreCase("validateUCA")) GenOverlap.validateUCA(WriteCollationData.collator);
                //else if (arg.equalsIgnoreCase("writeNonspacingDifference")) WriteCollationData.writeNonspacingDifference();

                else if (arg.equalsIgnoreCase("collationChart")) WriteCharts.collationChart(WriteCollationData.collator);
                else if (arg.equalsIgnoreCase("scriptChart")) WriteCharts.scriptChart();
                else if (arg.equalsIgnoreCase("normalizationChart")) WriteCharts.normalizationChart();
                else if (arg.equalsIgnoreCase("caseChart")) WriteCharts.caseChart();
                else if (arg.equalsIgnoreCase("indexChart")) WriteCharts.indexChart();
                else if (arg.equalsIgnoreCase("special")) WriteCharts.special();
                else if (arg.equalsIgnoreCase("writeCompositionChart")) WriteCharts.writeCompositionChart();

                else if (arg.equalsIgnoreCase("CheckHash")) GenOverlap.checkHash(WriteCollationData.collator);
                else if (arg.equalsIgnoreCase("generateRevision")) GenOverlap.generateRevision(WriteCollationData.collator);
                else if (arg.equalsIgnoreCase("listCyrillic")) GenOverlap.listCyrillic(WriteCollationData.collator);

                else if (arg.equalsIgnoreCase("WriteRules")) WriteCollationData.writeRules(WriteCollationData.WITHOUT_NAMES, shortPrint, noCE);
                // else if (arg.equalsIgnoreCase("WriteRulesWithNames")) WriteCollationData.writeRules(WriteCollationData.WITH_NAMES);
                else if (arg.equalsIgnoreCase("WriteRulesXML")) WriteCollationData.writeRules(WriteCollationData.IN_XML, shortPrint, noCE);
                else if (arg.equalsIgnoreCase("checkDisjointIgnorables")) WriteCollationData.checkDisjointIgnorables();
                else if (arg.equalsIgnoreCase("writeContractions")) WriteCollationData.writeContractions();
                else if (arg.equalsIgnoreCase("writeFractionalUCA")) WriteCollationData.writeFractionalUCA("FractionalUCA");
                else if (arg.equalsIgnoreCase("writeConformance")) WriteCollationData.writeConformance("CollationTest_NON_IGNORABLE", UCA.NON_IGNORABLE, shortPrint);
                else if (arg.equalsIgnoreCase("writeConformanceSHIFTED")) WriteCollationData.writeConformance("CollationTest_SHIFTED", UCA.SHIFTED, shortPrint);
                else if (arg.equalsIgnoreCase("testCompatibilityCharacters")) WriteCollationData.testCompatibilityCharacters();
                else if (arg.equalsIgnoreCase("writeCollationValidityLog")) WriteCollationData.writeCollationValidityLog();
                else if (arg.equalsIgnoreCase("writeCaseExceptions")) WriteCollationData.writeCaseExceptions();
                else if (arg.equalsIgnoreCase("writeJavascriptInfo")) WriteCollationData.writeJavascriptInfo();
                else if (arg.equalsIgnoreCase("writeCaseFolding")) WriteCollationData.writeCaseFolding();
                else if (arg.equalsIgnoreCase("javatest")) WriteCollationData.javatest();
                else if (arg.equalsIgnoreCase("short")) shortPrint = !shortPrint;
                else if (arg.equalsIgnoreCase("noCE")) noCE = !noCE;
                else if (arg.equalsIgnoreCase("checkCanonicalIterator")) checkCanonicalIterator();
                else if (arg.equalsIgnoreCase("writeAllocation")) WriteCharts.writeAllocation();
                // else if (arg.equalsIgnoreCase("probe")) Probe.test();

                else {
                    System.out.println();
                    System.out.println("UNKNOWN OPTION (" + arg + "): must be one of the following (case-insensitive)");
                    System.out.println("\tWriteRulesXML, WriteRulesWithNames, WriteRules,");
                    System.out.println("\tcheckDisjointIgnorables, writeContractions,");
                    System.out.println("\twriteFractionalUCA, writeConformance, writeConformanceSHIFTED, testCompatibilityCharacters,");
                    System.out.println("\twriteCollationValidityLog, writeCaseExceptions, writeJavascriptInfo, writeCaseFolding");
                    System.out.println("\tjavatest, hex (used for conformance)");
                }
            }
        } finally {
            System.out.println("Done");

            /*
            String s = WriteCollationData.collator.getSortKey("\u1025\u102E", UCA.NON_IGNORABLE, true);
            System.out.println(Utility.hex("\u0595\u0325") + ", " + WriteCollationData.collator.toString(s));
            String t = WriteCollationData.collator.getSortKey("\u0596\u0325", UCA.NON_IGNORABLE, true);
            System.out.println(Utility.hex("\u0596\u0325") + ", " + WriteCollationData.collator.toString(t));

            Normalizer foo = new Normalizer(Normalizer.NFKD);
            char x = '\u1EE2';
            System.out.println(Utility.hex(x) + " " + ucd.getName(x));
            String nx = foo.normalize(x);
            for (int i = 0; i < nx.length(); ++i) {
                char c = nx.charAt(i);
                System.out.println(ucd.getCanonicalClass(c));
            }
            System.out.println(Utility.hex(nx, " ") + " " + ucd.getName(nx));
            */
        }
    }

    /**
     * Exercises CanonicalIterator: prints the implicit primary of CJK_BASE, lists
     * the iterator output for two sample strings, then feeds every code point that is
     * not unassigned, private-use or surrogate to setSource and reports any
     * RuntimeException.
     */
    private static void checkCanonicalIterator() {
        int firstImplicit = WriteCollationData.getImplicitPrimary(UCD_Types.CJK_BASE);
        System.out.println("UCD_Types.CJK_BASE: " + Utility.hex(UCD_Types.CJK_BASE));
        System.out.println("first implicit: " + Utility.hex((long)(firstImplicit & 0xFFFFFFFFL)));

        CanonicalIterator it = new CanonicalIterator("");
        String[] tests = new String[] {"\uF900", "\u00C5d\u0307\u0327"};
        for (int j = 0; j < tests.length; ++j) {
            System.out.println(Default.ucd().getCodeAndName(tests[j]));
            it.setSource(tests[j]);
            String ss;
            for (int i = 0; (ss = it.next()) != null; ++i) {
                System.out.println(i + "\t" + Default.ucd().getCodeAndName(ss));
            }
        }

        // verify that nothing breaks; the bound is inclusive so U+10FFFF is covered too
        for (int i = 0; i <= 0x10FFFF; ++i) {
            int cat = UCharacter.getType(i);
            if (cat == UCharacter.UNASSIGNED || cat == UCharacter.PRIVATE_USE || cat == UCharacter.SURROGATE) continue;
            String s = UTF16.valueOf(i);
            try {
                it.setSource(s);
            } catch (RuntimeException e) {
                System.out.println("Failure with U+" + Utility.hex(i));
                e.printStackTrace();
            }
        }
    }
}

View File

@ -1,67 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/RuleComparator.java,v $
* $Date: 2001/08/31 00:20:40 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCA;
import com.ibm.text.UCD.*;
import com.ibm.text.utility.*;
/**
 * Orders two strings by their UTF-16 code units: the first position at which
 * they differ decides, and if one string is a prefix of the other the shorter
 * string sorts first. Results are always -1, 0 or 1.
 * Both arguments must be Strings; anything else causes a ClassCastException,
 * and null causes a NullPointerException.
 */
public final class RuleComparator implements java.util.Comparator {

    /**
     * Compares two strings code unit by code unit.
     *
     * @param s first String
     * @param t second String
     * @return -1 if s sorts before t, 1 if it sorts after, 0 if they are equal
     */
    public int compare(Object s, Object t) {
        String ss = (String)s;
        String tt = (String)t;

        // compare just the initial portions of each level, FIRST
        // we assume that there are the same number of levels!!
        // (Special handling of the zero level separator used to live here but was
        // disabled; the leftover result variable and the catch block it needed
        // could never take effect, so they have been removed.)
        int common = Math.min(ss.length(), tt.length());
        for (int i = 0; i < common; ++i) {
            char cs = ss.charAt(i);
            char ct = tt.charAt(i);
            if (cs != ct) {
                return cs < ct ? -1 : 1;
            }
        }

        // one string is a prefix of the other (or they are equal): shorter first
        if (ss.length() > tt.length()) return 1;
        if (ss.length() < tt.length()) return -1;
        return 0;
    }
}

File diff suppressed because it is too large Load Diff

View File

@ -1,336 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA_Data.java,v $
* $Date: 2006/06/08 18:16:40 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
package com.ibm.text.UCA;
import java.util.*;
import java.io.BufferedReader;
import java.io.Reader;
import java.io.PrintWriter;
import java.io.FileReader;
import java.text.MessageFormat;
import java.io.IOException;
import com.ibm.text.UCD.Normalizer;
import com.ibm.text.UCD.UCD;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
* Holds the collation element (CE) data: a flat per-code-unit table, a table
* of expansions (one source mapping to several CEs) and a table of contractions
* (several code units mapping to one entry). Entries are stored with add() and
* read back with get().
*/
public class UCA_Data implements UCA_Types {
static final boolean DEBUG = false;
// when true, add() prints every mapping it stores
static final boolean DEBUG_SHOW_ADD = false;
// when true, get() skips completely ignorable code points while extending a contraction match
static final boolean lessThan410 = false;
// supplies canonical combining classes in get()
private Normalizer toD;
// used for Hangul/CJK classification in getCEType() and for names in debug output
private UCD ucd;
/**
* Creates a table with no mappings except the preloaded lead-surrogate entries
* set up by the instance initializer below.
* @param toD source of canonical combining classes
* @param ucd character database used for classification and names
*/
public UCA_Data(Normalizer toD, UCD ucd) {
this.toD = toD;
this.ucd = ucd;
}
/**
* The collation element data is stored a couple of different structures.
* First is collationElements, which generally contains the 32-bit CE corresponding
* to the data. It is directly indexed by character code.<br>
* For brevity in the implementation, we just use a flat array.
* A real implementation would use a multi-stage table, as described in TUS Section 5.
* table of simple collation elements, indexed by char.<br>
* Exceptional cases: expanding, contracting, unsupported are handled as described below.
*/
private int[] collationElements = new int[65536];
/**
* Although a single character can expand into multiple CEs, we don't want to burden
* the normal case with the storage. So, they get a special value in the collationElements
* array. This value has a distinct primary weight, followed by an index into a separate
* table called expandingTable. All of the CEs in that table, up to a TERMINATOR value
* will be used for the expansion. The implementation is as a stack; this just makes it
* easy to generate.
*/
private IntStack expandingTable = new IntStack(3600); // initial number is from compKeys
/**
* For now, this is just a simple mapping of strings to collation elements.
* The implementation depends on the contracting characters being "completed",
* so that it can be efficiently determined when to stop looking.
*/
private Map contractingTable = new TreeMap();
// Instance initializer: runs for every new object, after the field initializers above.
{
// clear some tables
for (int i = 0; i < collationElements.length; ++i) {
collationElements[i] = UNSUPPORTED_FLAG;
}
// preload with parts: every lead surrogate (D800..DBFF) is marked CONTRACTING and gets
// its own contracting-table entry, so supplementary characters can be added as contractions
for (char i = 0xD800; i < 0xDC00; ++i) {
collationElements[i] = CONTRACTING;
addToContractingTable(String.valueOf(i), UNSUPPORTED_FLAG);
}
checkConsistency();
}
/**
* Return the type of the CE stored for a code point.
* Code points above 0xFFFF are looked up by their lead surrogate.
* @param ch code point
* @return one of the CE type constants (NORMAL_CE, CONTRACTING_CE, EXPANDING_CE,
* HANGUL_CE, CJK_CE, CJK_AB_CE or UNSUPPORTED_CE)
*/
public byte getCEType(int ch) {
if (ch > 0xFFFF) ch = UTF16.getLeadSurrogate(ch); // supplementary: classify by lead surrogate
int ce = collationElements[ch];
if (ce == UNSUPPORTED_FLAG) {
// Special check for Han, Hangul
if (ucd.isHangulSyllable(ch)) return HANGUL_CE;
if (ucd.isCJK_BASE(ch)) return CJK_CE;
if (ucd.isCJK_AB(ch)) return CJK_AB_CE;
// special check for unsupported surrogate pair, 20 1/8 bits
//if (0xD800 <= ch && ch <= 0xDFFF) {
// return SURROGATE_CE;
//}
return UNSUPPORTED_CE;
}
if (ce == CONTRACTING) return CONTRACTING_CE;
if ((ce & EXPANDING_MASK) == EXPANDING_MASK) return EXPANDING_CE;
return NORMAL_CE;
}
/**
* Convenience overload of add(StringBuffer, IntStack).
* @param source characters being mapped
* @param ces collation elements for the source
*/
public void add(String source, IntStack ces) {
add(new StringBuffer(source), ces);
}
/**
* Stores the mapping from source to ces.
* A single CE is stored directly; several CEs are appended to the expanding table
* (followed by TERMINATOR) and referenced by index. A source longer than one code unit,
* or one whose first code unit already starts a contraction, goes into the contracting table.
* @param source characters being mapped; at least one code unit
* @param ces collation elements for the source; at least one
* @throws IllegalArgumentException if source or ces is empty
*/
public void add(StringBuffer source, IntStack ces) {
if (DEBUG_SHOW_ADD) {
System.out.println("Adding: " + ucd.getCodeAndName(source.toString()) + CEList.toString(ces));
}
if (source.length() < 1 || ces.length() < 1) {
throw new IllegalArgumentException("String or CEs too short");
}
int ce;
if (ces.length() == 1) {
ce = ces.get(0);
} else {
// expansion: the stored value is the mask plus the index where the list starts
ce = EXPANDING_MASK | expandingTable.getTop();
expandingTable.append(ces);
expandingTable.append(TERMINATOR);
}
// assign CE(s) to char(s)
char value = source.charAt(0);
//if (value == 0x10000) System.out.print("DEBUG2: " + source);
if (source.length() > 1) {
addToContractingTable(source, ce);
if (collationElements[value] == UNSUPPORTED_FLAG) {
collationElements[value] = CONTRACTING; // mark special
} else if (collationElements[value] != CONTRACTING) {
// move old value to contracting table!
//contractingTable.put(String.valueOf(value), new Integer(collationElements[value]));
addToContractingTable(String.valueOf(value), collationElements[value]);
collationElements[value] = CONTRACTING; // signal we must look up in table
}
} else if (collationElements[value] == CONTRACTING) {
// the char already starts a contraction, so its own value lives in the contracting table
addToContractingTable(source, ce);
//contractingTable.put(source, new Integer(ce));
} else {
collationElements[source.charAt(0)] = ce; // normal
}
//if (DEBUG) checkConsistency();
}
/**
* Tests whether the stored CE for a code point is zero, looking in the
* contracting table when the code point (or its lead surrogate) starts a contraction.
* @param cp code point
* @return true if the stored value is 0
*/
boolean isCompletelyIgnoreable(int cp) {
int ce = collationElements[cp < UTF16.SUPPLEMENTARY_MIN_VALUE ? cp : UTF16.getLeadSurrogate(cp)];
if (ce == 0) return true;
if (ce != CONTRACTING) return false;
Object newValue = contractingTable.get(UTF16.valueOf(cp));
if (newValue == null) return false;
return ((Integer)newValue).intValue() == 0;
}
/**
* Looks up the CE(s) for a code unit and the text following it, and pushes them onto result.
* Contractions are matched as long as possible; combining marks that were absorbed into
* a contraction out of order are overwritten with U+0000 in decompositionBuffer.
* @param ch the code unit to look up
* @param decompositionBuffer the text being processed; may be modified as described above
* @param index position in decompositionBuffer of the first code unit after ch
* @param result receives the resulting CEs
* @return the new position: index advanced past any code units consumed by a contraction
* @throws IllegalArgumentException if ch is marked contracting but has no table entry
*/
// returns new pos, fills in result.
public int get(char ch, StringBuffer decompositionBuffer, int index, IntStack result) {
int ce = collationElements[ch];
if (ce == CONTRACTING) {
// Contracting is probably the most interesting (read "tricky") part
// of the algorithm.
// First get longest substring that is in the contracting table.
// For simplicity, we use a sorted map (TreeMap) keyed by string for contracting.
// There are much better optimizations,
// but they take a more complicated build algorithm than we want to show here.
// NOTE: We are guaranteed that the first code unit is in the contracting table because
// of the build process.
String probe = String.valueOf(ch);
Object value = contractingTable.get(probe);
if (value == null) throw new IllegalArgumentException("Missing value for " + Utility.hex(ch));
// complete the first character, if part of supplementary
if (UTF16.isLeadSurrogate(ch) && index < decompositionBuffer.length()) {
char ch2 = decompositionBuffer.charAt(index);
String newProbe = probe + ch2;
Object newValue = contractingTable.get(newProbe);
if (newValue != null) {
probe = newProbe;
value = newValue;
index++;
}
}
// We loop, trying to add successive code points to the longest substring.
int cp2;
while (index < decompositionBuffer.length()) {
//char ch2 = decompositionBuffer.charAt(index);
cp2 = UTF16.charAt(decompositionBuffer, index);
int increment = UTF16.getCharCount(cp2);
// CHECK if last char was completely ignorable
if (lessThan410 && isCompletelyIgnoreable(cp2)) {
index += increment; // just skip char don't set probe, value
continue;
}
// see whether the current string plus the next char are in
// the contracting table.
String newProbe = probe + UTF16.valueOf(cp2);
Object newValue = contractingTable.get(newProbe);
if (newValue == null) break; // stop if not in table.
// We succeeded--so update our new values, and set index
// and quaternary to indicate that we swallowed another character.
probe = newProbe;
value = newValue;
index += increment;
}
// Now, see if we can add any combining marks
short lastCan = 0;
// assigned at the top of every iteration, before any continue, so the loop update can use it
int increment;
for (int i = index; i < decompositionBuffer.length(); i += increment) {
// We only take certain characters. They have to be accents,
// and they have to not be blocked.
// Unlike above, if we don't find a match (and it was an accent!)
// then we don't stop, we continue looping.
cp2 = UTF16.charAt(decompositionBuffer, i);
increment = UTF16.getCharCount(cp2);
short can = toD.getCanonicalClass(cp2);
if (can == 0) break; // stop with any zero (non-accent)
if (can == lastCan) continue; // blocked if same class as last
lastCan = can; // remember for next time
// CHECK if last char was completely ignorable. If so, skip it.
if (lessThan410 && isCompletelyIgnoreable(cp2)) {
continue;
}
// Now see if we can successfully add it onto our string
// and find it in the contracting table.
String newProbe = probe + UTF16.valueOf(cp2);
Object newValue = contractingTable.get(newProbe);
if (newValue == null) continue;
// We succeeded--so update our new values, remove the char, and update
// quaternary to indicate that we swallowed another character.
probe = newProbe;
value = newValue;
decompositionBuffer.setCharAt(i,'\u0000'); // zero char
if (increment == 2) {
// WARNING: we had a supplementary character. zero BOTH parts
decompositionBuffer.setCharAt(i+1,'\u0000'); // zero char
}
}
// we are all done, and can extract the CE from the last value set.
ce = ((Integer)value).intValue();
}
// if the CE is not expanding, we are done.
if ((ce & EXPANDING_MASK) != EXPANDING_MASK) {
result.push(ce);
} else {
// expanding, so copy list of items onto stack
int ii = ce & EXCEPTION_INDEX_MASK; // get index
// copy onto stack from index until reach TERMINATOR
while (true) {
ce = expandingTable.get(ii++);
if (ce == TERMINATOR) break;
result.push(ce);
}
}
return index;
}
/**
* Stores ce in the contracting table under the string form of s.
* @param s key; its toString() value is used
* @param ce value to store
* @throws IllegalArgumentException if s is null
*/
private void addToContractingTable(Object s, int ce) {
if (s == null) {
throw new IllegalArgumentException("String can't be null");
}
contractingTable.put(s.toString(), new Integer(ce));
}
/**
* Verifies that the set of code units marked CONTRACTING in collationElements equals
* the set of first code units of the keys in the contracting table. On a mismatch the
* differences are printed and an exception is thrown.
* @throws IllegalArgumentException if the two sets differ
*/
void checkConsistency() {
// at this point, we have to guarantee that the contractingTable is CLOSED
// e.g. if a substring of length n is in the table, then the first n-1 characters
// are also!!
// First check consistency. the CE for a value is CONTRACTING if and only if there is a contraction starting
// with that value.
UnicodeSet ceSet = new UnicodeSet();
for (int i = 0; i < collationElements.length; ++i) {
if (collationElements[i] == CONTRACTING) ceSet.add(i);
}
UnicodeSet ceSet2 = new UnicodeSet();
Iterator enum1 = contractingTable.keySet().iterator();
while (enum1.hasNext()) {
String sequence = (String)enum1.next();
ceSet2.add(sequence.charAt(0));
}
if (!ceSet.equals(ceSet2)) {
System.out.println("In both: " + new UnicodeSet(ceSet).retainAll(ceSet2).toPattern(true));
System.out.println("CONTRACTING but not in table: " + new UnicodeSet(ceSet).removeAll(ceSet2).toPattern(true));
System.out.println("In table but not CONTRACTING: " + new UnicodeSet(ceSet2).removeAll(ceSet).toPattern(true));
throw new IllegalArgumentException("Inconsistent data");
}
/*
0FB2 0F71 ; [.124E.0020.0002.0FB2][.125F.0020.0002.0F71] # TIBETAN SUBJOINED LETTER RA + TIBETAN VOWEL SIGN AA
0FB3 0F71 ; [.1250.0020.0002.0FB3][.125F.0020.0002.0F71] # TIBETAN SUBJOINED LETTER LA + TIBETAN VOWEL SIGN AA
int[] temp1 = int[20];
int[] temp2 = int[20];
int[] temp3 = int[20];
getCEs("\u0fb2", true, temp1);
getCEs("\u0fb3", true, temp2);
getCEs("\u0f71", true, temp3);
add("\u0FB2\u0F71", concat(temp1, temp3));
*/
}
/**
* @return an iterator over the keys (Strings) of the contracting table
*/
Iterator getContractions() {
return contractingTable.keySet().iterator();
}
/**
* @return the number of entries in the contracting table
*/
int getContractionCount() {
return contractingTable.size();
}
/**
* @param s candidate key
* @return true if the contracting table has an entry for s
*/
boolean contractionTableContains(String s) {
return contractingTable.get(s) != null;
}
}

View File

@ -1,98 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA_Types.java,v $
* $Date: 2005/04/06 08:48:17 $
* $Revision: 1.7 $
*
*******************************************************************************
*/
package com.ibm.text.UCA;
import com.ibm.text.UCD.*;
import com.ibm.text.utility.*;
/**
 * Constants shared by the UCA classes. Every field of an interface is implicitly
 * public, static and final, so the modifiers are not repeated on each declaration.
 */
public interface UCA_Types {

    /**
     * Version of the UCA tables to use
     */
    //private static final String VERSION = "-3.0.1d3"; // ""; // "-2.1.9d7";
    //public static final String UCA_BASE = "4.1.0"; // "3.1.1"; // ; // ""; // "-2.1.9d7";
    //public static final String VERSION = "-" + UCA_BASE; // + "d6" ""; // "-2.1.9d7";

    /** Base name of the collation keys file; null if not there. */
    String ALLFILES = "allkeys";

    /** Output directory for generated collation files: GEN_DIR followed by "collation" and a backslash. */
    String BASE_UCA_GEN_DIR = UCD_Types.GEN_DIR + "collation\\";

    /** Character that separates levels. */
    char LEVEL_SEPARATOR = '\u0000';

    /**
     * Expanding characters are marked with an exception bit combination
     * in the collationElement table.
     * This means that they map to more than one CE, which is looked up in
     * the expansionTable by index.
     */
    int EXPANDING_MASK = 0xFFFF0000; // marks expanding range start

    /**
     * This mask is used to get the index from an EXPANDING exception.
     * The contracting characters can also make use of this in a future optimization.
     */
    int EXCEPTION_INDEX_MASK = 0x0000FFFF;

    /**
     * Contracting characters are marked with an exception bit combination
     * in the collationElement table.
     * This means that they are the first character of a contraction, and need
     * to be looked up (with following characters) in the contractingTable.<br>
     * This isn't a MASK since there is exactly one value.
     */
    int CONTRACTING = 0xFFFE0000;

    /** Marks a table entry for which no CE has been stored. */
    int UNSUPPORTED_FLAG = 0xFFFD0000;

    /**
     * Used to compose Hangul and Han characters
     */
    int NEUTRAL_SECONDARY = 0x20;
    int NEUTRAL_TERTIARY = 0x02;

    /** Enum for alternate handling */
    byte SHIFTED = 0;
    byte ZEROED = 1;
    byte NON_IGNORABLE = 2;
    byte SHIFTED_TRIMMED = 3;
    /** Highest alternate-handling value (same as SHIFTED_TRIMMED). */
    byte LAST = 3;

    /**
     * Used to terminate a list of CEs
     */
    int TERMINATOR = 0xFFFFFFFF; // CE that marks end of string

    /**
     * Any unsupported characters (those not in the UCA data tables)
     * are marked with an exception bit combination
     * so that they can be treated specially.<br>
     * There are at least 34 values, so that we can use a range for surrogates.
     * However, we do add to the first weight if we have surrogate pairs!
     */
    int UNSUPPORTED_CJK_BASE = 0xFB40;
    int UNSUPPORTED_CJK_AB_BASE = 0xFB80;
    int UNSUPPORTED_OTHER_BASE = 0xFBC0;

    /** Start of the unsupported range (the CJK base). */
    int UNSUPPORTED_BASE = UNSUPPORTED_CJK_BASE;
    /** End of the unsupported range: 0x40 past UNSUPPORTED_OTHER_BASE. */
    int UNSUPPORTED_LIMIT = UNSUPPORTED_OTHER_BASE + 0x40;

    /**
     * Special char value that means failed or terminated
     */
    char NOT_A_CHAR = '\uFFFF';

    /**
     * CEType
     */
    byte NORMAL_CE = 0;
    byte CONTRACTING_CE = 1;
    byte EXPANDING_CE = 2;
    byte CJK_CE = 3;
    byte CJK_AB_CE = 4;
    byte HANGUL_CE = 5;
    // SURROGATE_CE = 6,
    byte UNSUPPORTED_CE = 7;
    /** Has the same value as CJK_CE. */
    byte FIXED_CE = 3;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,62 +0,0 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
<meta name="ProgId" content="FrontPage.Editor.Document">
<title>Chart Instructions</title>
<style>
<!--
th { background-color: #eeeeee }
-->
</style>
</head>
<body>
<h1>Instructions</h1>
<p>The Case Charts provide charts of the characters in Unicode that differ from
at least one of their case forms (lower, title, upper, or fold).</p>
<blockquote>
<p><i>To properly view these charts, your browser should be reasonably recent
so it handles Unicode and cascading style sheets, and you should install a
Unicode font and configure your browser to use it.</i></p>
</blockquote>
<p><b>Notes:</b></p>
<ul>
<li>The index pages are ordered by the following:
<ul>
<li>By script, unless the script is COMMON or INHERITED</li>
<li>By general category, in the latter two cases</li>
<li>If characters have a decomposition containing a cased character, but
do not have a case mapping (lower, title, upper, or fold), then they are
listed in NoCaseMapping.</li>
</ul>
</li>
<li>Within each chart page, the code points are sorted by lowercased <a href="http://www.unicode.org/unicode/reports/tr15/" target="_top">NFKD</a>,
to place related characters next to one another.</li>
<li>To help pick out cells visually, the more interesting ones have a light
blue background. The other cells have grayed-out text.
<ul>
<li>The more interesting ones are:
<ul>
<li><i>lower: </i>if different than the character</li>
<li><i>title: </i>if different than upper</li>
<li><i>upper: </i>if different than the character</li>
<li><i>fold: </i>if different than lower</li>
</ul>
</li>
</ul>
</li>
<li>If your browser supports tool-tips, then hovering your mouse over cells
will show the names of the characters.</li>
<li>For more information, see <a href="http://www.unicode.org/unicode/reports/tr21/" target="_top">UAX
#21: Case Mappings</a>.</li>
</ul>
</body>
</html>

View File

@ -1,35 +0,0 @@
<!doctype HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"><html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta http-equiv="Content-Language" content="en-us">
<meta name="keywords" content="Basic">
<title>Case Chart</title>
<style><!--
p { font-size: 90% }
--></style>
<base target="main">
<link rel="stylesheet" type="text/css"
href="http://www.unicode.org/webscripts/standard_styles.css">
</head>
<body>
<table width="100%" cellpadding="0" cellspacing="0" border="0">
<tr>
<td colspan="2">
<table width="100%" border="0" cellpadding="0" cellspacing="0">
<tr>
<td class="icon"><a href="http://www.unicode.org/"><img border="0"
src="http://www.unicode.org/webscripts/logo60s2.gif" align="middle"
alt="[Unicode]" width="34" height="33"></a>&nbsp;&nbsp;<a class="bar"
href="http://www.unicode.org/unicode/faq/"><font size="3">Charts</font></a>
</tr>
</table>
</td>
</tr>
</table>
<div class="body">
<!-- BEGIN CONTENTS -->
<h2 align="center">Case Chart</h2>
<p align="center"><a href="help.html">Help</a>

View File

@ -1,21 +0,0 @@
/* Chart cells. Classes p/s/t/q mark primary/secondary/tertiary/quaternary
   differences; the e-prefixed variants are the same levels for expansions. */
td { border: 1px solid #0000FF; color: #000000; background-color: #FFFFFF;
font-size: 120%; text-align: Center; vertical-align: top; width: 48px }
td.p { color: #000000; background-color: #7777FF }
td.s { color: #000000; background-color: #BBBBFF }
td.t { color: #000000; background-color: #DDDDFF }
td.q { color: #000000; background-color: #FFFFFF }
td.ep { color: #000000; background-color: #FF5555 }
td.es { color: #000000; background-color: #FF7777 }
td.et { color: #000000; background-color: #FF9999 }
td.eq { color: #000000; background-color: #FFBBBB }
th { vertical-align: top; font-weight: bold }
/* "regular" is not a font-weight keyword; "normal" is the non-bold value. */
th.x { vertical-align: top; font-weight: normal; text-align: Left }
tt { font-size: 50% }
td.name { text-align: left; vertical-align: middle; width: 96% }
body { background-color: #FFFFFF; }
td.g { font-size: 120%; text-align: Center; width: 72px; color: #808080; }
td.n { font-size: 120%; text-align: Center; width: 72px; color: #000000; background-color: #CCCCFF; }
td.z { font-size: 120%; text-align: Center; width: 72px; font-weight: bold; background-color: #EEEEEE; }
td.h { font-size: 120%; text-align: Left; color: #000000; background-color: #EEEEEE; }

View File

@ -1,125 +0,0 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta http-equiv="Content-Language" content="en-us">
<link rel="stylesheet" href="charts.css" type="text/css">
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
<meta name="ProgId" content="FrontPage.Editor.Document">
<title>UCA Chart Help</title>
<base target="main">
</head>
<body>
<h2 align="center">UCA Chart Help</h2>
<p>This set of charts shows the Unicode Collation Algorithm values for Unicode
characters. The characters are arranged in the following groups:</p>
<table cellspacing="0" cellpadding="4">
<tr>
<th align="left"><i>Null</i></th>
<th class="x">Completely ignorable (primary, secondary and tertiary levels)<br>
These include control codes and various formatting codes.</th>
</tr>
<tr>
<th align="left"><i>Ignorable</i></th>
<th class="x">Ignorable at a primary level, but not at a secondary or
tertiary level.<br>
These include most accents and diacritics.</th>
</tr>
<tr>
<th align="left"><i>Variable</i></th>
<th class="x">Characters that may be set to ignorable by a programmatic
switch.<br>
These include spaces, punctuation marks, and most symbols.</th>
</tr>
<tr>
<th align="left"><i>Common</i></th>
<th class="x">Characters that are none of the above, but not considered
letters.<br>
These include numbers, currency symbols, etc.</th>
</tr>
<tr>
<th align="left"><i>Letters</i></th>
<th class="x">According to script</th>
</tr>
<tr>
<th align="left"><i>Unsupported</i></th>
<th class="x">Not explicitly supported in this version of UCA; uses
code-point order</th>
</tr>
</table>
<p>The characters* within each group are arranged in cells. The color of the
cell indicates the strength of the difference between that character and the <i>previous</i>
character in the chart, as follows.</p>
<table cellspacing="0" cellpadding="4">
<tr>
<th colspan="2"><font size="3"><u>No Expansion</u></font>
<th rowspan="5">&nbsp;
<th colspan="2"><font size="3"><u>Expansion</u></font>
</tr>
<tr>
<td class="p">a<br>
<tt>0061</tt></td>
<th class="x">Primary difference
<td class="ep">dz<br>
<tt>01F3</tt></td>
<th class="x">Primary difference</th>
</tr>
<tr>
<td class="s">á<br>
<tt>00E1</tt></td>
<th class="x">Secondary Difference</th>
<td class="es">DZ<br>
<tt>01F1</tt></td>
<th class="x">Secondary Difference</th>
</tr>
<tr>
<td class="t">A<br>
<tt>0041</tt></td>
<th class="x">Tertiary difference</th>
<td class="et">Dz<br>
<tt>01F2</tt></td>
<th class="x">Tertiary difference</th>
</tr>
<tr>
<td class="q">&#x212B;<br>
<tt>212B</tt></td>
<th class="x">Quaternary difference<br>
or no difference</th>
<td class="eq">&nbsp;</td>
<th class="x">Quaternary difference<br>
or no difference</th>
</tr>
</table>
<blockquote>
<p align="left"><b>Note: </b>If tool-tips are enabled in your browser, then if
you pause the mouse over any cell, you will see the name of the character and
a representation of the sort key. In this representation, the separators
between the weight levels are represented with &quot;|&quot;.</p>
</blockquote>
<table>
<tr>
<th>*</th>
<th class="x">In some cases, the UCA data table also includes contractions.<br>
They can be recognized by the multiple code point numbers, as in the
following:</th>
<td class="p">ஔ<br>
<tt>0B92 0BD7</tt></td>
</tr>
</table>
<h3><b>Notes</b></h3>
<ul>
<li>The UCA results are versioned <i>both</i> by the version of the UCA <i>and</i>
by the version of The Unicode Standard used to process the data.</li>
<li>These charts only provide one of the alternatives for handling variable
characters (punctuation), whereby these characters are <b>non-ignorable.</b></li>
<li>Characters from large blocks, such as CJK-Ideographs, Hangul Syllables,
Private Use Area, etc. are represented by a sampling.</li>
<li>Some unassigned code points, noncharacters and other edge cases are also
added to the list for comparison.</li>
<li>For more information, see <a href="http://www.unicode.org/unicode/reports/tr10/" target="_top">UTS
#10: Unicode Collation Algorithm</a>.</li>
</ul>
</body>
</html>

View File

@ -1,21 +0,0 @@
<html>
<head>
<title>%%%</title>
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
<meta name="ProgId" content="FrontPage.Editor.Document">
</head>
<frameset cols="192,*">
<frame name="header" src="index_list.html" target="main" scrolling="auto">
<frame name="main" src="help.html" target="main" scrolling="auto">
<noframes>
<body>
<p>This page uses frames, but your browser doesn't support them.</p>
</body>
</noframes>
</frameset>
</html>

View File

@ -1,37 +0,0 @@
<!doctype HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"><html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta http-equiv="Content-Language" content="en-us">
<meta name="keywords" content="Basic">
<title>%%%</title>
<style><!--
p { font-size: 90%; text-align: Center }
--></style>
<link rel="stylesheet" type="text/css"
href="http://www.unicode.org/webscripts/standard_styles.css">
<base target='main'>
</head>
<body class="navColTable">
<table width="120%" cellpadding="0" cellspacing="0" border="0">
<tr>
<td colspan="2">
<table width="100%" border="0" cellpadding="0" cellspacing="0">
<tr>
<td class="icon"><a href="http://www.unicode.org/" target='_top'><img border="0"
src="http://www.unicode.org/webscripts/logo60s2.gif" align="middle"
alt="[Unicode]" width="34" height="33"></a>&nbsp;&nbsp;<a class="bar"
href="http://www.unicode.org/charts/" target='_top'><font size="3">Charts</font></a>
</tr>
</table>
</td>
</tr>
<tr>
<td colspan="2" class="gray">&nbsp;</td>
</tr>
</table>
<div class="body">
<!-- BEGIN CONTENTS -->
<h2 align="center">%%%</h2>
<p><a href="help.html">Help</a>

View File

@ -1,55 +0,0 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
<meta name="ProgId" content="FrontPage.Editor.Document">
<meta name="COPYRIGHT" content=
"Copyright (c) 2002-2006 IBM Corporation and others. All Rights Reserved.">
<title>Chart Instructions</title>
<style>
<!--
th { background-color: #eeeeee }
-->
</style>
</head>
<body>
<h1>Instructions</h1>
<p>The Name charts provide an index to Unicode names. Each word in each Unicode
character name is extracted, and used as an index for the characters. </p>
<blockquote>
<p><i>To properly view these charts, your browser should be reasonably recent
so it handles Unicode and cascading style sheets, and you should install a
Unicode font and configure your browser to use it.</i></p>
</blockquote>
<p><b>Notes:</b></p>
<ul>
<li>To keep the charts from becoming too large, words on a 'stop-list' are
omitted. These are:
<ul>
<li>AND, CAPITAL, CHARACTER, COMPATIBILITY, LETTER, SMALL, WITH</li>
<li>All script names</li>
<li>All words containing a digit</li>
<li>All Hangul Syllables</li>
</ul>
</li>
<li>Unlike some of the other charts, tool-tips to reveal the names are not
included (for compactness). However, if you want to know the name of any
particular character:
<ul>
<li>Copy the character from the cell.</li>
<li>Go to <a href="http://demo.icu-project.org/icu-bin/translit">http://demo.icu-project.org/icu-bin/translit</a></li>
<li>Paste in under <b>Input 1</b></li>
<li>Select <b>Output 1</b>: Any - Name</li>
</ul>
</li>
</ul>
</body>
</html>

View File

@ -1,61 +0,0 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
<meta name="ProgId" content="FrontPage.Editor.Document">
<title>Chart Instructions</title>
<style>
<!--
th { background-color: #eeeeee }
-->
</style>
</head>
<body>
<h1>Instructions</h1>
<p>The Normalization Charts provide charts of the characters in Unicode that
differ from at least one of their normalization forms (C, D, KC, KD).</p>
<blockquote>
<p><i>To properly view these charts, your browser should be reasonably recent
so it handles Unicode and cascading style sheets, and you should install a
Unicode font and configure your browser to use it.</i></p>
</blockquote>
<p><b>Notes:</b></p>
<ul>
<li>The index pages are ordered by the following:
<ul>
<li>By script, unless the script is COMMON or INHERITED</li>
<li>By general category, in the latter two cases</li>
</ul>
</li>
<li>Within each chart page, the code points are sorted by folded <a href="http://www.unicode.org/unicode/reports/tr15/" target="_top">NFKD</a>,
to place related characters next to one another.</li>
<li>To keep the size of the Hangul chart manageable, characters U+AD00..U+D6FF
(관..훿) are omitted.</li>
<li>To help pick out cells visually, the more interesting ones have a light
blue background. The other cells have grayed-out text.
<ul>
<li>The more interesting ones are:
<ul>
<li><i>C: </i>if different than the character</li>
<li><i>D: </i>if different than C</li>
<li><i>KC: </i>if different than C</li>
<li><i>KD: </i>if different than KC and D</li>
</ul>
</li>
</ul>
</li>
<li>If your browser supports tool-tips, then hovering your mouse over cells
will show the names of the characters.</li>
<li>For more information, see <a href="http://www.unicode.org/unicode/reports/tr15/" target="_top">UAX
#15: Unicode Normalization Forms</a>.</li>
</ul>
</body>
</html>

View File

@ -1,35 +0,0 @@
<!doctype HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"><html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta http-equiv="Content-Language" content="en-us">
<meta name="keywords" content="Basic">
<title>Normalization Chart</title>
<style><!--
p { font-size: 90% }
--></style>
<base target="main">
<link rel="stylesheet" type="text/css"
href="http://www.unicode.org/webscripts/standard_styles.css">
</head>
<body>
<table width="100%" cellpadding="0" cellspacing="0" border="0">
<tr>
<td colspan="2">
<table width="100%" border="0" cellpadding="0" cellspacing="0">
<tr>
<td class="icon"><a href="http://www.unicode.org/"><img border="0"
src="http://www.unicode.org/webscripts/logo60s2.gif" align="middle"
alt="[Unicode]" width="34" height="33"></a>&nbsp;&nbsp;<a class="bar"
href="http://www.unicode.org/unicode/faq/"><font size="3">Charts</font></a>
</tr>
</table>
</td>
</tr>
</table>
<div class="body">
<!-- BEGIN CONTENTS -->
<h2 align="center">Normalization Chart</h2>
<p align="center"><a href="help.html">Help</a>

View File

@ -1,31 +0,0 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
<meta name="ProgId" content="FrontPage.Editor.Document">
<title>Chart Instructions</title>
<style>
<!--
th { background-color: #eeeeee }
-->
</style>
</head>
<body>
<h1>Instructions</h1>
<p>The Script charts provide an index to Unicode characters by script.</p>
<blockquote>
<p><i>To properly view these charts, your browser should be reasonably recent
so it handles Unicode and cascading style sheets, and you should install a
Unicode font and configure your browser to use it.</i></p>
</blockquote>
<p>Where the script = Common, the General Category is used in the index instead.</p>
</body>
</html>

View File

@ -1,35 +0,0 @@
<!doctype HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"><html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta http-equiv="Content-Language" content="en-us">
<meta name="keywords" content="Basic">
<title>Script Chart</title>
<style><!--
p { font-size: 90% }
--></style>
<base target="main">
<link rel="stylesheet" type="text/css"
href="http://www.unicode.org/webscripts/standard_styles.css">
</head>
<body>
<table width="100%" cellpadding="0" cellspacing="0" border="0">
<tr>
<td colspan="2">
<table width="100%" border="0" cellpadding="0" cellspacing="0">
<tr>
<td class="icon"><a href="http://www.unicode.org/"><img border="0"
src="http://www.unicode.org/webscripts/logo60s2.gif" align="middle"
alt="[Unicode]" width="34" height="33"></a>&nbsp;&nbsp;<a class="bar"
href="http://www.unicode.org/unicode/faq/"><font size="3">Charts</font></a>
</tr>
</table>
</td>
</tr>
</table>
<div class="body">
<!-- BEGIN CONTENTS -->
<h2 align="center">Script Chart</h2>
<p align="center"><a href="help.html">Help</a>

View File

@ -1,6 +0,0 @@
#
# Note: The casing of block names is not normative.
# For example, "Basic Latin" and "BASIC LATIN" are equivalent.
#
# Format:
# Start Code..End Code; Block Name

View File

@ -1,657 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/BuildNames.java,v $
* $Date: 2004/03/11 19:03:18 $
* $Revision: 1.9 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.IOException;
import com.ibm.icu.text.UTF16;
//import com.ibm.text.unicode.UInfo;
import java.util.*;
import java.io.*;
//import java.text.*;
import com.ibm.text.utility.*;
public class BuildNames implements UCD_Types {
static final boolean DEBUG = false;
/**
 * Command-line entry point: runs {@link #collectWords()}.
 * The {@code args} array is not read.
 *
 * @param args ignored
 * @throws IOException propagated from {@link #collectWords()}
 */
public static void main(String[] args) throws IOException {
collectWords();
}
// Occurrence counts (String -> Count), filled in by stash():
// single words, then runs of two, three and four consecutive words joined with "/".
// NOTE(review): key ordering comes from LengthFirstComparator, which is defined elsewhere.
static Map words = new TreeMap(new LengthFirstComparator());
static Map doubleWords = new TreeMap(new LengthFirstComparator());
static Map tripleWords = new TreeMap(new LengthFirstComparator());
static Map quadWords = new TreeMap(new LengthFirstComparator());
// Names collected by collectWords().
static Set lines = new TreeSet(new LengthFirstComparator());
// Per-character occurrence counts, indexed by char value.
// Only 128 slots: a char >= 128 would index out of bounds in stash().
static int[] letters = new int[128];
/** Mutable integer counter, used as the value type of the word maps. */
static class Count {
Count(int count) {this.count = count;}
int count;
}
// The three most recently stashed words (lastWord is the newest); shifted by stash().
static String lastWord = "";
static String preLastWord = "";
static String prePreLastWord = "";
/**
 * Bumps the occurrence count stored for {@code word} in {@code words},
 * creating the entry on first sight.
 *
 * @param word  key to count
 * @param words map from String to {@code Count}
 */
static void addWord(String word, Map words) {
    Count tally = (Count) words.get(word);
    if (tally == null) {
        // First occurrence: register the key with a count of one.
        words.put(word, new Count(1));
    } else {
        tally.count++;
    }
}
/**
 * Records one word of a name: counts it on its own, counts the 2-, 3- and
 * 4-word sequences that end with it, then shifts the word history and
 * tallies its characters.
 *
 * @param word     the word to record
 * @param position zero-based index of the word within its name; limits how
 *                 many preceding words may be combined with it
 */
static void stash(String word, int position) {
    addWord(word, words);

    // Grow the "/"-joined key one predecessor at a time.
    if (position >= 1) {
        String chain = lastWord + "/" + word;
        addWord(chain, doubleWords);
        if (position >= 2) {
            chain = preLastWord + "/" + chain;
            addWord(chain, tripleWords);
            if (position >= 3) {
                chain = prePreLastWord + "/" + chain;
                addWord(chain, quadWords);
            }
        }
    }

    // Shift the history so this word becomes the most recent one.
    prePreLastWord = preLastWord;
    preLastWord = lastWord;
    lastWord = word;

    // Character histogram (indexed directly by char value).
    final int length = word.length();
    for (int k = 0; k != length; ++k) {
        letters[word.charAt(k)] += 1;
    }
}
/**
 * Normalizes a name line:
 * <ul>
 * <li>'-', '&lt;' and '&gt;' are set off by single spaces on both sides
 *     (no space is added at the very start or end, or next to an existing space);</li>
 * <li>ASCII lowercase letters are uppercased;</li>
 * <li>each ASCII digit d becomes '*' followed by the letter 'A' + d.</li>
 * </ul>
 * If nothing had to be changed the original string is returned as is;
 * otherwise the rewritten string is returned trimmed.
 *
 * @param line the text to normalize
 * @return the normalized text
 */
static String transform(String line) {
    final int length = line.length();
    StringBuilder out = new StringBuilder();
    boolean modified = false;
    for (int pos = 0; pos < length; ++pos) {
        final char ch = line.charAt(pos);
        switch (ch) {
            case '-':
            case '<':
            case '>': {
                final int outLength = out.length();
                if (outLength > 0 && out.charAt(outLength - 1) != ' ') {
                    out.append(' ');
                }
                out.append(ch);
                if (pos + 1 < length && line.charAt(pos + 1) != ' ') {
                    out.append(' ');
                }
                modified = true;
                break;
            }
            default:
                if (ch >= 'a' && ch <= 'z') {
                    out.append((char) (ch - 'a' + 'A'));
                    modified = true;
                } else if (ch >= '0' && ch <= '9') {
                    out.append('*').append((char) (ch - '0' + 'A'));
                    modified = true;
                } else {
                    out.append(ch);
                }
                break;
        }
    }
    return modified ? out.toString().trim() : line;
}
/**
 * Prints the highest-scoring keys of {@code words}, where the score of a key
 * is its occurrence count multiplied by its length.
 *
 * <p>The ranking used to be built in a map keyed by the score, so keys with
 * equal scores overwrote one another and were lost from the report. The
 * entries are now sorted instead; ties keep the iteration order of
 * {@code words}.
 *
 * @param words map from String to {@code Count}
 */
static void printWords(Map words) {
    System.out.println();
    System.out.println("Finding largest");
    System.out.println();

    List ranked = new ArrayList(words.entrySet());
    // Collections.sort is stable, so equal scores stay in map order.
    Collections.sort(ranked, new Comparator() {
        public int compare(Object a, Object b) {
            int scoreA = score((Map.Entry) a);
            int scoreB = score((Map.Entry) b);
            // Descending: the larger score sorts first.
            return scoreA > scoreB ? -1 : (scoreA < scoreB ? 1 : 0);
        }
    });

    // Same cut-off as before (the old loop stopped once counter++ exceeded 50).
    final int limit = Math.min(ranked.size(), 51);
    for (int row = 0; row < limit; ++row) {
        Map.Entry entry = (Map.Entry) ranked.get(row);
        System.out.println(((String) entry.getKey()) + ":\t" + score(entry));
    }
}

/** Score of one entry: occurrence count times key length. */
private static int score(Map.Entry entry) {
    String word = (String) entry.getKey();
    Count count = (Count) entry.getValue();
    return count.count * word.length();
}
/**
 * Walks all code points, writes the short name of each represented one to
 * "ShortNames.txt", feeds the words of every short name to {@link #stash},
 * and prints size statistics plus the most frequent words and word sequences.
 *
 * <p>Changes from the previous version:
 * <ul>
 * <li>the scan now includes U+10FFFF (the loop bound was exclusive);</li>
 * <li>the log file is closed even if the scan throws;</li>
 * <li>the first (currently unreachable) compaction loop appends the
 *     "NO RT" detail when the round trip fails, matching the second loop.</li>
 * </ul>
 *
 * @throws IOException if the log file cannot be opened or a UCD lookup fails
 */
static void collectWords() throws IOException {
    String fname = "ShortNames.txt";
    System.out.println("Writing " + fname);
    PrintWriter log = Utility.openPrintWriter(fname, Utility.LATIN1_WINDOWS);

    System.out.println("Gathering data");
    //Counter counter = new Counter();
    String[] parts = new String[100];
    //int total = 0;
    int used = 0;
    int sum = 0;
    int longSum = 0;
    try {
        // Code points run from 0 through 0x10FFFF inclusive.
        for (int cp = 0; cp <= 0x10FFFF; ++cp) {
            if (!Default.ucd().isAllocated(cp)) continue;
            if (Default.ucd().hasComputableName(cp)) continue;
            Utility.dot(cp);
            String name;
            if (Default.ucd().isRepresented(cp)) {
                name = Default.ucd().getName(cp, SHORT);
                log.println(Utility.hex(cp) + " " + name);
                // Sanity check: expanding the abbreviations again should give the same name.
                String backName = Utility.replace(name, UCD_Names.NAME_ABBREVIATIONS, false);
                if (!name.equals(backName)) {
                    System.out.println("Failed to recreate: " + name + ", " + backName);
                }
            }
            // check the string, and its decomposition. This is just to get a good count.
            String str = UTF16.valueOf(cp);
            if (false && !Default.nfkd().isNormalized(cp)) { // decomposition deliberately disabled
                str += Default.nfkd().normalize(cp);
            }
            int cp2;
            for (int i = 0; i < str.length(); i += UTF16.getCharCount(cp2)) {
                cp2 = UTF16.charAt(str, i);
                name = Default.ucd().getName(cp2, SHORT);
                if (name == null) continue;
                //name = transform(name);
                sum += name.length();
                longSum += Default.ucd().getName(cp2).length();
                used++;
                // replace numbers & letters
                int len = Utility.split(name, ' ', parts);
                for (int j = 0; j < len; ++j) {
                    stash(parts[j], j);
                }
                lines.add(name);
            }
        }
    } finally {
        log.close();
    }
    Utility.fixDot();
    //System.out.println("Overhead: " + (lastLink - used) + ", " + ((lastLink - used) * 100 / used) + "%");
    //System.out.println("Strings: " + sum + ", " + (lastLink*4));
    System.out.println("Short Names sum: " + sum + ", average: " + (sum + 0.0)/used);
    System.out.println("Long Names sum: " + longSum + ", average: " + (longSum + 0.0)/used);
    System.out.println("Savings: " + (1 - (sum+0.0)/longSum));
    printWords(words);
    printWords(doubleWords);
    printWords(tripleWords);
    printWords(quadWords);

    // Everything below is switched off; it is kept compilable for experiments.
    if (true) return;

    System.out.println();
    System.out.println("Compacting Words");
    System.out.println();
    Iterator it = words.keySet().iterator();
    int i = 0;
    while (it.hasNext()) {
        String s = (String) it.next();
        int test = CompactName.addWord(s);
        String round = CompactName.stringFromToken(test);
        boolean goesRound = round.equals(s);
        if (false || !goesRound) System.out.println("Compacting: '" + s + "': " + i++ + "(" + CompactName.lastToken + ")"
            + (!goesRound ? ": NO RT: '" + round + "'" : ""));
    }

    System.out.println();
    System.out.println("Compacting Lines");
    System.out.println();
    CompactName.startLines();
    it = lines.iterator();
    i = 0;
    while (it.hasNext()) {
        String s = (String) it.next();
        if (s.equals("< BELL >")) {
            System.out.println("DEBUG");
        }
        int test = CompactName.addLine(s);
        String round = CompactName.stringFromToken(test);
        boolean goesRound = round.equals(s);
        if (false || !goesRound) System.out.println("Compacting: '" + s + "': " + i++ + "(" + CompactName.lastToken + ")"
            + (!goesRound ? ": NO RT: '" + round + "'" : ""));
    }
    /*System.out.println("Printing Compact Forms");
    for (int i = 0; i < CompactName.lastToken; ++i) {
    String s = CompactName.stringFromToken(i);
    System.out.println(i + ": '" + s + "'");
    }*/
    System.out.println("Strings: " + sum
        + ", " + (CompactName.spacedMinimum*4)
        + ", " + (CompactName.lastToken*4)
    );
}
/*
Set stuff = new TreeSet();
for (int i = 0; i < letters.length; ++i) {
if (letters[i] != 0) {
stuff.add(new Integer((letters[i] << 8) + i));
}
}
it = stuff.iterator();
while (it.hasNext()) {
int in = ((Integer) it.next()).intValue();
System.out.println((char)(in & 0xFF) + ":\t" + String.valueOf(in >> 8));
}
int r = addString(name);
if (!DEBUG && !rname.equals(name)) {
System.out.println("\tNo Round Trip: '" + rname + "'");
}
*/
// String -> Integer index into links[]; written by putString(), read by getInt().
static Map stringToInt = new HashMap();
// NOTE(review): not referenced by any live code visible in this class.
static Map intToString = new HashMap();
// Character -> small token number, indexed by char value up to 'Z'; 0 means "no token".
static final int[] remap = new int['Z'+1];
// One more than the largest token number handed out below.
static final int maxToken;
static {
// Tokens are handed out from 1 in this order: space, '-', '>', '<', 'A'..'Z', '0'..'9'.
// That is 4 + 26 + 10 = 40 tokens, so counter (and maxToken) ends at 41.
int counter = 1;
remap[' '] = counter++;
remap['-'] = counter++;
remap['>'] = counter++;
remap['<'] = counter++;
for (int i = 'A'; i <= 'Z'; ++i) {
remap[i] = counter++;
}
for (int i = '0'; i <= '9'; ++i) {
remap[i] = counter++;
}
maxToken = counter;
}
// Inverse of remap: token number -> one-character string (token 0 -> empty string).
static final String[] unmap = new String[maxToken];
static {
unmap[0] = "";
for (int i = 0; i < remap.length; ++i) {
int x = remap[i];
if (x != 0) unmap[x] = String.valueOf((char)i);
}
}
// Link table: each entry packs a lead token in the high 16 bits and a trail token
// in the low 16 bits (see putString() and lookup()). Fixed capacity of 40000 entries.
static int[] links = new int[40000];
// NOTE(review): not referenced by any live code visible in this class.
static final int linkStart = 0;
// Index of the next free slot in links[].
static int lastLink = 0;
// Token values above this bound (after masking with 0x7FFF) are literals that encode
// up to two characters as first * maxToken + second; see getInt() and lookup().
static final int LITERAL_BOUND = 0x7FFF - maxToken * maxToken;
/**
 * Tells whether a token is a literal, i.e. whether its low 15 bits lie above
 * {@code LITERAL_BOUND}. Bit 0x8000 is ignored.
 *
 * @param i the token to test
 * @return true if the token is in the literal range
 */
static boolean isLiteral(int i) {
    final int low15 = i & 0x7FFF;
    return low15 > LITERAL_BOUND;
}
/**
 * Expands a token back into its string.
 *
 * <p>Bit 0x8000 means "append a space". Of the remaining bits, values above
 * {@code LITERAL_BOUND} decode directly to up to two characters via
 * {@code unmap}; all other values index {@code links}, whose entry is split
 * into a lead token (high 16 bits) and a trail token (low 16 bits) that are
 * expanded recursively.
 *
 * @param i the token to expand
 * @return the decoded string
 */
static String lookup(int i) {
    final boolean trailingSpace = (i & 0x8000) != 0;
    if (trailingSpace) {
        i ^= 0x8000; // drop the flag, keep the payload
    }

    String result;
    if (i <= LITERAL_BOUND) {
        final int packed = links[i];
        //if (DEBUG) System.out.println("lead: " + lead + ", trail: " + trail);
        result = lookup(packed >>> 16) + lookup(packed & 0xFFFF);
    } else {
        i -= LITERAL_BOUND;
        result = unmap[i / maxToken] + unmap[i % maxToken];
    }

    if (trailingSpace) {
        result += ' ';
    }
    if (DEBUG) System.out.println("token: " + i + " => '" + result + "'");
    return result;
}
/**
 * Returns the token for a string, if one exists.
 *
 * <p>The empty string is token 0. Strings of one or two characters are
 * encoded directly as literals above {@code LITERAL_BOUND}. Longer strings
 * are looked up in {@code stringToInt}.
 *
 * @param s the string to look up
 * @return the token, or -1 if a string of three or more characters is not registered
 */
static int getInt(String s) {
    final int length = s.length();
    if (length == 0) {
        return 0;
    }
    if (length <= 2) {
        final int firstToken = remap[s.charAt(0)];
        // A missing second character is looked up as index 0.
        final int secondToken = remap[length == 2 ? s.charAt(1) : 0];
        return LITERAL_BOUND + (firstToken * maxToken + secondToken);
    }
    final Integer known = (Integer) stringToInt.get(s);
    return known == null ? -1 : known.intValue();
}
/**
 * Registers {@code s} as the concatenation of two tokens and returns the
 * index of the new link.
 *
 * @param s     the full string being registered
 * @param lead  token for the first part (stored in the high 16 bits)
 * @param trail token for the second part (stored in the low 16 bits)
 * @return index of the new entry in {@code links}
 * @throws IllegalArgumentException if {@code s} is already registered
 */
static int putString(String s, int lead, int trail) {
    if (stringToInt.get(s) != null) {
        throw new IllegalArgumentException();
    }

    final int slot = lastLink++;
    links[slot] = (lead << 16) + (trail & 0xFFFF);

    if (DEBUG) {
        System.out.println("'" + s + "', link[" + slot + "] = lead: " + lead + ", trail: " + trail);
        // Decode straight away to confirm the pair reproduces s.
        final String decoded = lookup(slot);
        if (!decoded.equals(s)) {
            System.out.println("\t*** No Round Trip: '" + decoded + "'");
        }
    }

    stringToInt.put(s, new Integer(slot));
    return slot;
}
// s cannot have a trailing space. Must be <,>,-,SPACE,0-9,A-Z
/**
 * Returns the token for {@code s}, registering it (and, recursively, any
 * parts that are not yet known) as a pair of tokens via putString().
 *
 * Strategy, in order:
 * 1. if getInt() already has a token for s, return it;
 * 2. try every interior split point; if both halves already have tokens, link them;
 * 3. otherwise pick the split whose already-linked half is longest and recurse on the other half;
 * 4. otherwise split before the last word, or near the middle, and recurse on both halves.
 */
static int addString(String s) {
int result = getInt(s);
if (result != -1) return result;
int limit = s.length() - 1;
// Best split seen so far over all positions, and over positions that follow a space.
int bestLen = 0;
int best_i = 0;
int bestSpaceLen = 0;
int bestSpace_i = 0;
// Last split index whose preceding char is a space; -1 if none was seen.
int lastSpace = -1;
int spaceBits;
int endOfFirst;
// invariant. We break after a space if there is one.
for (int i = 1; i < limit; ++i) {
// c is the character just before the split point i.
char c = s.charAt(i-1);
spaceBits = 0;
endOfFirst = i;
if (c == ' ') {
// Leave the space out of the first part and record it as the 0x8000 flag instead.
lastSpace = i;
endOfFirst--;
spaceBits = 0x8000;
}
String firstPart = s.substring(0, endOfFirst);
String lastPart = s.substring(i);
// Debugging hook.
if (firstPart.equals("<START OF ")) {
System.out.println("HUH");
}
int lead = getInt(firstPart);
int trail = getInt(lastPart);
if (lead >= 0 && trail >= 0) { // if both match, return immediately with pair
if (DEBUG) System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "")
+ "' # '" + lastPart + "' MATCH BOTH");
return putString(s, spaceBits | lead, trail);
}
// isLiteral(-1) is true (-1 & 0x7FFF == 0x7FFF), so the two tests below only
// pass for parts that already have an entry in the link table.
if (!isLiteral(lead)) {
if (i > bestLen) {
bestLen = i;
best_i = i;
}
// NOTE(review): the space variant records i + 1 where the loop itself splits at i -- confirm intended.
if (i > bestSpaceLen && c == ' ') {
bestSpaceLen = i;
bestSpace_i = i + 1;
}
}
int end_i = s.length() - i;
if (!isLiteral(trail)) {
if (end_i > bestLen) {
bestLen = end_i;
best_i = i;
}
if (end_i > bestSpaceLen && c == ' ') {
bestSpaceLen = end_i;
bestSpace_i = i + 1;
}
}
}
// If any split followed a space, only the space-based candidate is kept
// (even when that candidate is empty, i.e. bestSpaceLen == 0).
if (lastSpace >= 0) {
bestLen = bestSpaceLen;
best_i = bestSpace_i;
}
spaceBits = 0;
if (bestLen > 0) { // if one matches, recurse -- and return pair
endOfFirst = best_i;
// lastSpace is never 0 (the loop starts at i = 1), so this is the same test as lastSpace >= 0.
if (lastSpace > 0) {
--endOfFirst;
spaceBits = 0x8000;
}
String firstPart = s.substring(0, endOfFirst);
String lastPart = s.substring(best_i);
int lead = getInt(firstPart);
int trail = getInt(lastPart);
if (lead >= 0) {
if (DEBUG) System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "")
+ "' # '" + lastPart + "' MATCH FIRST");
return putString(s, spaceBits | lead, addString(lastPart));
} else {
// NOTE(review): trail is used without checking that it is >= 0 -- verify it cannot be -1 here.
if (DEBUG) System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "")
+ "' # '" + lastPart + "' MATCH SECOND");
return putString(s, spaceBits | addString(firstPart), trail);
}
}
// otherwise, we failed to find anything. Then break before the last word, if there is one
// otherwise break in the middle (but at even value)
if (lastSpace >= 0) {
best_i = lastSpace;
endOfFirst = lastSpace - 1;
spaceBits = 0x8000;
} else {
endOfFirst = best_i = ((s.length() + 1) / 4) * 2;
}
String firstPart = s.substring(0, endOfFirst);
String lastPart = s.substring(best_i);
if (DEBUG) System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "")
+ "' # '" + lastPart + "' FALLBACK");
// Neither half is known yet: register both recursively, then link them.
return putString(s, spaceBits | addString(firstPart), addString(lastPart));
}
/*
static int addCompression(String s) {
Object in = stringToInt.get(s);
if (in != null) return ((Integer) in).intValue();
// find best match, recursively
int bestBreak = -1;
boolean pickFirst = false;
for (int i = 1; i < s.length() - 1; ++i) {
char c = s.charAt(i);
if (c == ' ' || c == '-') {
Object pos1 = stringToInt.get(s.substring(0,i+1));
//Object pos23 = stringToInt.get(s..substring(i));
if (pos2 >= 0 && pos3 >= 0) {
fullToCompressed.put(value, new Integer(index + reserved));
continue main;
}
if (pos2 >= 0) {
if (k > bestBreak) {
bestBreak = k;
pickFirst = true;
}
} else if (pos3 >= 0) {
if (value.length() - k > bestBreak) {
bestBreak = k;
pickFirst = false;
}
}
}
}
}
}
static void gatherData() throws IOException {
System.out.println("Gathering data");
Counter counter = new Counter();
String[] parts = new String[100];
String[] parts2 = new String[100];
int total = 0;
for (int i = 0; i < 0x10FFFF; ++i) {
//if ((i & 0xFF) == 0) System.out.println(Utility.hex(i));
if (!ucd.isRepresented(i)) continue;
String s = ucd.getName(i);
total += s.length();
int len = Utility.split(s, ' ', parts);
for (int j = 0; j < len; ++j) {
if (parts[j].indexOf('-') >= 0) {
// hyphen stuff
int len2 = Utility.split(parts[j], '-', parts2);
for (int k = 0; k < len2; ++k) {
if (k == len2 - 1) {
counter.add(parts2[k] + '-');
} else {
counter.add(parts2[k] + " ");
}
}
} else {
// normal
counter.add(parts[j] + " ");
}
}
}
System.out.println("Sorting data");
Map m = counter.extract();
System.out.println("Printing data");
PrintWriter log = new PrintWriter(
new BufferedWriter(
new OutputStreamWriter(
new FileOutputStream(GEN_DIR + "NameCompression.txt")),
32*1024));
log.println("total: " + total);
Iterator it = m.keySet().iterator();
String mondo = "";
int i = 0;
int strTotal = 0;
int index = 0;
Map fullToCompressed = new HashMap();
String mondoIndex = "";
main:
while (it.hasNext()) {
index++;
if ((i & 255) == 0) System.out.println("#" + i);
Counter.RWInteger key = (Counter.RWInteger) it.next();
String value = (String)m.get(key);
log.println(i++ + ": " + key + ": \"" + value + "\"");
strTotal += value.length();
// first 128 are the highest frequency, inc. space
if (index < 128 - SINGLES) {
mondo += value;
fullToCompressed.put(value, new String((char)(index + reserved)));
continue;
}
int pos = mondo.indexOf(value);
if (pos >= 0) {
// try splitting!
int bestBreak = -1;
boolean pickFirst = false;
if (value.length() > 2) for (int k = 1; k < value.length()-1; ++k) {
int pos2 = mondo.indexOf(value.substring(0,k) + " ");
int pos3 = mondo.indexOf(value.substring(k));
if (pos2 >= 0 && pos3 >= 0) {
fullToCompressed.put(value, new Integer(index + reserved));
continue main;
}
if (pos2 >= 0) {
if (k > bestBreak) {
bestBreak = k;
pickFirst = true;
}
} else if (pos3 >= 0) {
if (value.length() - k > bestBreak) {
bestBreak = k;
pickFirst = false;
}
}
}
if (bestBreak > 0) {
if (pickFirst) {
mondo += value.substring(bestBreak);
} else {
mondo += value.substring(0, bestBreak) + " ";
}
} else {
mondo += value;
}
}
// high bit on, means 2 bytes, look in array
}
log.println("strTotal: " + strTotal);
log.println("mondo: " + mondo.length());
int k = 80;
for (; k < mondo.length(); k += 80) {
log.println(mondo.substring(k-80, k));
}
log.println(mondo.substring(k-80)); // last line
log.close();
}
static int indexOf(StringBuffer target, String source) {
int targetLen = target.length() - source.length();
main:
for (int i = 0; i <= targetLen; ++i) {
for (int j = 0; j < source.length(); ++j) {
if (target.charAt(i) != source.charAt(j)) continue main;
}
return i;
}
return -1;
}
static final int SINGLES = 26 + 10 + 2;
*/
/*
static String decode(int x) {
if (x < SINGLES) {
if (x < 26) return String.valueOf(x + 'A');
if (x < 36) return String.valueOf(x - 26 + '0');
if (x == 36) return "-";
return " ";
}
if (x < binaryLimit) {
x =
*/
}

View File

@ -1,47 +0,0 @@
#
# Case Folding Properties
#
# This file is a supplement to the UnicodeData file.
# It provides a case folding mapping generated from the Unicode Character Database.
# If all characters are mapped according to the full mapping below, then
# case differences (according to UnicodeData.txt and SpecialCasing.txt)
# are eliminated.
#
# The data supports both implementations that require simple case foldings
# (where string lengths don't change), and implementations that allow full case folding
# (where string lengths may grow). Note that where they can be supported, the
# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match.
#
# All code points not listed in this file map to themselves.
#
# NOTE: case folding does not preserve normalization forms!
#
# For information on case folding, see
# UTR #21 Case Mappings, at http://www.unicode.org/unicode/reports/tr21/
#
# ================================================================================
# Format
# ================================================================================
# The entries in this file are in the following machine-readable format:
#
# <code>; <status>; <mapping>; # <name>
#
# The status field is:
# C: common case folding, common mappings shared by both simple and full mappings.
# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
# S: simple case folding, mappings to single characters where different from F.
# T: special case for uppercase I and dotted uppercase I
# - For non-Turkic languages, this mapping is normally not used.
# - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
# Note that the Turkic mappings do not maintain canonical equivalence without additional processing.
# See the discussions of case mapping in the Unicode Standard for more information.
#
# Usage:
# A. To do a simple case folding, use the mappings with status C + S.
# B. To do a full case folding, use the mappings with status C + F.
#
# The mappings with status T can be used or omitted depending on the desired case-folding
# behavior. (The default option is to exclude them.)
#
# =================================================================

View File

@ -1,47 +0,0 @@
#
# This file is used to test (1) case conversion, (2) case detection,
# and (3) case-insensitive matching.
# (1) is represented below by function names such as toLower(),
# (2) is represented below by function names such as isLower().
# (3) is represented below by the function name equalsCaseInsensitive().
# (The actual function names will vary depending on software language and/or library.)
#
# The test cases also check whether canonical equivalence is preserved
# by these functions.
#
# Format:
# <src> ; <lower> ; <upper> ; <title> ; <fold> (# <comment>)?
#
# Test:
#
# A. For each line:
# 1. Verify the following equalities:
# lower == toLower(src)
# upper == toUpper(src)
# title == toTitle(src)
# fold == toFold(src)
# 2. Verify that all of the following are true:
# isLower(toLower(lower))
# isUpper(toUpper(upper))
# isTitle(toTitle(title))
# isFold(toFold(fold))
# 3. Verify that all of the following are true:
# equalsCaseInsensitive(src, lower)
# equalsCaseInsensitive(src, upper)
# equalsCaseInsensitive(src, title)
# equalsCaseInsensitive(src, fold)
#
# B. For each code point that is NOT listed as a src:
# 1. Verify the following equalities:
# src == toLower(src) == toUpper(src) == toTitle(src) == toFold(src)
# 2. Verify that all of the following are true:
# isLower(toLower(lower))
# isUpper(toUpper(upper))
# isTitle(toTitle(title))
# isFold(toFold(fold))
# 3. Verify that all of the following are true:
# equalsCaseInsensitive(src, lower)
# equalsCaseInsensitive(src, upper)
# equalsCaseInsensitive(src, title)
# equalsCaseInsensitive(src, fold)
#

View File

@ -1,25 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Charts.java,v $
* $Date: 2002/10/05 01:28:58 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import com.ibm.icu.text.UnicodeSet;
import java.io.*;
import java.util.*;
import com.ibm.icu.text.UTF16;
import com.ibm.text.utility.*;
/**
 * Empty placeholder class: it declares no fields, constructors, or methods.
 */
public class Charts {
}

View File

@ -1,351 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/CheckCollator.java,v $
* $Date: 2002/08/09 23:56:24 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
// http://java.sun.com/j2se/1.3/docs/guide/intl/encoding.doc.html
package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import java.text.NumberFormat;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
 * Quick-and-dirty benchmark comparing the JDK collator ({@code java.text.Collator})
 * with the ICU4J collator ({@code com.ibm.icu.text.Collator}).
 * <p>
 * For each locale / data-file pair it reports sort-key size, sort-key generation time,
 * (optionally) raw compare time, binary-search time and sort time. Everything is printed
 * to {@code System.out}; each "new" figure is followed by its ratio to the matching
 * "old" figure.
 */
abstract public class CheckCollator {
    /** Path prefix of the test data files; the data-set name and ".txt" are appended. */
    static final String PREFIX = "C:\\ICUInternal\\icu4c\\collation-perf-data\\TestNames_";
    /** When true, the quadratic raw-compare benchmark is run as well. */
    static final boolean DO_RAW = false;
    /** Formatter for timings and sizes (at most two fraction digits). */
    static final NumberFormat nf = NumberFormat.getInstance();
    /** Formatter for new/old ratios. */
    static final NumberFormat percent = NumberFormat.getPercentInstance();
    static {
        nf.setMaximumFractionDigits(2);
    }

    /**
     * Runs the benchmark for a fixed list of locale / data-file pairs.
     *
     * @param args ignored
     * @throws IOException if a data file cannot be read or contains no data lines
     */
    public static void main(String[] args) throws IOException {
        // later, drive off of args
        // choices are: Asian, Chinese, Japanese, Japanese_h, Japanese_k, Korean, Latin, Russian, Thai
        //test(Locale.KOREAN, "Korean");
        test(Locale.ENGLISH, "Latin");
        test(Locale.FRENCH, "Latin");
        test(Locale.JAPANESE, "Japanese");
    }

    /**
     * Reads every non-empty data line of the given file.
     * The stream is always closed, even when reading fails.
     *
     * @param fileName full path of the data file
     * @return the lines in file order
     * @throws IOException if the file cannot be opened or read
     */
    private static ArrayList readTestLines(String fileName) throws IOException {
        ArrayList lines = new ArrayList();
        FileInputStream fis = new FileInputStream(fileName);
        try {
            BufferedReader br = new BufferedReader(
                new InputStreamReader(fis, "UnicodeLittle"), 32*1024);
            int counter = 0;
            while (true) {
                String line = Utility.readDataLine(br);
                if (line == null) break;
                if (line.length() == 0) continue;
                Utility.dot(counter++);
                lines.add(line);
            }
        } finally {
            // closing the underlying stream is sufficient for a read-only chain
            fis.close();
        }
        return lines;
    }

    /** Converts an elapsed wall-clock interval in milliseconds to microseconds per iteration. */
    private static double microsPerIteration(double startMillis, double endMillis, int iterations) {
        return 1000 * (endMillis - startMillis) / iterations;
    }

    /**
     * Benchmarks the JDK and ICU4J collators for one locale on one data file and prints
     * the results.
     *
     * @param loc  locale used to create both collators
     * @param name data-set name; the file read is {@code PREFIX + name + ".txt"}
     * @throws IOException if the data file cannot be read or has no data lines
     */
    public static void test(Locale loc, String name) throws IOException {
        System.out.println();
        System.out.println("Testing " + loc.getDisplayName() + ", file: " + name);
        System.out.println();

        // get test data
        String fileName = PREFIX + name + ".txt";
        ArrayList list = readTestLines(fileName);
        System.out.println("Read " + list.size() + " lines in file");
        if (list.isEmpty()) {
            // doubling an empty list below would never reach the limit (endless loop)
            throw new IOException("No test data found in " + fileName);
        }

        int limit = 800; // put a limit on it to save time
        // pump it up if there aren't very many
        while (list.size() < limit) {
            list.addAll(list);
        }
        int size = list.size();
        // later, adjust these so we always get a reasonable number of tries
        int extraIterations = 200;
        if (size > limit) size = limit;
        String[] tests = new String[size];
        for (int i = 0; i < size; ++i) {
            tests[i] = (String) list.get(i);
        }

        // get collators
        com.ibm.icu.text.Collator newCol = com.ibm.icu.text.Collator.getInstance(loc);
        java.text.Collator oldCol = java.text.Collator.getInstance(loc);
        double startTime, endTime;
        double delta, oldDelta;
        String probe;

        // load classes at least once before starting
        newCol.compare("a", "b");
        oldCol.compare("a", "b");

        // ================================================
        // check sort key size
        int stringSize = 0, newSize = 0, oldSize = 0;
        for (int i = 0; i < size; ++i) {
            stringSize += tests[i].length() * 2;
            byte[] newKey = newCol.getCollationKey(tests[i]).toByteArray();
            newSize += newKey.length;
            byte[] oldKey = oldCol.getCollationKey(tests[i]).toByteArray();
            oldSize += oldKey.length;
        }
        delta = stringSize/(size + 0.0);
        System.out.println("string size: " + nf.format(delta) + " bytes per key");
        System.out.println();
        delta = oldDelta = (oldSize/(size + 0.0));
        System.out.println("old sortkey size: " + nf.format(delta) + " bytes per key ");
        delta = (newSize/(size + 0.0));
        System.out.println("new sortkey size: " + nf.format(delta) + " bytes per key " + percent.format(delta/oldDelta));
        System.out.println();

        // ================================================
        // Sort Key: old time
        // get overhead time (empty loop)
        int counter = 0;
        startTime = System.currentTimeMillis();
        for (int i = 0; i < size; ++i) {
            for (int j = 0; j < size; ++j) {
                counter++;
            }
        }
        endTime = System.currentTimeMillis();
        double overhead = microsPerIteration(startTime, endTime, counter);
        // print the microsecond figure, matching the "micros" label
        System.out.println("overhead: " + nf.format(overhead) + " micros");

        counter = 0;
        startTime = System.currentTimeMillis();
        for (int i = 0; i < size; ++i) {
            probe = tests[i];
            for (int k = 0; k < extraIterations; ++k) {
                oldCol.getCollationKey(probe);
                counter++;
            }
        }
        endTime = System.currentTimeMillis();
        oldDelta = delta = microsPerIteration(startTime, endTime, counter) - overhead;
        System.out.println("Old sort key time: " + nf.format(delta)
            + " micros (" + counter + " iterations)");

        // Sort Key: new time
        counter = 0;
        startTime = System.currentTimeMillis();
        for (int i = 0; i < size; ++i) {
            probe = tests[i];
            for (int k = 0; k < extraIterations; ++k) {
                newCol.getCollationKey(probe);
                counter++;
            }
        }
        endTime = System.currentTimeMillis();
        delta = microsPerIteration(startTime, endTime, counter) - overhead;
        System.out.println("New sort key time: " + nf.format(delta)
            + " micros (" + counter + " iterations) " + percent.format(delta/oldDelta));
        System.out.println();

        // ================================================
        // Raw Compare
        if (DO_RAW) {
            // get overhead time (plain String.compareTo)
            counter = 0;
            startTime = System.currentTimeMillis();
            int opt = 0; // to keep the compiler from optimizing out
            for (int i = 0; i < size; ++i) {
                probe = tests[i];
                for (int j = 0; j < size; ++j) {
                    opt ^= probe.compareTo(tests[j]);
                    counter++;
                }
            }
            endTime = System.currentTimeMillis();
            overhead = microsPerIteration(startTime, endTime, counter);
            System.out.println("overhead: " + nf.format(overhead) + " micros");

            // Raw Compare: old time
            counter = 0;
            startTime = System.currentTimeMillis();
            for (int i = 0; i < size; ++i) {
                probe = tests[i];
                for (int j = 0; j < size; ++j) {
                    opt ^= oldCol.compare(probe, tests[j]);
                    counter++;
                }
            }
            endTime = System.currentTimeMillis();
            oldDelta = delta = microsPerIteration(startTime, endTime, counter) - overhead;
            System.out.println("Old raw compare time: " + nf.format(delta)
                + " micros (" + counter + " iterations)");

            // Raw Compare: new time
            counter = 0;
            startTime = System.currentTimeMillis();
            for (int i = 0; i < size; ++i) {
                probe = tests[i];
                for (int j = 0; j < size; ++j) {
                    opt ^= newCol.compare(probe, tests[j]);
                    counter++;
                }
            }
            endTime = System.currentTimeMillis();
            delta = microsPerIteration(startTime, endTime, counter) - overhead;
            System.out.println("New raw compare time: " + nf.format(delta)
                + " micros (" + counter + " iterations) " + percent.format(delta/oldDelta));
            System.out.println();
        }

        // ================================================
        // Binary Search
        // note: I don't worry about getting the binary search precisely right, since I just want to
        // see which strings would get compared.
        // In all three runs the array is sorted BEFORE the clock starts, so that only the
        // searches are timed and the three figures are comparable.

        // overhead (natural String order)
        int iterations = (size * extraIterations);
        Arrays.sort(tests);
        int opt2 = 0; // keep from optimizing out
        startTime = System.currentTimeMillis();
        for (int i = 0; i < size; ++i) {
            probe = tests[i];
            for (int k = 0; k < extraIterations; ++k) {
                opt2 ^= Arrays.binarySearch(tests, probe);
            }
        }
        endTime = System.currentTimeMillis();
        overhead = delta = microsPerIteration(startTime, endTime, iterations);
        System.out.println("Overhead: " + nf.format(delta)
            + " micros (" + iterations + " iterations)");

        // old time
        Arrays.sort(tests, oldCol);
        startTime = System.currentTimeMillis();
        for (int i = 0; i < size; ++i) {
            probe = tests[i];
            for (int k = 0; k < extraIterations; ++k) {
                opt2 ^= Arrays.binarySearch(tests, probe, oldCol);
            }
        }
        endTime = System.currentTimeMillis();
        oldDelta = delta = microsPerIteration(startTime, endTime, iterations) - overhead;
        System.out.println("Old binary search time: " + nf.format(delta)
            + " micros (" + iterations + " iterations)");

        // new time
        Arrays.sort(tests, newCol);
        startTime = System.currentTimeMillis();
        for (int i = 0; i < size; ++i) {
            probe = tests[i];
            for (int k = 0; k < extraIterations; ++k) {
                opt2 ^= Arrays.binarySearch(tests, probe, newCol);
            }
        }
        endTime = System.currentTimeMillis();
        delta = microsPerIteration(startTime, endTime, iterations) - overhead;
        System.out.println("New binary search time: " + nf.format(delta)
            + " micros (" + iterations + " iterations) " + percent.format(delta/oldDelta));
        System.out.println();

        // ================================================
        // Sort
        String[] sortTests = (String[]) tests.clone();
        extraIterations = 5;
        iterations = (size * extraIterations);

        // overhead (copy + natural-order sort)
        startTime = System.currentTimeMillis();
        for (int i = 0; i < size; ++i) {
            for (int k = 0; k < extraIterations; ++k) {
                System.arraycopy(tests, 0, sortTests, 0, tests.length); // copy array
                Arrays.sort(sortTests);
            }
        }
        endTime = System.currentTimeMillis();
        overhead = delta = microsPerIteration(startTime, endTime, iterations);
        System.out.println("overhead: " + nf.format(delta)
            + " micros (" + iterations + " iterations)");

        // old time
        startTime = System.currentTimeMillis();
        for (int i = 0; i < size; ++i) {
            for (int k = 0; k < extraIterations; ++k) {
                System.arraycopy(tests, 0, sortTests, 0, tests.length); // copy array
                Arrays.sort(sortTests, oldCol);
            }
        }
        endTime = System.currentTimeMillis();
        oldDelta = delta = microsPerIteration(startTime, endTime, iterations) - overhead;
        System.out.println("Old sort time: " + nf.format(delta)
            + " micros (" + iterations + " iterations)");

        // new time
        startTime = System.currentTimeMillis();
        for (int i = 0; i < size; ++i) {
            for (int k = 0; k < extraIterations; ++k) {
                System.arraycopy(tests, 0, sortTests, 0, tests.length); // copy array
                Arrays.sort(sortTests, newCol);
            }
        }
        endTime = System.currentTimeMillis();
        delta = microsPerIteration(startTime, endTime, iterations) - overhead;
        System.out.println("New sort time: " + nf.format(delta)
            + " micros (" + iterations + " iterations) " + percent.format(delta/oldDelta));
    }
}

View File

@ -1,327 +0,0 @@
package com.ibm.text.UCD;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.UnicodeLabel;
import com.ibm.icu.dev.test.util.UnicodeProperty;
import com.ibm.icu.dev.test.util.ICUPropertyFactory;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;
import com.ibm.text.utility.Utility;
/**
 * Consistency checks between ICU's property data and the tool's own Unicode property
 * source, plus a listing of collation keyword variants. All output goes to
 * {@code System.out} (and, for {@link #checkUCD()}, to a generated text file).
 */
public class CheckICU {
    static final BagFormatter bf = new BagFormatter();

    /**
     * Entry point: prints "Start", runs {@link #test()}, prints "End".
     *
     * @param args ignored
     * @throws IOException propagated from {@link #test()}
     */
    public static void main(String[] args) throws IOException {
        System.out.println("Start");
        test();
        System.out.println("End");
    }

    /** Code points whose values differ in the property currently being tested. */
    static UnicodeSet itemFailures;
    static ICUPropertyFactory icuFactory;
    static ToolUnicodePropertySource toolFactory;

    /** Label that delegates to a property and replaces '_' with ' ' in its values. */
    static class ReplaceLabel extends UnicodeLabel {
        UnicodeProperty p;

        ReplaceLabel(UnicodeProperty p) {
            this.p = p;
        }

        public String getValue(int codepoint, boolean isShort) {
            return p.getValue(codepoint, isShort).replace('_',' ');
        }

        public int getMaxWidth(boolean v) {
            return p.getMaxWidth(v);
        }
    }

    /**
     * Runs the checks. As written, only {@link #checkAvailable()} executes; the
     * {@code if (true) return;} below deliberately short-circuits the property comparison.
     *
     * @throws IOException if writing generated output fails
     */
    public static void test() throws IOException {
        checkAvailable();
        // deliberate early exit; "if (true)" keeps the remaining code compilable
        if (true) return;
        checkUCD();
        itemFailures = new UnicodeSet();
        icuFactory = ICUPropertyFactory.make();
        toolFactory = ToolUnicodePropertySource.make("4.0.0");

        // properties to test first; when non-empty, nothing else is tested
        String[] quickList = {
            // "Canonical_Combining_Class",
            // "Script", "Bidi_Mirroring_Glyph", "Case_Folding",
            //"Numeric_Value"
        };
        for (int i = 0; i < quickList.length; ++i) {
            testProperty(quickList[i], -1);
        }
        if (quickList.length > 0) return;

        Collection availableTool = toolFactory.getAvailableNames();
        Collection availableICU = icuFactory.getAvailableNames();
        System.out.println(showDifferences("Property Aliases", "ICU", availableICU, "Tool", availableTool));
        Collection common = new TreeSet(availableICU);
        common.retainAll(availableTool);

        // test the common properties, grouped by property type
        for (int j = UnicodeProperty.BINARY; j < UnicodeProperty.LIMIT_TYPE; ++j) {
            System.out.println();
            System.out.println(UnicodeProperty.getTypeName(j));
            Iterator it = common.iterator();
            while (it.hasNext()) {
                String prop = (String)it.next();
                testProperty(prop, j);
            }
        }
    }

    /**
     * Prints all collation keyword=value pairs, then the collation variants for a fixed
     * set of locales (nl, de, de_DE, zh_TW).
     */
    private static void checkAvailable() {
        //generateFile("4.0.0", "DerivedCombiningClass");
        //generateFile("4.0.0", "DerivedCoreProperties");
        ULocale[] locales = Collator.getAvailableULocales();
        System.out.println("Collation");
        System.out.println("Possible keyword=values pairs:");
        {
            String[] keywords = Collator.getKeywords();
            for (int i = 0; i < keywords.length; ++i) {
                String[] values = Collator.getKeywordValues(keywords[i]);
                for (int j = 0; j < values.length; ++j) {
                    System.out.println("\t" + keywords[i] + "=" + values[j]);
                }
            }
        }
        System.out.println("Differing Collators:");
        Set testSet = new HashSet(Arrays.asList(new String[] {
            "nl", "de", "de_DE", "zh_TW"
        }));
        for (int k = 0; k < locales.length; ++k) {
            if (!testSet.contains(locales[k].toString())) continue;
            showCollationVariants(locales[k]);
        }
    }

    /**
     * Verbose variant listing: for every collation keyword and value, prints the
     * functional-equivalent locale as a Java-like expression, and additionally a
     * tab-separated line whenever it differs from the locale's base equivalent.
     *
     * @param locale locale to examine
     */
    private static void showCollationVariants(ULocale locale) {
        String[] keywords = Collator.getKeywords();
        System.out.println(locale.getDisplayName(ULocale.ENGLISH) + " [" + locale + "]");
        for (int i = 0; i < keywords.length; ++i) {
            ULocale base = Collator.getFunctionalEquivalent(keywords[i],
                locale
                //new ULocale(locale + "@" + keywords[i] + "=standard")
                );
            System.out.println("\"" + base + "\" == Collator.getFunctionalEquivalent(\"" + keywords[i] + "\", \"" + locale + "\");");
            String[] values = Collator.getKeywordValues(keywords[i]);
            for (int j = 0; j < values.length; ++j) {
                ULocale other = Collator.getFunctionalEquivalent(keywords[i],
                    new ULocale(locale + "@" + keywords[i] + "=" + values[j]));
                // closing "))" balances both getFunctionalEquivalent( and new ULocale(
                System.out.println(
                    "\"" + other
                    + "\" == Collator.getFunctionalEquivalent(\"" + keywords[i]
                    + "\", new ULocale(\""
                    + locale + "@" + keywords[i] + "=" + values[j] + "\"));");
                // HACK: commented line should work but doesn't
                if (!other.equals(base)) {
                //if (other.toString().indexOf("@") >= 0) {
                    System.out.println("\t" + keywords[i] + "=" + values[j] + "; \t" + base + "; \t" + other);
                }
            }
        }
    }

    /**
     * Sample code that prints out the variants that 'make a difference' for a given locale.
     * To iterate through the locales, use Collator.getVariant
     *
     * @param locale locale to examine
     */
    private static void showCollationVariants2(ULocale locale) {
        String[] keywords = Collator.getKeywords();
        System.out.println(locale.getDisplayName(ULocale.ENGLISH) + " [" + locale + "]");
        for (int i = 0; i < keywords.length; ++i) {
            ULocale base = Collator.getFunctionalEquivalent(keywords[i], locale);
            String[] values = Collator.getKeywordValues(keywords[i]);
            for (int j = 0; j < values.length; ++j) {
                ULocale other = Collator.getFunctionalEquivalent(keywords[i],
                    new ULocale(locale + "@" + keywords[i] + "=" + values[j]));
                if (!other.equals(base)) {
                    System.out.println("\t" + keywords[i] + "=" + values[j] + "; \t" + base + "; \t" + other);
                }
            }
        }
    }

    /**
     * Classifies every code point as starter (combining class 0), NFC-trailing and/or
     * NFC-leading, and writes several set combinations to "Trailing.txt" in the
     * generation directory.
     *
     * @throws IOException if the output file cannot be opened
     */
    private static void checkUCD() throws IOException {
        UCD myUCD = UCD.make("4.0.0");
        Normalizer nfc = new Normalizer(Normalizer.NFC, "4.0.0");
        UnicodeSet leading = new UnicodeSet();
        UnicodeSet trailing = new UnicodeSet();
        UnicodeSet starter = new UnicodeSet();
        for (int i = 0; i <= 0x10FFFF; ++i) {
            if (myUCD.getCombiningClass(i) == 0) starter.add(i);
            if (nfc.isTrailing(i)) trailing.add(i);
            if (nfc.isLeading(i)) leading.add(i);
        }
        PrintWriter pw = bf.openUTF8Writer(UCD_Types.GEN_DIR, "Trailing.txt");
        try {
            pw.println("+Trailing+Starter");
            bf.showSetNames(pw, new UnicodeSet(trailing).retainAll(starter));
            pw.println("+Trailing-Starter");
            bf.showSetNames(pw, new UnicodeSet(trailing).removeAll(starter));
            pw.println("-Trailing-Starter");
            bf.showSetNames(pw, new UnicodeSet(trailing).complement().removeAll(starter));
            pw.println("+Trailing+Leading");
            bf.showSetNames(pw, new UnicodeSet(trailing).retainAll(leading));
            pw.println("+Trailing-Leading");
            bf.showSetNames(pw, new UnicodeSet(trailing).removeAll(leading));
        } finally {
            // release the file even if formatting fails part-way
            pw.close();
        }
    }

    /**
     * Compares one property between ICU and the tool: type, name aliases, available
     * values, and the value for every code point 0..10FFFF. Differences are printed,
     * including the first differing code point with both values.
     *
     * @param prop       property name to look up in both factories
     * @param typeFilter if non-negative, the property is skipped unless its ICU type
     *                   equals this value
     */
    private static void testProperty(String prop, int typeFilter) {
        UnicodeProperty icuProp = icuFactory.getProperty(prop);
        int icuType = icuProp.getType();
        if (typeFilter >= 0 && icuType != typeFilter) return;

        System.out.println();
        System.out.println("Testing: " + prop);
        UnicodeProperty toolProp = toolFactory.getProperty(prop);
        int toolType = toolProp.getType();
        if (icuType != toolType) {
            System.out.println("FAILURE Type: ICU: " + UnicodeProperty.getTypeName(icuType)
                + "\tTool: " + UnicodeProperty.getTypeName(toolType));
        }

        Collection icuAliases = icuProp.getNameAliases(new ArrayList());
        Collection toolAliases = toolProp.getNameAliases(new ArrayList());
        System.out.println(showDifferences("Aliases", "ICU", icuAliases, "Tool", toolAliases));
        icuAliases = icuProp.getAvailableValues(new ArrayList());
        toolAliases = toolProp.getAvailableValues(new ArrayList());
        System.out.println(showDifferences("Value Aliases", "ICU", icuAliases, "Tool", toolAliases));
        // TODO do property value aliases

        itemFailures.clear();
        String firstDiffICU = null, firstDiffTool = null, firstDiffCP = null;
        for (int i = 0; i <= 0x10FFFF; ++i) {
            String icuValue = icuProp.getValue(i);
            String toolValue = toolProp.getValue(i);
            if (!equals(icuValue, toolValue)) {
                itemFailures.add(i);
                // remember only the first mismatch for the report
                if (firstDiffCP == null) {
                    firstDiffICU = icuValue;
                    firstDiffTool = toolValue;
                    firstDiffCP = Utility.hex(i);
                }
            }
        }
        if (itemFailures.size() != 0) {
            System.out.println("FAILURE " + itemFailures.size() + " Differences: ");
            System.out.println(itemFailures.toPattern(true));
            if (firstDiffICU != null) firstDiffICU = bf.hex.transliterate(firstDiffICU);
            if (firstDiffTool != null) firstDiffTool = bf.hex.transliterate(firstDiffTool);
            System.out.println(firstDiffCP
                + "\tICU: <" + firstDiffICU
                + ">\tTool: <" + firstDiffTool + ">");
        }
        System.out.println("done");
        // do values later, and their aliases
    }

    /**
     * Null-safe equality.
     *
     * @return true if both are null, or if {@code a} is non-null and {@code a.equals(b)}
     */
    static boolean equals(Object a, Object b) {
        if (a == null) return b == null;
        return a.equals(b);
    }

    /**
     * Describes how two collections differ.
     * <p>
     * When they match, a single "title: name1 == name2: ..." line is returned. Otherwise
     * the result is a multi-line "FAILURE" report listing both collections, their
     * intersection, and each one-sided difference (empty sections are omitted).
     * A match requires that every element of {@code set1} is in {@code set2} (with no
     * duplicates in {@code set1}) and that {@code set2} has no element missing from
     * {@code set1}.
     *
     * @param title heading for the report
     * @param name1 display name of {@code set1}
     * @param set1  first collection
     * @param name2 display name of {@code set2}
     * @param set2  second collection
     * @return the one-line match message or the multi-line difference report
     */
    static public String showDifferences(
        String title,
        String name1,
        Collection set1,
        String name2,
        Collection set2) {

        Collection temp = new TreeSet(set1);
        temp.retainAll(set2);

        // temp is now the intersection. The size test alone only proves set1 is contained
        // in set2; containsAll additionally rules out extra elements in set2.
        if (set1.size() == temp.size() && temp.containsAll(set2)) {
            return title + ": " + name1 + " == " + name2 + ": " + bf.join(set1);
        }

        StringBuffer result = new StringBuffer();
        result.append(title + "\tFAILURE\r\n");
        result.append("\t" + name1 + " = " + bf.join(set1) + "\r\n");
        result.append("\t" + name2 + " = " + bf.join(set2) + "\r\n");

        // Collection has no clone, so we go with a Set, even though that
        // may not preserve order and duplicates
        if (temp.size() != 0) {
            result.append("\t" + name2 + " & " + name1 + ":\r\n");
            result.append("\t" + bf.join(temp));
            result.append("\r\n");
        }

        temp.clear();
        temp.addAll(set1);
        temp.removeAll(set2);
        if (temp.size() != 0) {
            result.append("\t" + name1 + " - " + name2 + ":\r\n");
            result.append("\t" + bf.join(temp));
            result.append("\r\n");
        }

        temp.clear();
        temp.addAll(set2);
        temp.removeAll(set1);
        if (temp.size() != 0) {
            result.append("\t" + name2 + " - " + name1 + ":\r\n");
            result.append("\t" + bf.join(temp));
            result.append("\r\n");
        }
        return result.toString();
    }
}

View File

@ -1,81 +0,0 @@
package com.ibm.text.UCD;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.text.DecimalFormat;
import com.ibm.icu.text.NumberFormat;
import com.ibm.icu.text.UTF16;
import com.ibm.text.utility.Pair;
import com.ibm.text.utility.Utility;
/**
 * Converts the per-character frequency data in "kHYPLCDPF.txt" into a table sorted by
 * descending frequency, written to "kHYPLCDPF_frequency.txt" in the same directory.
 */
public class ChineseFrequency {
    static final String DICT_DIR = "C:\\DATA\\dict\\";
    static NumberFormat percent = new DecimalFormat("0.000000%");
    static NumberFormat percent3 = new DecimalFormat("000.000000%");
    static NumberFormat number = new DecimalFormat("#,##0");

    /** Comparator that reverses the natural ordering of {@link Comparable} objects. */
    static class InverseCompareTo implements Comparator {
        public int compare(Object o1, Object o2) {
            // Swap the operands rather than negating the result: negating would
            // overflow if compareTo ever returned Integer.MIN_VALUE.
            return ((Comparable) o2).compareTo(o1);
        }
    }

    /**
     * Reads the frequency file and writes the sorted frequency table.
     * <p>
     * Each input line is {@code <hex code point> TAB <entries>}, where the entries are
     * comma-separated and each carries a count in parentheses; the counts of one line are
     * summed. Empty lines are skipped. The output columns are: rank, percentage of the
     * grand total, accumulated percentage, hex code point, and the character itself.
     *
     * @throws IOException if the input cannot be read or the output cannot be opened
     */
    public static void test() throws IOException {
        // ordered by (total, code point) descending
        Set freq_char = new TreeSet(new InverseCompareTo());
        double grandTotal = 0.0;

        BufferedReader br = BagFormatter.openUTF8Reader(DICT_DIR, "kHYPLCDPF.txt");
        try {
            while (true) {
                String line = br.readLine();
                if (line == null) break;
                if (line.length() == 0) continue; // tolerate blank lines
                String[] pieces = Utility.split(line,'\t');
                int cp = Integer.parseInt(pieces[0],16);
                String[] says = Utility.split(pieces[1],',');
                long total = 0;
                for (int i = 0; i < says.length; ++i) {
                    // the count is the text between '(' and ')'
                    int start = says[i].indexOf('(');
                    int end = says[i].indexOf(')');
                    long count = Long.parseLong(says[i].substring(start+1, end));
                    total += count;
                }
                grandTotal += total;
                freq_char.add(new Pair(new Long(total), new Integer(cp)));
            }
        } finally {
            br.close();
        }

        PrintWriter pw = BagFormatter.openUTF8Writer(DICT_DIR,"kHYPLCDPF_frequency.txt");
        try {
            pw.write("\uFEFF"); // byte order mark
            pw.println("No.\tPercentage\tAccummulated\tHex\tChar");
            Iterator it = freq_char.iterator();
            int counter = 0;
            double cummulative = 0;
            double cummulativePercentage = 0;
            while (it.hasNext()) {
                Pair item = (Pair)it.next();
                Long total = (Long) item.first;
                Integer cp = (Integer) item.second;
                double current = total.longValue();
                cummulative += current;
                double percentage = current / grandTotal;
                cummulativePercentage += percentage;
                pw.println(
                    ++counter
                    //+ "\t" + number.format(current)
                    //+ "\t" + number.format(cummulative)
                    + "\t" + percent.format(percentage)
                    + "\t" + percent3.format(cummulativePercentage)
                    + "\t" + Integer.toHexString(cp.intValue()).toUpperCase()
                    + "\t" + UTF16.valueOf(cp.intValue()));
            }
            //pw.println("Grand total: " + (long)grandTotal);
        } finally {
            pw.close();
        }
    }
}

View File

@ -1,106 +0,0 @@
package com.ibm.text.UCD;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.text.utility.*;
import java.util.*;
/**
 * Design sketch of a Unicode character property that can be queried per code point.
 * <p>
 * Enumerated properties are meant to be {@link IntCodePointProperty} instances whose
 * string values are the property value names. Binary properties are treated as
 * enumerated properties returning 0 or 1.
 * <p>
 * Intended use by UnicodeSet when it encounters {@code [:name=value:]}:
 * <pre>
 *   CodePointProperty x = getInstance(name);
 *   if (x == null) doError(name, value);
 *   UnicodeSet s = x.getSet(value);
 *   // ... and then use s
 * </pre>
 * Open issue: a parameterized property such as {@code contains("dot")} could register
 * "contains" as its base name and accept the parameter through
 * {@link #setParameters(String)}, allowing {@code [[:letter:]&[:contains(dot):]]}.
 */
public abstract class CodePointProperty {

    /** Style selecting the short form of names and string values. */
    static final byte SHORT = 0;
    /** Style selecting the default form of names and string values. */
    static final byte DEFAULT = 1;
    /** Style selecting the long form of names and string values. */
    static final byte LONG = 2;
    /** One past the last of the normal styles. */
    static final byte NORMAL_LIMIT = 3;

    /** Gets the property name in the given style. */
    public abstract String getName(byte style);

    /** Gets the value for a code point as a string, even if it is numeric or otherwise typed. */
    public abstract String getValue(int codePoint, byte style);

    /**
     * Tells whether the code point has the given value; works with any style that
     * {@link #getValue(int, byte)} accepts.
     */
    public abstract boolean hasValue(int codePoint, String value);

    /**
     * Gets the set of all code points with the given value. Same effect as calling
     * {@link #hasValue(int, String)} one code point at a time, but implementations
     * can be faster.
     */
    public abstract UnicodeSet getSet(String value);

    /**
     * Lists all possible values. Logically the same as looping over 0..10FFFF with
     * getValue and getStyleLimit and discarding duplicates, but intended to be much
     * faster. This stub returns {@code null}.
     */
    static Iterator getAllValues(byte style) {
        return null;
    }

    /** Gets the top value style available for this property; {@link #NORMAL_LIMIT} here. */
    public byte getStyleLimit(byte style) {
        return NORMAL_LIMIT;
    }

    /**
     * Tells whether the value is known to be uniform over a general category; meant for
     * optimizations, especially for Cn and Co. Always {@code false} here.
     */
    public boolean isUniformOverCategory(byte generalCategory) {
        return false;
    }

    // ---- subclasses ----

    /** Property whose values are ints. */
    public abstract static class IntCodePointProperty extends CodePointProperty {
        abstract int getNumericValue(int codePoint);

        abstract int getMaxValue();

        abstract int getMinValue();

        /** Stub; returns {@code null}. */
        static Iterator getAllNumericValues() {
            return null;
        }
    }

    /** Property whose values are doubles. */
    public abstract static class DoubleCodePointProperty extends CodePointProperty {
        abstract double getNumericValue(int codePoint);

        abstract double getMaxValue();

        abstract double getMinValue();

        /** Stub; returns {@code null}. */
        static Iterator getAllNumericValues() {
            return null;
        }
    }

    // ---- registration and lookup ----

    /** Registers a new property. Not implemented: the body is empty. */
    static void register(CodePointProperty newProp) {
        //...
    }

    /** Finds a registered property by name. This stub returns {@code null}. */
    static CodePointProperty getInstance(String name) {
        return null;
    }

    /** Lists all registered properties. This stub returns {@code null}. */
    static Iterator getAllRegistered() {
        return null;
    }

    // ---- optional parameters (see the open issue in the class comment) ----

    /** Tells whether this property takes parameters; {@code false} by default. */
    public boolean hasParameters() {
        return false;
    }

    /** Sets the parameters; does nothing by default. */
    public void setParameters(String parameters) {
    }

    /** Gets the parameters; {@code null} by default. */
    public String getParameters() {
        return null;
    }
}

View File

@ -1,273 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/CompactName.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.IOException;
import java.util.*;
import java.io.*;
import java.text.*;
/**
 * Compacts strings (words first, then space-separated lines) into integer tokens.
 * <p>
 * A token is either a <i>literal</i> (bit 0x8000 set) packing up to three
 * characters of the {@link #compactMap} alphabet at five bits each, or an
 * index into {@link #tokenList}. A table entry packs a lead token in its high
 * 16 bits and a trail token in its low 16 bits. Entries whose index is at or
 * above {@link #spacedMinimum} expand with a space between lead and trail.
 * <p>
 * All state is static and unsynchronized.
 */
public class CompactName {
    static final boolean DEBUG = false;

    /** Flag bit that distinguishes a packed literal from a table index. */
    private static final int LITERAL_FLAG = 0x8000;

    /** Small self-test: round-trips a literal, adds one word and dumps the table. */
    public static void main(String[] args) throws IOException {
        int test = tokenFromString("ABZ");
        String ss = stringFromToken(test);
        System.out.println(ss);
        CompactName.addWord("ABSOLUTEISM");
        for (int i = 0; i < CompactName.lastToken; ++i) {
            String s = CompactName.stringFromToken(i);
            System.out.println(s);
        }
    }

    /** Maps a character (below 128) to its 5-bit code; unmapped characters yield 0. */
    static final char[] compactMap = new char[128];
    /** Inverse of {@link #compactMap}: 5-bit code back to character; 0 means "none". */
    static final char[] compactUnmap = new char[128];
    static {
        // Codes: 0 = none, 1..26 = 'A'..'Z', then '-', '>', '<', '*'.
        char counter = 0;
        compactMap[0] = counter++;
        for (int i = 'A'; i <= 'Z'; ++i) {
            compactMap[i] = counter++;
        }
        compactMap['-'] = counter++;
        compactMap['>'] = counter++;
        compactMap['<'] = counter++;
        compactMap['*'] = counter++;
        compactUnmap[0] = 0;
        for (char i = 0; i < compactUnmap.length; ++i) {
            int x = compactMap[i];
            if (x != 0) compactUnmap[x] = i;
        }
    }
    /*
    static String expand(String s) {
    StringBuffer result = new StringBuffer();
    for (int i = 0; i < s.length(); ++i) {
    int m = s.charAt(i);
    if (m == 31 && i < s.length() + 1) {
    m = 31 + s.charAt(++i);
    }
    result.append(compactUnmap[m]);
    }
    return result.toString();
    }
    static String compact(String s) {
    StringBuffer result = new StringBuffer();
    for (int i = 0; i < s.length(); ++i) {
    int m = compactMap[s.charAt(i)];
    if (m >= 31) {
    result.append((char)31);
    m -= 31;
    }
    result.append(m);
    }
    return result.toString();
    }
    */

    /** String to Integer table index, for every string added so far. */
    static Map string_token = new HashMap();
    /** Not used inside this class. */
    static Map token_string = new HashMap();
    /** Table entries: (lead << 16) + (trail & 0xFFFF). */
    static int[] tokenList = new int[40000];
    static final int tokenStart = 0;
    /** Number of table entries in use; also the next index to be assigned. */
    static int lastToken = 0;
    /** First table index whose expansion inserts a space; set by {@link #startLines()}. */
    static int spacedMinimum = Integer.MAX_VALUE;

    /**
     * Tells whether bit 0x8000 is set in {@code i}.
     * Note that the "not found" value -1 also has that bit set.
     */
    static boolean isLiteral(int i) {
        return (i & LITERAL_FLAG) != 0;
    }

    /**
     * Appends a table entry for {@code s} made of the given lead and trail tokens.
     *
     * @return the index of the new entry
     * @throws IllegalArgumentException if {@code s} already has an entry
     * @throws IllegalStateException if no index below the literal flag is left
     */
    static int addTokenForString(String s, int lead, int trail) {
        if (string_token.get(s) != null) {
            throw new IllegalArgumentException("token already defined for '" + s + "'");
        }
        // An index with bit 0x8000 set would be indistinguishable from a literal.
        if (lastToken >= LITERAL_FLAG || lastToken >= tokenList.length) {
            throw new IllegalStateException("token table full: " + lastToken);
        }
        int value = (lead << 16) + (trail & 0xFFFF);
        int result = lastToken;
        tokenList[lastToken++] = value;
        if (DEBUG) {
            System.out.println("'" + s + "', tokenList[" + result + "] = lead: " + lead + ", trail: " + trail);
            String roundTrip = stringFromToken(result);
            if (!roundTrip.equals(s)) {
                System.out.println("\t*** No Round Trip: '" + roundTrip + "'");
            }
        }
        string_token.put(s, Integer.valueOf(result));
        return result;
    }

    /**
     * Expands a token back into its string.
     *
     * @throws IllegalArgumentException if {@code i} is neither a literal nor
     *         the index of an entry that has been added
     */
    static String stringFromToken(int i) {
        String result;
        if (isLiteral(i)) {
            char first = compactUnmap[(i >> 10) & 0x1F];
            char second = compactUnmap[(i >> 5) & 0x1F];
            char third = compactUnmap[i & 0x1F];
            result = String.valueOf(first);
            if (second != 0) result += String.valueOf(second);
            if (third != 0) result += String.valueOf(third);
        } else if (i < 0 || i >= lastToken) {
            // tokenList[lastToken] is not assigned yet; reading it would
            // expand entry 0 and could recurse without end.
            throw new IllegalArgumentException("bad token: " + i);
        } else {
            int value = tokenList[i];
            int lead = value >>> 16;
            int trail = value & 0xFFFF;
            if (i >= spacedMinimum) result = stringFromToken(lead) + ' ' + stringFromToken(trail);
            else result = stringFromToken(lead) + stringFromToken(trail);
        }
        if (DEBUG) System.out.println("token: " + i + " => '" + result + "'");
        return result;
    }

    /**
     * Returns the token for {@code s} without adding anything.
     * Strings of one to three characters are packed into a literal; longer
     * strings are looked up in {@link #string_token}.
     * <p>
     * NOTE(review): in the literal case, characters that have no entry in
     * {@link #compactMap} are encoded as 0 (dropped on expansion), characters
     * at or above 128 index past the map, and the empty string fails on
     * {@code charAt(0)} -- callers are presumably expected to avoid these.
     *
     * @return the token, or -1 if a longer string has not been added
     */
    static int tokenFromString(String s) {
        if (s.length() <= 3) {
            int first = compactMap[s.charAt(0)];
            int second = compactMap[s.length() > 1 ? s.charAt(1) : 0];
            int third = compactMap[s.length() > 2 ? s.charAt(2) : 0];
            return LITERAL_FLAG + (first << 10) + (second << 5) + third;
        }
        Object in = string_token.get(s);
        if (in == null) return -1;
        return ((Integer) in).intValue();
    }

    /**
     * Returns the token for the word {@code s}, adding table entries as needed.
     * The word is split in two; a split whose halves both already have tokens
     * is preferred, then the split with the longest half already in the table,
     * and finally a split at a multiple of three characters.
     */
    static int addWord(String s) {
        int result = tokenFromString(s);
        if (result != -1) return result;
        int bestLen = 0;
        int best_i = 0;
        int limit = s.length() - 1;
        for (int i = limit; i >= 1; --i) {
            String firstPart = s.substring(0, i);
            String lastPart = s.substring(i);
            int lead = tokenFromString(firstPart);
            int trail = tokenFromString(lastPart);
            if (lead >= 0 && trail >= 0) { // if both match, return immediately with pair
                if (DEBUG) show(s, firstPart, lastPart, "MATCH BOTH");
                return addTokenForString(s, lead, trail);
            }
            // isLiteral is also true for -1, so only table hits count here.
            if (!isLiteral(lead)) {
                if (i > bestLen) {
                    bestLen = i;
                    best_i = i;
                }
            }
            if (!isLiteral(trail)) {
                int end_i = s.length() - i;
                if (end_i > bestLen) {
                    bestLen = end_i;
                    best_i = i;
                }
            }
        }
        if (bestLen > 0) { // if one matches, recurse -- and return pair
            String firstPart = s.substring(0, best_i);
            String lastPart = s.substring(best_i);
            int lead = tokenFromString(firstPart);
            int trail = tokenFromString(lastPart);
            if (lead >= 0) {
                if (DEBUG) show(s, firstPart, lastPart, "MATCH FIRST");
                return addTokenForString(s, lead, addWord(lastPart));
            } else {
                if (DEBUG) show(s, firstPart, lastPart, "MATCH SECOND");
                return addTokenForString(s, addWord(firstPart), trail);
            }
        }
        // break at multiple of 3
        best_i = ((s.length() + 1) / 6) * 3;
        String firstPart = s.substring(0, best_i);
        String lastPart = s.substring(best_i);
        if (DEBUG) show(s, firstPart, lastPart, "Fallback");
        return addTokenForString(s, addWord(firstPart), addWord(lastPart));
    }

    /** Debug helper: prints how {@code s} was split. */
    static void show(String s, String firstPart, String lastPart, String comment) {
        System.out.println((s) + " => '" + (firstPart)
                + "' # '" + (lastPart) + "' " + comment);
    }

    /** Marks the current end of the table: entries added from now on expand with a space. */
    static void startLines() {
        spacedMinimum = lastToken;
    }

    /**
     * Returns the token for the line {@code s}, adding table entries as needed.
     * The line is split at a space; a split whose halves both already have
     * tokens is preferred, otherwise the space giving the longest half is used
     * and the unresolved halves are added recursively.
     *
     * @throws IllegalArgumentException if a piece without a space is longer
     *         than three characters and has not been added before
     */
    static int addLine(String s) {
        int result = tokenFromString(s);
        if (result != -1) return result;
        int bestLen = 0;
        int best_i = 0;
        int limit = s.length() - 2;
        for (int i = limit; i >= 1; --i) {
            char c = s.charAt(i);
            if (c != ' ') continue;
            String firstPart = s.substring(0, i);
            String lastPart = s.substring(i + 1);
            int lead = tokenFromString(firstPart);
            int trail = tokenFromString(lastPart);
            if (lead >= 0 && trail >= 0) { // if both match, return immediately with pair
                if (DEBUG) show(s, firstPart, lastPart, "MATCH BOTH");
                return addTokenForString(s, lead, trail);
            }
            if (i > bestLen) {
                bestLen = i;
                best_i = i;
            }
            int end_i = s.length() - i - 1;
            if (end_i > bestLen) {
                bestLen = end_i;
                best_i = i;
            }
        }
        if (bestLen > 0) { // a space was found: recurse on whatever is unresolved
            String firstPart = s.substring(0, best_i);
            String lastPart = s.substring(best_i + 1);
            int lead = tokenFromString(firstPart);
            int trail = tokenFromString(lastPart);
            if (lead >= 0) {
                if (DEBUG) show(s, firstPart, lastPart, "MATCH FIRST");
                return addTokenForString(s, lead, addLine(lastPart));
            } else {
                if (DEBUG) show(s, firstPart, lastPart, "MATCH SECOND");
                int leadToken = addLine(firstPart);
                // bestLen is updated for every space, so the trail may be
                // unresolved too; never store -1 as if it were a token.
                int trailToken = trail >= 0 ? trail : addLine(lastPart);
                return addTokenForString(s, leadToken, trailToken);
            }
        }
        System.out.println("SHOULD HAVE MATCHED!!");
        throw new IllegalArgumentException("SHOULD HAVE MATCHED!! " + s);
    }
}

View File

@ -1,387 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Compare14652.java,v $
* $Date: 2004/02/07 01:01:16 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
// quick and dirty function for grabbing contents of ISO 14652 file
public class Compare14652 implements UCD_Types {
static final boolean oldVersion = false;
/**
 * Returns the set obtained from the unified binary property whose code is
 * {@code prop} combined with {@code propValue}.
 *
 * @param prop the property code
 * @param propValue the value code, OR-ed into {@code prop}
 * @return the result of {@code getSet()} on that property
 */
public static UnicodeSet getSet(int prop, byte propValue) {
    // A byte is sign-extended when promoted to int; without the mask a value
    // of 0x80 or above would set every upper bit and clobber prop.
    int unifiedCode = prop | (propValue & 0xFF);
    return UnifiedBinaryProperty.make(unifiedCode).getSet();
}
// Character sets used as the "guess" for each ISO 14652 class. They are built
// once, when the class is initialized, and later declarators use earlier ones,
// so the order of this list matters.
// NOTE(review): addAll/removeAll are called directly on the object returned by
// getSet; if getSet can return a shared instance, these calls modify it -- confirm.
static UnicodeSet
titleSet = getSet(CATEGORY, Lt),
// all three mark categories
combiningSet = getSet(CATEGORY, Mc)
.addAll(getSet(CATEGORY, Me))
.addAll(getSet(CATEGORY, Mn)),
// all three separator categories
zSet = getSet(CATEGORY, Zs)
.addAll(getSet(CATEGORY, Zl))
.addAll(getSet(CATEGORY, Zp)),
// all punctuation categories
pSet = getSet(CATEGORY, Pd)
.addAll(getSet(CATEGORY, Ps))
.addAll(getSet(CATEGORY, Pe))
.addAll(getSet(CATEGORY, Pc))
.addAll(getSet(CATEGORY, Po))
.addAll(getSet(CATEGORY, Pi))
.addAll(getSet(CATEGORY, Pf)),
// all symbol categories
sSet = getSet(CATEGORY, Sm)
.addAll(getSet(CATEGORY, Sc))
.addAll(getSet(CATEGORY, Sk))
.addAll(getSet(CATEGORY, So)),
noSet = getSet(CATEGORY, No),
csSet = getSet(CATEGORY, Cs),
cfSet = getSet(CATEGORY, Cf),
cnSet = getSet(CATEGORY, Cn),
circled = getSet(DECOMPOSITION_TYPE, COMPAT_CIRCLE),
whitespaceSet = getSet(BINARY_PROPERTIES, White_space),
alphaSet = getSet(DERIVED, PropAlphabetic).addAll(combiningSet),
lowerSet = getSet(DERIVED, PropLowercase).addAll(titleSet).removeAll(circled),
upperSet = getSet(DERIVED, PropUppercase).addAll(titleSet).removeAll(circled),
digitSet = getSet(CATEGORY, Nd),
xdigitSet = new UnicodeSet("[a-fA-F\uFF21-\uFF26\uFF41-\uFF46]").addAll(digitSet),
// Not a copy: spaceSet is the same object as zSet or whitespaceSet.
spaceSet = whitespaceSet.size() == 0 ? zSet : whitespaceSet,
controlSet = getSet(CATEGORY, Cc),
punctSet = new UnicodeSet(pSet).addAll(sSet),
// everything except Cc, Cs, Cn and Z (removal of Cf is disabled)
graphSet = new UnicodeSet(0,0x10ffff)
.removeAll(controlSet)
//.removeAll(getSet(CATEGORY, Cf))
.removeAll(csSet)
.removeAll(cnSet)
.removeAll(zSet),
// Cc, Cf, Cs, Cn, Z
// a copy of spaceSet minus U+000A..U+000D, U+0085, Zl and Zp
blankSet = new UnicodeSet(spaceSet).removeAll(new UnicodeSet("[\\u000A-\\u000D\\u0085]"))
.removeAll(getSet(CATEGORY, Zl))
.removeAll(getSet(CATEGORY, Zp));
/**
 * One character class: its name, the contents collected for it, and a guess
 * (description plus set) at the matching combination of Unicode properties.
 */
static class Prop {
    String name;
    UnicodeSet contents = new UnicodeSet();
    String guess = "???";
    UnicodeSet guessContents = new UnicodeSet();
    String wsname = whitespaceSet.size() == 0 ? "gc=Z" : "Whitespace";

    /**
     * Creates the class and picks the guess that goes with {@code name};
     * names without a known guess keep "???" and an empty set.
     */
    Prop(String name) {
        this.name = name;
        if (name.equals("alpha")) {
            setGuess("Alphabetic + gc=M", alphaSet);
        } else if (name.equals("lower")) {
            setGuess("Lowercase + gc=Lt - dt=circle", lowerSet);
        } else if (name.equals("upper")) {
            setGuess("Uppercase + gc=Lt - dt=circle", upperSet);
        } else if (name.equals("digit")) {
            setGuess("gc=Nd", digitSet);
        } else if (name.equals("xdigit")) {
            setGuess("gc=Nd+a..f (upper/lower,normal/fullwidth)", xdigitSet);
        } else if (name.equals("space")) {
            setGuess(wsname, spaceSet);
            //Utility.showSetNames("Whitespace", spaceSet, true, Default.ucd);
        } else if (name.equals("cntrl")) {
            setGuess("gc=Cc", controlSet);
        } else if (name.equals("punct")) {
            setGuess("gc=P,S", punctSet);
        } else if (name.equals("graph")) {
            setGuess("All - gc=Cc, Cs, Cn, or Z", graphSet);
        } else if (name.equals("blank")) {
            setGuess(wsname + " - (LF,VT,FF,CR,NEL + gc=Zl,Zp)", blankSet);
        } else if (name.equals("ISO_14652_class \"combining\"")) {
            setGuess("gc=M", combiningSet);
        }
        /* Class names seen in the data:
        upper
        lower
        alpha
        digit
        outdigit
        space
        cntrl
        punct
        graph
        xdigit
        blank
        toupper
        tolower
        */
    }

    /** Records the guess description together with its set. */
    private void setGuess(String description, UnicodeSet set) {
        guess = description;
        guessContents = set;
    }

    /**
     * Writes a banner and the differences between the collected contents and
     * the guess; does nothing for the classes that are not compared.
     */
    void show(PrintWriter pw) {
        boolean notCompared = name.equals("ISO_14652_LC_CTYPE")
                || name.equals("ISO_14652_toupper")
                || name.equals("ISO_14652_tolower")
                || name.equals("ISO_14652_outdigit")
                || name.startsWith("ISO_14652_class");
        if (notCompared) {
            return;
        }
        String rule = "**************************************************";
        pw.println();
        pw.println(rule);
        pw.println(name);
        pw.println(rule);
        Utility.showSetDifferences(pw, name, contents, guess, guessContents, false, true, null, Default.ucd());
        //pw.println(props[i].contents);
    }
}
/** Classes read so far, in file order; only the first propCount slots are used. */
static Prop[] props = new Prop[100];
static int propCount = 0;

/**
 * Reads the ISO 14652 character-class file, writes the differences between
 * each class and its guessed Unicode equivalent to a log file, then checks
 * the POSIX inclusion and disjointness requirements on both.
 *
 * @throws IOException if the input or the log cannot be read or written
 * @throws IllegalArgumentException if a data line precedes any class name
 */
public static void main(String[] args) throws IOException {
    String version = Default.ucd().getVersion();
    PrintWriter log = Utility.openPrintWriter("Diff14652_" + version + ".txt", Utility.UTF8_WINDOWS);
    try {
        log.write('\uFEFF');
        log.print("Version: " + version);
        if (false) {
            UnicodeSet ID = getSet(DERIVED, ID_Start).addAll(getSet(DERIVED, ID_Continue_NO_Cf));
            UnicodeSet XID = getSet(DERIVED, Mod_ID_Start).addAll(getSet(DERIVED, Mod_ID_Continue_NO_Cf));
            UnicodeSet alphanumSet = new UnicodeSet(alphaSet).addAll(digitSet).addAll(getSet(CATEGORY, Pc));
            Utility.showSetDifferences("ID", ID, "XID", XID, false, Default.ucd());
            Utility.showSetDifferences("ID", ID, "Alphabetic+Digit+Pc", alphanumSet, false, Default.ucd());
        }
        BufferedReader br = Utility.openReadFile("C:\\DATA\\ISO14652_CTYPE.txt", Utility.LATIN1);
        try {
            while (true) {
                String line = br.readLine();
                if (line == null) break;
                line = line.trim();
                if (line.length() == 0) continue;
                // a trailing '/' is dropped before the line is interpreted
                if (line.charAt(line.length() - 1) == '/') {
                    line = line.substring(0, line.length() - 1);
                }
                line = line.trim();
                if (line.length() == 0) continue;
                char ch = line.charAt(0);
                if (ch == '%') continue;
                if (ch == '(') continue;
                if (ch == '<') {
                    if (propCount == 0) {
                        throw new IllegalArgumentException("Data line before any property name: " + line);
                    }
                    addItems(line, props[propCount - 1].contents);
                } else {
                    // new property
                    System.out.println(line);
                    if (line.equals("width")) break;
                    props[propCount] = new Prop(line);
                    props[propCount].name = "ISO_14652_" + line;
                    props[propCount].contents = new UnicodeSet();
                    propCount++;
                }
            }
        } finally {
            // release the input even if a line cannot be parsed
            br.close();
        }
        for (int i = 0; i < propCount; ++i) props[i].show(log);
        log.println();
        log.println("**************************************************");
        log.println("Checking POSIX requirements for inclusion and disjointness.");
        log.println("**************************************************");
        log.println();
        /*
        alpha, digit, punct, cntrl are all disjoint
        space, cntrl, blank are pairwise disjoint with any of alpha, digit, xdigit
        alpha includes upper, lower
        graph includes alpha, digit, punct
        print includes graph
        xdigit includes digit
        */
        // NOTE(review): getProp returns null for a class missing from the
        // input, which would fail in the checks below.
        Prop
        alpha = getProp("ISO_14652_alpha"),
        upper = getProp("ISO_14652_upper"),
        lower = getProp("ISO_14652_lower"),
        graph = getProp("ISO_14652_graph"),
        //print = getProp("ISO_14652_print"),
        punct = getProp("ISO_14652_punct"),
        digit = getProp("ISO_14652_digit"),
        xdigit = getProp("ISO_14652_xdigit"),
        space = getProp("ISO_14652_space"),
        blank = getProp("ISO_14652_blank"),
        cntrl = getProp("ISO_14652_cntrl");
        checkDisjoint(log, new Prop[] {alpha, digit, punct, cntrl});
        Prop [] l1 = new Prop[] {space, cntrl, blank};
        Prop [] l2 = new Prop[] {alpha, digit, xdigit};
        // l1 and l2 are different lists, so every member of l1 is checked
        // against every member of l2.
        for (int i = 0; i < l1.length; ++i) {
            for (int j = 0; j < l2.length; ++j) {
                checkDisjoint(log, l1[i], l2[j]);
            }
        }
        checkIncludes(log, alpha, upper);
        checkIncludes(log, alpha, lower);
        checkIncludes(log, graph, alpha);
        checkIncludes(log, graph, digit);
        checkIncludes(log, graph, punct);
        //checkIncludes(log, print, graph);
        checkIncludes(log, xdigit, digit);
        // possibly alpha, digit, punct, cntrl, space cover the !(Cn,Cs)
        UnicodeSet trRemainder = new UnicodeSet(cnSet)
        .complement()
        .removeAll(csSet)
        .removeAll(digit.contents)
        .removeAll(punct.contents)
        .removeAll(alpha.contents)
        .removeAll(cntrl.contents)
        .removeAll(space.contents);
        Utility.showSetNames(log, "TR Remainder: ", trRemainder, false, false, Default.ucd());
        UnicodeSet propRemainder = new UnicodeSet(cnSet)
        .complement()
        .removeAll(csSet)
        //.removeAll(noSet)
        //.removeAll(cfSet)
        .removeAll(digit.guessContents)
        .removeAll(punct.guessContents)
        .removeAll(alpha.guessContents)
        .removeAll(cntrl.guessContents)
        .removeAll(space.guessContents);
        Utility.showSetNames(log, "Prop Remainder: ", propRemainder, false, false, Default.ucd());
        /*
        checkDisjoint(new Prop[] {alpha, digit, punct, cntrl});
        UnicodeSet remainder = cnSet.complement();
        UnicodeSet guessRemainder = new UnicodeSet(remainder);
        for (int i = 0; i < list.length; ++i) {
        for (int j = i + 1; j < list.length; ++j) {
        compare(log, list[i].name, list[i].contents, list[j].name, list[j].contents);
        compare(log, list[i].guess, list[i].guessContents, list[j].guess, list[j].guessContents);
        }
        remainder.removeAll(list[i].contents);
        guessRemainder.removeAll(list[i].guessContents);
        }
        if (remainder.size() != 0) {
        log.println();
        log.println("Incomplete (TR): " + remainder);
        }
        if (guessRemainder.size() != 0) {
        log.println();
        log.println("Incomplete (Prop): " + guessRemainder);
        }
        */
    } finally {
        log.close();
    }
}
/** Checks every unordered pair of distinct entries of {@code list} for disjointness. */
static void checkDisjoint(PrintWriter log, Prop[] list) {
    for (int first = 0; first < list.length; ++first) {
        Prop left = list[first];
        for (int second = first + 1; second < list.length; ++second) {
            checkDisjoint(log, left, list[second]);
        }
    }
}

/** Checks two classes for disjointness: first their contents, then their guesses. */
static void checkDisjoint(PrintWriter log, Prop prop1, Prop prop2) {
    UnicodeSet contents1 = prop1.contents;
    UnicodeSet contents2 = prop2.contents;
    checkDisjoint(log, prop1.name, contents1, prop2.name, contents2);
    UnicodeSet guess1 = prop1.guessContents;
    UnicodeSet guess2 = prop2.guessContents;
    checkDisjoint(log, prop1.guess, guess1, prop2.guess, guess2);
}

/** Logs a failure, with the shared code points, if the two sets overlap. */
static void checkDisjoint(PrintWriter log, String name, UnicodeSet set, String name2, UnicodeSet set2) {
    if (!set.containsSome(set2)) {
        return;
    }
    log.println();
    log.println("Fails test: " + name + " disjoint-with " + name2);
    UnicodeSet overlap = new UnicodeSet(set).retainAll(set2);
    Utility.showSetNames(log, "", overlap, false, false, Default.ucd());
}
/** Checks that {@code prop1} includes {@code prop2}: first by contents, then by guess. */
static void checkIncludes(PrintWriter log, Prop prop1, Prop prop2) {
    UnicodeSet outer = prop1.contents;
    UnicodeSet inner = prop2.contents;
    checkIncludes(log, prop1.name, outer, prop2.name, inner);
    UnicodeSet outerGuess = prop1.guessContents;
    UnicodeSet innerGuess = prop2.guessContents;
    checkIncludes(log, prop1.guess, outerGuess, prop2.guess, innerGuess);
}

/** Logs a failure, with the missing code points, if {@code set} lacks part of {@code set2}. */
static void checkIncludes(PrintWriter log, String name, UnicodeSet set, String name2, UnicodeSet set2) {
    if (set.containsAll(set2)) {
        return;
    }
    log.println();
    log.println("Fails test:" + name + " includes " + name2);
    UnicodeSet missing = new UnicodeSet(set2).removeAll(set);
    Utility.showSetNames(log, "", missing, false, false, Default.ucd());
}
/** Scratch array that receives the ';'-separated pieces of the current line. */
static String[] pieces = new String[100];

/**
 * Adds the code points and ranges listed on {@code line} to {@code contents}.
 * Example line: <U1F48>..<U1F4D>;<U1F59>;<U1F5B>;<U1F5D>;<U1F5F>;<U1F68>..<U1F6F>;/
 * Empty pieces and the piece "<0>" are ignored.
 */
static void addItems(String line, UnicodeSet contents) {
    int pieceCount = Utility.split(line, ';', pieces);
    for (int k = 0; k < pieceCount; ++k) {
        String item = pieces[k].trim();
        if (item.length() == 0 || item.equals("<0>")) {
            continue;
        }
        int dots = item.indexOf("..");
        boolean isRange = dots >= 0;
        int first = parse(isRange ? item.substring(0, dots) : item);
        int last = isRange ? parse(item.substring(dots + 2)) : first;
        contents.add(first, last);
    }
}
/**
 * Parses a code point written as {@code <Uhhhh>}, with hexadecimal digits.
 *
 * @throws IllegalArgumentException if the piece is not wrapped in "<U" and ">"
 *         (a NumberFormatException if the digits are not valid hexadecimal)
 */
static int parse(String piece) {
    boolean wrapped = piece.startsWith("<U") && piece.endsWith(">");
    if (!wrapped) {
        throw new IllegalArgumentException("Bogus code point: " + piece);
    }
    String hexDigits = piece.substring(2, piece.length() - 1);
    return Integer.parseInt(hexDigits, 16);
}
/**
 * Returns the first class read so far whose name equals {@code name},
 * or null if there is none.
 */
static Prop getProp(String name) {
    //System.out.println("Searching for: " + name);
    int slot = 0;
    while (slot < propCount) {
        Prop candidate = props[slot];
        //System.out.println("Checking: " + props[i].name);
        if (candidate.name.equals(name)) {
            return candidate;
        }
        ++slot;
    }
    //System.out.println("Missed");
    return null;
}
// oddities:
// extra space after ';' <U0300>..<U036F>; <U20D0>..<U20FF>; <UFE20>..<UFE2F>;/
// <0>?? <0>;<U0BE7>..<U0BEF>;/
// <U202C>; <U202D>;<U202E>; <UFEFF> : 0;/
// % "print" is by default "graph", and the <space> character
// print is odd, since it includes space but not other spaces.
// alnum not defined.
}

View File

@ -1,473 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/CompareProperties.java,v $
* $Date: 2004/02/12 08:23:15 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import java.text.NumberFormat;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
public class CompareProperties implements UCD_Types {
static final boolean DO_DISJOINT = false;
static CompareProperties me = null;
/** Prints the partition, creating the shared instance on first use. */
static void partition() throws IOException {
    CompareProperties instance = me;
    if (instance == null) {
        instance = new CompareProperties();
        me = instance;
    }
    instance.printPartition();
}
/**
 * Prints three containsAll results involving an empty set, then prints the
 * statistics, creating the shared instance on first use.
 */
static void statistics() throws IOException {
    UnicodeSet abc = new UnicodeSet("[abc]");
    UnicodeSet nothing = new UnicodeSet();
    boolean abcHoldsNothing = abc.containsAll(nothing);
    System.out.println(abcHoldsNothing);
    boolean nothingHoldsAbc = nothing.containsAll(abc);
    System.out.println(nothingHoldsAbc);
    boolean nothingHoldsNothing = nothing.containsAll(new UnicodeSet());
    System.out.println(nothingHoldsNothing);
    if (me == null) {
        me = new CompareProperties();
    }
    me.printStatistics();
}
public final class BitSetComparator implements Comparator {
public int compare(Object o1, Object o2) {
BitSet bs1 = (BitSet) o1;
BitSet bs2 = (BitSet) o2;
int count2 = bs1.size() > bs2.size() ? bs1.size() : bs2.size();
for (int i = 0; i < count2; ++i) {
if (bs1.get(i)) {
if (!bs2.get(i)) {
return 1;
}
} else if (bs2.get(i)) {
return -1;
}
}
return 0;
}
}
/**
 * Reverses the order of another comparator, for getting a list in reverse order.
 *
 * @author Davis
 */
public static class InverseComparator implements Comparator {
    private final Comparator forward;

    public InverseComparator(Comparator other) {
        forward = other;
    }

    /** Delegates with the arguments swapped. */
    public int compare(Object a, Object b) {
        int reversed = forward.compare(b, a);
        return reversed;
    }
}
/**
 * Compares two objects through the first one's own compareTo method,
 * so the first argument must implement Comparable.
 *
 * @author Davis
 */
public static class MethodComparator implements Comparator {
    public int compare(Object a, Object b) {
        Comparable first = (Comparable) a;
        return first.compareTo(b);
    }
}
public final static class UnicodeSetComparator implements Comparator {
    /**
     * Compares two UnicodeSets range by range.
     * The ordering is based on the first code point that differs between them.
     * @return -1 if the first set contains the first different code point,
     * 1 if the second set does,
     * 0 if there is no difference.
     * If compareTo were added to UnicodeSet, this can be optimized to use list[i].
     * @author Davis
     */
    public int compare(Object o1, Object o2) {
        UnicodeSetIterator left = new UnicodeSetIterator((UnicodeSet) o1);
        UnicodeSetIterator right = new UnicodeSetIterator((UnicodeSet) o2);
        while (true) {
            // Both iterators advance exactly once per round, left first.
            boolean leftHasRange = left.nextRange();
            boolean rightHasRange = right.nextRange();
            if (!leftHasRange) {
                // first is exhausted: equal unless second still has a range
                return rightHasRange ? 1 : 0;
            }
            if (!rightHasRange) {
                return -1; // first has range while second exhausted
            }
            if (left.codepoint != right.codepoint) {
                // the set whose range starts lower owns the first difference
                return left.codepoint < right.codepoint ? -1 : 1;
            }
            if (left.codepointEnd != right.codepointEnd) {
                // the set whose range ends later owns the first difference
                return left.codepointEnd < right.codepointEnd ? 1 : -1;
            }
        }
    }
}
// Not referenced anywhere else in this class.
boolean isPartitioned = false;
// Parallel arrays; slot i (0 <= i < count) describes one selected property.
UCDProperty[] props = new UCDProperty[500];
// sets[i]: the code points for which props[i].hasValue is true
UnicodeSet[] sets = new UnicodeSet[500];
// number of slots in use
int count = 0;
// Relations filled in by printStatistics: bit j of X[i] relates sets[i] to sets[j].
BitSet[] disjoints = new BitSet[500];
BitSet[] contains = new BitSet[500];
BitSet[] isin = new BitSet[500];
BitSet[] equals = new BitSet[500];
// Maps a BitSet of property slots to the UnicodeSet of code points having
// exactly those properties.
Map map = new TreeMap(new BitSetComparator());
// Instance initializer: runs on construction, after the fields above are set.
{
getProperties();
fillPropertyValues();
Utility.fixDot();
}
/**
 * Walks every allocated code point, adds it to the set of each selected
 * property it has, and files it in {@code map} under the BitSet of the
 * properties it has.
 */
private void fillPropertyValues() {
    BitSet signature = new BitSet();
    int total = 0;
    for (int cp = 0; cp <= 0x10FFFF; ++cp) {
        Utility.dot(cp);
        int cat = Default.ucd().getCategory(cp);
        // if (cat == UNASSIGNED || cat == PRIVATE_USE || cat == SURROGATE) continue;
        if (!Default.ucd().isAllocated(cp)) {
            continue;
        }
        for (int slot = 0; slot < count; ++slot) {
            boolean hasIt = props[slot].hasValue(cp);
            signature.set(slot, hasIt);
            if (hasIt) {
                sets[slot].add(cp);
            }
        }
        ++total;
        UnicodeSet bucket = (UnicodeSet) map.get(signature);
        if (bucket == null) {
            bucket = new UnicodeSet();
            // the key is cloned because signature is reused for the next code point
            map.put(signature.clone(), bucket);
            // Utility.fixDot();
            // System.out.println("Set Size: " + map.size() + ", total: " + total + ", " + Default.ucd.getCodeAndName(cp));
        }
        bucket.add(cp);
    }
}
/**
 * Selects the properties to compare and allocates one slot for each.
 * Every rejected property is reported on standard output with the reason.
 */
private void getProperties() {
    for (int i = 0; i < LIMIT_ENUM; ++i) { // || iType == SCRIPT
        int iType = i & 0xFF00;
        boolean excludedType = iType == AGE || iType == JOINING_GROUP || iType == COMBINING_CLASS;
        if (excludedType) {
            continue;
        }
        if (i == 0x0900) {
            System.out.println("debug");
        }
        UCDProperty up = UnifiedBinaryProperty.make(i, Default.ucd());
        if (up == null) {
            continue;
        }
        // The first rule that applies decides why the property is skipped.
        String skipped = null;
        if (up.getValueType() < BINARY_PROP) {
            skipped = up.getName() + "; value varies";
        } else if (!up.isStandard()) {
            skipped = getPropName(up) + "; not standard";
        } else if (up.getName(LONG).startsWith("Other_")) {
            skipped = getPropName(up) + "; contributory";
        } else if (up.isDefaultValue() || up.skipInDerivedListing()) {
            skipped = getPropName(up) + "; default value";
        }
        if (skipped != null) {
            System.out.println("\tSkipping " + skipped);
            continue;
        }
        // System.out.println(Utility.hex(i) + " " + up.getName(LONG) + "(" + up.getName(SHORT) + ")");
        // System.out.println("\t" + up.getValue(LONG) + "(" + up.getValue(SHORT) + ")");
        final int slot = count;
        sets[slot] = new UnicodeSet();
        disjoints[slot] = new BitSet();
        equals[slot] = new BitSet();
        contains[slot] = new BitSet();
        isin[slot] = new BitSet();
        props[slot] = up;
        count = slot + 1;
        System.out.println(Utility.hex(i) + " " + slot + " " + getPropName(slot));
    }
    System.out.println("props: " + count);
}
/**
 * Writes the partition to a file named "Partition" plus the file suffix:
 * for each entry of {@code map}, the set of code points, its size and the
 * names of the properties in the entry's key.
 *
 * @throws IOException if the output file cannot be opened
 */
public void printPartition() throws IOException {
    System.out.println("Set Size: " + map.size());
    PrintWriter output = Utility.openPrintWriter("Partition"
            + UnicodeDataFile.getFileSuffix(true), Utility.LATIN1_WINDOWS);
    try {
        Iterator it = map.entrySet().iterator();
        while (it.hasNext()) {
            Map.Entry entry = (Map.Entry) it.next();
            BitSet probe2 = (BitSet) entry.getKey();
            UnicodeSet value = (UnicodeSet) entry.getValue();
            output.println();
            output.println(value);
            output.println("Size: " + value.size());
            for (int i = 0; i < count; ++i) {
                if (!probe2.get(i)) continue;
                output.print(" " + getPropName(i));
            }
            output.println();
        }
        output.println("Count: " + map.keySet().size());
    } finally {
        // close the file even if writing an entry fails
        output.close();
    }
}
static final NumberFormat percent = NumberFormat.getPercentInstance(Locale.ENGLISH);

/**
 * Computes, for every ordered pair of selected properties, whether their sets
 * are disjoint, equal, or one contains the other, and writes the result to a
 * file named "Statistics" plus the file suffix, sorted by property name.
 * Properties with nothing to report are listed at the end.
 *
 * @throws IOException if the output file cannot be opened
 */
public void printStatistics() throws IOException {
    System.out.println("Set Size: " + map.size());
    PrintWriter output = Utility.openPrintWriter("Statistics"
            + UnicodeDataFile.getFileSuffix(true), Utility.LATIN1_WINDOWS);
    try {
        System.out.println("Finding disjoints/contains");
        for (int i = 0; i < count; ++i) {
            System.out.println(getPropName(i));
            for (int j = 0; j < count; ++j) {
                if (j == i) continue;
                if (i == 1 && j == 2) {
                    System.out.println("debug");
                }
                // at most one relation is recorded per pair, in this priority
                if (sets[i].containsNone(sets[j])) {
                    disjoints[i].set(j);
                } else if (sets[i].equals(sets[j])) {
                    equals[i].set(j);
                } else if (sets[i].containsAll(sets[j])) {
                    contains[i].set(j);
                } else if (sets[j].containsAll(sets[i])) {
                    isin[i].set(j);
                }
            }
        }
        System.out.println("Removing non-maximal sets");
        // a set is non-maximal if it is contained in one of the other sets
        // so remove anything that is contained in one of the items
        if (false) {
            BitSet[] tempContains = new BitSet[count];
            for (int i = 0; i < count; ++i) {
                System.out.println(getPropName(i));
                tempContains[i] = (BitSet) contains[i]; // worry about collisions
                BitSet b = contains[i];
                for (int j = 0; j < b.size(); ++j) {
                    if (b.get(j)) tempContains[i].andNot(contains[j]);
                }
                b = disjoints[i]; // don't worry
                for (int j = 0; j < b.size(); ++j) {
                    if (b.get(j)) b.andNot(contains[j]);
                }
            }
            for (int i = 0; i < count; ++i) {
                contains[i] = tempContains[i];
            }
        }
        System.out.println("Printing disjoints & contains");
        List remainder = new ArrayList();
        // property name -> slot, so the report comes out sorted by name
        Map m = new TreeMap(); // new UnicodeSetComparator()
        for (int i = 0; i < count; ++i) {
            m.put(getPropName(i), Integer.valueOf(i)); // sets[i]
        }
        Iterator it = m.entrySet().iterator();
        while (it.hasNext()) {
            Map.Entry entry = (Map.Entry) it.next();
            int index = ((Integer) entry.getValue()).intValue();
            boolean haveName = printBitSet(output, index, "EQUALS: ", equals[index], false);
            haveName = printBitSet(output, index, "CONTAINS: ", contains[index], haveName);
            haveName = printBitSet(output, index, "IS CONTAINED IN: ", isin[index], haveName);
            if (DO_DISJOINT) {
                printBitSet(output, index, "IS DISJOINT WITH: ", disjoints[index], haveName);
            }
            if (!haveName) remainder.add(getPropName(index));
        }
        it = remainder.iterator();
        output.println();
        output.print("NONE OF THE ABOVE: ");
        boolean first = true;
        while (it.hasNext()) {
            Object key = it.next();
            if (!first) output.print(", ");
            first = false;
            output.print(key);
        }
        output.println();
    } finally {
        // close the file even if the analysis fails part-way
        output.close();
    }
}
/**
 * Writes one relation line: the title followed by the sorted, comma-separated
 * names of the properties whose bits are set in {@code b}. If nothing has been
 * written for this property yet, its name is written first. An empty
 * {@code b} writes nothing.
 *
 * @return true if the property's name has been written, now or earlier
 */
private boolean printBitSet(PrintWriter output, int index, String title, BitSet b, boolean haveName) {
    if (b.isEmpty()) {
        return haveName;
    }
    if (!haveName) {
        output.println();
        output.println(getPropName(index));
        haveName = true;
    }
    output.print(title);
    Set sortedNames = new TreeSet();
    for (int bit = b.nextSetBit(0); bit >= 0; bit = b.nextSetBit(bit + 1)) {
        sortedNames.add(getPropName(bit));
    }
    StringBuffer joined = new StringBuffer();
    for (Iterator names = sortedNames.iterator(); names.hasNext();) {
        if (joined.length() > 0) {
            joined.append(", ");
        }
        joined.append(names.next());
    }
    output.print(joined.toString());
    output.println();
    output.flush();
    return haveName;
}
/*
UnicodeSet a_b = new UnicodeSet();
UnicodeSet ab = new UnicodeSet();
UnicodeSet _ab = new UnicodeSet();
*/
/*
a_b.set(sets[i]).removeAll(sets[j]);
ab.set(sets[i]).retainAll(sets[j]);
_ab.set(sets[j]).removeAll(sets[i]);
// we are interested in cases where a contains b or is contained by b
// contain = _ab = 0
// is contained == a_b = 0
// is disjoint == ab == 0
// is equal == contains & iscontained
double total = a_b.size() + ab.size() + _ab.size();
double limit = total*0.03;
boolean gotName = showDiff(output, "C", j, a_b, total, limit, false);
gotName = showDiff(output, "D", j, ab, total, limit, gotName);
gotName = showDiff(output, "S", j, _ab, total, limit, gotName);
if (gotName) output.println();
*/
/**
 * Writes the size of {@code a_b} as a percentage of {@code total}, but only
 * when the set is non-empty and smaller than {@code limit}. The property name
 * is written first if it has not been written yet.
 *
 * @return true if the property name has been written, now or earlier
 */
private boolean showDiff(PrintWriter output, String title, int propIndex, UnicodeSet a_b,
        double total, double limit, boolean gotName) {
    int size = a_b.size();
    boolean worthShowing = 0 < size && size < limit;
    if (!worthShowing) {
        return gotName;
    }
    if (!gotName) {
        output.print("\t" + getPropName(propIndex));
    }
    output.print("\t" + title + percent.format(size / total));
    return true;
}
/** Returns the display name of the property stored in slot {@code propertyIndex}. */
private String getPropName(int propertyIndex) {
    UCDProperty selected = props[propertyIndex];
    return getPropName(selected);
}

/** Returns the long full name of {@code ubp}, passed through Utility.getUnskeleton. */
private String getPropName(UCDProperty ubp) {
    String longName = ubp.getFullName(LONG);
    return Utility.getUnskeleton(longName, true);
}
/**
 * Writes, for every pair of standard properties, how their sets of code
 * points relate (DISJOINT, EQUALS, CONTAINS or OVERLAPS) together with the
 * three counts, to a file named "PropertyDifferences" plus the file suffix.
 * A pair is written with the property that has more exclusive code points
 * first, so CONTAINS always reads left to right.
 *
 * @throws IOException if the output file cannot be opened
 */
public static void listDifferences() throws IOException {
    PrintWriter output = Utility.openPrintWriter("PropertyDifferences" + UnicodeDataFile.getFileSuffix(true), Utility.LATIN1_UNIX);
    try {
        output.println("# Listing of relationships among properties, suitable for analysis by spreadsheet");
        output.println("# Generated for " + Default.ucd().getVersion());
        output.println(UnicodeDataFile.generateDateLine());
        output.println("# P1 P2 R(P1,P2) C(P1&P2) C(P1-P2) C(P2-P1)");
        for (int i = 1; i < UCD_Types.LIMIT_ENUM; ++i) {
            int iType = i & 0xFF00;
            if (iType == UCD_Types.JOINING_GROUP || iType == UCD_Types.AGE || iType == UCD_Types.COMBINING_CLASS || iType == UCD_Types.SCRIPT) continue;
            UCDProperty upi = UnifiedBinaryProperty.make(i, Default.ucd());
            if (upi == null) continue;
            if (!upi.isStandard()) {
                System.out.println("Skipping " + upi.getName() + "; not standard");
                continue;
            }
            if (upi.getValueType() < UCD_Types.BINARY_PROP) {
                System.out.println("Skipping " + upi.getName() + "; value varies");
                continue;
            }
            String iNameShort = upi.getFullName(UCD_Types.SHORT);
            String iNameLong = upi.getFullName(UCD_Types.LONG);
            System.out.println();
            System.out.println();
            System.out.println(iNameLong);
            output.println("#" + iNameLong);
            int last = -1;
            for (int j = i+1; j < UCD_Types.LIMIT_ENUM; ++j) {
                int jType = j & 0xFF00;
                // values of the same non-binary property are not compared with each other
                if (jType == UCD_Types.JOINING_GROUP || jType == UCD_Types.AGE || jType == UCD_Types.COMBINING_CLASS || jType == UCD_Types.SCRIPT
                        || (jType == iType && jType != UCD_Types.BINARY_PROPERTIES)) continue;
                UCDProperty upj = UnifiedBinaryProperty.make(j, Default.ucd());
                if (upj == null) continue;
                if (!upj.isStandard()) continue;
                if (upj.getValueType() < UCD_Types.BINARY_PROP) continue;
                // progress: a heading when the property type changes, otherwise a dot
                if ((j >> 8) != last) {
                    last = j >> 8;
                    System.out.println();
                    System.out.print("\t" + UCD_Names.SHORT_UNIFIED_PROPERTIES[last]);
                    output.flush();
                    output.println("#\t" + UCD_Names.SHORT_UNIFIED_PROPERTIES[last]);
                } else {
                    System.out.print('.');
                }
                System.out.flush();
                int bothCount = 0, i_jPropCount = 0, j_iPropCount = 0, iCount = 0, jCount = 0;
                for (int cp = 0; cp <= 0x10FFFF; ++cp) {
                    int cat = Default.ucd().getCategory(cp);
                    if (cat == UCD_Types.UNASSIGNED || cat == UCD_Types.PRIVATE_USE || cat == UCD_Types.SURROGATE) continue;
                    if (!Default.ucd().isAllocated(cp)) continue;
                    boolean iProp = upi.hasValue(cp);
                    boolean jProp = upj.hasValue(cp);
                    if (jProp) ++jCount;
                    if (iProp) {
                        ++iCount;
                        if (jProp) ++bothCount;
                        else ++i_jPropCount;
                    } else if (jProp) ++j_iPropCount;
                }
                if (iCount == 0 || jCount == 0) continue;
                String jNameShort = upj.getFullName(UCD_Types.SHORT);
                //String jNameLong = ubp.getFullID(j, LONG);
                String rel = bothCount == 0 ? "DISJOINT"
                        : i_jPropCount == 0 && j_iPropCount == 0 ? "EQUALS"
                        : i_jPropCount == 0 ? "CONTAINS" // depends on reverse output
                        : j_iPropCount == 0 ? "CONTAINS"
                        : "OVERLAPS";
                if (j_iPropCount > i_jPropCount) {
                    // reverse output
                    output.println(jNameShort + "\t" + iNameShort + "\t" + rel
                            + "\t" + bothCount + "\t" + j_iPropCount + "\t" + i_jPropCount);
                } else {
                    output.println(iNameShort + "\t" + jNameShort + "\t" + rel
                            + "\t" + bothCount + "\t" + i_jPropCount + "\t" + j_iPropCount);
                }
            }
        }
    } finally {
        // close the file even if a property lookup fails part-way
        output.close();
    }
}
}

View File

@ -1,908 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $
* $Date: 2006/04/05 22:12:44 $
* $Revision: 1.18 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import com.ibm.text.utility.*;
import java.util.*;
import java.text.NumberFormat;
import java.io.*;
/** Simple program to merge UCD files into XML. Not yet documented!!
* @author Mark Davis
*/
public final class ConvertUCD implements UCD_Types {
public static final boolean SHOW = false;
public static final boolean DEBUG = false;
static final boolean SHOW_SAMPLE = false;
int major;
int minor;
int update;
String version;
// varies by version
/*
public static final String BASE_DIR11 = DATA_DIR + "\\Versions\\";
public static final String BASE_DIR20 = DATA_DIR + "\\Versions\\";
public static final String BASE_DIR21 = DATA_DIR + "\\Versions\\";
public static final String BASE_DIR30 = DATA_DIR + "\\Update 3.0.1\\";
public static final String BASE_DIR31 = DATA_DIR + "\\3.1-Update\\";
*/
//public static final String blocksnamePlain = "Blocks.txt";
//public static final String blocksname31 = "Blocks-4d2.beta";
/**
* Table of the UCD data files to read and the meaning of their fields.
* In each row the first item is the file name; the remaining items name the
* fields that follow the leading character (or character range) field, in file order.
* <ul>
* <li>"OMIT" is special -- means don't record that field.</li>
* <li>A leading "*" marks a field whose value is hex; the static initializer
* below strips the "*" from the label in place and records the label in isHex.</li>
* <li>"binary" is handled by readSemi as a binary property name.</li>
* </ul>
*/
static String[][] labelList = {
// Labels for the incoming files. Labels MUST match field order in file.
// IMPORTANT - defaults of form y-=x must occur after x is encountered!
// The one exception is "st", which is handled specially.
// So file order is important.
// NOTE(review): the default-value ("y-=x") parsing is currently commented out
// in the static initializer, so the remark above looks historical -- confirm.
//*
// Sample UnicodeData line, with the matching labels underneath:
// 01CA;LATIN CAPITAL LETTER NJ;Lu;0; L; <compat> 004E 004A; ; ; ;N ;LATIN CAPITAL LETTER N J; ; ;01CC;01CB
// n gc cc bc dm dd dv nv bm on cm, uc lc tc
{"UnicodeData", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
//{"ExtraProperties", "xp"},
{"PropList", "binary"},
//{"ExtraProperties", "xp"},
{"EastAsianWidth", "ea", "OMIT"},
{"LineBreak", "lb", "OMIT"},
{"SpecialCasing", "*sl", "*st", "*su", "sc"},
{"CompositionExclusions", "ce"},
{"CaseFolding", "OMIT", "*fc"},
{"ArabicShaping", "OMIT", "jt", "jg"},
{"BidiMirroring", "*bg"},
{"Scripts", "sn"},
//{"Jamo", "jn"},
//{"Scripts-1d4", "RANGE", "sn"},
//{"Age", "*sn"},
//*/
/*
//*/
};
static HashMap isHex = new HashMap();
static HashMap defaults = new HashMap();
/*
 * Normalizes labelList in place: a leading '*' on a field label means the
 * field holds hex text. The marker is stripped from the stored label and the
 * cleaned label is registered in isHex. Every cleaned label is also entered
 * in defaults with a null value (default-value parsing is disabled).
 */
static {
    for (int fileIndex = 0; fileIndex < labelList.length; ++fileIndex) {
        String[] fileLabels = labelList[fileIndex];
        // slot 0 is the file name, so field labels start at 1
        for (int field = 1; field < fileLabels.length; ++field) {
            String label = fileLabels[field];
            boolean hexField = label.charAt(0) == '*';
            if (hexField) {
                label = label.substring(1);
                fileLabels[field] = label;
            }
            // register only after the label is clean
            if (hexField) {
                isHex.put(label, "");
            }
            defaults.put(label, null);
        }
    }
}
/*
static String[][] labelList31 = {
// Labels for the incoming files. Labels MUST match field order in file.
// IMPORTANT - defaults of form y-=x must occur after x is encountered!
// The one exception is "st", which is handled specially.
// So file order is important.
//*
// 01CA;LATIN CAPITAL LETTER NJ;Lu;0; L; <compat> 004E 004A; ; ; ;N ;LATIN CAPITAL LETTER N J; ; ;01CC;01CB
// n gc cc bc dm dd dv nv bm on cm, uc lc tc
{"UnicodeData-3.1.0d8.beta", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
{"PropList-3.1.0d5.beta", "binary"},
{"ExtraProperties", "xp"},
{"EastAsianWidth-4d7.beta", "ea", "OMIT"},
{"LineBreak-6d6.beta", "lb", "OMIT"},
{"SpecialCasing-4d1.beta", "*sl", "*st", "*su", "sc"},
{"CompositionExclusions-3d6.beta", "ce"},
{"CaseFolding-3d4.beta", "OMIT", "*fc"},
{"ArabicShaping", "OMIT", "jt", "jg"},
{"BidiMirroring", "*bg"},
{"Scripts-3.1.0d4.beta", "sn"},
//{"Scripts-1d4", "RANGE", "sn"},
//{"Age", "*sn"},
//*/
/*
{"Jamo", "jn"},
//
};
/*
{"UnicodeData-3.1.0d8.beta", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
{"ExtraProperties", "xp"},
{"EastAsianWidth-4d7.beta", "ea", "OMIT"},
{"LineBreak-6d6.beta", "lb", "OMIT"},
{"SpecialCasing-4d1.beta", "*sl", "*st", "*su", "sc"},
{"CompositionExclusions-3d6.beta", "ce"},
{"CaseFolding-3d4.beta", "OMIT", "*fc"},
{"PropList-3.1.0d2.beta", "PROP", "OMIT"},
{"ArabicShaping", "OMIT", "jt", "jg"},
{"BidiMirroring", "*bg"},
{"Scripts-1d4", "sn"},
//{"Scripts-1d4", "RANGE", "sn"},
//{"Age", "*sn"},
//*/
/*
{"Jamo", "jn"},
//
//"NamesList-3.1.0d1.beta"
static String[][] labelList30 = {
// Labels for the incoming files. Labels MUST match field order in file.
// IMPORTANT - defaults of form y-=x must occur after x is encountered!
// The one exception is "st", which is handled specially.
// So file order is important.
//*
{"UnicodeData", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
{"CompositionExclusions", "ce"},
{"EastAsianWidth", "ea", "OMIT"},
{"LineBreak", "lb", "OMIT"},
{"SpecialCasing", "*sl", "*st", "*su", "sc"},
{"CaseFolding", "OMIT", "*fc"},
{"ArabicShaping", "OMIT", "jt", "jg"},
{"BidiMirroring", "*bg"},
/*
{"Jamo", "jn"},
{"PropList.alpha", "RANGE", "OMIT"},
//
};
static String[][] labelList11 = {
{"UnicodeData-1.1", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
};
static String[][] labelList20 = {
{"UnicodeData-2.0", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
};
static String[][] labelList21 = {
{"UnicodeData-2.1", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
};
*/
// handles
public static final String blocksname = "Blocks";
//public static final String[][] labelList;
public static final boolean NEWPROPS = true;
/*
static {
switch (major*10 + minor) {
case 31:
blocksname = blocksname31;
labelList = labelList31;
break;
case 30:
blocksname = blocksnamePlain;
labelList = labelList30;
break;
case 21:
blocksname = blocksnamePlain;
labelList = labelList21;
break;
case 20:
blocksname = blocksnamePlain;
labelList = labelList20;
break;
default:
blocksname = blocksnamePlain;
labelList = labelList11;
break;
}
}
*/
static final String dataFilePrefix = "UCD_Data";
// MAIN!!
/**
 * Command-line entry point. Opens the shared log file, then builds the
 * binary UCD data once for every version given in args. An empty argument
 * selects UCD.latestVersion. The log is closed even if a build fails.
 */
public static void main (String[] args) throws Exception {
    System.out.println("Building binary version of UCD");
    FileOutputStream logFile = new FileOutputStream(GEN_DIR + "UCD-log.txt");
    OutputStreamWriter logEncoder = new OutputStreamWriter(logFile, "UTF8");
    log = new PrintWriter(new BufferedWriter(logEncoder, 32*1024));
    log.write("\uFEFF"); // BOM
    try {
        int index = 0;
        while (index < args.length) {
            String requested = args[index++];
            String version = requested.length() == 0 ? UCD.latestVersion : requested;
            new ConvertUCD().toJava(version);
        }
    } finally {
        log.close();
    }
}
/*
static void toXML() throws Exception {
// Blocks is special
// Unihan is special
// collect all the other .txt files in the directory
if (false) readBlocks();
if (true) for (int i = 0; i < labelList.length; ++i) {
readSemi(labelList[i]);
} else {
readSemi(labelList[0]); // TESTING ONLY
}
writeXML();
}
*/
/**
 * Builds the binary data file for one Unicode version.
 * Splits the dotted version string into major/minor/update, reads every file
 * listed in labelList, compacts each collected character record, and writes
 * the result with writeJavaData().
 *
 * @param version dotted version string; its first three '.'-separated parts
 *        are parsed as integers
 */
void toJava(String version) throws Exception {
    this.version = version;

    String[] numbers = new String[3];
    Utility.split(version, '.', numbers);
    major = Integer.parseInt(numbers[0]);
    minor = Integer.parseInt(numbers[1]);
    update = Integer.parseInt(numbers[2]);
    System.out.println("Building " + version);

    // Blocks and Unihan are special and are not read here;
    // every other data file comes from labelList.
    for (int file = 0; file < labelList.length; ++file) {
        readSemi(labelList[file]);
    }

    // charData only ever holds UData values, visited in key order
    for (Iterator records = charData.values().iterator(); records.hasNext();) {
        UData record = (UData) records.next();
        record.compact();
    }

    writeJavaData();
}
static PrintWriter log;
//static String directory = BASE_DIR;
//static Map appendDuplicates = new HashMap();
/** First item in labels is file name, rest are field names (skipping character).
* "OMIT" is special -- means don't record
*/
List blockData = new LinkedList();
/**
 * Reads the Blocks file and appends one String[] {start, end, name} to
 * blockData for every data line. Text after '#' and blank lines are ignored.
 * A line that does not split into exactly three ';'-separated parts raises a
 * ChainException. The offending line is echoed before any exception is rethrown.
 */
void readBlocks() throws Exception {
    System.out.println("Reading 'Blocks'");
    BufferedReader input = Utility.openUnicodeFile(blocksname, version, true, Utility.LATIN1);
    String line = "";
    try {
        String[] parts = new String[20];
        int lineNumber = 0;
        while ((line = input.readLine()) != null) {
            ++lineNumber;
            if (SHOW && (lineNumber % 500) == 0) {
                System.out.println("//" + lineNumber + ": '" + line + "'");
            }
            // drop trailing comment, then surrounding white space
            int hash = line.indexOf('#');
            if (hash >= 0) {
                line = line.substring(0, hash);
            }
            line = line.trim();
            if (line.length() == 0) {
                continue;
            }
            if (Utility.split(line,';',parts) != 3) {
                throw new ChainException("Bad count in Blocks", null);
            }
            String first = Utility.fromHex(parts[0]);
            String last = Utility.fromHex(parts[1]);
            blockData.add(new String[] {first, last, parts[2].trim()});
        }
    } catch (Exception e) {
        System.out.println("Exception at: " + line);
        throw e;
    } finally {
        input.close();
    }
}
Set properties = new TreeSet();
/**
 * Reads one semicolon-delimited UCD data file and stores its fields in the
 * per-character records.
 * <p>
 * labels[0] is the file name; labels[1..] name the fields that follow the
 * leading code point (or "start..end" range) field. "OMIT" and "RANGE" fields
 * are not recorded, and empty values are skipped (they mean "default").
 * Labels registered in isHex are converted with Utility.fromHex first.
 * <p>
 * Errors while interpreting a single line are reported on System.err and the
 * line is skipped. Failures of the reader itself are not retried: they are
 * reported and rethrown, and the reader is always closed.
 *
 * @param labels file name followed by the field labels, in file order
 * @throws Exception if the file cannot be read
 */
void readSemi(String[] labels) throws Exception {
    System.out.println();
    System.out.println("Reading '" + labels[0] + "'");
    if (major < 3 || (major == 3 && minor < 1)) {
        // Compare by value; the former == only worked while both sides
        // happened to be the same interned literal.
        if ("PropList".equals(labels[0])) {
            System.out.println("SKIPPING old format of Proplist for " + version);
            return;
        }
    }
    String tempVersion = version;
    if (version.equals(UCD.latestVersion)) tempVersion = "";
    BufferedReader input = Utility.openUnicodeFile(labels[0], tempVersion, true, Utility.LATIN1);
    if (input == null) {
        System.out.println("COULDN'T OPEN: " + labels[0]);
        return;
    }
    boolean showedSemi = false;
    boolean showedShort = false;
    String line = "";
    try {
        String[] parts = new String[20];
        for (int lineNumber = 1; ; ++lineNumber) {
            // Read outside the per-line try: a failing reader would otherwise be
            // caught below and retried forever. I/O errors go to the outer handler.
            line = input.readLine();
            if (line == null) break;
            try {
                if (SHOW && (lineNumber % 500) == 0) System.out.println("//" + lineNumber + ": '" + line + "'");
                String original = line;
                int commentPos = line.indexOf('#');
                if (commentPos >= 0) {
                    line = line.substring(0, commentPos);
                }
                line = line.trim();
                if (line.length() == 0) continue;
                int count = Utility.split(line,';',parts);
                // fix malformed or simple lists.
                if (count != labels.length) {
                    if (count == labels.length + 1 && parts[count-1].equals("")) {
                        if (!showedSemi) System.out.println("Extra semicolon in: " + original);
                        showedSemi = true;
                    } else if (count == 1) { // fix simple list
                        ++count;
                        parts[1] = "Y";
                    } else if (count < labels.length) {
                        if (!showedShort) System.out.println("Line shorter than labels: " + original);
                        showedShort = true;
                        for (int i = count; i < labels.length; ++i) {
                            parts[i] = "";
                        }
                    } else {
                        // Report the line NUMBER; the old code tried to parse the
                        // line text as an integer and died with NumberFormatException.
                        throw new ChainException("wrong count: {0}",
                            new Object[] {new Integer(lineNumber), new Integer(count)});
                    }
                }
                // store char
                // first field is always character OR range. May be UTF-32
                int cpTop;
                int cpStart;
                int ddot = parts[0].indexOf(".");
                if (ddot >= 0) {
                    cpStart = UTF32.char32At(Utility.fromHex(parts[0].substring(0,ddot)),0);
                    cpTop = UTF32.char32At(Utility.fromHex(parts[0].substring(ddot+2)),0);
                    // System.out.println(Utility.hex(cpStart) + " ... " + Utility.hex(cpTop));
                } else {
                    cpStart = UTF32.char32At(Utility.fromHex(parts[0]),0);
                    cpTop = cpStart;
                    // NOTE(review): the result is discarded; this presumably was meant
                    // to assign cpTop. No active labelList row uses "RANGE" -- confirm
                    // before changing.
                    if (labels[1].equals("RANGE")) UTF32.char32At(Utility.fromHex(parts[1]),0);
                }
                // properties first
                if (labels[1].equals("PROP")) {
                    String prop = parts[2].trim();
                    // FIX!!
                    boolean skipLetters = false;
                    if (prop.equals("Alphabetic")) {
                        prop = "Other_Alphabetic";
                        skipLetters = true;
                    }
                    // END FIX!!
                    properties.add(prop);
                    if (Utility.find(prop, UCD_Names.DeletedProperties, true) == -1) { // only undeleted
                        int end = UTF32.char32At(Utility.fromHex(parts[1]),0);
                        if (end == 0) end = cpStart;
                        for (int j = cpStart; j <= end; ++j) {
                            if (j != UCD.mapToRepresentative(j, Integer.MAX_VALUE)) continue;
                            // NOTE(review): tests cpStart on every iteration; looks like
                            // it should test j -- confirm.
                            if (skipLetters && getEntry(cpStart).isLetter()) continue;
                            appendCharProperties(j, prop);
                        }
                    }
                } else { // not range!
                    String val = "";
                    String lastVal;
                    for (int i = 1; i < labels.length; ++i) {
                        String key = labels[i];
                        lastVal = val;
                        if (isHex.get(key) != null) {
                            val = Utility.fromHex(parts[i]);
                        } else {
                            val = parts[i].trim();
                        }
                        if (key.equals("OMIT")) continue; // do after val, so lastVal is correct
                        if (key.equals("RANGE")) continue; // do after val, so lastVal is correct
                        if (val.equals("")) continue; // skip empty values, they mean default
                        for (int cps = cpStart; cps <= cpTop; ++cps) {
                            if (UCD.mapToRepresentative(cps, Integer.MAX_VALUE) != cps) continue; // skip condensed ranges
                            if (key.equals("binary")) {
                                appendCharProperties(cps, val);
                            } else if (key.equals("fc")) {
                                // the preceding field carries the case-folding status letter
                                UData data = getEntry(cps);
                                String type = parts[i-1].trim();
                                if (type.equals("F") || type.equals("C") || type.equals("E") || type.equals("L")) {
                                    data.fullCaseFolding = val;
                                    //System.out.println("*<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val));
                                }
                                if (type.equals("S") || type.equals("C") || type.equals("L")) {
                                    data.simpleCaseFolding = val;
                                    //System.out.println("<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val));
                                }
                                if (type.equals("I")) {
                                    data.simpleCaseFolding = val;
                                    setBinaryProperty(cps, CaseFoldTurkishI);
                                    if (DEBUG) System.out.println("SPOT-CHECK: <" + parts[i-1] + "> Setting "
                                        + Utility.hex(cps) + ": " + Utility.hex(val));
                                }
                            } else if (labels[0].equals("SpecialCasing") // special handling for special casing
                                    && labels[4].equals("sc")
                                    && parts[4].trim().length() > 0) {
                                // conditional mapping: store "condition:key:value" instead of the plain field
                                if (i < 4) {
                                    if (DEBUG) System.out.println("Got special: " + Utility.hex(cps) + ", "
                                        + Utility.hex(key) + ":" + Utility.hex(val));
                                    addCharData(cps, "sc", parts[4].trim() + ":" + key + ":" + val);
                                }
                            } else {
                                /*if (key.equals("sn")) { // SKIP UNDEFINED!!
                                UData data = getEntryIfExists(cps);
                                if (data == null || data.generalCategory == Cn) continue;
                                }
                                */
                                addCharData(cps, key, val);
                            }
                        }
                    }
                }
            } catch (Exception e) {
                // best effort: report the bad line and carry on with the next one
                System.err.println("*Exception at: " + line + ", " + e.getMessage());
                //System.err.println(e.getMessage());
            }
        }
    } catch (Exception e) {
        System.out.println("Exception at: " + line + ", " + e.getMessage());
        throw e;
    } finally {
        input.close();
    }
    //printValues("JOINING_TYPE", jtSet);
    //printValues("JOINING_GROUP", jgSet);
}
/**
 * Prints Java source text for a set of value names to System.out: first a
 * String[] named after title listing every value, then a run of byte
 * constants numbered from 0 in iteration order, closed by a LIMIT_ entry
 * holding the count. Constant names are the upper-cased value with each
 * space replaced by '-'.
 *
 * @param title name used for the array and for the LIMIT_ constant
 * @param s set of String values; iterated twice
 */
static void printValues(String title, Set s) {
    Iterator values = s.iterator();
    System.out.println("public static String[] " + title + " = {");
    while (values.hasNext()) {
        System.out.println(" \"" + (String) values.next() + "\",");
    }
    System.out.println("};");

    values = s.iterator();
    System.out.println("public static byte ");
    int ordinal = 0;
    for (; values.hasNext(); ++ordinal) {
        String constant = ((String) values.next()).replace(' ', '-').toUpperCase();
        System.out.println(" " + constant + " = " + ordinal + ",");
    }
    System.out.println(" LIMIT_" + title + " = " + ordinal);
    System.out.println(";");
}
Map charData = new TreeMap();
/*
static void writeXML() throws IOException {
System.out.println("Writing 'UCD-Main.xml'");
BufferedWriter output = new BufferedWriter(
new OutputStreamWriter(
new FileOutputStream(UCD.BIN_DIR + "UCD_Data.xml"),
"UTF8"),
32*1024);
try {
// write header
output.write("<?xml version='1.0' encoding='utf-8'?>\r\n");
output.write("<UnicodeCharacterDatabase>\r\n");
output.write(" <!-- IMPORTANT: see UCD-Notes.html for information on the format. This file CANNOT be read correctly without that information. -->\r\n");
output.write(" <unicode version='" + major + "' minor='" + minor + "' update='" + update + "'/>\r\n");
output.write(" <fileVersion status='DRAFT' date='" + new Date() + "'/>\r\n");
// write blocks
Iterator it = blockData.iterator();
while (it.hasNext()) {
String[] block = (String[]) it.next();
output.write(" <block start='" + Utility.quoteXML(block[0])
+ "' end='" + Utility.quoteXML(block[1])
+ "' name='" + Utility.quoteXML(block[2])
+ "'/>\r\n" );
}
// write char data
it = charData.keySet().iterator();
while (it.hasNext()) {
Integer cc = (Integer) it.next();
output.write(" <e c='" + Utility.quoteXML(cc.intValue()) + "'");
/*
UData data = (UData) charData.get(cc);
Iterator dataIt = data.keySet().iterator();
while (dataIt.hasNext()) {
String label = (String) dataIt.next();
if (label.equals("c")) continue; // already wrote it.
if (label.equals("fc")) {
String fc = getResolved(data, "fc");
String lc = getResolved(data, "lc");
if (!fc.equals(lc) && !lc.equals(cc)) log.println("FC " + fc.length() + ": " + toString(cc));
}
String value = Utility.quoteXML((String) data.get(label));
output.write(" " + label + "='" + value + "'");
}
*//*
output.write("/>\r\n");
}
// write footer
output.write("</UnicodeCharacterDatabase>\r\n");
} finally {
output.close();
}
}
*/
/**
 * Writes all collected character records to the binary data file
 * UCD.BIN_DIR + dataFilePrefix + version + ".bin".
 * <p>
 * Layout: one byte each for BINARY_FORMAT, major, minor and update; a long
 * time stamp (current time in milliseconds); an int record count; then every
 * record, in charData key order, as written by UData.writeBytes.
 * <p>
 * The stream is closed on every exit path, including a failure while the
 * header is being written.
 *
 * @throws IOException if the file cannot be opened or the header cannot be written
 * @throws ChainException if writing a record fails; the message carries the
 *         code point being written
 */
void writeJavaData() throws IOException {
    Iterator it = charData.keySet().iterator();
    int codePoint = -1;
    System.out.println("Writing " + dataFilePrefix + version);
    DataOutputStream dataOut = new DataOutputStream(
        new BufferedOutputStream(
            new FileOutputStream(UCD.BIN_DIR + dataFilePrefix + version + ".bin"),
            128*1024));
    try {
        // write header
        dataOut.writeByte(BINARY_FORMAT);
        dataOut.writeByte(major);
        dataOut.writeByte(minor);
        dataOut.writeByte(update);
        long millis = System.currentTimeMillis();
        dataOut.writeLong(millis);
        dataOut.writeInt(charData.size());
        System.out.println("Data Size: " + NumberFormat.getInstance().format(charData.size()));
        int count = 0;
        // write records
        try {
            while (it.hasNext()) {
                Object cc = it.next();
                // Track the current key so a failure reports the real code point
                // (keys are the Integer objects stored by getEntry).
                if (cc instanceof Integer) codePoint = ((Integer) cc).intValue();
                if (DEBUG) System.out.println(Utility.hex(cc));
                UData uData = (UData) charData.get(cc);
                uData.writeBytes(dataOut);
                count++;
                if (DEBUG) System.out.println("Setting2");
            }
            System.out.println("Wrote Data " + count);
        } catch (Exception e) {
            throw new ChainException("Bad data write {0}", new Object [] {Utility.hex(codePoint)}, e);
        }
    } finally {
        dataOut.close();
    }
}
//static String[] xsSplit = new String[40];
// Cache a little bit for speed
int getEntryCodePoint = -1;
UData getEntryUData = null;
/**
 * Looks up the record for cp without creating one.
 * A hit is remembered in the one-element cache (getEntryCodePoint /
 * getEntryUData); a miss leaves the cache untouched.
 *
 * @return the existing record, or null if charData has none for cp
 */
UData getEntryIfExists(int cp) {
    if (cp != getEntryCodePoint) {
        UData found = (UData) charData.get(new Integer(cp));
        if (found == null) {
            return null;
        }
        getEntryCodePoint = cp;
        getEntryUData = found;
    }
    return getEntryUData;
}
/**
 * Returns the record for cp, creating and registering a new UData in
 * charData when none exists yet. The result is remembered in the
 * one-element cache so repeated requests for the same code point skip the
 * map lookup.
 */
UData getEntry(int cp) {
    if (cp != getEntryCodePoint) {
        Integer key = new Integer(cp);
        UData entry = (UData) charData.get(key);
        if (entry == null) {
            entry = new UData(cp);
            charData.put(key, entry);
        }
        getEntryCodePoint = cp;
        getEntryUData = entry;
    }
    return getEntryUData;
}
/**
 * Turns on one binary property for a code point by OR-ing bit number
 * binProp into the record's binaryProperties mask. The record is created
 * if it does not exist yet.
 */
void setBinaryProperty(int cp, int binProp) {
    UData target = getEntry(cp);
    long mask = 1L << binProp;
    target.binaryProperties |= mask;
}
/**
 * Sets the binary property named by key on cp. The name is resolved to its
 * bit index through Utility.lookup against UCD_Names.BP.
 */
void appendCharProperties(int cp, String key) {
    setBinaryProperty(cp, Utility.lookup(key, UCD_Names.BP, true));
}
Set jtSet = new TreeSet();
Set jgSet = new TreeSet();
/**
 * Records one field value on the entry for cp, creating the entry if needed.
 * Most keys are passed straight to setField; a few get extra handling:
 * <ul>
 * <li>"bm": sets bit 0 of binaryProperties when the value is "Y".</li>
 * <li>"ce": sets bit 1 of binaryProperties.</li>
 * <li>"on": replaces the name with "&lt;value&gt;", but only when the current
 *     name starts with '&lt;'.</li>
 * <li>"dm": marks the entry CANONICAL, peels off an optional leading
 *     "&lt;type&gt;" tag (stored as "dt"), trims pre-2.0 trailing material, and
 *     stores the remaining hex text as the mapping.</li>
 * <li>"dd", "dv", "nv": raise numericType to at least DECIMAL, DIGIT or
 *     NUMERIC respectively, then store the value as "nv".</li>
 * </ul>
 */
void addCharData(int cp, String key, String value) {
    UData charEntry = getEntry(cp);
    boolean sample = SHOW_SAMPLE && cp == 0x221;
    if (sample) {
        System.out.println("Sample: " + cp + ", " + key + ", " + value);
        System.out.println(charEntry);
    }

    if (key.equals("bm")) {
        if (value.equals("Y")) {
            charEntry.binaryProperties |= 1;
        }
    } else if (key.equals("ce")) {
        charEntry.binaryProperties |= 2;
    } else if (key.equals("on")) {
        if (charEntry.name.charAt(0) == '<') {
            charEntry.name = '<' + value + '>';
        }
    } else if (key.equals("dm")) {
        charEntry.decompositionType = CANONICAL;
        String mapping = value;
        if (mapping.charAt(0) == '<') {
            int close = mapping.indexOf('>');
            String tag = mapping.substring(1, close);
            if (major < 2 && tag.charAt(0) == '+') {
                tag = tag.substring(1);
            }
            mapping = mapping.substring(close + 1);
            setField(charEntry, "dt", tag);
        }
        if (major < 2) {
            // FIX OLD: cut anything from a later '<' or '{' onwards
            int cut = mapping.indexOf('<');
            if (cut > 0) {
                mapping = mapping.substring(0, cut);
            }
            cut = mapping.indexOf('{');
            if (cut > 0) {
                mapping = mapping.substring(0, cut);
            }
        }
        setField(charEntry, key, Utility.fromHex(mapping));
    } else if (key.equals("dd") || key.equals("dv") || key.equals("nv")) {
        // fix the numeric fields to be more sensible
        if (key.equals("dd")) {
            if (charEntry.numericType < UCD_Types.DECIMAL) {
                charEntry.numericType = UCD_Types.DECIMAL;
            }
        } else if (key.equals("dv")) {
            if (charEntry.numericType < UCD_Types.DIGIT) {
                charEntry.numericType = UCD_Types.DIGIT;
            }
        } else {
            if (charEntry.numericType < UCD_Types.NUMERIC) {
                charEntry.numericType = UCD_Types.NUMERIC;
            }
        }
        setField(charEntry, "nv", value);
    } else {
        setField(charEntry, key, value);
    }

    if (sample) {
        System.out.println("Sample Result:");
        System.out.println(charEntry);
    }
}
/**
 * Stores one already-parsed field value on a character record.
 * String-valued fields are assigned directly; enumerated fields are mapped
 * from their name through Utility.lookup; "sc" values are appended to any
 * existing special casing, separated by ';'.
 *
 * @param uData the record to update
 * @param fieldName short field label (for example "n", "gc", "cc", "dm")
 * @param fieldValue the field text; mapping fields are expected to be
 *        converted from hex by the caller
 * @throws ChainException wrapping any failure, including an unrecognized
 *         fieldName; the message carries the field name and value
 */
public void setField(UData uData, String fieldName, String fieldValue) {
    try {
        if (fieldName.equals("n")) {
            uData.name = fieldValue;
        } else if (fieldName.equals("dm")) {
            uData.decompositionMapping = fieldValue;
        } else if (fieldName.equals("bg")) {
            uData.bidiMirror = fieldValue;
        } else if (fieldName.equals("uc")) {
            uData.simpleUppercase = fieldValue;
        } else if (fieldName.equals("lc")) {
            uData.simpleLowercase = fieldValue;
        } else if (fieldName.equals("tc")) {
            uData.simpleTitlecase = fieldValue;
        } else if (fieldName.equals("su")) {
            uData.fullUppercase = fieldValue;
        } else if (fieldName.equals("sl")) {
            if (DEBUG) System.out.println("Setting full lowercase to " + Utility.hex(fieldValue) + uData);
            uData.fullLowercase = fieldValue;
        } else if (fieldName.equals("st")) {
            uData.fullTitlecase = fieldValue;
        } else if (fieldName.equals("sc")) {
            if (uData.specialCasing.length() > 0) {
                uData.specialCasing += ";";
            }
            uData.specialCasing += fieldValue;
        } else if (fieldName.equals("xp")) {
            uData.binaryProperties |= 1L << Utility.lookup(fieldValue, UCD_Names.BP, true);
            //UCD_Names.BP_OLD
        } else if (fieldName.equals("gc")) {
            uData.generalCategory = Utility.lookup(fieldValue, UCD_Names.GENERAL_CATEGORY, true);
            // if (major >= 5 && uData.script == Unknown_Script
            // && uData.generalCategory != Cn
            // && uData.generalCategory != Cs
            // && uData.generalCategory != Co) {
            // uData.script = COMMON_SCRIPT;
            // System.out.println("Resetting to Common Script: " + Utility.hex(uData.codePoint));
            // }
        } else if (fieldName.equals("bc")) {
            uData.bidiClass = Utility.lookup(fieldValue, UCD_Names.BIDI_CLASS, true);
        } else if (fieldName.equals("dt")) {
            if (major < 2) {
                // map pre-2.0 decomposition tags onto the current names
                if (fieldValue.equals("no-break")) fieldValue = "noBreak";
                else if (fieldValue.equals("circled")) fieldValue = "circle";
                else if (fieldValue.equals("sup")) fieldValue = "super";
                else if (fieldValue.equals("break")) fieldValue = "compat";
                else if (fieldValue.equals("font variant")) fieldValue = "font";
                else if (fieldValue.equals("no-join")) fieldValue = "compat";
                else if (fieldValue.equals("join")) fieldValue = "compat";
            }
            uData.decompositionType = Utility.lookup(fieldValue, UCD_Names.LONG_DECOMPOSITION_TYPE, true);
        } else if (fieldName.equals("nt")) {
            uData.numericType = Utility.lookup(fieldValue, UCD_Names.LONG_NUMERIC_TYPE, true);
        } else if (fieldName.equals("ea")) {
            uData.eastAsianWidth = Utility.lookup(fieldValue, UCD_Names.EAST_ASIAN_WIDTH, true);
        } else if (fieldName.equals("lb")) {
            uData.lineBreak = Utility.lookup(fieldValue, UCD_Names.LINE_BREAK, true);
        } else if (fieldName.equals("sn")) {
            uData.script = Utility.lookup(fieldValue, UCD_Names.LONG_SCRIPT, true);
        } else if (fieldName.equals("jt")) {
            uData.joiningType = Utility.lookup(fieldValue, UCD_Names.JOINING_TYPE, true);
        } else if (fieldName.equals("jg")) {
            // try the old joining-group names first, then the current ones
            byte temp = (byte)Utility.find(fieldValue, UCD_Names.OLD_JOINING_GROUP, true);
            if (temp != -1) uData.joiningGroup = temp;
            else uData.joiningGroup = Utility.lookup(fieldValue, UCD_Names.JOINING_GROUP, true);
        } else if (fieldName.equals("nv")) {
            if (major < 2) {
                if (fieldValue.equals("-")) return;
            }
            uData.numericValue = Utility.doubleFrom(fieldValue);
        } else if (fieldName.equals("cc")) {
            uData.combiningClass = (byte)Utility.intFrom(fieldValue);
            if (uData.combiningClass == 9 && major >= 5) {
                System.out.println("setting Grapheme_Link " + Utility.hex(uData.codePoint) + "\t" + uData.name);
                // Shift as long, like every other update of this mask: an int
                // shift wraps at 32 bits and sign-extends bit 31.
                uData.binaryProperties |= (1L<<GraphemeLink);
                System.out.println(uData);
            }
        } else if (fieldName.equals("bp")) {
            // NOTE(review): the (byte) cast keeps only the low 8 bits of the parsed
            // value, although other branches set bits by 1L << index -- confirm
            // whether the truncation is intended.
            uData.binaryProperties = (byte)Utility.longFrom(fieldValue);
            // if (major >= 5 && (uData.binaryProperties & 1<<Noncharacter_Code_Point) != 0) {
            // uData.script = Unknown_Script;
            // }
            System.out.println("Resetting: " + uData);
        } else {
            throw new IllegalArgumentException("Unknown fieldName");
        }
    } catch (Exception e) {
        throw new ChainException(
            "Bad field name= \"{0}\", value= \"{1}\"", new Object[] {fieldName, fieldValue}, e);
    }
}
}

View File

@ -1,93 +0,0 @@
package com.ibm.text.UCD;
import com.ibm.text.utility.*;
import java.util.Date;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.TimeZone;
/**
 * Process-wide defaults: the UCD version in use, the loaded UCD for it, the
 * four normalizers built for that version, and date/year strings for
 * generated output. The UCD and normalizers are loaded on first access.
 */
public final class Default implements UCD_Types {
    private static String ucdVersion = UCD.latestVersion;
    private static UCD ucd;
    private static Normalizer nfc;
    private static Normalizer nfd;
    private static Normalizer nfkc;
    private static Normalizer nfkd;
    private static Normalizer[] nf = new Normalizer[4];
    private static String year;

    /** Selects the UCD version and (re)loads the UCD and normalizers for it. */
    public static void setUCD(String version) {
        ucdVersion = version;
        setUCD();
    }

    /** True while setUCD() is running; guards against re-entry from the accessors. */
    private static boolean inRecursiveCall = false;

    /**
     * Loads the UCD for ucdVersion and builds the NFD/NFC/NFKD/NFKC normalizers.
     *
     * @throws IllegalArgumentException if called again while a load is in progress
     */
    private static void setUCD() {
        if (inRecursiveCall) {
            throw new IllegalArgumentException("Recursive call to setUCD");
        }
        inRecursiveCall = true;
        try {
            ucd = UCD.make(ucdVersion);
            nfd = nf[NFD] = new Normalizer(Normalizer.NFD, ucdVersion());
            nfc = nf[NFC] = new Normalizer(Normalizer.NFC, ucdVersion());
            nfkd = nf[NFKD] = new Normalizer(Normalizer.NFKD, ucdVersion());
            nfkc = nf[NFKC] = new Normalizer(Normalizer.NFKC, ucdVersion());
            System.out.println("Loaded UCD" + ucd().getVersion() + " " + (new Date(ucd().getDate())));
        } finally {
            // Always clear the guard: if loading failed with the flag still set,
            // every later call would be misreported as a recursive call.
            inRecursiveCall = false;
        }
    }

    static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd', 'HH:mm:ss' GMT'");
    static DateFormat yearFormat = new SimpleDateFormat("yyyy");
    static {
        myDateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
        year = yearFormat.format(new Date());
    }

    /** Returns the current time formatted as "yyyy-MM-dd, HH:mm:ss GMT". */
    public static String getDate() {
        return myDateFormat.format(new Date());
    }

    /** Returns the year string: the year at class load unless replaced by setYear. */
    public static String getYear() {
        return year;
    }

    /** Returns the UCD version in use, loading the UCD first if necessary. */
    public static String ucdVersion() {
        if (ucd == null) setUCD();
        return ucdVersion;
    }

    /** Returns the UCD, loading it first if necessary. */
    public static UCD ucd() {
        if (ucd == null) setUCD();
        return ucd;
    }

    /** Returns the NFC normalizer, loading the UCD first if necessary. */
    public static Normalizer nfc() {
        if (ucd == null) setUCD();
        return nfc;
    }

    /** Returns the NFD normalizer, loading the UCD first if necessary. */
    public static Normalizer nfd() {
        if (ucd == null) setUCD();
        return nfd;
    }

    /** Returns the NFKC normalizer, loading the UCD first if necessary. */
    public static Normalizer nfkc() {
        if (ucd == null) setUCD();
        return nfkc;
    }

    /** Returns the NFKD normalizer, loading the UCD first if necessary. */
    public static Normalizer nfkd() {
        if (ucd == null) setUCD();
        return nfkd;
    }

    /**
     * Returns the normalizer stored at the given index (NFD, NFC, NFKD or
     * NFKC), loading the UCD first if necessary.
     */
    public static Normalizer nf(int index) {
        if (ucd == null) setUCD();
        return nf[index];
    }

    /**
     * Replaces the year string returned by getYear().
     *
     * @param lineValue the new year text, stored as given
     */
    public static void setYear(String lineValue) {
        year = lineValue;
    }
}

View File

@ -1,29 +0,0 @@
#
# Unicode Character Database: Derived Property Data
# This file shows when various code points were first assigned in Unicode.
#
# Caution: When using the Age *property*, all assigned code points
# in each version are included, not just the newly assigned code points.
# For more information, see http://www.unicode.org/reports/tr18/
#
# Notes:
#
# - The term 'assigned' means that a previously reserved code point was assigned
# to be a character (graphic, format, control, or private-use);
# a noncharacter code point; or a surrogate code point.
# For more information, see The Unicode Standard Section 2.4
#
# - Versions are only tracked from 1.1 onwards, since version 1.0
# predated changes required by the ISO 10646 merger.
#
# - The Hangul Syllables that were removed from 2.0 are not included in the 1.1 listing.
#
# - The supplementary private use code points and the noncharacter code points
# were assigned in version 2.0, but not specifically listed in the UCD
# until versions 3.0 and 3.1 respectively.
#
# - Contiguous ranges are broken into separate lines where they would cross code point
# types: graphic, format, control, private-use, surrogate, noncharacter
#
# For details on the contents of each version, see
# http://www.unicode.org/versions/enumeratedversions.html.

View File

@ -1,982 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
* $Date: 2004/03/11 19:03:17 $
* $Revision: 1.26 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import java.util.*;
import java.io.PrintWriter;
public final class DerivedProperty implements UCD_Types {
UCD ucdData;
Normalizer nfc;
Normalizer nfd;
Normalizer nfkc;
Normalizer nfkd;
Normalizer[] nf = new Normalizer[4];
UnicodeSet XID_Start_Set = new UnicodeSet();
UnicodeSet XID_Continue_Set = new UnicodeSet();
// ADD CONSTANT to UCD_TYPES
/**
 * Returns the derived property with the given ID for the default UCD
 * (Default.ucd()), or null when the ID is out of range.
 */
static public UCDProperty make(int derivedPropertyID) {
    UCD defaultUcd = Default.ucd();
    return make(derivedPropertyID, defaultUcd);
}
/**
 * Returns the derived property with the given ID for the given UCD.
 *
 * @return the property slot for that ID (may itself be null), or null when
 *         the ID is negative or not below DERIVED_PROPERTY_LIMIT
 */
static public UCDProperty make(int derivedPropertyID, UCD ucd) {
    boolean inRange = 0 <= derivedPropertyID && derivedPropertyID < DERIVED_PROPERTY_LIMIT;
    return inRange ? getCached(ucd).dprops[derivedPropertyID] : null;
}
///////////////////////////////////////////////////////////
static Map cache = new HashMap();
static UCD lastUCD = null;
static DerivedProperty lastValue = null;
/**
 * Returns the DerivedProperty instance for a UCD, building and caching it
 * on first use. The most recent UCD/instance pair is kept in
 * lastUCD/lastValue so repeated calls for the same UCD skip the map lookup.
 */
private static DerivedProperty getCached(UCD ucd) {
    if (!ucd.equals(lastUCD)) {
        DerivedProperty found = (DerivedProperty) cache.get(ucd);
        if (found == null) {
            found = new DerivedProperty(ucd);
            cache.put(ucd, found);
        }
        lastUCD = ucd;
        lastValue = found;
    }
    return lastValue;
}
/*
public String getHeader(int propNumber) {
UnicodeProperty dp = dprops[propNumber];
if (dp != null) return dp.getHeader();
else return "Unimplemented!!";
}
public String getName(int propNumber, byte style) {
UnicodeProperty dp = dprops[propNumber];
if (dp != null) return dp.getName(style);
else return "Unimplemented!!";
}
public String getValue(int cp, int propNumber) {
UnicodeProperty dp = dprops[propNumber];
if (dp != null) return dp.getValue(cp);
else return "Unimplemented!!";
}
public boolean isTest(int propNumber) {
if (!isDefined(propNumber)) return false;
return dprops[propNumber].isTest();
}
public boolean hasProperty(int cp, int propNumber) {
if (!isDefined(propNumber)) return false;
return dprops[propNumber].hasProperty(cp);
}
public boolean valueVaries(int propNumber) {
return dprops[propNumber].valueVaries();
}
/*
public String getValue(int cp, int propNumber) {
return dprops[propNumber].getValue(int cp);
}
*/
// Property table indexed by derived-property ID; slots the constructor does not assign stay null.
private UCDProperty[] dprops = new UCDProperty[50];
// Name fragments used by CaseDProp, indexed by (property ID - Missing_Uppercase).
static final String[] CaseNames = {
"Uppercase",
"Lowercase",
"Mixedcase"};
/**
 * Binary property "Expands_On_&lt;form&gt;": holds for a code point that has a
 * decomposition and whose normalization under the chosen form is not
 * exactly one code point long.
 */
class ExDProp extends UCDProperty {
    Normalizer nfx;

    /** @param i index into nf selecting the normalization form */
    ExDProp(int i) {
        type = DERIVED_NORMALIZATION;
        nfx = nf[i];
        name = "Expands_On_" + nfx.getName();
        shortName = "XO_" + nfx.getName();

        StringBuffer text = new StringBuffer("# Derived Property: ");
        text.append(name);
        text.append("\r\n# Generated according to UAX #15.");
        text.append("\r\n# Characters whose normalized length is not one.");
        text.append("\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact.");
        text.append("\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!");
        header = text.toString();
    }

    public boolean hasValue(int cp) {
        // Characters without any decomposition are never reported.
        if (ucdData.getDecompositionType(cp) == NONE) {
            return false;
        }
        return UTF16.countCodePoint(nfx.normalize(cp)) != 1;
    }
}
/**
 * Binary property "&lt;form&gt;_UnsafeStart": code points with combining class 0
 * whose normalized form nevertheless begins with something that may interact
 * with preceding text.
 */
class NF_UnsafeStartProp extends UCDProperty {
    Normalizer nfx;

    /** @param i index into nf selecting the normalization form */
    NF_UnsafeStartProp(int i) {
        isStandard = false;
        type = DERIVED_NORMALIZATION;
        nfx = nf[i];
        name = nfx.getName() + "_UnsafeStart";
        shortName = nfx.getName() + "_SS";

        StringBuffer text = new StringBuffer("# Derived Property: ");
        text.append(name);
        text.append("\r\n# Generated according to UAX #15.");
        text.append("\r\n# Characters that are cc==0, BUT which may interact with previous characters.");
        header = text.toString();
    }

    public boolean hasValue(int cp) {
        if (ucdData.getCombiningClass(cp) != 0) {
            return false;
        }
        int leading = UTF16.charAt(nfx.normalize(cp), 0);
        // Unsafe when the normalized form starts with a non-starter, or (composing
        // forms only) with a zero-class character that can combine backwards.
        return ucdData.getCombiningClass(leading) != 0
            || (nfx.isComposition() && dprops[NFC_TrailingZero].hasValue(leading));
    }
}
/*
class HangulSyllableType extends UnicodeProperty {
Normalizer nfx;
//int prop;
HangulSyllableType(int i) {
isStandard = false;
type = DERIVED_NORMALIZATION;
nfx = nf[i];
name = nfx.getName() + "_UnsafeStart";
shortName = nfx.getName() + "_SS";
header = "# Derived Property: " + name
+ "\r\n# Generated according to UAX #15."
+ "\r\n# Characters that are cc==0, BUT which may interact with previous characters."
;
}
public boolean hasValue(int cp) {
if (ucdData.getCombiningClass(cp) != 0) return false;
String norm = nfx.normalize(cp);
int first = UTF16.charAt(norm, 0);
if (ucdData.getCombiningClass(first) != 0) return true;
if (nfx.isComposition()
&& dprops[NFC_TrailingZero].hasValue(first)) return true; // 1,3 == composing
return false;
}
};
*/
/**
 * Binary properties describing a character's role in NFC composition:
 * leading, trailing (split by zero / non-zero combining class) or resulting.
 * Membership comes from the bit set that nfc.getCompositionStatus fills in.
 */
class NFC_Prop extends UCDProperty {
    BitSet bitset;
    boolean filter = false;
    boolean keepNonZero = true;

    final String[] Names = {"NFC_Leading", "NFC_TrailingNonZero", "NFC_TrailingZero", "NFC_Resulting"};
    final String[] SNames = {"NFC_L", "NFC_TNZ", "NFC_TZ", "NFC_R"};
    final String[] Description = {
        "Characters that can combine with following characters in NFC",
        "Characters that can combine with previous characters in NFC, and have non-zero combining class",
        "Characters that can combine with previous characters in NFC, and have zero combining class",
        "Characters that can result from a combination of other characters in NFC",
    };

    /** @param i property ID, one of NFC_Leading .. NFC_Resulting */
    NFC_Prop(int i) {
        isStandard = false;
        type = DERIVED_NORMALIZATION;

        // Only the set this property cares about is allocated; the others are passed as null.
        BitSet leading = null;
        BitSet trailing = null;
        BitSet resulting = null;
        if (i == NFC_Leading) {
            leading = bitset = new BitSet();
        } else if (i == NFC_Resulting) {
            resulting = bitset = new BitSet();
        } else if (i == NFC_TrailingZero || i == NFC_TrailingNonZero) {
            if (i == NFC_TrailingZero) {
                keepNonZero = false;
            }
            trailing = bitset = new BitSet();
        }
        filter = trailing != null;
        nfc.getCompositionStatus(leading, trailing, resulting);

        int offset = i - NFC_Leading;
        name = Names[offset];
        shortName = SNames[offset];
        header = "# Derived Property: " + name
            + "\r\n# " + Description[offset]
            + "\r\n# NFKC characters are the same, after subtracting the NFKD = NO values."
            + "\r\n# Generated according to UAX #15."
            + "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
            + "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";
    }

    public boolean hasValue(int cp) {
        if (!bitset.get(cp)) {
            return false;
        }
        if (!filter) {
            return true;
        }
        // Trailing variants additionally split on whether the combining class is zero.
        boolean nonZero = ucdData.getCombiningClass(cp) != 0;
        return nonZero == keepNonZero;
    }
}
/**
 * String property listing a character's normalized form, but only when that
 * form differs from a baseline (the character itself, its NFC, or its NFD,
 * depending on the normalization form selected).
 */
class GenDProp extends UCDProperty {
    Normalizer nfx;
    Normalizer nfComp = null;

    // Single-entry memo of the last code point asked for and its answer.
    int cacheCp = 0;
    String cacheStr = "";

    /** @param i index into nf selecting the normalization form */
    GenDProp (int i) {
        isStandard = false;
        setValueType(STRING_PROP);
        type = DERIVED_NORMALIZATION;
        nfx = nf[i];
        name = nfx.getName();

        String compName;
        if (i == NFKC || i == NFD) {
            name += "-NFC";
            nfComp = nfc;
            compName = "NFC for the character";
        } else if (i == NFKD) {
            name += "-NFD";
            nfComp = nfd;
            compName = "NFD for the character";
        } else {
            compName = "the character itself";
        }

        header = "# Derived Property: " + name
            + "\r\n# Lists characters in normalized form " + nfx.getName() + "."
            + "\r\n# Only those characters whith normalized forms are DIFFERENT from " + compName + " are listed!"
            + "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
            + "\r\n# It is NOT sufficient to replace characters one-by-one with these results!";
    }

    public String getValue(int cp, byte style) {
        if (cp != cacheCp) {
            cacheCp = cp;
            cacheStr = "";
            if (ucdData.getDecompositionType(cp) != NONE) {
                String source = UTF32.valueOf32(cp);
                String baseline = (nfComp == null) ? source : nfComp.normalize(source);
                String normalized = nfx.normalize(source);
                if (!baseline.equals(normalized)) {
                    String hex = Utility.hex(normalized);
                    // Pad the hex column out to 14 characters.
                    cacheStr = name + "; " + hex + Utility.repeat(" ", 14-hex.length());
                }
            }
        }
        return cacheStr;
    }

    public boolean hasValue(int cp) {
        return getValue(cp).length() != 0;
    }
}
/**
 * Binary property "Possible_Missing_&lt;case&gt;": code points whose category is
 * not the target case but whose decomposition-based category
 * (see getDecompCat) is.
 */
class CaseDProp extends UCDProperty {
    byte val;

    /** @param i property ID, one of Missing_Uppercase .. Missing_Mixedcase */
    CaseDProp (int i) {
        type = DERIVED_CORE;
        isStandard = false;
        if (i == Missing_Uppercase) {
            val = Lu;
        } else if (i == Missing_Lowercase) {
            val = Ll;
        } else {
            val = Lt;
        }
        name = "Possible_Missing_" + CaseNames[i-Missing_Uppercase];
        header = "# Derived Property: " + name
            + "\r\n# Generated from: NFKD has >0 " + CaseNames[i-Missing_Uppercase] + ", no other cases";
    }

    public boolean hasValue(int cp) {
        byte cat = ucdData.getCategory(cp);
        if (cat == val) {
            return false;
        }
        if (val != Lt && ucdData.getBinaryProperty(cp, Other_Uppercase)) {
            return false;
        }
        return getDecompCat(cp) == val;
    }
}
/**
 * Enumerated "&lt;form&gt;_QuickCheck" property. The value is "&lt;form&gt;_NO" for a
 * code point that is not normalized in the form, "&lt;form&gt;_MAYBE" for one the
 * normalizer reports as trailing, and the empty string otherwise.
 */
class QuickDProp extends UCDProperty {
    String NO;
    String MAYBE;
    Normalizer nfx;

    /** @param i index into nf selecting the normalization form */
    QuickDProp (int i) {
        setValueType(ENUMERATED_PROP);
        type = DERIVED_NORMALIZATION;
        nfx = nf[i];
        NO = nfx.getName() + "_NO";
        MAYBE = nfx.getName() + "_MAYBE";
        name = nfx.getName() + "_QuickCheck";
        shortName = nfx.getName() + "_QC";

        boolean composing = (i == NFC || i == NFKC);
        header = "# Derived Property: " + name
            + "\r\n# Generated from computing decomposibles"
            + (composing ? " (and characters that may compose with previous ones)" : "");
    }

    public String getValue(int cp, byte style) {
        if (!nfx.isNormalized(cp)) {
            return NO;
        }
        return nfx.isTrailing(cp) ? MAYBE : "";
    }

    public String getListingValue(int cp) {
        return getValue(cp, LONG);
    }

    public boolean hasValue(int cp) {
        return getValue(cp).length() != 0;
    }
}
/**
 * Builds every derived property for one UCD and stores each in dprops under
 * its property-ID constant.
 *
 * Steps, in order: create the four normalizers; register the
 * normalization-related property families; compute the XID_Start /
 * XID_Continue sets by checking each identifier character's NFKD form;
 * register the remaining core properties; finally give every binary
 * property its standard "1" / "T" / "True" value names.
 *
 * Fixes relative to the previous revision:
 *  - Full_Composition_Inclusion now excludes the Full_Composition_Exclusion
 *    characters, as its header states (it used to return the same set).
 *  - The NFKD scan in the XID computation advances by the width of the code
 *    point just read, not the width of the original code point.
 *  - The Case_Sensitive log writer is closed even when building the cache fails.
 *
 * @param ucd character database to derive the properties from
 */
private DerivedProperty(UCD ucd) {
    ucdData = ucd;
    nfd = nf[NFD] = new Normalizer(Normalizer.NFD, ucdData.getVersion());
    nfc = nf[NFC] = new Normalizer(Normalizer.NFC, ucdData.getVersion());
    nfkd = nf[NFKD] = new Normalizer(Normalizer.NFKD, ucdData.getVersion());
    nfkc = nf[NFKC] = new Normalizer(Normalizer.NFKC, ucdData.getVersion());

    for (int i = ExpandsOnNFD; i <= ExpandsOnNFKC; ++i) {
        dprops[i] = new ExDProp(i-ExpandsOnNFD);
    }
    for (int i = GenNFD; i <= GenNFKC; ++i) {
        dprops[i] = new GenDProp(i-GenNFD);
    }
    for (int i = NFC_Leading; i <= NFC_Resulting; ++i) {
        dprops[i] = new NFC_Prop(i);
    }
    for (int i = NFD_UnsafeStart; i <= NFKC_UnsafeStart; ++i) {
        dprops[i] = new NF_UnsafeStartProp(i-NFD_UnsafeStart);
    }

    dprops[ID_Start] = new UCDProperty() {
        {
            type = DERIVED_CORE;
            name = "ID_Start";
            shortName = "IDS";
            header = "# Derived Property: " + name
                + "\r\n# Characters that can start an identifier."
                + "\r\n# Generated from Lu+Ll+Lt+Lm+Lo+Nl+Other_ID_Start";
        }
        public boolean hasValue(int cp) {
            return ucdData.isIdentifierStart(cp);
        }
    };

    dprops[ID_Continue_NO_Cf] = new UCDProperty() {
        {
            name = "ID_Continue";
            type = DERIVED_CORE;
            shortName = "IDC";
            header = "# Derived Property: " + name
                + "\r\n# Characters that can continue an identifier."
                + "\r\n# Generated from: ID_Start + Mn+Mc+Nd+Pc + Other_ID_Continue"
                + "\r\n# NOTE: Cf characters should be filtered out.";
        }
        public boolean hasValue(int cp) {
            return ucdData.isIdentifierContinue_NO_Cf(cp);
        }
    };

    // ---- XID_Start / XID_Continue: identifier sets closed under NFKx ----
    StringBuffer tempBuf = new StringBuffer();
    // special hack for middle dot
    XID_Continue_Set.add(0x00B7);
    for (int cp = 0; cp < 0x10FFFF; ++cp) {
        // skip cases that can't matter
        if (!ucdData.isAssigned(cp)) continue;

        // status: 0 = not an identifier character, 1 = start, 2 = continue only
        int status = 0;
        if (ucdData.isIdentifierStart(cp)) status = 1;
        else if (ucdData.isIdentifierContinue_NO_Cf(cp)) status = 2;

        if (status != 0 && !nfkd.isNormalized(cp)) {
            // Classify the NFKD form the same way:
            // <start><extend>* is start, <extend>* is extend, anything else is nothing.
            int status2 = 0;
            tempBuf.setLength(0);
            nfkd.normalize(UTF32.valueOf32(cp), tempBuf);
            for (int i = 0; i < tempBuf.length(); ) {
                int cp2 = UTF32.char32At(tempBuf, i);
                if (i == 0) {
                    if (ucdData.isIdentifierStart(cp2)) status2 = 1;
                    else if (ucdData.isIdentifierContinue_NO_Cf(cp2)) status2 = 2;
                    else {
                        status2 = 0;
                        break;
                    }
                } else if (!ucdData.isIdentifierContinue_NO_Cf(cp2) && cp2 != 0xB7) {
                    status2 = 0;
                    break;
                }
                // Step by the width of the code point just examined; the original
                // code point may have a different UTF-16 length than its parts.
                i += UTF32.count16(cp2);
            }
            // Reconcile: drop the character if its NFKD form is not an identifier
            // piece, otherwise demote it to the weaker of the two statuses.
            if (status != status2) {
                if (status2 == 0) status = 0;
                else if (status2 > status) status = status2;
            }
        }
        if (status == 1) XID_Start_Set.add(cp);
        if (status != 0) XID_Continue_Set.add(cp);
    }

    dprops[Mod_ID_Start] = new UCDProperty() {
        {
            type = DERIVED_CORE;
            name = "XID_Start";
            shortName = "XIDS";
            header = "# Derived Property: " + name
                + "\r\n# ID_Start modified for closure under NFKx"
                + "\r\n# Modified as described in UAX #15"
                + "\r\n# NOTE: Does NOT remove the non-NFKx characters."
                + "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
        }
        public boolean hasValue(int cp) {
            return XID_Start_Set.contains(cp);
        }
    };

    dprops[Mod_ID_Continue_NO_Cf] = new UCDProperty() {
        {
            type = DERIVED_CORE;
            name = "XID_Continue";
            shortName = "XIDC";
            header = "# Derived Property: " + name
                + "\r\n# Mod_ID_Continue modified for closure under NFKx"
                + "\r\n# Modified as described in UAX #15"
                + "\r\n# NOTE: Cf characters should be filtered out."
                + "\r\n# NOTE: Does NOT remove the non-NFKx characters."
                + "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
        }
        public boolean hasValue(int cp) {
            return XID_Continue_Set.contains(cp);
        }
    };

    dprops[PropMath] = new UCDProperty() {
        {
            type = DERIVED_CORE;
            name = "Math";
            shortName = name;
            header = "# Derived Property: " + name
                + "\r\n# Generated from: Sm + Other_Math";
        }
        public boolean hasValue(int cp) {
            byte cat = ucdData.getCategory(cp);
            return cat == Sm
                || ucdData.getBinaryProperty(cp,Math_Property);
        }
    };

    dprops[PropAlphabetic] = new UCDProperty() {
        {
            type = DERIVED_CORE;
            name = "Alphabetic";
            shortName = "Alpha";
            header = "# Derived Property: " + name
                + "\r\n# Generated from: Lu+Ll+Lt+Lm+Lo+Nl + Other_Alphabetic";
        }
        public boolean hasValue(int cp) {
            byte cat = ucdData.getCategory(cp);
            return cat == Lu || cat == Ll || cat == Lt || cat == Lm || cat == Lo || cat == Nl
                || ucdData.getBinaryProperty(cp, Other_Alphabetic);
        }
    };

    dprops[PropLowercase] = new UCDProperty() {
        {
            type = DERIVED_CORE;
            name = "Lowercase";
            shortName = "Lower";
            header = "# Derived Property: " + name
                + "\r\n# Generated from: Ll + Other_Lowercase";
        }
        public boolean hasValue(int cp) {
            byte cat = ucdData.getCategory(cp);
            return cat == Ll
                || ucdData.getBinaryProperty(cp, Other_Lowercase);
        }
    };

    dprops[PropUppercase] = new UCDProperty() {
        {
            type = DERIVED_CORE;
            name = "Uppercase";
            shortName = "Upper";
            header = "# Derived Property: " + name
                + "\r\n# Generated from: Lu + Other_Uppercase";
        }
        public boolean hasValue(int cp) {
            byte cat = ucdData.getCategory(cp);
            return cat == Lu
                || ucdData.getBinaryProperty(cp, Other_Uppercase);
        }
    };

    for (int i = Missing_Uppercase; i <= Missing_Mixedcase; ++i) {
        dprops[i] = new CaseDProp(i);
    }

    /*
    (3) Singleton Decompositions: characters that can be derived from the UnicodeData file by
    including all characters whose canonical decomposition consists of a single character.
    (4) Non-Starter Decompositions: characters that can be derived from the UnicodeData
    file by including all characters whose canonical decomposition consists of a sequence
    of characters, the first of which has a non-zero combining class.
    */
    dprops[FullCompExclusion] = new UCDProperty() {
        {
            type = DERIVED_NORMALIZATION;
            name = "Full_Composition_Exclusion";
            shortName = "Comp_Ex";
            defaultValueStyle = defaultPropertyStyle = SHORT;
            header = "# Derived Property: " + name
                + "\r\n# Generated from: Composition Exclusions + Singletons + Non-Starter Decompositions";
        }
        public boolean hasValue(int cp) {
            if (!ucdData.isRepresented(cp)) return false;
            byte dtype = ucdData.getDecompositionType(cp);
            if (dtype != CANONICAL) return false;
            return isCompEx(cp);
        }
    };

    dprops[FullCompInclusion] = new UCDProperty() {
        {
            isStandard = false;
            type = DERIVED_NORMALIZATION;
            name = "Full_Composition_Inclusion";
            shortName = "Comp_In";
            defaultValueStyle = defaultPropertyStyle = SHORT;
            header = "# Derived Property: " + name
                + ": Full Composition Inclusion"
                + "\r\n# characters with Canonical Decompositions MINUS Full Composition Exclusion";
        }
        public boolean hasValue(int cp) {
            if (!ucdData.isRepresented(cp)) return false;
            byte dtype = ucdData.getDecompositionType(cp);
            if (dtype != CANONICAL) return false;
            // Inclusion is the complement of exclusion among canonically
            // decomposable characters.
            return !isCompEx(cp);
        }
    };

    dprops[FC_NFKC_Closure] = new UCDProperty() {
        {
            type = DERIVED_NORMALIZATION;
            setValueType(STRING_PROP);
            name = "FC_NFKC_Closure";
            shortName = "FC_NFKC";
            header = "# Derived Property: " + name
                + "\r\n# Generated from computing: b = NFKC(Fold(a)); c = NFKC(Fold(b));"
                + "\r\n# Then if (c != b) add the mapping from a to c to the set of"
                + "\r\n# mappings that constitute the FC_NFKC_Closure list"
                + "\r\n# Uses the full case folding from CaseFolding.txt, without the T option."
                ;
        }
        public String getValue(int cp, byte style) {
            if (!ucdData.isRepresented(cp)) return "";
            String b = nfkc.normalize(fold(cp));
            String c = nfkc.normalize(fold(b));
            if (c.equals(b)) return "";
            return "FNC; " + Utility.hex(c);
        }
        public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
    };

    dprops[FC_NFC_Closure] = new UCDProperty() {
        {
            type = DERIVED_NORMALIZATION;
            isStandard = false;
            name = "FC_NFC_Closure";
            setValueType(STRING_PROP);
            shortName = "FC_NFC";
            header = "# Derived Property: " + name
                + "\r\n# Generated from computing: b = NFC(Fold(a)); c = NFC(Fold(b));"
                + "\r\n# Then if (c != b) add the mapping from a to c to the set of"
                + "\r\n# mappings that constitute the FC_NFC_Closure list"
                + "\r\n# Uses the full case folding from CaseFolding.txt, without the T option."
                ;
        }
        public String getValue(int cp, byte style) {
            if (!ucdData.isRepresented(cp)) return "";
            String b = nfc.normalize(fold(cp));
            String c = nfc.normalize(fold(b));
            if (c.equals(b)) return "";
            return "FN; " + Utility.hex(c);
        }
        public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
    };

    for (int i = QuickNFD; i <= QuickNFKC; ++i) {
        dprops[i] = new QuickDProp(i - QuickNFD);
    }

    dprops[DefaultIgnorable] = new UCDProperty() {
        {
            type = DERIVED_CORE;
            name = "Default_Ignorable_Code_Point";
            hasUnassigned = true;
            shortName = "DI";
            header = null;
        }
        // The header depends on the UCD version, so it is computed on demand.
        public String getHeader() {
            if (ucdData.getCompositeVersion() > 0x040000) return "# Derived Property: " + name
                + "\r\n# Generated from (Other_Default_Ignorable_Code_Point + Variation_Selector"
                + "\r\n# + Noncharacter_Code_Point + Cf + Cc + Cs) - White_Space"
                + "\r\n# - U+FFF9..U+FFFB// INTERLINEAR ANNOTATION characters";
            return "# Derived Property: " + name
                + "\r\n# Generated from (Other_Default_Ignorable_Code_Point + Cf + Cc + Cs) - White_Space";
        }
        public boolean hasValue(int cp) {
            if (ucdData.getBinaryProperty(cp, White_space)) return false;
            if (ucdData.getBinaryProperty(cp, Other_Default_Ignorable_Code_Point)) return true;
            if (ucdData.getCompositeVersion() > 0x040000 && cp >= 0xFFF9 && cp <= 0xFFFB) return false;
            byte cat = ucdData.getCategory(cp);
            if (cat == Cf || cat == Cs || cat == Cc) return true;
            // The remaining terms only apply to versions after 0x040000.
            if (ucdData.getCompositeVersion() <= 0x040000) return false;
            if (ucdData.getBinaryProperty(cp, Variation_Selector)) return true;
            if (ucdData.getBinaryProperty(cp, Noncharacter_Code_Point)) return true;
            return false;
        }
    };

    dprops[Case_Sensitive] = new UCDProperty() {
        {
            type = DERIVED_CORE;
            isStandard = false;
            name = "Case_Sensitive";
            hasUnassigned = false;
            shortName = "CS";
            header = "# Derived Property: " + name
                + "\r\n# Generated from all characters that are either on the right or left side of a case mapping";
        }
        // Built lazily on the first hasValue() call.
        UnicodeSet case_sensitive = null;
        UnicodeSet tempSet = new UnicodeSet();
        UnicodeSet cased = null;
        PrintWriter log;

        // Adds every character on either side of one case mapping of cps, logging
        // additions that are not already in the cased set.
        private void addCase(String cps, byte c1, byte c2) {
            String temp = ucdData.getCase(cps, c1, c2);
            if (temp.equals(cps)) return;
            tempSet.clear();
            tempSet.addAll(cps);
            tempSet.addAll(temp);
            if (!case_sensitive.containsAll(tempSet)) {
                tempSet.removeAll(case_sensitive);
                if (!cased.containsAll(tempSet)) {
                    log.println();
                    log.println("Adding " + tempSet + " because of: ");
                    log.println("\t" + ucdData.getCodeAndName(cps));
                    log.println("=>\t" + ucdData.getCodeAndName(temp));
                }
                case_sensitive.addAll(tempSet);
            }
        }

        public boolean hasValue(int cp) {
            if (case_sensitive == null) {
                try {
                    log = Utility.openPrintWriter("Case_Sensitive_Log.txt", Utility.UTF8_UNIX);
                    System.out.println("Building Case-Sensitive cache");
                    case_sensitive = new UnicodeSet();
                    cased = DerivedProperty.make(PropLowercase, ucdData).getSet()
                        .addAll(DerivedProperty.make(PropUppercase, ucdData).getSet())
                        .addAll(UnifiedBinaryProperty.make(CATEGORY | Lt).getSet());
                    for (int c = 0; c < 0x10FFFF; ++c) {
                        Utility.dot(c);
                        // skip cases that can't matter
                        if (!ucdData.isAssigned(c)) continue;
                        String cps = UTF16.valueOf(c);
                        addCase(cps, FULL, LOWER);
                        addCase(cps, FULL, UPPER);
                        addCase(cps, FULL, TITLE);
                        addCase(cps, FULL, FOLD);
                        addCase(cps, SIMPLE, LOWER);
                        addCase(cps, SIMPLE, UPPER);
                        addCase(cps, SIMPLE, TITLE);
                        addCase(cps, SIMPLE, FOLD);
                    }
                    Utility.fixDot();
                    UnicodeSet temp;
                    log.println("Cased, but not Case_Sensitive");
                    temp = new UnicodeSet().addAll(cased).removeAll(case_sensitive);
                    Utility.showSetNames(log, "", temp, false, false, ucdData);
                    log.println("Case_Sensitive, but not Cased");
                    temp = new UnicodeSet().addAll(case_sensitive).removeAll(cased);
                    Utility.showSetNames(log, "", temp, false, false, ucdData);
                    log.println("Both Case_Sensitive, and Cased");
                    temp = new UnicodeSet().addAll(case_sensitive).retainAll(cased);
                    log.println(temp);
                    System.out.println("Done Building Case-Sensitive cache");
                } catch (Exception e) {
                    throw new ChainException("internal error", null, e);
                } finally {
                    // Close the log on failure as well as on success.
                    if (log != null) log.close();
                }
            }
            return case_sensitive.contains(cp);
        }
    };

    dprops[Other_Case_Ignorable] = new UCDProperty() {
        {
            name = "Other_Case_Ignorable";
            shortName = "OCI";
            isStandard = false;
            header = "# Binary Property";
        }
        public boolean hasValue(int cp) {
            /*
            0027 ; Other_Case_Ignorable # Po APOSTROPHE
            00AD ; Other_Case_Ignorable # Pd SOFT HYPHEN
            2019 ; Other_Case_Ignorable # Pf RIGHT SINGLE QUOTATION MARK
            */
            switch(cp) {
                case 0x27: case 0x2019: case 0xAD: return true;
            }
            return false;
        }
    };

    dprops[Type_i] = new UCDProperty() {
        {
            type = DERIVED_CORE;
            isStandard = false;
            name = "DSoft_Dotted";
            shortName = "DSDot";
            header = "# Derived Property: " + name
                + "\r\n# Generated from: all characters whose canonical decompositions end with a combining character sequence that"
                + "\r\n# - starts with i or j"
                + "\r\n# - has no combining marks above"
                + "\r\n# - has no combining marks with zero canonical combining class"
                ;
        }
        public boolean hasValue(int cp) {
            if (hasSoftDot(cp)) return true;
            if (nfkd.isNormalized(cp)) return false;
            String decomp = nfd.normalize(cp);
            boolean ok = false;
            // Walk the decomposition backwards: any mark above (class 230)
            // disqualifies, and every starter met must itself be soft-dotted.
            for (int i = decomp.length()-1; i >= 0; --i) {
                int ch = UTF16.charAt(decomp, i);
                int cc = ucdData.getCombiningClass(ch);
                if (cc == 230) return false;
                if (cc == 0) {
                    if (!hasSoftDot(ch)) return false;
                    ok = true;
                }
            }
            return ok;
        }
        boolean hasSoftDot(int ch) {
            return ch == 'i' || ch == 'j' || ch == 0x0268 || ch == 0x0456 || ch == 0x0458;
        }
    };

    dprops[Case_Ignorable] = new UCDProperty() {
        {
            name = "Case_Ignorable";
            isStandard = false;
            shortName = "CI";
            header = "# Derived Property: " + name
                + "\r\n# Generated from: Other_Case_Ignorable + Lm + Mn + Me + Cf";
        }
        public boolean hasValue(int cp) {
            byte cat = ucdData.getCategory(cp);
            if (cat == Lm || cat == Cf || cat == Mn || cat == Me) return true;
            return dprops[Other_Case_Ignorable].hasValue(cp);
        }
    };

    dprops[GraphemeExtend] = new UCDProperty() {
        {
            type = DERIVED_CORE;
            name = "Grapheme_Extend";
            shortName = "Gr_Ext";
            header = "# Derived Property: " + name
                + "\r\n# Generated from: Me + Mn + Other_Grapheme_Extend"
                + "\r\n# Note: depending on an application's interpretation of Co (private use),"
                + "\r\n# they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither."
                ;
        }
        public boolean hasValue(int cp) {
            byte cat = ucdData.getCategory(cp);
            return cat == Me || cat == Mn
                || ucdData.getBinaryProperty(cp,Other_GraphemeExtend);
        }
    };

    dprops[GraphemeBase] = new UCDProperty() {
        {
            type = DERIVED_CORE;
            name = "Grapheme_Base";
            shortName = "Gr_Base";
            header = "# Derived Property: " + name
                + "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend"
                + "\r\n# Note: depending on an application's interpretation of Co (private use),"
                + "\r\n# they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither."
                ;
        }
        public boolean hasValue(int cp) {
            byte cat = ucdData.getCategory(cp);
            if (cat == Cc || cat == Cf || cat == Cs || cat == Co || cat == Cn || cat == Zl || cat == Zp) return false;
            if (dprops[GraphemeExtend].hasValue(cp)) return false;
            return true;
        }
    };

    // Give every binary property the standard names for its "true" value.
    for (int i = 0; i < dprops.length; ++i) {
        UCDProperty up = dprops[i];
        if (up == null) continue;
        if (up.getValueType() != BINARY_PROP) continue;
        up.setValue(NUMBER, "1");
        up.setValue(SHORT, "T");
        up.setValue(LONG, "True");
    }
}
/**
 * Computes a case-oriented category for a code point, looking through its
 * NFKD decomposition when the character itself carries no case information.
 *
 * Fix: inside the loop the Other_Uppercase / Other_Lowercase checks now test
 * the decomposition component (cp2). They used to test cp, for which both are
 * already known to be false at that point, so they could never fire.
 *
 * @param cp code point to classify
 * @return Lu or Ll when the character (or all cased parts of its NFKD form)
 *         is upper / lower case; Lt when the NFKD form mixes cases or contains
 *         a titlecase letter; Lo for an uncased character that is already
 *         NFKD-normalized; otherwise the character's own category
 */
byte getDecompCat(int cp) {
    byte cat = ucdData.getCategory(cp);
    if (cat == Lu
        || ucdData.getBinaryProperty(cp, Other_Uppercase)) return Lu;
    if (cat == Ll
        || ucdData.getBinaryProperty(cp, Other_Lowercase)) return Ll;
    if (cat == Lt || cat == Lo || cat == Lm || cat == Nl) return cat;
    if (nf[NFKD].isNormalized(cp)) return Lo;

    String norm = nf[NFKD].normalize(cp);
    int cp2;
    boolean gotUpper = false;
    boolean gotLower = false;
    boolean gotTitle = false;
    for (int i = 0; i < norm.length(); i += UTF32.count16(cp2)) {
        cp2 = UTF32.char32At(norm, i);
        byte catx = ucdData.getCategory(cp2);
        boolean upx = ucdData.getBinaryProperty(cp2, Other_Uppercase);
        boolean lowx = ucdData.getBinaryProperty(cp2, Other_Lowercase);
        // U+0345 is counted as lowercase explicitly.
        if (catx == Ll || lowx || cp2 == 0x345) gotLower = true;
        if (catx == Lu || upx) gotUpper = true;
        if (catx == Lt) gotTitle = true;
    }
    if (gotLower && !gotUpper && !gotTitle) return Ll;
    if (!gotLower && gotUpper && !gotTitle) return Lu;
    if (gotLower || gotUpper || gotTitle) return Lt;
    return cat;
}
/**
 * Tells whether a code point is excluded from composition: it is listed as a
 * composition exclusion, its decomposition mapping is a single code point
 * (singleton), or the mapping starts with a character of non-zero combining
 * class (non-starter decomposition).
 */
boolean isCompEx(int cp) {
    if (ucdData.getBinaryProperty(cp, CompositionExclusion)) {
        return true;
    }
    String mapping = ucdData.getDecompositionMapping(cp);
    boolean singleton = UTF32.length32(mapping) == 1;
    return singleton
        || ucdData.getCombiningClass(UTF32.char32At(mapping,0)) != 0;
}
/** Returns the full case folding of a single code point, as given by the UCD. */
String fold(int cp) {
    String folded = ucdData.getCase(cp, FULL, FOLD);
    return folded;
}
/** Returns the full case folding of a string, as given by the UCD. */
String fold(String s) {
    String folded = ucdData.getCase(s, FULL, FOLD);
    return folded;
}
/**
 * Debugging aid: for each code point from U+00A0 up to (not including)
 * U+00FF, prints the character's code and name followed by every non-empty
 * derived property value.
 *
 * Property IDs for which make() returns null (no property registered in that
 * slot) are skipped instead of causing a NullPointerException.
 */
public static void test() {
    for (int cp = 0xA0; cp < 0xFF; ++cp) {
        System.out.println();
        System.out.println(Default.ucd().getCodeAndName(cp));
        for (int j = 0; j < DERIVED_PROPERTY_LIMIT; ++j) {
            UCDProperty up = make(j, Default.ucd());
            if (up == null) continue; // nothing registered under this ID
            String prop = up.getValue(cp);
            if (prop.length() != 0) System.out.println("\t" + prop);
        }
    }
}
}

View File

@ -1,118 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java,v $
* $Date: 2006/06/09 21:21:20 $
* $Revision: 1.13 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.*;
import java.util.*;
import com.ibm.text.utility.*;
/**
 * Property lister for a single derived property obtained from
 * DerivedProperty.make. Binary properties are listed as include / exclude;
 * properties with varying values start a new group whenever the value
 * changes from the previous listed code point.
 */
final class DerivedPropertyLister extends PropertyLister {
    static final boolean BRIDGE = false;

    private UCDProperty uprop;
    int width;
    boolean varies;
    // Value of the most recently listed code point (varying properties only).
    String last;

    /**
     * @param ucd character database to list from
     * @param propMask derived-property ID passed to DerivedProperty.make
     * @param output destination for the listing
     */
    public DerivedPropertyLister(UCD ucd, int propMask, PrintWriter output) {
        this.output = output;
        this.ucdData = ucd;
        uprop = DerivedProperty.make(propMask, ucd);
        varies = uprop.getValueType() < BINARY_PROP;
        width = super.minPropertyWidth();

        // Per-property layout tweaks.
        if (propMask == DerivedProperty.GenNFD
                || propMask == DerivedProperty.GenNFC
                || propMask == DerivedProperty.GenNFKD
                || propMask == DerivedProperty.GenNFKC) {
            alwaysBreaks = true;
        } else if (propMask == DerivedProperty.FC_NFKC_Closure) {
            alwaysBreaks = true;
            width = 21;
        } else if (propMask == DerivedProperty.QuickNFC
                || propMask == DerivedProperty.QuickNFKC) {
            width = 11;
        }
    }

    public String headerString() {
        return uprop.getHeader();
    }

    public String valueName(int cp) {
        return uprop.getListingValue(cp);
    }

    public int minPropertyWidth() {
        return width;
    }

    /**
     * Decides how a code point appears in the listing.
     *
     * @return EXCLUDE when the code point is unassigned (and the property does
     *         not cover unassigned code points) or has no value; INCLUDE when
     *         it has the property / the same value as the previous one; BREAK
     *         when a varying property's value changed
     */
    public byte status(int cp) {
        if (!(uprop.hasUnassigned() || ucdData.isAssigned(cp))) {
            return EXCLUDE;
        }
        if (!varies) {
            if (uprop.hasValue(cp)) {
                return INCLUDE;
            }
            return EXCLUDE;
        }
        String current = uprop.getValue(cp);
        if (current.length() == 0) {
            return EXCLUDE;
        }
        if (current.equals(last)) {
            return INCLUDE;
        }
        last = current;
        return BREAK;
    }
}

View File

@ -1,158 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java,v $
* $Date: 2004/02/06 18:30:22 $
* $Revision: 1.9 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import com.ibm.icu.text.UnicodeSet;
import java.io.*;
/**
 * Lists differences between two UCD versions.
 *
 * Without a property (NOPROPERTY) it lists the code points allocated in the
 * new UCD that are not allocated in the old one (or all allocated code points
 * when no old UCD is given). With a derived-property ID it lists the code
 * points, allocated in both versions, whose value for that property differs.
 * Every listed code point is also collected in the set returned by getSet().
 */
class DiffPropertyLister extends PropertyLister {
    private UCD oldUCD;
    private UnicodeSet set = new UnicodeSet();
    private static final int NOPROPERTY = -1;

    UCDProperty newProp = null;
    UCDProperty oldProp = null;
    String value = "";

    /**
     * @param oldUCDName version name of the baseline UCD, or null for none
     * @param newUCDName version name of the UCD being listed
     * @param output destination for the listing
     * @param property derived-property ID to compare, or -1 to compare allocation
     */
    public DiffPropertyLister(String oldUCDName, String newUCDName, PrintWriter output, int property) {
        this.output = output;
        this.ucdData = UCD.make(newUCDName);
        if (property != NOPROPERTY) newProp = DerivedProperty.make(property, ucdData);

        if (oldUCDName != null) {
            this.oldUCD = UCD.make(oldUCDName);
            if (property != NOPROPERTY) oldProp = DerivedProperty.make(property, oldUCD);
        }
        breakByCategory = property != NOPROPERTY;
        useKenName = false;
        usePropertyComment = false;
    }

    /** Allocation-only comparison (no property). */
    public DiffPropertyLister(String oldUCDName, String newUCDName, PrintWriter output) {
        this(oldUCDName, newUCDName, output, NOPROPERTY);
    }

    /** Returns the code points that status() has reported as INCLUDE so far. */
    public UnicodeSet getSet() {
        return set;
    }

    public String valueName(int cp) {
        return major_minor_only(ucdData.getVersion());
    }

    public String optionalComment(int cp) {
        String normal = super.optionalComment(cp);
        if (oldUCD != null && breakByCategory) {
            // breakByCategory is known to be true here, so the mask is always CASED_LETTER_MASK.
            byte modCat = oldUCD.getModCat(cp, CASED_LETTER_MASK);
            normal = oldUCD.getModCatID_fromIndex(modCat) + "/" + normal;
        }
        return normal;
    }

    byte getModCat(int cp) {
        byte result = ucdData.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : -1);
        return result;
    }

    /**
     * Decides whether a code point belongs in the listing (INCLUDE) or not
     * (EXCLUDE); see the class comment for the two modes.
     */
    public byte status(int cp) {
        if (newProp == null) {
            if (ucdData.isAllocated(cp) && (oldUCD == null || !oldUCD.isAllocated(cp))) {
                set.add(cp);
                return INCLUDE;
            } else {
                return EXCLUDE;
            }
        }

        // A property comparison needs a baseline; without one there is nothing
        // to report (this used to fail with a NullPointerException).
        if (oldUCD == null || oldProp == null) return EXCLUDE;

        // just look at property differences among allocated characters
        if (!ucdData.isAllocated(cp)) return EXCLUDE;
        if (!oldUCD.isAllocated(cp)) return EXCLUDE;
        String val = newProp.getValue(cp);
        String oldVal = oldProp.getValue(cp);
        if (!oldVal.equals(val)) {
            set.add(cp);
            return INCLUDE;
        }
        return EXCLUDE;
    }

    public String headerString() {
        String result;
        if (oldUCD != null) {
            result = "# Differences between "
                + major_minor_only(ucdData.getVersion())
                + " and "
                + major_minor_only(oldUCD.getVersion());
        } else {
            result = "# Designated as of "
                + major_minor_only(ucdData.getVersion())
                + " [excluding removed Hangul Syllables]";
        }
        return result;
    }

    /**
     * Drops the last dot-separated component of a version string when listing
     * allocation differences; property comparisons keep the full version.
     * A version without any '.' is returned unchanged.
     */
    private String major_minor_only(String s) {
        if (newProp != null) return s;
        int lastDot = s.lastIndexOf('.');
        if (lastDot < 0) return s;
        return s.substring(0, lastDot);
    }
}

File diff suppressed because it is too large Load Diff

View File

@ -1,624 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
* $Date: 2006/04/05 22:12:45 $
* $Revision: 1.18 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import com.ibm.icu.text.UTF16;
import com.ibm.text.utility.*;
public class GenerateCaseFolding implements UCD_Types {
// When true, getClosure() and add() trace their progress on System.err.
public static boolean DEBUG = false;
public static boolean COMMENT_DIFFS = false; // ON if we want a comment on mappings != lowercase
// When true, goodness() scores shorter strings higher and makeCaseFold() writes a
// single-code-point full folding as a "C" line. Overwritten by makeCaseFold(normalized).
public static boolean PICK_SHORT = false; // picks short value for SIMPLE if in FULL, changes weighting
// When true, getClosure() also closes each equivalence class under NFD/NFC/NFKD/NFKC
// normalization (it is passed as the nfClose argument). Overwritten by makeCaseFold(normalized).
public static boolean NF_CLOSURE = false; // normalization closure; see comment above
// generateSpecialCasing() prints extra diagnostics on System.out for this code point.
static final int CHECK_CHAR = 0x130; // for debugging, change to actual character, otherwise -1
// PICK_SHORT & NF_CLOSURE = false for old style
/*public static void main(String[] args) throws java.io.IOException {
makeCaseFold(arg[0]);
//getAge();
}
*/
static PrintWriter log;
/**
 * Builds the default and Turkish ("tr") case-folding maps, both full and simple, and
 * writes them as a CaseFolding data file under "DerivedData/". A companion
 * "CaseFoldingLog" file records the representative chosen for each equivalence class.
 *
 * @param normalized sets both PICK_SHORT and NF_CLOSURE, and appends "-Normalized" to
 *                   the output file name
 * @throws java.io.IOException if a file cannot be opened or written
 */
public static void makeCaseFold(boolean normalized) throws java.io.IOException {
    PICK_SHORT = NF_CLOSURE = normalized;

    log = Utility.openPrintWriter("CaseFoldingLog" + UnicodeDataFile.getFileSuffix(true), Utility.LATIN1_UNIX);
    try {
        System.out.println("Writing Log: " + "CaseFoldingLog" + UnicodeDataFile.getFileSuffix(true));

        System.out.println("Making Full Data");
        Map fullData = getCaseFolding(true, NF_CLOSURE, "");
        Utility.fixDot();

        System.out.println("Making Simple Data");
        Map simpleData = getCaseFolding(false, NF_CLOSURE, "");
        // write the data

        System.out.println("Making Turkish Full Data");
        Map fullDataTurkish = getCaseFolding(true, NF_CLOSURE, "tr");
        Utility.fixDot();

        // This progress message used to repeat "Making Simple Data".
        System.out.println("Making Turkish Simple Data");
        Map simpleDataTurkish = getCaseFolding(false, NF_CLOSURE, "tr");
        // write the data

        Utility.fixDot();
        System.out.println("Writing");
        String filename = "CaseFolding";
        if (normalized) filename += "-Normalized";
        String directory = "DerivedData/";
        UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader(directory, filename);
        PrintWriter out = fc.out;

        for (int ch = 0; ch <= 0x10FFFF; ++ch) {
            Utility.dot(ch);

            // charsUsed is filled in by getCaseFolding(): only those code points have a mapping
            if (!charsUsed.get(ch)) continue;

            String key = UTF32.valueOf32(ch);
            String rFull = (String) fullData.get(key);
            String rSimple = (String) simpleData.get(key);
            String rFullTurkish = (String) fullDataTurkish.get(key);
            String rSimpleTurkish = (String) simpleDataTurkish.get(key);
            if (rFull == null && rSimple == null && rFullTurkish == null && rSimpleTurkish == null) continue;

            boolean common = (rFull != null && rFull.equals(rSimple))
                || (PICK_SHORT && UTF16.countCodePoint(rFull) == 1);
            if (common) {
                // I, I-with-dot and dotless i are hard-coded; see the "Goal" comment
                // that follows this method.
                if (ch == 0x49) {
                    drawLine(out, ch, "C", "i");
                    drawLine(out, ch, "T", "\u0131");
                } else if (ch == 0x130) {
                    drawLine(out, ch, "F", "i\u0307");
                    drawLine(out, ch, "T", "i");
                } else if (ch == 0x131) {
                    // do nothing
                    //drawLine(out, ch, "I", "i");
                } else {
                    drawLine(out, ch, "C", rFull);
                }
            } else {
                if (rFull != null) {
                    drawLine(out, ch, "F", rFull);
                }
                if (rSimple != null) {
                    drawLine(out, ch, "S", rSimple);
                }
            }
            if (rFullTurkish != null && !rFullTurkish.equals(rFull)) {
                drawLine(out, ch, "T", rFullTurkish);
            }
            if (rSimpleTurkish != null && !rSimpleTurkish.equals(rSimple)) {
                drawLine(out, ch, "t", rSimpleTurkish);
            }
        }
        // Not moved into the finally block: fc.close() may do more than close the
        // stream, so it is only run after a complete, successful write.
        fc.close();
    } finally {
        // always release the log file, even when generation fails part-way
        log.close();
    }
}
/* Goal is following (with no entries for 0131 or 0069)
0049; C; 0069; # LATIN CAPITAL LETTER I
0049; T; 0131; # LATIN CAPITAL LETTER I
0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
*/
/**
 * Writes one data line of the form "code; type; mapping; # [comment] name".
 *
 * @param out    destination writer
 * @param ch     source code point
 * @param type   status letter written in the second field
 * @param result folded string written, as hex, in the third field
 */
static void drawLine(PrintWriter out, int ch, String type, String result) {
    String note = COMMENT_DIFFS ? lowercaseDiffNote(ch, result) : "";
    String codeField = Utility.hex(ch);
    String mappingField = Utility.hex(result, " ");
    out.println(codeField
        + "; " + type
        + "; " + mappingField
        + "; # " + note + Default.ucd().getName(ch));
}

/**
 * Returns a bracketed note when the full lowercase of ch differs from result,
 * or the empty string when they are equal.
 */
private static String lowercaseDiffNote(int ch, String result) {
    String source = UTF16.valueOf(ch);
    String lowered = Default.ucd().getCase(source, FULL, LOWER);
    if (lowered.equals(result)) {
        return "";
    }
    // The uppercase value is requested but never used, exactly as in the original code.
    Default.ucd().getCase(UTF16.valueOf(ch), FULL, UPPER);
    // NOTE(review): this repeats the call that produced 'lowered', so the PROBLEM branch
    // below looks unreachable unless getCase is not deterministic -- confirm the intent.
    String loweredAgain = Default.ucd().getCase(UTF16.valueOf(ch), FULL, LOWER);
    if (!lowered.equals(loweredAgain)) {
        Utility.fixDot();
        System.out.println("PROBLEM WITH: " + Default.ucd().getCodeAndName(ch));
        return "[DIFF " + Utility.hex(lowered, " ") + ", " + Utility.hex(loweredAgain, " ") + "] ";
    }
    return "[Diff " + Utility.hex(lowered, " ") + "] ";
}
// Debugging probe: any equivalence class containing this character is printed to System.out.
static int probeCh = 0x01f0;
static String shower = UTF16.valueOf(probeCh);
/**
 * Builds a case-folding map: for every single-code-point member of a case equivalence
 * class (other than the chosen representative) the returned map holds
 * member -> representative. As side effects it marks those members in charsUsed and
 * writes a report of every class to the static log writer, which must already be open.
 *
 * @param full      true to use FULL case mappings, false for SIMPLE
 * @param nfClose   passed to getClosure(): also close classes under normalization
 * @param condition passed through to the case-mapping calls ("" or "tr" in makeCaseFold)
 * @return map from single-code-point strings to their representative string
 */
static Map getCaseFolding(boolean full, boolean nfClose, String condition) throws java.io.IOException {
Map data = new TreeMap();
Map repChar = new TreeMap();
//String option = "";
// get the equivalence classes
for (int ch = 0; ch <= 0x10FFFF; ++ch) {
Utility.dot(ch);
//if ((ch & 0x3FF) == 0) System.out.println(Utility.hex(ch));
if (!Default.ucd().isRepresented(ch)) continue;
getClosure(ch, data, full, nfClose, condition);
}
// get the representative characters
// Note: this walks every key of data, so a class is visited once per key that maps to it.
Iterator it = data.keySet().iterator();
while (it.hasNext()) {
String s = (String) it.next();
Set set = (Set) data.get(s);
// 'show' is a static flag that goodness() also reads to decide whether to print
show = set.contains(shower);
if (show) {
Utility.fixDot();
System.out.println(toString(set));
}
// Pick the best available representative
// (highest goodness(); on a tie the element seen first in set order is kept)
String rep = null;
int repGood = 0;
String dup = null;
Iterator it2 = set.iterator();
while (it2.hasNext()) {
String s2 = (String)it2.next();
int s2Good = goodness(s2, full, condition);
if (s2Good > repGood) {
rep = s2;
repGood = s2Good;
dup = null;
} else if (s2Good == repGood) {
// dup is recorded but not read anywhere in this method
dup = s2;
}
}
if (rep == null) {
Utility.fixDot();
System.err.println("No representative for: " + toString(set));
// NOTE(review): execution continues below with rep == null -- confirm that is intended
} else if ((repGood & (NFC_FORMAT | ISLOWER)) != (NFC_FORMAT | ISLOWER)) {
// the winner lacks at least one of the two quality flags: report it in the log
String message = "";
if ((repGood & NFC_FORMAT) == 0) {
message += " [NOT NFC FORMAT]";
}
if ((repGood & ISLOWER) == 0) {
message += " [NOT LOWERCASE]";
}
Utility.fixDot();
log.println("Non-Optimal Representative " + message);
log.println(" Rep:\t" + Default.ucd().getCodeAndName(rep));
log.println(" Set:\t" + toString(set,true, true));
}
log.println();
log.println();
log.println(rep + "\t#" + Default.ucd().getName(rep));
// Add it for all the elements of the set
it2 = set.iterator();
while (it2.hasNext()) {
String s2 = (String)it2.next();
if (s2.equals(rep)) continue;
log.println(s2 + "\t#" + Default.ucd().getName(s2));
// only single-code-point members become keys of the folding map
if (UTF16.countCodePoint(s2) == 1) {
repChar.put(UTF32.getCodePointSubstring(s2,0), rep);
charsUsed.set(UTF16.charAt(s2, 0));
}
}
}
return repChar;
}
/** Code points that received a mapping in getCaseFolding(); read by makeCaseFold(). */
static BitSet charsUsed = new BitSet();
/** When true, goodness() prints each score it computes. Set by getCaseFolding(). */
static boolean show = false;
/** Score flag: the string is already in NFC. */
static final int NFC_FORMAT = 64;
/** Score flag: the string equals lower(upper(string)). */
static final int ISLOWER = 128;

/**
 * Scores a candidate representative; a higher score is preferred.
 * The base score is the string length (or 32 minus the length when PICK_SHORT is set),
 * shifted left by 8 for simple mappings, with ISLOWER and NFC_FORMAT OR-ed in.
 *
 * @return the score, or 0 for a null string
 */
static int goodness(String s, boolean full, String condition) {
    if (s == null) {
        return 0;
    }
    int score = PICK_SHORT ? 32 - s.length() : s.length();
    if (!full) {
        score <<= 8;
    }

    String roundTrip = lower(upper(s, full, condition), full, condition);
    // With PICK_SHORT, canonical equivalence to the round-tripped form is good enough.
    boolean lowerLike = s.equals(roundTrip)
        || (PICK_SHORT && Default.nfd().normalize(s).equals(Default.nfd().normalize(roundTrip)));
    if (lowerLike) {
        score |= ISLOWER;
    }

    boolean alreadyNfc = s.equals(Default.nfc().normalize(s));
    if (alreadyNfc) {
        score |= NFC_FORMAT;
    }

    if (show) {
        Utility.fixDot();
        System.out.println(Utility.hex(score) + ", " + Default.ucd().getCodeAndName(s));
    }
    return score;
}
/*
static HashSet temp = new HashSet();
static void normalize(HashSet set) {
temp.clear();
temp.addAll(set);
set.clear();
Iterator it = temp.iterator();
while (it.hasNext()) {
String s = (String) it.next();
String s2 = KC.normalize(s);
set.add(s);
data2.put(s,set);
if (!s.equals(s2)) {
set.add(s2);
data2.put(s2,set);
System.err.println("Adding " + Utility.hex(s) + " by " + Utility.hex(s2));
}
}
}
*/
/*
String
String lower1 = Default.ucd.getLowercase(ch);
String lower2 = Default.ucd.toLowercase(ch,option);
char ch2 = Default.ucd.getLowercase(Default.ucd.getUppercase(ch).charAt(0)).charAt(0);
//String lower1 = String.valueOf(Default.ucd.getLowercase(ch));
//String lower = Default.ucd.toLowercase(ch2,option);
String upper = Default.ucd.toUppercase(ch2,option);
String lowerUpper = Default.ucd.toLowercase(upper,option);
//String title = Default.ucd.toTitlecase(ch2,option);
//String lowerTitle = Default.ucd.toLowercase(upper,option);
if (ch != ch2 || lowerUpper.length() != 1 || ch != lowerUpper.charAt(0)) { //
output.println(Utility.hex(ch)
+ "; " + (lowerUpper.equals(lower1) ? "L" : lowerUpper.equals(lower2) ? "S" : "E")
+ "; " + Utility.hex(lowerUpper," ")
+ ";\t#" + Default.ucd.getName(ch)
);
//if (!lowerUpper.equals(lower)) {
// output.println("Warning1: " + Utility.hex(lower) + " " + Default.ucd.getName(lower));
//}
//if (!lowerUpper.equals(lowerTitle)) {
// output.println("Warning2: " + Utility.hex(lowerTitle) + " " + Default.ucd.getName(lowerTitle));
//}
}
*/
/**
 * Computes the case equivalence class of ch and registers it in data.
 * Nothing is done when ch is unchanged by lower-, title- and uppercasing.
 * Otherwise a new set containing ch is stored under ch, seeded with its three case
 * forms, and then repeatedly extended with the case forms (and, when nfClose is set,
 * the NFD/NFC/NFKD/NFKC forms) of every member until no new string appears.
 * add() merges in any set that data already holds for an added string.
 *
 * @param ch        code point to start from
 * @param data      map from string to the Set (equivalence class) it belongs to; modified
 * @param full      true for FULL case mappings, false for SIMPLE
 * @param nfClose   true to close under normalization as well
 * @param condition passed through to the case-mapping helpers
 */
static void getClosure(int ch, Map data, boolean full, boolean nfClose, String condition) {
String charStr = UTF32.valueOf32(ch);
String lowerStr = lower(charStr, full, condition);
String titleStr = title(charStr, full, condition);
String upperStr = upper(charStr, full, condition);
if (charStr.equals(lowerStr) && charStr.equals(upperStr) && charStr.equals(titleStr)) return;
if (DEBUG) System.err.println("Closure for " + Utility.hex(ch));
// make new set
Set set = new TreeSet();
set.add(charStr);
data.put(charStr, set);
// add cases to get started
add(set, lowerStr, data);
add(set, upperStr, data);
add(set, titleStr, data);
// close it
// Each successful add() changes the set, so iteration is restarted from scratch
// ("continue main") with a fresh iterator; the loop ends after one full pass adds nothing.
main:
while (true) {
Iterator it = set.iterator();
while (it.hasNext()) {
String s = (String) it.next();
// do funny stuff since we can't modify set while iterating
// We don't do this because if the source is not normalized, we don't want to normalize
if (nfClose) {
if (add(set, Default.nfd().normalize(s), data)) continue main;
if (add(set, Default.nfc().normalize(s), data)) continue main;
if (add(set, Default.nfkd().normalize(s), data)) continue main;
if (add(set, Default.nfkc().normalize(s), data)) continue main;
}
if (add(set, lower(s, full, condition), data)) continue main;
if (add(set, title(s, full, condition), data)) continue main;
if (add(set, upper(s, full, condition), data)) continue main;
}
break;
}
}
/**
 * Lowercases s via lower2() and then replaces every U+03C2 with U+03C3
 * (the original code labels this a HACK for lower).
 */
static String lower(String s, boolean full, String condition) {
    final char from = '\u03C2';
    final char to = '\u03C3';
    return lower2(s, full, condition).replace(from, to); // HACK for lower
}
// These functions are no longer necessary, since Default.ucd is parameterized,
// but it's not worth changing
/**
 * Lowercase of s from the default UCD, using FULL or SIMPLE mappings as selected
 * by 'full', under the given condition.
 */
static String lower2(String s, boolean full, String condition) {
    if (full) {
        return Default.ucd().getCase(s, FULL, LOWER, condition);
    }
    return Default.ucd().getCase(s, SIMPLE, LOWER, condition);
}
/**
 * Uppercase of s from the default UCD, using FULL or SIMPLE mappings as selected
 * by 'full', under the given condition.
 */
static String upper(String s, boolean full, String condition) {
    if (full) {
        return Default.ucd().getCase(s, FULL, UPPER, condition);
    }
    return Default.ucd().getCase(s, SIMPLE, UPPER, condition);
}
/**
 * Titlecase of s from the default UCD, using FULL or SIMPLE mappings as selected
 * by 'full', under the given condition.
 */
static String title(String s, boolean full, String condition) {
    if (full) {
        return Default.ucd().getCase(s, FULL, TITLE, condition);
    }
    return Default.ucd().getCase(s, SIMPLE, TITLE, condition);
}
/**
 * Adds s to the equivalence class 'set'. If data already maps s to a different class,
 * that class is merged in: each of its members is re-pointed at 'set' in data and then
 * copied into 'set'.
 *
 * @return true if s was newly added, false if it was already a member
 */
static boolean add(Set set, String s, Map data) {
    if (!set.add(s)) {
        return false;
    }
    if (DEBUG) System.err.println("adding: " + toString(set));

    Object existing = data.get(s);
    if (existing != null && existing != set) { // merge
        Set other = (Set) existing;
        // make all the items in the other set point to the merged set
        for (Iterator members = other.iterator(); members.hasNext();) {
            data.put(members.next(), set);
        }
        set.addAll(other);
    }

    if (DEBUG) System.err.println("done adding: " + toString(set));
    return true;
}
/** Formats a set of strings as "{hex; hex; ...}" on one line. */
static String toString(Set set) {
    return toString(set, false, false);
}

/**
 * Formats a set of strings inside braces.
 *
 * @param set   strings to list, in the set's iteration order
 * @param name  true to show each element as code-and-name, false for space-separated hex
 * @param crtab true to separate elements with ";\r\n\t", false for "; "
 */
static String toString(Set set, boolean name, boolean crtab) {
    String separator = crtab ? ";\r\n\t" : "; ";
    StringBuffer buffer = new StringBuffer("{");
    boolean needSeparator = false;
    for (Iterator items = set.iterator(); items.hasNext();) {
        String item = (String) items.next();
        if (needSeparator) {
            buffer.append(separator);
        }
        needSeparator = true;
        String shown = name ? Default.ucd().getCodeAndName(item) : Utility.hex(item, " ");
        buffer.append(shown);
    }
    return buffer.append("}").toString();
}
/**
 * True for U+00DF (es-zed), and for any code point that is not already
 * normalized under NFKD.
 */
static boolean specialNormalizationDiffers(int ch) {
    return ch == 0x00DF // es-zed
        || !Default.nfkd().isNormalized(ch);
}
/** NFKD of s, except that the single character U+00DF maps to "ss". */
static String specialNormalization(String s) {
    boolean isEsZed = s.equals("\u00DF");
    return isEsZed ? "ss" : Default.nfkd().normalize(s);
}
/**
 * Tells generateSpecialCasing() which mappings to write to the exceptions log
 * instead of the data file: a few hard-coded code points and ranges, plus anything
 * whose decomposition type is COMPAT_SQUARE.
 */
static boolean isExcluded(int ch) {
    // (0x130, LATIN CAPITAL LETTER I WITH DOT ABOVE, is deliberately not skipped)
    boolean hardCoded =
        ch == 0x0132 || ch == 0x0133            // skip IJ, ij
        || ch == 0x037A                         // skip GREEK YPOGEGRAMMENI
        || (0x249C <= ch && ch <= 0x24B5)       // skip PARENTHESIZED LATIN SMALL LETTER A..
        || (0x20A8 <= ch && ch <= 0x217B);      // skip Rupee..
    if (hardCoded) {
        return true;
    }
    // (COMPAT_UNSPECIFIED is deliberately not skipped)
    return Default.ucd().getDecompositionType(ch) == COMPAT_SQUARE;
}
/**
 * Generates the SpecialCasing data file under "DerivedData/" plus a
 * "SpecialCasingExceptions" log. For each represented code point whose special
 * normalization differs, the case mappings of its decomposition are compared with its
 * own simple case mappings; code points where they differ get a line, grouped by a
 * computed "order" and sorted by code point within a group. Excluded code points
 * (see isExcluded) are written to the log, commented out, instead.
 *
 * @param normalize true appends "-Normalized" to the file names and keeps the mappings
 *                  as computed; false writes most mappings in NFD (see 'denormalize')
 * @throws IOException if a file cannot be opened or written
 */
static void generateSpecialCasing(boolean normalize) throws IOException {
Map sorted = new TreeMap();
String suffix2 = "";
if (normalize) suffix2 = "-Normalized";
// Note: this local 'log' shadows the static field of the same name.
PrintWriter log = Utility.openPrintWriter("SpecialCasingExceptions"
+ suffix2 + UnicodeDataFile.getFileSuffix(true), Utility.LATIN1_UNIX);
for (int ch = 0; ch <= 0x10FFFF; ++ch) {
Utility.dot(ch);
if (!Default.ucd().isRepresented(ch)) continue;
if (!specialNormalizationDiffers(ch)) continue;
// simple case mappings of the character itself
String lower = Default.nfc().normalize(Default.ucd().getCase(ch, SIMPLE, LOWER));
String upper = Default.nfc().normalize(Default.ucd().getCase(ch, SIMPLE, UPPER));
String title = Default.nfc().normalize(Default.ucd().getCase(ch, SIMPLE, TITLE));
String chstr = UTF16.valueOf(ch);
String decomp = specialNormalization(chstr);
// case mappings of the decomposed form ("f" prefix)
String flower = Default.nfc().normalize(Default.ucd().getCase(decomp, SIMPLE, LOWER));
String fupper = Default.nfc().normalize(Default.ucd().getCase(decomp, SIMPLE, UPPER));
String ftitle = Default.nfc().normalize(Default.ucd().getCase(decomp, SIMPLE, TITLE));
String base = decomp;
// decompositions of the simple case mappings ("b" prefix)
String blower = specialNormalization(lower);
String bupper = specialNormalization(upper);
String btitle = specialNormalization(title);
// always taken: brings every value compared below into NFC
if (true) {
flower = Default.nfc().normalize(flower);
fupper = Default.nfc().normalize(fupper);
ftitle = Default.nfc().normalize(ftitle);
base = Default.nfc().normalize(base);
blower = Default.nfc().normalize(blower);
bupper = Default.nfc().normalize(bupper);
btitle = Default.nfc().normalize(btitle);
}
if (ch == CHECK_CHAR) {
System.out.println("Code: " + Default.ucd().getCodeAndName(ch));
System.out.println("Decomp: " + Default.ucd().getCodeAndName(decomp));
System.out.println("Base: " + Default.ucd().getCodeAndName(base));
System.out.println("SLower: " + Default.ucd().getCodeAndName(lower));
System.out.println("FLower: " + Default.ucd().getCodeAndName(flower));
System.out.println("BLower: " + Default.ucd().getCodeAndName(blower));
System.out.println("STitle: " + Default.ucd().getCodeAndName(title));
System.out.println("FTitle: " + Default.ucd().getCodeAndName(ftitle));
System.out.println("BTitle: " + Default.ucd().getCodeAndName(btitle));
System.out.println("SUpper: " + Default.ucd().getCodeAndName(upper));
System.out.println("FUpper: " + Default.ucd().getCodeAndName(fupper));
System.out.println("BUpper: " + Default.ucd().getCodeAndName(bupper));
}
// presumably if there is a single code point, it would already be in the simple mappings
// NOTE(review): the third test uses 'title' while the first two use the f-prefixed
// values; 'ftitle' may have been intended -- confirm before changing, since the
// generated data depends on it.
if (UTF16.countCodePoint(flower) == 1 && UTF16.countCodePoint(fupper) == 1
&& UTF16.countCodePoint(title) == 1) {
if (ch == CHECK_CHAR) System.out.println("Skipping single code point: " + Default.ucd().getCodeAndName(ch));
continue;
}
// if there is no change from the base, skip
if (flower.equals(base) && fupper.equals(base) && ftitle.equals(base)) {
if (ch == CHECK_CHAR) System.out.println("Skipping equals base: " + Default.ucd().getCodeAndName(ch));
continue;
}
// fix special cases
// if (flower.equals(blower) && fupper.equals(bupper) && ftitle.equals(btitle)) continue;
if (flower.equals(blower)) flower = lower;
if (fupper.equals(bupper)) fupper = upper;
if (ftitle.equals(btitle)) ftitle = title;
// if there are no changes from the original, or the expanded original, skip
if (flower.equals(lower) && fupper.equals(upper) && ftitle.equals(title)) {
if (ch == CHECK_CHAR) System.out.println("Skipping unchanged: " + Default.ucd().getCodeAndName(ch));
continue;
}
String name = Default.ucd().getName(ch);
// 'order' selects the output section; the first matching test wins
// (the section headers are written by the switch further down)
int order = name.equals("LATIN SMALL LETTER SHARP S") ? 1
: ch == 0x130 ? 2
: name.indexOf("ARMENIAN SMALL LIGATURE") >= 0 ? 4
: name.indexOf("LIGATURE") >= 0 ? 3
: name.indexOf("GEGRAMMENI") < 0 ? 5
: UTF16.countCodePoint(ftitle) == 1 ? 6
: UTF16.countCodePoint(fupper) == 2 ? 7
: 8;
if (ch == CHECK_CHAR) System.out.println("Order: " + order + " for " + Default.ucd().getCodeAndName(ch));
// HACK
// when not generating the normalized file, mappings are written in NFD except in groups 6 and 7
boolean denormalize = !normalize && order != 6 && order != 7;
// a mapping equal to 'base' is written as the character itself
String mapping = Utility.hex(ch)
+ "; " + Utility.hex(flower.equals(base) ? chstr : denormalize ? Default.nfd().normalize(flower) : flower)
+ "; " + Utility.hex(ftitle.equals(base) ? chstr : denormalize ? Default.nfd().normalize(ftitle) : ftitle)
+ "; " + Utility.hex(fupper.equals(base) ? chstr : denormalize ? Default.nfd().normalize(fupper) : fupper)
+ "; # " + Default.ucd().getName(ch);
// special exclusions
if (isExcluded(ch)) {
log.println("# " + mapping);
} else {
int x = ch;
if (ch == 0x01F0) x = 0x03B1; // HACK to reorder the same
// sort key: order in the top byte, (possibly adjusted) code point below it
sorted.put(new Integer((order << 24) | x), mapping);
}
}
log.close();
System.out.println("Writing");
//String newFile = "DerivedData/SpecialCasing" + suffix2 + UnicodeDataFile.getFileSuffix(true);
//PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
UnicodeDataFile udf = UnicodeDataFile.openAndWriteHeader("DerivedData/", "SpecialCasing" + suffix2);
PrintWriter out = udf.out;
/* String[] batName = {""};
String mostRecent = UnicodeDataFile.generateBat("DerivedData/", "SpecialCasing", suffix2 + UnicodeDataFile.getFileSuffix(true), batName);
out.println("# SpecialCasing" + UnicodeDataFile.getFileSuffix(false));
out.println(UnicodeDataFile.generateDateLine());
out.println("#");
*/
//Utility.appendFile("com/ibm/text/UCD/SpecialCasingHeader.txt", Utility.UTF8, out);
// TreeMap iteration yields the keys in ascending order, i.e. grouped by 'order'
Iterator it = sorted.keySet().iterator();
int lastOrder = -1;
while (it.hasNext()) {
Integer key = (Integer) it.next();
String line = (String) sorted.get(key);
int order = key.intValue() >> 24;
// first line of a new group: write a blank line and that group's header, if any
if (order != lastOrder) {
lastOrder = order;
out.println();
boolean skipLine = false;
switch(order) {
case 1:
out.println("# The German es-zed is special--the normal mapping is to SS.");
out.println("# Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>))");
break;
case 2:
out.println("# Preserve canonical equivalence for I with dot. Turkic is handled below.");
break;
case 3: out.println("# Ligatures"); break;
case 4: skipLine = true; break;
case 5: out.println("# No corresponding uppercase precomposed character"); break;
case 6: Utility.appendFile("com/ibm/text/UCD/SpecialCasingIota.txt", Utility.UTF8, out); break;
case 7: out.println("# Some characters with YPOGEGRAMMENI also have no corresponding titlecases"); break;
case 8: skipLine = true; break;
}
if (!skipLine) out.println();
}
out.println(line);
}
Utility.appendFile("com/ibm/text/UCD/SpecialCasingFooter.txt", Utility.UTF8, out);
udf.close();
//Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
}
}

View File

@ -1,93 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseTest.java,v $
* $Date: 2004/02/07 01:01:15 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
 * Generates "CaseTest.txt": one line per allocated, non-Hangul, non-private-use code
 * point that has a case mapping, giving its NFC lower/upper/title/fold forms, plus a
 * second line for the code point followed by a combining mark when the mark changes
 * how the case mappings interact with normalization.
 */
abstract public class GenerateCaseTest implements UCD_Types {

    public static void main(String[] args) throws IOException {
        System.out.println("Remember to add length marks (half & full) and other punctuation for sentence, with FF61");

        PrintWriter out = Utility.openPrintWriter("CaseTest.txt", Utility.UTF8_WINDOWS);
        try {
            out.println("# CaseTest");
            out.println("# Generated: " + Default.getDate() + ", MED");
            Utility.appendFile("CaseTestHeader.txt", Utility.LATIN1, out);

            // inclusive upper bound: the loop used to stop one short of U+10FFFF
            for (int cp = 0; cp <= 0x10FFFF; ++cp) {
                Utility.dot(cp);
                if (!Default.ucd().isAllocated(cp)) continue;
                if (Default.ucd().isHangulSyllable(cp)) continue;
                byte cat = Default.ucd().getCategory(cp);
                // Skip private-use characters. The test used to compare the code point
                // itself with the PRIVATE_USE constant and left 'cat' unused.
                if (cat == PRIVATE_USE) continue;

                String lower = Default.ucd().getCase(cp, FULL, LOWER);
                String upper = Default.ucd().getCase(cp, FULL, UPPER);
                String title = Default.ucd().getCase(cp, FULL, TITLE);
                String fold = Default.ucd().getCase(cp, FULL, FOLD);
                // no case mapping at all: nothing to test
                if (lower.equals(upper)
                    && lower.equals(title)
                    && lower.equals(fold)) continue;

                String s = UTF16.valueOf(cp);
                write(out, s, true);

                // if (cp == '\u0345') continue; // don't add combining for this special case

                // Second line only when appending the combining mark before casing gives a
                // different result from appending it after casing.
                s = s + testChar;

                String s2 = Default.nfd().normalize(s);

                String lower1 = Default.nfc().normalize(Default.ucd().getCase(s2, FULL, LOWER));
                String upper1 = Default.nfc().normalize(Default.ucd().getCase(s2, FULL, UPPER));
                String title1 = Default.nfc().normalize(Default.ucd().getCase(s2, FULL, TITLE));
                String fold1 = Default.nfc().normalize(Default.ucd().getCase(s2, FULL, FOLD));

                if (lower1.equals(Default.nfc().normalize(lower+testChar))
                    && upper1.equals(Default.nfc().normalize(upper+testChar))
                    && title1.equals(Default.nfc().normalize(title+testChar))
                    && fold1.equals(Default.nfc().normalize(fold+testChar))
                    ) continue;

                write(out, s, true);
            }
            out.println("# total lines: " + counter);
        } finally {
            // release the output file even if generation fails part-way
            out.close();
        }
    }

    /** Combining mark appended to each cased character for the second test line. */
    static final char testChar = '\u0316';
    /** Number of lines written by write(). */
    static int counter = 0;

    /**
     * Writes one test line: "source; lower; upper; title; fold", all as hex, where the
     * four case forms are computed from the NFD of ss and then converted to NFC.
     *
     * @param out       destination writer
     * @param ss        source string (written un-normalized in the first field)
     * @param doComment true to append a tab and "# name" comment
     */
    static void write(PrintWriter out, String ss, boolean doComment) {
        String s = Default.nfd().normalize(ss);
        String lower = Default.nfc().normalize(Default.ucd().getCase(s, FULL, LOWER));
        String upper = Default.nfc().normalize(Default.ucd().getCase(s, FULL, UPPER));
        String title = Default.nfc().normalize(Default.ucd().getCase(s, FULL, TITLE));
        String fold = Default.nfc().normalize(Default.ucd().getCase(s, FULL, FOLD));
        out.println(Utility.hex(ss) + "; "
            + Utility.hex(lower) + "; "
            + Utility.hex(upper) + "; "
            + Utility.hex(title) + "; "
            + Utility.hex(fold)
            + (doComment ? "\t# " + Default.ucd().getName(ss) : "")
        );
        counter++;
    }
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,777 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java,v $
* $Date: 2004/04/17 18:21:39 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
public class GenerateLineBreakTest implements UCD_Types {
// COMMON STUFF for Hangul

/** Hangul syllable-type codes; hNot means "not Hangul", hLIMIT is one past the last type. */
static final byte hNot = -1, hL = 0, hV = 1, hT = 2, hLV = 3, hLVT = 4, hLIMIT = 5;
/** Display names, indexed by the hL..hLVT codes. */
static final String[] hNames = {"L", "V", "T", "LV", "LVT"};

/**
 * Classifies a code point as leading/vowel/trailing jamo, as an LV or LVT Hangul
 * syllable, or as hNot when it is none of these.
 */
static byte getHangulType(int cp) {
    byte kind = hNot;
    if (Default.ucd().isLeadingJamo(cp)) {
        kind = hL;
    } else if (Default.ucd().isVowelJamo(cp)) {
        kind = hV;
    } else if (Default.ucd().isTrailingJamo(cp)) {
        kind = hT;
    } else if (Default.ucd().isHangulSyllable(cp)) {
        kind = Default.ucd().isDoubleHangul(cp) ? hLV : hLVT;
    }
    return kind;
}
//============================
// Name of the rule behind the most recent break decision; it is read right after
// each isBreak() call in getTableEntry() and printLine().
// NOTE(review): presumably assigned inside isBreak(), which is not visible here -- confirm.
protected String rule;
// Prefix of the generated file names ("Line" + "BreakTest.html", "Test.txt", ...).
protected String fileName = "Line";
// all the other items are supplied in UCD_TYPES
// Extra type codes appended after the regular line-break values: one per Hangul
// syllable type (LB_LIMIT + hX), then LB_SUP, which getType() returns for code points
// above 0xFFFF. LB2_LIMIT is one past LB_SUP.
static byte LB_L = LB_LIMIT + hL, LB_V = LB_LIMIT + hV, LB_T = LB_LIMIT + hT,
LB_LV = LB_LIMIT + hLV, LB_LVT = LB_LIMIT + hLVT, LB_SUP = LB_LIMIT + hLIMIT,
LB2_LIMIT = (byte)(LB_SUP + 1);
// One sample string per type code, indexed by the value getType() returns;
// filled in by findSamples().
String[] samples = new String[100];
// Order in which the types are listed in the generated table and test files.
byte[] TypeOrder = {
LB_OP, LB_CL, LB_QU, LB_GL, LB_NS, LB_EX, LB_SY, LB_IS, LB_PR, LB_PO,
LB_NU, LB_AL, LB_ID, LB_IN, LB_HY, LB_BA, LB_BB, LB_B2, LB_ZW, LB_CM,
// missing from Pair Table
LB_SP, LB_BK, LB_CR, LB_LF,
// resolved types below
LB_CB, LB_AI, LB_SA, LB_SG, LB_XX,
// 3 JAMO CLASSES, plus supplementary
LB_L, LB_V, LB_T, LB_LV, LB_LVT, LB_SUP
};
/** Runs the line-break generator, then the word-break generator. */
public static void main(String[] args) throws IOException {
    GenerateLineBreakTest lineBreak = new GenerateLineBreakTest();
    lineBreak.run();

    GenerateWordBreakTest wordBreak = new GenerateWordBreakTest();
    wordBreak.run();
}
// stuff that subclasses need to override
/**
 * Generates all output for this break type: an HTML file holding the "current" and
 * "recommended" pair tables, then two text test files ("Test_SHORT.txt" without
 * comments and "Test.txt" with a comment on the first item of each pair).
 * Each writer is now closed in a finally block, so a failure part-way through no
 * longer leaks the open file.
 *
 * @throws IOException if an output file cannot be opened or written
 */
public void run() throws IOException {
    findSamples();

    // test individual cases
    //printLine(out, samples[LB_ZW], "", samples[LB_CL]);
    //printLine(out, samples[LB_ZW], " ", samples[LB_CL]);

    PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest.html", Utility.UTF8_WINDOWS);
    try {
        out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title>"
            + fileName + "</title></head>");
        out.println("<body bgcolor='#FFFFFF'><h3>Current (fixed only for consistency):</h3>");
        generateTable(out, false);
        out.println("<h3>Recommended:</h3>");
        generateTable(out, true);
        out.println("</body></html>");
    } finally {
        out.close();
    }

    // scratch array that genTestItems() fills with the strings for one pair
    String[] testCase = new String[50];
    // do main test

    for (int k = 0; k < 2; ++k) {
        out = Utility.openPrintWriter(fileName + (k == 0 ? "Test_SHORT.txt" : "Test.txt"), Utility.LATIN1_WINDOWS);
        try {
            int counter = 0;

            out.println("# Default " + fileName + " Break Test");
            out.println("# Generated: " + Default.getDate() + ", MED");
            out.println("#");
            out.println("# Format:");
            out.println("# <string> (# <comment>)? ");
            out.println("# <string> contains hex Unicode code points, with ");
            out.println("#\t" + BREAK + " wherever there is a break opportunity, and ");
            out.println("#\t" + NOBREAK + " wherever there is not.");
            out.println("# <comment> the format can change, but currently it shows:");
            out.println("#\t- the sample character name");
            out.println("#\t- (x) the line_break property* for the sample character");
            out.println("#\t- [x] the rule that determines whether there is a break or not");
            out.println("#");
            out.println("# Samples:");
            out.println("# The test currently takes all pairs of linebreak types*,");
            out.println("# picks a sample for each type, and generates three strings: ");
            out.println("#\t- the pair alone");
            out.println("#\t- the pair alone with an imbeded space");
            out.println("#\t- the pair alone with embedded combining marks");
            out.println("# The sample for each type is simply the first code point (above NULL)");
            out.println("# with that property.");
            out.println("# * Note:");
            out.println("#\t- SG is omitted");
            out.println("#\t- 3 different Jamo characters and a supplementary character are added");
            out.println("#\t The syllable types for the Jamo (L, V, T) are displayed in comments");
            out.println("#\t instead of the linebreak property");
            out.println("# These samples may be extended in the future.");
            out.println("#");

            // every ordered pair of types, except SG
            for (int ii = 0; ii < getLimit(); ++ii) {
                int i = TypeOrder[ii];
                if (i == LB_SG) continue;
                String before = samples[i];

                for (int jj = 0; jj < getLimit(); ++jj) {
                    Utility.dot(counter);
                    int j = TypeOrder[jj];
                    if (j == LB_SG) continue;
                    String after = samples[j];

                    // do line straight
                    int len = genTestItems(before, after, testCase);
                    for (int q = 0; q < len; ++q) {
                        // comments only in the long file, and only on the first item of a pair
                        printLine(out, testCase[q], k != 0 && q == 0, false);
                        ++counter;
                    }
                }
            }

            out.println("# Lines: " + counter);
        } finally {
            out.close();
        }
    }
}
// stuff that subclasses need to override
/**
 * Fills results with the test strings for one (before, after) pair: the pair joined
 * directly, joined by a space, and joined by the two combining marks U+0301 U+0308.
 *
 * @param results array that receives the strings, starting at index 0
 * @return the number of strings stored (3)
 */
public int genTestItems(String before, String after, String[] results) {
    String[] joiners = {"", " ", "\u0301\u0308"};
    for (int i = 0; i < joiners.length; ++i) {
        results[i] = before + joiners[i] + after;
    }
    return joiners.length;
}
// stuff that subclasses need to override
/** True for the types left out of the generated table: AI, SA, SG and XX. */
boolean skipType(byte type) {
    if (type == LB_AI) return true;
    if (type == LB_SA) return true;
    if (type == LB_SG) return true;
    return type == LB_XX;
}
// stuff that subclasses need to override
/**
 * Short name of the type of cp: "SUP" for LB_SUP, a Hangul syllable-type name for
 * the other codes at or above LB_LIMIT, otherwise the UCD line-break ID.
 */
public String getTypeID(int cp) {
    byte type = getType(cp);
    String id;
    if (type == LB_SUP) {
        id = "SUP";
    } else if (type >= LB_LIMIT) {
        id = hNames[type - LB_LIMIT];
    } else {
        id = Default.ucd().getLineBreakID_fromIndex(type);
    }
    return id;
}
// stuff that subclasses need to override
/**
 * Type code of cp: LB_SUP for anything above 0xFFFF, LB_LIMIT plus the Hangul
 * type for jamo and Hangul syllables, otherwise the UCD line-break value.
 */
public byte getType(int cp) {
    if (cp > 0xFFFF) {
        return LB_SUP;
    }
    byte hangul = getHangulType(cp);
    if (hangul == hNot) {
        return Default.ucd().getLineBreak(cp);
    }
    return (byte)(hangul + LB_LIMIT);
}
/** Number of leading TypeOrder entries that run() iterates over for the test files. */
public int getLimit() {
return LB2_LIMIT;
}
/** Number of leading TypeOrder entries that generateTable() iterates over for the HTML table. */
public int getTableLimit() {
return LB_SUP; // skip last;
}
/**
 * Writes the HTML pair table: one header row and one row per non-skipped type, each
 * cell holding the getTableEntry() symbol for (row type, column type) with the rule
 * as its tooltip. Cells are coloured when the entry differs from the one computed
 * with the opposite 'recommended' setting, or when the entry equals NOBREAK.
 *
 * @param out         destination writer
 * @param recommended which variant of the rules to tabulate
 */
public void generateTable(PrintWriter out, boolean recommended) {
    String width = "width='" + (100 / (getTableLimit() + 1)) + "%'";
    out.print("<table border='1' cellspacing='0'><tr><th " + width + "></th>");

    // header row
    for (int col = 0; col < getTableLimit(); ++col) {
        byte colType = TypeOrder[col];
        if (!skipType(colType)) {
            String heading = getTypeID(samples[TypeOrder[col]]);
            out.print("<th " + width + ">" + heading + "</th>");
        }
    }
    out.print("</tr>");

    // one-element arrays used as out-parameters of getTableEntry()
    String[] ruleHolder = new String[1];
    String[] otherRuleHolder = new String[1];

    for (int row = 0; row < getTableLimit(); ++row) {
        byte rowType = TypeOrder[row];
        if (skipType(rowType)) continue;

        String before = samples[rowType];
        StringBuffer line = new StringBuffer("<tr><th>" + getTypeID(before) + "</th>");

        for (int col = 0; col < getTableLimit(); ++col) {
            byte colType = TypeOrder[col];
            if (skipType(colType)) continue;

            String after = samples[colType];
            String entry = getTableEntry(before, after, recommended, ruleHolder);
            String otherEntry = getTableEntry(before, after, !recommended, otherRuleHolder);

            String background;
            if (!entry.equals(otherEntry)) {
                background = entry.equals(NOBREAK) ? " bgcolor='#CCFFFF'" : " bgcolor='#FFFF00'";
            } else if (entry.equals(NOBREAK)) {
                background = " bgcolor='#CCCCFF'";
            } else {
                background = "";
            }
            line.append("<th title='").append(ruleHolder[0]).append("'")
                .append(background).append(">").append(entry).append("</th>");
        }
        out.println(line + "</tr>");
    }
    out.println("</table>");
}
/**
 * Computes the pair-table symbol for "before" followed by "after".
 * Three positions are probed with isBreak: after an inserted space, before
 * that space, and directly between the two samples.
 * <ul>
 * <li>"_" - a break is reported directly between the samples</li>
 * <li>"%" - no direct break, but a break on at least one side of the space</li>
 * <li>"^" - no break in any of the three probes</li>
 * </ul>
 * Side effect: the "rule" field is left holding the same text that is
 * stored in ruleOut[0].
 *
 * @param before      sample text for the left-hand type
 * @param after       sample text for the right-hand type
 * @param recommended rule variant passed through to isBreak
 * @param ruleOut     one-element array that receives the deciding rule label(s)
 * @return "_", "%" or "^"
 */
public String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
    // Every isBreak call overwrites the rule field, so it is captured after each probe.
    boolean breakAfterSpace = isBreak(before + " " + after, before.length() + 1, recommended);
    String ruleAfterSpace = rule;
    boolean breakBeforeSpace = isBreak(before + " " + after, before.length(), recommended);
    String ruleBeforeSpace = rule;
    boolean directBreak = isBreak(before + after, before.length(), recommended);
    String directRule = rule;

    String symbol;
    if (directBreak) {
        symbol = "_";
    } else if (breakAfterSpace || breakBeforeSpace) {
        symbol = "%";
        rule = directRule;
    } else {
        symbol = "^";
        // list each distinct rule once, separated by slashes
        String combined = ruleAfterSpace.equals(directRule)
            ? directRule
            : ruleAfterSpace + "/" + directRule;
        boolean alreadyListed = ruleBeforeSpace.equals(directRule) || ruleBeforeSpace.equals(ruleAfterSpace);
        if (!alreadyListed) {
            combined += "/" + ruleBeforeSpace;
        }
        rule = combined;
    }
    ruleOut[0] = rule;
    return symbol;
}
// Marker written where a break is reported: U+00F7 DIVISION SIGN.
static final String BREAK = "\u00F7";
// Marker written where no break is reported: U+00D7 MULTIPLICATION SIGN.
static final String NOBREAK = "\u00D7";
/**
 * Prints one test line for source: a break/no-break marker before the first
 * code point and after every code point, with the code points in hex between
 * the markers. When comments is true, a tab-prefixed "#" comment is added
 * that repeats the markers with the deciding rule in brackets and gives each
 * character's name and type ID.
 *
 * @param out         destination writer
 * @param source      text whose break opportunities are listed
 * @param comments    whether to append the explanatory comment
 * @param recommended rule variant passed through to isBreak
 */
public void printLine(PrintWriter out, String source, boolean comments, boolean recommended) {
    StringBuffer data = new StringBuffer();
    StringBuffer remark = new StringBuffer("\t# ");

    // marker for the position before the first character
    String mark = isBreak(source, 0, recommended) ? BREAK : NOBREAK;
    data.append(mark);
    remark.append(' ').append(mark).append(" [").append(rule).append(']');

    int pos = 0;
    while (pos < source.length()) {
        int cp = UTF16.charAt(source, pos);
        int next = pos + UTF16.getCharCount(cp);
        if (data.length() > 0) {
            data.append(' ');
            remark.append(' ');
        }
        data.append(Utility.hex(cp));
        remark.append(Default.ucd().getName(cp) + " (" + getTypeID(cp) + ")");

        // marker for the position after this character; isBreak also refreshes rule
        mark = isBreak(source, next, recommended) ? BREAK : NOBREAK;
        data.append(' ').append(mark);
        remark.append(' ').append(mark).append(" [").append(rule).append(']');
        pos = next;
    }
    if (comments) {
        data.append(remark);
    }
    out.println(data);
}
/**
 * Fills the samples array with one sample string per type and then lists
 * the samples on System.out.
 * The scan starts at U+0001 and keeps the first allocated, non-surrogate
 * code point found for each value returned by getType.
 */
public void findSamples() {
    for (int i = 1; i <= 0x10FFFF; ++i) {
        if (!Default.ucd().isAllocated(i)) continue;
        if (0xD800 <= i && i <= 0xDFFF) continue; // surrogate range
        // (A leftover debugging statement that printed "here" at U+1100 was removed.)
        byte lb = getType(i);
        if (samples[lb] == null) {
            samples[lb] = UTF16.valueOf(i);
        }
    }
    // One line per slot: type ID, a tab, then code and name of the sample.
    // NOTE(review): slots are indexed by i directly, not through TypeOrder[i].
    for (int i = 0; i < TypeOrder.length; ++i) {
        String sample = samples[i];
        System.out.println(getTypeID(sample) + ":\t" + Default.ucd().getCodeAndName(sample));
    }
}
/**
 * Returns the type IDs of all code points in a string, separated by spaces.
 *
 * @param s text to describe; may be null
 * @return "&lt;null&gt;" for a null string, the single type ID for a
 *         one-unit string, otherwise the space-separated list of IDs
 */
public String getTypeID(String s) {
    if (s == null) {
        return "<null>";
    }
    if (s.length() == 1) {
        return getTypeID(s.charAt(0));
    }
    StringBuffer ids = new StringBuffer();
    int pos = 0;
    while (pos < s.length()) {
        int cp = UTF32.char32At(s, pos);
        if (pos > 0) {
            ids.append(" ");
        }
        ids.append(getTypeID(cp));
        pos += UTF32.count16(cp);
    }
    return ids.toString();
}
/**
 * Scans backwards from just before offset for the nearest character whose
 * resolved type differs from notLBType.
 *
 * @param source      text to scan
 * @param offset      UTF-16 index; the scan starts at offset - 1
 * @param notLBType   type to skip over
 * @param recommended rule variant passed through to getResolvedType
 * @return index at which the character was read, or -1 if every character
 *         before offset has type notLBType
 */
public int findLastNon(String source, int offset, byte notLBType, boolean recommended) {
    int pos = offset - 1;
    while (pos >= 0) {
        int cp = UTF16.charAt(source, pos);
        if (getResolvedType(cp, recommended) != notLBType) {
            return pos;
        }
        pos -= UTF16.getCharCount(cp);
    }
    return -1;
}
/**
 * Returns the type of a code point after the LB 1 resolution step.
 * LB 1: Assign a line break category to each character of the input.
 * Resolve AI, CB, SA, SG, XX into other line break classes depending on
 * criteria outside this algorithm.
 * Here AI, SA and XX are all resolved to AL. CB is left as it is, and SG
 * (surrogates) is not handled because it is expected never to occur.
 *
 * @param cp          code point to classify
 * @param recommended when true, any code point with a Hangul type is
 *                    reported as LB_ID
 * @return the resolved type code
 */
public byte getResolvedType (int cp, boolean recommended) {
    byte result = getType(cp);
    switch (result) {
    // Fix: this case used to assign LB_AI back to itself, which left AI unresolved.
    case LB_AI: result = LB_AL; break;
    // LB_CB is deliberately not resolved here.
    case LB_SA: result = LB_AL; break;
    // LB_SG: surrogates; will never occur.
    case LB_XX: result = LB_AL; break;
    }
    if (recommended) {
        if (getHangulType(cp) != hNot) {
            result = LB_ID;
        }
    }
    return result;
}
/**
 * Checks that a UTF-16 offset does not fall between the two halves of a
 * surrogate pair.
 *
 * @param s      text being examined
 * @param offset UTF-16 index to test
 * @return false if offset is outside 0..s.length() or sits between a lead
 *         and a trail surrogate; true otherwise
 */
public boolean onCodepointBoundary(String s, int offset) {
    final int length = s.length();
    if (offset < 0 || offset > length) {
        return false;
    }
    if (offset == 0 || offset == length) {
        return true;
    }
    boolean splitsPair = UTF16.isLeadSurrogate(s.charAt(offset - 1))
        && UTF16.isTrailSurrogate(s.charAt(offset));
    return !splitsPair;
}
/**
 * Finds out whether there is a line break opportunity at offset, by applying
 * the numbered line-break rules in order until one of them decides.
 * WARNING: as a side effect, sets the "rule" field to the label of the rule
 * that was being tested when the answer was found.
 *
 * @param source      text to examine
 * @param offset      UTF-16 index of the candidate break position
 * @param recommended selects the rule variant: when true the GL rule runs
 *                    before rule 12 (label "11b"), breaks around CB are
 *                    allowed before rule 15, and the HY/BB break is applied
 *                    after rule 18 (label "18b"); when false the GL rule is
 *                    "13" and the HY/BB break is "15b"
 * @return true if a break is allowed at offset, false otherwise
 */
public boolean isBreak(String source, int offset, boolean recommended) {
    // LB 1 Assign a line break category to each character of the input.
    // Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
    // This is taken care of in the getResolvedType function.

    // LB 2a Never break at the start of text
    rule="2a";
    if (offset <= 0) return false;

    // LB 2b Always break at the end of text
    rule="2b";
    if (offset >= source.length()) return true;

    // UTF-16: never break in the middle of a code point (rule still reads "2b" here)
    if (!onCodepointBoundary(source, offset)) return false;

    // now get the character before and after, and their types
    int cpBefore = UTF16.charAt(source, offset-1);
    int cpAfter = UTF16.charAt(source, offset);
    byte before = getResolvedType(cpBefore, recommended);
    byte after = getResolvedType(cpAfter, recommended);

    // LB 3a Always break after hard line breaks (but never between CR and LF).
    // CR × LF
    rule="3a";
    if (before == LB_CR && after == LB_LF) return false;
    if (before == LB_BK || before == LB_LF || before == LB_CR) return true;

    // LB 3b Don't break before hard line breaks.
    rule="3b";
    if (after == LB_BK || after == LB_LF || after == LB_CR) return false;

    // LB 4 Don't break before spaces or zero-width space.
    // × SP
    // × ZW
    rule="4";
    if (after == LB_SP || after == LB_ZW) return false;

    // LB 5 Break after zero-width space.
    // ZW ÷
    rule="5";
    if (before == LB_ZW) return true;

    // LB 6 Don't break graphemes (before combining marks, around virama or on sequences of conjoining Jamos).
    rule="6";
    if (after == LB_CM) return false;
    if (before == LB_L && (after == LB_L || after == LB_V || after == LB_LV || after == LB_LVT)) return false;
    if ((before == LB_LV || before == LB_V) && (after == LB_V || after == LB_T)) return false;
    if ((before == LB_LVT || before == LB_T) && (after == LB_T)) return false;

    // A combining mark takes the type of the character it is attached to;
    // a run of marks with nothing in front of it is treated as ID.
    boolean setBase = false;
    if (before == LB_CM) {
        setBase = true;
        int backOffset = findLastNon(source, offset, LB_CM, recommended);
        if (backOffset < 0) {
            before = LB_ID;
        } else {
            before = getResolvedType(UTF16.charAt(source, backOffset), recommended);
        }
    }

    // LB 7 In all of the following rules, if a space is the base character for a combining mark,
    // the space is changed to type ID. In other words, break before SP CM* in the same cases as
    // one would break before an ID.
    rule="7";
    if (setBase && before == LB_SP) before = LB_ID;

    // LB 8 Don't break before ] or ! or ; or /, even after spaces.
    // × CL, × EX, × IS, × SY
    rule="8";
    if (after == LB_CL || after == LB_EX || after == LB_SY || after == LB_IS) return false;

    // find the last non-space character; we will need it for the "SP*" rules below
    byte lastNonSpace = before;
    if (lastNonSpace == LB_SP) {
        // Fix: skip over LB_SP here. The original skipped LB_CM, which always
        // stopped on the space itself and so left lastNonSpace equal to LB_SP.
        int backOffset = findLastNon(source, offset, LB_SP, recommended);
        if (backOffset >= 0) {
            lastNonSpace = getResolvedType(UTF16.charAt(source, backOffset), recommended);
        }
    }

    // LB 9 Don't break after [, even after spaces.
    // OP SP* ×
    rule="9";
    if (lastNonSpace == LB_OP) return false;

    // LB 10 Don't break between a quotation mark and an opening punctuation, even with intervening spaces.
    // QU SP* × OP
    rule="10";
    if (lastNonSpace == LB_QU && after == LB_OP) return false;

    // LB 11 Don't break between a closing punctuation and a non-starter, even with intervening spaces.
    // CL SP* × NS
    rule="11";
    if (lastNonSpace == LB_CL && after == LB_NS) return false;

    // LB 11a Don't break between two B2 characters, even with intervening spaces.
    // B2 SP* × B2
    rule="11a";
    if (lastNonSpace == LB_B2 && after == LB_B2) return false;

    if (recommended) {
        // Recommended variant: the GL rule (LB 13 in the other variant) is applied
        // here, before spaces are handled, and is reported as "11b".
        // × GL
        // GL ×
        rule="11b";
        if (after == LB_GL || before == LB_GL) return false;
    }

    // [Note: by this time, all of the "X" in the table are accounted for. We can safely break after spaces.]
    // LB 12 Break after spaces
    // SP ÷
    rule="12";
    if (before == LB_SP) return true;

    if (!recommended) {
        // LB 13 Don't break before or after NBSP or WORD JOINER
        // × GL
        // GL ×
        rule="13";
        if (after == LB_GL || before == LB_GL) return false;
    }

    // LB 14 Don't break before or after quotation marks
    // × QU
    // QU ×
    rule="14";
    if (before == LB_QU || after == LB_QU) return false;

    if (recommended) {
        // LB 14a Break before and after CB
        // CB ÷
        // ÷ CB
        // NOTE(review): rule is not updated here, so these breaks are reported as "14".
        if (before == LB_CB || after == LB_CB) return true;
    }

    // LB 15 Don't break before hyphen-minus, other hyphens, fixed-width spaces,
    // small kana and other non-starters, or after acute accents:
    // × BA
    // × HY
    // × NS
    // BB ×
    rule="15";
    if (after == LB_NS) return false;
    if (after == LB_HY) return false;
    if (after == LB_BA) return false;
    if (before == LB_BB) return false;

    if (!recommended) {
        // LB 15b Break after hyphen-minus, and before acute accents:
        // HY ÷
        // ÷ BB
        rule="15b";
        if (before == LB_HY) return true;
        if (after == LB_BB) return true;
    }

    // LB 16 Don't break between two ellipses, or between letters or numbers and ellipsis:
    // AL × IN
    // ID × IN
    // IN × IN
    // NU × IN
    // Examples: 9..., a..., H...
    rule="16";
    if ((before == LB_NU || before == LB_AL || before == LB_ID) && after == LB_IN) return false;
    if (before == LB_IN && after == LB_IN) return false;

    // Don't break alphanumerics.
    // LB 17 Don't break within a9, 3a, or H%
    // ID × PO
    // AL × NU
    // NU × AL
    // Numbers are of the form PR ? ( OP | HY ) ? NU (NU | IS) * CL ? PO ?
    // Examples: $(12.35) 2,1234 (12)¢ 12.54¢
    // This is approximated with the following rules. (Some cases already handled above,
    // like 9,, [9.)
    rule="17";
    if (before == LB_ID && after == LB_PO) return false;
    if (before == LB_AL && after == LB_NU) return false;
    if (before == LB_NU && after == LB_AL) return false;

    // LB 18 Don't break between the following pairs of classes.
    // CL × PO
    // HY × NU
    // IS × NU
    // NU × NU
    // NU × PO
    // PR × AL
    // PR × HY
    // PR × ID
    // PR × NU
    // PR × OP
    // SY × NU
    // Example pairs: $9, $[, $-, -9, /9, 99, ,9, 9% ]%
    rule="18";
    if (before == LB_CL && after == LB_PO) return false;
    if (before == LB_HY && after == LB_NU) return false;
    if (before == LB_IS && after == LB_NU) return false;
    if (before == LB_NU && after == LB_NU) return false;
    if (before == LB_NU && after == LB_PO) return false;
    if (before == LB_PR && after == LB_AL) return false;
    if (before == LB_PR && after == LB_HY) return false;
    if (before == LB_PR && after == LB_ID) return false;
    if (before == LB_PR && after == LB_NU) return false;
    if (before == LB_PR && after == LB_OP) return false;
    if (before == LB_SY && after == LB_NU) return false;

    if (recommended) {
        // Recommended variant: the "break after hyphen-minus, and before acute
        // accents" rule (15b in the other variant) is applied here as "18b".
        // HY ÷
        // ÷ BB
        rule="18b";
        if (before == LB_HY) return true;
        if (after == LB_BB) return true;
    }

    // LB 19 Don't break between alphabetics
    // AL × AL
    rule="19";
    if (before == LB_AL && after == LB_AL) return false;

    // LB 20 Break everywhere else
    // ALL ÷
    // ÷ ALL
    rule="20";
    return true;
}
/**
 * Break-test generator that reuses the GenerateLineBreakTest machinery with
 * its own set of types (CR, LF, Control, Extend, Link, CGJ, Base, LetterBase,
 * Other, followed by the Hangul types) and its own break rules. Its file
 * name is "Word".
 */
static class GenerateWordBreakTest extends GenerateLineBreakTest {

    // Type codes. The Hangul types are placed directly after the "ordinary" ones.
    static final byte CR = 0;
    static final byte LF = 1;
    static final byte Control = 2;
    static final byte Extend = 3;
    static final byte Link = 4;
    static final byte CGJ = 5;
    static final byte Base = 6;
    static final byte LetterBase = 7;
    static final byte Other = 8;
    static final byte oLIMIT = 9; // RESET THIS IF LIST ABOVE CHANGES!
    static final byte L = oLIMIT + hL;
    static final byte V = oLIMIT + hV;
    static final byte T = oLIMIT + hT;
    static final byte LV = oLIMIT + hLV;
    static final byte LVT = oLIMIT + hLVT;
    static final byte LIMIT = LVT + 1;

    // Display names for the type codes below oLIMIT, indexed by type code.
    static final String[] Names = {"CR", "LF", "CTL", "Extend", "Link", "CGJ", "Base", "LetterBase", "Other" };

    static UCDProperty extendProp = UnifiedBinaryProperty.make(DERIVED | GraphemeExtend);
    static UCDProperty baseProp = UnifiedBinaryProperty.make(DERIVED | GraphemeBase);
    static UCDProperty linkProp = UnifiedBinaryProperty.make(BINARY_PROPERTIES | GraphemeLink);

    // Instance setup: output name, and a type order that is simply 0..LIMIT-1.
    {
        fileName = "Word";
        byte[] order = new byte[LIMIT];
        for (int index = 0; index < order.length; ++index) {
            order[index] = (byte) index;
        }
        TypeOrder = order;
    }

    /** No type is left out of the table. */
    boolean skipType(byte type) {
        return false;
    }

    /** @return LIMIT, the number of type codes of this generator */
    public int getLimit() {
        return LIMIT;
    }

    /** @return LIMIT, so the table covers every type */
    public int getTableLimit() {
        return LIMIT;
    }

    /**
     * Produces the test strings for a pair of samples: just their concatenation.
     *
     * @param results array whose first slot receives the test string
     * @return 1, the number of slots filled
     */
    public int genTestItems(String before, String after, String[] results) {
        results[0] = before + after;
        return 1;
    }

    /**
     * Returns BREAK or NOBREAK for the position between the two samples and
     * stores the deciding rule label in ruleOut[0].
     */
    public String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
        final boolean breaks = isBreak(before + after, before.length(), recommended);
        ruleOut[0] = rule;
        if (breaks) {
            return BREAK;
        }
        return NOBREAK;
    }

    /**
     * Returns the display name of a code point's type: an entry of Names for
     * the ordinary types, an entry of hNames for the Hangul types.
     */
    public String getTypeID(int cp) {
        final byte type = getType(cp);
        return (type < oLIMIT) ? Names[type] : hNames[type - oLIMIT];
    }

    /**
     * Classifies a code point. Individually listed characters are checked
     * first, then Hangul, then the general category, then binary properties.
     */
    public byte getType(int cp) {
        // single characters
        switch (cp) {
        case 0xA:
            return LF;
        case 0xD:
            return CR;
        case 0x034F:
            return CGJ;
        case 0x2028:
        case 0x2029:
            return Control;
        }

        // Hangul
        final byte hangul = getHangulType(cp);
        if (hangul != hNot) {
            return (byte)(hangul + oLIMIT);
        }

        // category based
        final byte cat = Default.ucd().getCategory(cp);
        if (cat == Cc) {
            return Control;
        }
        if (cat == Cf) {
            return Extend;
        }
        if (((1<<cat) & LETTER_MASK) != 0) {
            return LetterBase;
        }

        // other binary properties
        if (linkProp.hasValue(cp)) {
            return Link;
        }
        if (extendProp.hasValue(cp)) {
            return Extend;
        }
        if (baseProp.hasValue(cp)) {
            return Base;
        }
        return Other;
    }

    /** This generator has no resolution step: the resolved type is the plain type. */
    public byte getResolvedType(int cp, boolean recommended) {
        return getType(cp);
    }

    /**
     * Decides whether there is a break at offset under this generator's rules.
     * As a side effect the "rule" field is set to the label of the rule that
     * was being tested when the answer was found.
     *
     * @param source      text to examine
     * @param offset      UTF-16 index of the candidate break position
     * @param recommended when true, Link and CGJ after the position simply
     *                    prevent a break; when false, rules 10 to 13 apply
     * @return true if there is a break at offset
     */
    public boolean isBreak(String source, int offset, boolean recommended) {
        rule="1";
        if (offset < 0 || offset > source.length()) {
            return false;
        }
        if (offset == 0) {
            return true;
        }

        rule = "2";
        if (offset == source.length()) {
            return true;
        }

        // UTF-16: never break in the middle of a code point
        if (!onCodepointBoundary(source, offset)) {
            return false;
        }

        // types of the characters on either side of the position
        final byte before = getResolvedType(UTF16.charAt(source, offset-1), recommended);
        final byte after = getResolvedType(UTF16.charAt(source, offset), recommended);

        rule = "3";
        if (before == CR && after == LF) {
            return false;
        }

        rule = "4";
        boolean controlBefore = before == CR || before == LF || before == Control;
        boolean controlAfter = after == Control || after == LF || after == CR;
        if (controlBefore || controlAfter) {
            return true;
        }

        // Hangul syllable sequences
        rule = "6";
        if (before == L && (after == L || after == V || after == LV || after == LVT)) {
            return false;
        }
        rule = "7";
        if ((before == LV || before == V) && (after == V || after == T)) {
            return false;
        }
        rule = "8";
        if ((before == LVT || before == T) && (after == T)) {
            return false;
        }

        rule = "9";
        if (after == Extend) {
            return false;
        }

        if (recommended) {
            // rule still reads "9" for these two cases
            if (after == Link || after == CGJ) {
                return false;
            }
        } else {
            final boolean letterAfter = after == LetterBase || after == L || after == V
                || after == T || after == LV || after == LVT;

            // Do not break around a CGJ.
            rule = "10";
            if (before == CGJ && (after == Base || letterAfter)) {
                return false;
            }
            rule = "11";
            if (after == CGJ) {
                return false;
            }

            // Do not break between linking characters and letters, or before linking characters.
            // This provides for Indic graphemes, where virama (halant) will link character clusters together.
            // Link Extend* × LetterBase (12)
            rule = "12";
            if (letterAfter) {
                int backOffset = findLastNon(source, offset, Extend, recommended);
                if (backOffset >= 0
                        && getResolvedType(UTF16.charAt(source, backOffset), recommended) == Link) {
                    return false;
                }
            }

            rule = "13";
            if (after == Link) {
                return false;
            }
        }

        // Otherwise break after all characters.
        rule = "14";
        return true;
    }
}
}

View File

@ -1,125 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateNamedSequences.java,v $
* $Date: 2006/04/05 22:12:45 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import com.ibm.text.utility.*;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import java.util.*;
import java.io.*;
/**
 * Generates the NamedSequences HTML page: reads the NamedSequences data file
 * of the current UCD version, builds an HTML table from it and merges the
 * table into NamedSequences-Template.html.
 */
public final class GenerateNamedSequences implements UCD_Types {
    static final boolean DEBUG = false;

    /**
     * Builds an &lt;img&gt; tag that points at the unicode.org "varglyph" script
     * for a two-code sequence.
     *
     * @param code0       hex of the first code point
     * @param code1       hex of the second code point
     * @param shape       shaping context name, or "" for none; at most its
     *                    first four characters are used in the URL
     * @param description description text; if it contains "feminine" and a
     *                    shape is given, "fem" is added to the URL suffix
     * @return the HTML for the image
     */
    static public String showVarGlyphs(String code0, String code1, String shape, String description) {
        if (DEBUG) System.out.println(code0 + ", " + code1 + ", [" + shape + "]");
        String abbShape = "";
        if (shape.length() != 0) {
            // guard: a shape shorter than four characters used to throw
            abbShape = '-' + shape.substring(0, Math.min(4, shape.length()));
            if (description.indexOf("feminine") >= 0) abbShape += "fem";
        }
        return "<img alt='U+" + code0 + "+U+" + code1 + "/" + shape
            + "' src='http://www.unicode.org/cgi-bin/varglyph?24-" +code0 + "-" + code1 + abbShape + "'>";
    }

    /**
     * Reads the data file and writes the HTML page.
     * Each data line is split on ';' into
     * <pre>
     * Field 0: the name of the sequence
     * Field 1: the code points of the sequence, in hex, separated by spaces
     * </pre>
     * Every line also echoes its hex sequence and the transliterated
     * characters to System.out.
     *
     * @throws IOException if reading the data or writing the page fails
     */
    static public void generate() throws IOException {
        // read the data and compose the table
        StringBuffer table = new StringBuffer(
            "<table><tr><th width='10%'>Rep Glyph</th><th>Hex Sequence</th><th>Name</th><th>Copyable</th></tr>");
        String[] splits = new String[4];
        String[] codes = new String[20];

        BufferedReader in = Utility.openUnicodeFile("NamedSequences", Default.ucdVersion(), true, Utility.LATIN1);
        try {
            Transliterator unicodexml = Transliterator.getInstance("hex/xml");
            while (true) {
                String line = Utility.readDataLine(in);
                if (line == null) break;
                line = line.trim();
                if (line.length() == 0) continue;

                Utility.split(line, ';', splits);
                String name = splits[0];
                int codeCount = Utility.split(splits[1], ' ', codes);
                StringBuffer codeBuffer = new StringBuffer();
                for (int i = 0; i < codeCount; ++i) {
                    UTF16.append(codeBuffer, Integer.parseInt(codes[i],16));
                }
                String codeWithHyphens = splits[1].replaceAll("\\s", "-");
                String codeAlt = "U+" + splits[1].replaceAll("\\s", " U+");
                String codeString = unicodexml.transliterate(codeBuffer.toString());

                // Sequences whose hex text sorts in ["1780", "1800") take their
                // image from the tr28 report instead of the local images folder.
                String imageName = "images/U" + codeWithHyphens + ".gif";
                if (splits[1].compareTo("1780") >= 0 && splits[1].compareTo("1800") < 0) {
                    String codeNoSpaces2 = splits[1].replaceAll("\\s", "");
                    imageName = "http://www.unicode.org/reports/tr28/images/" + codeNoSpaces2 + ".gif";
                }

                // Fix: the row used to contain a stray "</td>" before the name cell.
                table.append("<tr>")
                     .append("<td class='copy'><img alt='(").append(codeAlt).append(")' src='")
                     .append(imageName).append("'><br><tt>").append(splits[1]).append("</tt></td>")
                     .append("<td>").append(splits[1]).append("</td>")
                     .append("<td>").append(name).append("</td>")
                     .append("<td class='copy'>").append(codeString).append("</td>")
                     .append("</tr>\n");
                System.out.println(splits[1] + "\t" + codeString);
            }
        } finally {
            in.close(); // also release the reader when a line fails to parse
        }
        table.append("</table>");

        // now write out the results
        String directory = "DerivedData/";
        String filename = directory + "NamedSequences" + UnicodeDataFile.getHTMLFileSuffix(true);
        PrintWriter out = Utility.openPrintWriter(filename, Utility.LATIN1_UNIX);
        try {
            // placeholder/value pairs substituted into the template
            String[] replacementList = {
                "@revision@", Default.ucd().getVersion(),
                "@date@", Default.getDate(),
                "@table@", table.toString()};
            Utility.appendFile("com/ibm/text/UCD/NamedSequences-Template.html", Utility.UTF8, out, replacementList);
        } finally {
            out.close();
        }
    }
}

View File

@ -1,136 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateStandardizedVariants.java,v $
* $Date: 2006/04/05 22:12:44 $
* $Revision: 1.7 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import java.util.*;
import java.io.*;
/**
 * Generates the StandardizedVariants HTML page: reads the
 * StandardizedVariants data file of the current UCD version, builds an HTML
 * table from it and merges the table into StandardizedVariants-Template.html.
 */
public final class GenerateStandardizedVariants implements UCD_Types {
    static final boolean DEBUG = false;

    /**
     * Builds an &lt;img&gt; tag that points at the unicode.org "varglyph" script
     * for a variation sequence.
     *
     * @param code0       hex of the base code point
     * @param code1       hex of the variation selector
     * @param shape       shaping context name, or "" for none; at most its
     *                    first four characters are used in the URL
     * @param description description text; if it contains "feminine" and a
     *                    shape is given, "fem" is added to the URL suffix
     * @return the HTML for the image
     */
    static public String showVarGlyphs(String code0, String code1, String shape, String description) {
        if (DEBUG) System.out.println(code0 + ", " + code1 + ", [" + shape + "]");
        String abbShape = "";
        if (shape.length() != 0) {
            // guard: a shape shorter than four characters used to throw
            abbShape = '-' + shape.substring(0, Math.min(4, shape.length()));
            if (description.indexOf("feminine") >= 0) abbShape += "fem";
        }
        return "<img alt='U+" + code0 + "+U+" + code1 + "/" + shape
            + "' src='http://www.unicode.org/cgi-bin/varglyph?24-" +code0 + "-" + code1 + abbShape + "'>";
    }

    /**
     * Compares two dotted version strings field by field as numbers, so that
     * "10.0.0" sorts after "4.1.0". Missing fields count as 0. If any field
     * is not a number, falls back to plain String comparison.
     *
     * @return a negative value, zero or a positive value, as for compareTo
     */
    private static int compareVersions(String left, String right) {
        String[] leftFields = left.split("\\.");
        String[] rightFields = right.split("\\.");
        try {
            int fieldCount = Math.max(leftFields.length, rightFields.length);
            for (int i = 0; i < fieldCount; ++i) {
                int l = i < leftFields.length ? Integer.parseInt(leftFields[i]) : 0;
                int r = i < rightFields.length ? Integer.parseInt(rightFields[i]) : 0;
                if (l != r) return l < r ? -1 : 1;
            }
            return 0;
        } catch (NumberFormatException e) {
            return left.compareTo(right); // unexpected format: keep the old ordering
        }
    }

    /**
     * Reads the data file and writes the HTML page.
     * Each data line is split on ';' into
     * <pre>
     * Field 0: the variation sequence
     * Field 1: the description of the desired appearance
     * Field 2: where the appearance is only different in particular shaping environments
     *   this field lists them. The possible values are: isolated, initial, medial, final.
     *   If more than one is present, there are spaces between them.
     * </pre>
     *
     * @throws IOException if reading the data or writing the page fails
     */
    static public void generate() throws IOException {
        // read the data and compose the table
        StringBuffer table = new StringBuffer(
            "<table><tr><th>Rep Glyph</th><th>Character Sequence</th><th>Context</th><th width='10%'>Alt Glyph</th><th>Description of variant appearance</th></tr>");
        String[] splits = new String[4];
        String[] codes = new String[2];
        String[] shapes = new String[4];

        BufferedReader in = Utility.openUnicodeFile("StandardizedVariants", Default.ucdVersion(), true, Utility.LATIN1);
        try {
            while (true) {
                String line = Utility.readDataLine(in);
                if (line == null) break;
                if (line.length() == 0) continue;

                Utility.split(line, ';', splits);
                Utility.split(splits[0], ' ', codes);
                int code = Utility.codePointFromHex(codes[0]);

                // reference glyph and the sequence itself
                table.append("<tr><td><img alt='U+").append(codes[0])
                     .append("' src='http://www.unicode.org/cgi-bin/refglyph?24-").append(codes[0])
                     .append("'></td>\n");
                table.append("<td>").append(splits[0]).append("</td>\n");

                // shaping contexts, one per line; "all" is shown as empty
                String shape = splits[2].trim();
                if (shape.equals("all")) shape = "";
                table.append("<td>").append(Utility.replace(shape, " ", "<br>")).append("</td>\n");

                // variant glyph(s), e.g.
                // http://www.unicode.org/cgi-bin/varglyph?24-1820-180B-fina
                // http://www.unicode.org/cgi-bin/varglyph?24-222A-FE00
                table.append("<td>");
                if (shape.length() == 0) {
                    table.append(showVarGlyphs(codes[0], codes[1], "", ""));
                } else {
                    int shapeCount = Utility.split(shape, ' ', shapes);
                    for (int i = 0; i < shapeCount; ++i) {
                        if (i != 0) table.append(" ");
                        table.append(showVarGlyphs(codes[0], codes[1], shapes[i], splits[1]));
                    }
                }
                table.append("</td>\n");
                table.append("<td>").append(Default.ucd().getName(code)).append(" ").append(splits[1]).append("</td>\n");
                table.append("</tr>");
            }
        } finally {
            in.close(); // also release the reader when a line fails to parse
        }
        table.append("</table>");

        // now write out the results
        String directory = "DerivedData/";
        String filename = directory + "StandardizedVariants" + UnicodeDataFile.getHTMLFileSuffix(true);
        PrintWriter out = Utility.openPrintWriter(filename, Utility.LATIN1_UNIX);
        try {
            String version = Default.ucd().getVersion();
            int lastDot = version.lastIndexOf('.');
            String updateDirectory;
            String partialFilename;
            // Fix: the "before 4.1.0" test is numeric; a String comparison
            // would put versions 10 and above before 4.1.0.
            if (compareVersions(version, "4.1.0") < 0) {
                updateDirectory = version.substring(0,lastDot) + "-Update";
                int updateV = version.charAt(version.length()-1) - '0';
                // NOTE(review): this appends the character '1' + updateV, i.e. one
                // more than the update digit - confirm that is the intended suffix.
                if (updateV != 0) updateDirectory += (char)('1' + updateV);
                if (DEBUG) System.out.println("updateDirectory: " + updateDirectory);
                partialFilename = "StandardizedVariants-" + Default.ucd().getVersion();
            } else if (version.equals("4.1.0")) {
                updateDirectory = version.substring(0,lastDot) + "/ucd";
                partialFilename = "StandardizedVariants";
            } else {
                updateDirectory = version + "/ucd";
                partialFilename = "StandardizedVariants";
            }

            // placeholder/value pairs substituted into the template
            String[] replacementList = {
                "@revision@", Default.ucd().getVersion(),
                "@updateDirectory@", updateDirectory,
                "@filename@", partialFilename,
                "@date@", Default.getDate(),
                "@table@", table.toString()};
            Utility.appendFile("com/ibm/text/UCD/StandardizedVariants-Template.html", Utility.UTF8, out, replacementList);
        } finally {
            out.close();
        }
    }
}

View File

@ -1,516 +0,0 @@
/*
* Created on May 3, 2005
* Copyright (C) 2004-2005, Unicode, Inc., International Business Machines Corporation, and others.
* For terms of use, see http://www.unicode.org/terms_of_use.html
*/
package com.ibm.text.UCD;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.TransliteratorUtilities;
import com.ibm.icu.dev.test.util.UnicodeLabel;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.dev.test.util.UnicodeMap.Composer;
import com.ibm.icu.impl.CollectionUtilities;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.IDNA;
import com.ibm.icu.text.StringPrepParseException;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.text.UTF16.StringComparator;
import com.ibm.icu.util.ULocale;
import com.ibm.text.UCD.GenerateHanTransliterator.MultiComparator;
import com.ibm.text.UCD.TestData.RegexMatcher;
import com.ibm.text.utility.Utility;
class GenerateStringPrep implements UCD_Types {
/**
 * Command-line entry point: runs the generator once and prints "Done".
 *
 * @param args ignored
 * @throws IOException if genStringPrep fails to read or write a file
 */
public static void main (String[] args) throws IOException {
    //checkChars(false);
    GenerateStringPrep generator = new GenerateStringPrep();
    generator.genStringPrep();
    System.out.println("Done");
}
// Characters collected per script; genStringPrep indexes this by script code.
UnicodeSet[] coreChars = new UnicodeSet[100];
// Code points that are not NFD-normalized (filled by genStringPrep).
UnicodeSet decomposable = new UnicodeSet();
// Values to code points; genStringPrep lists it under "FOR REVIEW".
UnicodeMap suspect = new UnicodeMap();
// Property sources: one made with "" and one made with "3.2.0".
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
ToolUnicodePropertySource ups32 = ToolUnicodePropertySource.make("3.2.0");
//UnicodeSet id_continue = ups.getSet("ID_Continue=true");
UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
// Additional characters treated as word characters; built by the initializer below.
UnicodeSet wordChars = new UnicodeSet();
{
    if (false) {
        // disabled: modifier letters with gc=Sk
        wordChars.addAll(ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher()));
        wordChars.retainAll(ups.getSet("gc=Sk"));
    }
    wordChars.addAll(new UnicodeSet("[\\u0027 \\u002D \\u002E \\u003A \\u00B7 \\u058A \\u05F3" +
        " \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0" +
        " \\u055A \\u02B9 \\u02BA]"));
    //wordChars.removeAll(xid_continue);
}
// NOTE(review): the next two lines modify the set returned by ups.getSet in
// place; confirm that getSet hands out a fresh set on every call.
UnicodeSet patternProp = ups.getSet("Pattern_Syntax=true").removeAll(wordChars);
UnicodeSet isNFKC = ups.getSet("NFKC_Quickcheck=NO").complement();
// gc=Me and gc=Mn, minus Default_Ignorable_Code_Point.
UnicodeSet non_spacing = new UnicodeSet(ups.getSet("gc=Me"))
    .addAll(ups.getSet("gc=Mn"))
    .removeAll(ups.getSet("Default_Ignorable_Code_Point=true"));
// Everything outside XID_Continue, minus the word characters.
UnicodeSet not_xid_continue = new UnicodeSet(xid_continue).complement().removeAll(wordChars);
//UnicodeSet[] decompChars = new UnicodeSet[100];
UCD ucd = Default.ucd();
static Collator uca0 = Collator.getInstance(ULocale.ENGLISH);
// Fix: this was an instance initializer, so the shared static collator was
// only configured once an instance had been constructed, and again for every
// further instance. A static initializer configures it once, at class load.
static {
    uca0.setStrength(Collator.IDENTICAL);
}
// Compares with uca0 first, then with a UTF16.StringComparator.
static GenerateHanTransliterator.MultiComparator uca
    = new GenerateHanTransliterator.MultiComparator(new Comparator[] {
        uca0, new UTF16.StringComparator()});
UnicodeSet bidiR = new UnicodeSet(
    "[[:Bidi_Class=AL:][:Bidi_Class=R:]]");
UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]");
// Code points equal to their own full uppercase mapping (filled by genStringPrep).
UnicodeSet hasNoUpper = new UnicodeSet();
// hasNoUpper without the word characters (set by genStringPrep).
UnicodeSet hasNoUpperMinus = new UnicodeSet();
BagFormatter bf = new BagFormatter();
UnicodeSet inIDN = new UnicodeSet();
// Code points equal to their own full case folding (filled by genStringPrep).
UnicodeSet isCaseFolded = new UnicodeSet();
/**
 * Classifies every code point and writes the report files idn-chars.html,
 * script-chars.html, idn-chars.txt and idn_vs_cfnfkcid.txt into GEN_DIR.
 * The statements depend on the current state of the shared formatter bf and
 * of the open writers, so their order matters.
 *
 * @throws IOException if a file cannot be opened, read or written
 */
void genStringPrep() throws IOException {
//showScriptToBlock();
bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
bf.setUnicodePropertyFactory(ups);
//bf.setValueSource(UnicodeLabel.NULL);
// disabled debugging output
if (false) {
System.out.println("word chars: " + bf.showSetNames(wordChars));
System.out.println("pat: " + bf.showSetNames(patternProp));
System.out.println("xid: " + bf.showSetNames(not_xid_continue));
}
// Pass over all code points, skipping those whose category is Cn, Co or Cs,
// and sort each one into the sets used by the reports below.
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
Utility.dot(cp);
int cat = Default.ucd().getCategory(cp);
if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
if (!Default.nfd().isNormalized(cp)) decomposable.add(cp);
// get IDNA
int idnaType = getIDNAType(cp);
idnaTypeSet[idnaType].add(cp);
String str = UTF16.valueOf(cp);
// unchanged by full uppercasing / full case folding
if (str.equals(ucd.getCase(str, FULL, UPPER))) hasNoUpper.add(cp);
if (str.equals(ucd.getCase(str, FULL, FOLD))) isCaseFolded.add(cp);
// scripts
int script = ucd.getScript(cp);
if (coreChars[script] == null)
coreChars[script] = new UnicodeSet();
coreChars[script].add(cp);
}
// fix characters with no uppercase
hasNoUpperMinus = new UnicodeSet(hasNoUpper).removeAll(wordChars);
System.out.println(bf.showSetNames(hasNoUpper));
Utility.fixDot();
PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html");
PrintWriter htmlOut2 = BagFormatter.openUTF8Writer(GEN_DIR, "script-chars.html");
PrintWriter textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.txt");
// the text file starts with U+FEFF on a line of its own
textOut.println('\uFEFF');
textOut.println("For documentation, see idn-chars.html");
// the HTML header comes from a template file with %date% substituted
Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut,
new String[] {"%date%", Default.getDate()});
/*
out
.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
out.println("<title>IDN Characters</title><style>");
out.println("<!--");
out.println(".script { font-size: 150%; background-color: #CCCCCC }");
out.println(".Atomic { background-color: #CCCCFF }");
out.println(".Atomic-no-uppercase { background-color: #CCFFCC }");
out.println(".Non-XID { background-color: #FFCCCC }");
out.println(".Decomposable { background-color: #FFFFCC }");
out.println(".Pattern_Syntax { background-color: #FFCCFF }");
out.println("th { text-align: left }");
out.println("-->");
out.println("</style></head><body><table>");
*/
htmlOut.println("<table border='1' cellpadding='2' cellspacing='0'>");
htmlOut2.println("<html><body><table border='1' cellpadding='2' cellspacing='0'>");
// All scripts except Common and Inherited first; those two follow at the end.
for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) {
if (scriptCode == COMMON_SCRIPT
|| scriptCode == INHERITED_SCRIPT)
continue;
showCodes(htmlOut, textOut, scriptCode, htmlOut2);
}
showCodes(htmlOut, textOut, COMMON_SCRIPT, htmlOut2);
showCodes(htmlOut, textOut, INHERITED_SCRIPT, htmlOut2);
showCodes(htmlOut, textOut, non_spacing);
htmlOut.println("</table></body></html>");
htmlOut.close();
htmlOut2.println("</table></body></html>");
htmlOut2.close();
// Remaining sections of idn-chars.txt: word characters, then one list per
// distinct value found in suspect.
bf.setMergeRanges(false);
textOut.println();
textOut.println("# *** ADDITIONAL WORD CHARACTERS ***");
textOut.println();
bf.setValueSource("word-chars");
bf.showSetNames(textOut, wordChars);
textOut.println();
textOut.println("# *** FOR REVIEW ***");
bf.setLabelSource(UnicodeLabel.NULL);
for (Iterator it = new TreeSet(suspect.getAvailableValues()).iterator(); it.hasNext();) {
textOut.println();
String value = (String)it.next();
bf.setValueSource(value);
bf.showSetNames(textOut, suspect.getSet(value));
}
textOut.close();
// Second text file, written with a fresh formatter: differences between the
// set CF_NFKC_ID built below and idnaTypeSet[OK].
textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn_vs_cfnfkcid.txt");
bf = new BagFormatter();
bf.setUnicodePropertyFactory(ups);
textOut.println();
textOut.println("# *** Comparison of IDN with CF_NFKC_ID (case-folded, NFKC, XID), U3.2 only ***");
// complement of gc=cn taken from the "3.2.0" property source
UnicodeSet U32 = ups32.getSet("gc=cn").complement();
UnicodeSet CF_NFKC_ID = new UnicodeSet(xid_continue).retainAll(isNFKC).retainAll(isCaseFolded).retainAll(U32);
bf.showSetDifferences(textOut, "CF_NFKC_ID", CF_NFKC_ID, "IDN", idnaTypeSet[OK]);
textOut.close();
}
/**
 * Debugging aid: prints every distinct "script TAB block" combination, one
 * per line, and then always throws IllegalArgumentException to stop the run.
 *
 * @throws IllegalArgumentException always, after the list has been printed
 */
private void showScriptToBlock() {
    UnicodeMap byScript = ToolUnicodePropertySource.make("").getProperty("script").getUnicodeMap();
    UnicodeMap byBlock = ToolUnicodePropertySource.make("").getProperty("block").getUnicodeMap();

    // joins the script value and the block value with a tab
    UnicodeMap.Composer joinWithTab = new UnicodeMap.Composer() {
        public Object compose(int codePoint, Object a, Object b) {
            return a + "\t" + b;
        }
    };

    // work on a thawed copy of the script map
    UnicodeMap scriptCopy = (UnicodeMap) byScript.cloneAsThawed();
    UnicodeMap scriptAndBlock = scriptCopy.composeWith(byBlock, joinWithTab);

    Iterator values = scriptAndBlock.getAvailableValues(new TreeSet()).iterator();
    while (values.hasNext()) {
        System.out.println(values.next());
    }
    throw new IllegalArgumentException();
}
// Lookup from script name to the image file shown next to that script's
// heading; showCodes reads it with scriptToGif.get(script).
Map scriptToGif = CollectionUtilities.asMap(script_to_gif);
// {script name, image file name} pairs. The trailing comment on each row
// names a Unicode block (presumably the block the image was drawn for).
// NOTE(review): "Han" is listed twice; which image ends up in scriptToGif
// depends on how CollectionUtilities.asMap treats duplicate keys - confirm.
static String[][] script_to_gif = {
{"Common","common.gif"}, //Miscellaneous_Symbols
{"Inherited","combiningdiacritics.gif"}, //Combining_Diacritical_Marks
{"Arabic","arabic.gif"}, //Arabic
{"Armenian","armenian.gif"}, //Armenian
{"Bengali","bengali.gif"}, //Bengali
{"Bopomofo","bopomofo.gif"}, //Bopomofo
{"Braille","braillesymbols.gif"}, //Braille_Patterns
{"Buginese","buginese.gif"}, //Buginese
{"Buhid","buhid.gif"}, //Buhid
{"Canadian_Aboriginal","canadiansyllabics.gif"}, //Unified_Canadian_Aboriginal_Syllabics
{"Cherokee","cherokee.gif"}, //Cherokee
{"Coptic","coptic.gif"}, //Coptic
{"Cypriot","cypriot.gif"}, //Cypriot_Syllabary
{"Cyrillic","cyrillic.gif"}, //Cyrillic
{"Deseret","deseret.gif"}, //Deseret
{"Devanagari","devanagari.gif"}, //Devanagari
{"Ethiopic","ethiopic.gif"}, //Ethiopic
{"Georgian","georgian.gif"}, //Georgian
{"Glagolitic","glagolitic.gif"}, //Glagolitic
{"Gothic","gothic.gif"}, //Gothic
{"Greek","greek.gif"}, //Greek_and_Coptic
{"Gujarati","gujarati.gif"}, //Gujarati
{"Gurmukhi","gurmukhi.gif"}, //Gurmukhi
{"Han","cjkideographcompat.gif"}, //CJK_Compatibility_Ideographs
{"Han","kangxiradicals.gif"}, //Kangxi_Radicals
{"Hangul","hangulsyllables.gif"}, //Hangul_Syllables
{"Hanunoo","hanunoo.gif"}, //Hanunoo
{"Hebrew","hebrew.gif"}, //Hebrew
{"Hiragana","hiragana.gif"}, //Hiragana
{"Kannada","kannada.gif"}, //Kannada
{"Katakana","katakana.gif"}, //Katakana
{"Kharoshthi","kharoshthi.gif"}, //Kharoshthi
{"Khmer","khmer.gif"}, //Khmer
{"Lao","lao.gif"}, //Lao
{"Latin","latin.gif"}, //Basic_Latin
{"Limbu","limbu.gif"}, //Limbu
{"Linear_B","linearbsyllabary.gif"}, //Linear_B_Syllabary
{"Malayalam","malayalam.gif"}, //Malayalam
{"Mongolian","mongolian.gif"}, //Mongolian
{"Myanmar","myanmar.gif"}, //Myanmar
{"New_Tai_Lue","newtailu.gif"}, //New_Tai_Lue
{"Ogham","ogham.gif"}, //Ogham
{"Old_Italic","olditalic.gif"}, //Old_Italic
{"Old_Persian","oldpersiancuneiform.gif"}, //Old_Persian
{"Oriya","oriya.gif"}, //Oriya
{"Osmanya","osmanya.gif"}, //Osmanya
{"Runic","runic.gif"}, //Runic
{"Shavian","shavian.gif"}, //Shavian
{"Sinhala","sinhala.gif"}, //Sinhala
{"Syloti_Nagri","silotinagri.gif"}, //Syloti_Nagri
{"Syriac","syriac.gif"}, //Syriac
{"Tagalog","tagalog.gif"}, //Tagalog
{"Tagbanwa","tagbanwa.gif"}, //Tagbanwa
{"Tai_Le","taile.gif"}, //Tai_Le
{"Tamil","tamil.gif"}, //Tamil
{"Telugu","telugu.gif"}, //Telugu
{"Thaana","thaana.gif"}, //Thaana
{"Thai","thai.gif"}, //Thai
{"Tibetan","tibetan.gif"}, //Tibetan
{"Tifinagh","tifinagh.gif"}, //Tifinagh
{"Ugaritic","ugaritic.gif"}, //Ugaritic
{"Yi","yi.gif"}, //Yi_Syllables
};
/** IDNA classification codes returned by getIDNAType; also indexes into idnaTypeSet. */
static final int OK = 0;
static final int DELETED = 1;
static final int ILLEGAL = 2;
static final int REMAPPED = 3;
/** Number of classification codes (one past the largest code). */
static final int IDNA_TYPE_LIMIT = 4;

/** One set per classification code; every slot starts out as an empty set. */
UnicodeSet[] idnaTypeSet = new UnicodeSet[IDNA_TYPE_LIMIT];
{
    int slot = 0;
    while (slot < idnaTypeSet.length) {
        idnaTypeSet[slot] = new UnicodeSet();
        ++slot;
    }
}
/**
 * Classifies one code point by converting it to ASCII with IDNA and back.
 * Uses (and overwrites) the shared static buffers inbuffer, intermediate
 * and outbuffer.
 *
 * @param cp the code point to classify
 * @return DELETED if the ASCII conversion yields an empty result, ILLEGAL if
 *         either conversion throws, REMAPPED if the round trip differs from
 *         the input, and OK otherwise
 */
public static int getIDNAType(int cp) {
    inbuffer.setLength(0);
    UTF16.append(inbuffer, cp);
    try {
        intermediate = IDNA.convertToASCII(inbuffer, IDNA.DEFAULT); // USE_STD3_RULES
        if (intermediate.length() == 0) {
            return DELETED;
        }
        outbuffer = IDNA.convertToUnicode(intermediate, IDNA.USE_STD3_RULES);
    } catch (StringPrepParseException e) {
        // Rejected by string preparation: not usable in IDNA.
        return ILLEGAL;
    } catch (Exception e) {
        // Anything else is unexpected; report it and treat it as illegal.
        System.out.println("Failure at: " + Utility.hex(cp));
        return ILLEGAL;
    }
    return TestData.equals(inbuffer, outbuffer) ? OK : REMAPPED;
}
// Scratch buffer that getIDNAType clears and refills with the code point under test.
static StringBuffer inbuffer = new StringBuffer();
// Results of the last IDNA conversions performed by getIDNAType
// (intermediate = to-ASCII result, outbuffer = back-to-Unicode result).
static StringBuffer intermediate, outbuffer;
// Characters with the Lowercase property.
UnicodeSet lowercase = new UnicodeSet("[:Lowercase:]");
/**
 * Writes the section for one script: a heading row followed by one row per
 * non-empty category of that script's characters.
 *
 * Does nothing when coreChars[scriptCode] is null. Throws a
 * NullPointerException if the script name has no entry in scriptToGif.
 *
 * @param htmlOut primary HTML output; receives the heading and all category rows
 * @param textOut plain-text output; receives a "#*** Script: ..." header and the category listings
 * @param scriptCode index of the script to show (used to index coreChars)
 * @param htmlOut2 secondary HTML output; receives only the heading row
 */
private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode, PrintWriter htmlOut2) {
if (coreChars[scriptCode] == null) return;
String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode);
script = Utility.getUnskeleton(script.toLowerCase(),true);
System.out.println(script);
htmlOut.println();
String scriptLine = "<tr><th class='script'><img src='images/" + ((String)scriptToGif.get(script)).toLowerCase()
+ "'> Script: " + script + "</th></tr>";
htmlOut.println(scriptLine);
htmlOut2.println(scriptLine);
textOut.println();
textOut.println("#*** Script: " + script + " ***");
// Work on a copy: every extract(...) call below REMOVES the characters it
// returns from its second argument, so the order of these calls decides
// which category a character ends up in. Do not reorder.
UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
UnicodeSet deleted = extract(idnaTypeSet[DELETED], core);
UnicodeSet illegal = extract(idnaTypeSet[ILLEGAL], core);
UnicodeSet remapped = extract(idnaTypeSet[REMAPPED], core);
// These two are carved out of the previous result rather than out of core.
UnicodeSet remappedIsNFKC = extract(isNFKC, remapped);
UnicodeSet remappedIsNFKCDecomp = extract(decomposable, remappedIsNFKC);
UnicodeSet decomp = extract(decomposable, core);
UnicodeSet pattern = extract(patternProp, core);
UnicodeSet non_id = extract(not_xid_continue, core);
UnicodeSet bicameralNoupper = new UnicodeSet();
// Only split out hasNoUpperMinus characters when core is not entirely inside hasNoUpper.
if (!hasNoUpper.containsAll(core)) {
bicameralNoupper = extract(hasNoUpperMinus, core);
}
// Record the no-uppercase and non-XID characters in 'suspect', keyed by
// general category; names matching the patterns below are tagged "XX" instead.
UnicodeSet foo = new UnicodeSet(bicameralNoupper).addAll(non_id);
for (UnicodeSetIterator it = new UnicodeSetIterator(foo); it.next(); ) {
String cat = Default.ucd().getCategoryID(it.codepoint);
String name = Default.ucd().getName(it.codepoint);
if (name.indexOf("MUSICAL SYMBOL") >= 0
|| name.indexOf("DINGBA") >= 0
|| name.indexOf("RADICAL ") >= 0
) cat = "XX";
suspect.put(it.codepoint, cat);
}
// Emit each non-empty category; whatever is still left in core is "Atomic".
if (core.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic", core, scriptCode, uca);
if (bicameralNoupper.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", bicameralNoupper, scriptCode, uca);
if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode, uca);
if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode, uca);
if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "NFD-Decomposable", decomp, scriptCode, uca);
if (remappedIsNFKC.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Atomic", remappedIsNFKC, scriptCode, uca);
if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-NFD-Decomposable", remappedIsNFKCDecomp, scriptCode, uca);
if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Compat", remapped, scriptCode, uca);
if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode, uca);
if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Prohibited", illegal, scriptCode, uca);
}
/**
 * Writes the inherited-script heading to the HTML output, followed by one
 * "Visible_Combining_Marks_&lt;type&gt;" category per position type loaded by
 * getPositions(), restricted to the non_spacing set.
 *
 * @param htmlOut HTML output
 * @param textOut plain-text output
 * @param uset not read by this method; the filtering uses the non_spacing field
 * @throws IOException if the positions file cannot be read
 */
private void showCodes(PrintWriter htmlOut, PrintWriter textOut, UnicodeSet uset) throws IOException {
    String scriptName = Default.ucd().getScriptID_fromIndex((byte) INHERITED_SCRIPT);
    scriptName = Utility.getUnskeleton(scriptName.toLowerCase(),true);

    String gifName = ((String)scriptToGif.get(scriptName)).toLowerCase();
    htmlOut.println("<tr><th class='script'><img src='images/" + gifName
            + "'> Script: " + scriptName + "</th></tr>");

    UnicodeMap positions = getPositions();
    Iterator types = positions.getAvailableValues(new TreeSet(uca)).iterator();
    while (types.hasNext()) {
        String type = (String) types.next();
        UnicodeSet marks = positions.getSet(type).retainAll(non_spacing);
        if (marks.size() != 0) {
            printlnSet(htmlOut, textOut, scriptName, "Visible_Combining_Marks_" + type,
                    marks, INHERITED_SCRIPT, positionComparator);
        }
    }
}
/**
 * Loads the position-type table from positions.txt.
 *
 * A line starting with "@" sets the current type (the text after the "@");
 * every other non-empty line is split on ';' and the first code point of its
 * first (hex) field is mapped to the current type. Entries that appear
 * before any "@" line get the type "Undetermined".
 *
 * The reader is closed before returning, including when reading fails.
 *
 * @return map from code point to position type
 * @throws IOException if the file cannot be opened or read
 */
private UnicodeMap getPositions() throws IOException {
    UnicodeMap result = new UnicodeMap();
    BufferedReader in = bf.openUTF8Reader("C:\\DATA\\confusables\\", "positions.txt");
    try {
        String type="Undetermined";
        while (true) {
            String line = Utility.readDataLine(in);
            if (line == null) break;
            if (line.length() == 0) continue;
            if (line.startsWith("@")) {
                type = line.substring(1);
                continue;
            }
            String[] pieces = Utility.split(line, ';');
            String code = Utility.fromHex(pieces[0]);
            result.put(UTF16.charAt(code,0), type);
        }
    } finally {
        // Previously the reader was never closed, leaking the file handle.
        in.close();
    }
    return result;
}
/**
 * Orders strings by the name that Default.ucd().getName(...) reports for
 * them. Both arguments must be Strings.
 */
static Comparator positionComparator = new Comparator() {
    public int compare(Object o1, Object o2) {
        String left = (String) o1;
        String right = (String) o2;
        String leftName = Default.ucd().getName(left);
        String rightName = Default.ucd().getName(right);
        return leftName.compareTo(rightName);
    }
};
/**
 * Moves the characters that core shares with other out of core.
 *
 * @param other the set to intersect with; not modified
 * @param core the set to split; the shared characters are removed from it
 * @return a new set holding the characters that were in both sets
 */
private UnicodeSet extract(UnicodeSet other, UnicodeSet core) {
    UnicodeSet shared = new UnicodeSet(core);
    shared.retainAll(other);
    core.removeAll(shared);
    return shared;
}
/**
 * Writes one titled category of characters to both outputs.
 *
 * The HTML output gets a header row (title and character count) and a cell
 * with the formatted characters; the text output gets a "# title" header and
 * the listing produced by bf.showSetNames. For the Han and Hangul scripts
 * the HTML cell lists ranges; for all other scripts it lists individual
 * characters sorted with the supplied comparator.
 *
 * @param htmlOut HTML output
 * @param textOut plain-text output
 * @param script script name, used in the value source label
 * @param title category title; also used as the CSS class and link anchor
 * @param unicodeset characters to show; nothing is written when null
 * @param scriptCode script index, selects range or per-character output
 * @param comparator ordering for the per-character output
 * @throws IllegalArgumentException if the comparator treats two different
 *         characters of the set as equal
 */
private void printlnSet(PrintWriter htmlOut, PrintWriter textOut,
        String script, String title, UnicodeSet unicodeset, int scriptCode, Comparator comparator) {
    if (unicodeset == null) {
        return;
    }
    int count = unicodeset.size();
    // Right-to-left cell only when the set has R characters and no L characters.
    boolean rightToLeft = unicodeset.containsSome(bidiR) && unicodeset.containsNone(bidiL);
    String dirAttribute = rightToLeft ? " dir='rtl'" : "";

    htmlOut.println("<tr><th class='" + title + "'><a href='#" + title + "'>" + title + "</a> ("
            + TestData.nf.format(count) + ")</th></tr>");
    htmlOut.print("<tr><td class='" + title + "'" + dirAttribute + ">");
    // <a href="#Atomic">categorization</a>
    textOut.println();
    textOut.println("# " + title);
    bf.setValueSource(script + " ; " + title);

    UnicodeSetIterator cursor = new UnicodeSetIterator();
    cursor.reset(unicodeset);
    boolean showAsRanges = (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT);
    if (!showAsRanges) {
        // Sort the individual characters; a comparator tie is an error.
        Set ordered = new TreeSet(comparator);
        while (cursor.next()) {
            String element = cursor.getString();
            if (!ordered.add(element)) {
                throw new IllegalArgumentException("Collision with "
                        + Default.ucd().getCodeAndName(element));
            }
        }
        Iterator sorted = ordered.iterator();
        while (sorted.hasNext()) {
            htmlOut.print(formatCode((String) sorted.next()));
        }
        bf.showSetNames(textOut, ordered);
    } else {
        // Large scripts: one entry per contiguous range.
        while (cursor.nextRange()) {
            String cell = formatCode(UTF16.valueOf(cursor.codepoint));
            if (cursor.codepoint != cursor.codepointEnd) {
                cell = cell + ".. " + formatCode(UTF16.valueOf(cursor.codepointEnd));
            }
            htmlOut.print(cell);
        }
        bf.showSetNames(textOut, unicodeset);
    }
    htmlOut.println("</td></tr>");
}
/**
 * Wraps a character in an HTML span whose tooltip is its code and name.
 * The character is padded with no-break spaces; when its first code point
 * has category Me or Mn, the padding is doubled and a dotted circle
 * (U+25CC) is placed in front of it.
 *
 * @param string the character to format; must not be empty
 * @return the HTML fragment, ending with a space
 */
private String formatCode(String string) {
    int category = ucd.getCategory(UTF16.charAt(string,0));
    boolean isMark = (category == Me || category == Mn);
    String leading = isMark ? "\u00A0\u00A0\u25cc" : "\u00A0";
    String trailing = isMark ? "\u00A0\u00A0" : "\u00A0";

    StringBuffer html = new StringBuffer();
    html.append("<span title='");
    html.append(ucd.getCodeAndName(string));
    html.append("'>");
    html.append(leading);
    html.append(TransliteratorUtilities.toHTMLControl.transliterate(string));
    html.append(trailing);
    html.append("</span> ");
    return html.toString();
}
}

View File

@ -1,74 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks-old.java,v $
* $Date: 2005/03/04 02:50:26 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.*;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UnicodeSet;
import java.util.*;
/**
 * Reads a Thai word list and reports which characters occur only at the
 * start, only at the end, or in the middle of the listed words.
 */
public class GenerateThaiBreaks {
    /**
     * Scans thai6.ucs line by line. For each line, the last non-ignorable
     * character is recorded as a final, the first non-ignorable character
     * before it as an initial, and every other non-ignorable character
     * before it as a medial. Characters that also occur as medials are then
     * removed from the initials and finals, and the three sets are printed.
     *
     * @param args unused
     * @throws IOException if the word list cannot be opened or read
     */
    public static void main(String [] args) throws IOException {
        BufferedReader br = new BufferedReader(
            new InputStreamReader(
                new FileInputStream("\\icu4j\\src\\data\\thai6.ucs"), "UnicodeLittle"));
        try {
            Main.setUCD();
            UnicodeSet ignorables = new UnicodeSet("[:M:]");
            ignorables.retain(0x0E00, 0x0E7F); // just Thai block
            ignorables.add(0x0E40, 0x0E44); // add logical order exception
            ignorables.add(0, ' '); // add controls
            ignorables.add('.');
            UnicodeSet initials = new UnicodeSet();
            UnicodeSet finals = new UnicodeSet();
            UnicodeSet medials = new UnicodeSet();
            while (true) {
                String line = br.readLine();
                if (line == null) break;
                // find final consonant
                int end = -1;
                for (int i = line.length() - 1; i >= 0; --i) {
                    char c = line.charAt(i);
                    if (!ignorables.contains(c)) {
                        finals.add(c);
                        end = i;
                        break;
                    }
                }
                // An empty line, or one made up only of ignorables, has no
                // final; the scan used to run off the front of the string
                // and throw StringIndexOutOfBoundsException.
                if (end < 0) continue;
                boolean haveFirst = false;
                for (int i = 0; i < end; ++i) {
                    char c = line.charAt(i);
                    if (ignorables.contains(c)) continue;
                    if (!haveFirst) {
                        initials.add(c);
                        haveFirst = true;
                    } else {
                        medials.add(c);
                    }
                }
            }
            initials.removeAll(medials);
            finals.removeAll(medials);
            Utility.showSetNames("initials: ", initials, false, Main.ucd);
            Utility.showSetNames("finals: ", finals, false, Main.ucd);
            Utility.showSetNames("medials: ", medials, false, Main.ucd);
        } finally {
            br.close();
        }
    }
}

View File

@ -1,135 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java,v $
* $Date: 2006/09/24 23:32:44 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.*;
import com.ibm.icu.text.UTF16;
//import com.ibm.text.utility;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.text.utility.Utility;
//import java.util.*;
/**
 * Reads a Thai word list and reports which Thai characters never occur,
 * never occur first, never occur last, or occur only first / only last.
 */
public class GenerateThaiBreaks {
    /**
     * Scans thai6.ucs line by line, strips the ignorable characters
     * (controls, space and '.') and records the first, last and inner
     * characters of what remains; then prints the derived sets.
     *
     * @param args unused
     * @throws IOException if the word list cannot be opened or read
     */
    public static void main(String [] args) throws IOException {
        BufferedReader br = new BufferedReader(
            new InputStreamReader(
                new FileInputStream("c:\\icu4j\\src\\com\\ibm\\icu\\dev\\data\\thai6.ucs"), "UnicodeLittle"));
        PrintWriter out = null;
        try {
            UnicodeSet ignorables = new UnicodeSet();
            /* new UnicodeSet(0xE30, 0xE3A);
            ignorables.add(0x0E40, 0x0E44); // add logical order exception
            ignorables.add(0x0E47, 0x0E4E);
            */
            ignorables.add(0, ' '); // add controls
            ignorables.add('.');
            UnicodeSet initials = new UnicodeSet();
            UnicodeSet finals = new UnicodeSet();
            UnicodeSet medials = new UnicodeSet();
            // Growable buffer: the former fixed char[100] overflowed with an
            // ArrayIndexOutOfBoundsException on lines longer than 100 characters.
            StringBuffer buffer = new StringBuffer();
            while (true) {
                String line = br.readLine();
                if (line == null) break;
                // find 'real' characters
                buffer.setLength(0);
                for (int i = 0; i < line.length(); ++i) {
                    char c = line.charAt(i);
                    if (ignorables.contains(c)) continue;
                    buffer.append(c);
                }
                String temp = buffer.toString();
                // Nothing left on this line: there is no initial or final to
                // record, and an empty string must not be added to the sets.
                if (temp.length() == 0) continue;
                if (temp.length() == 1) {
                    initials.add(temp);
                    finals.add(temp);
                    continue;
                }
                initials.add(temp.substring(0,1));
                //initials.add(temp.substring(0,2));
                finals.add(temp.substring(temp.length()-1));
                //finals.add(temp.substring(temp.length()-1));
                for (int i = 1; i < temp.length() - 1; ++i) {
                    //medials.add(temp.substring(i, i+2));
                    medials.add(temp.substring(i, i+1));
                }
                //medials.add(temp.substring(temp.length() - 2, temp.length() - 1));
            }
            System.out.println("initials size: " + initials.size());
            System.out.println("finals size: " + finals.size());
            System.out.println("medials size: " + medials.size());
            //out = Utility.openPrintWriter("ThaiData.txt", Utility.UTF8_WINDOWS);
            // out.write('\uFEFF');
            UnicodeSet marks = new UnicodeSet("[[\u0e00-\u0e7f]&[[:mn:][:me:]]]");
            finals.addAll(marks);
            UnicodeSet all = new UnicodeSet(initials).addAll(medials).addAll(finals);
            UnicodeSet missingThai = new UnicodeSet("[[\u0e00-\u0e7f]-[:Cn:]]").removeAll(all);
            System.out.println("Never occur: " + missingThai.toPattern(true));
            Utility.showSetNames("", missingThai, true, Default.ucd());
            System.out.println();
            UnicodeSet neverInitial = new UnicodeSet(all).removeAll(initials);
            UnicodeSet neverFinal = new UnicodeSet(all).removeAll(finals);
            System.out.println("Never initial: " + neverInitial.toPattern(true));
            Utility.showSetNames("", neverInitial, true, Default.ucd());
            System.out.println();
            System.out.println("Never final: " + neverFinal.toPattern(true));
            Utility.showSetNames("", neverFinal, true, Default.ucd());
            System.out.println();
            initials.removeAll(medials);
            finals.removeAll(medials);
            System.out.println("initials size: " + initials.size());
            System.out.println("finals size: " + finals.size());
            System.out.println("Only Initials" + initials.toPattern(true));
            Utility.showSetNames("", initials, true, Default.ucd());
            System.out.println();
            System.out.println("Only Finals" + finals.toPattern(true));
            Utility.showSetNames("", finals, true, Default.ucd());
        } finally {
            // Close the writer first: PrintWriter.close() cannot throw, so a
            // failure in br.close() no longer prevents it from being closed.
            if (out != null) out.close();
            br.close();
        }
    }

    /**
     * Breaker that formats each item as "item(code))" and starts a new line
     * when the first code point differs from that of the previous item.
     * NOTE(review): the first branch passes substring(1) to getCode and both
     * branches emit a doubled "))" - confirm that this is intended.
     */
    static class MyBreaker implements Utility.Breaker {
        public String get(Object current, Object old) {
            if (old == null || UTF16.charAt(current.toString(), 0) == UTF16.charAt(old.toString(), 0)) {
                return current.toString() + "(" + Default.ucd().getCode(current.toString().substring(1)) + "))";
            } else {
                return "\r\n" + current + "(" + Default.ucd().getCode(current.toString()) + "))";
            }
        }
        /** Accepts every item. */
        public boolean filter(Object current) { return true; }
    }
}

View File

@ -1,177 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/IANANames.java,v $
* $Date: 2002/10/05 01:28:58 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.lang.UCharacter;
import java.util.*;
import java.text.NumberFormat;
import java.io.*;
/**
 * Reads the IANA character-sets registry and records, for every name and
 * alias, the base name it belongs to, its trailing comment (if any) and the
 * registry line it came from.
 */
public class IANANames implements UCD_Types {
    private Map aliasToBase = new TreeMap();
    private Map aliasToComment = new TreeMap();
    private Map aliasToLine = new TreeMap();

    /**
     * Checks the registry for aliases that collide once they are reduced to
     * their skeleton (see {@link #removeNonAlphanumeric}), and prints the
     * longest alias length and every character that was stripped.
     *
     * @throws IOException if the registry cannot be read
     */
    public static void testSensitivity() throws IOException {
        IANANames iNames = new IANANames();
        Map m = new HashMap();
        Iterator it = iNames.getIterator();
        UnicodeSet removed = new UnicodeSet();
        int maxLength = 0;
        while (it.hasNext()) {
            String alias = (String) it.next();
            if (maxLength < alias.length()) maxLength = alias.length();
            if (alias.length() > 40) System.out.println("Name >40: " + alias);
            if (alias.indexOf(')') >= 0 || alias.indexOf('(') >= 0) System.out.println("Illegal tag: " + alias);
            String skeleton = removeNonAlphanumeric(alias, removed);
            String other = (String) m.get(skeleton);
            if (other != null) {
                String base = iNames.getBase(alias);
                String otherBase = iNames.getBase(other);
                if (!base.equals(otherBase)) {
                    System.out.println("Collision between: " + alias + " (" + base + ") and "
                        + other + " (" + otherBase + ")");
                } else {
                    System.out.println("Alias Variant: " + alias + " and " + other + " (" + base + ")");
                }
            } else {
                m.put(skeleton, alias);
            }
        }
        System.out.println("Max Length: " + maxLength);
        System.out.println("Characters removed: ");
        UnicodeSetIterator usi = new UnicodeSetIterator(removed);
        while (usi.next()) {
            char c = (char) usi.codepoint; // safe, can't be supplementary
            // Print the code point in hexadecimal; it used to be printed in
            // decimal behind the "0x" prefix.
            String hex = Integer.toHexString(usi.codepoint).toUpperCase(Locale.ENGLISH);
            System.out.println("0x" + hex + "\t'" + c + "'\t" + UCharacter.getName(usi.codepoint));
        }
    }

    /**
     * Loads the registry file. Everything up to the first line of dashes is
     * skipped; "Name:" and "Alias:" lines are recorded; "Source:",
     * "MIBenum:" and continuation lines are ignored; reading stops at the
     * "REFERENCES" line. Any other line is reported as unknown.
     *
     * @throws IOException if the file cannot be opened or read
     */
    public IANANames() throws IOException {
        BufferedReader in = Utility.openReadFile(BASE_DIR + "IANA\\character-sets.txt", Utility.LATIN1);
        try {
            boolean atStart = true;
            String lastName = "";
            int counter = 0;
            while (true) {
                String line = in.readLine();
                if (line == null) break;
                counter++;
                if (atStart) {
                    if (line.startsWith("-------------")) atStart = false;
                    continue;
                }
                if (line.trim().length() == 0) continue;
                if (line.startsWith("Name:") || line.startsWith("Alias:")) {
                    lastName = add(line, lastName, counter);
                } else if (line.startsWith("Source:") || line.startsWith("MIBenum:")
                    || line.startsWith(" ")) {
                    continue;
                } else if (line.equals("REFERENCES")) {
                    break;
                } else {
                    System.out.println("Unknown Line: " + line);
                }
            }
        } finally {
            in.close();
        }
    }

    /**
     * Records one "Name:" or "Alias:" line.
     *
     * @param line the registry line
     * @param baseName the base name currently in effect
     * @param counter line number, used only in messages
     * @return the base name in effect after this line (the alias itself for
     *         a "Name:" line, otherwise baseName unchanged)
     * @throws IllegalArgumentException if the line contains no ": " separator
     */
    private String add(String line, String baseName, int counter) {
        // extract the alias, doing a little validity check
        int pos = line.indexOf(": ");
        if (pos < 0) throw new IllegalArgumentException("Bad line: " + counter + " '" + line + "'");
        String alias = line.substring(pos+2).trim();
        // get comment
        String comment = null;
        pos = alias.indexOf(' ');
        if (pos >= 0) {
            comment = alias.substring(pos).trim();
            alias = alias.substring(0, pos);
        }
        // reset the baseName if we are a name
        if (line.startsWith("Name:")) {
            baseName = alias;
        }
        // store
        if (!alias.equals("None")) {
            if (false) {
                if (baseName.equals(alias)) System.out.println();
                System.out.println("Adding " + alias + "\t=> " + baseName + (comment != null ? "\t(" + comment + ")" : ""));
            }
            // check if it is stored already
            String oldbaseName = (String) aliasToBase.get(alias);
            if (oldbaseName != null) {
                System.out.println("Duplicate alias (" + alias + ", " + oldbaseName + ", " + baseName + "): "
                    + counter + " '" + line + "'");
            }
            aliasToBase.put(alias, baseName);
            if (comment != null) aliasToComment.put(alias, comment);
            // Store the source line itself; this used to store the comment,
            // duplicating aliasToComment.
            aliasToLine.put(alias, line);
        }
        return baseName;
    }

    /**
     * @return an iterator over all recorded names and aliases, in sorted order
     */
    public Iterator getIterator() {
        return aliasToBase.keySet().iterator();
    }

    /**
     * Returns the base name for this alias, or null if the alias is unknown.
     */
    public String getBase(String alias) {
        return (String) aliasToBase.get(alias);
    }

    /**
     * Reduces a name to its skeleton: upper-cases it (English rules) and
     * keeps only A-Z and digits. A '0' is kept only when the skeleton built
     * so far is empty or ends in a digit. Every character dropped for the
     * first time is reported on standard output and added to removed.
     *
     * @param s the name to reduce
     * @param removed collects the characters that have been dropped; updated in place
     * @return the skeleton
     */
    public static String removeNonAlphanumeric(String s, UnicodeSet removed) {
        s = s.toUpperCase(Locale.ENGLISH); // can't have Turkish!
        StringBuffer result = new StringBuffer();
        boolean removedZero = false;
        for (int i = 0; i < s.length(); ++i) {
            char c = s.charAt(i);
            if (c == '0') {
                char cLast = result.length() > 0 ? result.charAt(result.length() - 1) : '0';
                if ('0' <= cLast && cLast <= '9') {
                    result.append(c);
                } else {
                    if (!removed.contains(c)) {
                        System.out.println("Removed '" + c + "' from " + s + " => " + result);
                        removed.add(c);
                    }
                    removedZero = true;
                }
            } else if (('A' <= c && c <= 'Z') || ('0' <= c && c <= '9')) {
                result.append(c);
            } else {
                if (!removed.contains(c)) {
                    System.out.println("Removed '" + c + "' from " + s + " => " + result);
                    removed.add(c);
                }
            }
        }
        //if (removedZero) System.out.println("Removed 0 from " + s + " => " + result);
        return result.toString();
    }
}

View File

@ -1,142 +0,0 @@
package com.ibm.text.UCD;
import java.io.IOException;
import java.io.PrintWriter;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.impl.PrettyPrinter;
import com.ibm.icu.text.IDNA;
import com.ibm.icu.text.StringPrepParseException;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.text.utility.Utility;
/**
 * Classifies every code point as IDN input-only or IDN output, and compares
 * the result with identifier-based sets for Unicode 3.2 and 5.0. The report
 * is written to idnCount.html.
 */
public class IDNTester {
    // Scratch buffers shared by getIDNAType.
    static StringBuffer inbuffer = new StringBuffer();
    static StringBuffer intermediate, outbuffer;
    // Classification codes returned by getIDNAType.
    static final int OK = 0, DELETED = 1, ILLEGAL = 2, REMAPPED = 3, IDNA_TYPE_LIMIT = 4;
    static UnicodeSet IDNInputOnly = new UnicodeSet();
    static UnicodeSet IDNOutput = new UnicodeSet();
    static boolean initialized = false;
    static UnicodeSet IDInputOnly32 = new UnicodeSet();
    static UnicodeSet IDOutput32 = new UnicodeSet();
    static UnicodeSet IDInputOnly50 = new UnicodeSet();
    static UnicodeSet IDOutput50 = new UnicodeSet();
    static PrettyPrinter pp = new PrettyPrinter();
    static PrintWriter pw;

    /**
     * Builds all sets and writes the HTML report.
     *
     * @param args unused
     * @throws IOException if the report file cannot be opened
     */
    public static void main(String[] args) throws IOException {
        initialize();
        pw = BagFormatter.openUTF8Writer(Utility.GEN_DIR, "idnCount.html");
        try {
            pw.println("<html><body>");
            showSet("IDN InputOnly: ", IDNInputOnly);
            showSet("IDN Output: ", IDNOutput);
            showSet("ID InputOnly, U3.2: ", IDInputOnly32);
            showSet("ID Output, U3.2: ", IDOutput32);
            showSet("IDN Output - ID Output, U3.2: ", new UnicodeSet(IDNOutput).removeAll(IDOutput32));
            showSet("IDN Output & ID Output, U3.2: ", new UnicodeSet(IDNOutput).retainAll(IDOutput32));
            showSet("ID Output - IDN Output, U3.2: ", new UnicodeSet(IDOutput32).removeAll(IDNOutput));
            showSet("ID InputOnly, U5.0: ", IDInputOnly50);
            showSet("ID Output, U5.0: ", IDOutput50);
            showSet("ID Output, U5.0 - U3.2: ", new UnicodeSet(IDOutput50).removeAll(IDOutput32));
            pw.println("</body></html>");
        } finally {
            // Close the report even when building a section fails.
            pw.close();
        }
    }

    /**
     * Writes one report section: a heading with the title and the set size,
     * followed by the set as a pattern.
     */
    public static void showSet(String title, UnicodeSet set) {
        pw.println("<h2>" + title + set.size() + "</h2>" + "<p>" + pp.toPattern(set) + "</p>");
        pw.println();
    }

    /**
     * @return the code points that IDNA accepts as input but does not
     *         produce unchanged (classified DELETED or REMAPPED)
     */
    static UnicodeSet getIDNInput() {
        if (!initialized) initialize();
        return IDNInputOnly;
    }

    /**
     * @return the code points that IDNA leaves unchanged (classified OK)
     */
    static UnicodeSet getIDNOutput() {
        if (!initialized) initialize();
        // This used to return IDNInputOnly, the same set as getIDNInput().
        return IDNOutput;
    }

    /**
     * Fills all static sets by classifying every code point from U+0000 to
     * U+10FFFF. Prints progress to standard output.
     */
    private static void initialize() {
        UnicodeSet oddballs = new UnicodeSet("[\u034F \u180B-\u180D \uFE00-\uFE0F _]");
        UCD U32 = UCD.make("3.2.0");
        Normalizer nfkc32 = new Normalizer(Normalizer.NFKC, "3.2.0");
        UCDProperty xid32 = DerivedProperty.make(UCD.Mod_ID_Continue_NO_Cf,U32);
        UnicodeSet IDInput32 = xid32.getSet();
        IDInput32.add('-').removeAll(oddballs);
        UCD U50 = UCD.make("5.0.0");
        Normalizer nfkc50 = new Normalizer(Normalizer.NFKC, "5.0.0");
        UCDProperty xid50 = DerivedProperty.make(UCD.Mod_ID_Continue_NO_Cf,U50);
        UnicodeSet IDInput50 = xid50.getSet();
        IDInput50.add('-').removeAll(oddballs);
        // Inclusive upper bound: the loop used to stop one short of U+10FFFF.
        for (int i = 0; i <= 0x10FFFF; ++i) {
            if ((i & 0xFFF) == 0) {
                System.out.println(i);
                System.out.flush();
            }
            int type = getIDNAType(i);
            if (type == OK) {
                IDNOutput.add(i);
            } else if (type != ILLEGAL) {
                IDNInputOnly.add(i);
            }
            if (IDInput32.contains(i)) {
                splitSet(IDInputOnly32, IDOutput32, U32, nfkc32, i);
            }
            if (IDInput50.contains(i)) {
                splitSet(IDInputOnly50, IDOutput50, U50, nfkc50, i);
            }
        }
        initialized = true;
    }

    /**
     * Adds i to outputSet when it is below 0x7F or is unchanged by case
     * folding, NFKC normalization and a second case folding; otherwise adds
     * it to inputOnlySet.
     * NOTE(review): the ASCII shortcut uses "&lt; 0x7F", which leaves out
     * U+007F itself - confirm that this is intended.
     */
    private static void splitSet(UnicodeSet inputOnlySet, UnicodeSet outputSet, UCD ucd, Normalizer nfkc, int i) {
        if (i < 0x7F) {
            outputSet.add(i);
            return;
        }
        String v = UTF16.valueOf(i);
        String s = ucd.getCase(i, UCD.FULL, UCD.FOLD);
        if (s.equals(v)) {
            s = nfkc.normalize(s);
            if (s.equals(v)) {
                s = ucd.getCase(s, UCD.FULL, UCD.FOLD);
                if (s.equals(v)) {
                    outputSet.add(i);
                    return;
                }
            }
        }
        inputOnlySet.add(i);
    }

    /**
     * Classifies one code point by converting it to ASCII with IDNA and
     * back. The hyphen is always OK.
     *
     * @param cp the code point to classify
     * @return DELETED if the ASCII conversion yields an empty result,
     *         ILLEGAL if either conversion throws, REMAPPED if the round
     *         trip differs from the input, and OK otherwise
     */
    static public int getIDNAType(int cp) {
        if (cp == '-') return OK;
        inbuffer.setLength(0);
        UTF16.append(inbuffer, cp);
        try {
            intermediate = IDNA.convertToASCII(inbuffer,
                IDNA.DEFAULT); // USE_STD3_RULES
            if (intermediate.length() == 0)
                return DELETED;
            outbuffer = IDNA.convertToUnicode(intermediate,
                IDNA.USE_STD3_RULES);
        } catch (StringPrepParseException e) {
            return ILLEGAL;
        } catch (Exception e) {
            System.out.println("Failure at: " + Utility.hex(cp));
            return ILLEGAL;
        }
        if (!TestData.equals(inbuffer, outbuffer))
            return REMAPPED;
        return OK;
    }
}

View File

@ -1,37 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/IntMap.java,v $
* $Date: 2003/03/18 00:10:47 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.HashMap;
/**
 * Map from int keys to objects. Tracks the smallest and largest key ever
 * stored so that lookups outside that range return without touching the
 * underlying hash map.
 */
public class IntMap {
    /** Smallest key stored so far (Integer.MAX_VALUE while empty). */
    int lowest = Integer.MAX_VALUE;
    /** Largest key stored so far (Integer.MIN_VALUE while empty). */
    int highest = Integer.MIN_VALUE;
    /** Backing storage, keyed by boxed int. */
    HashMap store = new HashMap();

    /**
     * @param key the key to look up
     * @return the value stored for key, or null if there is none
     */
    public Object get(int key) {
        boolean withinRange = lowest <= key && key <= highest;
        return withinRange ? store.get(new Integer(key)) : null;
    }

    /**
     * Stores value under key, replacing any previous value.
     *
     * @param key the key
     * @param value the value to store
     */
    public void put(int key, Object value) {
        lowest = Math.min(lowest, key);
        highest = Math.max(highest, key);
        store.put(new Integer(key), value);
    }

    /**
     * @return the number of keys stored
     */
    public int size() {
        return store.size();
    }
}

View File

@ -1,92 +0,0 @@
Show [[:block=tamil:] & [:age=3.2:] - [:age=3.1:]]
Show [[:block=tamil:] & [:age=4.0:] - [:age=3.2:]]
Show [[:block=tamil:] & [:age=4.1:] - [:age=4.0:]]
Show [[:block=tamil:] & [:age=5.0:] - [:age=4.1:]]
Stop
Show [[:NFKCQuickCheck=No:] & [$gc:Lm]]
Stop
[$Name: $gc:Sk]
[$Name: $gc:Lm]
Show [[$whitespace] - [$gc:zs]]
Show [[$gc:zs] - [$whitespace]]
Let $letter = [$gc:Lu $gc:Ll $gc:Lt $gc:Lo $gc:Lm];
Let $number = [$gc:Nd $gc:Nl $gc:No]
Let $mark = [$gc:mn $gc:me $gc:mc]
Let $LMN = [$letter $number $mark]
Let $gcAllPunctuation = [$gc:Open_Punctuation $gc:Close_Punctuation $gc:Dash_Punctuation $gc:Connector_Punctuation $gc:Other_Punctuation $gc:Initial_Punctuation $gc:Final_Punctuation]
Let $gcAllSymbols = [$gc:Currency_Symbol $gc:Modifier_Symbol $gc:Math_Symbol $gc:Other_Symbol]
Let $nfc = [^$NFC_Quick_Check:No]
Show $nfc
Show [$alphabetic - [$mark $letter $number]]
Let $oldCJK = [\u1100-\u11FF \u3040-\u30FF \u3130-\u318F \u31F0-\u31FF \u3400-\u4DBF \u4E00-\u9FFF \uAC00-\uD7AF \uF900-\uFAFF \uFF65-\uFFDC]
Show [$oldCJK & $gc:cn]
Let $fixedOld = [$oldCJK-$gc:cn]
#List the non-alphabetic old items
#Show [$oldCJK-$gc:cn-$alphabetic]
#Check for differences
#Test $fixedOld = $trialNew
#ShowEach $mark
Let $uax29_outliers = [\u3031-\u3035 \u309B-\u309C \u30A0 \u30FC \uFF70 \uFF9E-\uFF9F]
Let $other_outliers = [\u3099-\u309A \u3006 \u303C \u302A-\u302E \u302F \U000E0100-\U000E01EF]
# ==========================================
# Outliers from UAX29
Show $uax29_outliers
# Additional outliers
Show $other_outliers
# Take the 5 CJK scripts
Let $trialScripts = [$script:hani $script:hang $script:kana $script:hira $script:bopo]
# Remove the non-LMN
Let $trialNewBase = [$trialScripts & $LMN]
# Add the outliers
Let $trialNew = [$trialNewBase $uax29_outliers $other_outliers]
# Show our result
Show $trialNew
# As a double-check, show script characters we're tossing
Show [$trialScripts - $trialNew]
# Compare snippets stuff
Let $guessClose = [$lb:QU $lb:Close_Punctuation]
Let $__closing_punc = ["')>\]`\}\u00AB\u00BB\u2018\u2019\u201C\u201D\u2039\u203A\u207E\u208E\u27E7\u27E9\u27EB\u2984\u2986\u2988\u298A\u298C\u298E\u2990\u2992\u2994\u2996\u2998\u29D9\u29DB\u29FD\u3009\u300B\u300D\u300F\u3011\u3015\u3017\u3019\u301B\u301E\u301F\uFD3F\uFE42\uFE44\uFE5A\uFE5C\uFF02\uFF07\uFF09\uFF3D\uFF5D\uFF63]
$guessClose = $__closing_punc
Let $guessClose = [$gc:pf $gc:pe $gc:pi]
$guessClose = $__closing_punc
Let $guessTerm = [$sb:aterm $sb:sterm]
$guessTerm = [? ? !?? ? ? ? ? ??? ? ? ? ? ? ? ? .?? <20> ? ? ? ? ? ? ? ?? ? ? ? ? ? ? ?]
Let $__issymotherr = [\u00A6\u00A7\u06FD\u06FE\u0F01-\u0F03\u0F13-\u0F17\u0F1A-\u0F1F\u0FBE-\u0FC5\u0FC7-\u0FCC\u2100\u2101\u2104-\u2106\u2108\u2109\u2117\u2118\u211E-\u2121\u2195-\u2199\u219C-\u219F\u21A1\u21A2\u21A4\u21A5\u21A7-\u21AD\u21AF-\u21CD\u21D0\u21D1\u21D5-\u21F3\u2300-\u2307\u230C-\u231F\u2322-\u2328\u232B-\u237B\u237D-\u239A\u2400-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u25B6\u25B8-\u25C0\u25C2-\u25F7\u2600-\u2613\u2619-\u266E\u2670\u2671\u2701-\u2704\u2706-\u2709\u270C-\u2727\u2729-\u274B\u274F-\u2752\u2758-\u275E\u2761-\u2794\u2798-\u27AF\u27B1-\u27BE\u2800-\u28FF\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFB\u3012\u3013\u3036\u3037\u303E\u303F\u3190\u3191\u3196-\u319F\u3200-\u321C\u322A-\u3243\u3260-\u327B\u328A-\u32B0\u32C0-\u32CB\u32D0-\u32FE\u3300-\u3376\u337B-\u33DD\u33E0-\u33FE\uA490-\uA4A1\uA4A4-\uA4B3\uA4B5-\uA4C0\uA4C2-\uA4C4\uFFED\uFFEE\uFFFC\uFFFD]
Let $__issymothers = [\u00B6\u0482\u06E9\u09FA\u0B70\u0F34\u0F36\u0F38\u0FCF\u2114\u2123\u2125\u2127\u2129\u212E\u2132\u213A\u21D3\u220E\u2617\u274D\u2756\u3004\u3020\u327F\uA4C6\uFFE4\uFFE8]
Let $symOther = [$__issymotherr $__issymothers]
$symOther = $gcAllSymbols
[$symOther & $nfc] = [$gcAllSymbols & $nfc]

View File

@ -1,18 +0,0 @@
package com.ibm.text.UCD;
/**
 * Unimplemented stub: main does nothing yet. The comment inside the class
 * describes the listing it is meant to produce.
 */
public class ListNFComplete {
// find all the characters that are
// a) not decomposed by this normalization form
// b) of combining class 0
// AND if NFC or NFKC,
// c) can never compose with a previous character
// d) can never compose with a following character
// e) can never change if another character is added
// Example: a-breve might satisfy a-d, but if you
// add an ogonek it changes to a-ogonek + breve
/** Entry point; currently has no effect. */
public static void main (String[] args) {
//Normalizer nfd = new Normalizer(Normalizer.NFD);
}
}
}

View File

@ -1,327 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MLStreamWriter.java,v $
* $Date: 2003/04/25 01:39:15 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.*;
import java.util.*;
import com.ibm.text.UCD.*;
public class MLStreamWriter extends Writer {
/** Copyright notice embedded in the class as a string constant. */
public static final String copyright =
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
/**
 * Creates a writer that sends its markup to the given output.
 *
 * @param output destination for the generated markup
 * @param HTML value stored in the isHTML flag
 */
public MLStreamWriter(PrintWriter output, boolean HTML) {
    this.isHTML = HTML;
    this.out = output;
}
/**
 * Creates a writer with the HTML flag set to true.
 *
 * @param output destination for the generated markup
 */
public MLStreamWriter (PrintWriter output) {
this(output,true);
}
public MLStreamWriter el(String elementName) {
closeIfOpen();
print('<', AFTER);
print(elementName, elementName.equals("!--") ? AFTER+FORCE : AFTER);
stack.add(elementName);
inElement = true;
return this;
}
private MLStreamWriter closeIfOpen() {
if (inElement && !"!--".equals(stack.get(stack.size()-1))) {
print('>',BEFORE+FORCE);
}
inElement = false;
return this;
}
final public MLStreamWriter cel(String elementName) {
return cl().tx(elementName);
}
public MLStreamWriter at(String attributeName, String attributeValue) {
if (!inElement) {
throw new IllegalArgumentException("attribute \"" + attributeName + "\" not in element");
}
print(' ', BOTH);
print(attributeName, AFTER);
print('=', AFTER);
print('"');
print(quoted(attributeValue));
print('"', AFTER);
return this;
}
public MLStreamWriter at(String attributeName, int value) {
return at(attributeName, String.valueOf(value));
}
public MLStreamWriter CR() {
closeIfOpen();
out.println();
return this;
}
/*public MLStreamWriter comment() {
closeIfOpen();
print("<!--");
CR();
return this;
}
public MLStreamWriter endComment() {
print("-->");
return this;
}
*/
public MLStreamWriter tx(String text) {
closeIfOpen();
print(quoted(text));
return this;
}
final public MLStreamWriter tx(char text) {
return tx(String.valueOf(text));
}
final public MLStreamWriter tx(int text) {
return tx(String.valueOf(text));
}
final public MLStreamWriter tx16(String text) {
return tx(hex(text));
}
final public MLStreamWriter tx16(char text) {
return tx(hex(text));
}
final public MLStreamWriter tx16(int text) {
return tx(hex(text));
}
public MLStreamWriter cl(String closingElement) {
closeIfOpen();
String lastElement = (String)stack.remove(stack.size()-1);
if (closingElement != null && !closingElement.equals(lastElement)) {
throw new IllegalArgumentException("mismatch when closing \"" + closingElement
+ "\", current active element is \"" + lastElement + "\"");
}
if (lastElement.equals("!--")) {// hack for XML/HTML
print("-->",BEFORE+FORCE);
} else {
print("</");
print(lastElement);
print('>',BEFORE);
}
return this;
}
final public MLStreamWriter cl() {
return cl(null);
}
public MLStreamWriter closeAllElements() {
for (int i = stack.size()-1; i >= 0; --i) {
cl(null);
}
return this;
}
// stream stuff
public void write(char[] source, int start, int len) {
closeIfOpen();
// later make more efficient!!
out.print(quoted(new String(source, start, len)));
}
public void close() {
closeAllElements();
out.close();
}
public void flush() {
out.flush();
}
// Utility methods
final public MLStreamWriter cell(String ch, String type, String codepoint, String cat) {
if (codepoint == null) codepoint = ch;
int dotpos = type.indexOf('.');
if (dotpos == -1) el(type);
else {
el(type.substring(0,dotpos));
at("class",type.substring(dotpos+1));
}
/*
if (color == -1) {
el("th");
} else {
el("td");
if (color != 0xFFFFFF) {
at("bgcolor","#"+hex(color,6));
}
}
*/
tx(ch).el("br").el("tt").tx16(codepoint);
if (cat != null) tx(" ").tx(cat);
cl().cl().cl();
return this;
}
final public MLStreamWriter cell(String ch) {
return cell(ch,"td",null,null);
}
final public MLStreamWriter cell(String ch, String type) {
return cell(ch,type,null,null);
}
final public MLStreamWriter cell(String ch, String type, String codepoint) {
return cell(ch,type,codepoint,null);
}
static public String hex(int i, int width) {
String result = Long.toString(i & 0xFFFFFFFFL, 16).toUpperCase();
return "00000000".substring(result.length(),width) + result;
}
/**
* Supplies a zero-padded hex representation of an integer (without 0x)
*/
static public String hex(int i) {
return hex(i,8);
}
/**
* Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u)
*/
static public String hex(char i) {
return hex(i,4);
}
/**
* Supplies a zero-padded hex representation of a Unicode String (without 0x, \\u)
*@param sep can be used to give a sequence, e.g. hex("ab", ",") gives "0061,0062"
*/
static public String hex(String s, String sep) {
StringBuffer result = new StringBuffer();
for (int i = 0; i < s.length(); ++i) {
if (i != 0) result.append(sep);
result.append(hex(s.charAt(i)));
}
return result.toString();
}
static public String hex(String s) {
return hex(s," ");
}
public void author(String name, String url) {
el("font").at("size","-3").tx("[").el("a").at("href",url).tx(name).cl("a").el("script").el("!--");
tx("document.write(', ', document.lastModified);");
cl("!--").cl("script").tx("]").cl("font");
}
// ================== PRIVATES =================
PrintWriter out;
boolean isHTML;
ArrayList stack = new ArrayList();
boolean inElement = false;
Normalizer formC = new Normalizer(Normalizer.NFC, "");
int len;
int maxLineLength = 60;
// later, add better line end management, indenting
static final int NONE=0, BEFORE=1, AFTER=2, BOTH=3, FORCE = 4; // chosen for bits!!
final void print(String s) {
print(s,NONE);
}
final void print(char c) {
print(c,NONE);
}
final void print(String s, int doesBreak) {
if ((doesBreak & BEFORE) != 0) tryBreak(s.length(), doesBreak);
len += s.length();
out.print(s);
if ((doesBreak & AFTER) != 0) tryBreak(0, doesBreak);
}
final void print(char c, int doesBreak) {
if ((doesBreak & BEFORE) != 0) tryBreak(1, doesBreak);
++len;
out.print(c);
if ((doesBreak & AFTER) != 0) tryBreak(0, doesBreak);
}
void tryBreak(int toAdd, int doesBreak) {
if ((doesBreak & FORCE) != 0 || (len + toAdd) > maxLineLength) {
out.println();
len = stack.size();
for (int i = 0; i < len; ++i) out.print(' ');
}
}
public String quoted(String source) {
source = formC.normalize(source);
StringBuffer result = new StringBuffer();
for (int i = 0; i < source.length(); ++i) {
char ch = source.charAt(i);
switch(ch) {
case '\'':
if (!isHTML) {
result.append("&apos;");
} else {
result.append(ch);
}
break;
case '\"':
result.append("&quot;");
break;
case '<':
result.append("&lt;");
break;
case '&':
result.append("&amp;");
break;
case '>':
result.append("&gt;");
break;
case '\n': case '\r': case '\t':
result.append(ch);
break;
default: if (ch < ' ' // do surrogates later
|| ch >= '\u007F' && ch <= '\u009F'
|| ch >= '\uD800' && ch <= '\uDFFF'
|| ch >= '\uFFFE') {
result.append('\uFFFD');
} else {
result.append(ch);
}
break;
}
}
return result.toString();
}
}

View File

@ -1,350 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
* $Date: 2006/04/05 22:12:44 $
* $Revision: 1.37 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.Date;
import com.ibm.text.utility.*;
public final class Main implements UCD_Types {
static final String classPrefix = "com.ibm.text.UCD.";
static final String[] CORE_FILES = {
"CaseFolding",
"CompositionExclusions",
"DerivedCoreProperties",
"DerivedNormalizationProps",
"NormalizationTest",
"PropertyAliases",
"PropList",
"Scripts",
"SpecialCasing",
"HangulSyllableType",
"DerivedAge",
"StandardizedVariants",
"HangulSyllableType",
//"OtherDerivedProperties",
};
static final String[] EXTRACTED_FILES = {
"DerivedBidiClass",
"DerivedBinaryProperties",
"DerivedCombiningClass",
"DerivedDecompositionType",
"DerivedEastAsianWidth",
"DerivedGeneralCategory",
"DerivedJoiningGroup",
"DerivedJoiningType",
"DerivedLineBreak",
"DerivedNumericType",
"DerivedNumericValues",
};
static final String[] ALL_FILES = {
"Core", "Extracted"
};
public static void main (String[] args) throws Exception {
System.out.println("*** Start *** " + Default.getDate());
try {
for (int i = 0; i < args.length; ++i) {
long mask = 0;
String arg = args[i];
if (arg.charAt(0) == '#') return; // skip rest of line
Utility.fixDot();
System.out.println();
System.out.println("** Argument: " + args[i] + " ** " + Default.getDate());
// Expand string arguments
if (arg.equalsIgnoreCase("ALL")) {
args = Utility.append(ALL_FILES, Utility.subarray(args, i+1));
i = -1;
continue;
}
if (arg.equalsIgnoreCase("CORE")) {
args = Utility.append(CORE_FILES, Utility.subarray(args, i+1));
i = -1;
continue;
}
if (arg.equalsIgnoreCase("EXTRACTED")) {
args = Utility.append(EXTRACTED_FILES, Utility.subarray(args, i+1));
i = -1;
continue;
}
// make sure the UCD is set up
if (arg.equalsIgnoreCase("version")) {
Default.setUCD(args[++i]);
continue;
}
// Now handle other options
if (arg.equalsIgnoreCase("verify")) {
VerifyUCD.verify();
VerifyUCD.checkCanonicalProperties();
VerifyUCD.CheckCaseFold();
VerifyUCD.checkAgainstUInfo();
} else if (arg.equalsIgnoreCase("build")) ConvertUCD.main(new String[]{Default.ucdVersion()});
else if (arg.equalsIgnoreCase("statistics")) VerifyUCD.statistics();
else if (arg.equalsIgnoreCase("NFSkippable")) NFSkippable.main(null);
else if (arg.equalsIgnoreCase("diffIgnorable")) VerifyUCD.diffIgnorable();
else if (arg.equalsIgnoreCase("generateXML")) VerifyUCD.generateXML();
else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed();
else if (arg.equalsIgnoreCase("onetime")) VerifyUCD.oneTime();
else if (arg.equalsIgnoreCase("verifyNormalizationStability")) VerifyUCD.verifyNormalizationStability();
else if (arg.equalsIgnoreCase("definitionTransliterator")) GenerateHanTransliterator.main(0);
else if (arg.equalsIgnoreCase("romajiTransliterator")) GenerateHanTransliterator.main(1);
else if (arg.equalsIgnoreCase("pinYinTransliterator")) GenerateHanTransliterator.main(2);
else if (arg.equalsIgnoreCase("hanproperties")) GenerateHanTransliterator.readUnihan();
else if (arg.equalsIgnoreCase("fixChineseOverrides")) GenerateHanTransliterator.fixChineseOverrides();
else if (arg.equalsIgnoreCase("compareBlueberry")) VerifyUCD.compareBlueberry();
else if (arg.equalsIgnoreCase("testenum")) SampleEnum.test();
else if (arg.equalsIgnoreCase("quicktest")) QuickTest.test();
else if (arg.equalsIgnoreCase("TernaryStore")) TernaryStore.test();
else if (arg.equalsIgnoreCase("checkBIDI")) VerifyUCD.checkBIDI();
else if (arg.equalsIgnoreCase("Buildnames")) BuildNames.main(null);
else if (arg.equalsIgnoreCase("TestNormalization")) TestNormalization.main(null);
else if (arg.equalsIgnoreCase("binary")) FastBinarySearch.test();
else if (arg.equalsIgnoreCase("GenerateCaseTest")) GenerateCaseTest.main(null);
else if (arg.equalsIgnoreCase("checkDecompFolding")) VerifyUCD.checkDecompFolding();
else if (arg.equalsIgnoreCase("breaktest")) GenerateBreakTest.main(null);
else if (arg.equalsIgnoreCase("checkcollator")) CheckCollator.main(null);
//else if (arg.equalsIgnoreCase("genSplit")) GenerateData.genSplit();
else if (arg.equalsIgnoreCase("iana")) IANANames.testSensitivity();
else if (arg.equalsIgnoreCase("testDerivedProperties")) DerivedProperty.test();
else if (arg.equalsIgnoreCase("checkCase")) VerifyUCD.checkCase();
else if (arg.equalsIgnoreCase("checkCase3")) VerifyUCD.checkCase3();
else if (arg.equalsIgnoreCase("checkCaseLong")) VerifyUCD.checkCase2(true);
else if (arg.equalsIgnoreCase("checkCaseShort")) VerifyUCD.checkCase2(false);
else if (arg.equalsIgnoreCase("checkCanonicalProperties")) VerifyUCD.checkCanonicalProperties();
else if (arg.equalsIgnoreCase("CheckCaseFold")) VerifyUCD.CheckCaseFold();
else if (arg.equalsIgnoreCase("genIDN")) VerifyUCD.genIDN();
else if (arg.equalsIgnoreCase("VerifyIDN")) VerifyUCD.VerifyIDN();
else if (arg.equalsIgnoreCase("NFTest")) VerifyUCD.NFTest();
else if (arg.equalsIgnoreCase("test1")) VerifyUCD.test1();
//else if (arg.equalsIgnoreCase("TrailingZeros")) GenerateData.genTrailingZeros();
else if (arg.equalsIgnoreCase("GenerateThaiBreaks")) GenerateThaiBreaks.main(null);
else if (arg.equalsIgnoreCase("TestData")) TestData.main(new String[]{args[++i]});
else if (arg.equalsIgnoreCase("MakeUnicodeFiles")) MakeUnicodeFiles.main(new String[]{});
//else if (arg.equalsIgnoreCase("checkAgainstUInfo")) checkAgainstUInfo();
else if (arg.equalsIgnoreCase("checkScripts")) VerifyUCD.checkScripts();
else if (arg.equalsIgnoreCase("IdentifierTest")) VerifyUCD.IdentifierTest();
else if (arg.equalsIgnoreCase("BuildNames")) BuildNames.main(null);
else if (arg.equalsIgnoreCase("JavascriptProperties")) WriteJavaScriptInfo.assigned();
else if (arg.equalsIgnoreCase("TestDirectoryIterator")) DirectoryIterator.test();
//else if (arg.equalsIgnoreCase("checkIdentical")) GenerateData.handleIdentical();
else if (arg.equalsIgnoreCase("testnameuniqueness")) TestNameUniqueness.checkNameList();
//else if (arg.equalsIgnoreCase("checkDifferences")) GenerateData.checkDifferences("3.2.0");
else if (arg.equalsIgnoreCase("Compare14652")) Compare14652.main(null);
//else if (arg.equalsIgnoreCase("NormalizationCharts")) ChartGenerator.writeNormalizationCharts();
/*else if (arg.equalsIgnoreCase("writeNormalizerTestSuite"))
GenerateData.writeNormalizerTestSuite("NormalizationTest-3.1.1d1.txt");
*/
// EXTRACTED PROPERTIES
/*
else if (arg.equalsIgnoreCase("DerivedBidiClass")) {
GenerateData.generateVerticalSlice(BIDI_CLASS, BIDI_CLASS+NEXT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/extracted/", "DerivedBidiClass");
} else if (arg.equalsIgnoreCase("DerivedBinaryProperties")) {
GenerateData.generateVerticalSlice(BINARY_PROPERTIES, BINARY_PROPERTIES+1, GenerateData.HEADER_DERIVED,
"DerivedData/extracted/", "DerivedBinaryProperties" );
} else if (arg.equalsIgnoreCase("DerivedCombiningClass")) {
GenerateData.generateVerticalSlice(COMBINING_CLASS, COMBINING_CLASS+NEXT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/extracted/", "DerivedCombiningClass" );
} else if (arg.equalsIgnoreCase("DerivedDecompositionType")) {
GenerateData.generateVerticalSlice(DECOMPOSITION_TYPE, DECOMPOSITION_TYPE+NEXT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/extracted/", "DerivedDecompositionType" );
} else if (arg.equalsIgnoreCase("DerivedEastAsianWidth")) {
GenerateData.generateVerticalSlice(EAST_ASIAN_WIDTH, EAST_ASIAN_WIDTH+NEXT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/extracted/", "DerivedEastAsianWidth" );
} else if (arg.equalsIgnoreCase("DerivedGeneralCategory")) {
GenerateData.generateVerticalSlice(CATEGORY, CATEGORY+NEXT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/extracted/", "DerivedGeneralCategory" );
} else if (arg.equalsIgnoreCase("DerivedJoiningGroup")) {
GenerateData.generateVerticalSlice(JOINING_GROUP, JOINING_GROUP+NEXT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/extracted/", "DerivedJoiningGroup" );
} else if (arg.equalsIgnoreCase("DerivedJoiningType")) {
GenerateData.generateVerticalSlice(JOINING_TYPE, JOINING_TYPE+NEXT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/extracted/", "DerivedJoiningType" );
} else if (arg.equalsIgnoreCase("DerivedLineBreak")) {
GenerateData.generateVerticalSlice(LINE_BREAK, LINE_BREAK+NEXT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/extracted/", "DerivedLineBreak" );
} else if (arg.equalsIgnoreCase("DerivedNumericType")) {
GenerateData.generateVerticalSlice(NUMERIC_TYPE, NUMERIC_TYPE+NEXT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/extracted/", "DerivedNumericType" );
} else if (arg.equalsIgnoreCase("HangulSyllableType")) {
GenerateData.generateVerticalSlice(HANGUL_SYLLABLE_TYPE,HANGUL_SYLLABLE_TYPE+NEXT_ENUM, GenerateData.HEADER_EXTEND,
"DerivedData/", "HangulSyllableType" );
} else if (arg.equalsIgnoreCase("DerivedNumericValues")) {
GenerateData.generateVerticalSlice(LIMIT_ENUM, LIMIT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/extracted/", "DerivedNumericValues" );
}
*/
else if (arg.equalsIgnoreCase("StandardizedVariants")) {
GenerateStandardizedVariants.generate();
// OTHER STANDARD PROPERTIES
} else if (arg.equalsIgnoreCase("CaseFolding")) {
GenerateCaseFolding.makeCaseFold(true);
GenerateCaseFolding.makeCaseFold(false);
} else if (arg.equalsIgnoreCase("SpecialCasing")) {
GenerateCaseFolding.generateSpecialCasing(true);
GenerateCaseFolding.generateSpecialCasing(false);
/* } else if (arg.equalsIgnoreCase("CompositionExclusions")) {
GenerateData.generateCompExclusions();
} else if (arg.equalsIgnoreCase("DerivedAge")) {
GenerateData.generateAge("DerivedData/", "DerivedAge");
} else if (arg.equalsIgnoreCase("backwardsCompat")) {
GenerateData.backwardsCompat("DerivedData/extracted/", "Compatibility_ID_START",
new int[] {ID_Start, ID_Continue_NO_Cf, Mod_ID_Start, Mod_ID_Continue_NO_Cf});
} else if (arg.equalsIgnoreCase("DerivedCoreProperties")) {
GenerateData.generateDerived(DERIVED_CORE, true, GenerateData.HEADER_DERIVED, "DerivedData/", "DerivedCoreProperties");
} else if (arg.equalsIgnoreCase("DerivedNormalizationProps")) {
GenerateData.generateDerived(DERIVED_NORMALIZATION, true, GenerateData.HEADER_DERIVED, "DerivedData/",
"DerivedNormalizationProps" );
} else if (arg.equalsIgnoreCase("NormalizationTest")) {
GenerateData.writeNormalizerTestSuite("DerivedData/", "NormalizationTest");
} else if (arg.equalsIgnoreCase("PropertyAliases")) {
GenerateData.generatePropertyAliases();
} else if (arg.equalsIgnoreCase("PropList")) {
GenerateData.generateVerticalSlice(BINARY_PROPERTIES + White_space, BINARY_PROPERTIES + NEXT_ENUM,
GenerateData.HEADER_EXTEND, "DerivedData/", "PropList");
} else if (arg.equalsIgnoreCase("Scripts")) {
GenerateData.generateVerticalSlice(SCRIPT+1, SCRIPT + NEXT_ENUM,
GenerateData.HEADER_SCRIPTS, "DerivedData/", "Scripts");
// OTHER TESTING
} else if (arg.equalsIgnoreCase("OtherDerivedProperties")) {
//mask = Utility.setBits(0, NFC_Leading, NFC_Resulting);
GenerateData.generateDerived((byte)(ALL & ~DERIVED_CORE & ~DERIVED_NORMALIZATION), false, GenerateData.HEADER_DERIVED, "OtherData/", "OtherDerivedProperties");
} else if (arg.equalsIgnoreCase("AllBinary")) {
GenerateData.generateVerticalSlice(BINARY_PROPERTIES, BINARY_PROPERTIES + NEXT_ENUM,
GenerateData.HEADER_EXTEND, "OtherDerived/", "AllBinary");
} else if (arg.equalsIgnoreCase("DerivedGeneralCategoryTEST")) {
GenerateData.generateVerticalSlice(CATEGORY+29, CATEGORY+32, GenerateData.HEADER_DERIVED,
"DerivedData/", "DerivedGeneralCategory" );
} else if (arg.equalsIgnoreCase("listDifferences")) {
CompareProperties.listDifferences();
} else if (arg.equalsIgnoreCase("partition")) {
CompareProperties.partition();
} else if (arg.equalsIgnoreCase("propertyStatistics")) {
CompareProperties.statistics();
} else if (arg.equalsIgnoreCase("listAccents")) {
GenerateData.listCombiningAccents();
} else if (arg.equalsIgnoreCase("listGreekVowels")) {
GenerateData.listGreekVowels();
} else if (arg.equalsIgnoreCase("listKatakana")) {
GenerateData.listKatakana();
*/
/*
} else if (arg.equalsIgnoreCase("DerivedFullNormalization")) {
mask = Utility.setBits(0, DerivedProperty.GenNFD, DerivedProperty.GenNFKC);
GenerateData.generateDerived(mask, GenerateData.HEADER_DERIVED, "DerivedData/", "DerivedFullNormalization" );
} else if (arg.equalsIgnoreCase("caseignorable")) {
mask = Utility.setBits(0, DerivedProperty.Other_Case_Ignorable, DerivedProperty.Type_i);
GenerateData.generateDerived(mask, GenerateData.HEADER_DERIVED, "OtherData/", "CaseIgnorable" );
} else if (arg.equalsIgnoreCase("nfunsafestart")) {
mask = Utility.setBits(0, NFD_UnsafeStart, NFKC_UnsafeStart);
GenerateData.generateDerived(mask, GenerateData.HEADER_DERIVED, "OtherData/", "NFUnsafeStart");
*/
} else {
CallArgs.call(new String[]{arg}, classPrefix);
}
//checkHoffman("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F");
//checkHoffman("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD");
//GenerateData.generateDerived(Utility.setBits(0, DerivedProperty.PropMath, DerivedProperty.Mod_ID_Continue_NO_Cf),
// GenerateData.HEADER_DERIVED, "DerivedData/", "DerivedPropData2" );
//GenerateData.generateVerticalSlice(SCRIPT, SCRIPT+1, "ScriptCommon" );
//listStrings("LowerCase" , 0,0);
//GenerateData.generateVerticalSlice(0, LIMIT_ENUM, SKIP_SPECIAL, PROPLIST1, "DerivedData/", "DerivedPropData1" );
// AGE stuff
//UCD ucd = UCD.make();
//System.out.println(ucd.getAgeID(0x61));
//System.out.println(ucd.getAgeID(0x2FA1D));
//
}
} finally {
System.out.println("*** Done *** " + Default.getDate());
}
}
}

View File

@ -1,506 +0,0 @@
package com.ibm.text.UCD;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.TransliteratorUtilities;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.dev.test.util.UnicodePropertySource;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.Replaceable;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.ULocale;
import com.ibm.text.utility.Utility;
import com.ibm.text.utility.Utility.Encoding;
public class MakeNamesChart {
// Code point of the names-list entry currently being written, or -1 when no
// entry is pending (finishItem resets it to -1).
static int lastCodePoint = -1;
// Not referenced in the part of the class reviewed here.
static boolean lastCodePointIsOld = false;
// Decomposition type of the pending entry; finishItem prints an alert when it
// is still different from UCD.NONE at the end of the entry.
static int lastDecompType = UCD.NONE;
// File-name prefix of the generated chart pages.
static final String chartPrefix = "c_";
// File-name prefix of the generated names pages.
static final String namePrefix = "n_";
// Code points removed from each chart; assigned in main. The trailing comment
// records the UnicodeSet pattern this set is meant to correspond to.
static UnicodeSet skipChars;// = new UnicodeSet("[[:gc=cn:]-[:noncharactercodepoint:]]");
// Right-to-left characters (bidi classes R and AL); assigned in main.
static UnicodeSet rtl;// = new UnicodeSet("[[:bidiclass=r:][:bidiclass=al:]]");
// White space and default-ignorable code points; assigned in main.
static UnicodeSet usePicture;// = new UnicodeSet("[[:whitespace:][:defaultignorablecodepoint:]]");
// UCD data for version 4.1.0; isNew compares allocation against it.
static UCD ucd41;
/**
 * Generates the names-list chart pages from NamesList.txt of Unicode 5.0.0.
 * For every block delivered by {@code BlockInfo} (except those whose first
 * line starts with {@code @@@}) two files are written to
 * {@code C:/DATA/GEN/charts/namelist/}: a chart page ({@code c_*.html}) with a
 * 16-column grid of the block's characters and a names page
 * ({@code n_*.html}) with the annotated list. Finally {@code mainList.html},
 * an index of all blocks, is written and the collected name differences are
 * printed to standard output.
 *
 * @param args ignored
 */
public static void main(String[] args) throws Exception {
    BlockInfo blockInfo = new BlockInfo("5.0.0", "NamesList.txt");
    // http://www.unicode.org/~book/incoming/kenfiles/U50M051010.lst
    Default.setUCD("5.0.0");
    ucd41 = UCD.make("4.1.0");
    ToolUnicodePropertySource up = ToolUnicodePropertySource.make("5.0.0");

    // Intended set: [[:gc=cn:]-[:noncharactercodepoint:]]. The previous code
    // removed gc=cn from itself, which always produced the empty set, so no
    // unassigned code point was ever skipped.
    skipChars = new UnicodeSet(up.getSet("gc=cn")).removeAll(up.getSet("noncharactercodepoint=true"));
    // [[:bidiclass=r:][:bidiclass=al:]]
    rtl = new UnicodeSet(up.getSet("bidiclass=r")).addAll(up.getSet("bidiclass=al"));
    // [[:whitespace:][:defaultignorablecodepoint:]]
    usePicture = new UnicodeSet(up.getSet("whitespace=true")).addAll(up.getSet("defaultignorablecodepoint=true"));

    List nameList = new ArrayList();            // first line of every block written
    ArrayList lines = new ArrayList();          // lines of the current block
    UnicodeSet collectedCodePoints = new UnicodeSet();
    BitSet nameListNew = new BitSet();          // index into nameList -> block has a new character

    try {
        int limit = Integer.MAX_VALUE;          // lower this to process only the first blocks
        for (int count = 0; count < limit; ++count) {
            if (!blockInfo.next(lines)) break;
            String firstLine = (String)lines.get(0);
            if (firstLine.startsWith("@@@")) continue;
            String[] lineParts = firstLine.split("\t");
            String fileName = lineParts[1] + ".html";
            nameList.add(firstLine);

            System.out.println();
            System.out.println("file: " + chartPrefix + fileName);
            writeChartPage(fileName, lineParts, lines, collectedCodePoints);
            writeNamesPage(fileName, lines, nameList.size() - 1, nameListNew);
        }
    } finally {
        // also release the reader when a block fails
        blockInfo.in.close();
    }

    writeMainIndex(nameList, nameListNew);

    System.out.println("Name differences: Canonical");
    showNameDifferences(hasNameCan, hasNoNameCan);
    System.out.println("Name differences: Compatibility");
    showNameDifferences(hasNameComp, hasNoNameComp);
    System.out.println("Done");
}

/**
 * Writes the chart page of one block: a header table followed by a grid with
 * 16 cells per row, one cell per code point of the block.
 *
 * @param fileName            block file name without prefix
 * @param lineParts           tab-separated fields of the block's first line
 * @param lines               all lines of the block; index 0 is the first line
 * @param collectedCodePoints scratch set, cleared and refilled here
 */
private static void writeChartPage(String fileName, String[] lineParts, List lines,
        UnicodeSet collectedCodePoints) throws Exception {
    PrintWriter out = BagFormatter.openUTF8Writer("C:/DATA/GEN/charts/namelist/", chartPrefix + fileName);
    try {
        out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title>" +
            TransliteratorUtilities.toHTML.transliterate(getHeading(lineParts[2])) +
            "</title><link rel='stylesheet' type='text/css' href='namelist.css'>" +
            "<base target='names'></head><body>");

        // header
        out.println("<table class='headerTable'><tr><td class='headerLeft'>" +
            lineParts[1] +
            " <a href='help.html'>help</a></td><td class='headerCenter'>" +
            getHeading(lineParts[2]) +
            "</td><td class='headerRight'><a href='mainList.html'>index</a> " +
            lineParts[3] +
            "</td></tr></table>");

        if ("Unassigned".equals(lineParts[2])) {
            System.out.println("debug");    // breakpoint anchor
        }

        // first pass through and collect all the code points: every line that
        // is neither a directive ('@') nor an annotation (tab) starts with one
        collectedCodePoints.clear();
        for (int i = 1; i < lines.size(); ++i) {
            String line = (String)lines.get(i);
            int cp1 = line.charAt(0);
            if (cp1 != '@' && cp1 != '\t') {
                int cp = Integer.parseInt(line.split("\t")[0],16);
                collectedCodePoints.add(cp);
            }
        }
        collectedCodePoints.removeAll(skipChars);

        if (collectedCodePoints.size() == 0) {
            out.println("<p align='center'>No Names List</p>");
            return;
        }

        out.println("<div align='center'><table class='chart'><tr>");
        int counter = 0;
        for (UnicodeSetIterator it = new UnicodeSetIterator(collectedCodePoints); it.next();) {
            if ((counter % 16) == 0 && counter != 0) {
                out.println("</tr><tr>");
            }
            String tdclass = "cell";
            if (counter < 16) tdclass = "cellw";    // first row
            if (it.codepoint == 0x242) {
                System.out.println("debug");        // breakpoint anchor
            }
            boolean isNew = isNew(it.codepoint);
            if (isNew) tdclass += "new";
            String hexcp = Utility.hex(it.codepoint, 4);
            String title = "";
            String name = Default.ucd().getName(it.codepoint);
            if (name != null) title = " title='" + TransliteratorUtilities.toHTML.transliterate(name.toLowerCase()) + "'";
            out.println("<td class='" + tdclass + "'"
                + title
                + ">\u00A0"
                + showChar(it.codepoint) + "\u00A0<br><tt><a href='" + namePrefix + fileName + "#"+ hexcp + "'>" +
                hexcp + "</a></tt></td>");
            counter++;
        }
        // With more than one row, pad the last row to 16 cells.
        if (counter > 16) {
            int inLastRow = counter & 0xF;
            if (inLastRow != 0) {
                for (; inLastRow < 16; ++inLastRow) out.println("<td class='cell'>\u00A0</td>");
            }
        }
        // Always close the table; previously a chart with at most 16 cells
        // was left without its closing tags.
        out.println("</tr></table></div>");
    } finally {
        out.close();
    }
}

/**
 * Writes the names page of one block: headings and comments for '@' lines,
 * and a table with one row per character line and per annotation line.
 *
 * @param fileName      block file name without prefix
 * @param lines         all lines of the block; index 0 is the first line
 * @param nameListIndex index of this block in the list of written blocks
 * @param nameListNew   receives a set bit at {@code nameListIndex} when the
 *                      block contains a character for which isNew is true
 * @throws IllegalArgumentException wrapping any failure, with the offending
 *         line in the message
 */
private static void writeNamesPage(String fileName, List lines, int nameListIndex,
        BitSet nameListNew) throws Exception {
    PrintWriter out = BagFormatter.openUTF8Writer("C:/DATA/GEN/charts/namelist/", namePrefix + fileName);
    try {
        out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>" +
            "<link rel='stylesheet' type='text/css' href='namelist.css'></head><body>");

        // now do the characters
        boolean inTable = false;
        for (int i = 1; i < lines.size(); ++i) {
            String line = (String)lines.get(i);
            try {
                if (line.startsWith("@")) {
                    // directive: ends the pending entry and any open table
                    finishItem(out);
                    if (inTable) {
                        out.println("</table>");
                        inTable = false;
                    }
                    if (line.startsWith("@+")) {
                        line = line.substring(2).trim();
                        out.println("<p class='comment'>"
                            + line
                            + "</p>");
                    } else {
                        line = line.substring(1).trim();
                        out.println("<h2>"
                            + line
                            + "</h2>");
                    }
                } else {
                    if (!inTable) {
                        out.println("<table>");
                        inTable = true;
                    }
                    if (line.startsWith("\t")) {
                        // annotation belonging to the pending entry
                        String body = line.trim();
                        char firstChar = body.charAt(0);
                        switch (firstChar) {
                        case '*': body = "\u2022 " + body.substring(2); break;
                        case ':': body = checkCanonical(lastCodePoint, body); break;
                        case '#': body = checkCompatibility(lastCodePoint, body); break;
                        case 'x': body = getOther(body); break;
                        case '=': break;
                        default: throw new IllegalArgumentException("Huh? " + body);
                        }
                        out.println("<tr><td>\u00A0</td><td>\u00A0</td><td>"
                            + maybeNameStyle(showTextConvertingHex(body, firstChar != '='), firstChar == '=')
                            + "</td></tr>");
                    } else {
                        // character line: "HEX<tab>NAME" starts a new entry
                        finishItem(out);
                        String[] parts = line.split("\t");
                        String x = parts[0];
                        lastCodePoint = Integer.parseInt(x,16);
                        boolean lastCodePointIsNew = isNew(lastCodePoint);
                        if (lastCodePointIsNew) nameListNew.set(nameListIndex, true);
                        out.println("<tr><td"
                            + (lastCodePointIsNew ? " class='new'" : "")
                            + "><code><a name='" + x + "'>" + x + "</a></code></td><td>\u00A0"
                            + showChar(lastCodePoint) + "\u00A0</td><td"
                            + (lastCodePointIsNew ? " class='new'" : "") + ">"
                            + nameStyle(showTextConvertingHex(parts[1], false)) + "</td></tr>");
                        lastDecompType = Default.ucd().getDecompositionType(lastCodePoint);
                    }
                }
            } catch (Exception e) {
                throw (IllegalArgumentException) new IllegalArgumentException("Error on line: " + line)
                    .initCause(e);
            }
        }
        finishItem(out);
    } finally {
        out.close();
    }
}

/**
 * Writes mainList.html, a table with one row per written block that links to
 * the block's chart page.
 *
 * @param nameList    first line of every written block
 * @param nameListNew bit i is set when block i contains a new character
 */
private static void writeMainIndex(List nameList, BitSet nameListNew) throws Exception {
    PrintWriter out = BagFormatter.openUTF8Writer("C:/DATA/GEN/charts/namelist/", "mainList.html");
    try {
        out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>" +
            "<title>Main List</title><link rel='stylesheet' type='text/css' href='namelist.css'>" +
            "<base target='chart'></head><body><table>");
        for (int i = 0; i < nameList.size(); ++i) {
            String line = (String) nameList.get(i);
            String[] lineParts = line.split("\t");
            String fileName = lineParts[1] + ".html";
            out.println("<tr><td><code>" + lineParts[1] +
                "</code></td><td"
                + (nameListNew.get(i) ? " class='new'" : "")
                + "><a href='" + chartPrefix + fileName + "'>" + getHeading(lineParts[2]) + "</a></td><td><code>" +
                lineParts[3] +"</code></td></tr>");
        }
        out.println("</table></body></html>");
    } finally {
        out.close();
    }
}
/**
 * Tells whether a code point is allocated in the default UCD version but was
 * not yet allocated in the 4.1.0 data held in {@code ucd41}.
 */
private static boolean isNew(int codepoint) {
    if (!Default.ucd().isAllocated(codepoint)) {
        return false;
    }
    return !ucd41.isAllocated(codepoint);
}
/**
 * Prints, for every key that occurs in both maps, the key and the value each
 * map holds for it (all in hex), followed by the number of such keys. Keys
 * are visited in their natural order. Neither map is modified.
 *
 * @param hasName   String-to-String map
 * @param hasNoName String-to-String map
 */
private static void showNameDifferences(Map hasName, Map hasNoName) {
    Set shared = new TreeSet(hasNoName.keySet());
    shared.retainAll(hasName.keySet());

    Iterator keys = shared.iterator();
    while (keys.hasNext()) {
        String decomp = (String) keys.next();
        System.out.println();
        System.out.println("decomp: " + Utility.hex(decomp));
        System.out.println("Has name in: " + Utility.hex((String)hasName.get(decomp)));
        System.out.println("Has no name in: " + Utility.hex((String)hasNoName.get(decomp)));
    }
    System.out.println("Count: " + shared.size());
}
// Source of confusable data; finishItem calls ti.getConfusables(...).
static TestIdentifiers ti;
// Best-effort initialization: an IOException is only reported on stderr, in
// which case ti stays null.
// NOTE(review): a null ti would make the later ti.getConfusables call fail
// with a NullPointerException -- confirm whether failing fast here is preferable.
static {
try {
ti = new TestIdentifiers("L");
} catch (IOException e) {
// report and continue without confusable data
e.printStackTrace();
}
}
/**
 * Completes the table rows of the pending entry (the one identified by
 * {@code lastCodePoint}) and marks the entry as finished. Does nothing when
 * no entry is pending.
 * <p>
 * In order, rows are offered for the full upper, title, lower and fold case
 * forms and for the NFD and NFKD forms; showForm decides whether a row is
 * actually written. If the NFKD form equals the character itself, one row is
 * written for each confusable whose NFKD form differs from it.
 */
private static void finishItem(PrintWriter out) {
    if (lastCodePoint < 0) {
        return;     // nothing pending
    }
    if (lastDecompType != UCD.NONE) {
        System.out.println("Alert: missing decomp for " + Utility.hex(lastCodePoint));
    }

    final String self = UTF16.valueOf(lastCodePoint);

    // case forms
    final String upperForm = Default.ucd().getCase(self, UCD.FULL, UCD.UPPER);
    final String upper = showForm(out, self, null, null, upperForm, "\u2191");
    final String titleForm = Default.ucd().getCase(self, UCD.FULL, UCD.TITLE);
    showForm(out, self, upper, null, titleForm, "\u2195");
    final String lowerForm = Default.ucd().getCase(self, UCD.FULL, UCD.LOWER);
    final String lower = showForm(out, self, null, null, lowerForm, "\u2193");
    final String foldForm = Default.ucd().getCase(self, UCD.FULL, UCD.FOLD);
    showForm(out, lower, null, null, foldForm, "\u2194");

    // normalization forms
    final String mapping = Default.ucd().getDecompositionMapping(lastCodePoint);
    final String nfdForm = Default.nfd().normalize(lastCodePoint);
    final String nfd = showForm(out, mapping, self, null, nfdForm, "\u21DB");
    final String nfkdForm = Default.nfkd().normalize(lastCodePoint);
    final String nfkd = showForm(out, mapping, self, nfd, nfkdForm, "\u21DD");

    // confusables, only for characters that are their own NFKD form
    if (nfkd.equals(self)) {
        final Set confusables = ti.getConfusables(lastCodePoint, "MA");
        if (confusables.size() > 1) {
            sortedSet.clear();
            Iterator raw = confusables.iterator();
            while (raw.hasNext()) {
                sortedSet.add(Default.nfkd().normalize((String)raw.next()));
            }
            sortedSet.remove(nfkd); // remove me

            Iterator sorted = sortedSet.iterator();
            while (sorted.hasNext()) {
                final String other = (String)sorted.next();
                if (!nfkd.equals(Default.nfkd().normalize(other))) {
                    out.println("<tr><td>\u00A0</td><td>\u00A0</td><td class='conf'>\u279F\u00A0"
                        + showTextConvertingHex(Utility.hex(other, 4, " + "), true)
                        + " "
                        + Default.ucd().getName(other, UCD.NORMAL, " + ").toLowerCase()
                        + "</td></tr>");
                }
            }
        }
    }
    lastCodePoint = -1;
}

/** Scratch set reused by finishItem; orders strings with an English collator. */
static Set sortedSet = new TreeSet(Collator.getInstance(ULocale.ENGLISH));
/**
 * Writes a table row showing {@code transformed}, unless it equals any of
 * {@code str}, {@code str2} or {@code str3} (null entries never match). The
 * row holds the symbol, the hex code points with their characters and, when
 * {@code transformed} is a single code point, its lower-cased name.
 *
 * @return {@code transformed}, whether or not a row was written
 */
private static String showForm(PrintWriter out, String str, String str2, String str3, String transformed, String symbol) {
    final boolean alreadyShown = transformed.equals(str)
        || transformed.equals(str2)
        || transformed.equals(str3);
    if (alreadyShown) {
        return transformed;
    }

    StringBuffer row = new StringBuffer();
    row.append("<tr><td>\u00A0</td><td>\u00A0</td><td class='c'>").append(symbol).append("\u00A0");
    row.append(showTextConvertingHex(Utility.hex(transformed, 4, " + "), true));
    if (UTF16.countCodePoint(transformed) == 1) {
        row.append(" ").append(Default.ucd().getName(transformed, UCD.NORMAL, " + ").toLowerCase());
    }
    row.append("</td></tr>");
    out.println(row.toString());
    return transformed;
}
/**
 * Strips a trailing parenthesised qualifier from a heading.
 *
 * @param name heading text, possibly containing " (" somewhere
 * @return the text before the last occurrence of " (", or {@code name}
 *         unchanged when that sequence does not occur
 */
static public String getHeading(String name) {
    final int qualifierStart = name.lastIndexOf(" (");
    return qualifierStart >= 0 ? name.substring(0, qualifierStart) : name;
}
/**
 * Applies {@code nameStyle} to a string only when requested and only when
 * the string is already entirely upper case (English casing rules).
 *
 * @param string text to examine
 * @param b      whether styling should be attempted at all
 * @return the styled text, or {@code string} itself when no styling applies
 */
private static String maybeNameStyle(String string, boolean b) {
    if (!b) {
        return string;
    }
    final boolean allUpperCase = string.equals(string.toUpperCase(Locale.ENGLISH));
    return allUpperCase ? nameStyle(string) : string;
}
/**
 * Wraps the title-cased form of a string in {@code <i>...</i>}.
 *
 * Title-casing also capitalises the first letter of any escape matched by
 * {@code escapeMatch} (for example "&amp;" becomes "&Amp;"), so each such
 * match is converted back to lower case afterwards.
 *
 * @param string text to style
 * @return the italicised, title-cased text with escapes restored
 */
private static String nameStyle(String string) {
    String result = "<i>" + Default.ucd().getCase(string, UCD.FULL, UCD.TITLE) + "</i>";
    // Walk over every escape and lower-case it in place. The replacement has
    // the same length as the match, so 'position' stays valid after each edit.
    int position = 0;
    while (escapeMatch.reset(result).find(position)) {
        int start = escapeMatch.start();
        position = escapeMatch.end();
        // Fixed locale: the default-locale overload would corrupt names such as
        // "&Iacute;" under a Turkish default locale (I -> dotless i).
        result = result.substring(0, start)
            + result.substring(start, position).toLowerCase(Locale.ENGLISH)
            + result.substring(position);
    }
    return result;
}
// Matches "&", one upper-case ASCII letter, any number of lower-case ASCII
// letters, then ";". Shared mutable Matcher: nameStyle resets it onto its
// own input before every search.
static Matcher escapeMatch = Pattern.compile("\\&[A-Z][a-z]*\\;").matcher("");
/**
 * HTML-escapes a text and, optionally, decorates every run of 4 to 6
 * upper-case hex digits that denotes a valid code point: the run is wrapped
 * in {@code <code>} tags and followed by a no-break space and the rendering
 * of that character as produced by {@code showChar}.
 *
 * @param body         raw text
 * @param addCharToHex whether hex runs should be decorated
 * @return the escaped (and possibly decorated) text
 */
private static String showTextConvertingHex(String body, boolean addCharToHex) {
    body = TransliteratorUtilities.toHTML.transliterate(body);
    if (!addCharToHex) {
        return body;
    }
    for (int scanFrom = 0; scanFrom < body.length(); ) {
        if (!findHex.reset(body).find(scanFrom)) {
            break;
        }
        final int runStart = findHex.start();
        final int runEnd = findHex.end();
        // By default resume right after this run; overwritten below when text is inserted.
        scanFrom = runEnd;
        final int runLength = runEnd - runStart;
        if (runLength < 4 || runLength > 6) {
            continue;
        }
        final int cp = Integer.parseInt(findHex.group(), 16);
        if (cp > 0x10FFFF) {
            continue;
        }
        final String decoration = "\u00A0" + showChar(cp);
        final String head = body.substring(0, runStart)
            + "<code>" + body.substring(runStart, runEnd) + "</code>"
            + decoration;
        body = head + body.substring(runEnd);
        // Skip past the inserted markup so it is never rescanned.
        scanFrom = head.length();
    }
    return body;
}
// Shared matchers; each user resets them onto its own input before matching.
// pointer:  "x (<name> - <HEX>)"  -- group 1 is the name, group 2 the hex code point.
static Matcher pointer = Pattern.compile("x \\((.*) - ([0-9A-F]+)\\)").matcher("");
// pointer2: "x <HEX>" with 4 to 6 hex digits -- group 1 is the hex code point.
static Matcher pointer2 = Pattern.compile("x ([0-9A-F]{4,6})").matcher("");
// findHex: any run of upper-case hex digits; used by showTextConvertingHex.
static Matcher findHex = Pattern.compile("[0-9A-F]+").matcher("");
/**
 * Reformats a cross-reference line.
 *
 * Accepts either "x (hyphenation point - 2027)" or the bare "x 2027" form
 * and returns a right arrow, the code point in hex and, for the first form
 * only, the name given in the line. When a name is given it is compared
 * (ignoring case) with the name reported by the UCD, and a mismatch is
 * reported on standard output.
 *
 * @param body the cross-reference line
 * @return the reformatted line
 * @throws IllegalArgumentException if the line matches neither form
 */
private static String getOther(String body) {
    final boolean hasName = pointer.reset(body).matches();
    if (!hasName && !pointer2.reset(body).matches()) {
        throw new IllegalArgumentException("Bad format: " + body);
    }

    final int cp;
    String name = null;
    if (hasName) {
        cp = Integer.parseInt(pointer.group(2),16);
        name = pointer.group(1);
        String officialName = Default.ucd().getName(cp);
        if (officialName == null) {
            officialName = "<not a character>";
        }
        if (!name.equalsIgnoreCase(officialName)) {
            System.out.println("Mismatch in name for " + body + " in " + Utility.hex(lastCodePoint));
            System.out.println("\tName is: " + officialName);
        }
    } else {
        // Bare form: no name is shown for it.
        cp = Integer.parseInt(pointer2.group(1),16);
    }

    final String nameSuffix = (name == null) ? "" : " " + name;
    return "\u2192 " + Utility.hex(cp,4) + nameSuffix;
}
/**
 * Produces the HTML used to display a single code point.
 *
 * Code points in {@code usePicture} are replaced by a stand-in glyph inside
 * an "inv" span: the matching control picture for values up to 0x20 and for
 * 0x7F, a full block otherwise. Code points of category Cn, Co or Cs are
 * shown as a full block. Everything else is HTML-escaped; Me and Mn marks
 * are prefixed with a dotted circle, and members of {@code rtl} are wrapped
 * in left-to-right marks.
 *
 * @param cp the code point to display
 * @return the HTML fragment
 */
static String showChar(int cp) {
    if (usePicture.contains(cp)) {
        final int standIn;
        if (cp <= 0x20) {
            standIn = 0x2400 + cp;
        } else if (cp == 0x7F) {
            standIn = 0x2421;
        } else {
            standIn = '\u2588';
        }
        return "<span class='inv'>" + (char)standIn + "</span>";
        // Former alternative, kept for reference:
        //String hex = Utility.hex(cp);
        //return "<img alt='" + hex + "' src='http://www.unicode.org/cgi-bin/refglyph?24-" + hex + "'>";
    }

    final int category = Default.ucd().getCategory(cp);
    final boolean noGlyph = category == UCD.Cn || category == UCD.Co || category == UCD.Cs;
    if (noGlyph) {
        return "\u2588";
    }

    final String escaped = TransliteratorUtilities.toHTML.transliterate(UTF16.valueOf(cp));
    if (category == UCD.Me || category == UCD.Mn) {
        return "\u25CC" + escaped;
    }
    if (rtl.contains(cp)) {
        return "\u200E" + escaped + "\u200E";
    }
    return escaped;
}
//static final UnicodeSet noname = new UnicodeSet("[[:ascii:][:ideographic:]]");
// Bookkeeping filled by checkCanonical (…Can) and checkCompatibility (…Comp).
// Key: the decomposition mapping string; value: the annotated code point as a string.
// "hasNoName…" records annotations that matched the bare hex form,
// "hasName…" those that matched the hex form followed by the character name.
static final Map hasNoNameCan = new TreeMap();
static final Map hasNameCan = new TreeMap();
static final Map hasNoNameComp = new TreeMap();
static final Map hasNameComp = new TreeMap();
/**
 * Validates a canonical-decomposition annotation line and reformats it.
 *
 * The first two characters of {@code body} are dropped; the remainder is
 * compared (ignoring case) with the decomposition mapping of
 * {@code lastCodePoint}, written as space-separated hex and, for a
 * single-code-point mapping, optionally followed by the lower-cased name.
 * Disagreements are reported on standard output. {@code lastDecompType} is
 * reset to {@code UCD.NONE} before returning.
 *
 * @param codePoint code point used in diagnostics and recorded in the maps
 * @param body      the annotation line, including its two-character prefix
 * @return an "identical to" sign, a space, and the annotation text
 */
private static String checkCanonical(int codePoint, String body) {
    final String annotation = body.substring(2);
    if (lastDecompType != UCD.CANONICAL) {
        System.out.println("Mismatching Decomposition Type: " + annotation + " in " + Utility.hex(codePoint));
    }

    final String mapping = Default.ucd().getDecompositionMapping(lastCodePoint);
    final String hexOnly = Utility.hex(mapping, 4, " ");
    final String hexWithName = (UTF16.countCodePoint(mapping) == 1)
        ? hexOnly + " " + Default.ucd().getName(mapping).toLowerCase()
        : hexOnly;

    if (hexOnly.equalsIgnoreCase(annotation)) {
        hasNoNameCan.put(mapping, UTF16.valueOf(codePoint));
    } else if (hexWithName.equalsIgnoreCase(annotation)) {
        hasNameCan.put(mapping, UTF16.valueOf(codePoint));
    } else {
        System.out.println("Mismatching Decomposition: " + annotation + " in " + Utility.hex(codePoint));
        System.out.println("\tShould be: " + hexOnly);
    }

    lastDecompType = UCD.NONE;
    return "\u2261 " + annotation;
}
/**
 * Validates a compatibility-decomposition annotation line and reformats it.
 *
 * Works like the canonical check, except that the expected text is prefixed
 * with the decomposition type ID in angle brackets unless the type is
 * {@code UCD.COMPAT_UNSPECIFIED}, and a type at or below
 * {@code UCD.CANONICAL} is reported as a mismatch. {@code lastDecompType}
 * is reset to {@code UCD.NONE} before returning.
 *
 * @param codePoint code point used in diagnostics and recorded in the maps
 * @param body      the annotation line, including its two-character prefix
 * @return an "almost equal to" sign, a space, and the annotation text
 */
private static String checkCompatibility(int codePoint, String body) {
    final String annotation = body.substring(2);
    if (lastDecompType <= UCD.CANONICAL) {
        System.out.println("Mismatching Decomposition Type: " + annotation + " in " + Utility.hex(codePoint));
    }

    final String mapping = Default.ucd().getDecompositionMapping(lastCodePoint);
    String expected = Utility.hex(mapping, 4, " ");
    if (lastDecompType != UCD.COMPAT_UNSPECIFIED) {
        final String typeId = Default.ucd().getDecompositionTypeID(lastCodePoint);
        expected = "<" + typeId + "> " + expected;
    }
    final String expectedWithName = (UTF16.countCodePoint(mapping) == 1)
        ? expected + " " + Default.ucd().getName(mapping).toLowerCase()
        : expected;

    if (expected.equalsIgnoreCase(annotation)) {
        hasNoNameComp.put(mapping, UTF16.valueOf(codePoint));
    } else if (expectedWithName.equalsIgnoreCase(annotation)) {
        hasNameComp.put(mapping, UTF16.valueOf(codePoint));
    } else {
        System.out.println("Mismatching Decomposition: " + annotation + " in " + Utility.hex(codePoint));
        System.out.println("\tShould be: " + expected);
    }

    lastDecompType = UCD.NONE;
    return "\u2248 " + annotation;
}
/**
 * Reads a Unicode data file in groups of lines, where every line that
 * starts with "@@" followed by a tab opens a new group.
 */
static class BlockInfo {
    BufferedReader in;
    /** Group-opening line read ahead by the previous call to next(), if any. */
    String lastLine;

    /**
     * Opens the named file for the given version through
     * {@code Utility.openUnicodeFile}.
     */
    BlockInfo (String version, String filename) throws IOException {
        in = Utility.openUnicodeFile(filename, version, true, Utility.LATIN1_WINDOWS);
        // Former alternative: in = BagFormatter.openUTF8Reader(dir, filename);
    }

    /**
     * Replaces the contents of {@code inout} with the next group of lines.
     *
     * @param inout list that is cleared and then filled with the group
     * @return true if at least one line was delivered
     */
    boolean next(List inout) throws IOException {
        inout.clear();
        if (lastLine != null) {
            // The opener of this group was consumed while finishing the previous one.
            inout.add(lastLine);
            lastLine = null;
        }
        for (String line = in.readLine(); line != null; line = in.readLine()) {
            if (line.startsWith("@@\t")) {
                // Opener of the following group: remember it and stop here.
                lastLine = line;
                break;
            }
            inout.add(line);
        }
        return !inout.isEmpty();
    }
}
}

File diff suppressed because it is too large Load Diff

View File

@ -1,405 +0,0 @@
Generate: .*BreakTest.*
DeltaVersion: 17
CopyrightYear: 2006
File: auxiliary/GraphemeBreakProperty
Property: Grapheme_Cluster_Break
Format: skipValue=Other
File: auxiliary/WordBreakProperty
Property: Word_Break
Format: skipValue=Other
File: auxiliary/SentenceBreakProperty
Property: Sentence_Break
Format: skipValue=Other
File: auxiliary/GraphemeBreakTest
Property: SPECIAL
File: auxiliary/WordBreakTest
Property: SPECIAL
File: auxiliary/LineBreakTest
Property: SPECIAL
File: auxiliary/SentenceBreakTest
Property: SPECIAL
File: Blocks
Property: Block
# Note: When comparing block names, casing, whitespace, hyphens,
# and underbars are ignored.
# For example, "Latin Extended-A" and "latin extended a" are equivalent.
# For more information on the comparison of property values,
# see UCD.html.
Format: valueList skipUnassigned=No_Block
File: CaseFolding
Property: SPECIAL
File: DerivedAge
Property: Age
Format: nameStyle=none noLabel skipValue=unassigned
Value: 1.1
# Assigned as of Unicode 1.1.0 (June, 1993)
# [excluding removed Hangul Syllables]
Value: 2.0
# Newly assigned in Unicode 2.0.0 (July, 1996)
Value: 2.1
# Newly assigned in Unicode 2.1.2 (May, 1998)
Value: 3.0
# Newly assigned in Unicode 3.0.0 (September, 1999)
Value: 3.1
# Newly assigned in Unicode 3.1.0 (March, 2001)
Value: 3.2
# Newly assigned in Unicode 3.2.0 (March, 2002)
Value: 4.0
# Newly assigned in Unicode 4.0.0 (April, 2003)
Value: 4.1
# Newly assigned in Unicode 4.1.0 (March, 2005)
Value: 5.0
# Newly assigned in Unicode 5.0.0 (July, 2006)
File: extracted/DerivedBidiClass
Property: Bidi_Class
# Bidi Class (listing UnicodeData.txt, field 4: see UCD.html)
# Unlike other properties, unassigned code points in blocks
# reserved for right-to-left scripts are given either types R or AL.
# The unassigned characters that default to R are:
# Hebrew, Cypriot_Syllabary, Kharoshthi, and the ranges \u07C0-\u08FF
# \uFB1D-\uFB4F \U00010840-\U000109FF \U00010A60-\U00010FFF
# The unassigned characters that default to AL are:
# Arabic, Syriac, Arabic_Supplement, Thaana, Arabic_Presentation_Forms_A,
# Arabic_Presentation_Forms_B, minus the Noncharacter_Code_Points
# For all other cases:
Format: valueStyle=short skipUnassigned=Left_To_Right
File: extracted/DerivedBinaryProperties
Property: Bidi_Mirrored
# Bidi_Mirrored (listing UnicodeData.txt, field 9: see UCD.html)
File: extracted/DerivedCombiningClass
Property: Canonical_Combining_Class
# Combining Class (listing UnicodeData.txt, field 3: see UCD.html)
Format: nameStyle=none valueStyle=short skipUnassigned=Not_Reordered
File: DerivedCoreProperties
Property: Math
# Derived Property: Math
# Generated from: Sm + Other_Math
Property: Alphabetic
# Derived Property: Alphabetic
# Generated from: Lu+Ll+Lt+Lm+Lo+Nl + Other_Alphabetic
Property: Lowercase
# Derived Property: Lowercase
# Generated from: Ll + Other_Lowercase
Property: Uppercase
# Derived Property: Uppercase
# Generated from: Lu + Other_Uppercase
Property: ID_Start
# Derived Property: ID_Start
# Characters that can start an identifier.
# Generated from Lu+Ll+Lt+Lm+Lo+Nl+Other_ID_Start
# NOTE: See UAX #31 for more information
Property: ID_Continue
# Derived Property: ID_Continue
# Characters that can continue an identifier.
# Generated from: ID_Start + Mn+Mc+Nd+Pc + Other_ID_Continue
# NOTE: See UAX #31 for more information
Property: XID_Start
# Derived Property: XID_Start
# ID_Start modified for closure under NFKx
# Modified as described in UAX #15
# NOTE: Does NOT remove the non-NFKx characters.
# Merely ensures that if isIdentifier(string) then isIdentifier(NFKx(string))
# NOTE: See UAX #31 for more information
Property: XID_Continue
# Derived Property: XID_Continue
# Mod_ID_Continue modified for closure under NFKx
# Modified as described in UAX #15
# NOTE: Cf characters should be filtered out.
# NOTE: Does NOT remove the non-NFKx characters.
# Merely ensures that if isIdentifier(string) then isIdentifier(NFKx(string))
# NOTE: See UAX #31 for more information
Property: Default_Ignorable_Code_Point
# Derived Property: Default_Ignorable_Code_Point
# Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs + Noncharacters
# - White_Space - FFF9..FFFB (Annotation Characters)
Property: Grapheme_Extend
# Derived Property: Grapheme_Extend
# Generated from: Me + Mn + Other_Grapheme_Extend
# Note: depending on an application's interpretation of Co (private use),
# they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither.
Property: Grapheme_Base
# Derived Property: Grapheme_Base
# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend
# Note: depending on an application's interpretation of Co (private use),
# they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither.
Property: Grapheme_Link
# Derived Property: Grapheme_Link (deprecated)
# Generated from: Canonical_Combining_Class=Virama
# Use Canonical_Combining_Class=Virama directly instead
File: extracted/DerivedDecompositionType
Property: Decomposition_Type
Format: skipValue=None
# Decomposition_Type (from UnicodeData.txt, field 5: see UCD.html)
File: extracted/DerivedEastAsianWidth
Property: East_Asian_Width
Format: valueStyle=short skipUnassigned=Neutral
# East_Asian_Width (listing EastAsianWidth.txt, field 1)
File: extracted/DerivedGeneralCategory
Property: General_Category
Format: valueStyle=short noLabel
File: extracted/DerivedJoiningGroup
Property: Joining_Group
# Joining Group (listing ArabicShaping.txt, field 3)
Format: skipValue=No_Joining_Group
File: extracted/DerivedJoiningType
Property: Joining_Type
# Type T is derived, as described in ArabicShaping.txt
Format: valueStyle=short skipValue=Non_Joining
File: extracted/DerivedLineBreak
Property: Line_Break
Format: valueStyle=short skipUnassigned=Unknown
File: DerivedNormalizationProps
Property: FC_NFKC_Closure
# Derived Property: FC_NFKC_Closure
# Generated from computing: b = NFKC(Fold(a)); c = NFKC(Fold(b));
# Then if (c != b) add the mapping from a to c to the set of
# mappings that constitute the FC_NFKC_Closure list
# Uses the full case folding from CaseFolding.txt, without the T option.
Format: nameStyle=short
Property: Full_Composition_Exclusion
# Derived Property: Full_Composition_Exclusion
# Generated from: Composition Exclusions + Singletons + Non-Starter Decompositions
Property: NFD_QuickCheck
# Derived Property: NFD_QuickCheck
# Generated from computing decomposibles
Format: nameStyle=short valueStyle=short skipValue=Yes
Property: NFC_QuickCheck
# Derived Property: NFC_QuickCheck
# Generated from computing decomposibles (and characters that may compose with previous ones)
Format: nameStyle=short valueStyle=short skipValue=Yes
Property: NFKD_QuickCheck
# Derived Property: NFKD_QuickCheck
# Generated from computing decomposibles
Format: nameStyle=short valueStyle=short skipValue=Yes
Property: NFKC_QuickCheck
# Derived Property: NFKC_QuickCheck
# Generated from computing decomposibles (and characters that may compose with previous ones)
Format: nameStyle=short valueStyle=short skipValue=Yes
Property: Expands_On_NFD
# Derived Property: Expands_On_NFD
# Generated according to UAX #15.
# Characters whose normalized length is not one.
# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact.
# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!
Property: Expands_On_NFC
# Derived Property: Expands_On_NFC
# Generated according to UAX #15.
# Characters whose normalized length is not one.
# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact.
# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!
Property: Expands_On_NFKD
# Derived Property: Expands_On_NFKD
# Generated according to UAX #15.
# Characters whose normalized length is not one.
# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact.
# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!
Property: Expands_On_NFKC
# Derived Property: Expands_On_NFKC
# Generated according to UAX #15.
# Characters whose normalized length is not one.
# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact.
# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!
File: extracted/DerivedNumericType
Property: Numeric_Type
# Numeric Type (from UnicodeData.txt, field 6/7/8 plus Unihan.txt: see UCD.html)
Format: skipValue=None
File: extracted/DerivedNumericValues
Property: Numeric_Value
# Numeric Values (from UnicodeData.txt, field 6/7/8)
# WARNING: Certain values, such as 0.16666667, are repeating fractions
# Although they are only printed with a limited number of decimal places
# in this file, they should be expressed to the limits of the precision
# available when used.
Format: sortNumeric
File: HangulSyllableType
Property: Hangul_Syllable_Type
Format: valueStyle=short skipValue=Not_Applicable
File: NormalizationTest
Property: SPECIAL
File: PropList
Property: White_Space
Property: Bidi_Control
Property: Join_Control
Property: Dash
Property: Hyphen
Property: Quotation_Mark
Property: Terminal_Punctuation
Property: Other_Math
Property: Hex_Digit
Property: ASCII_Hex_Digit
Property: Other_Alphabetic
Property: Ideographic
Property: Diacritic
Property: Extender
Property: Other_Lowercase
Property: Other_Uppercase
Property: Noncharacter_Code_Point
Property: Other_Grapheme_Extend
Property: IDS_Binary_Operator
Property: IDS_Trinary_Operator
Property: Radical
Property: Unified_Ideograph
Property: Other_Default_Ignorable_Code_Point
Property: Deprecated
Property: Soft_Dotted
Property: Logical_Order_Exception
Property: Other_ID_Start
Property: Other_ID_Continue
Property: STerm
Property: Variation_Selector
Property: Pattern_White_Space
Property: Pattern_Syntax
File: PropertyAliases
Property: SPECIAL
File: PropertyValueAliases
Property: SPECIAL
File: Scripts
Property: Script
Format: nameStyle=none skipValue=Unknown
File: SpecialCasing
Property: SPECIAL
File: StandardizedVariants
Property: SPECIAL
File: NamedSequences
Property: SPECIAL
HackName: noBreak
HackName: Arabic_Presentation_Forms-A
HackName: Arabic_Presentation_Forms-B
HackName: CJK_Symbols_and_Punctuation
HackName: Combining_Diacritical_Marks_for_Symbols
HackName: Enclosed_CJK_Letters_and_Months
HackName: Greek_and_Coptic
HackName: Halfwidth_and_Fullwidth_Forms
HackName: Latin-1_Supplement
HackName: Latin_Extended-A
HackName: Latin_Extended-B
HackName: Miscellaneous_Mathematical_Symbols-A
HackName: Miscellaneous_Mathematical_Symbols-B
HackName: Miscellaneous_Symbols_and_Arrows
HackName: Superscripts_and_Subscripts
HackName: Supplemental_Arrows-A
HackName: Supplemental_Arrows-B
HackName: Supplementary_Private_Use_Area-A
HackName: Supplementary_Private_Use_Area-B
HackName: Canadian-Aboriginal
#HackName: Old-Italic
FinalComments
Note that PropertyAliases sorts by the long name, while PropertyValueAliases
sorts by the short name
ArabicShaping
BidiMirroring
CompositionExclusions
EastAsianWidth
LineBreak
StandardizedVariants
UnicodeData

View File

@ -1,50 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MyFloatLister.java,v $
* $Date: 2004/03/11 19:03:17 $
* $Revision: 1.6 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.*;
import java.text.NumberFormat;
import java.util.Locale;
/**
 * Property lister that selects the code points whose numeric value equals
 * one given value.
 */
class MyFloatLister extends PropertyLister {
    /** The numeric value a code point must have to be included. */
    private final double propMask;
    /** Formats numeric values without grouping, with 1 to 8 fraction digits. */
    NumberFormat nf = NumberFormat.getNumberInstance(Locale.US);

    /**
     * @param ucd    source of character data
     * @param f      the numeric value to list
     * @param output destination of the listing
     */
    public MyFloatLister(UCD ucd, double f, PrintWriter output) {
        this.propMask = f;
        this.output = output;
        this.ucdData = ucd;
        nf.setGroupingUsed(false);
        nf.setMaximumFractionDigits(8);
        nf.setMinimumFractionDigits(1);
    }

    /** Returns the formatted numeric value of the code point. */
    public String valueName(int cp) {
        return nf.format(ucdData.getNumericValue(cp));
    }

    /** Returns the numeric type ID of the code point. */
    public String optionalName(int cp) {
        return ucdData.getNumericTypeID(cp);
    }

    /**
     * Classifies a code point: CONTINUE for category Cn, INCLUDE when its
     * numeric value equals the requested one, EXCLUDE otherwise.
     *
     * A branch guarded by a constant {@code false} used to precede these
     * checks; it could never execute and has been removed.
     */
    public byte status(int cp) {
        if (ucdData.getCategory(cp) == Cn) return PropertyLister.CONTINUE;
        // Exact comparison is intended: both sides come from the same data source.
        return ucdData.getNumericValue(cp) == propMask ? INCLUDE : EXCLUDE;
    }
}

View File

@ -1,123 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MyPropertyLister.java,v $
* $Date: 2004/02/18 03:08:59 $
* $Revision: 1.12 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.*;
import com.ibm.text.utility.*;
/**
 * Property lister driven by a single property mask, resolved to a
 * UCDProperty through {@code UnifiedBinaryProperty.make}.
 */
final class MyPropertyLister extends PropertyLister {

    static final boolean BRIDGE = false;

    private int propMask;
    private boolean isDefaultValue = false;
    private UCDProperty up;

    /**
     * @param ucd      source of character data
     * @param propMask mask identifying the property (and value) to list
     * @param output   destination of the listing
     */
    public MyPropertyLister(UCD ucd, int propMask, PrintWriter output) {
        this.propMask = propMask;
        this.output = output;
        this.ucdData = ucd;
        up = UnifiedBinaryProperty.make(propMask, ucd);
        if (propMask < COMBINING_CLASS) {
            usePropertyComment = false; // skip gen cat
        }
        isDefaultValue = up.isDefaultValue();
    }

    /**
     * Returns a "# name" header for combining-class masks and an empty
     * string for every other kind of mask.
     */
    public String headerString() {
        if ((propMask & 0xFF00) != COMBINING_CLASS) {
            return "";
            /* Formerly, for the remaining kinds:
            String shortID = up.getName(SHORT);
            String longID = up.getName(LONG);
            return "# ???? " + shortID + (shortID.equals(longID) ? "" : "\t(" + longID + ")");
            */
        }
        String className = UCD.getCombiningClassID_fromIndex((short)(propMask & 0xFF), LONG);
        // A name that starts with a digit (or anything sorting before '9') is replaced by a generic label.
        if (className.charAt(0) <= '9') {
            className = "Other Combining Class";
        }
        return "# " + className;
    }

    /** Returns the property name for binary properties, else the value for the code point. */
    public String valueName(int cp) {
        return up.getValueType() == BINARY_PROP ? up.getName() : up.getValue(cp);
    }

    // NOTE(review): passes NORMAL where valueName passes a code point -- confirm the intended overload.
    public String missingValueName() {
        return up.getValue(NORMAL);
    }

    /**
     * Returns the general category ID used as a trailing comment ("L&" for
     * Lt, Ll and Lu), or an empty string for masks below COMBINING_CLASS.
     */
    public String optionalComment(int cp) {
        if (propMask < COMBINING_CLASS) {
            return ""; // skip gen cat
        }
        final int category = ucdData.getCategory(cp);
        final boolean casedLetter = category == Lt || category == Ll || category == Lu;
        return casedLetter ? "L&" : ucdData.getCategoryID(cp);
    }

    /*
    public String optionalName(int cp) {
        if ((propMask & 0xFF00) == DECOMPOSITION_TYPE) {
            return Utility.hex(ucdData.getDecompositionMapping(cp));
        } else {
            return "";
        }
    }
    */

    /**
     * Classifies a code point as INCLUDE or EXCLUDE for this property.
     *
     * Category-Cn code points are excluded up front (or bridged with
     * CONTINUE when BRIDGE is set) if the property reports a default value,
     * except for the Noncharacter_Code_Point,
     * Other_Default_Ignorable_Code_Point and category-Cn masks.
     */
    public byte status(int cp) {
        //if (cp == 0xFFFF) {
        //    System.out.println("# " + Utility.hex(cp));
        //}
        final byte category = ucdData.getCategory(cp);
        //if (cp == 0x0385) {
        //    System.out.println(Utility.hex(firstRealCp));
        //}
        final boolean defaultedUnassigned = isDefaultValue
            && category == Cn
            && propMask != (BINARY_PROPERTIES | Noncharacter_Code_Point)
            && propMask != (BINARY_PROPERTIES | Other_Default_Ignorable_Code_Point)
            && propMask != (CATEGORY | Cn);
        if (defaultedUnassigned) {
            return BRIDGE ? CONTINUE : EXCLUDE;
        }

        /* Disabled script override for the mathematical alphanumerics:
        if (cp >= 0x1D400 && cp <= 0x1D7C9 && cat != Cn) {
            if (propMask == (SCRIPT | LATIN_SCRIPT)) inSet = cp <= 0x1D6A3;
            else if (propMask == (SCRIPT | GREEK_SCRIPT)) inSet = cp > 0x1D6A3;
        }
        */
        /* HACK
        1D400;MATHEMATICAL BOLD CAPITAL A;Lu;0;L;<font> 0041;;;;N;;;;;
        1D6A3;MATHEMATICAL MONOSPACE SMALL Z;Ll;0;L;<font> 007A;;;;N;;;;;
        1D6A8;MATHEMATICAL BOLD CAPITAL ALPHA;Lu;0;L;<font> 0391;;;;N;;;;;
        1D7C9;MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL;Ll;0;L;<font> 03D6;;;;N;;;;;
        */
        return up.hasValue(cp) ? INCLUDE : EXCLUDE;
    }
}

View File

@ -1,20 +0,0 @@
package com.ibm.text.UCD;
/**
 * Placeholder for a "skippable character" test; nothing is implemented yet.
 *
 * The intended rule is to find all the characters that are
 * <ul>
 * <li>a) not decomposed by this normalization form, and</li>
 * <li>b) of combining class 0,</li>
 * </ul>
 * and additionally, for NFC or NFKC,
 * <ul>
 * <li>c) can never compose with a previous character,</li>
 * <li>d) can never compose with a following character,</li>
 * <li>e) can never change if another character is added.</li>
 * </ul>
 * Example: a-breve might satisfy a-d, but if you add an ogonek it changes
 * to a-ogonek + breve.
 */
public class NFCSkippable {

    /**
     * Reports whether the code point is skippable.
     *
     * @param cp the code point to test
     * @return currently always {@code false}
     */
    public boolean is(int cp) {
        final boolean skippable = false;
        return skippable;
    }

    /** Entry point; currently does nothing. */
    public static void main (String[] args) {
        // intentionally empty
    }
}

View File

@ -1,301 +0,0 @@
package com.ibm.text.UCD;
import com.ibm.icu.impl.CollectionUtilities;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;
import java.util.BitSet;
import com.ibm.text.utility.*;
import java.io.PrintWriter;
/**
 * Derived property "&lt;form&gt;_Skippable": the characters that do not interact
 * with any others under a given normalization form. Also contains the
 * generator (see {@link #main}) that writes the unsafe-start and skippable
 * sets to NFSafeSets.txt.
 */
public final class NFSkippable extends UCDProperty {
    static final boolean DEBUG = false;

    private Normalizer nf;
    private Normalizer nfd;
    private UCD ucd;
    /** True when the normalization form composes (NFC or NFKC). */
    private boolean composes;
    /** Trailing characters of the form, excluding non-lead Jamo; grown on demand. */
    private int[] realTrailers = new int[100];
    private int realTrailerCount = 0;

    /** When DEBUG is set, the reason for the most recent hasValue() decision. */
    String cause = "";

    /**
     * Builds the property for one normalization form.
     *
     * @param normalizerMode one of the Normalizer mode constants
     * @param inputUCD       character data to work from
     */
    public NFSkippable(byte normalizerMode, UCD inputUCD) {
        isStandard = false;
        this.ucd = inputUCD;
        nf = new Normalizer(normalizerMode, ucd.getVersion());
        name = nf.getName() + "_Skippable";
        shortName = nf.getName() + "_Skip";
        header = "# Derived Property: " + name
            + "\r\n# Generated according to UAX #15."
            + "\r\n# Characters that don't interact with any others in this normalization form."
            + "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
            + "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";

        nfd = new Normalizer(Normalizer.NFD, ucd.getVersion());
        composes = normalizerMode == Normalizer.NFC || normalizerMode == Normalizer.NFKC;

        // preprocess to find possible trailers
        if (composes) {
            for (int cp2 = 0; cp2 <= 0x10FFFF; ++cp2) {
                if (!nf.isTrailing(cp2)) continue;
                //System.out.println("Trailing: " + ucd.getCodeAndName(cp2));
                if (ucd.isNonLeadJamo(cp2)) {
                    //System.out.println("Jamo: " + ucd.getCodeAndName(cp2));
                    continue;
                }
                // The table used to be a fixed 100 entries and would overflow
                // with an ArrayIndexOutOfBoundsException beyond that; grow it instead.
                if (realTrailerCount == realTrailers.length) {
                    int[] grown = new int[realTrailers.length * 2];
                    System.arraycopy(realTrailers, 0, grown, 0, realTrailerCount);
                    realTrailers = grown;
                }
                realTrailers[realTrailerCount++] = cp2;
            }
        }
        Utility.fixDot();
        //System.out.println("trailer count: " + realTrailerCount);
    }

    /** A skippable character is<br>
     * a) unassigned, or ALL of the following:<br>
     * b) of combining class 0.<br>
     * c) not decomposed by this normalization form.<br>
     * AND if NFC or NFKC, <br>
     * d) can never compose with a previous character.<br>
     * e) can never compose with a following character.<br>
     * f) can never change if another character is added.
     * Example: a-breve might satisfy all but f, but if you
     * add an ogonek it changes to a-ogonek + breve
     *
     * @param cp the code point to test
     * @return true if the code point is skippable for this form
     */
    public boolean hasValue(int cp) {
        // quick check on some special classes
        if (DEBUG) cause = "\t\tunassigned";
        if (!ucd.isAssigned(cp)) return true;

        if (DEBUG) cause = "\t\tnf differs";
        if (!nf.isNormalized(cp)) return false;

        if (DEBUG) cause = "\t\tnon-zero cc";
        if (ucd.getCombiningClass(cp) != 0) return false;

        if (DEBUG) cause = "";
        if (!composes) return true;

        // now special checks for composing normalizers
        if (DEBUG) cause = "\t\tleading";
        if (nf.isLeading(cp)) return false;

        if (DEBUG) cause = "\t\ttrailing";
        if (nf.isTrailing(cp)) return false;

        // OPTIMIZATION -- careful
        // If there is no NFD decomposition, then this character's accents can't be
        // "displaced", so we don't have to test further
        if (DEBUG) cause = "\t\tno decomp";
        if (nfd.isNormalized(cp)) return true;

        // OPTIMIZATION -- careful
        // Hangul syllables are skippable IFF they are not isLeadingJamoComposition
        if (ucd.isHangulSyllable(cp)) return !ucd.isLeadingJamoComposition(cp);

        // We now see if adding another character causes a problem.
        // brute force for now!!
        // We do skip the trailing Jamo, since those never displace!
        StringBuffer base = new StringBuffer(UTF16.valueOf(cp));
        int baseLen = base.length();
        for (int i = 0; i < realTrailerCount; ++i) {
            base.setLength(baseLen); // shorten if needed
            base.append(UTF16.valueOf(realTrailers[i]));
            String probe = base.toString();
            String result = nf.normalize(probe);
            if (!result.equals(probe)) {
                if (DEBUG) cause = "\t\tinteracts with " + ucd.getCodeAndName(realTrailers[i]);
                return false;
            }
        }

        // passed the sieve, so we are ok
        if (DEBUG) cause = "";
        return true;
    }

    // both the following should go into UTF16

    /**
     * Replaces every occurrence of one code point by another.
     *
     * When both values are at most 0xFFFF the replacement is done per UTF-16
     * code unit; otherwise both are converted to strings and the string
     * overload is used.
     */
    public static String replace(String source, int toReplace, int replacement) {
        if (0 <= toReplace && toReplace <= 0xFFFF
                && 0 <= replacement && replacement <= 0xFFFF) {
            return source.replace((char)toReplace, (char)replacement);
        }
        return replace(source, UTF16.valueOf(toReplace), UTF16.valueOf(replacement));
    }

    /**
     * Replaces every occurrence of {@code toReplace} in {@code source}.
     *
     * @return the new string, or {@code source} itself when nothing matched
     *         or when {@code toReplace} is empty
     */
    public static String replace(String source, String toReplace, String replacement) {
        // An empty pattern matches at every position without consuming anything,
        // which made the loop below append forever; treat it as "nothing to replace".
        if (toReplace.length() == 0) return source;

        int pos = 0;
        StringBuffer result = new StringBuffer(source.length());
        while (true) {
            int newPos = source.indexOf(toReplace, pos);
            if (newPos >= 0) {
                result.append(source.substring(pos, newPos));
                result.append(replacement);
                pos = newPos + toReplace.length();
            } else if (pos != 0) {
                result.append(source.substring(pos));
                return result.toString();
            } else {
                return source; // no change necessary
            }
        }
    }

    /**
     * Writes a long string as a Java-style concatenation of quoted pieces of
     * at most 64 characters, one per line, the last one followed by
     * {@code term}.
     *
     * A piece that is followed by more text is shortened so that it does not
     * end within five characters after a backslash, nor directly after one,
     * which keeps backslash escapes in one piece. The final piece is written
     * as is: backing up there either threw (short text without any backslash,
     * such as "[]") or never terminated (backslash near the end).
     */
    static void writeStringInPieces(PrintWriter pw, String s, String term) {
        final int lineLen = 64;
        final int length = s.length();
        int start = 0;
        while (true) {
            if (start == 0) pw.print("\t \"");
            else pw.print("\t+ \"");

            int end = Math.min(length, start + lineLen);
            if (end < length) {
                // if we have a slash in the last 5 characters, backup
                int lastSlash = s.lastIndexOf('\\', end);
                if (lastSlash >= end-5) end = lastSlash;
                // backup if we broke on a \
                while (end > start && s.charAt(end-1) == '\\') --end;
                // Degenerate input (a piece made of backslashes only):
                // split at the plain limit so that the loop always advances.
                if (end <= start) end = start + lineLen;
            }

            pw.print(s.substring(start, end));
            if (end == length) {
                pw.println('"' + term);
                break;
            }
            pw.println('"');
            start = end;
        }
    }

    /** Manual smoke test: prints a sample pattern, plain and with doubled backslashes. */
    static void testWriteStringInPieces() {
        String test =
            "[^\\u00C0-\\u00C5\\u00C7-\\u00CF\\u00D1-\\u00D6\\u00D9-\\u00DD"
            + "\\u00E0-\\u00E5\\u00E7-\\u00EF\\u00F1-\\u00F6\\u00F9-\\u00FD\\u00F"
            + "F-\\u010F\\u0112-\\u0125\\u0128-\\u0130\\u0134-\\u0137\\u0139-"
            + "\\u013E\\u0143-\\u0148\\u014C-\\u0151\\u0154-\\u0165\\u0168-\\u017"
            + "E\\u01A0-\\u01A1\\u01AF-\\u01B0\\u01CD-\\u01DC\\u01DE-\\u01E3\\u"
            + "01E6-\\u01F0\\u01F4-\\u01F5\\u01F8-\\u021B\\u021E-\\u021F\\u0226";
        PrintWriter pw = new PrintWriter(System.out);
        writeStringInPieces(pw,test,"");
        writeStringInPieces(pw,replace(test, "\\", "\\\\"),"");
        pw.flush();
    }

    static int limit = 0x10FFFF; // full version = 10ffff, for testing may use smaller

    /**
     * Generates NFSafeSets.txt: the UNSAFE sets for the modes from
     * NFD_UnsafeStart to NFKC_UnsafeStart, then the SKIPPABLE sets for the
     * forms from NFD to NFKC.
     */
    public static void main (String[] args) throws java.io.IOException {
        PrintWriter out = Utility.openPrintWriter("NFSafeSets.txt", Utility.UTF8_WINDOWS);
        try {
            out.println(Utility.BOM);
            out.println("NFSafeSets");
            out.println("Version: " + Default.ucd().getVersion());
            out.println("Date: " + Default.getDate());
            out.println();

            for (int mode = NFD_UnsafeStart; mode <= NFKC_UnsafeStart; ++mode) {
                UCDProperty up = DerivedProperty.make(mode, Default.ucd());
                generateSet(out, "UNSAFE[" + Normalizer.getName((byte)(mode-NFD_UnsafeStart)) + "]", up);
            }

            for (byte mode = NFD; mode <= NFKC; ++mode) {
                NFSkippable skipper = new NFSkippable(mode, Default.ucd());
                generateSet(out, "SKIPPABLE[" + Normalizer.getName(mode) + "]", skipper);
            }

            System.out.println("Done");
        } finally {
            // Close the file even when generation fails part-way.
            out.close();
        }
    }

    static Collator UCA = Collator.getInstance(ULocale.ROOT);

    /**
     * Collects every code point up to {@code limit} that has the property and
     * writes the set twice: as an escaped pattern inside a
     * {@code new UnicodeSet(...)} expression, and as an unescaped pattern
     * inside a comment.
     */
    static void generateSet(PrintWriter out, String label, UCDProperty up) {
        System.out.println("Generating: " + up.getName(NORMAL));
        UnicodeSet result = new UnicodeSet();
        for (int cp = 0; cp <= limit; ++cp) {
            Utility.dot(cp);
            if (up.hasValue(cp)) result.add(cp);
        }
        Utility.fixDot();

        String rSet = result.toPattern(true);
        // Double the backslash of each escape so the text is valid inside a Java string literal.
        rSet = replace(rSet, "\\U", "\\\\U");
        rSet = replace(rSet, "\\u", "\\\\u");
        out.println(label + " = new UnicodeSet(");
        writeStringInPieces(out, rSet, ", false);");

        // The pretty-printed variant is kept as a switchable alternative.
        if (true) {
            rSet = result.toPattern(false);
        } else {
            rSet = CollectionUtilities.prettyPrint(result, true, null, null, UCA, UCA);
        }
        out.println("/*Unicode: ");
        writeStringInPieces(out, rSet, "*/");
        out.println();
        out.flush();
        System.out.println("Done");
    }

    /*
    // DerivedProperty dp = new DerivedProperty(UCD.make(version));
    System.out.println(skipper.getName(NORMAL));
    UnicodeSet result = new UnicodeSet();
    for (int cp = 0; cp <= limit; ++cp) {
    Utility.dot(cp);
    if (skipper.hasProperty(cp)) result.add(cp);
    }
    Utility.fixDot();
    String rSet = result.toPattern(true);
    rSet = replace(rSet, "\\U", "\\\\U");
    out.println("\tSKIPPABLE[" + skipper.getName(NORMAL)
    + "] = new UnicodeSet(");
    writeStringInPieces(out, rSet, ", false);");
    out.println();
    rSet = result.toPattern(false);
    out.println("/*Unicode: ");
    */
    //writeStringInPieces(out, rSet, "*/");
    /*out.println();
    out.flush();
    if (false) {
    NFSkippable skipper = new NFSkippable(Normalizer.NFC,"");
    NFSkippable skipper2 = new NFSkippable(Normalizer.NFKC,"");
    for (int cp = 0; cp <= 0x10FFFF; ++cp) {
    if (cp > 0xFF) {
    if (!skipper.ucd.isAssigned(cp)) continue;
    byte cat = skipper.ucd.getCategory(cp);
    if (cat == PRIVATE_USE || cat == SURROGATE) continue;
    if (skipper.ucd.getCombiningClass(cp) != 0) continue;
    if (!skipper.nf.isNormalized(cp)) continue;
    if ((cp < 0xAC00 || cp > 0xAE00)
    && cp != skipper.ucd.mapToRepresentative(cp, false)) continue;
    }
    if (skipper2.hasProperty(cp) == skipper.hasProperty(cp)) continue;
    String status = (skipper.hasProperty(cp) ? " SKIPc " : "NOSKIPc ")
    + (skipper2.hasProperty(cp) ? " SKIPkc " : "NOSKIPkc ");
    System.out.println(status
    + skipper.ucd.getCodeAndName(cp)
    + skipper.cause);
    }
    }
    */
}

View File

@ -1,153 +0,0 @@
<!doctype HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta http-equiv="Content-Language" content="en-us">
<meta name="GENERATOR" content="Microsoft FrontPage 5.0">
<meta name="ProgId" content="FrontPage.Editor.Document">
<meta name="keywords" content="unicode, named sequences">
<meta name="description" content="Describes and displays named sequences">
<title>Named Sequences</title>
<link rel="stylesheet" type="text/css" href="http://www.unicode.org/reports/reports.css">
<style>
<!--
.copy { text-align: center; font-size: 150% }
th, td { vertical-align: middle }
tt { font-size: 8pt }
table { padding: 2pt }
-->
</style>
</head>
<body bgcolor="#ffffff">
<table class="header">
<tr>
<td class="icon"><a href="http://www.unicode.org">
<img align="middle" alt="[Unicode]" border="0" src="http://www.unicode.org/webscripts/logo60s2.gif" width="34" height="33"></a>&nbsp;&nbsp;<a class="bar" href="http://www.unicode.org/ucd">Unicode
Character Database</a></td>
</tr>
<tr>
<td class="gray">&nbsp;</td>
</tr>
</table>
<div style="margin:1em">
<table border="1" cellpadding="0" cellspacing="1" style="border-collapse: collapse" bordercolor="#111111" width="100%" id="AutoNumber1">
<tr>
<td width="100%">
<p style="text-align: right">L2-XXX</p>
<p><i>To: UTC<br>
From: Mark Davis<br>
Date: 2005-04-28</i></p>
<p><i>One of the original ideas for Unicode 4.1.0 was to produce a NamedSequences.html,
following the pattern of StandardizedVariants.html. This document was generated along those
lines, but not added into U4.1.0. My suggestion instead is to add this file (with suitable
style modifications, of course) as a chart someplace accessible under
<a href="http://unicode.org/charts/">http://unicode.org/charts/</a>.</i></p>
<p><i>Alternatively, we could also combine this with the StandardizedVariants.html to provide
a unified chart of sequences, again someplace under <a href="http://unicode.org/charts/">
http://unicode.org/charts/</a>.</i></p>
<p><i><b>Note:</b> we don&#39;t have some of the glyphs quite right yet, but it should be
sufficient for discussing the format. One of the innovations is having a separate column of
text for copy&amp;paste; that also needs discussion.</i></td>
</tr>
</table>
<h1><i><font color="#990000">&nbsp;PROPOSED WORKING DRAFT<br>
</font></i>Named Sequences</h1>
<table class="wide">
<tr>
<td valign="top" width="144">Revision</td>
<td valign="top">@revision@</td>
</tr>
<tr>
<td valign="top" width="144">Authors</td>
<td valign="top">Members of the Editorial Committee</td>
</tr>
<tr>
<td valign="top" width="144">Date</td>
<td valign="top">@date@</td>
</tr>
<tr>
<td valign="top" width="144">This Version</td>
<td valign="top">
<a href="http://www.unicode.org/Public/@updateDirectory@/NamedSequences-@revision@.html">
http://www.unicode.org/Public/@updateDirectory@/NamedSequences-@revision@.html</a></td>
</tr>
<tr>
<td valign="top" width="144">Previous Version</td>
<td valign="top">n/a</td>
</tr>
<tr>
<td valign="top" width="144">Latest Version</td>
<td valign="top">n/a</td>
</tr>
</table>
<h3><br>
<i>Summary</i></h3>
<blockquote>
<p>This file provides a visual display of the named sequences derived from NamedSequences.txt. <i>The
proposal is to add this as a chart under http://unicode.org/charts/.</i></p>
</blockquote>
<h3><i>Status</i></h3>
<blockquote>
<p><i>The file and the files described herein are part of the
<a href="http://www.unicode.org/ucd">Unicode Character Database</a> (UCD) and are governed by
the <a href="#Terms of Use">UCD Terms of Use</a> stated at the end.</i></p>
</blockquote>
<hr width="50%">
<h2>Introduction</h2>
<p>The tables here exhaustively list the valid, registered named sequences. The columns include a
representative glyph, the sequence of code points in hex, and the name of the sequence. In
addition, there is a last column entitled <i>Copyable</i>, which contains the literal text forming
the sequence. That text can be copied and pasted elsewhere. The display of the text in this
column is up to the capabilities of the browser and the set of available fonts. For more
information, see <a href="http://www.unicode.org/help/display_problems.html">Display Problems?</a>.</p>
<blockquote>
<p><a name="fonts"><b>Note: </b></a>The representative glyphs used to show the named sequences
are often derived from different physical fonts than the representative glyphs in the standard.
They may therefore exhibit minor differences in size, proportion, style, or weight.</p>
</blockquote>
<p>@table@</p>
<hr width="50%">
<h2>UCD <a name="Terms of Use">Terms of Use</a></h2>
<h3><i>Disclaimer</i></h3>
<blockquote>
<p><i>The Unicode Character Database is provided as is by Unicode, Inc. No claims are made as to
fitness for any particular purpose. No warranties of any kind are expressed or implied. The
recipient agrees to determine applicability of information provided. If this file has been
purchased on magnetic or optical media from Unicode, Inc., the sole remedy for any claim will be
exchange of defective media within 90 days of receipt.</i></p>
<p><i>This disclaimer is applicable for all other data files accompanying the Unicode Character
Database, some of which have been compiled by the Unicode Consortium, and some of which have
been supplied by other sources.</i></p>
</blockquote>
<h3><i>Limitations on Rights to Redistribute This Data</i></h3>
<blockquote>
<p><i>Recipient is granted the right to make copies in any form for internal distribution and to
freely use the information supplied in the creation of products supporting the Unicode<sup>TM</sup>
Standard. The files in the Unicode Character Database can be redistributed to third parties or
other organizations (whether for profit or not) as long as this notice and the disclaimer notice
are retained. Information can be extracted from these files and used in documentation or
programs, as long as there is an accompanying notice indicating the source.</i></p>
</blockquote>
<hr width="50%">
<div align="center">
<center>
<table cellspacing="0" cellpadding="0" border="0">
<tr>
<td><a href="http://www.unicode.org/unicode/copyright.html">
<img src="http://www.unicode.org/img/hb_notice.gif" border="0" alt="Access to Copyright and terms of use" width="216" height="50"></a></td>
</tr>
</table>
<script language="Javascript" type="text/javascript" src="http://www.unicode.org/webscripts/lastModified.js">
</script>
</center>
</div>
<blockquote>
</blockquote>
</div>
</body>
</html>

View File

@ -1,32 +0,0 @@
#
# Normalization Test Suite
# Format:
#
# Columns (c1, c2,...) are separated by semicolons
# Comments are indicated with hash marks
#
# CONFORMANCE:
# 1. The following invariants must be true for all conformant implementations
#
# NFC
# c2 == NFC(c1) == NFC(c2) == NFC(c3)
# c4 == NFC(c4) == NFC(c5)
#
# NFD
# c3 == NFD(c1) == NFD(c2) == NFD(c3)
# c5 == NFD(c4) == NFD(c5)
#
# NFKC
# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
#
# NFKD
# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
#
# 2. For every code point X assigned in this version of Unicode that is not specifically
# listed in Part 1, the following invariants must be true for all conformant
# implementations:
#
# X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)
#
@Part0 # Specific cases
#

View File

@ -1,665 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
* $Date: 2006/09/24 23:32:44 $
* $Revision: 1.18 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.*;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.text.utility.*;
import com.sun.java_cup.internal.internal_error;
/**
* Implements Unicode Normalization Forms C, D, KC, KD.<br>
* See UTR#15 for details.<br>
* Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.<br>
* The Unicode Consortium makes no expressed or implied warranty of any
* kind, and assumes no liability for errors or omissions.
* No liability is assumed for incidental and consequential damages
* in connection with or arising out of the use of the information here.
* @author Mark Davis
*/
public final class Normalizer implements UCD_Types {
public static final String copyright =
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
public static boolean SHOW_PROGRESS = false;
/**
* Create a normalizer for a given form.
* @param form the normalization form; its NF_COMPOSITION_MASK bit turns on composition and
* its NF_COMPATIBILITY_MASK bit turns on compatibility decomposition
* @param unicodeVersion the UCD version whose data is used; an empty string selects
* UCD.latestVersion
*/
public Normalizer(byte form, String unicodeVersion) {
this.form = form;
this.composition = (form & NF_COMPOSITION_MASK) != 0;
this.compatibility = (form & NF_COMPATIBILITY_MASK) != 0;
this.data = getData(unicodeVersion);
}
/**
* Create a normalizer for a given form.
*/
// public Normalizer(byte form) {
// this(form,"");
//}
/**
* Return the string name of a normalization form, looked up in UCD_Names.NF_NAME.
* @param form the normalization form, used directly as the index into the name table
*/
public static String getName(byte form) {
return UCD_Names.NF_NAME[form];
}
/**
* Return the string name of this normalizer's form.
*/
public String getName() {
return getName(form);
}
/**
* Return the version string of the Unicode Character Database data behind this normalizer.
*/
public String getUCDVersion() {
return data.getUCDVersion();
}
/**
* Does this normalizer compose? True when the form passed to the constructor had the
* NF_COMPOSITION_MASK bit set.
*/
public boolean isComposition() {
return composition;
}
/**
* Does this normalizer use compatibility decomposition? True when the form passed to the
* constructor had the NF_COMPATIBILITY_MASK bit set.
*/
public boolean isCompatibility() {
return compatibility;
}
/**
 * Normalizes text according to the chosen form, writing the result into the target buffer.
 * The source is decomposed into the target and, when this form composes, the target is
 * then composed in place. The target is not cleared beforehand, and an empty source
 * leaves it untouched, so callers should normally pass in an empty buffer.
 * @param source the original text, unnormalized
 * @param target the buffer that receives the normalized text
 * @return the target buffer, for call chaining
 */
public StringBuffer normalize(String source, StringBuffer target) {
    if (source.length() == 0) {
        return target;
    }
    // Decompose first; composition (if requested) works on the decomposed text.
    internalDecompose(source, target, true, compatibility);
    if (composition) {
        internalCompose(target);
    }
    return target;
}
/**
 * Checks whether canonical reordering is a no-op for the canonical decomposition of the text:
 * the source is canonically decomposed once without reordering and once with reordering,
 * and the two results are compared.
 * @param source the text to test
 * @return true if the source is empty or both decompositions are identical
 */
public boolean isFCD(String source) {
    if (source.length() == 0) {
        return true;
    }
    StringBuffer unordered = new StringBuffer();
    internalDecompose(source, unordered, false, false);
    StringBuffer ordered = new StringBuffer();
    internalDecompose(source, ordered, true, false);
    return ordered.toString().equals(unordered.toString());
}
/**
* Normalizes text according to the chosen form, using a fresh buffer.
* @param source the original text, unnormalized
* @return the resulting normalized text
*/
public String normalize(String source) {
return normalize(source, new StringBuffer()).toString();
}
/**
* Normalizes a single code point according to the chosen form.
* @param cp the code point to normalize
* @return the resulting normalized text
*/
public String normalize(int cp) {
return normalize(UTF16.valueOf(cp));
}
/**
private StringBuffer hasDecompositionBuffer = new StringBuffer();
public boolean hasDecomposition(int cp) {
hasDecompositionBuffer.setLength(0);
normalize(UTF16.valueOf(cp), hasDecompositionBuffer);
if (hasDecompositionBuffer.length() != 1) return true;
return cp != hasDecompositionBuffer.charAt(0);
}
*/
/**
* Does a quick check to see if the string is in the current form. Checks canonical order and
* isAllowed().
* @param newLocaleID source text
* @return YES, NO, MAYBE
*/
/*
public static final int NO = 0, YES = 1, MAYBE = -1;
public int quickCheck(String source) {
short lastCanonicalClass = 0;
int result = YES;
for (int i = 0; i < source.length(); ++i) {
char ch = source.charAt(i);
short canonicalClass = data.getCanonicalClass(ch);
if (lastCanonicalClass > canonicalClass && canonicalClass != 0) {
return NO;
}
int check = isAllowed(ch);
if (check == NO) return NO;
if (check == MAYBE) result = MAYBE;
}
return result;
}
/**
* Find whether the given character is allowed in the current form.
* @return YES, NO, MAYBE
*/
/*
public int isAllowed(char ch) {
if (composition) {
if (compatibility) {
if (data.isCompatibilityExcluded(ch)) {
return NO;
}
} else {
if (data.isExcluded(ch)) {
return NO;
}
}
if (data.isTrailing(ch)) {
return MAYBE;
}
} else { // decomposition: both NFD and NFKD
if (data.normalizationDiffers(compatibility,ch)) return NO;
}
return YES;
}
/**
* Utility: Gets the combining class of a character from the
* Unicode Character Database. Only a byte is needed, but since they are signed in Java
* return an int to forstall problems.
* @param ch the source character
* @return value from 0 to 255
*/
/**
* Utility: Gets the canonical combining class of a code point from the normalization data.
* @param ch the source code point
* @return the class value reported by the data, as a short
*/
public short getCanonicalClass(int ch) {
return data.getCanonicalClass(ch);
}
/**
* Utility: Checks whether a single code point is already normalized in this normalizer's
* form, i.e. the normalization data reports that normalizing it makes no difference.
* The answer depends on this normalizer's composition and compatibility settings.
* @param ch the source code point
*/
public boolean isNormalized(int ch) {
return !data.normalizationDiffers(ch, composition, compatibility);
}
/**
 * Utility: Checks whether a string is already normalized in this normalizer's form,
 * by normalizing it and comparing the result with the original.
 * @param s the source text
 * @return true if normalizing s leaves it unchanged (always true for the empty string)
 */
public boolean isNormalized(String s) {
    // The former shortcut for strings of more than one code point looked only at the
    // first code point, so text such as "a" + U+0308 was wrongly reported as normalized
    // in composing forms. Strings of zero or one code point always took this path.
    return s.equals(normalize(s)); // TODO: OPTIMIZE LATER
}
/**
* Utility: Gets recursive decomposition of a character from the
* Unicode Character Database. Whether the canonical or the compatibility AND canonical
* decomposition is used follows this normalizer's compatibility setting.
* Because the parameter is a char, only BMP code units can be passed here.
* @param ch the source character
* @param buffer buffer that the decomposition is appended to
*/
public void getRecursiveDecomposition(char ch, StringBuffer buffer) {
data.getRecursiveDecomposition(ch, buffer, compatibility);
}
/**
 * Utility: Collects which code points take part in compositions.
 * Each entry of the composition table is keyed by (first &lt;&lt; 32) | second and maps to
 * the composite code point. Jamo and two-Jamo Hangul syllables are added from the
 * algorithmic ranges. Any of the three sets may be null to skip it.
 * @param leading receives the first code point of every pair, plus leading Jamo and
 * two-Jamo Hangul syllables
 * @param trailing receives the second code point of every pair, plus non-leading Jamo
 * @param resulting receives the composite of every pair in the table
 */
public void getCompositionStatus(BitSet leading, BitSet trailing, BitSet resulting) {
    for (Iterator entries = data.compTable.entrySet().iterator(); entries.hasNext();) {
        Map.Entry entry = (Map.Entry) entries.next();
        long pair = ((Long) entry.getKey()).longValue();
        if (leading != null) {
            leading.set((int) (pair >>> 32));
        }
        if (trailing != null) {
            trailing.set((int) pair);
        }
        if (resulting != null) {
            resulting.set(((Integer) entry.getValue()).intValue());
        }
    }
    for (int jamo = UCD.LBase; jamo < UCD.TLimit; ++jamo) {
        // set all initial Jamo (that form syllables)
        if (leading != null && UCD.isLeadingJamo(jamo)) {
            leading.set(jamo);
        }
        // set all final Jamo (that form syllables)
        if (trailing != null && UCD.isNonLeadJamo(jamo)) {
            trailing.set(jamo);
        }
    }
    if (leading == null) {
        return;
    }
    for (int syllable = UCD.SBase; syllable < UCD.SLimit; ++syllable) {
        // set all two-Jamo syllables
        if (UCD.isDoubleHangul(syllable)) {
            leading.set(syllable);
        }
    }
}
/**
 * Reports whether the code point can be the second element of a composition.
 * Always false for normalizers that do not compose.
 */
public boolean isTrailing(int cp) {
    return composition && data.isTrailing(cp);
}
/**
 * Reports whether the code point can be the first element of a composition.
 * Always false for normalizers that do not compose.
 */
public boolean isLeading(int cp) {
    return composition && data.isLeading(cp);
}
/**
* Returns the composite of the pair (first, second) from the normalization data, or the
* data's not-composite marker value (0xFFFF) when the pair does not compose.
* Unlike isLeading/isTrailing, this does not depend on this normalizer's form.
*/
public int getComposition(int first, int second) {
return data.getPairwiseComposition(first, second);
}
// ======================================
// PRIVATES
// ======================================
/**
* The current form.
*/
private byte form;
private boolean composition;
private boolean compatibility;
private UnicodeMap substituteMapping;
/**
* Decomposes text, either canonical or compatibility, adding the result to the
* target buffer. Existing target content is kept; when reordering, a combining mark may be
* inserted in front of higher-class characters already at the end of the target.
* @param source the original text, unnormalized
* @param target the buffer receiving the decomposed text
* @param reorder if true, each character with a non-zero canonical class is moved back past
* preceding characters of higher class; if false, characters are simply appended
* @param compat if true selects the recursive compatibility decomposition, otherwise
* the recursive canonical decomposition
*/
private void internalDecompose(String source, StringBuffer target, boolean reorder, boolean compat) {
StringBuffer buffer = new StringBuffer();
int ch32;
for (int i = 0; i < source.length(); i += UTF16.getCharCount(ch32)) {
buffer.setLength(0);
ch32 = UTF16.charAt(source, i);
// a substitute mapping, when one is set and has a value for this code point,
// takes the place of the recursive decomposition
String sub = substituteMapping == null ? null : (String) substituteMapping.getValue(ch32);
if (sub != null) {
buffer.append(sub);
} else {
data.getRecursiveDecomposition(ch32, buffer, compat);
}
// add all of the characters in the decomposition.
// (may be just the original character, if there was
// no decomposition mapping)
int ch;
for (int j = 0; j < buffer.length(); j += UTF16.getCharCount(ch)) {
ch = UTF16.charAt(buffer, j);
int chClass = data.getCanonicalClass(ch);
int k = target.length(); // insertion point
if (chClass != 0 && reorder) {
// bubble-sort combining marks as necessary
// walk k backwards over code points whose class is greater than chClass;
// stop at the first one with a class less than or equal (this includes class 0)
int ch2;
for (; k > 0; k -= UTF16.getCharCount(ch2)) {
ch2 = UTF16.charAt(target, k-1);
if (data.getCanonicalClass(ch2) <= chClass) break;
}
}
target.insert(k, UTF16.valueOf(ch));
}
}
}
/**
* Composes text in place. Target must already
* have been decomposed.
* Uses UTF16, which is a utility class for supplementary character support in Java.
* The buffer is rewritten from the front: characters that do not combine are copied down to
* the write position and the buffer is truncated to the composed length at the end.
* NOTE(review): the first code point is read unconditionally, so the target is presumably
* required to be non-empty - confirm against callers.
* @param target input: decomposed text.
* output: the resulting normalized text.
*/
private void internalCompose(StringBuffer target) {
// starterPos/starterCh: position and value of the starter that later marks may combine with
int starterPos = 0;
int starterCh = UTF16.charAt(target,0);
// compPos: write position, i.e. the length of the composed output so far
int compPos = UTF16.getCharCount(starterCh); // length of last composition
// lastClass: canonical class of the last character that was kept without combining
int lastClass = data.getCanonicalClass(starterCh);
// 256 makes both (lastClass < chClass) and (lastClass == 0) false in the test below
// as long as classes stay below 256, so nothing combines with a leading mark
if (lastClass != 0) lastClass = 256; // fix for strings starting with a combining mark
int oldLen = target.length();
// Loop on the decomposed characters, combining where possible
int ch;
for (int decompPos = compPos; decompPos < target.length(); decompPos += UTF16.getCharCount(ch)) {
ch = UTF16.charAt(target, decompPos);
if (SHOW_PROGRESS) System.out.println(Utility.hex(target)
+ ", decompPos: " + decompPos
+ ", compPos: " + compPos
+ ", ch: " + Utility.hex(ch)
);
int chClass = data.getCanonicalClass(ch);
int composite = data.getPairwiseComposition(starterCh, ch);
// combine only if the pair composes and ch is not blocked: either the previous kept
// character had a lower class, or it was a starter (class 0)
if (composite != data.NOT_COMPOSITE
&& (lastClass < chClass || lastClass == 0)) {
UTF16.setCharAt(target, starterPos, composite);
// we know that we will only be replacing non-supplementaries by non-supplementaries
// so we don't have to adjust the decompPos
starterCh = composite;
} else {
// ch is kept: a class-0 character becomes the new starter
if (chClass == 0) {
starterPos = compPos;
starterCh = ch;
}
lastClass = chClass;
UTF16.setCharAt(target, compPos, ch);
// if setCharAt changed the buffer length, shift the read position by the same amount
if (target.length() != oldLen) { // MAY HAVE TO ADJUST!
System.out.println("ADJUSTING: " + Utility.hex(target));
decompPos += target.length() - oldLen;
oldLen = target.length();
}
compPos += UTF16.getCharCount(ch);
}
}
// drop the leftover tail beyond the composed text
target.setLength(compPos);
}
static class Stub {
private UCD ucd;
private HashMap compTable = new HashMap();
private BitSet isSecond = new BitSet();
private BitSet isFirst = new BitSet();
private BitSet canonicalRecompose = new BitSet();
private BitSet compatibilityRecompose = new BitSet();
static final int NOT_COMPOSITE = 0xFFFF;
/**
 * Builds the composition tables for one version of the Unicode Character Database.
 * Every assigned, non-private-use code point with a canonical decomposition of exactly two
 * code points, a class-0 first code point and no composition exclusion is recorded in
 * compTable under the key (first &lt;&lt; 32) | second.
 * @param version the UCD version, passed to UCD.make
 * @throws ChainException wrapping any exception raised while processing a code point
 */
Stub(String version) {
    ucd = UCD.make(version);
    // Inclusive bound: 0x10FFFF is the last code point (the loop used to stop one short).
    for (int i = 0; i <= 0x10FFFF; ++i) {
        if (!ucd.isAssigned(i)) continue;
        if (ucd.isPUA(i)) continue;
        if (ucd.isNonLeadJamo(i)) isSecond.set(i);
        if (ucd.isLeadingJamoComposition(i)) isFirst.set(i);
        byte dt = ucd.getDecompositionType(i);
        if (dt != CANONICAL) continue;
        if (ucd.getBinaryProperty(i, CompositionExclusion)) continue;
        try {
            String s = ucd.getDecompositionMapping(i);
            int len = UTF16.countCodePoint(s);
            if (len != 2) {
                // longer mappings are only tolerated for data older than 3.0.0
                if (len > 2 && ucd.getVersion().compareTo("3.0.0") >= 0) {
                    throw new IllegalArgumentException("BAD LENGTH: " + len + ucd.toString(i));
                }
                continue;
            }
            int a = UTF16.charAt(s, 0);
            if (ucd.getCombiningClass(a) != 0) continue;
            isFirst.set(a);
            int b = UTF16.charAt(s, UTF16.getCharCount(a));
            isSecond.set(b);
            // have a recomposition, so set the bit
            canonicalRecompose.set(i);
            // set the compatibility recomposition bit
            // ONLY if the component characters
            // don't compatibility decompose
            if (ucd.getDecompositionType(a) <= CANONICAL
                    && ucd.getDecompositionType(b) <= CANONICAL) {
                compatibilityRecompose.set(i);
            }
            long key = (((long) a) << 32) | b;
            compTable.put(new Long(key), new Integer(i));
        } catch (Exception e) {
            throw new ChainException("Error: {0}", new Object[]{ucd.toString(i)}, e);
        }
    }
    // TODO (from the original author): process compatibilityRecompose in a second pass,
    // since we don't know whether the pieces are allowable until we have processed all
    // the characters.
}
/** Returns the version string of the UCD instance these tables were built from. */
String getUCDVersion() {
return ucd.getVersion();
}
/*
Problem: differs: true, call: false U+0385 GREEK DIALYTIKA TONOS
Problem: differs: true, call: false U+03D3 GREEK UPSILON WITH ACUTE AND HOOK SYMBOL
Problem: differs: true, call: false U+03D4 GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL
Problem: differs: true, call: false U+1E9B LATIN SMALL LETTER LONG S WITH DOT ABOVE
Problem: differs: true, call: false U+1FC1 GREEK DIALYTIKA AND PERISPOMENI
Problem: differs: true, call: false U+1FCD GREEK PSILI AND VARIA
Problem: differs: true, call: false U+1FCE GREEK PSILI AND OXIA
Problem: differs: true, call: false U+1FCF GREEK PSILI AND PERISPOMENI
Problem: differs: true, call: false U+1FDD GREEK DASIA AND VARIA
Problem: differs: true, call: false U+1FDE GREEK DASIA AND OXIA
Problem: differs: true, call: false U+1FDF GREEK DASIA AND PERISPOMENI
Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
*/
/** Returns the combining class that the UCD reports for the code point. */
short getCanonicalClass(int cp) {
return ucd.getCombiningClass(cp);
}
/**
* Returns whether the code point was recorded as the second element of a composition pair
* or as a non-leading Jamo when the tables were built.
*/
boolean isTrailing(int cp) {
return isSecond.get(cp);
}
/**
* Returns whether the code point was recorded as the first element of a composition pair
* or as a leading Jamo composition when the tables were built.
*/
boolean isLeading(int cp) {
return isFirst.get(cp);
}
/**
 * Reports whether normalizing a single code point changes it.
 * @param cp the code point
 * @param composition true for the composing forms
 * @param compat true for the compatibility forms
 * @return true if the code point decomposes under the selected kind of decomposition and,
 * for composing forms, is not flagged as recomposing
 */
boolean normalizationDiffers(int cp, boolean composition, boolean compat) {
    byte dt = ucd.getDecompositionType(cp);
    // compatibility forms decompose everything at or above CANONICAL,
    // canonical forms only CANONICAL itself
    boolean decomposes = compat ? dt >= CANONICAL : dt == CANONICAL;
    if (!decomposes) {
        return false;
    }
    if (!composition) {
        return true;
    }
    // almost the same, except that we add back in the characters that RECOMPOSE
    BitSet recomposes = compat ? compatibilityRecompose : canonicalRecompose;
    return !recomposes.get(cp);
}
/**
 * Appends the full recursive decomposition of a code point to the buffer.
 * CANONICAL mappings are always followed; mappings of a type above CANONICAL only when
 * compat is true. A code point that does not decompose is appended unchanged.
 * @param cp the code point to decompose
 * @param buffer the buffer the decomposition is appended to
 * @param compat true to follow compatibility mappings as well as canonical ones
 */
public void getRecursiveDecomposition(int cp, StringBuffer buffer, boolean compat) {
    byte dt = ucd.getDecompositionType(cp);
    // we know we decompose all CANONICAL, plus > CANONICAL if compat is TRUE.
    boolean decomposes = dt == CANONICAL || (dt > CANONICAL && compat);
    if (!decomposes) {
        UTF16.append(buffer, cp);
        return;
    }
    String mapping = ucd.getDecompositionMapping(cp);
    if (mapping.equals(UTF16.valueOf(cp))) {
        // Data problem: the code point maps to itself. Report it as before, but treat it
        // as undecomposable, because recursing on it would never terminate.
        System.out.println("fix");
        UTF16.append(buffer, cp);
        return;
    }
    int part;
    for (int i = 0; i < mapping.length(); i += UTF16.getCharCount(part)) {
        part = UTF16.charAt(mapping, i);
        getRecursiveDecomposition(part, buffer, compat);
    }
}
/**
 * Looks up the composite for a pair of code points.
 * Hangul composition is tried first; otherwise the pair is looked up in compTable under
 * the key (starterCh &lt;&lt; 32) | ch.
 * @return the composite code point, or NOT_COMPOSITE if the pair does not compose
 */
int getPairwiseComposition(int starterCh, int ch) {
    int hangul = UCD.composeHangul(starterCh, ch);
    if (hangul != NOT_COMPOSITE) {
        return hangul;
    }
    long key = (((long) starterCh) << 32) | ch;
    Integer composite = (Integer) compTable.get(new Long(key));
    return composite == null ? NOT_COMPOSITE : composite.intValue();
}
}
/**
* Contains normalization data from the Unicode Character Database.
* use false for the minimal set, true for the real set.
*/
private Stub data;
private static HashMap versionCache = new HashMap();
/**
 * Returns the normalization data for a UCD version, building it on first use and caching
 * it in versionCache afterwards.
 * @param version the UCD version; an empty string selects UCD.latestVersion
 */
private static Stub getData(String version) {
    String key = version.length() == 0 ? UCD.latestVersion : version;
    Stub cached = (Stub) versionCache.get(key);
    if (cached != null) {
        return cached;
    }
    Stub created = new Stub(key);
    versionCache.put(key, created);
    return created;
}
/**
* Returns the substitute mapping consulted during decomposition, or null if none is set.
*/
public UnicodeMap getSubstituteMapping() {
return substituteMapping;
}
/**
* Sets a mapping whose String values replace the recursive decomposition of the code points
* it covers; null removes the substitution.
* @return this normalizer, for call chaining
*/
public Normalizer setSubstituteMapping(UnicodeMap substituteMapping) {
this.substituteMapping = substituteMapping;
return this;
}
static UnicodeMap spacingMap;;
/**
* Installs the shared spacing map as this normalizer's substitute mapping, building the map
* on first use. The map is a static built from whichever normalizer's data triggers the
* first call and is then reused by all instances.
*/
public void setSpacingSubstitute() {
if (spacingMap == null) {
makeSpacingMap();
}
setSubstituteMapping(spacingMap);
}
/**
 * Builds the shared spacing map and stores it in the static spacingMap field.
 * A code point is mapped to itself when its compatibility decomposition is longer than one
 * char, starts with U+0020 or U+0640, and is otherwise made up only of Mn or Me marks.
 * A fixed table of special cases then overrides some of those entries with explicit
 * replacement strings. The map is built in a local variable and only published once it is
 * complete and frozen, so a failure part-way through cannot leave a partial map cached.
 * @throws InternalError if a special case names a code point that the first pass did not map
 */
private void makeSpacingMap() {
    UnicodeMap map = new UnicodeMap();
    StringBuffer b = new StringBuffer();
    main:
    for (int i = 0; i <= 0x10FFFF; ++i) {
        boolean decomposes = data.ucd.getDecompositionType(i) >= data.ucd.CANONICAL;
        if (!decomposes) continue;
        b.setLength(0);
        data.getRecursiveDecomposition(i, b, true);
        if (b.length() == 1) continue;
        char firstChar = b.charAt(0);
        if (firstChar != 0x20 && firstChar != '\u0640') continue;
        // if rest are just Mn or Me marks, then add to substitute mapping
        int cp;
        for (int j = 1; j < b.length(); j += UTF16.getCharCount(cp)) {
            cp = UTF16.charAt(b, j);
            int cat = data.ucd.getCategory(cp);
            if (cat != data.ucd.Mn && cat != data.ucd.Me) continue main;
        }
        map.put(i, UTF16.valueOf(i));
    }
    // {set of source code points, replacement string}
    String[][] specials = {
        {"[\\u0384\\u1FFD]", "\u00B4"},
        {"[\\uFFE3]", "\u00AF"},
        {"[\\uFE49-\\uFE4C]", "\u203E"},
        {"[\\u1FED]", "\u00A8\u0300"},
        {"[\\u1FEE\\u0385]", "\u00A8\u0301"},
        {"[\\u1FC1]", "\u00A8\u0342"},
        {"[\\u1FBD]", "\u1FBF"},
        {"[\\u1FCD]", "\u1FBF\u0300"},
        {"[\\u1FCE]", "\u1FBF\u0301"},
        {"[\\u1FCF]", "\u1FBF\u0342"},
        {"[\\u1FDD]", "\u1FFE\u0300"},
        {"[\\u1FDE]", "\u1FFE\u0301"},
        {"[\\u1FDF]", "\u1FFE\u0342"},
        {"[\\uFC5E]", "\uFE72\u0651"},
        {"[\\uFC5F]", "\uFE74\u0651"},
        {"[\\uFC60]", "\uFE76\u0651"},
        {"[\\uFC61]", "\uFE78\u0651"},
        {"[\\uFC62]", "\uFE7A\u0651"},
        {"[\\uFC63]", "\uFE7C\u0670"},
        {"[\\uFCF2]", "\uFE77\u0651"},
        {"[\\uFCF3]", "\uFE79\u0651"},
        {"[\\uFCF4]", "\uFE7B\u0651"},
    };
    UnicodeSet mappedChars = map.keySet();
    for (int i = 0; i < specials.length; ++i) {
        UnicodeSet source = new UnicodeSet(specials[i][0]);
        if (!mappedChars.containsAll(source)) {
            throw new InternalError("Remapping character that doesn't need it!" + source);
        }
        map.putAll(source, specials[i][1]);
    }
    map.freeze();
    // publish only the finished, frozen map
    spacingMap = map;
}
/**
* Just accessible for testing.
*/
/*
boolean isExcluded (char ch) {
return data.isExcluded(ch);
}
/**
* Just accessible for testing.
*/
/*
String getRawDecompositionMapping (char ch) {
return data.getRawDecompositionMapping(ch);
}
//*/
}

View File

@ -1,349 +0,0 @@
package com.ibm.text.UCD;
import java.util.*;
import com.ibm.text.*;
import com.ibm.icu.text.UTF16;
import com.ibm.text.utility.*;
/**
* Implements Unicode Normalization Forms C, D, KC, KD.<br>
* See UTR#15 for details.<br>
* Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.<br>
* The Unicode Consortium makes no expressed or implied warranty of any
* kind, and assumes no liability for errors or omissions.
* No liability is assumed for incidental and consequential damages
* in connection with or arising out of the use of the information here.
* @author Mark Davis
*/
public class NormalizerSample implements UCD_Types {
static final String copyright = "Copyright (C) 2001, IBM Corp. and Unicode Inc. All Rights Reserved.";
public static boolean SHOW_PROGRESS = false;
/**
* Create a normalizer for a given form.
* @param form the normalization form; its COMPOSITION_MASK bit turns on composition and
* its COMPATIBILITY_MASK bit turns on compatibility decomposition
* @param unicodeVersion the UCD version whose data is used, passed to getData
*/
public NormalizerSample(byte form, String unicodeVersion) {
this.composition = (form & COMPOSITION_MASK) != 0;
this.compatibility = (form & COMPATIBILITY_MASK) != 0;
this.data = getData(unicodeVersion);
}
/**
* Create a normalizer for a given form, passing an empty string as the Unicode version.
* @param form the normalization form (NFD, NFKD, NFC or NFKC)
*/
public NormalizerSample(byte form) {
this(form,"");
}
/**
* Masks for the form selector
*/
public static final byte
COMPATIBILITY_MASK = 1,
COMPOSITION_MASK = 2;
/**
* Normalization Form Selector
*/
public static final byte
NFD = 0 ,
NFKD = COMPATIBILITY_MASK,
NFC = COMPOSITION_MASK,
NFKC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);
/**
 * Normalizes text according to the chosen form, writing the result into the target buffer.
 * The source is decomposed into the target and, when this form composes, the target is
 * then composed in place. The target is not cleared beforehand, and an empty source
 * leaves it untouched, so callers should normally pass in an empty buffer.
 * @param source the original text, unnormalized
 * @param target the buffer that receives the normalized text
 * @return the target buffer, for call chaining
 */
public StringBuffer normalize(String source, StringBuffer target) {
    if (source.length() == 0) {
        return target;
    }
    // Decompose first; composition (if requested) works on the decomposed text.
    internalDecompose(source, target);
    if (composition) {
        internalCompose(target);
    }
    return target;
}
/**
* Normalizes text according to the chosen form, using a fresh buffer.
* @param source the original text, unnormalized
* @return the resulting normalized text
*/
public String normalize(String source) {
return normalize(source, new StringBuffer()).toString();
}
/**
* Normalizes a single code point according to the chosen form.
* @param cp the code point to normalize
* @return the resulting normalized text
*/
public String normalize(int cp) {
return normalize(UTF16.valueOf(cp));
}
/**
 * Scratch buffer reused by hasDecomposition; cleared before every use.
 */
private StringBuffer hasDecompositionBuffer = new StringBuffer();

/**
 * Reports whether normalizing the code point in this normalizer's form changes it.
 * The comparison is made on whole strings, so an unchanged supplementary code point
 * (two chars in UTF-16) is correctly reported as having no decomposition.
 * @param cp the code point to test
 * @return true if the normalized text differs from the code point itself
 */
public boolean hasDecomposition(int cp) {
    String original = UTF16.valueOf(cp);
    hasDecompositionBuffer.setLength(0);
    normalize(original, hasDecompositionBuffer);
    return !original.equals(hasDecompositionBuffer.toString());
}
/**
* Utility: Asks the normalization data whether normalizing a single code point changes it,
* using this normalizer's composition and compatibility settings.
* @param ch the source code point
*/
public boolean normalizationDiffers(int ch) {
return data.normalizationDiffers(ch, composition, compatibility);
}
/**
* Utility: Gets recursive decomposition of a character from the
* Unicode Character Database. Whether the canonical or the compatibility AND canonical
* decomposition is used follows this normalizer's compatibility setting.
* Because the parameter is a char, only BMP code units can be passed here.
* @param ch the source character
* @param buffer buffer to be filled with the decomposition
*/
public void getRecursiveDecomposition(char ch, StringBuffer buffer) {
data.getRecursiveDecomposition(ch, buffer, compatibility);
}
// ======================================
// PRIVATES
// ======================================
/**
* The current form.
*/
private boolean composition;
private boolean compatibility;
/**
* Decomposes text, either canonical or compatibility according to this normalizer's
* compatibility setting, adding the result to the target buffer in canonical order.
* Existing target content is kept; a combining mark may be inserted in front of
* higher-class characters already at the end of the target.
* @param source the original text, unnormalized
* @param target the buffer receiving the decomposed text
*/
private void internalDecompose(String source, StringBuffer target) {
StringBuffer buffer = new StringBuffer();
int ch32;
for (int i = 0; i < source.length(); i += UTF16.getCharCount(ch32)) {
buffer.setLength(0);
ch32 = UTF16.charAt(source, i);
data.getRecursiveDecomposition(ch32, buffer, compatibility);
// add all of the characters in the decomposition.
// (may be just the original character, if there was
// no decomposition mapping)
int ch;
for (int j = 0; j < buffer.length(); j += UTF16.getCharCount(ch)) {
ch = UTF16.charAt(buffer, j);
int chClass = data.getCanonicalClass(ch);
int k = target.length(); // insertion point
if (chClass != 0) {
// bubble-sort combining marks as necessary
// walk k backwards over code points whose class is greater than chClass;
// stop at the first one with a class less than or equal (this includes class 0)
int ch2;
for (; k > 0; k -= UTF16.getCharCount(ch2)) {
ch2 = UTF16.charAt(target, k-1);
if (data.getCanonicalClass(ch2) <= chClass) break;
}
}
target.insert(k, UTF16.valueOf(ch));
}
}
}
/**
* Composes text in place. Target must already
* have been decomposed.
* Uses UTF16, which is a utility class for supplementary character support in Java.
* The buffer is rewritten from the front: characters that do not combine are copied down to
* the write position and the buffer is truncated to the composed length at the end.
* NOTE(review): the first code point is read unconditionally, so the target is presumably
* required to be non-empty - confirm against callers.
* @param target input: decomposed text.
* output: the resulting normalized text.
*/
private void internalCompose(StringBuffer target) {
// starterPos/starterCh: position and value of the starter that later marks may combine with
int starterPos = 0;
int starterCh = UTF16.charAt(target,0);
// compPos: write position, i.e. the length of the composed output so far
int compPos = UTF16.getCharCount(starterCh); // length of last composition
// lastClass: canonical class of the last character that was kept without combining
int lastClass = data.getCanonicalClass(starterCh);
// 256 makes both (lastClass < chClass) and (lastClass == 0) false in the test below
// as long as classes stay below 256, so nothing combines with a leading mark
if (lastClass != 0) lastClass = 256; // fix for strings starting with a combining mark
int oldLen = target.length();
// Loop on the decomposed characters, combining where possible
int ch;
for (int decompPos = compPos; decompPos < target.length(); decompPos += UTF16.getCharCount(ch)) {
ch = UTF16.charAt(target, decompPos);
if (SHOW_PROGRESS) System.out.println(Utility.hex(target)
+ ", decompPos: " + decompPos
+ ", compPos: " + compPos
+ ", ch: " + Utility.hex(ch)
);
int chClass = data.getCanonicalClass(ch);
int composite = data.getPairwiseComposition(starterCh, ch);
// combine only if the pair composes and ch is not blocked: either the previous kept
// character had a lower class, or it was a starter (class 0)
if (composite != data.NOT_COMPOSITE
&& (lastClass < chClass || lastClass == 0)) {
UTF16.setCharAt(target, starterPos, composite);
// we know that we will only be replacing non-supplementaries by non-supplementaries
// so we don't have to adjust the decompPos
starterCh = composite;
} else {
// ch is kept: a class-0 character becomes the new starter
if (chClass == 0) {
starterPos = compPos;
starterCh = ch;
}
lastClass = chClass;
UTF16.setCharAt(target, compPos, ch);
// if setCharAt changed the buffer length, shift the read position by the same amount
if (target.length() != oldLen) { // MAY HAVE TO ADJUST!
System.out.println("ADJUSTING: " + Utility.hex(target));
decompPos += target.length() - oldLen;
oldLen = target.length();
}
compPos += UTF16.getCharCount(ch);
}
}
// drop the leftover tail beyond the composed text
target.setLength(compPos);
}
// The following class makes use of the UCD class, which accesses data in the Unicode Character Database
static class Stub {
    private UCD ucd;
    // Maps a packed (first, second) code point pair (see pairKey) to the
    // Integer code point that canonically decomposes to exactly that pair.
    private HashMap compTable = new HashMap();
    // Code points that can appear as the second element of a composable
    // pair, plus every code point for which ucd.isNonLeadJamo is true.
    private BitSet isSecond = new BitSet();
    // Code points that have an entry in compTable.
    private BitSet canonicalRecompose = new BitSet();
    // Subset of canonicalRecompose whose two components both have a
    // decomposition type <= CANONICAL.
    private BitSet compatibilityRecompose = new BitSet();

    /** Value returned by getPairwiseComposition when the pair does not compose. */
    static final int NOT_COMPOSITE = 0xFFFF;

    /**
     * Builds the composition tables for one version of the Unicode
     * Character Database by scanning every code point.
     *
     * @param version UCD version string handed to UCD.make
     * @throws ChainException wrapping any exception raised while processing
     *         a canonical, non-excluded decomposition (including the
     *         IllegalArgumentException thrown for mappings longer than two
     *         code points)
     */
    Stub(String version) {
        ucd = UCD.make(version);
        // Inclusive upper bound: the original loop used "<" and therefore
        // never looked at U+10FFFF, the last code point.
        for (int i = 0; i <= 0x10FFFF; ++i) {
            if (!ucd.isAssigned(i)) continue;
            if (ucd.isPUA(i)) continue;
            if (ucd.isNonLeadJamo(i)) isSecond.set(i);
            byte dt = ucd.getDecompositionType(i);
            if (dt != CANONICAL) continue;
            if (!ucd.getBinaryProperty(i, CompositionExclusion)) {
                try {
                    String s = ucd.getDecompositionMapping(i);
                    int len = UTF16.countCodePoint(s);
                    if (len != 2) {
                        if (len > 2) throw new IllegalArgumentException("BAD LENGTH: " + len + ucd.toString(i));
                        continue;
                    }
                    int a = UTF16.charAt(s, 0);
                    // A pair whose first element has a non-zero combining
                    // class is never entered into the table.
                    if (ucd.getCombiningClass(a) != 0) continue;
                    int b = UTF16.charAt(s, UTF16.getCharCount(a));
                    isSecond.set(b);
                    // have a recomposition, so set the bit
                    canonicalRecompose.set(i);
                    // set the compatibility recomposition bit
                    // ONLY if the component characters
                    // don't compatibility decompose
                    if (ucd.getDecompositionType(a) <= CANONICAL
                            && ucd.getDecompositionType(b) <= CANONICAL) {
                        compatibilityRecompose.set(i);
                    }
                    compTable.put(pairKey(a, b), new Integer(i));
                } catch (Exception e) {
                    throw new ChainException("Error: {0}", new Object[]{ucd.toString(i)}, e);
                }
            }
        }
    }

    /**
     * Packs two code points into the key used by compTable: the first in
     * the upper 32 bits, the second OR-ed into the lower bits. Shared by
     * table construction and lookup so both always agree on the layout.
     */
    private static Long pairKey(int first, int second) {
        return new Long((((long) first) << 32) | second);
    }

    /** Returns the combining class that the UCD reports for cp. */
    short getCanonicalClass(int cp) {
        return ucd.getCombiningClass(cp);
    }

    /**
     * Returns true if cp was recorded during construction as a possible
     * second element of a composition (or as a non-lead Jamo).
     */
    boolean isTrailing(int cp) {
        return isSecond.get(cp);
    }

    /**
     * Tells whether cp is changed by the requested normalization form,
     * judged from its decomposition type and the recomposition bit sets.
     *
     * @param cp code point to test
     * @param composition true for a composing form, false for a decomposing form
     * @param compatibility true to also count decomposition types above CANONICAL
     * @return true if cp decomposes under the form and, for composing
     *         forms, is not flagged as recomposing
     */
    boolean normalizationDiffers(int cp, boolean composition, boolean compatibility) {
        byte dt = ucd.getDecompositionType(cp);
        if (!composition) {
            if (compatibility) return dt >= CANONICAL;
            else return dt == CANONICAL;
        } else {
            // almost the same, except that we add back in the characters
            // that RECOMPOSE
            if (compatibility) return dt >= CANONICAL && !compatibilityRecompose.get(cp);
            else return dt == CANONICAL && !canonicalRecompose.get(cp);
        }
    }

    /**
     * Appends the full decomposition of cp to buffer, expanding each code
     * point of a mapping recursively until nothing decomposes any further.
     *
     * @param cp code point to decompose
     * @param buffer receives the result; existing contents are kept
     * @param compatibility true to also follow mappings whose type is above CANONICAL
     */
    public void getRecursiveDecomposition(int cp, StringBuffer buffer, boolean compatibility) {
        byte dt = ucd.getDecompositionType(cp);
        // we know we decompose all CANONICAL, plus > CANONICAL if compatibility is TRUE.
        if (dt == CANONICAL || dt > CANONICAL && compatibility) {
            String mapping = ucd.getDecompositionMapping(cp);
            int i = 0;
            while (i < mapping.length()) {
                int part = UTF16.charAt(mapping, i);
                getRecursiveDecomposition(part, buffer, compatibility);
                i += UTF16.getCharCount(part);
            }
        } else {
            UTF16.append(buffer, cp);
        }
    }

    /**
     * Looks up the composite for a (starter, following) pair. The result of
     * UCD.composeHangul is used whenever it is not 0xFFFF; otherwise the
     * table built in the constructor is consulted.
     *
     * @return the composite code point, or NOT_COMPOSITE if there is none
     */
    int getPairwiseComposition(int starterCh, int ch) {
        int hangulPoss = UCD.composeHangul(starterCh, ch);
        if (hangulPoss != 0xFFFF) return hangulPoss;
        Object obj = compTable.get(pairKey(starterCh, ch));
        if (obj == null) return NOT_COMPOSITE;
        return ((Integer)obj).intValue();
    }
}
/**
* Contains normalization data from the Unicode Character Database.
* use false for the minimal set, true for the real set.
*/
private Stub data;
private static HashMap versionCache = new HashMap();
private static Stub getData (String version) {
    // An empty version string selects UCD.latestVersion.
    String key = (version.length() == 0) ? UCD.latestVersion : version;

    // Reuse the Stub already built for this version, if there is one.
    Object cached = versionCache.get(key);
    if (cached != null) {
        return (Stub) cached;
    }

    // First request for this version: build the data and remember it.
    Stub created = new Stub(key);
    versionCache.put(key, created);
    return created;
}
}

View File

@ -1,109 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/OldUnicodeMap.java,v $
* $Date: 2005/03/04 02:50:26 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
* Class that maps from codepoints to an index, and optionally a label.
*/
public class OldUnicodeMap {
    // Parallel arrays: entry i is the set sets[i] with the label labels[i].
    // Both start with 50 slots and are grown on demand by ensureCapacity.
    // Slots below count may be null when put() was called with an index that
    // skipped ahead, and a label stays null until add() or setLabel() sets it.
    UnicodeSet[] sets = new UnicodeSet[50];
    String[] labels = new String[50];
    int count = 0;

    /**
     * Adds a labelled set; collisions with earlier entries are removed from
     * the new set and reported on System.out.
     *
     * @return the index assigned to the new entry
     */
    public int add(String label, UnicodeSet set) {
        return add(label, set, false, true);
    }

    /**
     * Adds a labelled set as a new entry at the end of the map.
     * NOTE(review): the collision handling reassigns the result of
     * UnicodeSet.removeAll; if that call modifies its receiver, the caller's
     * set (removeOld false) or the stored sets (removeOld true) are changed
     * in place -- confirm against the UnicodeSet API.
     *
     * @param label label stored for the new entry
     * @param set code points of the new entry
     * @param removeOld true: remove any collisions from sets already in the map;
     *        false: remove any collisions from this set
     * @param signal true to print a warning when collisions occur
     * @return the index assigned to the new entry
     */
    public int add(String label, UnicodeSet set, boolean removeOld, boolean signal) {
        // remove from any preceding!!
        for (int i = 0; i < count; ++i) {
            if (sets[i] == null) continue; // gap left by put()
            if (!set.containsSome(sets[i])) continue;
            if (signal) showOverlap(label, set, i);
            if (removeOld) {
                sets[i] = sets[i].removeAll(set);
            } else {
                set = set.removeAll(sets[i]);
            }
        }
        ensureCapacity(count + 1);
        sets[count] = set;
        labels[count++] = label;
        // Returned as a plain int; the original narrowed through (short).
        return count - 1;
    }

    /**
     * Prints to System.out the code points that set shares with entry i.
     *
     * @param label label of the set being added
     * @param set set being added; it is copied, not modified
     * @param i index of the existing entry it collides with
     */
    public void showOverlap(String label, UnicodeSet set, int i) {
        UnicodeSet delta = new UnicodeSet(set).retainAll(sets[i]);
        System.out.println("Warning! Overlap with " + label + " and " + labels[i]
            + ": " + delta);
    }

    /**
     * Finds the entry containing a code point, searching from the most
     * recently added entry backwards.
     *
     * @return the entry index, or -1 if no entry contains the code point
     */
    public int getIndex(int codepoint) {
        for (int i = count - 1; i >= 0; --i) {
            if (sets[i] != null && sets[i].contains(codepoint)) return i;
        }
        return -1;
    }

    /**
     * Finds an entry by label, ignoring case, searching from the most
     * recently added entry backwards. Entries without a label are skipped.
     *
     * @return the entry index, or -1 if no entry has that label
     */
    public int getIndexFromLabel(String label) {
        for (int i = count - 1; i >= 0; --i) {
            if (labels[i] != null && labels[i].equalsIgnoreCase(label)) return i;
        }
        return -1;
    }

    /** Returns the label of the entry containing codepoint, or null if there is none. */
    public String getLabel(int codepoint) {
        return getLabelFromIndex(getIndex(codepoint));
    }

    /** Returns the label stored at index, or null if index is outside 0..size()-1. */
    public String getLabelFromIndex(int index) {
        if (index < 0 || index >= count) return null;
        return labels[index];
    }

    /**
     * Returns a copy of the set stored at index, or null if index is
     * outside 0..size()-1 or the slot was never filled.
     */
    public UnicodeSet getSetFromIndex(int index) {
        if (index < 0 || index >= count || sets[index] == null) return null;
        return new UnicodeSet(sets[index]); // protect from changes
    }

    /** Returns the number of entry slots in use (one more than the highest used index). */
    public int size() {
        return count;
    }

    /**
     * Stores a label at index without touching the set or the entry count.
     *
     * @return index, unchanged
     */
    public int setLabel(int index, String label) {
        ensureCapacity(index + 1);
        labels[index] = label;
        return index;
    }

    /**
     * Adds a single code point to the set at index, creating an empty set
     * there first if the slot is unused and extending the entry count to
     * cover index when necessary.
     *
     * @return index, unchanged
     */
    public int put(int codepoint, int index) {
        ensureCapacity(index + 1);
        if (sets[index] == null) {
            sets[index] = new UnicodeSet();
            if (index >= count) count = index + 1;
        }
        sets[index].add(codepoint);
        return index;
    }

    /**
     * Grows both parallel arrays so that they hold at least needed slots.
     * A negative index still fails at the array access, as it did before.
     */
    private void ensureCapacity(int needed) {
        if (needed > sets.length) {
            UnicodeSet[] grownSets = new UnicodeSet[Math.max(needed, sets.length * 2)];
            System.arraycopy(sets, 0, grownSets, 0, sets.length);
            sets = grownSets;
        }
        if (needed > labels.length) {
            String[] grownLabels = new String[Math.max(needed, labels.length * 2)];
            System.arraycopy(labels, 0, grownLabels, 0, labels.length);
            labels = grownLabels;
        }
    }
}

View File

@ -1,76 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ProcessUnihan.java,v $
* $Date: 2005/03/04 02:50:26 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.*;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UTF16;
import java.util.*;
/**
 * Stub file; ignore. The class declares no members: everything between the
 * braces is one block comment holding an unfinished reader for the Unihan
 * data file, so nothing in it is compiled or run.
 */
public final class ProcessUnihan {
/*
NOTE(review): the draft below strips everything from the first '#' of each line
and afterwards splits the remainder on '#' again, so the split presumably never
finds a separator -- the intended field separator needs confirming before this
code is revived.
static final boolean TESTING = false;
static int type;
public static void main() {
try {
type = 0;
System.out.println("Starting");
process();
} catch (Exception e) {
System.out.println("Exception: " + e);
}
}
static PrintWriter out;
static PrintWriter err;
static int count;
static int oldLine;
static Map map = new HashMap();
static Map tags = new HashMap();
static void process() throws java.io.IOException {
int lineCounter = 0;
String[] parts = new String[3];
//out = Utility.openPrintWriter("Transliterate_Han_English.txt");
//err = Utility.openPrintWriter("Transliterate_Han_English.log.txt");
BufferedReader in = Utility.openUnicodeFile("Unihan", "3.2.0", true, Utility.UTF8);
while (true) {
Utility.dot(++lineCounter);
String line = in.readLine();
if (line == null) break;
int commentPos = line.indexOf('#');
if (commentPos >= 0) line = line.substring(0,commentPos);
line = line.trim();
if (line.length() == 0) continue;
int count = Utility.split(line, '#', parts);
int code = Integer.parseInt(parts[0].substring(2), 16);
Byte itag = (Byte) tags.get("a");
if (itag == null) {}
String tag = parts[1];
String value = parts[2];
if (tags.containsKey(tag)) {}
}
}
*/
}

View File

@ -1,41 +0,0 @@
#
# This file contains aliases for properties used in the UCD.
# These names can be used for XML formats of UCD data, for regular-expression
# property tests, and other programmatic textual descriptions of Unicode data.
# For information on which properties are normative, see UCD.html.
#
# The names may be translated in appropriate environments, and additional
# aliases may be useful.
#
# FORMAT
#
# Each line has two or more fields, separated by semicolons.
#
# First Field: The first field is an abbreviated name for the property.
#
# Second Field: The second field is a long name
#
# The above are the preferred aliases. Other aliases may be listed in additional fields.
#
# Loose matching should be applied to all property names and property values, with
# the exception of String Property values. With loose matching of property names and
# values, the case distinctions, whitespace, and '_' are ignored. For Numeric Property
# values, numeric equivalencies are applied: thus "01.00" is equivalent to "1".
#
# NOTE: Property value names are NOT unique across properties. For example:
#
# AL means Arabic Letter for the Bidi_Class property, and
# AL means Above_Left for the Combining_Class property, and
# AL means Alphabetic for the Line_Break property.
#
# In addition, some property names may be the same as some property value names.
# For example:
#
# sc means the Script property, and
# Sc means the General_Category property value Currency_Symbol (Sc)
#
# The combination of property value and property name is, however, unique.
#
# For more information, see UTS #18: Regular Expression Guidelines
# ================================================

View File

@ -1,248 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/PropertyLister.java,v $
* $Date: 2003/03/19 17:30:56 $
* $Revision: 1.11 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.*;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UnicodeSet;
import java.text.NumberFormat;
/**
 * Base class for writing a listing of code points to a PrintWriter.
 * print() walks every code point from 0 to 0x10FFFF, asks the subclass for a
 * status per code point, groups consecutive included code points into ranges
 * and hands each finished range to format(), which writes one line for it.
 * Subclasses supply status() and may override the naming hooks (headerString,
 * valueName, optionalName, optionalComment, missingValueName, minPropertyWidth).
 */
abstract public class PropertyLister implements UCD_Types {
// When true, format() factors the common leading words out of the two range-end names.
static final boolean COMPRESS_NAMES = false;
// When true, getKenName() replaces names that start with '<'.
static final boolean DROP_INDICATORS = true;
// Source of character names and "ModCat" values.
protected UCD ucdData;
// Destination of all listing output.
protected PrintWriter output;
// When true, every formatted line is also echoed to System.out.
protected boolean showOnConsole;
// When false, optionalComment() returns the empty string.
protected boolean usePropertyComment = true;
// Selects the mask passed to ucdData.getModCat: CASED_LETTER_MASK when true, 0 when false.
protected boolean breakByCategory = true;
// First and last included code point of the range currently being collected by
// print(); print() uses -1 for "no open range". The initial -2 is not tested
// anywhere in this class.
protected int firstRealCp = -2;
protected int lastRealCp = -2;
protected boolean alwaysBreaks = false; // set to true if property only breaks
// When true, each output line is prefixed with "# " and the inner comment separator is dropped.
protected boolean commentOut = false;
protected boolean useKenName = true; // set to false to get meaningful names
// Accumulates every range passed to format(); cleared at the start of print().
private UnicodeSet set = new UnicodeSet();
// Status values returned by status(), as interpreted by print():
// INCLUDE - the code point joins (or opens) the current range
// BREAK - the current range is written out and a new one starts at this code point
// CONTINUE - the code point is ignored and the current range stays open
// EXCLUDE - the current range is written out and no new range is opened
public static final byte INCLUDE = 0, BREAK = 1, CONTINUE = 2, EXCLUDE = 3;
/**
* Classifies one code point for print().
* @param cp the code point being visited
* @return status: one of INCLUDE, BREAK, CONTINUE, EXCLUDE.
* Also have access to firstRealCp, lastRealCp
*/
abstract public byte status(int cp);
/**
* @return text written, followed by a blank line, before the listing;
* nothing is written when it is empty (the default)
*/
public String headerString() {
return "";
}
/**
* @return the value text placed after "; " on a range's line; empty (the default) omits the field
*/
public String valueName(int cp) {
return "";
}
/**
* @return the name used in the "No values for" messages when print() counted no code points
*/
public String missingValueName() {
return "";
}
/**
* @return an extra field placed after the value, preceded by "; "; empty (the default) omits it
*/
public String optionalName(int cp) {
return "";
}
/**
* @return the text placed after the comment separator: the ModCat ID of cp,
* or the empty string when usePropertyComment is false
*/
public String optionalComment(int cp) {
if (!usePropertyComment) return "";
return ucdData.getModCatID_fromIndex(getModCat(cp));
}
/**
* @return the width that the value and optional-name fields together are padded to
*/
public int minPropertyWidth() {
return 1;
}
/**
* Writes the line for one range to output (and to System.out when showOnConsole is set),
* and adds the range to the accumulated set.
* @param startCp first code point of the range; also the code point whose value,
* optional name and optional comment are shown
* @param endCp last code point of the range; equal to startCp for a single code point
* @param realCount number of code points in the range that were actually included;
* the remainder of the span is shown after a "/" as the bridged count
* @throws ChainException wrapping any exception raised while building or writing the line
*/
public void format(int startCp, int endCp, int realCount) {
try {
set.add(startCp, endCp);
String prop = valueName(startCp);
String opt = "";
String optCom = "";
String commentSep = " # ";
if (commentOut) commentSep = "";
if (prop.length() > 0) prop = "; " + prop;
opt = optionalName(startCp);
if (opt.length() > 0) opt = "; " + opt;
optCom = optionalComment(startCp);
if (optCom.length() > 0) optCom += " ";
String startName = getKenName(startCp);
String line;
// Padding that brings prop + opt up to minPropertyWidth().
// NOTE(review): the repeat counts here and below can be negative for long
// fields; the result then depends on how Utility.repeat treats that -- confirm.
String pgap = Utility.repeat(" ", minPropertyWidth() - prop.length() - opt.length());
if (startCp != endCp) {
// Range line: "START..END <fields> [count] firstName..lastName"
String endName = getKenName(endCp);
// Code points inside the span that were not counted as included.
int bridge = endCp - startCp + 1 - realCount;
String count = (bridge == 0) ? "" + realCount : realCount + "/" + bridge;
String countStr = Utility.repeat(" ", 3-count.length()) + "[" + count + "] ";
String gap = Utility.repeat(" ", 12 - width(startCp) - width(endCp));
line = Utility.hex(startCp,4) + ".." + Utility.hex(endCp,4) + gap
+ prop + opt + pgap + commentSep + optCom
+ countStr;
if (startName.length() != 0 || endName.length() != 0) {
int com = 0;
if (COMPRESS_NAMES) com = commonInitialWords(startName, endName);
if (com == 0) {
line += startName + ".." + endName;
} else {
// Shared leading words once, then the differing tails in parentheses.
line += startName.substring(0,com)
+ "(" + startName.substring(com) + ".." + endName.substring(com) + ")";
}
}
} else {
// Single code point line: "CP <fields> name"
String gap = alwaysBreaks
? Utility.repeat(" ", 6 - width(startCp))
: Utility.repeat(" ", 14 - width(startCp));
String gap2 = alwaysBreaks
? " "
: " ";
line = Utility.hex(startCp,4) + gap
+ prop + opt + pgap + commentSep + optCom + gap2
+ startName;
}
if (commentOut) {
line = "# " + line;
}
output.println(line);
if (showOnConsole) System.out.println(line);
} catch (Exception e) {
throw new ChainException("Format error {0}, {1}",
new Object[]{new Integer(startCp), new Integer(endCp)}, e);
}
}
/**
* @return the column width budgeted for cp: 4 up to 0xFFFF, 5 up to 0xFFFFF, otherwise 6
*/
int width(int cp) {
return cp <= 0xFFFF ? 4
: cp <= 0xFFFFF ? 5
: 6;
}
/**
* Returns the name shown for cp on an output line.
* With useKenName false the result of ucdData.getName is returned untouched (it may be null).
* Otherwise null becomes "", and, when DROP_INDICATORS is set, a name starting with '<'
* becomes "<control>" for cp below 0xFF and "" for everything else.
* NOTE(review): charAt(0) assumes getName never returns an empty string -- confirm.
*/
String getKenName(int cp) {
String result = ucdData.getName(cp);
if (!useKenName) return result;
if (result == null) return "";
if (DROP_INDICATORS && result.charAt(0) == '<') {
if (cp < 0xFF) return "<control>";
return "";
}
return result;
}
/**
* @return ucdData.getModCat for cp, using CASED_LETTER_MASK when breakByCategory is set and 0 otherwise
*/
byte getModCat(int cp) {
byte result = ucdData.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : 0);
return result;
}
/**
* Measures the leading words shared by two names. The two arguments are interchangeable.
* @return common initial substring length ending with SPACE or HYPHEN-MINUS. 0 if there is none.
* When the shorter name is a complete prefix of the longer one and the longer one ends
* there or continues with SPACE or HYPHEN-MINUS, the length of the shorter name is returned.
*/
public static int commonInitialWords(String a, String b) {
// Make a the shorter string so the loop below can index both safely.
if (a.length() > b.length()) {
String temp = a;
a = b;
b = temp;
}
// Index just past the last word separator seen while the strings still agree.
int lastSpace = 0;
for (int i = 0; i < a.length(); ++i) {
char ca = a.charAt(i);
char cb = b.charAt(i);
if (ca != cb) return lastSpace;
if (ca == ' ' || ca == '-') lastSpace = i + 1;
}
// All of a matched: it counts as a whole word if b stops or has a separator right there.
if (b.length() == a.length() || b.charAt(a.length()) == ' ' || b.charAt(a.length()) == '-') {
lastSpace = a.length();
}
return lastSpace;
}
/**
* Produces the whole listing: the optional header, one line per range of code points,
* a "No values" note when nothing was listed, and a trailing total.
* @return the number of code points whose status was INCLUDE or BREAK
*/
public int print() {
set.clear();
int count = 0;
firstRealCp = -1;
// ModCat of the first code point of the open range.
byte firstRealCpCat = -1;
lastRealCp = -1;
// Included code points in the open range; passed to format() as realCount.
int realRangeCount = 0;
String header = headerString();
if (header.length() != 0) {
// System.out.println(header);
output.println(header);
output.println();
}
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
byte s = status(cp);
if (alwaysBreaks && s == INCLUDE) s = BREAK;
// An included code point whose ModCat differs from the one that opened the
// current range forces a break, so one range never mixes ModCat values.
if (s == INCLUDE && firstRealCp != -1) {
if (getModCat(cp) != firstRealCpCat) s = BREAK;
}
switch(s) {
case CONTINUE:
break; // do nothing
case INCLUDE:
if (firstRealCp == -1) {
firstRealCp = cp;
firstRealCpCat = getModCat(firstRealCp);
}
lastRealCp = cp;
count++;
realRangeCount++;
break;
case BREAK:
// Flush the open range, then start a new one consisting of just cp.
if (firstRealCp != -1) {
format(firstRealCp, lastRealCp, realRangeCount);
}
lastRealCp = firstRealCp = cp;
firstRealCpCat = getModCat(firstRealCp);
realRangeCount = 1;
count++;
break;
case EXCLUDE:
// Flush the open range and leave no range open.
if (firstRealCp != -1) {
format(firstRealCp, lastRealCp, realRangeCount);
firstRealCp = -1;
realRangeCount = 0;
}
break;
}
}
// Flush the range that was still open when the scan reached the last code point.
if (firstRealCp != -1) {
format(firstRealCp, lastRealCp, realRangeCount);
}
if (count == 0) {
output.println("# No values for " + missingValueName());
System.out.println("ZERO COUNT for " + missingValueName());
}
// Plain integer formatting for the total: no fraction digits, no grouping separators.
NumberFormat nf = NumberFormat.getInstance();
nf.setMaximumFractionDigits(0);
nf.setGroupingUsed(false);
output.println();
output.println("# Total code points: " + nf.format(count));
output.println();
//System.out.println(headerString());
//System.out.println(set.toPattern(true));
return count;
}
}

View File

@ -1,49 +0,0 @@
#
# This file contains aliases for property values used in the UCD.
# These names can be used for XML formats of UCD data, for regular-expression
# property tests, and other programmatic textual descriptions of Unicode data.
# For information on which properties are normative, see UCD.html.
#
# The names may be translated in appropriate environments, and additional
# aliases may be useful.
#
# FORMAT
#
# Each line describes a property value name.
# This consists of three or more fields, separated by semicolons.
#
# First Field: The first field describes the property for which that
# property value name is used.
#
# Second Field: The second field is an abbreviated name.
# If there is no abbreviated name available, the field is marked with "n/a".
#
# Third Field: The third field is a long name.
#
# In the case of ccc, there are 4 fields. The second field is numeric, third
# is abbreviated, and fourth is long.
#
# The above are the preferred aliases. Other aliases may be listed in additional fields.
#
# Loose matching should be applied to all property names and property values, with
# the exception of String Property values. With loose matching of property names and
# values, the case distinctions, whitespace, and '_' are ignored. For Numeric Property
# values, numeric equivalencies are applied: thus "01.00" is equivalent to "1".
#
# NOTE: Property value names are NOT unique across properties. For example:
#
# AL means Arabic Letter for the Bidi_Class property, and
# AL means Above_Left for the Combining_Class property, and
# AL means Alphabetic for the Line_Break property.
#
# In addition, some property names may be the same as some property value names.
# For example:
#
# sc means the Script property, and
# Sc means the General_Category property value Currency_Symbol (Sc)
#
# The combination of property value and property name is, however, unique.
#
# For more information, see UTS #18: Regular Expression Guidelines
# ================================================

File diff suppressed because it is too large Load Diff

View File

@ -1,266 +0,0 @@
package com.ibm.text.UCD;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
public class ScriptExceptions {
public static UnicodeSet getExceptions() {
UnicodeSet contents = new UnicodeSet();
// "FAIL: " => "contents.add(0x"
// ";" => ");//"
// ".." => ", 0x"
contents.add(0x005E);// COMMON # (Sk) CIRCUMFLEX ACCENT
contents.add(0x0060);// COMMON # (Sk) GRAVE ACCENT
contents.add(0x00A8);// COMMON # (Sk) DIAERESIS
contents.add(0x00AF);// COMMON # (Sk) MACRON
contents.add(0x00B4);// COMMON # (Sk) ACUTE ACCENT
contents.add(0x00B8);// COMMON # (Sk) CEDILLA
contents.add(0x02B9, 0x02BA);// COMMON # (Sk) MODIFIER LETTER PRIME, 0xMODIFIER LETTER DOUBLE PRIME
contents.add(0x02C2, 0x02CF);// COMMON # (Sk) MODIFIER LETTER LEFT ARROWHEAD, 0xMODIFIER LETTER LOW ACUTE ACCENT
contents.add(0x02D2, 0x02DF);// COMMON # (Sk) MODIFIER LETTER CENTRED RIGHT HALF RING, 0xMODIFIER LETTER CROSS ACCENT
contents.add(0x02E5, 0x02ED);// COMMON # (Sk) MODIFIER LETTER EXTRA-HIGH TONE BAR, 0xMODIFIER LETTER UNASPIRATED
contents.add(0x0374, 0x0375);// COMMON # (Sk) GREEK NUMERAL SIGN, 0xGREEK LOWER NUMERAL SIGN
contents.add(0x0384, 0x0385);// COMMON # (Sk) GREEK TONOS, 0xGREEK DIALYTIKA TONOS
contents.add(0x1FBD);// COMMON # (Sk) GREEK KORONIS
contents.add(0x1FBF, 0x1FC1);// COMMON # (Sk) GREEK PSILI, 0xGREEK DIALYTIKA AND PERISPOMENI
contents.add(0x1FCD, 0x1FCF);// COMMON # (Sk) GREEK PSILI AND VARIA, 0xGREEK PSILI AND PERISPOMENI
contents.add(0x1FDD, 0x1FDF);// COMMON # (Sk) GREEK DASIA AND VARIA, 0xGREEK DASIA AND PERISPOMENI
contents.add(0x1FED, 0x1FEF);// COMMON # (Sk) GREEK DIALYTIKA AND VARIA, 0xGREEK VARIA
contents.add(0x1FFD, 0x1FFE);// COMMON # (Sk) GREEK OXIA, 0xGREEK DASIA
contents.add(0x309B, 0x309C);// COMMON # (Sk) KATAKANA-HIRAGANA VOICED SOUND MARK, 0xKATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
contents.add(0xFF3E);// COMMON # (Sk) FULLWIDTH CIRCUMFLEX ACCENT
contents.add(0xFF40);// COMMON # (Sk) FULLWIDTH GRAVE ACCENT
contents.add(0xFFE3);// COMMON # (Sk) FULLWIDTH MACRON
contents.add(0x0640);// COMMON # (Lm) ARABIC TATWEEL
contents.add(0x3006);// COMMON # (Lo) IDEOGRAPHIC CLOSING MARK
contents.add(0x303C);// COMMON # (Lo) MASU MARK
contents.add(0x2135, 0x2138);// COMMON # (Lo) ALEF SYMBOL..DALET SYMBOL
contents.add(0x1714);// TAGALOG # (Mn) TAGALOG SIGN VIRAMA
contents.add(0x1734);// HANUNOO # (Mn) HANUNOO SIGN PAMUDPOD
//contents.add(0x0F3E, 0x0F3F);// COMMON # (Mc) TIBETAN SIGN YAR TSHES, 0xTIBETAN SIGN MAR TSHES
contents.add(0x2071);// COMMON # (LC) SUPERSCRIPT LATIN SMALL LETTER I
contents.add(0x2102);// COMMON # (LC) DOUBLE-STRUCK CAPITAL C
contents.add(0x2107);// COMMON # (LC) EULER CONSTANT
contents.add(0x210A, 0x2113);// COMMON # (LC) SCRIPT SMALL G, 0xSCRIPT SMALL L
contents.add(0x2115);// COMMON # (LC) DOUBLE-STRUCK CAPITAL N
contents.add(0x2119, 0x211D);// COMMON # (LC) DOUBLE-STRUCK CAPITAL P, 0xDOUBLE-STRUCK CAPITAL R
contents.add(0x2124);// COMMON # (LC) DOUBLE-STRUCK CAPITAL Z
contents.add(0x2128);// COMMON # (LC) BLACK-LETTER CAPITAL Z
contents.add(0x212C, 0x212D);// COMMON # (LC) SCRIPT CAPITAL B, 0xBLACK-LETTER CAPITAL C
contents.add(0x212F, 0x2131);// COMMON # (LC) SCRIPT SMALL E, 0xSCRIPT CAPITAL F
contents.add(0x2133, 0x2134);// COMMON # (LC) SCRIPT CAPITAL M, 0xSCRIPT SMALL O
contents.add(0x2139);// COMMON # (LC) INFORMATION SOURCE
contents.add(0x213D, 0x213F);// COMMON # (LC) DOUBLE-STRUCK SMALL GAMMA, 0xDOUBLE-STRUCK CAPITAL PI
contents.add(0x2145, 0x2149);// COMMON # (LC) DOUBLE-STRUCK ITALIC CAPITAL D, 0xDOUBLE-STRUCK ITALIC SMALL J
contents.add(0x1D400, 0x1D454);// COMMON # (LC) MATHEMATICAL BOLD CAPITAL A, 0xMATHEMATICAL ITALIC SMALL G
contents.add(0x1D456, 0x1D49C);// COMMON # (LC) MATHEMATICAL ITALIC SMALL I, 0xMATHEMATICAL SCRIPT CAPITAL A
contents.add(0x1D49E, 0x1D49F);// COMMON # (LC) MATHEMATICAL SCRIPT CAPITAL C, 0xMATHEMATICAL SCRIPT CAPITAL D
contents.add(0x1D4A2);// COMMON # (LC) MATHEMATICAL SCRIPT CAPITAL G
contents.add(0x1D4A5, 0x1D4A6);// COMMON # (LC) MATHEMATICAL SCRIPT CAPITAL J, 0xMATHEMATICAL SCRIPT CAPITAL K
contents.add(0x1D4A9, 0x1D4AC);// COMMON # (LC) MATHEMATICAL SCRIPT CAPITAL N, 0xMATHEMATICAL SCRIPT CAPITAL Q
contents.add(0x1D4AE, 0x1D4B9);// COMMON # (LC) MATHEMATICAL SCRIPT CAPITAL S, 0xMATHEMATICAL SCRIPT SMALL D
contents.add(0x1D4BB);// COMMON # (LC) MATHEMATICAL SCRIPT SMALL F
contents.add(0x1D4BD, 0x1D4C0);// COMMON # (LC) MATHEMATICAL SCRIPT SMALL H, 0xMATHEMATICAL SCRIPT SMALL K
contents.add(0x1D4C2, 0x1D4C3);// COMMON # (LC) MATHEMATICAL SCRIPT SMALL M, 0xMATHEMATICAL SCRIPT SMALL N
contents.add(0x1D4C5, 0x1D505);// COMMON # (LC) MATHEMATICAL SCRIPT SMALL P, 0xMATHEMATICAL FRAKTUR CAPITAL B
contents.add(0x1D507, 0x1D50A);// COMMON # (LC) MATHEMATICAL FRAKTUR CAPITAL D, 0xMATHEMATICAL FRAKTUR CAPITAL G
contents.add(0x1D50D, 0x1D514);// COMMON # (LC) MATHEMATICAL FRAKTUR CAPITAL J, 0xMATHEMATICAL FRAKTUR CAPITAL Q
contents.add(0x1D516, 0x1D51C);// COMMON # (LC) MATHEMATICAL FRAKTUR CAPITAL S, 0xMATHEMATICAL FRAKTUR CAPITAL Y
contents.add(0x1D51E, 0x1D539);// COMMON # (LC) MATHEMATICAL FRAKTUR SMALL A, 0xMATHEMATICAL DOUBLE-STRUCK CAPITAL B
contents.add(0x1D53B, 0x1D53E);// COMMON # (LC) MATHEMATICAL DOUBLE-STRUCK CAPITAL D, 0xMATHEMATICAL DOUBLE-STRUCK CAPITAL G
contents.add(0x1D540, 0x1D544);// COMMON # (LC) MATHEMATICAL DOUBLE-STRUCK CAPITAL I, 0xMATHEMATICAL DOUBLE-STRUCK CAPITAL M
contents.add(0x1D546);// COMMON # (LC) MATHEMATICAL DOUBLE-STRUCK CAPITAL O
contents.add(0x1D54A, 0x1D550);// COMMON # (LC) MATHEMATICAL DOUBLE-STRUCK CAPITAL S, 0xMATHEMATICAL DOUBLE-STRUCK CAPITAL Y
contents.add(0x1D552, 0x1D6A3);// COMMON # (LC) MATHEMATICAL DOUBLE-STRUCK SMALL A, 0xMATHEMATICAL MONOSPACE SMALL Z
contents.add(0x1D6A8, 0x1D6C0);// COMMON # (LC) MATHEMATICAL BOLD CAPITAL ALPHA, 0xMATHEMATICAL BOLD CAPITAL OMEGA
contents.add(0x1D6C2, 0x1D6DA);// COMMON # (LC) MATHEMATICAL BOLD SMALL ALPHA, 0xMATHEMATICAL BOLD SMALL OMEGA
contents.add(0x1D6DC, 0x1D6FA);// COMMON # (LC) MATHEMATICAL BOLD EPSILON SYMBOL, 0xMATHEMATICAL ITALIC CAPITAL OMEGA
contents.add(0x1D6FC, 0x1D714);// COMMON # (LC) MATHEMATICAL ITALIC SMALL ALPHA, 0xMATHEMATICAL ITALIC SMALL OMEGA
contents.add(0x1D716, 0x1D734);// COMMON # (LC) MATHEMATICAL ITALIC EPSILON SYMBOL, 0xMATHEMATICAL BOLD ITALIC CAPITAL OMEGA
contents.add(0x1D736, 0x1D74E);// COMMON # (LC) MATHEMATICAL BOLD ITALIC SMALL ALPHA, 0xMATHEMATICAL BOLD ITALIC SMALL OMEGA
contents.add(0x1D750, 0x1D76E);// COMMON # (LC) MATHEMATICAL BOLD ITALIC EPSILON SYMBOL, 0xMATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA
contents.add(0x1D770, 0x1D788);// COMMON # (LC) MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA, 0xMATHEMATICAL SANS-SERIF BOLD SMALL OMEGA
contents.add(0x1D78A, 0x1D7A8);// COMMON # (LC) MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL, 0xMATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA
contents.add(0x1D7AA, 0x1D7C2);// COMMON # (LC) MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA, 0xMATHEMATICAL SANS-SERIF BOLD IT ALIC SMALL OMEGA
contents.add(0x1D7C4, 0x1D7C9);// COMMON # (LC) MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL, 0xMATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL
contents.add(0x02BB, 0x02C1);// COMMON # (0xLm) MODIFIER LETTER TURNED COMMA, 0xMODIFIER LETTER REVERSED GLOTTAL STOP
contents.add(0x02D0, 0x02D1);// COMMON # (0xLm) MODIFIER LETTER TRIANGULAR COLON, 0xMODIFIER LETTER HALF TRIANGULAR COLON
contents.add(0x02EE);// COMMON # (0xLm) MODIFIER LETTER DOUBLE APOSTROPHE
contents.add(0x3031, 0x3035);// COMMON # (0xLm) VERTICAL KANA REPEAT MARK, 0xVERTICAL KANA REPEAT MARK LOWER HALF
contents.add(0x30FC);// COMMON # (0xLm) KATAKANA-HIRAGANA PROLONGED SOUND MARK
contents.add(0xFF70);// COMMON # (0xLm) HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
contents.add(0xFF9E, 0xFF9F);// COMMON # (0xLm) HALFWIDTH KATAKANA VOICED SOUND MARK, 0xHALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
contents.add(0x0483, 0x0486);// CYRILLIC # (0xMn) COMBINING CYRILLIC TITLO, 0xCOMBINING CYRILLIC PSILI PNEUMATA
contents.add(0x0711);// SYRIAC # (0xMn) SYRIAC LETTER SUPERSCRIPT ALAPH
contents.add(0x0730, 0x074A);// SYRIAC # (0xMn) SYRIAC PTHAHA ABOVE, 0xSYRIAC BARREKH
contents.add(0x07A6, 0x07B0);// THAANA # (0xMn) THAANA ABAFILI, 0xTHAANA SUKUN
contents.add(0x0901, 0x0902);// DEVANAGARI # (0xMn) DEVANAGARI SIGN CANDRABINDU, 0xDEVANAGARI SIGN ANUSVARA
contents.add(0x093C);// DEVANAGARI # (0xMn) DEVANAGARI SIGN NUKTA
contents.add(0x0941, 0x0948);// DEVANAGARI # (0xMn) DEVANAGARI VOWEL SIGN U, 0xDEVANAGARI VOWEL SIGN AI
contents.add(0x094D);// DEVANAGARI # (0xMn) DEVANAGARI SIGN VIRAMA
contents.add(0x0951, 0x0954);// DEVANAGARI # (0xMn) DEVANAGARI STRESS SIGN UDATTA, 0xDEVANAGARI ACUTE ACCENT
contents.add(0x0962, 0x0963);// DEVANAGARI # (0xMn) DEVANAGARI VOWEL SIGN VOCALIC L, 0xDEVANAGARI VOWEL SIGN VOCALIC LL
contents.add(0x0981);// BENGALI # (0xMn) BENGALI SIGN CANDRABINDU
contents.add(0x09BC);// BENGALI # (0xMn) BENGALI SIGN NUKTA
contents.add(0x09C1, 0x09C4);// BENGALI # (0xMn) BENGALI VOWEL SIGN U, 0xBENGALI VOWEL SIGN VOCALIC RR
contents.add(0x09CD);// BENGALI # (0xMn) BENGALI SIGN VIRAMA
contents.add(0x09E2, 0x09E3);// BENGALI # (0xMn) BENGALI VOWEL SIGN VOCALIC L, 0xBENGALI VOWEL SIGN VOCALIC LL
contents.add(0x0A02);// GURMUKHI # (0xMn) GURMUKHI SIGN BINDI
contents.add(0x0A3C);// GURMUKHI # (0xMn) GURMUKHI SIGN NUKTA
contents.add(0x0A41, 0x0A42);// GURMUKHI # (0xMn) GURMUKHI VOWEL SIGN U, 0xGURMUKHI VOWEL SIGN UU
contents.add(0x0A47, 0x0A48);// GURMUKHI # (0xMn) GURMUKHI VOWEL SIGN EE, 0xGURMUKHI VOWEL SIGN AI
contents.add(0x0A4B, 0x0A4D);// GURMUKHI # (0xMn) GURMUKHI VOWEL SIGN OO, 0xGURMUKHI SIGN VIRAMA
contents.add(0x0A70, 0x0A71);// GURMUKHI # (0xMn) GURMUKHI TIPPI, 0xGURMUKHI ADDAK
contents.add(0x0A81, 0x0A82);// GUJARATI # (0xMn) GUJARATI SIGN CANDRABINDU, 0xGUJARATI SIGN ANUSVARA
contents.add(0x0ABC);// GUJARATI # (0xMn) GUJARATI SIGN NUKTA
contents.add(0x0AC1, 0x0AC5);// GUJARATI # (0xMn) GUJARATI VOWEL SIGN U, 0xGUJARATI VOWEL SIGN CANDRA E
contents.add(0x0AC7, 0x0AC8);// GUJARATI # (0xMn) GUJARATI VOWEL SIGN E, 0xGUJARATI VOWEL SIGN AI
contents.add(0x0ACD);// GUJARATI # (0xMn) GUJARATI SIGN VIRAMA
contents.add(0x0B01);// ORIYA # (0xMn) ORIYA SIGN CANDRABINDU
contents.add(0x0B3C);// ORIYA # (0xMn) ORIYA SIGN NUKTA
contents.add(0x0B3F);// ORIYA # (0xMn) ORIYA VOWEL SIGN I
contents.add(0x0B41, 0x0B43);// ORIYA # (0xMn) ORIYA VOWEL SIGN U, 0xORIYA VOWEL SIGN VOCALIC R
contents.add(0x0B4D);// ORIYA # (0xMn) ORIYA SIGN VIRAMA
contents.add(0x0B56);// ORIYA # (0xMn) ORIYA AI LENGTH MARK
contents.add(0x0B82);// TAMIL # (0xMn) TAMIL SIGN ANUSVARA
contents.add(0x0BC0);// TAMIL # (0xMn) TAMIL VOWEL SIGN II
contents.add(0x0BCD);// TAMIL # (0xMn) TAMIL SIGN VIRAMA
contents.add(0x0C3E, 0x0C40);// TELUGU # (0xMn) TELUGU VOWEL SIGN AA, 0xTELUGU VOWEL SIGN II
contents.add(0x0C46, 0x0C48);// TELUGU # (0xMn) TELUGU VOWEL SIGN E, 0xTELUGU VOWEL SIGN AI
contents.add(0x0C4A, 0x0C4D);// TELUGU # (0xMn) TELUGU VOWEL SIGN O, 0xTELUGU SIGN VIRAMA
contents.add(0x0C55, 0x0C56);// TELUGU # (0xMn) TELUGU LENGTH MARK, 0xTELUGU AI LENGTH MARK
contents.add(0x0CBF);// KANNADA # (0xMn) KANNADA VOWEL SIGN I
contents.add(0x0CC6);// KANNADA # (0xMn) KANNADA VOWEL SIGN E
contents.add(0x0CCC, 0x0CCD);// KANNADA # (0xMn) KANNADA VOWEL SIGN AU, 0xKANNADA SIGN VIRAMA
contents.add(0x0D41, 0x0D43);// MALAYALAM # (0xMn) MALAYALAM VOWEL SIGN U, 0xMALAYALAM VOWEL SIGN VOCALIC R
contents.add(0x0D4D);// MALAYALAM # (0xMn) MALAYALAM SIGN VIRAMA
contents.add(0x0DCA);// SINHALA # (0xMn) SINHALA SIGN AL-LAKUNA
contents.add(0x0DD2, 0x0DD4);// SINHALA # (0xMn) SINHALA VOWEL SIGN KETTI IS-PILLA, 0xSINHALA VOWEL SIGN KETTI PAA-PILLA
contents.add(0x0DD6);// SINHALA # (0xMn) SINHALA VOWEL SIGN DIGA PAA-PILLA
contents.add(0x0E31);// THAI # (0xMn) THAI CHARACTER MAI HAN-AKAT
contents.add(0x0E34, 0x0E3A);// THAI # (0xMn) THAI CHARACTER SARA I, 0xTHAI CHARACTER PHINTHU
contents.add(0x0E47, 0x0E4E);// THAI # (0xMn) THAI CHARACTER MAITAIKHU, 0xTHAI CHARACTER YAMAKKAN
contents.add(0x0EB1);// LAO # (0xMn) LAO VOWEL SIGN MAI KAN
contents.add(0x0EB4, 0x0EB9);// LAO # (0xMn) LAO VOWEL SIGN I, 0xLAO VOWEL SIGN UU
contents.add(0x0EBB, 0x0EBC);// LAO # (0xMn) LAO VOWEL SIGN MAI KON, 0xLAO SEMIVOWEL SIGN LO
contents.add(0x0EC8, 0x0ECD);// LAO # (0xMn) LAO TONE MAI EK, 0xLAO NIGGAHITA
contents.add(0x0F18, 0x0F19);// TIBETAN # (0xMn) TIBETAN ASTROLOGICAL SIGN -KHYUD PA, 0xTIBETAN ASTROLOGICAL SIGN SDONG TSHUGS
contents.add(0x0F35);// TIBETAN # (0xMn) TIBETAN MARK NGAS BZUNG NYI ZLA
contents.add(0x0F37);// TIBETAN # (0xMn) TIBETAN MARK NGAS BZUNG SGOR RTAGS
contents.add(0x0F39);// TIBETAN # (0xMn) TIBETAN MARK TSA -PHRU
contents.add(0x0F71, 0x0F7E);// TIBETAN # (0xMn) TIBETAN VOWEL SIGN AA, 0xTIBETAN SIGN RJES SU NGA RO
contents.add(0x0F80, 0x0F84);// TIBETAN # (0xMn) TIBETAN VOWEL SIGN REVERSED I, 0xTIBETAN MARK HALANTA
contents.add(0x0F86, 0x0F87);// TIBETAN # (0xMn) TIBETAN SIGN LCI RTAGS, 0xTIBETAN SIGN YANG RTAGS
contents.add(0x0F90, 0x0F97);// TIBETAN # (0xMn) TIBETAN SUBJOINED LETTER KA, 0xTIBETAN SUBJOINED LETTER JA
contents.add(0x0F99, 0x0FBC);// TIBETAN # (0xMn) TIBETAN SUBJOINED LETTER NYA, 0xTIBETAN SUBJOINED LETTER FIXED-FORM RA
contents.add(0x0FC6);// TIBETAN # (0xMn) TIBETAN SYMBOL PADMA GDAN
contents.add(0x102D, 0x1030);// MYANMAR # (0xMn) MYANMAR VOWEL SIGN I, 0xMYANMAR VOWEL SIGN UU
contents.add(0x1032);// MYANMAR # (0xMn) MYANMAR VOWEL SIGN AI
contents.add(0x1036, 0x1037);// MYANMAR # (0xMn) MYANMAR SIGN ANUSVARA, 0xMYANMAR SIGN DOT BELOW
contents.add(0x1039);// MYANMAR # (0xMn) MYANMAR SIGN VIRAMA
contents.add(0x1058, 0x1059);// MYANMAR # (0xMn) MYANMAR VOWEL SIGN VOCALIC L, 0xMYANMAR VOWEL SIGN VOCALIC LL
contents.add(0x17B7, 0x17BD);// KHMER # (0xMn) KHMER VOWEL SIGN I, 0xKHMER VOWEL SIGN UA
contents.add(0x17C6);// KHMER # (0xMn) KHMER SIGN NIKAHIT
contents.add(0x17C9, 0x17D3);// KHMER # (0xMn) KHMER SIGN MUUSIKATOAN, 0xKHMER SIGN BATHAMASAT
contents.add(0x18A9);// MONGOLIAN # (0xMn) MONGOLIAN LETTER ALI GALI DAGALGA
contents.add(0x1712, 0x1713);// TAGALOG # (0xMn) TAGALOG VOWEL SIGN I, 0xTAGALOG VOWEL SIGN U
contents.add(0x1732, 0x1733);// HANUNOO # (0xMn) HANUNOO VOWEL SIGN I, 0xHANUNOO VOWEL SIGN U
contents.add(0x1752, 0x1753);// BUHID # (0xMn) BUHID VOWEL SIGN I, 0xBUHID VOWEL SIGN U
contents.add(0x1772, 0x1773);// TAGBANWA # (0xMn) TAGBANWA VOWEL SIGN I, 0xTAGBANWA VOWEL SIGN U
//contents.add(0x1D165, 0x1D166);// COMMON # (0xMc) MUSICAL SYMBOL COMBINING STEM, 0xMUSICAL SYMBOL COMBINING SPRECHGESANG STEM
//contents.add(0x1D16D, 0x1D172);// COMMON # (0xMc) MUSICAL SYMBOL COMBINING AUGMENTATION DOT, 0xMUSICAL SYMBOL COMBINING FLAG-5
contents.add(0x0966, 0x096F);// DEVANAGARI # (0xNd) DEVANAGARI DIGIT ZERO, 0xDEVANAGARI DIGIT NINE
contents.add(0x09E6, 0x09EF);// BENGALI # (0xNd) BENGALI DIGIT ZERO, 0xBENGALI DIGIT NINE
contents.add(0x0A66, 0x0A6F);// GURMUKHI # (0xNd) GURMUKHI DIGIT ZERO, 0xGURMUKHI DIGIT NINE
contents.add(0x0AE6, 0x0AEF);// GUJARATI # (0xNd) GUJARATI DIGIT ZERO, 0xGUJARATI DIGIT NINE
contents.add(0x0B66, 0x0B6F);// ORIYA # (0xNd) ORIYA DIGIT ZERO, 0xORIYA DIGIT NINE
contents.add(0x0BE7, 0x0BEF);// TAMIL # (0xNd) TAMIL DIGIT ONE, 0xTAMIL DIGIT NINE
contents.add(0x0C66, 0x0C6F);// TELUGU # (0xNd) TELUGU DIGIT ZERO, 0xTELUGU DIGIT NINE
contents.add(0x0CE6, 0x0CEF);// KANNADA # (0xNd) KANNADA DIGIT ZERO, 0xKANNADA DIGIT NINE
contents.add(0x0D66, 0x0D6F);// MALAYALAM # (0xNd) MALAYALAM DIGIT ZERO, 0xMALAYALAM DIGIT NINE
contents.add(0x0E50, 0x0E59);// THAI # (0xNd) THAI DIGIT ZERO, 0xTHAI DIGIT NINE
contents.add(0x0ED0, 0x0ED9);// LAO # (0xNd) LAO DIGIT ZERO, 0xLAO DIGIT NINE
contents.add(0x0F20, 0x0F29);// TIBETAN # (0xNd) TIBETAN DIGIT ZERO, 0xTIBETAN DIGIT NINE
contents.add(0x1040, 0x1049);// MYANMAR # (0xNd) MYANMAR DIGIT ZERO, 0xMYANMAR DIGIT NINE
contents.add(0x1369, 0x1371);// ETHIOPIC # (0xNd) ETHIOPIC DIGIT ONE, 0xETHIOPIC DIGIT NINE
contents.add(0x17E0, 0x17E9);// KHMER # (0xNd) KHMER DIGIT ZERO, 0xKHMER DIGIT NINE
contents.add(0x1810, 0x1819);// MONGOLIAN # (0xNd) MONGOLIAN DIGIT ZERO, 0xMONGOLIAN DIGIT NINE
contents.add(0x16EE, 0x16F0);// RUNIC # (0xNl) RUNIC ARLAUG SYMBOL, 0xRUNIC BELGTHOR SYMBOL
contents.add(0x3007);// HAN # (0xNl) IDEOGRAPHIC NUMBER ZERO
contents.add(0x3021, 0x3029);// HAN # (0xNl) HANGZHOU NUMERAL ONE, 0xHANGZHOU NUMERAL NINE
contents.add(0x3038, 0x303A);// HAN # (0xNl) HANGZHOU NUMERAL TEN, 0xHANGZHOU NUMERAL THIRTY
contents.add(0x1034A);// GOTHIC # (0xNl) GOTHIC LETTER NINE HUNDRED
contents.add(0x0BF0, 0x0BF2);// TAMIL # (0xNo) TAMIL NUMBER TEN, 0xTAMIL NUMBER ONE THOUSAND
contents.add(0x0F2A, 0x0F33);// TIBETAN # (0xNo) TIBETAN DIGIT HALF ONE, 0xTIBETAN DIGIT HALF ZERO
contents.add(0x1372, 0x137C);// ETHIOPIC # (0xNo) ETHIOPIC NUMBER TEN, 0xETHIOPIC NUMBER TEN THOUSAND
contents.add(0x2E80, 0x2E99);// HAN # (0xSo) CJK RADICAL REPEAT, 0xCJK RADICAL RAP
contents.add(0x2E9B, 0x2EF3);// HAN # (0xSo) CJK RADICAL CHOKE, 0xCJK RADICAL C-SIMPLIFIED TURTLE
contents.add(0x2F00, 0x2FD5);// HAN # (0xSo) KANGXI RADICAL ONE, 0xKANGXI RADICAL FLUTE
contents.add(0xA490, 0xA4A1);// YI # (0xSo) YI RADICAL QOT, 0xYI RADICAL GA
contents.add(0xA4A4, 0xA4B3);// YI # (0xSo) YI RADICAL DDUR, 0xYI RADICAL JO
contents.add(0xA4B5, 0xA4C0);// YI # (0xSo) YI RADICAL JJY, 0xYI RADICAL SHAT
contents.add(0xA4C2, 0xA4C4);// YI # (0xSo) YI RADICAL SHOP, 0xYI RADICAL ZZIET
contents.add(0xA4C6);// YI # (0xSo) YI RADICAL KE
return contents;
}
}

View File

@ -1,25 +0,0 @@
package com.ibm.text.UCD;
import com.ibm.icu.dev.test.util.UnicodeProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UnicodeSet;
import java.util.List;
/**
 * Prints a tab-separated table to standard output: one row per script that has
 * at least one character, holding the script name followed by, for each listed
 * Unicode version, the size of the intersection between the script's characters
 * and the set selected by the "age" property alias for that version.
 */
public class ScriptTimeline {
    /**
     * Builds and prints the table.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String[] versions = { "2.0.0", "2.1.2", "3.0.0", "3.1.0", "3.2.0", "4.0.0", "4.1.0", "5.0.0" };

        // The age sets depend only on the version, never on the script, so
        // resolve each one once instead of once per (script, version) pair.
        UnicodeSet[] ages = new UnicodeSet[versions.length];
        for (int v = 0; v < versions.length; ++v) {
            ages[v] = new UnicodeSet().applyPropertyAlias("age", versions[v]);
        }

        for (int s = 0; s < UScript.CODE_LIMIT; ++s) {
            String scriptName = UScript.getName(s);
            UnicodeSet chars = new UnicodeSet().applyPropertyAlias("script", scriptName);
            if (chars.size() == 0) continue; // no characters in this script: no row
            System.out.print(scriptName);
            for (int v = 0; v < ages.length; ++v) {
                // Intersect on a copy so neither chars nor the cached age set is modified.
                System.out.print("\t" + new UnicodeSet(chars).retainAll(ages[v]).size());
            }
            System.out.println();
        }
    }
}

View File

@ -1,75 +0,0 @@
# ================================================================================
# Conditional mappings
# ================================================================================
# Special case for final form of sigma
03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
# Note: the following cases for non-final are already in the UnicodeData file.
# 03A3; 03C3; 03A3; 03A3; # GREEK CAPITAL LETTER SIGMA
# 03C3; 03C3; 03A3; 03A3; # GREEK SMALL LETTER SIGMA
# 03C2; 03C2; 03A3; 03A3; # GREEK SMALL LETTER FINAL SIGMA
# Note: the following cases are not included, since they would case-fold in lowercasing
# 03C3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK SMALL LETTER SIGMA
# 03C2; 03C3; 03A3; 03A3; Not_Final_Sigma; # GREEK SMALL LETTER FINAL SIGMA
# ================================================================================
# Locale-sensitive mappings
# ================================================================================
# Lithuanian
# Lithuanian retains the dot in a lowercase i when followed by accents.
# Remove DOT ABOVE after "i" with upper or titlecase
0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
# Introduce an explicit dot above when lowercasing capital I's and J's
# whenever there are more accents above.
# (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
# ================================================================================
# Turkish and Azeri
# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
# The following rules handle those cases.
0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
0130; 0069; 0130; 0130; az; # LATIN CAPITAL LETTER I WITH DOT ABOVE
# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
# This matches the behavior of the canonically equivalent I-dot_above
0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
# When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
# When uppercasing, i turns into a dotted capital I
0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
# Note: the following case is already in the UnicodeData file.
# 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I
# EOF

View File

@ -1,46 +0,0 @@
#
# Special Casing Properties
#
# This file is a supplement to the UnicodeData file.
# It contains additional information about the casing of Unicode characters.
# (For compatibility, the UnicodeData.txt file only contains case mappings for
# characters where they are 1-1, and does not have locale-specific mappings.)
# For more information, see the discussion of Case Mappings in the Unicode Standard.
#
# All code points not listed in this file that do not have simple case mappings
# in UnicodeData.txt map to themselves.
# ================================================================================
# Format
# ================================================================================
# The entries in this file are in the following machine-readable format:
#
# <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)? # <comment>
#
# <code>, <lower>, <title>, and <upper> provide character values in hex. If there is more
# than one character, they are separated by spaces. Other than as used to separate
# elements, spaces are to be ignored.
#
# The <condition_list> is optional. Where present, it consists of one or more locale IDs
# or contexts, separated by spaces. In these conditions:
# - A condition list overrides the normal behavior if all of the listed conditions are true.
# - The context is always the context of the characters in the original string,
# NOT in the resulting string.
# - Case distinctions in the condition list are not significant.
# - Conditions preceded by "Not_" represent the negation of the condition.
#
# A locale ID is defined by taking any language tag as defined by
# RFC 3066 (or its successor), and replacing '-' by '_'.
#
# A context for a character C is defined by Section 3.13 Default Case
# Operations, of The Unicode Standard, Version 5.0.
# (This is identical to the context defined by Unicode 4.1.0,
# as specified in http://www.unicode.org/versions/Unicode4.1.0/)
#
# Parsers of this file must be prepared to deal with future additions to this format:
# * Additional contexts
# * Additional fields
# ================================================================================
# ================================================================================
# Unconditional mappings
# ================================================================================

View File

@ -1,13 +0,0 @@
# IMPORTANT-when capitalizing iota-subscript (0345)
# It MUST be in normalized form--moved to the end of any sequence of combining marks.
# This is because logically it represents a following base character!
# E.g. <iota_subscript> (<Mn> | <Mc> | <Me>)+ => (<Mn> | <Mc> | <Me>)+ <iota_subscript>
# It should never be the first character in a word, so in titlecasing it can be left as is.
# The following cases are already in the UnicodeData file, so are only commented here.
# 0345; 0345; 0345; 0399; # COMBINING GREEK YPOGEGRAMMENI
# All letters with YPOGEGRAMMENI (iota-subscript) or PROSGEGRAMMENI (iota adscript)
# have special uppercases.
# Note: characters with PROSGEGRAMMENI are actually titlecase, not uppercase!

View File

@ -1,108 +0,0 @@
<!doctype HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta http-equiv="Content-Language" content="en-us">
<meta name="GENERATOR" content="Microsoft FrontPage 5.0">
<meta name="ProgId" content="FrontPage.Editor.Document">
<meta name="keywords" content="unicode, variant glyphs">
<meta name="description" content="Describes and displays standardized variant glyphs">
<title>Standardized Variants</title>
<link rel="stylesheet" type="text/css" href="http://www.unicode.org/reports/reports.css">
</head>
<body bgcolor="#ffffff">
<table class="header">
<tr>
<td class="icon"><a href="http://www.unicode.org">
<img align="middle" alt="[Unicode]" border="0" src="http://www.unicode.org/webscripts/logo60s2.gif" width="34" height="33"></a>&nbsp;&nbsp;<a class="bar" href="http://www.unicode.org/ucd">Unicode
Character Database</a></td>
</tr>
<tr>
<td class="gray">&nbsp;</td>
</tr>
</table>
<blockquote>
<h1>Standardized Variants</h1>
<table class="wide">
<tr>
<td valign="top" width="144">Revision</td>
<td valign="top">@revision@</td>
</tr>
<tr>
<td valign="top" width="144">Authors</td>
<td valign="top">Members of the Editorial Committee</td>
</tr>
<tr>
<td valign="top" width="144">Date</td>
<td valign="top">@date@</td>
</tr>
<tr>
<td valign="top" width="144">This Version</td>
<td valign="top"><a href="http://www.unicode.org/Public/@updateDirectory@/@filename@.html">
http://www.unicode.org/Public/@updateDirectory@/@filename@.html</a></td>
</tr>
<tr>
<td valign="top" width="144">Previous Version</td>
<td valign="top"><a href="http://www.unicode.org/Public/4.1.0/ucd/StandardizedVariants.html">
http://www.unicode.org/Public/4.1.0/ucd/StandardizedVariants.html</a></td>
</tr>
<tr>
<td valign="top" width="144">Latest Version</td>
<td valign="top"><a href="http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html">
http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html</a></td>
</tr>
</table>
<h3><br>
<i>Summary</i></h3>
<blockquote>
<p>This file provides a visual display of the standard variant sequences derived from
StandardizedVariants.txt.</p>
</blockquote>
<h3><i>Status</i></h3>
<blockquote>
<p><i>This file and the files described herein are part of the Unicode Character Database and
are governed by the terms of use at <a href="http://www.unicode.org/terms_of_use.html">
http://www.unicode.org/terms_of_use.html</a>.</i></p>
</blockquote>
<hr width="50%">
<h2>Introduction</h2>
<p>The tables here <i>exhaustively</i> list the valid, registered combinations of base character
plus variation indicator. All combinations not listed in StandardizedVariants.txt are unspecified
and are reserved for future standardization; no conformant process may interpret them as
standardized variants. Variation selectors and their use are described in The Unicode Standard.</p>
<p>These mathematical variants are all produced with the addition of Variation Selector 1 (VS1 or
U+FE00) to mathematical operator base characters. There is no variation according to context. The
Mongolian variants use the Mongolian Variant Selectors, and may vary according to context. That
is, if a contextual shape is not listed below, then the variation sequence has an unmodified
appearance. At this time no Han variants exist.</p>
<blockquote>
<p><a name="fonts"><b>Note: </b></a>The glyphs used to show the variations are often derived
from different physical fonts than the representative glyphs in the standard. They may therefore
exhibit minor differences in size, proportion, or weight <i>unrelated</i> to the intentional
difference in feature that is the defining element of the variation. Such minor differences
should be ignored. Likewise, in some cases the existing representative fonts may not yet contain
newly encoded characters and hence some representative glyphs shown in these tables may have a
slightly different style than others.</p>
</blockquote>
<p>@table@</p>
<hr width="50%">
<div align="center">
<center>
<table cellspacing="0" cellpadding="0" border="0">
<tr>
<td><a href="http://www.unicode.org/unicode/copyright.html">
<img src="http://www.unicode.org/img/hb_notice.gif" border="0" alt="Access to Copyright and terms of use" width="216" height="50"></a></td>
</tr>
</table>
<script language="Javascript" type="text/javascript" src="http://www.unicode.org/webscripts/lastModified.js">
</script>
</center>
</div>
</blockquote>
</body>
</html>

View File

@ -1,566 +0,0 @@
package com.ibm.text.UCD;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.lang.UCharacter;
import com.ibm.text.utility.*;
import java.util.*;
import java.io.*;
// Enumerated properties will be IntCodePointProperty.
// The string values they return will be the property value names.
// Binary properties are Enumerated properties. They return 0 or 1
public final class TernaryStore {
static final int DONE = Integer.MIN_VALUE;
static final int NOT_FOUND = Integer.MIN_VALUE+1;
// for testing
static DepthPrinter dp;
/**
 * Test driver. Writes diagnostic output to "TestTernary.txt" and runs three
 * round-trip tests through {@link #test(String, String[], int)}: a small
 * hand-written word list, the character names returned by
 * {@code UCharacter.getName} (skipping code points for which
 * {@code hasComputableName} is true), and the text after the first tab on each
 * data line of the DiploFreq.txt dictionary file.
 *
 * @throws java.io.IOException if one of the files cannot be opened or read
 */
static void test() throws java.io.IOException {
    PrintWriter pw = Utility.openPrintWriter("TestTernary.txt", Utility.LATIN1_WINDOWS);
    BufferedReader br = null;
    try {
        dp = new DepthPrinter(pw);
        String[] tests = {"the", "quick", "fish", "fisherman", "fishes",
                "brown", "brow", "bracket", "bright", "brat",
                "brough", "dogs", "upper", "zebra",
                "fisher"};
        test("Simple: ", tests, tests.length);

        tests = new String[300000];
        int counter = 0;
        int i;
        for (i = 0; counter < tests.length && i <= 0x10FFFF; ++i) {
            if (Default.ucd().hasComputableName(i)) continue;
            String temp = UCharacter.getName(i);
            if (temp != null) tests[counter++] = temp.trim();
        }
        System.out.println("max-cp: " + Utility.hex(i));
        test("Unicode Names: ", tests, counter);
        //if (true) return;

        br = Utility.openReadFile(UCD_Types.BASE_DIR + "dict\\DiploFreq.txt", Utility.LATIN1);
        String line;
        counter = 0;
        while (counter < tests.length) {
            line = Utility.readDataLine(br);
            if (line == null) break;
            if (line.length() == 0) continue;
            Utility.dot(counter);
            int tabPos = line.indexOf('\t');
            if (tabPos < 0) {
                // A line without a tab has no usable field; report it and move on.
                System.out.println("???" + line);
                continue;
            }
            tests[counter++] = line.substring(tabPos+1);
        }
        test("French: ", tests, counter);
    } finally {
        // Release the dictionary reader (previously leaked) and always close
        // the writer, even if closing the reader fails.
        try {
            if (br != null) br.close();
        } finally {
            pw.close();
        }
    }
}
/**
 * Builds a store from the first {@code len} entries of {@code tests}, using each
 * entry's array index as its result value, dumps the store through the shared
 * depth printer, and reports every entry whose lookup does not give back its index.
 *
 * @param title heading printed to standard output and to the depth printer
 * @param tests strings to store
 * @param len   number of leading entries of {@code tests} to use
 */
static void test(String title, String[] tests, int len) {
    System.out.println();
    System.out.println(title);
    dp.println();
    dp.print(title, 0);
    dp.println();

    TernaryStore.Builder builder = new TernaryStore.Builder();
    int totalChars = 0;
    int index = 0;
    while (index < len) {
        String word = tests[index];
        builder.add(word, index);
        totalChars += word.length();
        ++index;
    }
    System.out.println("charCount: " + totalChars);

    TernaryStore built = builder.build();
    built.showNodes();
    built.checkNodes();
    dp.println("Storage");
    String storage = built.stringStore.toString();
    dp.println(storage);
    System.out.println("StorageSize: " + storage.length());

    Matcher matcher = built.getMatcher();
    for (int expected = 0; expected < len; ++expected) {
        int actual = test(tests[expected], matcher);
        if (actual == expected) continue;
        System.out.println("\tFail, result: " + tests[expected] + ", " + actual);
    }
}
/**
 * Runs the matcher over {@code s} from offset 0 and returns the last result it
 * produced before reporting {@code DONE}.
 *
 * @param s       string to match
 * @param matcher matcher to reset and drive
 * @return the last result produced, or -1 if the matcher produced none
 */
static int test(String s, Matcher matcher) {
    matcher.reset(s, 0);
    int last = -1;
    int current;
    while ((current = matcher.next()) != DONE) {
        last = current;
    }
    return last;
}
/**
 * One node of the ternary tree. A node carries a string, held either directly
 * in {@code tempString} or as a code into a {@code StringStore}, plus three
 * links and an optional result value.
 */
static final class Node {
    /** String held directly; used while {@code stringCode} is negative. */
    String tempString;
    /** Code of the string in the string store; negative means "use tempString". */
    int stringCode = -1;
    /** Followed when the character being looked up is smaller than this node's first character. */
    Node less;
    /** Followed when the character being looked up is larger than this node's first character. */
    Node greater;
    /** Followed after this node's string has been matched. */
    Node next;
    /** Value associated with the string ending at this node, or NOT_FOUND. */
    int result = NOT_FOUND;

    /**
     * Returns this node's string.
     *
     * @param stringStore store consulted when {@code stringCode} is non-negative
     */
    String getString(StringStore stringStore) {
        if (stringCode < 0) return tempString;
        return stringStore.get(stringCode);
    }

    /** Sets the directly held string. */
    void setString(String s) {
        tempString = s;
    }

    /**
     * Renders this node and the chain reachable through {@code next}: each
     * node's string, followed by "(result)" when the node has a result.
     *
     * @param store store used to resolve string codes
     */
    public String toString(StringStore store) {
        // Recurse with the store: the no-argument toString() is Object's
        // identity string, not this rendering.
        return getString(store)
            + (result != NOT_FOUND ? "(" + result + ")" : "")
            + (next != null ? next.toString(store) : "");
    }
}
Node base;
StringStore stringStore = new StringStore();
/**
 * Walks a {@code TernaryStore} along an input string, reporting the result of
 * every stored string encountered on the way.
 */
final static class Matcher {
    /** Store being searched; assigned by {@code getMatcher()}. */
    TernaryStore store;
    /** Input string. */
    String s;
    /** Offset in {@code s} of the next character to consume. */
    int position;
    /** Node at which the next call to {@link #next()} resumes. */
    Node lastNode;

    /**
     * Starts a new match.
     *
     * @param s        input string
     * @param position offset in {@code s} at which matching starts
     */
    void reset(String s, int position) {
        this.s = s;
        this.position = position;
        this.lastNode = store.base;
    }

    /**
     * Returns the next result, or DONE when done.
     * Sets position to point after the end of the found string.
     */
    int next() {
        while (lastNode != null && position < s.length()) {
            char ch = s.charAt(position++);
            do {
                String nodeString = lastNode.getString(store.stringStore);
                char first = nodeString.charAt(0);
                if (ch == first) {
                    // now check the rest of the string
                    for (int i = 1; i < nodeString.length(); ++i) {
                        // The input ended inside a multi-character node, so it
                        // cannot match; report DONE instead of reading past the end.
                        if (position >= s.length()) {
                            return DONE;
                        }
                        char other = nodeString.charAt(i);
                        if (other != s.charAt(position++)) {
                            return DONE;
                        }
                    }
                    // if we succeed, return result if there is one
                    int result = lastNode.result;
                    lastNode = lastNode.next;
                    if (result != NOT_FOUND) return result;
                    break; // get next char
                }
                // otherwise branch sideways, keeping same char
                if (ch > first) {
                    lastNode = lastNode.greater;
                } else {
                    lastNode = lastNode.less;
                }
            } while (lastNode != null);
        }
        return DONE;
    }
}
/**
 * Creates a matcher bound to this store. The matcher must be reset with an
 * input string before use.
 */
public Matcher getMatcher() {
    Matcher matcher = new Matcher();
    matcher.store = this;
    return matcher;
}
/** Dumps the whole tree to the shared depth printer, starting at the root with depth 5. */
public void showNodes() {
    final int rootDepth = 5;
    showNodes2(base, "", rootDepth);
}
/**
 * Prints the subtree rooted at {@code n}: first the "less" branch, then the
 * node itself (its string, plus "/result" when it has one), then the "next"
 * branch at a depth increased by the length of the node's string, and last the
 * "greater" branch.
 *
 * @param n     subtree root; must not be null
 * @param path  branch trail ("-" less, "." next, "+" greater); only extended
 *              here, its printing is disabled
 * @param depth column at which the node's string is printed
 */
public void showNodes2(Node n, String path, int depth) {
    Node smaller = n.less;
    if (smaller != null) {
        showNodes2(smaller, path + "-", depth);
    }

    dp.print("", depth);
    if (false) dp.print(path);
    dp.print(n.getString(stringStore));
    int value = n.result;
    if (value != NOT_FOUND) {
        dp.print("/" + value);
    }
    dp.println();

    Node follower = n.next;
    if (follower != null) {
        String extended = path + ".";
        // The string is fetched again rather than reused, so the store's
        // own diagnostic output stays the same.
        int childDepth = depth + n.getString(stringStore).length();
        showNodes2(follower, extended, childDepth);
    }

    Node larger = n.greater;
    if (larger != null) {
        showNodes2(larger, path + "+", depth);
    }
}
// Tallies collected by checkNodes(Node, NodeInfo) while it walks the tree.
static class NodeInfo {
// number of nodes visited
int nodeCount;
// nodes whose result is not NOT_FOUND
int resultCount;
// nodes with no "less" link
int nullLessCount;
// nodes with no "greater" link
int nullGreaterCount;
// nodes with neither a "less" nor a "greater" link and no result
int nullSimpleCount;
// nodes with no "next" link
int nullNextCount;
}
/** Walks the whole tree from the root and prints the collected node statistics to standard output. */
public void checkNodes() {
    NodeInfo totals = new NodeInfo();
    checkNodes(base, totals);

    System.out.println("Nodes: " + totals.nodeCount);
    System.out.println("nullLessCount: " + totals.nullLessCount);
    System.out.println("nullGreaterCount: " + totals.nullGreaterCount);
    System.out.println("nullNextCount: " + totals.nullNextCount);
    System.out.println("resultCount: " + totals.resultCount);
    System.out.println("nullSimpleCount: " + totals.nullSimpleCount);
}
/**
 * Adds the statistics of the subtree rooted at {@code n} to {@code nodeInfo},
 * visiting the less, next and greater branches in that order.
 *
 * @param n        subtree root; must not be null
 * @param nodeInfo accumulator updated in place
 */
public void checkNodes(Node n, NodeInfo nodeInfo) {
    nodeInfo.nodeCount++;
    boolean hasResult = n.result != NOT_FOUND;
    if (hasResult) {
        nodeInfo.resultCount++;
    }

    if (n.less == null) {
        nodeInfo.nullLessCount++;
        // "simple" node: no sideways links and no result of its own
        if (n.greater == null && !hasResult) {
            nodeInfo.nullSimpleCount++;
        }
    } else {
        checkNodes(n.less, nodeInfo);
    }

    if (n.next == null) {
        nodeInfo.nullNextCount++;
    } else {
        checkNodes(n.next, nodeInfo);
    }

    if (n.greater == null) {
        nodeInfo.nullGreaterCount++;
    } else {
        checkNodes(n.greater, nodeInfo);
    }
}
/**
 * Wraps a {@code PrintWriter} and tracks the current column, so that text can
 * be placed at a requested depth with leader characters filling the gap.
 */
final static class DepthPrinter {
    private PrintWriter pw;
    /** Column reached on the current line. */
    private int currentDepth = 0;
    /** Filler repeated in front of text that starts beyond the current column. */
    private String leader = ".";

    DepthPrinter(PrintWriter pw) {
        this.pw = pw;
    }

    /** Prints a character at the current column. */
    void print(char ch) {
        print(String.valueOf(ch), 0);
    }

    /** Prints a string at the current column. */
    void print(String s) {
        print(s, 0);
    }

    /** Prints a character, padded toward {@code depth}. */
    void print(char ch, int depth) {
        print(String.valueOf(ch), depth);
    }

    /**
     * Prints {@code s}. When {@code depth} lies beyond the current column, the
     * leader is first repeated (gap - 1) times and the column is set to
     * {@code depth}.
     */
    void print(String s, int depth) {
        int gap = depth - currentDepth;
        if (gap > 0) {
            String filler = Utility.repeat(leader, gap - 1);
            pw.print(filler);
            currentDepth = depth;
        }
        pw.print(s);
        currentDepth += s.length();
    }

    /** Ends the line and resets the column. */
    void println() {
        pw.println();
        currentDepth = 0;
    }

    /** Prints {@code s} without padding, ends the line and resets the column. */
    void println(String s) {
        pw.println(s);
        currentDepth = 0;
    }
}
/**
 * Stores strings in one shared character buffer and hands out integer codes for
 * them. A string without spaces is stored as its text followed by TERMINATOR,
 * and its code is the buffer offset. A string with spaces is stored as a
 * sequence of piece codes (one char per piece) and its code has bit 0x8000 set.
 *
 * NOTE(review): a plain offset of 0x8000 or more is indistinguishable from a
 * composite code in get(int), and piece codes are narrowed to char; this looks
 * like it limits the usable buffer size. Confirm before relying on large stores.
 */
final static class StringStore {
    // initially, there is a simple strategy
    private String buffer = "";
    private static final char TERMINATOR = '\u007E';
    private static final int PIECE_LENGTH = 5;
    // Scratch array for Utility.split. Per instance (was static), so stores
    // do not share mutable state.
    private String[] pieces = new String[50]; // HACK
    // Strings queued by add() until compact(). Per instance (was static), so
    // the strings of one store no longer leak into every store built later.
    private Set strings = new HashSet();

    /** Queues a string; it is written to the buffer by {@link #compact()}. */
    public void add(String s) {
        strings.add(s);
    }

    /**
     * Writes all queued strings to the buffer: first every space-separated
     * piece of every queued string, then the queued strings themselves.
     */
    public void compact() {
        System.out.println("Adding Pieces");
        // add all the pieces
        Iterator it = strings.iterator();
        Set additions = new HashSet();
        while (it.hasNext()) {
            String s = (String)it.next();
            int len = Utility.split(s, ' ', pieces);
            for (int i = 0; i < len; ++i) {
                additions.add(pieces[i]);
            }
        }
        store(additions);
        store(strings);
    }

    /** Stores every string of {@code stuff}, ordered by Pair(-length, string). */
    private void store(Set stuff) {
        System.out.println("Sorting");
        // sort them by length, longest first
        Set ordered = new TreeSet();
        Iterator it = stuff.iterator();
        while (it.hasNext()) {
            String s = (String)it.next();
            ordered.add(new Pair(new Integer(-s.length()), s));
        }
        System.out.println("Storing");
        // add them
        it = ordered.iterator();
        while (it.hasNext()) {
            String s = (String)(((Pair)it.next()).second);
            get(s);
        }
    }

    /**
     * Returns the code for {@code s}, adding it to the buffer if necessary.
     * Strings containing a space get a composite code (bit 0x8000 set).
     */
    private int get(String s) {
        System.out.println("Adding: \'" + s + "\'");
        int index;
        if (s.indexOf(' ') < 0) {
            index = addNoSplit(s);
            System.out.println("\tReturning: " + index);
            return index;
        }
        int len = Utility.split(s, ' ', pieces);
        StringBuffer itemCodes = new StringBuffer();
        for (int i = 0; i < len; ++i) {
            String piece = pieces[i];
            itemCodes.append((char)addNoSplit(piece));
            /*for (int j = 0; j < piece.length(); j += PIECE_LENGTH) {
                int maxLen = j + PIECE_LENGTH;
                if (maxLen > piece.length()) maxLen = piece.length();
                itemCodes.append((char)addNoSplit(piece.substring(j, maxLen)));
            }*/
        }
        index = 0x8000 | addNoSplit(itemCodes.toString()); // mark it as composite
        System.out.println("\tReturning: " + index);
        return index;
    }

    /**
     * Returns the buffer offset of {@code s} followed by TERMINATOR, appending
     * that sequence to the buffer when it does not occur there yet.
     */
    private int addNoSplit(String s) {
        System.out.println("\tAdding2: \'" + s + "\'");
        String sTerm = s + TERMINATOR;
        int index = buffer.indexOf(sTerm);
        if (index >= 0) return index;
        index = buffer.length();
        buffer += sTerm;
        System.out.println("\t\tReturning2: " + index);
        return index;
    }

    /**
     * Returns the string for a code: the text up to the next TERMINATOR for a
     * plain code, or the space-joined pieces for a composite code.
     */
    public String get(int index) {
        String result;
        System.out.println("Fetching: " + index);
        if ((index & 0x8000) == 0) {
            int end = buffer.indexOf(TERMINATOR, index);
            result = buffer.substring(index, end);
            System.out.println("\tReturning: '" + result + "'");
            return result;
        }
        index &= ~0x8000; // remove 1 bit
        int end = buffer.indexOf(TERMINATOR, index);
        StringBuffer joined = new StringBuffer();
        for (int i = index; i < end; ++i) {
            if (joined.length() != 0) joined.append(" ");
            // each char of a composite entry is the code of one piece
            joined.append(get(buffer.charAt(i)));
        }
        result = joined.toString();
        System.out.println("\tReturning: '" + result + "'");
        return result;
    }

    /** Returns the raw buffer contents. */
    public String toString() {
        return buffer;
    }
}
// Collects (name, result) pairs and turns them into a TernaryStore:
// add() gathers the pairs, build() creates the node tree, merges node chains,
// and moves the node strings into the store's StringStore.
final static class Builder {
// name -> Integer result; the TreeMap keeps the names sorted
Map map = new TreeMap();
// sorted names, valid only while build() is running
String[] names;
// store under construction, valid only while build() is running
TernaryStore store;
// only referenced by the commented-out compactStrings below
Set set = new TreeSet();
// Registers a name with its result; a repeated name replaces the earlier result.
public void add(String name, int result) {
map.put(name, new Integer(result));
}
// Builds and returns the store for everything added so far.
// The map is cleared in the process.
public TernaryStore build() {
// flatten strings into array
names = new String[map.size()];
Iterator it = map.keySet().iterator();
int count = 0;
while (it.hasNext()) {
names[count++] = (String) it.next();
if (false) {
dp.print((count-1) + " " + names[count-1]);
dp.println();
}
}
// build nodes
store = new TernaryStore();
addNode(0, names.length);
// free storage
names = null;
map.clear();
System.out.println("compacting");
// merge node chains and queue every node string in the string store,
// then write the queued strings, then swap node strings for their codes
compactStore(store.base);
store.stringStore.compact();
//compactStrings(store);
//set.clear(); // free more storage
replaceStrings(store.base);
//map.clear(); // free storage
// free storage
TernaryStore result = store;
store = null;
return result;
}
/*
void compactStrings(TernaryStore t) {
// we have a set of Pairs, first is length, second is string
// compact them, word by word
Iterator it = set.iterator();
while (it.hasNext()) {
String string = ((String)((Pair)it.next()).second);
int index = t.stringStore.add(string);
if (true) {
System.out.println("Checking: " + index);
String reverse = t.stringStore.get(index);
if (!reverse.equals(string)) {
System.out.println("source: \'" + string + "\'");
System.out.println("reverse: \'" + reverse + "\'");
throw new IllegalArgumentException("Failed roundtrip");
}
}
map.put(string, new Integer(index));
}
}
*/
// Replaces the directly held string of n, and of every node reachable from it,
// with the code the string store returns for that string.
public void replaceStrings(Node n) {
n.stringCode = store.stringStore.get(n.getString(store.stringStore));
n.setString(null);
if (n.less != null) replaceStrings(n.less);
if (n.next != null) replaceStrings(n.next);
if (n.greater != null) replaceStrings(n.greater);
}
// Merges n with its successors for as long as n has no result and the successor
// has no sideways links, then queues n's string in the string store and
// recurses into the three links.
public void compactStore(Node n) {
Node nextNode = n.next;
if (false) dp.println(n.toString());
while (n.result == NOT_FOUND && nextNode != null && nextNode.greater == null
&& nextNode.less == null) {
n.setString(n.getString(store.stringStore) + nextNode.getString(store.stringStore));
n.result = nextNode.result;
n.next = nextNode = nextNode.next; // remove old node
}
// add strings sorted by length, longest first
store.stringStore.add(n.getString(store.stringStore));
if (n.less != null) compactStore(n.less);
if (n.next != null) compactStore(n.next);
if (n.greater != null) compactStore(n.greater);
}
// Inserts names[start..limit): the middle name first, then both halves
// recursively (presumably to keep the less/greater branches balanced).
private void addNode(int start, int limit) {
if (start >= limit) return;
int mid = (start + limit) / 2;
//System.out.println("start: " + start + ", mid: " + mid + ", limit: " + limit);
//System.out.println("adding: " + names[mid]);
addNode(names[mid], ((Integer)map.get(names[mid])).intValue());
addNode(start, mid);
addNode(mid+1, limit);
}
// Inserts one string with its result. Walks the existing tree one character
// at a time and hangs the unmatched remainder off the first missing link.
private void addNode(String s, int result) {
if (store.base == null) {
store.base = addRest(s, 0, result);
return;
}
Node n = store.base;
// node that matched the most recently consumed character
Node lastNode = n;
for (int i = 0; i < s.length(); ++i) {
char ch = s.charAt(i);
while (true) {
char first = n.getString(store.stringStore).charAt(0);
if (ch == first) {
if (n.next == null) {
n.next = addRest(s, i+1, result);
return;
}
lastNode = n;
n = n.next;
break; // get next char
}
// otherwise branch sideways, keeping same char
if (ch > first) {
if (n.greater == null) {
n.greater = addRest(s, i, result);
return;
}
n = n.greater;
} else {
if (n.less == null) {
n.less = addRest(s, i, result);
return;
}
n = n.less;
}
}
}
// every character matched an existing node: s ends at lastNode
lastNode.result = result;
}
// Creates a chain of single-character nodes for s[position..], linked through
// next, with the result on the last node. Returns the head of the chain, or
// null when position is at or past the end of s.
private Node addRest(String s, int position, int result) {
Node lastNode = null;
// built back to front, so the first node created is the end of the chain
for (int i = s.length() - 1; i >= position; --i) {
Node n = new Node();
n.setString(s.substring(i, i+1)); // + "" to force a new string
if (lastNode == null) {
n.result = result;
}
n.next = lastNode;
lastNode = n;
}
return lastNode;
}
}
}

File diff suppressed because it is too large Load Diff

View File

@ -1,378 +0,0 @@
package com.ibm.text.UCD;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.dev.test.util.XEquivalenceClass;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.text.utility.Utility;
import com.ibm.icu.lang.UCharacter;
public class TestIdentifiers {
/**
 * Loads the identifier data and prints the size of the "output" character set
 * and the non-starting character set. The confusable checks over the sample
 * strings that follow are currently disabled by the unconditional return.
 *
 * @param args ignored
 * @throws IOException if a data file cannot be read
 */
public static void main(String[] args) throws IOException {
    String[] tests = { "SØS", "façade", "MOPE", "VOP", "scope", "ibm", "vop",
            "toys-я-us", "1iνе", "back", "boгing" };
    TestIdentifiers ti = new TestIdentifiers("L");
    TestIdentifiers tiany = new TestIdentifiers("A");
    ti.loadIdentifiers();
    UnicodeSet idnCharSet = ti.idnChars.getSet("output", new UnicodeSet());
    System.out.println("idnCharSet: " + idnCharSet.size());
    UnicodeSet idnCharNonStarting = ti.nonstarting;
    // Print the set the label names; idnCharSet used to be printed here by mistake.
    System.out.println("idnCharNonStarting: " + idnCharNonStarting);
    if (true) return;
    for (int i = 0; i < tests.length; ++i) {
        System.out.print(tests[i]);
        String folded = UCharacter.foldCase(tests[i], true);
        if (folded.equals(tests[i])) {
            ti.testItem(tests[i]);
        } else {
            // Not already case-folded: test the original with the "A" tables
            // and the folded form with the "L" tables.
            System.out.print("\t");
            tiany.testItem(tests[i]);
            System.out.print(folded);
            ti.testItem(folded);
        }
        for (int j = 0; j < tests[i].length(); ++j) {
            int cp = tests[i].charAt(j);
            Set s = ti.getConfusables(cp, "MA");
            System.out.println(Default.ucd().getCodeAndName(cp));
            for (Iterator it = s.iterator(); it.hasNext();) {
                System.out.println("\t= " + Default.ucd().getCodeAndName((String)it.next()));
            }
        }
    }
}
/**
 * Normalizes {@code test} with {@code Normalizer.DECOMP_COMPAT} and prints,
 * after the case type, whether it has whole-script confusables (listing the
 * script names), a mixed-script confusable, or neither.
 *
 * @param test string to check
 */
void testItem(String test) {
    String normalized = Normalizer.normalize(test, Normalizer.DECOMP_COMPAT);
    BitSet scripts = new BitSet();
    System.out.print("\t" + caseType + "\t");

    boolean clean = true;
    if (hasWholeScriptConfusable(normalized, scripts)) {
        System.out.print("whole-script confusables: ");
        // visit the set bits in ascending order; each bit index is a script code
        for (int code = scripts.nextSetBit(0); code >= 0; code = scripts.nextSetBit(code + 1)) {
            System.out.print(UScript.getName(code) + " ");
        }
        System.out.println();
        clean = false;
    }
    if (hasMixedScriptConfusable(normalized)) {
        System.out.println("mixed-script confusable");
        clean = false;
    }
    if (clean) {
        System.out.println("no confusable");
    }
}
private static final String indir = "C:\\Unicode-CVS2\\draft\\reports\\tr36\\data\\";
private static UnicodeSet commonAndInherited = new UnicodeSet(
"[[:script=common:][:script=inherited:]]");
private static UnicodeSet XIDContinueSet = new UnicodeSet("[:xidcontinue:]")
.add('-');
private static final boolean DEBUG = false;
private String caseType;
/**
 * Creates a tester and loads the whole-script confusable data filtered by the
 * given case type.
 *
 * @param caseType filter passed to {@code loadWholeScriptConfusables}; also
 *                 printed by {@code testItem}
 * @throws IOException if the data file cannot be read
 */
TestIdentifiers(String caseType) throws IOException {
    this.caseType = caseType;
    loadWholeScriptConfusables(this.caseType);
}
/** Mutable pair of a character set and a script code, with chainable setters. */
private static class UnicodeSetToScript {
    private UnicodeSet set;
    private int script;

    /** Returns the character set. */
    public UnicodeSet getSet() {
        return set;
    }

    /** Stores the character set and returns this object for chaining. */
    public UnicodeSetToScript setSet(UnicodeSet set) {
        this.set = set;
        return this;
    }

    /** Returns the script code. */
    public int getScript() {
        return script;
    }

    /** Stores the script code and returns this object for chaining. */
    public UnicodeSetToScript setScript(int script) {
        this.script = script;
        return this;
    }
}
UnicodeSetToScript[][] scriptToUnicodeSetToScript = new UnicodeSetToScript[UScript.CODE_LIMIT][];
UnicodeSet[] fastReject = new UnicodeSet[UScript.CODE_LIMIT];
UnicodeMap idnChars = new UnicodeMap();
UnicodeSet nonstarting = new UnicodeSet();
/**
 * Reads "idnchars.txt" from the input directory. Each data line has the form
 * {@code range ; type}, where the range is one hex code point or
 * {@code start..end}. Ranges of type "nonstarting" are added to
 * {@code nonstarting}; all other ranges are recorded in {@code idnChars} under
 * their type.
 *
 * @throws IOException      if the file cannot be opened or closed
 * @throws RuntimeException if a line cannot be read or parsed; the message
 *                          names the line and the cause is attached
 */
void loadIdentifiers() throws IOException {
    BufferedReader br = BagFormatter.openUTF8Reader(indir,
            "idnchars.txt");
    String line = null;
    try {
        while (true) {
            line = Utility.readDataLine(br);
            if (line == null)
                break;
            if (line.length() == 0)
                continue;
            String[] pieces = Utility.split(line, ';');
            // field 0 is the code point or code point range
            String range = pieces[0].trim();
            int rangeDivider = range.indexOf("..");
            int start, end;
            if (rangeDivider < 0) {
                start = end = Integer.parseInt(range, 16);
            } else {
                start = Integer.parseInt(range.substring(0, rangeDivider),
                        16);
                end = Integer.parseInt(range.substring(rangeDivider + 2),
                        16);
            }
            // field 1 is the type assigned to the range
            String type = pieces[1].trim().intern();
            if (type.equals("nonstarting")) nonstarting.add(start,end);
            else idnChars.putAll(start, end, type);
        }
    } catch (Exception e) {
        throw (RuntimeException) new RuntimeException("Failure on line "
                + line).initCause(e);
    } finally {
        // close on failure as well; the reader used to leak when a line was bad
        br.close();
    }
}
Map type_equivalences;
/**
 * Reads "confusables.txt" from the data directory and builds one
 * equivalence class per type. Each data line has the form
 * <code>source ; target ; type</code> with source and target given in hex.
 * {@link #type_equivalences} is only replaced once the whole file has been
 * read successfully, so a failed load never leaves a partial table behind.
 *
 * @throws IOException if the file cannot be opened or closed
 * @throws RuntimeException wrapping any failure while parsing, with the
 *         offending line in the message
 */
void loadConfusables() throws IOException {
    BufferedReader br = BagFormatter.openUTF8Reader(indir,
            "confusables.txt");
    String line = null;
    Map loaded = new HashMap();
    try {
        while (true) {
            line = Utility.readDataLine(br);
            if (line == null)
                break;
            if (line.length() == 0)
                continue;
            String[] pieces = Utility.split(line, ';');
            // part 0 is the source string
            String s = Utility.fromHex(pieces[0].trim());
            // part 1 is the target string
            String t = Utility.fromHex(pieces[1].trim());
            // part 2 is the confusable type
            String type = pieces[2].trim();
            XEquivalenceClass ec = (XEquivalenceClass) loaded.get(type);
            if (ec == null) {
                ec = new XEquivalenceClass("");
                loaded.put(type, ec);
            }
            ec.add(s, t);
        }
        // publish only after a complete, successful read
        type_equivalences = loaded;
    } catch (Exception e) {
        throw (RuntimeException) new RuntimeException("Failure on line "
                + line).initCause(e);
    } finally {
        // close on every path; previously the reader leaked when parsing failed
        br.close();
    }
}
/**
 * Returns the strings recorded as confusable with the given code point for
 * the given confusable type, loading the confusables data on first use.
 *
 * @param cp code point to look up
 * @param type confusable type, as it appears in confusables.txt
 * @return the equivalences reported by the type's equivalence class, or
 *         null if the data could not be read or the type is unknown
 */
public Set getConfusables(int cp, String type) {
    try {
        if (type_equivalences == null) loadConfusables();
    } catch (IOException e) {
        return null;
    }
    XEquivalenceClass ec = (XEquivalenceClass) type_equivalences.get(type);
    if (ec == null) {
        // unknown type: report "no data" the same way as a failed load
        // instead of throwing a NullPointerException
        return null;
    }
    return ec.getEquivalences(UTF16.valueOf(cp));
}
/**
 * Reads "confusablesWholeScript.txt" from the data directory and fills
 * {@link #scriptToUnicodeSetToScript} and {@link #fastReject}. Each data
 * line has the form <code>range ; script1 ; script2 ; type</code>; only
 * lines whose type equals filterType are used.
 *
 * @param filterType the type a line must carry to be used
 * @throws IOException if the file cannot be opened or closed
 * @throws RuntimeException wrapping any failure while parsing or building
 *         the tables, with the last line read in the message
 */
void loadWholeScriptConfusables(String filterType) throws IOException {
    // the two-dimensional allocation already creates every inner array
    UnicodeSet[][] script_script_set = new UnicodeSet[UScript.CODE_LIMIT][UScript.CODE_LIMIT];
    BufferedReader br = BagFormatter.openUTF8Reader(indir,
            "confusablesWholeScript.txt");
    String line = null;
    try {
        while (true) {
            line = Utility.readDataLine(br);
            if (line == null)
                break;
            if (line.length() == 0)
                continue;
            String[] pieces = Utility.split(line, ';');
            // part 0 is the code point or code point range
            String range = pieces[0].trim();
            int rangeDivider = range.indexOf("..");
            int start, end;
            if (rangeDivider < 0) {
                start = end = Integer.parseInt(range, 16);
            } else {
                start = Integer.parseInt(range.substring(0, rangeDivider),
                        16);
                end = Integer.parseInt(range.substring(rangeDivider + 2),
                        16);
            }
            // part 1 is script1
            int script1 = UScript.getCodeFromName(pieces[1].trim());
            // part 2 is script2
            int script2 = UScript.getCodeFromName(pieces[2].trim());
            // part 3 is the type, used as a filter
            String type = pieces[3].trim();
            if (!type.equals(filterType))
                continue;
            if (script_script_set[script1][script2] == null) {
                script_script_set[script1][script2] = new UnicodeSet();
            }
            script_script_set[script1][script2].add(start, end);
        }
        // flatten each row into a compact array and compute its fast-reject set
        for (int i = 0; i < script_script_set.length; ++i) {
            UnicodeSet accept = new UnicodeSet();
            List curr = new ArrayList();
            for (int j = 0; j < script_script_set[i].length; ++j) {
                if (script_script_set[i][j] == null)
                    continue;
                accept.addAll(script_script_set[i][j]);
                curr.add(new UnicodeSetToScript().setScript(j).setSet(
                        script_script_set[i][j]));
                if (DEBUG && i == UScript.LATIN)
                    System.out.println(UScript.getName(i) + "; "
                            + UScript.getName(j) + "; "
                            + script_script_set[i][j]);
            }
            if (curr.size() == 0)
                continue;
            scriptToUnicodeSetToScript[i] = (UnicodeSetToScript[]) curr
                    .toArray(new UnicodeSetToScript[curr.size()]);
            // anything outside the union of the row's sets rules the script out
            fastReject[i] = accept.complement();
            if (DEBUG && i == UScript.LATIN)
                System.out.println(UScript.getName(i) + "; "
                        + fastReject[i]);
        }
    } catch (Exception e) {
        throw (RuntimeException) new RuntimeException("Failure on line "
                + line).initCause(e);
    } finally {
        // close on every path; previously the reader leaked when parsing failed
        br.close();
    }
}
/**
 * Checks whether a single-script string has at least one whole-script
 * confusable. The string's Common and Inherited characters are ignored.
 *
 * @param givenString the string to examine
 * @param resultingScripts receives the scripts for which a whole-script
 *        confusable exists; left untouched when the string is not in a
 *        single script
 * @return true if at least one whole-script confusable exists
 */
boolean hasWholeScriptConfusable(String givenString, BitSet resultingScripts) {
    final int singleScript = getSingleScript(givenString);
    if (singleScript != UScript.INVALID_CODE) {
        UnicodeSet chars = new UnicodeSet();
        chars.addAll(givenString);
        chars.removeAll(commonAndInherited);
        return hasWholeScriptConfusable(singleScript, chars, resultingScripts);
    }
    return false;
}
/**
 * Core whole-script check on an already analysed string.
 *
 * @param givenScript script code of the characters in givenSet
 * @param givenSet the characters to test
 * @param resultingScripts cleared first, then a bit is set for every
 *        script whose recorded set contains all of givenSet
 * @return true if at least one bit was set
 */
private boolean hasWholeScriptConfusable(int givenScript,
        UnicodeSet givenSet, BitSet resultingScripts) {
    resultingScripts.clear();
    final UnicodeSet rejects = fastReject[givenScript];
    // no data for this script, or some character lies outside every recorded set
    if (rejects == null || rejects.containsSome(givenSet)) {
        return false;
    }
    final UnicodeSetToScript[] candidates = scriptToUnicodeSetToScript[givenScript];
    for (int k = 0; k < candidates.length; ++k) {
        UnicodeSetToScript candidate = candidates[k];
        if (candidate.getSet().containsAll(givenSet)) {
            resultingScripts.set(candidate.getScript());
        }
    }
    return resultingScripts.cardinality() != 0;
}
/*
* Reports whether a string that mixes scripts has a mixed-script confusable.
* The string's characters (minus Common and Inherited) are grouped by script.
* For each script i present, every other script j present must have a
* whole-script confusable in script i; as soon as one i satisfies this for
* all other scripts, the method returns. For this routine, we don't care
* what the target scripts are beyond that test.
*
* Returns false when no script qualifies, and also when only a single script
* is present (there is then no "other" script to set result to true).
*/
boolean hasMixedScriptConfusable(String givenString) {
UnicodeSet givenSet = new UnicodeSet().addAll(givenString).removeAll(
commonAndInherited);
UnicodeSet[] byScript = getScripts(givenSet);
// reused for every call below; hasWholeScriptConfusable clears it each time
BitSet wholeScripts = new BitSet();
boolean result = false;
main: for (int i = 0; i < byScript.length; ++i) {
if (byScript[i] == null)
continue;
// see if the other characters have whole script confusables in
// my script
for (int j = 0; j < byScript.length; ++j) {
if (j == i || byScript[j] == null)
continue;
// script j has no whole-script confusable at all: candidate i fails
if (!hasWholeScriptConfusable(j, byScript[j], wholeScripts))
continue main;
if (!wholeScripts.get(i))
continue main; // doesn't have the
// one we want
result = true;
}
return result; // passed the gauntlet
}
return false;
}
/*
* Returns UScript.INVALID_CODE if mixed script, otherwise the script.
* An empty string also yields UScript.INVALID_CODE, because no script is
* ever recorded for it.
*
* NOTE(review): once a leading Common/Inherited character has been recorded
* as lastScript, a following character of any other script makes the method
* return INVALID_CODE (the final comparison sees script != lastScript).
* Confirm whether that is intended.
*/
public static int getSingleScript(String source) {
int lastScript = UScript.INVALID_CODE;
int cp;
// advance by code point so supplementary characters are read as one unit
for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(source, i);
int script = UScript.getScript(cp);
if (script == UScript.COMMON || script == UScript.INHERITED) {
// Common/Inherited characters that are in XIDContinueSet never cause a
// mismatch; they only seed lastScript when nothing has been seen yet.
if (XIDContinueSet.contains(cp)) {
if (lastScript == UScript.INVALID_CODE)
lastScript = script;
// NOTE(review): the original remark here read "skip if not identifier",
// but this branch is taken when cp IS in XIDContinueSet -- confirm intent.
continue;
}
// remaining Common/Inherited characters are all compared as COMMON
script = UScript.COMMON;
}
if (lastScript == UScript.INVALID_CODE)
lastScript = script;
else if (script != lastScript)
return UScript.INVALID_CODE;
}
return lastScript;
}
/**
 * Splits a set of code points into one set per script.
 *
 * @param sourceSet the code points to partition
 * @return an array indexed by script code; an entry is null when the
 *         source contains no code point of that script
 */
public static UnicodeSet[] getScripts(UnicodeSet sourceSet) {
    final UnicodeSet[] buckets = new UnicodeSet[UScript.CODE_LIMIT];
    final UnicodeSetIterator iter = new UnicodeSetIterator(sourceSet);
    while (iter.next()) {
        final int codePoint = iter.codepoint;
        final int scriptCode = UScript.getScript(codePoint);
        UnicodeSet bucket = buckets[scriptCode];
        if (bucket == null) {
            bucket = new UnicodeSet();
            buckets[scriptCode] = bucket;
        }
        bucket.add(codePoint);
    }
    return buckets;
}
}

View File

@ -1,187 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestNameUniqueness.java,v $
* $Date: 2004/10/14 17:54:56 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import com.ibm.text.utility.*;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.UnicodeProperty;
import com.ibm.icu.text.UnicodeSet;
public class TestNameUniqueness implements UCD_Types {
/**
 * Entry point: runs the name-list collision check. The per-code-point check
 * (checkNames) is currently disabled.
 */
public static void main(String[] args) throws Exception {
checkNameList();
// new TestNameUniqueness().checkNames();
}
// processed name -> first code point seen with that name (filled by checkNames)
Map names = new HashMap();
// per-character statistics gathered by processName, indexed by char value (0..127):
// charCount[c] counts occurrences of c, samples[c] holds a code point whose name contained c
int[] charCount = new int[128];
int[] samples = new int[128];
/**
 * Iterates over the character names found in the files listed in
 * {@link #files}, one file after the other. After each call to
 * {@link #next()} the raw line that produced the name is available in
 * {@link #line}.
 */
public static class NameIterator {
    /** Index into files of the file currently being read; -1 before the first read. */
    int fileCount = -1;
    /** The raw line most recently read; null once the input is exhausted. */
    String line;
    BufferedReader br;
    String[] pieces = new String[3];
    /** {directory, file name} pairs, read in order (hard-coded local paths). */
    static String[][] files = {
        {"C:\\DATA\\", "pdam1040630.lst"},
        {"C:\\DATA\\UCD\\4.1.0-Update\\", "NamedCompositeEntities-4.1.0d2.txt"}
    };

    /**
     * Returns the next name. Lines of the first file are skipped unless they
     * start with an upper-case hex digit; the name is then taken from the
     * second tab-separated field, cut at '(' and '*'. For later files the
     * name is the second ';'-separated field.
     *
     * @return the next name, or null when done (including when a file
     *         cannot be read)
     */
    public String next() {
        while (true) {
            try {
                line = (br != null) ? br.readLine() : null;
                // move on to the next file(s) once the current one is exhausted
                while (line == null) {
                    closeReader();
                    if (fileCount + 1 >= files.length) {
                        // previously this ran off the end of files and threw
                        // ArrayIndexOutOfBoundsException instead of finishing
                        return null;
                    }
                    fileCount++;
                    br = BagFormatter.openReader(files[fileCount][0], files[fileCount][1], "ISO-8859-1");
                    line = br.readLine();
                }
            } catch (IOException e) {
                // Best effort, as before: an unreadable file ends the iteration.
                // Clear the state so that a stale line is never handed out again.
                line = null;
                closeReader();
                return null;
            }
            if (line.length() == 0) continue;
            if (fileCount == 0) {
                char c = line.charAt(0);
                // skip if doesn't start with hex digit
                if (!(('0' <= c && c <= '9') || ('A' <= c && c <= 'F'))) continue;
                Utility.split(line,'\t',pieces,true);
                Utility.split(pieces[1],'(',pieces,true);
                Utility.split(pieces[0],'*',pieces,true);
                return pieces[0];
            } else {
                Utility.split(line,';',pieces,true);
                return pieces[1];
            }
        }
    }

    /** Closes the current reader, if any, ignoring failures on close. */
    private void closeReader() {
        if (br != null) {
            try {
                br.close();
            } catch (IOException ignored) {
                // nothing useful can be done about a failed close of an input file
            }
            br = null;
        }
    }
}
/**
 * Reads every name delivered by a NameIterator, reduces it to a skeleton
 * key and reports collisions between different lines that share a key.
 * Names starting with "&lt;" are used as their own key and never reported.
 * A progress sample is printed for selected lines and for every 1000th name.
 *
 * @throws IOException declared for callers; not thrown by the code shown here
 */
public static void checkNameList() throws IOException {
    Map map = new HashMap();
    NameIterator nameIterator = new NameIterator();
    int lineCount = 0;
    while (true) {
        String name = nameIterator.next();
        if (name == null) break;
        String key;
        try {
            if (name.startsWith("<")) key = name;
            else key = UnicodeProperty.toNameSkeleton(name);
        } catch (RuntimeException e) {
            System.out.println("Error on " + nameIterator.line);
            throw e;
        }
        Object value = map.get(key);
        if (value != null && !key.startsWith("<")) {
            System.out.println("*!*!*!* Collision at " + key + " between: ");
            System.out.println("\t" + value);
            System.out.println("\t" + nameIterator.line);
            //throw new IllegalArgumentException();
        }
        map.put(key, nameIterator.line);
        // Count every name. The increment used to sit at the end of the
        // short-circuit || chain below, so names matching an earlier
        // condition were never counted.
        int index = lineCount++;
        if (nameIterator.line.startsWith("116C")
                || nameIterator.line.startsWith("1180")
                || name.indexOf('-') >= 0
                || (index % 1000) == 0) {
            System.out.println("[" + lineCount + "]\t" + nameIterator.line + "\t" + name);
            System.out.println("\t" + name);
            System.out.println("\t" + key);
        }
    }
}
/**
 * Writes "name_uniqueness.txt" with three sections: collisions between
 * processed character names, one sample per character that occurred in a
 * name, and the processed names of the first 256 code points. Unallocated
 * code points, code points with computable names and control characters
 * (category Cc) are skipped.
 *
 * @throws IOException if the output file cannot be opened
 */
void checkNames() throws IOException {
    PrintWriter out = Utility.openPrintWriter("name_uniqueness.txt", Utility.LATIN1_WINDOWS);
    try {
        out.println("Collisions");
        out.println();
        // inclusive upper bound: U+10FFFF is the last code point
        for (int cp = 0; cp <= 0x10FFFF; ++cp) {
            Utility.dot(cp);
            if (!Default.ucd().isAllocated(cp)) continue;
            if (Default.ucd().hasComputableName(cp)) continue;
            int cat = Default.ucd().getCategory(cp);
            if (cat == Cc) continue;
            String name = Default.ucd().getName(cp);
            String processedName = processName(cp, name);
            Integer existing = (Integer) names.get(processedName);
            if (existing != null) {
                out.println("Collision between: "
                        + Default.ucd().getCodeAndName(existing.intValue())
                        + ", " + Default.ucd().getCodeAndName(cp));
            } else {
                names.put(processedName, new Integer(cp));
            }
        }
        out.println();
        out.println("Samples");
        out.println();
        // processName() updates charCount and samples as a side effect, so
        // report from a snapshot; otherwise producing the report would
        // change the figures printed for later characters.
        int[] countSnapshot = (int[]) charCount.clone();
        int[] sampleSnapshot = (int[]) samples.clone();
        for (int i = 0; i < countSnapshot.length; ++i) {
            int count = countSnapshot[i];
            if (count == 0) continue;
            int sample = sampleSnapshot[i];
            String sampleName = Default.ucd().getCodeAndName(sample);
            out.println(count + "\t'" + ((char)i)
                    + "'\t" + sampleName
                    + "\t=>\t" + processName(sample, Default.ucd().getName(sample)));
        }
        out.println();
        out.println("Name Samples");
        out.println();
        for (int i = 0; i < 256; ++i) {
            int cat = Default.ucd().getCategory(i);
            if (cat == Cc) continue;
            out.println(Default.ucd().getCodeAndName(i)
                    + "\t=>\t" + processName(i, Default.ucd().getName(i)));
        }
    } finally {
        out.close();
    }
}
// {search, replacement} pairs handed to Utility.replace by processName:
// each listed word is replaced by the empty string. The commented-out
// entries are alternatives that are currently disabled.
static final String[][] replacements = {
//{"SMALL LETTER", ""},
{"LETTER", ""},
{"CHARACTER", ""},
{"DIGIT", ""},
{"SIGN", ""},
//{"WITH", ""},
};
// scratch buffer reused by processName on every call (so processName is not reentrant)
StringBuffer processNamesBuffer = new StringBuffer();
/**
 * Reduces a character name to a comparison key: the words listed in
 * replacements are substituted first, then only the characters A-Z and 0-9
 * are kept. As a side effect every character of the substituted name is
 * tallied in charCount, and samples remembers a code point for each
 * character that had none recorded yet.
 *
 * @param codePoint the code point the name belongs to
 * @param name the character name
 * @return the reduced name
 */
String processName(int codePoint, String name) {
    final String replaced = Utility.replace(name, replacements);
    final int length = replaced.length();
    processNamesBuffer.setLength(0);
    for (int pos = 0; pos < length; ++pos) {
        final char ch = replaced.charAt(pos);
        charCount[ch]++;
        if (samples[ch] == 0) {
            samples[ch] = codePoint;
        }
        final boolean isUpper = ch >= 'A' && ch <= 'Z';
        final boolean isDigit = ch >= '0' && ch <= '9';
        if (isUpper || isDigit) {
            processNamesBuffer.append(ch);
        }
    }
    // nothing was dropped: hand back the substituted name itself
    return processNamesBuffer.length() == length
            ? replaced
            : processNamesBuffer.toString();
}
}

View File

@ -1,246 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestNormalization.java,v $
* $Date: 2004/02/12 08:23:16 $
* $Revision: 1.8 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.text.utility.*;
public final class TestNormalization {
// Directory that main reads NormalizationTest.txt from (hard-coded local path).
static final String DIR = "C:\\Documents and Settings\\Davis\\My Documents\\UnicodeData\\Update 3.0.1\\";
// NOTE(review): not referenced anywhere in the code visible here.
static final boolean SKIP_FILE = true;
// Log that check() writes differences to; opened and closed by main.
static PrintWriter out = null;
// Reader over the test data file; opened and closed by main.
static BufferedReader in = null;
// Code points that occurred alone in the first column of the test file;
// checkMissing skips these.
static BitSet charsListed = new BitSet(0x110000);
// Running totals: failed checks, and changes of the expected string between failures.
static int errorCount = 0;
static int lineErrorCount = 0;
// The unmodified input line being processed, and the last one echoed into the log.
static String originalLine = "";
static String lastLine = "";
/**
 * Runs the normalization conformance test: checks every line of
 * NormalizationTest.txt (found in DIR) against the NFC, NFD, NFKC and NFKD
 * normalizers, then checks that every code point not listed on its own in
 * the file is unchanged by all four forms. Differences are written to
 * "NormalizationTestLog.txt"; totals are printed to System.out.
 *
 * @throws java.io.IOException if a file cannot be opened, read or closed
 */
public static void main(String[] args) throws java.io.IOException {
    System.out.println("Creating Normalizers");

    String[] testSet = {"a\u0304\u0328", "a\u0328\u0304"};
    for (int i = 0; i < testSet.length; ++i) {
        String s = testSet[i];
        boolean test = Default.nfc().isFCD(s);
        System.out.println(test + ": " + Default.ucd().getCodeAndName(s));
    }

    String x = UTF32.valueOf32(0x10000);
    check("NFC", Default.nfc(), x);
    check("NFD", Default.nfd(), x);
    check("NFKC", Default.nfkc(), x);
    check("NFKD", Default.nfkd(), x);

    try {
        // Open both streams inside the try so that the finally clause closes
        // the log even when the test data file cannot be opened.
        out = new PrintWriter(
            new BufferedWriter(
            new OutputStreamWriter(
                new FileOutputStream("NormalizationTestLog.txt"),
                "UTF8"),
            32*1024));
        in = new BufferedReader (
            new FileReader (DIR + "NormalizationTest.txt"),
            32*1024);

        String[] parts = new String[10];

        System.out.println("Checking files");

        int count = 0;

        while (true) {
            String line = in.readLine();
            if (line == null) break;
            // progress marker every 1024 lines
            if ((count++ & 0x3FF) == 0) System.out.println("#LINE: " + line);
            originalLine = line;
            int pos = line.indexOf('#');
            if (pos >= 0) {
                line = line.substring(0,pos);
            }
            line = line.trim();
            if (line.length() == 0) continue;

            int splitCount = Utility.split(line, ';', parts);
            // A test line needs the five columns c1..c5. With fewer fields the
            // checks below would silently reuse columns left over from the
            // previous line, so skip the line and say so in the log.
            if (splitCount < 5) {
                out.println("SKIPPED (fewer than 5 fields): " + originalLine);
                continue;
            }
            for (int i = 0; i < splitCount; ++i) {
                parts[i] = Utility.fromHex(parts[i]);
            }

            // remember code points that are listed on their own in column 1
            if (UTF32.length32(parts[0]) == 1) {
                int code = UTF32.char32At(parts[0],0);
                charsListed.set(code);
                if ((code & 0x3FF) == 0) System.out.println("# " + Utility.hex(code));
            }

            // c2 == NFC(c1) == NFC(c2) == NFC(c3)
            errorCount += check("NFCa", Default.nfc(), parts[1], parts[0]);
            errorCount += check("NFCb", Default.nfc(), parts[1], parts[1]);
            errorCount += check("NFCc", Default.nfc(), parts[1], parts[2]);

            // c4 == NFC(c4) == NFC(c5)
            errorCount += check("NFCd", Default.nfc(), parts[3], parts[3]);
            errorCount += check("NFCe", Default.nfc(), parts[3], parts[4]);

            // c3 == NFD(c1) == NFD(c2) == NFD(c3)
            errorCount += check("NFDa", Default.nfd(), parts[2], parts[0]);
            errorCount += check("NFDb", Default.nfd(), parts[2], parts[1]);
            errorCount += check("NFDc", Default.nfd(), parts[2], parts[2]);

            // c5 == NFD(c4) == NFD(c5)
            errorCount += check("NFDd", Default.nfd(), parts[4], parts[3]);
            errorCount += check("NFDe", Default.nfd(), parts[4], parts[4]);

            // c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
            errorCount += check("NFKCa", Default.nfkc(), parts[3], parts[0]);
            errorCount += check("NFKCb", Default.nfkc(), parts[3], parts[1]);
            errorCount += check("NFKCc", Default.nfkc(), parts[3], parts[2]);
            errorCount += check("NFKCd", Default.nfkc(), parts[3], parts[3]);
            errorCount += check("NFKCe", Default.nfkc(), parts[3], parts[4]);

            // c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
            errorCount += check("NFKDa", Default.nfkd(), parts[4], parts[0]);
            errorCount += check("NFKDb", Default.nfkd(), parts[4], parts[1]);
            errorCount += check("NFKDc", Default.nfkd(), parts[4], parts[2]);
            errorCount += check("NFKDd", Default.nfkd(), parts[4], parts[3]);
            errorCount += check("NFKDe", Default.nfkd(), parts[4], parts[4]);
        }
        System.out.println("Total errors in file: " + errorCount
            + ", lines: " + lineErrorCount);
        errorCount = lineErrorCount = 0;

        System.out.println("Checking Missing");
        checkMissing();
        System.out.println("Total errors in unlisted items: " + errorCount
            + ", lines: " + lineErrorCount);
    } finally {
        try {
            if (in != null) in.close();
        } finally {
            if (out != null) out.close();
        }
    }
}
static String lastBase = "";
/**
 * Checks that normalizing <code>other</code> with <code>n</code> yields
 * <code>base</code>, and logs a "DIFF" line when it does not. The original
 * input line is appended to the first difference reported for it, and
 * lineErrorCount is increased whenever the expected string differs from
 * the one of the previous failure.
 *
 * @param type label for the check, used in the log line
 * @param n the normalizer under test
 * @param base the expected result
 * @param other the string to normalize
 * @return 1 if a difference was logged, 0 otherwise
 * @throws ChainException wrapping any exception raised during the check
 */
public static int check(String type, Normalizer n, String base, String other) {
    try {
        String trans = n.normalize(other);
        if (!trans.equals(base)) {
            String temp = "";
            if (!lastLine.equals(originalLine)) {
                temp = "// " + originalLine;
                lastLine = originalLine;
            }
            if (!base.equals(lastBase)) {
                lastBase = base;
                lineErrorCount++;
            }
            String otherList = "";
            if (!base.equals(other)) {
                otherList = "(" + Default.ucd().getCodeAndName(other) + ")";
            }
            String message = "DIFF " + type + ": "
                + Default.ucd().getCodeAndName(base) + " != "
                + type
                + otherList
                + " == " + Default.ucd().getCodeAndName(trans)
                + temp;
            // main() calls check() before the log file is opened; fall back
            // to the console instead of failing with a NullPointerException
            if (out != null) {
                out.println(message);
            } else {
                System.out.println(message);
            }
            return 1;
        }
    } catch (Exception e) {
        throw new ChainException("DIFF " + type + ": "
            + Default.ucd().getCodeAndName(base) + " != "
            + type + "(" + Default.ucd().getCodeAndName(other) + ")", new Object[]{}, e);
    }
    return 0;
}

/**
 * Checks that <code>base</code> is left unchanged by the normalizer.
 *
 * @return 1 if a difference was logged, 0 otherwise
 */
public static int check(String type, Normalizer n, String base) {
    return check(type, n, base, base);
}
/**
 * Checks every code point that was not listed on its own in the test file:
 * each must be unchanged by NFC, NFD, NFKC and NFKD. Failures are added to
 * errorCount.
 */
static void checkMissing() {
    // Cover the whole code space up to U+10FFFF (charsListed is sized
    // 0x110000); the previous bound of 0x100000 left out plane 16.
    for (int missing = 0; missing <= 0x10FFFF; ++missing) {
        // progress marker every 4096 code points
        if ((missing & 0xFFF) == 0) System.out.println("# " + Utility.hex(missing));
        if (charsListed.get(missing)) continue;
        String x = UTF32.valueOf32(missing);
        errorCount += check("NFC", Default.nfc(), x);
        errorCount += check("NFD", Default.nfd(), x);
        errorCount += check("NFKC", Default.nfkc(), x);
        errorCount += check("NFKD", Default.nfkd(), x);
    }
}
/**
 * Prints, for every leading character that composes with at least one
 * trailing starter (a trailing character with combining class 0), the set
 * of such followers. Leading characters are then grouped by identical
 * follower set and each group is printed as an HTML table row.
 */
public static void checkStarters () {
    System.out.println("Checking Starters");
    final UnicodeSet leading = new UnicodeSet();
    final UnicodeSet trailing = new UnicodeSet();
    for (int cp = 0; cp <= 0x10FFFF; ++cp) {
        if (Default.nfc().isLeading(cp)) {
            leading.add(cp);
        }
        // only characters with combining class 0 count as trailing starters
        if (Default.ucd().getCombiningClass(cp) == 0 && Default.nfc().isTrailing(cp)) {
            trailing.add(cp);
        }
    }
    System.out.println("Leading: " + leading.size());
    System.out.println("Trailing Starters: " + trailing.size());

    final UnicodeSetIterator leadIter = new UnicodeSetIterator(leading);
    final UnicodeSetIterator trailIter = new UnicodeSetIterator(trailing);
    final UnicodeSet followers = new UnicodeSet();
    // follower set -> set of leading characters sharing exactly those followers
    final Map byFollowers = new TreeMap(new CompareProperties.UnicodeSetComparator());
    while (leadIter.next()) {
        followers.clear();
        for (trailIter.reset(); trailIter.next();) {
            int composed = Default.nfc().getComposition(leadIter.codepoint, trailIter.codepoint);
            if (composed != 0xFFFF) {
                followers.add(trailIter.codepoint);
            }
        }
        if (followers.size() != 0) {
            System.out.println(Default.ucd().getCode(leadIter.codepoint)
                    + "\t" + followers.toPattern(true));
            UnicodeSet leads = (UnicodeSet) byFollowers.get(followers);
            if (leads == null) {
                leads = new UnicodeSet();
                // followers is reused for the next lead, so store a copy as the key
                byFollowers.put(followers.clone(), leads);
            }
            leads.add(leadIter.codepoint);
        }
    }

    final BagFormatter formatter = new BagFormatter();
    formatter.setLineSeparator("<br>");
    formatter.setLabelSource(null);
    formatter.setAbbreviated(true);
    for (Iterator entries = byFollowers.entrySet().iterator(); entries.hasNext();) {
        Map.Entry entry = (Map.Entry) entries.next();
        UnicodeSet trailSet = (UnicodeSet) entry.getKey();
        UnicodeSet leadSet = (UnicodeSet) entry.getValue();
        System.out.println("<tr><td>"
                + formatter.showSetNames(leadSet)
                + "</td><td>"
                + formatter.showSetNames(trailSet)
                + "</td></tr>");
    }
}
}

View File

@ -1,259 +0,0 @@
package com.ibm.text.UCD;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.ParseException;
import java.text.ParsePosition;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.Tabber;
import com.ibm.icu.dev.test.util.TransliteratorUtilities;
import com.ibm.icu.dev.tool.UOption;
import com.ibm.icu.text.SymbolTable;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeMatcher;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.text.utility.Utility;
public class TestUnicodeInvariants {
// Indexes into the options array below; the order here must match the order
// in which the options are listed there.
private static final int
HELP1 = 0,
FILE = 1,
RANGE = 2,
TABLE = 3
;
// Command-line options: help, "file" (takes an argument: the invariants file
// to read), "norange" (turns range merging off) and "table" (selects the
// HTML tabber for the output).
private static final UOption[] options = {
UOption.HELP_H(),
UOption.create("file", 'f', UOption.REQUIRES_ARG),
UOption.create("norange", 'n', UOption.NO_ARG),
UOption.create("table", 't', UOption.NO_ARG),
};
/**
 * Entry point: parses the command line, echoes the effective settings and
 * runs the invariant tests. The invariants file defaults to
 * "UnicodeInvariants.txt" unless the file option is given.
 */
public static void main(String[] args) throws IOException {
    UOption.parseArgs(args, options);

    final boolean doRange = !options[RANGE].doesOccur;
    final String file = options[FILE].doesOccur
            ? options[FILE].value
            : "UnicodeInvariants.txt";

    System.out.println("File:\t" + file);
    System.out.println("Ranges?\t" + doRange);
    System.out.println("HTML?\t" + options[TABLE].doesOccur);

    testInvariants(file, doRange);
}
/**
 * A SymbolTable that delegates to a sequence of other SymbolTables and
 * answers with the first non-null result.
 * @author Davis
 */
static class ChainedSymbolTable implements SymbolTable {
    // TODO: add accessors?
    private List symbolTables;

    /**
     * The tables are consulted in array order by every lookup method, so
     * earlier tables take precedence over later ones.
     * @param symbolTables the tables to chain
     */
    ChainedSymbolTable(SymbolTable[] symbolTables) {
        this.symbolTables = Arrays.asList(symbolTables);
    }

    /** @return the first non-null lookup result, or null if no table knows s */
    public char[] lookup(String s) {
        final int tableCount = symbolTables.size();
        for (int k = 0; k < tableCount; ++k) {
            char[] found = ((SymbolTable) symbolTables.get(k)).lookup(s);
            if (found != null) {
                return found;
            }
        }
        return null;
    }

    /** @return the first non-null matcher, or null if no table has one for ch */
    public UnicodeMatcher lookupMatcher(int ch) {
        final int tableCount = symbolTables.size();
        for (int k = 0; k < tableCount; ++k) {
            UnicodeMatcher found = ((SymbolTable) symbolTables.get(k)).lookupMatcher(ch);
            if (found != null) {
                return found;
            }
        }
        return null;
    }

    // Warning: this depends on pos being left alone unless a string is returned!!
    /** @return the first non-null parsed reference, or null if no table parses one */
    public String parseReference(String text, ParsePosition pos, int limit) {
        final int tableCount = symbolTables.size();
        for (int k = 0; k < tableCount; ++k) {
            String found = ((SymbolTable) symbolTables.get(k)).parseReference(text, pos, limit);
            if (found != null) {
                return found;
            }
        }
        return null;
    }
}
static final UnicodeSet INVARIANT_RELATIONS = new UnicodeSet("[\\~ \\= \\! \\? \\< \\> \u2264 \u2265 \u2282 \u2286 \u2283 \u2287]");
/**
 * Runs the invariant tests. Every input line is echoed to
 * "UnicodeInvariantResults.txt" in UCD_Types.GEN_DIR and then interpreted:
 * <ul>
 * <li><code>Let name = value</code> defines a textual variable that is
 *     substituted into all later lines;</li>
 * <li><code>Show [Each] set</code> lists the contents of a set;</li>
 * <li><code>[Test] set relation set</code> checks the relation between two
 *     sets and reports the differences when it does not hold;</li>
 * <li><code>Stop</code> ends processing.</li>
 * </ul>
 * Text after '#' is a comment. Parse errors and failed tests are counted
 * and summarized in the results file and on System.out.
 *
 * @param outputFile despite its name, the name of the INPUT file holding
 *        the invariants, read from "com/ibm/text/UCD/"
 * @param doRange whether ranges are merged in the listings
 * @throws IOException if a file cannot be opened, read or closed
 */
public static void testInvariants(String outputFile, boolean doRange) throws IOException {
    String[][] variables = new String[100][2];
    int variableCount = 0;
    int parseErrorCount = 0;
    int testFailureCount = 0;
    PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "UnicodeInvariantResults.txt");
    BufferedReader in = null;
    try {
        out.write('\uFEFF'); // BOM
        in = BagFormatter.openUTF8Reader("com/ibm/text/UCD/", outputFile);

        BagFormatter errorLister = new BagFormatter();
        errorLister.setMergeRanges(doRange);
        errorLister.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
        errorLister.setShowLiteral(TransliteratorUtilities.toXML);
        if (options[TABLE].doesOccur) errorLister.setTabber(new Tabber.HTMLTabber());

        BagFormatter showLister = new BagFormatter();
        showLister.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
        showLister.setMergeRanges(doRange);
        showLister.setShowLiteral(TransliteratorUtilities.toXML);
        if (options[TABLE].doesOccur) showLister.setTabber(new Tabber.HTMLTabber());

        ChainedSymbolTable st = new ChainedSymbolTable(new SymbolTable[] {
            ToolUnicodePropertySource.make(UCD.lastVersion).getSymbolTable("\u00D7"),
            ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")});
        ParsePosition pp = new ParsePosition(0);

        while (true) {
            String line = in.readLine();
            if (line == null) break;
            if (line.startsWith("\uFEFF")) line = line.substring(1);
            out.println(line);
            line = line.trim();
            int pos = line.indexOf('#');
            if (pos >= 0) line = line.substring(0,pos).trim();
            if (line.length() == 0) continue;
            if (line.equalsIgnoreCase("Stop")) break;

            // fix all the variables
            line = Utility.replace(line, variables, variableCount);

            // detect variable definitions
            if (line.startsWith("Let")) {
                int x = line.indexOf('=');
                if (x < 0) {
                    // report instead of dying with StringIndexOutOfBoundsException
                    out.println("PARSE ERROR:\t" + line);
                    out.println();
                    out.println("**** START Error Info ****");
                    out.println("Missing '=' in Let statement");
                    out.println("**** END Error Info ****");
                    out.println();
                    parseErrorCount++;
                    continue;
                }
                variables[variableCount][0] = line.substring(3,x).trim();
                variables[variableCount][1] = line.substring(x+1).trim();
                variableCount++;
                continue;
            }

            // detect listing requests
            if (line.startsWith("Show")) {
                String part = line.substring(4).trim();
                if (part.startsWith("Each")) {
                    part = part.substring(4).trim();
                    showLister.setMergeRanges(false);
                }
                pp.setIndex(0);
                UnicodeSet leftSet = new UnicodeSet(part, pp, st);
                showLister.showSetNames(out, leftSet);
                showLister.setMergeRanges(doRange);
                continue;
            }

            if (line.startsWith("Test")) {
                line = line.substring(4).trim();
            }

            char relation = 0;
            String rightSide = null;
            String leftSide = null;
            UnicodeSet leftSet = null;
            UnicodeSet rightSet = null;
            try {
                pp.setIndex(0);
                leftSet = new UnicodeSet(line, pp, st);
                leftSide = line.substring(0,pp.getIndex());
                eatWhitespace(line, pp);
                if (pp.getIndex() >= line.length()) {
                    // the line ends after the first set; report it as a
                    // parse error rather than letting charAt() throw
                    throw new ParseException("Missing relation", pp.getIndex());
                }
                relation = line.charAt(pp.getIndex());
                if (!INVARIANT_RELATIONS.contains(relation)) {
                    throw new ParseException("Invalid relation, must be one of " + INVARIANT_RELATIONS.toPattern(false),
                        pp.getIndex());
                }
                pp.setIndex(pp.getIndex()+1); // skip char
                eatWhitespace(line, pp);
                int start = pp.getIndex();
                rightSet = new UnicodeSet(line, pp, st);
                rightSide = line.substring(start,pp.getIndex());
                eatWhitespace(line, pp);
                if (line.length() != pp.getIndex()) {
                    throw new ParseException("Extra characters at end", pp.getIndex());
                }
            } catch (ParseException e) {
                out.println("PARSE ERROR:\t" + line.substring(0,e.getErrorOffset())
                    + "<@>" + line.substring(e.getErrorOffset()));
                out.println();
                out.println("**** START Error Info ****");
                out.println(e.getMessage());
                out.println("**** END Error Info ****");
                out.println();
                parseErrorCount++;
                continue;
            } catch (IllegalArgumentException e) {
                out.println("PARSE ERROR:\t" + line);
                out.println();
                out.println("**** START Error Info ****");
                out.println(e.getMessage());
                out.println("**** END Error Info ****");
                out.println();
                parseErrorCount++;
                continue;
            }

            // NOTE(review): INVARIANT_RELATIONS accepts '~', which has no case
            // below and therefore ends in "Internal Error"; '\u2261' has a case
            // but is not in INVARIANT_RELATIONS, so it can never get this far.
            boolean ok = true;
            switch(relation) {
            case '=': case '\u2261': ok = leftSet.equals(rightSet); break;
            case '<': case '\u2282': ok = rightSet.containsAll(leftSet) && !leftSet.equals(rightSet); break;
            case '>': case '\u2283': ok = leftSet.containsAll(rightSet) && !leftSet.equals(rightSet); break;
            case '\u2264': case '\u2286': ok = rightSet.containsAll(leftSet); break;
            case '\u2265': case '\u2287': ok = leftSet.containsAll(rightSet); break;
            case '!': ok = leftSet.containsNone(rightSet); break;
            case '?': ok = !leftSet.equals(rightSet)
                && !leftSet.containsAll(rightSet)
                && !rightSet.containsAll(leftSet)
                && !leftSet.containsNone(rightSet);
                break;
            default: throw new IllegalArgumentException("Internal Error");
            }
            if (ok) continue;
            out.println();
            out.println(String.valueOf(ok).toUpperCase(Locale.ENGLISH));
            out.println("**** START Error Info ****");
            errorLister.showSetDifferences(out, rightSide, rightSet, leftSide, leftSet);
            out.println("**** END Error Info ****");
            out.println();
            testFailureCount++;
        }
        out.println();
        out.println("**** SUMMARY ****");
        out.println();
        out.println("ParseErrorCount=" + parseErrorCount);
        out.println("TestFailureCount=" + testFailureCount);
    } finally {
        // close both streams on every path; the reader used to be left open
        // and the writer was lost (unflushed) when an exception escaped
        try {
            if (in != null) in.close();
        } finally {
            out.close();
        }
    }
    System.out.println("ParseErrorCount=" + parseErrorCount);
    System.out.println("TestFailureCount=" + testFailureCount);
}
/**
 * Advances the parse position past any run of white space, as defined by
 * UCharacter.isUWhiteSpace, stepping by whole code points.
 * @param line the text being parsed
 * @param pp position to start from; updated to the first non-white-space
 *        code point, or to the end of the line
 */
private static void eatWhitespace(String line, ParsePosition pp) {
    final int end = line.length();
    int index = pp.getIndex();
    while (index < end) {
        final int codePoint = UTF16.charAt(line, index);
        if (!com.ibm.icu.lang.UCharacter.isUWhiteSpace(codePoint)) {
            break;
        }
        index += UTF16.getCharCount(codePoint);
    }
    pp.setIndex(index);
}
}

View File

@ -1,780 +0,0 @@
package com.ibm.text.UCD;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeSet;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.dev.test.util.UnicodeProperty;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.text.utility.Utility;
public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
// When true, the constructor prints progress messages while registering properties.
static final boolean DEBUG = false;
// The UCD instance for this factory's version, obtained in the constructor.
private UCD ucd;
// Normalizers for the four normalization forms, created in the constructor for ucd's version.
private Normalizer nfc, nfd, nfkd, nfkc;
// NOTE(review): the use of the next two fields is not visible in this part of the
// file; presumably they cache per-age UCD instances -- confirm before relying on it.
private static boolean needAgeCache = true;
private static UCD[] ucdCache = new UCD[UCD_Types.LIMIT_AGE];
// version string -> ToolUnicodePropertySource, maintained by make().
private static HashMap factoryCache = new HashMap();
/**
 * Returns the property source for the given version string, creating and
 * caching it on first request. Later calls with the same string return the
 * same instance.
 *
 * @param version the version string, used as the cache key as given
 * @return the cached or newly created instance
 */
public static synchronized ToolUnicodePropertySource make(String version) {
    Object cached = factoryCache.get(version);
    if (cached == null) {
        cached = new ToolUnicodePropertySource(version);
        factoryCache.put(version, cached);
    }
    return (ToolUnicodePropertySource) cached;
}
private ToolUnicodePropertySource(String version) {
ucd = UCD.make(version);
nfc = new Normalizer(Normalizer.NFC, ucd.getVersion());
nfd = new Normalizer(Normalizer.NFD, ucd.getVersion());
nfkc = new Normalizer(Normalizer.NFKC, ucd.getVersion());
nfkd = new Normalizer(Normalizer.NFKD, ucd.getVersion());
version = ucd.getVersion(); // regularize
// first the special cases
if (DEBUG)
System.out.println("Adding Simple Cases");
add(new UnicodeProperty.SimpleProperty() {
public String _getValue(int codepoint) {
if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0)
return null;
return ucd.getName(codepoint);
}
}.setValues("<string>").setMain("Name", "na", UnicodeProperty.MISC, version));
add(new UnicodeProperty.SimpleProperty() {
public String _getValue(int codepoint) {
if (DEBUG && codepoint == 0x1D100) {
System.out.println("here");
}
//if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null;
return ucd.getBlock(codepoint);
}
protected UnicodeMap _getUnicodeMap() {
return ucd.blockData;
}
}.setValues(ucd.getBlockNames(null)).setMain("Block", "blk", UnicodeProperty.CATALOG, version));
add(new UnicodeProperty.SimpleProperty() {
public String _getValue(int codepoint) {
//if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null;
return ucd.getBidiMirror(codepoint);
}
}.setValues("<string>").setMain("Bidi_Mirroring_Glyph", "bmg", UnicodeProperty.STRING, version));
add(new UnicodeProperty.SimpleProperty() {
public String _getValue(int codepoint) {
//if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null;
return ucd.getCase(codepoint, UCD_Types.FULL, UCD_Types.FOLD);
}
}.setValues("<string>").setMain("Case_Folding", "cf", UnicodeProperty.STRING, version));
add(new UnicodeProperty.SimpleProperty() {
NumberFormat nf = NumberFormat.getInstance();
{
nf.setGroupingUsed(false);
nf.setMaximumFractionDigits(8);
nf.setMinimumFractionDigits(1);
}
public String _getValue(int codepoint) {
double num = ucd.getNumericValue(codepoint);
if (Double.isNaN(num))
return null;
return nf.format(num);
}
}.setMain("Numeric_Value", "nv", UnicodeProperty.NUMERIC, version));
add(new UnicodeProperty.SimpleProperty() {
public String _getValue(int cp) {
if (!ucd.isRepresented(cp))
return null;
String b = nfkc.normalize(ucd.getCase(cp, UCD_Types.FULL, UCD_Types.FOLD));
String c = nfkc.normalize(ucd.getCase(b, UCD_Types.FULL, UCD_Types.FOLD));
if (c.equals(b))
return null;
return c;
}
public int getMaxWidth(boolean isShort) {
return 14;
}
}.setMain("FC_NFKC_Closure", "FC_NFKC", UnicodeProperty.STRING, version)
//.addName("FNC")
);
add(new UnicodeProperty.SimpleProperty() {
public String _getValue(int codepoint) {
if (!nfd.isNormalized(codepoint))
return "No";
else if (nfd.isTrailing(codepoint))
throw new IllegalArgumentException("Internal Error!");
else
return "Yes";
}
public int getMaxWidth(boolean isShort) {
return 15;
}
}.setValues(LONG_YES_NO, YES_NO).swapFirst2ValueAliases().setMain("NFD_Quick_Check", "NFD_QC", UnicodeProperty.ENUMERATED, version));
add(new UnicodeProperty.SimpleProperty() {
public String _getValue(int codepoint) {
if (!nfc.isNormalized(codepoint))
return "No";
else if (nfc.isTrailing(codepoint))
return "Maybe";
else
return "Yes";
}
public int getMaxWidth(boolean isShort) {
return 15;
}
}.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE).swapFirst2ValueAliases().setMain("NFC_Quick_Check", "NFC_QC", UnicodeProperty.ENUMERATED, version));
add(new UnicodeProperty.SimpleProperty() {
public String _getValue(int codepoint) {
if (!nfkd.isNormalized(codepoint))
return "No";
else if (nfkd.isTrailing(codepoint))
throw new IllegalArgumentException("Internal Error!");
else
return "Yes";
}
public int getMaxWidth(boolean isShort) {
return 15;
}
}.setValues(LONG_YES_NO, YES_NO).swapFirst2ValueAliases().setMain("NFKD_Quick_Check", "NFKD_QC", UnicodeProperty.ENUMERATED, version));
add(new UnicodeProperty.SimpleProperty() {
public String _getValue(int codepoint) {
if (!nfkc.isNormalized(codepoint))
return "No";
else if (nfkc.isTrailing(codepoint))
return "Maybe";
else
return "Yes";
}
public int getMaxWidth(boolean isShort) {
return 15;
}
}.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE).swapFirst2ValueAliases().setMain("NFKC_Quick_Check", "NFKC_QC", UnicodeProperty.ENUMERATED, version));
/*
add(new UnicodeProperty.SimpleProperty() {
public String _getValue(int codepoint) {
if (!nfx.isNormalized(codepoint)) return NO;
else if (nfx.isTrailing(codepoint)) return MAYBE;
else return "";
}
}.setMain("NFD_QuickCheck", "nv", UnicodeProperty.NUMERIC, version)
.setValues("<number>"));
*/
// Now the derived properties
if (DEBUG)
System.out.println("Derived Properties");
for (int i = 0; i < DerivedProperty.DERIVED_PROPERTY_LIMIT; ++i) {
UCDProperty prop = DerivedProperty.make(i);
if (prop == null)
continue;
if (!prop.isStandard())
continue;
String name = prop.getName();
if (getProperty(name) != null) {
if (DEBUG)
System.out.println("Iterated Names: " + name + ", ALREADY PRESENT*");
continue; // skip if already there
}
int type = prop.getValueType();
if (i == UCD_Types.FC_NFKC_Closure)
type = UnicodeProperty.STRING;
else if (i == UCD_Types.FullCompExclusion)
type = UnicodeProperty.BINARY;
else
type = remapUCDType(type);
if (DEBUG)
System.out.println(prop.getName());
add(new UCDPropertyWrapper(prop, type, false));
}
// then the general stuff
if (DEBUG)
System.out.println("Other Properties");
List names = new ArrayList();
UnifiedProperty.getAvailablePropertiesAliases(names, ucd);
Iterator it = names.iterator();
while (it.hasNext()) {
String name = (String) it.next();
if (getProperty(name) != null) {
if (DEBUG)
System.out.println("Iterated Names: " + name + ", ALREADY PRESENT");
continue; // skip if already there
}
if (DEBUG)
System.out.println("Iterated Names: " + name);
add(new ToolUnicodeProperty(name));
}
int compositeVersion = ucd.getCompositeVersion();
if (compositeVersion >= 0x040000) add(new UnicodeProperty.UnicodeMapProperty() {
{
unicodeMap = new UnicodeMap();
unicodeMap.setErrorOnReset(true);
unicodeMap.put(0xD, "CR");
unicodeMap.put(0xA, "LF");
UnicodeProperty cat = getProperty("General_Category");
UnicodeSet temp = cat.getSet("Line_Separator").addAll(cat.getSet("Paragraph_Separator")).addAll(cat.getSet("Control")).addAll(cat.getSet("Format")).remove(0xD).remove(0xA).remove(0x200C)
.remove(0x200D);
unicodeMap.putAll(temp, "Control");
UnicodeSet graphemeExtend = getProperty("Grapheme_Extend").getSet("true");
unicodeMap.putAll(graphemeExtend, "Extend");
UnicodeProperty hangul = getProperty("Hangul_Syllable_Type");
unicodeMap.putAll(hangul.getSet("L"), "L");
unicodeMap.putAll(hangul.getSet("V"), "V");
unicodeMap.putAll(hangul.getSet("T"), "T");
unicodeMap.putAll(hangul.getSet("LV"), "LV");
unicodeMap.putAll(hangul.getSet("LVT"), "LVT");
unicodeMap.setMissing("Other");
}
}.setMain("Grapheme_Cluster_Break", "GCB", UnicodeProperty.ENUMERATED, version).addValueAliases(new String[][] { { "Control", "CN" }, { "Extend", "EX" }, { "Other", "XX" }, }, true)
.swapFirst2ValueAliases());
if (compositeVersion >= 0x040000) add(new UnicodeProperty.UnicodeMapProperty() {
{
unicodeMap = new UnicodeMap();
unicodeMap.setErrorOnReset(true);
UnicodeProperty cat = getProperty("General_Category");
unicodeMap.putAll(cat.getSet("Format").remove(0x200C).remove(0x200D), "Format");
UnicodeProperty script = getProperty("Script");
unicodeMap.putAll(script.getSet("Katakana").addAll(new UnicodeSet("[\u3031\u3032\u3033\u3034\u3035\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]")), "Katakana");
Object foo = unicodeMap.getSet("Katakana");
UnicodeSet graphemeExtend = getProperty("Grapheme_Extend").getSet("true");
UnicodeProperty lineBreak = getProperty("Line_Break");
unicodeMap.putAll(getProperty("Alphabetic").getSet("true").add(0x05F3).removeAll(getProperty("Ideographic").getSet("true")).removeAll(unicodeMap.getSet("Katakana"))
//.removeAll(script.getSet("Thai"))
//.removeAll(script.getSet("Lao"))
.removeAll(lineBreak.getSet("SA")).removeAll(script.getSet("Hiragana")).removeAll(graphemeExtend), "ALetter");
unicodeMap.putAll(new UnicodeSet("[\\u0027\\u00B7\\u05F4\\u2019\\u2027\\u003A]"), "MidLetter");
unicodeMap.putAll(lineBreak.getSet("Infix_Numeric").remove(0x003A), "MidNum");
unicodeMap.putAll(lineBreak.getSet("Numeric"), "Numeric");
unicodeMap.putAll(cat.getSet("Connector_Punctuation").remove(0x30FB).remove(0xFF65), "ExtendNumLet");
unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it.
unicodeMap.setMissing("Other");
}
}.setMain("Word_Break", "WB", UnicodeProperty.ENUMERATED, version).addValueAliases(
new String[][] { { "Format", "FO" }, { "Katakana", "KA" }, { "ALetter", "LE" }, { "MidLetter", "ML" }, { "MidNum", "MN" }, { "Numeric", "NU" }, { "ExtendNumLet", "EX" }, { "Other", "XX" }, },
true).swapFirst2ValueAliases());
if (compositeVersion >= 0x040000) add(new UnicodeProperty.UnicodeMapProperty() {
{
unicodeMap = new UnicodeMap();
unicodeMap.setErrorOnReset(true);
unicodeMap.putAll(new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]"), "Sep");
UnicodeProperty cat = getProperty("General_Category");
unicodeMap.putAll(cat.getSet("Format").remove(0x200C).remove(0x200D), "Format");
unicodeMap.putAll(getProperty("Whitespace").getSet("true").removeAll(unicodeMap.getSet("Sep")).remove(0xA0), "Sp");
UnicodeSet graphemeExtend = getProperty("Grapheme_Extend").getSet("true");
unicodeMap.putAll(getProperty("Lowercase").getSet("true").removeAll(graphemeExtend), "Lower");
unicodeMap.putAll(getProperty("Uppercase").getSet("true").addAll(cat.getSet("Titlecase_Letter")), "Upper");
UnicodeSet temp = getProperty("Alphabetic").getSet("true").add(0xA0).add(0x5F3).removeAll(unicodeMap.getSet("Lower")).removeAll(unicodeMap.getSet("Upper")).removeAll(graphemeExtend);
unicodeMap.putAll(temp, "OLetter");
UnicodeProperty lineBreak = getProperty("Line_Break");
unicodeMap.putAll(lineBreak.getSet("Numeric"), "Numeric");
unicodeMap.put(0x002E, "ATerm");
unicodeMap.putAll(getProperty("STerm").getSet("true").removeAll(unicodeMap.getSet("ATerm")), "STerm");
unicodeMap.putAll(cat.getSet("Open_Punctuation").addAll(cat.getSet("Close_Punctuation")).addAll(lineBreak.getSet("Quotation")).remove(0x05F3).removeAll(unicodeMap.getSet("ATerm")).removeAll(
unicodeMap.getSet("STerm")), "Close");
unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it.
unicodeMap.setMissing("Other");
}
}.setMain("Sentence_Break", "SB", UnicodeProperty.ENUMERATED, version).addValueAliases(
new String[][] { { "Sep", "SE" }, { "Format", "FO" }, { "Sp", "SP" }, { "Lower", "LO" }, { "Upper", "UP" }, { "OLetter", "LE" }, { "Numeric", "NU" }, { "ATerm", "AT" }, { "STerm", "ST" },
{ "Close", "CL" }, { "Other", "XX" }, }, false).swapFirst2ValueAliases());
}
// Value-alias tables passed to setValues(...) by the *_Quick_Check properties in this file.
// Each short table is parallel to its long table (same index = same value).
// Note the element order: despite the names, entries run No, (Maybe,) Yes.
static String[] YES_NO_MAYBE = { "N", "M", "Y" };
static String[] LONG_YES_NO_MAYBE = { "No", "Maybe", "Yes" };
static String[] YES_NO = { "N", "Y" };
static String[] LONG_YES_NO = { "No", "Yes" };
/*
"Bidi_Mirroring_Glyph", "Block", "Case_Folding", "Case_Sensitive", "ISO_Comment",
"Lowercase_Mapping", "Name", "Numeric_Value", "Simple_Case_Folding",
"Simple_Lowercase_Mapping", "Simple_Titlecase_Mapping", "Simple_Uppercase_Mapping",
"Titlecase_Mapping", "Unicode_1_Name", "Uppercase_Mapping", "isCased", "isCasefolded",
"isLowercase", "isNFC", "isNFD", "isNFKC", "isNFKD", "isTitlecase", "isUppercase",
"toNFC", "toNFD", "toNFKC", "toNKFD"
});
*/
/*
private class NameProperty extends UnicodeProperty.SimpleProperty {
{set("Name", "na", "<string>", UnicodeProperty.STRING);}
public String getPropertyValue(int codepoint) {
if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null;
return ucd.getName(codepoint);
}
}
*/
/**
 * Adapts a {@link UCDProperty} to the {@link UnicodeProperty} API.
 * <p>
 * The wrapper takes its name from the wrapped property (long name, falling back to the
 * short name), reports the wrapped property's long-form values, and synthesizes value
 * aliases for binary properties (True/T, False/F) and, when {@code yes_no_maybe} is set,
 * for quick-check style properties (Yes/Y, No/N, Maybe/M).
 */
static class UCDPropertyWrapper extends UnicodeProperty {
    UCDProperty ucdProperty;
    boolean yes_no_maybe;

    /**
     * @param ucdProperty the property to wrap
     * @param type the UnicodeProperty type code to report for this property
     * @param yes_no_maybe whether Yes/No/Maybe values and aliases should be exposed
     */
    UCDPropertyWrapper(UCDProperty ucdProperty, int type, boolean yes_no_maybe) {
        this.ucdProperty = ucdProperty;
        setType(type);
        String name = ucdProperty.getName(UCDProperty.LONG);
        if (name == null) {
            // Fall back to the short name. The original code called getName(SHORT)
            // here but discarded the result, so setName(null) was always reached.
            name = ucdProperty.getName(UCDProperty.SHORT);
        }
        setName(name);
        this.yes_no_maybe = yes_no_maybe;
    }

    /** Returns the version of the UCD that the wrapped property belongs to. */
    protected String _getVersion() {
        return ucdProperty.getUCD().getVersion();
    }

    /**
     * Returns the wrapped property's long-form value for the code point;
     * an empty value from the wrapped property is reported as "False".
     */
    protected String _getValue(int codepoint) {
        String result = ucdProperty.getValue(codepoint, UCDProperty.LONG);
        if (result.length() == 0) {
            return "False";
        }
        return result;
    }

    /** Adds the short name, then the full name; "White_Space" additionally gets the alias "space". */
    protected List _getNameAliases(List result) {
        addUnique(ucdProperty.getName(UCDProperty.SHORT), result);
        String name = getName();
        addUnique(name, result);
        if (name.equals("White_Space"))
            addUnique("space", result);
        return result;
    }

    /**
     * Adds the aliases of one value: the one-letter form first (where one is defined),
     * then the value itself. Nothing is added when the property is neither binary
     * nor yes/no/maybe.
     */
    protected List _getValueAliases(String valueAlias, List result) {
        if (isType(BINARY_MASK)) {
            if (valueAlias.equals("True"))
                addUnique("T", result);
            else if (valueAlias.equals("False"))
                addUnique("F", result);
            addUnique(valueAlias, result);
        }
        if (yes_no_maybe) {
            if (valueAlias.equals("Yes"))
                addUnique("Y", result);
            else if (valueAlias.equals("No"))
                addUnique("N", result);
            else if (valueAlias.equals("Maybe"))
                addUnique("M", result);
            addUnique(valueAlias, result);
        }
        return result;
    }

    /** Adds True/False for binary properties and No/Maybe/Yes when {@code yes_no_maybe} is set. */
    protected List _getAvailableValues(List result) {
        if (isType(BINARY_MASK)) {
            addUnique("True", result);
            addUnique("False", result);
        }
        if (yes_no_maybe) {
            addUnique("No", result);
            addUnique("Maybe", result);
            addUnique("Yes", result);
        }
        return result;
    }
}
static final int ODD_BALLS = (1 << UCD_Types.Cn) | (1 << UCD_Types.Co) | (1 << UCD_Types.Cs) | (1 << UCD.Cc);
/* (non-Javadoc)
* @see com.ibm.icu.dev.test.util.UnicodePropertySource#getPropertyAliases(java.util.Collection)
*/
private class ToolUnicodeProperty extends UnicodeProperty {
com.ibm.text.UCD.UCDProperty up;
int propMask;
static final int EXTRA_START = 0x10000;
/**
 * Builds a property for the given alias, backed by the UCDProperty that
 * UnifiedProperty makes for it.
 *
 * @param propertyAlias alias passed to UnifiedProperty.getPropmask; also used as this property's name
 * @throws IllegalArgumentException if UnifiedProperty.make returns null for the resolved mask
 */
private ToolUnicodeProperty(String propertyAlias) {
propMask = UnifiedProperty.getPropmask(propertyAlias, ucd);
up = UnifiedProperty.make(propMask, ucd);
if (up == null)
throw new IllegalArgumentException("Not found: " + propertyAlias);
// NOTE(review): debug output for one specific alias. It calls getType() before
// setType(...) below has run, so it presumably prints the not-yet-initialized type — confirm it is still wanted.
if (propertyAlias.equals("Case_Fold_Turkish_I")) {
System.out.println(propertyAlias + " " + getTypeName(getType()));
}
// getPropertyTypeInternal() reads propMask and up, so it must run after both are assigned.
setType(getPropertyTypeInternal());
setName(propertyAlias);
}
/**
 * Lists the values this property can take.
 * <ul>
 * <li>STRING / MISC: the placeholder {@code "<string>"}</li>
 * <li>NUMERIC: the placeholder {@code "<number>"}</li>
 * <li>BINARY: "True" and "False"</li>
 * <li>ENUMERATED / CATALOG: the long name of every index 0..255 that yields a
 *     non-empty name other than UNUSED, passed through Utility.getUnskeleton</li>
 * </ul>
 *
 * @param result list to append to; a new ArrayList is created when null
 * @return the list that was appended to
 * @throws IllegalArgumentException if the property is enumerated/catalog but its
 *         property group is not one of the groups handled below
 */
public List _getAvailableValues(List result) {
    if (result == null)
        result = new ArrayList();
    int type = getType() & CORE_MASK;
    if (type == STRING || type == MISC) {
        result.add("<string>");
    } else if (type == NUMERIC) {
        result.add("<number>");
    } else if (type == BINARY) {
        result.add("True");
        result.add("False");
    } else if (type == ENUMERATED || type == CATALOG) {
        byte style = UCD_Types.LONG;
        int prop = propMask >> 8;
        String temp = null;
        // Once set by a NUMERIC_TYPE or SCRIPT lookup this stays true for the rest of the scan.
        boolean titlecase = false;
        for (int i = 0; i < 256; ++i) {
            try {
                switch (prop) {
                case UCD_Types.CATEGORY >> 8:
                    temp = (ucd.getCategoryID_fromIndex((byte) i, style));
                    break;
                case UCD_Types.COMBINING_CLASS >> 8:
                    temp = (ucd.getCombiningClassID_fromIndex((short) i, style));
                    break;
                case UCD_Types.BIDI_CLASS >> 8:
                    temp = (ucd.getBidiClassID_fromIndex((byte) i, style));
                    break;
                case UCD_Types.DECOMPOSITION_TYPE >> 8:
                    temp = (ucd.getDecompositionTypeID_fromIndex((byte) i, style));
                    break;
                case UCD_Types.NUMERIC_TYPE >> 8:
                    temp = (ucd.getNumericTypeID_fromIndex((byte) i, style));
                    titlecase = true;
                    break;
                case UCD_Types.EAST_ASIAN_WIDTH >> 8:
                    temp = (ucd.getEastAsianWidthID_fromIndex((byte) i, style));
                    break;
                case UCD_Types.LINE_BREAK >> 8:
                    temp = (ucd.getLineBreakID_fromIndex((byte) i, style));
                    break;
                case UCD_Types.JOINING_TYPE >> 8:
                    temp = (ucd.getJoiningTypeID_fromIndex((byte) i, style));
                    break;
                case UCD_Types.JOINING_GROUP >> 8:
                    temp = (ucd.getJoiningGroupID_fromIndex((byte) i, style));
                    break;
                case UCD_Types.SCRIPT >> 8:
                    temp = (ucd.getScriptID_fromIndex((byte) i, style));
                    titlecase = true;
                    if (UnicodeProperty.UNUSED.equals(temp))
                        continue; // skip unused script slots
                    if (temp != null)
                        temp = UCharacter.toTitleCase(Locale.ENGLISH, temp, null);
                    break;
                case UCD_Types.AGE >> 8:
                    temp = (ucd.getAgeID_fromIndex((byte) i, style));
                    break;
                case UCD_Types.HANGUL_SYLLABLE_TYPE >> 8:
                    temp = (ucd.getHangulSyllableTypeID_fromIndex((byte) i, style));
                    break;
                default:
                    throw new IllegalArgumentException("Internal Error: " + prop);
                }
            } catch (ArrayIndexOutOfBoundsException e) {
                // Deliberately ignored: presumably index i lies outside the name table
                // for this property, so there is simply no value to report for it.
                continue;
            }
            if (temp != null && temp.length() != 0 && !temp.equals(UNUSED)) {
                result.add(Utility.getUnskeleton(temp, titlecase));
            }
        }
        //if (prop == (UCD_Types.DECOMPOSITION_TYPE>>8)) result.add("none");
        //if (prop == (UCD_Types.JOINING_TYPE>>8)) result.add("Non_Joining");
        //if (prop == (UCD_Types.NUMERIC_TYPE>>8)) result.add("None");
    }
    return result;
}
/**
 * Collects this property's name aliases: the short name first, then the long name,
 * both passed through Utility.getUnskeleton.
 *
 * @param result list to add to; a new ArrayList is created when null
 * @return the list that was added to
 */
public List _getNameAliases(List result) {
    List aliases = (result != null) ? result : new ArrayList();

    String shortAlias = Utility.getUnskeleton(up.getName(UCD_Types.SHORT), false);
    addUnique(shortAlias, aliases);

    String longName = up.getName(UCD_Types.LONG);
    String longAlias = Utility.getUnskeleton(longName, true);
    addUnique(longAlias, aliases);

    // Special case: White_Space is additionally known as "space".
    if (longName.equals("White_Space")) {
        addUnique("space", aliases);
    }
    return aliases;
}
/**
 * Collects the aliases of one property value.
 * <p>
 * String, misc and numeric properties have no aliases beyond the value itself.
 * Binary properties are resolved through the YN tables. Enumerated and catalog
 * properties are resolved through the long/short name tables of their property group.
 * <p>
 * The original implementation wrapped the switch in a 256-iteration loop whose index
 * was never used: every case returns or throws, so a failing lookup was simply
 * repeated 256 times before giving up. The lookup is now attempted once. This relies
 * on addUnique not adding duplicates, so that the repeated attempts had no
 * additional effect on {@code result}.
 *
 * @param valueAlias the value whose aliases are wanted
 * @param result list to add to; a new ArrayList is created when null
 * @return the list that was added to
 * @throws IllegalArgumentException if the property is enumerated/catalog but its
 *         property group is not handled below
 * @throws ArrayIndexOutOfBoundsException with message "not supported yet" if the
 *         property type is not handled, or if a table lookup ran out of bounds
 */
public List _getValueAliases(String valueAlias, List result) {
    if (result == null)
        result = new ArrayList();
    int type = getType() & CORE_MASK;
    if (type == STRING || type == MISC || type == NUMERIC) {
        UnicodeProperty.addUnique(valueAlias, result);
        return result;
    } else if (type == BINARY) {
        UnicodeProperty.addUnique(valueAlias, result);
        return lookup(valueAlias, UCD_Names.YN_TABLE_LONG, UCD_Names.YN_TABLE, null, result);
    } else if (type == ENUMERATED || type == CATALOG) {
        int prop = propMask >> 8;
        try {
            switch (prop) {
            case UCD_Types.CATEGORY >> 8:
                return lookup(valueAlias, UCD_Names.LONG_GENERAL_CATEGORY, UCD_Names.GENERAL_CATEGORY, UCD_Names.EXTRA_GENERAL_CATEGORY, result);
            case UCD_Types.COMBINING_CLASS >> 8:
                // The numeric class is itself an alias, listed before the short name.
                addUnique(String.valueOf(0xFF & Utility.lookup(valueAlias, UCD_Names.LONG_COMBINING_CLASS, true)), result);
                return lookup(valueAlias, UCD_Names.LONG_COMBINING_CLASS, UCD_Names.COMBINING_CLASS, null, result);
            case UCD_Types.BIDI_CLASS >> 8:
                return lookup(valueAlias, UCD_Names.LONG_BIDI_CLASS, UCD_Names.BIDI_CLASS, null, result);
            case UCD_Types.DECOMPOSITION_TYPE >> 8:
                return lookup(valueAlias, UCD_Names.LONG_DECOMPOSITION_TYPE, UCD_Names.DECOMPOSITION_TYPE, null, result);
            case UCD_Types.NUMERIC_TYPE >> 8:
                return lookup(valueAlias, UCD_Names.LONG_NUMERIC_TYPE, UCD_Names.NUMERIC_TYPE, null, result);
            case UCD_Types.EAST_ASIAN_WIDTH >> 8:
                return lookup(valueAlias, UCD_Names.LONG_EAST_ASIAN_WIDTH, UCD_Names.EAST_ASIAN_WIDTH, null, result);
            case UCD_Types.LINE_BREAK >> 8:
                lookup(valueAlias, UCD_Names.LONG_LINE_BREAK, UCD_Names.LINE_BREAK, null, result);
                // The misspelling "Inseperable" is kept as an extra alias of Inseparable.
                if (valueAlias.equals("Inseparable"))
                    addUnique("Inseperable", result);
                return result;
            case UCD_Types.JOINING_TYPE >> 8:
                return lookup(valueAlias, UCD_Names.LONG_JOINING_TYPE, UCD_Names.JOINING_TYPE, null, result);
            case UCD_Types.JOINING_GROUP >> 8:
                return lookup(valueAlias, UCD_Names.JOINING_GROUP, null, null, result);
            case UCD_Types.SCRIPT >> 8:
                return lookup(valueAlias, UCD_Names.LONG_SCRIPT, UCD_Names.SCRIPT, UCD_Names.EXTRA_SCRIPT, result);
            case UCD_Types.AGE >> 8:
                return lookup(valueAlias, UCD_Names.AGE, null, null, result);
            case UCD_Types.HANGUL_SYLLABLE_TYPE >> 8:
                return lookup(valueAlias, UCD_Names.LONG_HANGUL_SYLLABLE_TYPE, UCD_Names.HANGUL_SYLLABLE_TYPE, null, result);
            default:
                throw new IllegalArgumentException("Internal Error: " + prop);
            }
        } catch (ArrayIndexOutOfBoundsException ignored) {
            // A table lookup ran out of bounds; report it below as "not supported yet",
            // exactly as the original did after exhausting its retries.
        }
    }
    throw new ArrayIndexOutOfBoundsException("not supported yet");
}
/**
 * Returns the long-form value of this property for a code point.
 * <p>
 * For the property groups handled by the switch, the code point's raw value is read
 * from {@code ucd}, converted to its long name, and passed through
 * Utility.getUnskeleton. For any other property, a "True"/"False" answer from
 * {@code up.hasValue} is returned when this property is of binary type.
 *
 * @param codepoint the code point to look up
 * @return the value name, or "True"/"False" for binary properties
 * @throws IllegalArgumentException if no value name was produced and the property is not binary
 */
public String _getValue(int codepoint) {
byte style = UCD_Types.LONG;
String temp = null;
// Passed to Utility.getUnskeleton below; only NUMERIC_TYPE and SCRIPT turn it on.
boolean titlecase = false;
switch (propMask >> 8) {
case UCD_Types.CATEGORY >> 8:
temp = (ucd.getCategoryID_fromIndex(ucd.getCategory(codepoint), style));
break;
case UCD_Types.COMBINING_CLASS >> 8:
temp = (ucd.getCombiningClassID_fromIndex(ucd.getCombiningClass(codepoint), style));
//if (temp.startsWith("Fixed_")) temp = temp.substring(6);
break;
case UCD_Types.BIDI_CLASS >> 8:
temp = (ucd.getBidiClassID_fromIndex(ucd.getBidiClass(codepoint), style));
break;
case UCD_Types.DECOMPOSITION_TYPE >> 8:
temp = (ucd.getDecompositionTypeID_fromIndex(ucd.getDecompositionType(codepoint), style));
// A missing or empty name is reported as "none".
if (temp == null || temp.length() == 0)
temp = "none";
break;
case UCD_Types.NUMERIC_TYPE >> 8:
temp = (ucd.getNumericTypeID_fromIndex(ucd.getNumericType(codepoint), style));
titlecase = true;
// A missing or empty name is reported as "None".
if (temp == null || temp.length() == 0)
temp = "None";
break;
case UCD_Types.EAST_ASIAN_WIDTH >> 8:
temp = (ucd.getEastAsianWidthID_fromIndex(ucd.getEastAsianWidth(codepoint), style));
break;
case UCD_Types.LINE_BREAK >> 8:
temp = (ucd.getLineBreakID_fromIndex(ucd.getLineBreak(codepoint), style));
break;
case UCD_Types.JOINING_TYPE >> 8:
temp = (ucd.getJoiningTypeID_fromIndex(ucd.getJoiningType(codepoint), style));
// A missing or empty name is reported as "Non_Joining".
if (temp == null || temp.length() == 0)
temp = "Non_Joining";
break;
case UCD_Types.JOINING_GROUP >> 8:
temp = (ucd.getJoiningGroupID_fromIndex(ucd.getJoiningGroup(codepoint), style));
break;
case UCD_Types.SCRIPT >> 8:
temp = (ucd.getScriptID_fromIndex(ucd.getScript(codepoint), style));
// Script names are title-cased before being unskeletonized.
if (temp != null)
temp = UCharacter.toTitleCase(Locale.ENGLISH, temp, null);
titlecase = true;
break;
case UCD_Types.AGE >> 8:
// Age is computed by getAge(...) rather than read from a name table.
temp = getAge(codepoint);
break;
case UCD_Types.HANGUL_SYLLABLE_TYPE >> 8:
temp = (ucd.getHangulSyllableTypeID_fromIndex(ucd.getHangulSyllableType(codepoint), style));
break;
}
// No default case: properties outside the groups above leave temp == null and fall through.
if (temp != null)
return Utility.getUnskeleton(temp, titlecase);
if (isType(BINARY_MASK)) {
return up.hasValue(codepoint) ? "True" : "False";
}
throw new IllegalArgumentException("Failed to find value for " + Utility.hex(codepoint));
}
/**
 * Returns the Age value name for a code point.
 * <p>
 * On first use (while {@code needAgeCache} is set) this fills {@code ucdCache} with one
 * UCD per entry of UCD_Names.AGE_VERSIONS, for indices AGE11 up to but excluding
 * LIMIT_AGE. The result is UCD_Names.AGE[i] for the first index i, scanning upward from
 * AGE11, whose cached UCD reports the code point as allocated; if none does, the
 * result is UCD_Names.AGE[UNKNOWN].
 *
 * @param codePoint the code point to look up
 * @return the Age value name
 */
public String getAge(int codePoint) {
// Debugging hook: prints a marker when asked about U+F0000.
if (codePoint == 0xF0000) {
System.out.println("debug point");
}
// Lazily build the per-version UCD cache.
// NOTE(review): needAgeCache and ucdCache are declared outside this class and there is no synchronization here — confirm single-threaded use.
if (needAgeCache) {
for (int i = UCD_Types.AGE11; i < UCD_Types.LIMIT_AGE; ++i) {
ucdCache[i] = UCD.make(UCD_Names.AGE_VERSIONS[i]);
}
needAgeCache = false;
}
// First version (in index order) that allocates the code point wins.
for (int i = UCD_Types.AGE11; i < UCD_Types.LIMIT_AGE; ++i) {
if (ucdCache[i].isAllocated(codePoint))
return UCD_Names.AGE[i];
}
return UCD_Names.AGE[UCD_Types.UNKNOWN];
}
/**
 * Computes the UnicodeProperty type code for this property from {@code propMask}
 * and the wrapped property {@code up}.
 * <ol>
 * <li>Two specific binary properties (CaseFoldTurkishI, Non_break) are EXTENDED_BINARY.</li>
 * <li>The Script and Age property groups are CATALOG.</li>
 * <li>Otherwise the wrapped property's value type is remapped via remapUCDType,
 *     with EXTENDED_MASK OR-ed in when the wrapped property is not standard.</li>
 * </ol>
 */
private int getPropertyTypeInternal() {
    boolean isTurkishFold = propMask == (UCD_Types.BINARY_PROPERTIES | UCD_Types.CaseFoldTurkishI);
    boolean isNonBreak = propMask == (UCD_Types.BINARY_PROPERTIES | UCD_Types.Non_break);
    if (isTurkishFold || isNonBreak) {
        return EXTENDED_BINARY;
    }

    int propertyGroup = propMask >> 8;
    if (propertyGroup == (UCD_Types.SCRIPT >> 8) || propertyGroup == (UCD_Types.AGE >> 8)) {
        return CATALOG;
    }

    int extendedBits = up.isStandard() ? 0 : EXTENDED_MASK;
    return remapUCDType(up.getValueType()) | extendedBits;
}
/** Returns the version string of the UCD referenced by the wrapped property ({@code up.ucd}). */
public String _getVersion() {
return up.ucd.getVersion();
}
}
/**
 * Translates a UCD_Types value-type code into the corresponding UnicodeProperty
 * type code.
 *
 * @param result a UCD_Types *_PROP value-type code
 * @return NUMERIC, ENUMERATED or BINARY for the matching codes; STRING for
 *         string, misc, unknown and any unrecognized code
 */
private int remapUCDType(int result) {
    switch (result) {
    case UCD_Types.NUMERIC_PROP:
        return UnicodeProperty.NUMERIC;
    case UCD_Types.CATALOG_PROP:
    case UCD_Types.FLATTENED_BINARY_PROP:
    case UCD_Types.ENUMERATED_PROP:
        // Catalog and flattened-binary properties are reported as plain enumerations.
        return UnicodeProperty.ENUMERATED;
    case UCD_Types.BINARY_PROP:
        return UnicodeProperty.BINARY;
    case UCD_Types.STRING_PROP:
    case UCD_Types.MISC_PROP:
    case UCD_Types.UNKNOWN_PROP:
    default:
        // Unknown codes are tolerated and treated as strings rather than rejected.
        return UnicodeProperty.STRING;
    }
}
/**
 * Adds the aliases of {@code valueAlias} to {@code result}, in this order:
 * the parallel entry from {@code aux} (if {@code aux} is given), the value itself,
 * and the extra alias mapped in {@code aux2} (if {@code aux2} is given and has one).
 *
 * @param valueAlias the value to resolve
 * @param main table searched with Utility.lookup to find the value's index
 * @param aux table parallel to {@code main} supplying the alternate name; may be null
 * @param aux2 optional map from value to one extra alias; may be null
 * @param result list to add to
 * @return {@code result}
 */
static List lookup(String valueAlias, String[] main, String[] aux, Map aux2, List result) {
    if (aux != null) {
        // The lookup result is masked to 0..255 before indexing; an index outside
        // aux surfaces as ArrayIndexOutOfBoundsException to the caller.
        int index = Utility.lookup(valueAlias, main, true) & 0xFF;
        String parallelAlias = aux[index];
        UnicodeProperty.addUnique(parallelAlias, result);
    }
    UnicodeProperty.addUnique(valueAlias, result);
    if (aux2 == null) {
        return result;
    }
    String extraAlias = (String) aux2.get(valueAlias);
    if (extraAlias != null) {
        UnicodeProperty.addUnique(extraAlias, result);
    }
    return result;
}
/*
static class DerivedPropertyWrapper extends UnicodeProperty {
UCDProperty derivedProperty;
UCD ucd;
DerivedPropertyWrapper(int derivedPropertyID, UCD ucd) {
this.ucd = ucd;
derivedProperty = DerivedProperty.make(derivedPropertyID, ucd);
}
protected String _getVersion() {
return ucd.getVersion();
}
protected String _getValue(int codepoint) {
return derivedProperty.getValue(codepoint, UCD_Types.LONG);
}
protected List _getNameAliases(List result) {
if (result != null) result = new ArrayList(1);
addUnique(derivedProperty.getName(UCD_Types.SHORT), result);
addUnique(derivedProperty.getName(UCD_Types.LONG), result);
return null;
}
protected List _getValueAliases(String valueAlias, List result) {
// TODO Auto-generated method stub
return null;
}
protected List _getAvailableValues(List result) {
// TODO Auto-generated method stub
return null;
}
}
*/
}

View File

@ -1,226 +0,0 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=windows-1252">
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
<meta name="ProgId" content="FrontPage.Editor.Document">
<title>Unicode Character Database</title>
<style>
<!--
table { padding: 4 }
td { padding: 4 }
-->
</style>
</head>
<body>
<span class="cb" id style="DISPLAY: block">
<h1 align="center">Unicode Character Database (UCD) in XML Format</h1>
<h1 align="center"><b><font color="#FF0000">WARNING: FORMAT IS DRAFT!</font></b></h1>
<p align="center">MD 2000.10.16</p>
<table border="1" width="40%" align="right" cellspacing="4" cellpadding="0">
<tr>
<td width="100%" bgcolor="#C0C0C0"><span class="cb" id
style="DISPLAY: block">
<h4 align="center">Using Internet Explorer</h4>
<p>The UCD-Main.xml file can be read in Internet Explorer (5.0 and above).
However:</p>
<ul>
<li>It may take a few minutes to load completely.</li>
<li>The XML parser in IE does not appear to be conformant: it seems to
break on</span> the following valid code points (and others):
<ul>
<li>&lt;IEbugs<br>
c1='&amp;#xFFF9;'<br>
c2='&amp;#xFFFA;'<br>
c3='&amp;#xFFFB;'<br>
c4='&amp;#xFFFC;'<br>
c5='&amp;#xFFFD;'<br>
c6='&amp;#xF0000;'<br>
c7='&amp;#xFFFFD;'<br>
c8='&amp;#x100000;'<br>
c9='&amp;#x10FFFD;'/&gt;</li>
</ul>
</li>
</ul>
</td>
</tr>
</table>
<p><a href="UCD-Main.xml">UCD-Main.xml</a> provides an XML format for the main
files in the Unicode Character Database. These include:</p>
<ul>
<li><code>UnicodeData.txt</code></li>
<li><code>ArabicShaping.txt</code></li>
<li><code>Jamo.txt</code></li>
<li><code>SpecialCasing.txt</code></li>
<li><code>CompositionExclusions.txt</code></li>
<li><code>EastAsianWidth.txt</code></li>
<li><code>LineBreak.txt</code></li>
<li><code>BidiMirroring.txt</code></li>
<li><code>CaseFolding.txt</code></li>
<li><code>Blocks.txt</code></li>
<li><code>PropList.alpha.txt</code></li>
</ul>
<p>Other files in the UCD have very different structure or purpose, and are best
expressed with separate files. Some annotational data, such as that in
NamesList.txt or the 10646 comment in UnicodeData, is also best served with
separate files. The current UCD files not yet in XML format are:</p>
<ul>
<li><code>Unihan.txt</code></li>
<li><code>NamesList.txt</code></li>
<li><code>Index.txt</code></li>
<li><code>NormalizationTest.txt</code></li>
</ul>
<h3>Format</h3>
<p>The Unicode blocks are provided as a list of &lt;block .../&gt; elements,
with attributes providing the start, end, and name.</p>
<p>Each assigned code point is a &lt;e .../&gt; element, with attributes
supplying specific properties. The meaning of the attributes is specified below.
There is one exception: large ranges of code points&nbsp; for characters such as
Hangul Syllables are abbreviated by indicating the start and end of the range.</p>
<p>Because of the volume of data, the attribute names are abbreviated. A <a
href="#AttributeAbbreviations">key</a> explains the abbreviations, and relates
them to the fields and values of the original UCD semicolon-delimited files.
With few exceptions, the values in the XML are directly copied from data in the
original UCD semicolon-delimited files. Those exceptions are described <a
href="#DataModifications">below</a>.</p>
<p>Numeric character references (NCRs) are used to encode the Unicode code
points. Some Unicode code points cannot be transmitted in XML, even as NCRs (see
<a href="http://www.w3.org/TR/REC-xml#charsets">http://www.w3.org/TR/REC-xml#charsets</a>),
or would not be visibly distinct (TAB, CR, LF) in the data. Such code points are
represented by '#xX;', where X is a hex number.</p>
<h3><a name="AttributeAbbreviations">Attribute Abbreviations</a></h3>
<p>To reduce the size of the document, the following attribute abbreviations are
used. If an attribute is missing, that means it gets a default value. The
defaults are listed in parentheses below. If there is no specific default, then
a missing attribute should be read as N/A (not applicable). A default with '='
means the default is the value of another field (recursively!). Thus if
the titlecase attribute is missing, then the value is the same as the uppercase.
If that in turn is missing, then the value is the same as the code point itself.</p>
<p>For a description of the source files, see <a
href="http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html">UnicodeCharacterDatabase.html</a>.
That file also has links to the descriptions of the fields within the files.
Since the PropList values are so long, they will probably also be abbreviated in
the future.</p>
<table border="1" width="100%">
<tr>
<td width="50%" valign="top"><span class="cb" id style="DISPLAY: block">
<h4>UnicodeData</h4>
<p>&nbsp; c: code point<br>
&nbsp; n: name<br>
&nbsp; gc: general category (Lo)<br>
&nbsp; cc: combining class (0)<br>
&nbsp; bc: bidi category (L)<br>
&nbsp; dm: decomposition mapping<br>
&nbsp; dt: decomposition type (canonical)<br>
&nbsp; nt: numeric type<br>
&nbsp; nv: numeric value<br>
&nbsp; bm: bidi mirrored (N)<br>
&nbsp; uc: uppercase (=c)<br>
&nbsp; lc: lowercase (=c)<br>
&nbsp; tc: titlecase (=uc)</p>
<h4>SpecialCasing:</h4>
<p>&nbsp; sl: special lower (=lc)<br>
&nbsp; su: special upper (=uc)<br>
&nbsp; st: special title (=su)<br>
&nbsp; sc: special case condition</p>
<h4>CaseFolding:</h4>
<p>&nbsp; fc: foldcase (=sl)</span></td>
<td width="50%" valign="top"><span class="cb" id style="DISPLAY: block">
<h4>CompositionExclusions:</h4>
<p>&nbsp; ce: composition exclusion (N)</p>
<h4>EastAsianWidth:</h4>
<p>&nbsp; ea: east asian width (N)</p>
<h4>Jamo:</h4>
<p>&nbsp; jn: jamo name</p>
<h4>LineBreak:</h4>
<p>&nbsp; lb: line break class (AL)</p>
<h4>ArabicShaping:</h4>
<p>&nbsp; jt: joining type<br>
&nbsp; jg: joining group</p>
<h4>BidiMirroring:</h4>
<p>&nbsp; bg: bidi mirroring glyph (=c)</p>
<p><b>PropList:</b></p>
<p>&nbsp; xs: space-delimited list of properties from the file</p>
<p><b><i>WARNING: these values are likely to change!</i></b></span></td>
</tr>
</table>
<br>
<h3><a name="DataModifications">Data Modifications</a></h3>
</span>
<p>The XML format is generated from the original semicolon-delimited UCD files.
In general, all fields and values are direct copies. However, there are some
changes, detailed below.</p>
<h4>1. Some redundant or annotational fields are omitted</h4>
<table border="1" width="100%">
<tr>
<td width="50%" valign="top"><b>UnicodeData<br>
</b>1.0 Name<br>
10646 comment<br>
<br>
<b>CaseFolding<br>
</b>Type (since it is computable from whether the fold equals the normal
lowercase)
<p><b>ArabicShaping<br>
</b>Name<br>
<br>
<b>EastAsianWidth<br>
</b>Name<br>
<br>
<b>LineBreak<br>
</b>Name</p>
</td>
<td width="50%" valign="top"><b>PropList</b><font face="Times New Roman"
color="#000000">
<p>The fields are based on the proposed PropList.alpha, which changes the
fields considerably.</p>
</font>
<p><span class="cb" id style="display: block"><b><i>WARNING: other values
are also likely to change!</i></b></span></p>
</td>
</tr>
</table>
<h4>2. Some fields are broken into several fields; others may be combined into a
single field</h4>
<ul>
<li><b>dt: </b>decomposition tag
<ul>
<li>the 'tag' field extracted from the decomposition mapping. If there is
no tag, the value is &quot;canonical&quot;. Only has meaning if there is
a decomposition (<b>dm</b>).</li>
</ul>
</li>
<li><b>nt: </b>numeric type
<ul>
<li>an enumeration [decimal, digit, numeric] for the type of number. It
replaces having duplicate field values for numbers</li>
</ul>
</li>
<li><b>rg: </b>range
<ul>
<li>used for ranges of values that share characteristics, instead of
having to do a substring check.<br>
&quot;START&quot; corresponds to &quot;&lt;..., First&gt;&quot;<br>
&quot;END&quot; corresponds to &quot;&lt;..., Last&gt;&quot;</li>
</ul>
</li>
<li><b>nc: </b>name computed
<ul>
<li>if &quot;COMPUTED&quot;, indicates that the name must be computed:
e.g. Hangul Syllables, Ideographs</li>
</ul>
</li>
<li><b>na: </b>name annotation
<ul>
<li>used for code points that do not really have associated names, like
control characters and private use characters. The data in that case is
either extracted from the &quot;&lt;...&gt;&quot; style name in the old
format, or gotten from the &quot;1.0 Unicode name&quot;.</li>
</ul>
</li>
</ul>
</body>
</html>

File diff suppressed because it is too large Load Diff

View File

@ -1,180 +0,0 @@
package com.ibm.text.UCD;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.text.utility.*;
public abstract class UCDProperty implements UCD_Types {
// TODO: turn all of these into privates, and use setters only
protected UCD ucd;
protected boolean isStandard = true;
protected byte type = NOT_DERIVED;
private byte valueType = BINARY_PROP;
protected boolean hasUnassigned = false;
protected boolean isBinary = true;
protected byte defaultValueStyle = SHORT;
protected byte defaultPropertyStyle = LONG;
protected String valueName;
protected String numberValueName;
protected String shortValueName;
protected String header;
protected String subheader;
protected String name;
protected String shortName;
protected String numberName;
protected boolean skeletonize = true;
/**
* Return the UCD in use
*/
public UCD getUCD() { return ucd; }
/**
* Is it part of the standard, or just for my testing?
*/
public boolean isStandard() { return isStandard; }
public void setStandard(boolean in) { isStandard = in; }
public boolean skipInDerivedListing() {return false;}
public boolean isDefaultValue() {return false;}
/**
* What type is it? DERIVED..
*/
public byte getType() { return type; }
public void setType(byte in) { type = in; }
/**
* Does getProperty vary in contents? ENUMERATED,...
*/
public byte getValueType() { return valueType; }
public void setValueType(byte in) { valueType = in; }
/**
* Does it apply to any unassigned characters?
*/
public boolean hasUnassigned() { return hasUnassigned; }
public void setHasUnassigned(boolean in) { hasUnassigned = in; }
/** Header used in DerivedXXX files
*/
public String getHeader() { return header; }
public void setHeader(String in) { header = in; }
/** Subheader used in DerivedXXX files
*/
public String getSubHeader() { return subheader; }
public void setSubHeader(String in) { subheader = in; }
/**
* Get the full name. Style is SHORT, NORMAL, LONG
*/
public String getFullName(byte style) {
return getPropertyName(style) + "=" + getValue(style);
}
public String getFullName() {
return getFullName(NORMAL);
}
/**
* Get the property name. Style is SHORT, NORMAL, LONG
*/
public String getPropertyName(byte style) {
    // NORMAL means "whatever this property's default style is".
    byte effectiveStyle = (style == NORMAL) ? defaultPropertyStyle : style;

    if (effectiveStyle == LONG) {
        String longName = name.toString();
        // The long name is unskeletonized on the way out unless that has been switched off.
        return skeletonize ? Utility.getUnskeleton(longName, false) : longName;
    }
    if (effectiveStyle == SHORT) {
        return shortName.toString();
    }
    if (effectiveStyle == NUMBER) {
        return numberName.toString();
    }
    throw new IllegalArgumentException("Bad property: " + effectiveStyle);
}
public String getPropertyName() { return getPropertyName(NORMAL); }
/**
 * Stores the property name for one style.
 *
 * @param style SHORT, NORMAL, LONG or NUMBER; NORMAL is replaced by
 *              {@code defaultPropertyStyle} before the store
 * @param in    the name to store; for LONG it is first passed through
 *              {@code Utility.getUnskeleton(in, false)}
 * @throws IllegalArgumentException if the resolved style is none of the above
 */
public void setPropertyName(byte style, String in) {
    byte resolved = style;
    if (resolved == NORMAL) {
        resolved = defaultPropertyStyle;
    }
    if (resolved == LONG) {
        name = Utility.getUnskeleton(in, false);
    } else if (resolved == SHORT) {
        shortName = in;
    } else if (resolved == NUMBER) {
        numberName = in;
    } else {
        throw new IllegalArgumentException("Bad property: " + resolved);
    }
}
/**
 * Returns the value name for a code point.
 * Subclasses whose value varies per code point MUST override this method,
 * because this version falls back to the single value from
 * {@link #getValue(byte)}.
 *
 * @param cp    the code point to query
 * @param style SHORT, NORMAL or LONG
 * @return {@code ""} when {@code hasValue(cp)} is false, otherwise
 *         {@code getValue(style)}
 */
public String getValue(int cp, byte style) {
    return hasValue(cp) ? getValue(style) : "";
}
/**
 * Convenience overload: the value name for a code point in the NORMAL style.
 *
 * @param cp the code point to query
 * @return {@code getValue(cp, NORMAL)}
 */
public String getValue(int cp) {
    final byte style = NORMAL;
    return getValue(cp, style);
}
/**
 * Stores the value name for one style.
 *
 * @param style SHORT, NORMAL, LONG or NUMBER; NORMAL is replaced by
 *              {@code defaultValueStyle} before the store
 * @param in    the value name; for LONG it is first passed through
 *              {@code Utility.getUnskeleton(in, false)}
 * @throws IllegalArgumentException if {@code getValueType()} is below
 *         BINARY_PROP (the value varies, so no single name can be set),
 *         or if the resolved style is none of the above
 */
public void setValue(byte style, String in) {
    final boolean valueVaries = getValueType() < BINARY_PROP;
    if (valueVaries) {
        // The message reports the style exactly as the caller passed it.
        throw new IllegalArgumentException("Can't set varying value: " + style);
    }
    byte resolved = style;
    if (resolved == NORMAL) {
        resolved = defaultValueStyle;
    }
    if (resolved == LONG) {
        valueName = Utility.getUnskeleton(in, false);
    } else if (resolved == SHORT) {
        shortValueName = in;
    } else if (resolved == NUMBER) {
        numberValueName = in;
    } else {
        throw new IllegalArgumentException("Bad value: " + resolved);
    }
}
/**
 * Returns the single value name of this property in the requested style.
 *
 * @param style SHORT, NORMAL, LONG or NUMBER; NORMAL is replaced by
 *              {@code defaultValueStyle} before the lookup
 * @return for LONG, {@code valueName} run through
 *         {@code Utility.getUnskeleton}; for SHORT, {@code shortValueName};
 *         for NUMBER, {@code numberValueName}
 * @throws IllegalArgumentException if {@code getValueType()} is below
 *         BINARY_PROP, i.e. the value varies and getValue(cp) must be
 *         used instead
 * @throws com.ibm.text.utility.ChainException wrapping any RuntimeException
 *         raised during the lookup - this covers an unset (null) value
 *         string and also the "Bad property" error for an unknown style
 */
public String getValue(byte style) {
    if (getValueType() < BINARY_PROP) {
        throw new IllegalArgumentException(
            "Value varies in " + getName(LONG) + "; call getValue(cp)");
    }
    try {
        byte resolved = style;
        if (resolved == NORMAL) {
            resolved = defaultValueStyle;
        }
        if (resolved == LONG) {
            return Utility.getUnskeleton(valueName.toString(), false);
        }
        if (resolved == SHORT) {
            return shortValueName.toString();
        }
        if (resolved == NUMBER) {
            return numberValueName.toString();
        }
        throw new IllegalArgumentException("Bad property: " + resolved);
    } catch (RuntimeException e) {
        // Every runtime failure is reported under the same message.
        throw new com.ibm.text.utility.ChainException("Unset value string in " + getName(LONG), null, e);
    }
}
/**
 * Returns the text used for a code point in listings
 * (described in the original as a special hack for NFD/NFKD).
 *
 * @param cp the code point to query
 * @return the LONG property name when the value type is exactly
 *         BINARY_PROP, otherwise the LONG value for {@code cp}
 */
public String getListingValue(int cp) {
    return (getValueType() == BINARY_PROP)
        ? getPropertyName(LONG)
        : getValue(cp, LONG);
}
/**
 * Does it have the propertyValue?
 * Implemented by subclasses. Within this class it decides which code
 * points are added by getSet() and whether getValue(cp, style) returns
 * a value name or the empty string.
 *
 * @param cp the code point to test
 * @return true if the code point has this property value
 */
abstract public boolean hasValue(int cp);
// Set built on the first call to getSet(); null until then.
private UnicodeSet cache = null;

/**
 * Returns the set of characters this property contains.
 * The first call tests every code point from 0 through 0x10FFFF with
 * {@link #hasValue(int)} and stores the result; every call returns a
 * clone, so callers cannot modify the stored set.
 * Note that the cache field is assigned before it is filled, so an
 * exception thrown by hasValue part-way through leaves a partial set
 * behind for later calls.
 *
 * @return a fresh copy of the cached set
 */
public UnicodeSet getSet() {
    if (cache != null) {
        return (UnicodeSet) cache.clone();
    }
    cache = new UnicodeSet();
    int cp = 0;
    while (cp <= 0x10FFFF) {
        if (hasValue(cp)) {
            cache.add(cp);
        }
        ++cp;
    }
    return (UnicodeSet) cache.clone();
}
///////////////////////////////////////////
// Old names, kept for compatibility

/**
 * Old name kept for compatibility; forwards to {@link #isStandard()}.
 * NOTE(review): the name suggests the opposite of isStandard(), yet the
 * value is returned un-negated - confirm with callers before changing.
 *
 * @return the result of {@code isStandard()}
 */
boolean isTest() {
    final boolean standard = isStandard();
    return standard;
}
/**
 * Old name kept for compatibility; forwards to {@link #getPropertyName(byte)}.
 *
 * @param style the style handed on unchanged
 * @return the result of {@code getPropertyName(style)}
 */
String getName(byte style) {
    final String propertyName = getPropertyName(style);
    return propertyName;
}
/**
 * Old name kept for compatibility; forwards to {@link #getPropertyName()}.
 *
 * @return the result of {@code getPropertyName()}
 */
String getName() {
    final String propertyName = getPropertyName();
    return propertyName;
}
}

File diff suppressed because it is too large Load Diff

View File

@ -1,575 +0,0 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
* $Date: 2006/04/05 22:12:44 $
* $Revision: 1.33 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
/**
 * Numeric codes, limits, bit masks and directory names shared by the UCD tools.
 * <p>
 * Every member is a constant; interface fields are implicitly
 * {@code public static final}, so the modifiers are not repeated.
 * Within each group the {@code LIMIT_...} / {@code ..._LIMIT} constant is
 * one greater than the largest code of that group.
 */
public interface UCD_Types {

    /** Bumped if the binary format of the UCD changes; forces a rebuild. */
    byte BINARY_FORMAT = 17;

    // ---- Directories ----

    /** Root data directory (hard-coded Windows path). */
    String BASE_DIR = "C:\\DATA\\";
    /** UCD sub-directory of {@link #BASE_DIR}. */
    String UCD_DIR = BASE_DIR + "UCD\\";
    /** BIN sub-directory of {@link #BASE_DIR}. */
    String BIN_DIR = BASE_DIR + "BIN\\";
    /** GEN sub-directory of {@link #BASE_DIR}. */
    String GEN_DIR = BASE_DIR + "GEN\\";

    /** The DOTTED CIRCLE character, U+25CC. */
    char DOTTED_CIRCLE = '\u25CC';

    // ---- CJK ranges: BASE is the first code point, LIMIT is last + 1 ----

    int CJK_BASE = 0x4E00;
    int CJK_LIMIT = 0x9FFF + 1;
    int CJK_COMPAT_USED_BASE = 0xFA0E;
    int CJK_COMPAT_USED_LIMIT = 0xFA2F + 1;
    int CJK_A_BASE = 0x3400;
    int CJK_A_LIMIT = 0x4DBF + 1;
    int CJK_B_BASE = 0x20000;
    int CJK_B_LIMIT = 0x2A6DF + 1;

    // ---- Unicode property types ----

    byte NOT_DERIVED = 1;
    byte DERIVED_CORE = 2;
    byte DERIVED_NORMALIZATION = 4;
    byte DERIVED_ALL = 0x6;
    byte ALL = (byte) -1;

    // ---- Kinds of property value ----

    byte NUMERIC_PROP = 0;
    byte STRING_PROP = 1;
    byte MISC_PROP = 2;
    byte CATALOG_PROP = 3;
    byte ENUMERATED_PROP = 4;
    byte BINARY_PROP = 5;
    byte FLATTENED_BINARY_PROP = 6;
    byte UNKNOWN_PROP = 7;

    /*
     * Field numbers of the character data file (descriptions are excerpts):
     *  0 Code value in 4-digit hexadecimal format.
     *  1 Unicode 2.1 Character Name.
     *  2 General Category.
     *  3 Canonical Combining Classes.
     *  4 Bidirectional Category.
     *  5 Character Decomposition.
     *  6 Decimal digit value. This is a numeric field.
     *  7 Digit value. This is a numeric field.
     *  8 Numeric value. This is a numeric field.
     *  9 Whether the character has been identified as a "mirrored" character.
     * 10 Unicode 1.0 Name. This is the old name as published in Unicode 1.0.
     * 11 10646 Comment field. This field is informative.
     * 12 Upper case equivalent mapping.
     * 13 Lower case equivalent mapping. Similar to 12. This field is informative.
     * 14 Title case equivalent mapping. Similar to 12. This field is informative.
     */

    // ---- Name styles (for IDs) ----

    byte NUMBER = -2;
    byte SHORT = -1;
    byte NORMAL = 0;
    byte LONG = 1;
    byte BOTH = 2;
    byte EXTRA_ALIAS = 3;

    // ---- Binary ENUM grouping: each group starts NEXT_ENUM after the previous one ----

    int CATEGORY = 0;
    int COMBINING_CLASS = 0x100;
    int BIDI_CLASS = 0x200;
    int DECOMPOSITION_TYPE = 0x300;
    int NUMERIC_TYPE = 0x400;
    int EAST_ASIAN_WIDTH = 0x500;
    int LINE_BREAK = 0x600;
    int JOINING_TYPE = 0x700;
    int JOINING_GROUP = 0x800;
    int BINARY_PROPERTIES = 0x900;
    int SCRIPT = 0xA00;
    int AGE = 0xB00;
    int HANGUL_SYLLABLE_TYPE = 0xC00;
    int DERIVED = 0xD00;
    int LIMIT_ENUM = DERIVED + 0x100;
    int NEXT_ENUM = 0x100;

    int LIMIT_COMBINING_CLASS = 256;

    // ---- General category codes (getCategory) ----

    byte UNASSIGNED = 0;
    byte UPPERCASE_LETTER = 1;
    byte LOWERCASE_LETTER = 2;
    byte TITLECASE_LETTER = 3;
    byte MODIFIER_LETTER = 4;
    byte OTHER_LETTER = 5;
    byte NON_SPACING_MARK = 6;
    byte ENCLOSING_MARK = 7;
    byte COMBINING_SPACING_MARK = 8;
    byte DECIMAL_DIGIT_NUMBER = 9;
    byte LETTER_NUMBER = 10;
    byte OTHER_NUMBER = 11;
    byte SPACE_SEPARATOR = 12;
    byte LINE_SEPARATOR = 13;
    byte PARAGRAPH_SEPARATOR = 14;
    byte CONTROL = 15;
    byte FORMAT = 16;
    byte UNUSED_CATEGORY = 17;
    byte PRIVATE_USE = 18;
    byte SURROGATE = 19;
    byte DASH_PUNCTUATION = 20;
    byte START_PUNCTUATION = 21;
    byte END_PUNCTUATION = 22;
    byte CONNECTOR_PUNCTUATION = 23;
    byte OTHER_PUNCTUATION = 24;
    byte MATH_SYMBOL = 25;
    byte CURRENCY_SYMBOL = 26;
    byte MODIFIER_SYMBOL = 27;
    byte OTHER_SYMBOL = 28;
    byte INITIAL_PUNCTUATION = 29;
    byte FINAL_PUNCTUATION = 30;
    byte LIMIT_CATEGORY = FINAL_PUNCTUATION + 1;

    // ---- Unicode abbreviations for the general categories ----

    byte Lu = UPPERCASE_LETTER;
    byte Ll = LOWERCASE_LETTER;
    byte Lt = TITLECASE_LETTER;
    byte Lm = MODIFIER_LETTER;
    byte Lo = OTHER_LETTER;
    byte Mn = NON_SPACING_MARK;
    byte Me = ENCLOSING_MARK;
    byte Mc = COMBINING_SPACING_MARK;
    byte Nd = DECIMAL_DIGIT_NUMBER;
    byte Nl = LETTER_NUMBER;
    byte No = OTHER_NUMBER;
    byte Zs = SPACE_SEPARATOR;
    byte Zl = LINE_SEPARATOR;
    byte Zp = PARAGRAPH_SEPARATOR;
    byte Cc = CONTROL;
    byte Cf = FORMAT;
    byte Cs = SURROGATE;
    byte Co = PRIVATE_USE;
    byte Cn = UNASSIGNED;
    byte Pc = CONNECTOR_PUNCTUATION;
    byte Pd = DASH_PUNCTUATION;
    byte Ps = START_PUNCTUATION;
    byte Pe = END_PUNCTUATION;
    byte Po = OTHER_PUNCTUATION;
    byte Pi = INITIAL_PUNCTUATION;
    byte Pf = FINAL_PUNCTUATION;
    byte Sm = MATH_SYMBOL;
    byte Sc = CURRENCY_SYMBOL;
    byte Sk = MODIFIER_SYMBOL;
    byte So = OTHER_SYMBOL;

    // ---- Category bit masks: bit n stands for general category code n ----

    int LETTER_MASK = (1 << Lu) | (1 << Ll) | (1 << Lt) | (1 << Lm) | (1 << Lo);
    int CASED_LETTER_MASK = (1 << Lu) | (1 << Ll) | (1 << Lt);
    int MARK_MASK = (1 << Mn) | (1 << Me) | (1 << Mc);
    int NUMBER_MASK = (1 << Nd) | (1 << Nl) | (1 << No);
    int SEPARATOR_MASK = (1 << Zs) | (1 << Zl) | (1 << Zp);
    int CONTROL_MASK = (1 << Cc) | (1 << Cf) | (1 << Cs) | (1 << Co);
    int PUNCTUATION_MASK =
        (1 << Pc) | (1 << Pd) | (1 << Ps) | (1 << Pe) | (1 << Po) | (1 << Pi) | (1 << Pf);
    int SYMBOL_MASK = (1 << Sm) | (1 << Sc) | (1 << Sk) | (1 << So);
    int UNASSIGNED_MASK = (1 << Cn);
    int BASE_MASK = LETTER_MASK | NUMBER_MASK | PUNCTUATION_MASK | SYMBOL_MASK | (1 << Mc);
    int NONSPACING_MARK_MASK = (1 << Mn) | (1 << Me);

    // ---- Binary properties ----

    byte BidiMirrored = 0;
    byte CompositionExclusion = 1;
    byte White_space = 2;
    byte Non_break = 3;
    byte Bidi_Control = 4;
    byte Join_Control = 5;
    byte Dash = 6;
    byte Hyphen = 7;
    byte Quotation_Mark = 8;
    byte Terminal_Punctuation = 9;
    byte Math_Property = 10;
    byte Hex_Digit = 11;
    byte ASCII_Hex_Digit = 12;
    byte Other_Alphabetic = 13;
    byte Ideographic = 14;
    byte Diacritic = 15;
    byte Extender = 16;
    byte Other_Lowercase = 17;
    byte Other_Uppercase = 18;
    byte Noncharacter_Code_Point = 19;
    byte CaseFoldTurkishI = 20;
    byte Other_GraphemeExtend = 21;
    byte GraphemeLink = 22;
    byte IDS_BinaryOperator = 23;
    byte IDS_TrinaryOperator = 24;
    byte Radical = 25;
    byte UnifiedIdeograph = 26;
    byte Other_Default_Ignorable_Code_Point = 27;
    byte Deprecated = 28;
    byte Soft_Dotted = 29;
    byte Logical_Order_Exception = 30;
    byte Other_ID_Start = 31;
    byte Sentence_Terminal = 32;
    byte Variation_Selector = 33;
    byte Other_ID_Continue = 34;
    byte Pattern_White_Space = 35;
    byte Pattern_Syntax = 36;
    byte LIMIT_BINARY_PROPERTIES = 37;

    // ---- Line break ----
    // (LB_JL, LB_JV and LB_JT were formerly 29, 30 and 31.)

    byte LB_XX = 0;
    byte LB_OP = 1;
    byte LB_CL = 2;
    byte LB_QU = 3;
    byte LB_GL = 4;
    byte LB_NS = 5;
    byte LB_EX = 6;
    byte LB_SY = 7;
    byte LB_IS = 8;
    byte LB_PR = 9;
    byte LB_PO = 10;
    byte LB_NU = 11;
    byte LB_AL = 12;
    byte LB_ID = 13;
    byte LB_IN = 14;
    byte LB_HY = 15;
    byte LB_CM = 16;
    byte LB_BB = 17;
    byte LB_BA = 18;
    byte LB_SP = 19;
    byte LB_BK = 20;
    byte LB_CR = 21;
    byte LB_LF = 22;
    byte LB_CB = 23;
    byte LB_SA = 24;
    byte LB_AI = 25;
    byte LB_B2 = 26;
    byte LB_SG = 27;
    byte LB_ZW = 28;
    byte LB_NL = 29;
    byte LB_WJ = 30;
    byte LB_JL = 31;
    byte LB_JV = 32;
    byte LB_JT = 33;
    byte LB_H2 = 34;
    byte LB_H3 = 35;
    byte LIMIT_LINE_BREAK = 36;
    byte LB_LIMIT = LIMIT_LINE_BREAK;

    // ---- East Asian width ----

    byte EAN = 0;
    byte EAA = 1;
    byte EAH = 2;
    byte EAW = 3;
    byte EAF = 4;
    byte EANa = 5;
    byte LIMIT_EAST_ASIAN_WIDTH = 6;

    // ---- Bidi class ----

    /** Left-Right; most alphabetic, syllabic, and logographic characters (e.g., CJK ideographs). */
    byte BIDI_L = 0;
    /** Right-Left; Arabic, Hebrew, and punctuation specific to those scripts. */
    byte BIDI_R = 1;
    /** European Number. */
    byte BIDI_EN = 2;
    /** European Number Separator. */
    byte BIDI_ES = 3;
    /** European Number Terminator. */
    byte BIDI_ET = 4;
    /** Arabic Number. */
    byte BIDI_AN = 5;
    /** Common Number Separator. */
    byte BIDI_CS = 6;
    /** Block Separator. */
    byte BIDI_B = 7;
    /** Segment Separator. */
    byte BIDI_S = 8;
    /** Whitespace. */
    byte BIDI_WS = 9;
    /** Other Neutrals; all other characters: punctuation, symbols. */
    byte BIDI_ON = 10;
    byte LIMIT_BIDI_2 = 11;
    byte BIDI_UNUSED = 11;
    byte BIDI_BN = 12;
    byte BIDI_NSM = 13;
    byte BIDI_AL = 14;
    byte BIDI_LRO = 15;
    byte BIDI_RLO = 16;
    byte BIDI_LRE = 17;
    byte BIDI_RLE = 18;
    byte BIDI_PDF = 19;
    byte LIMIT_BIDI_CLASS = 20;

    // ---- Decomposition type ----

    byte NONE = 0;
    byte CANONICAL = 1;
    byte COMPATIBILITY = 2;
    /** Otherwise unspecified compatibility character (same code as {@link #COMPATIBILITY}). */
    byte COMPAT_UNSPECIFIED = 2;
    /** A font variant (e.g. a blackletter form). */
    byte COMPAT_FONT = 3;
    /** A no-break version of a space or hyphen. */
    byte COMPAT_NOBREAK = 4;
    /** An initial presentation form (Arabic). */
    byte COMPAT_INITIAL = 5;
    /** A medial presentation form (Arabic). */
    byte COMPAT_MEDIAL = 6;
    /** A final presentation form (Arabic). */
    byte COMPAT_FINAL = 7;
    /** An isolated presentation form (Arabic). */
    byte COMPAT_ISOLATED = 8;
    /** An encircled form. */
    byte COMPAT_CIRCLE = 9;
    /** A superscript form. */
    byte COMPAT_SUPER = 10;
    /** A subscript form. */
    byte COMPAT_SUB = 11;
    /** A vertical layout presentation form. */
    byte COMPAT_VERTICAL = 12;
    /** A wide (or zenkaku) compatibility character. */
    byte COMPAT_WIDE = 13;
    /** A narrow (or hankaku) compatibility character. */
    byte COMPAT_NARROW = 14;
    /** A small variant form (CNS compatibility). */
    byte COMPAT_SMALL = 15;
    /** A CJK squared font variant. */
    byte COMPAT_SQUARE = 16;
    /** A vulgar fraction form. */
    byte COMPAT_FRACTION = 17;
    byte LIMIT_DECOMPOSITION_TYPE = 18;

    // ---- Mirrored type ----

    byte NO = 0;
    byte YES = 1;
    byte LIMIT_MIRRORED = 2;

    // ---- For QuickCheck ----

    byte QNO = 0;
    byte QMAYBE = 1;
    byte QYES = 2;

    // ---- Case type (UNCASED and FOLD share one code) ----

    byte LOWER = 0;
    byte TITLE = 1;
    byte UPPER = 2;
    byte UNCASED = 3;
    byte FOLD = 3;
    byte LIMIT_CASE = 4;

    byte SIMPLE = 0;
    byte FULL = 8;

    // ---- Normalization type ----

    byte UNNORMALIZED = 0;
    byte C = 1;
    byte KC = 2;
    byte D = 3;
    byte KD = 4;
    byte FORM_LIMIT = 5;

    // ---- Numeric type ----
    // HAN_PRIMARY = 4, HAN_ACCOUNTING = 5, HAN_OTHER = 6 are not defined;
    // WARNING, reset the limit to 7 if all properties are desired!!

    byte NUMERIC_NONE = 0;
    byte NUMERIC = 1;
    byte DIGIT = 2;
    byte DECIMAL = 3;
    byte LIMIT_NUMERIC_TYPE = 4;

    // ---- Hangul syllable type ----

    byte NA = 0;
    byte L = 1;
    byte V = 2;
    byte T = 3;
    byte LV = 4;
    byte LVT = 5;
    byte HANGUL_SYLLABLE_TYPE_LIMIT = 6;

    // ---- Script codes ----

    byte COMMON_SCRIPT = 0;
    byte LATIN_SCRIPT = 1;
    byte GREEK_SCRIPT = 2;
    byte CYRILLIC_SCRIPT = 3;
    byte ARMENIAN_SCRIPT = 4;
    byte HEBREW_SCRIPT = 5;
    byte ARABIC_SCRIPT = 6;
    byte SYRIAC_SCRIPT = 7;
    byte THAANA_SCRIPT = 8;
    byte DEVANAGARI_SCRIPT = 9;
    byte BENGALI_SCRIPT = 10;
    byte GURMUKHI_SCRIPT = 11;
    byte GUJARATI_SCRIPT = 12;
    byte ORIYA_SCRIPT = 13;
    byte TAMIL_SCRIPT = 14;
    byte TELUGU_SCRIPT = 15;
    byte KANNADA_SCRIPT = 16;
    byte MALAYALAM_SCRIPT = 17;
    byte SINHALA_SCRIPT = 18;
    byte THAI_SCRIPT = 19;
    byte LAO_SCRIPT = 20;
    byte TIBETAN_SCRIPT = 21;
    byte MYANMAR_SCRIPT = 22;
    byte GEORGIAN_SCRIPT = 23;
    byte UNUSED_SCRIPT = 24;
    byte HANGUL_SCRIPT = 25;
    byte ETHIOPIC_SCRIPT = 26;
    byte CHEROKEE_SCRIPT = 27;
    byte ABORIGINAL_SCRIPT = 28;
    byte OGHAM_SCRIPT = 29;
    byte RUNIC_SCRIPT = 30;
    byte KHMER_SCRIPT = 31;
    byte MONGOLIAN_SCRIPT = 32;
    byte HIRAGANA_SCRIPT = 33;
    byte KATAKANA_SCRIPT = 34;
    byte BOPOMOFO_SCRIPT = 35;
    byte HAN_SCRIPT = 36;
    byte YI_SCRIPT = 37;
    byte OLD_ITALIC_SCRIPT = 38;
    byte GOTHIC_SCRIPT = 39;
    byte DESERET_SCRIPT = 40;
    byte INHERITED_SCRIPT = 41;
    byte TAGALOG_SCRIPT = 42;
    byte HANUNOO_SCRIPT = 43;
    byte BUHID_SCRIPT = 44;
    byte TAGBANWA_SCRIPT = 45;
    byte LIMBU = 46;
    byte TAI_LE = 47;
    byte LINEAR_B = 48;
    byte UGARITIC = 49;
    byte SHAVIAN = 50;
    byte OSMANYA = 51;
    byte CYPRIOT = 52;
    byte BRAILLE = 53;
    byte KATAKANA_OR_HIRAGANA = 54;
    byte BUGINESE = 55;
    byte COPTIC = 56;
    byte NEW_TAI_LUE = 57;
    byte GLAGOLITIC = 58;
    byte TIFINAGH = 59;
    byte SYLOTI_NAGRI = 60;
    byte OLD_PERSIAN = 61;
    byte KHAROSHTHI = 62;
    byte Balinese = 63;
    byte Cuneiform = 64;
    byte Phoenician = 65;
    byte Phags_Pa = 66;
    byte NKo = 67;
    byte Unknown_Script = 68;
    byte LIMIT_SCRIPT = 69;

    // ---- Age codes; each one is also the index of its version in AGE_VERSIONS ----

    int UNKNOWN = 0;
    int AGE11 = 1;
    int AGE20 = 2;
    int AGE21 = 3;
    int AGE30 = 4;
    int AGE31 = 5;
    int AGE32 = 6;
    int AGE40 = 7;
    int AGE41 = 8;
    int AGE50 = 9;
    int LIMIT_AGE = 10;

    /** Version strings listed in age-code order; entry 0 ("?") corresponds to {@link #UNKNOWN}. */
    String[] AGE_VERSIONS = {
        "?",
        "1.1.0",
        "2.0.0",
        "2.1.2",
        "3.0.0",
        "3.1.0",
        "3.2.0",
        "4.0.0",
        "4.1.0",
        "5.0.0"
    };

    // ---- Joining type ----

    byte JT_C = 0;
    byte JT_D = 1;
    byte JT_R = 2;
    byte JT_U = 3;
    byte JT_L = 4;
    byte JT_T = 5;
    byte LIMIT_JOINING_TYPE = 6;

    // ---- Joining group ----

    byte NO_SHAPING = 0;
    byte AIN = 1;
    byte ALAPH = 2;
    byte ALEF = 3;
    byte BEH = 4;
    byte BETH = 5;
    byte DAL = 6;
    byte DALATH_RISH = 7;
    byte E = 8;
    byte FEH = 9;
    byte FINAL_SEMKATH = 10;
    byte GAF = 11;
    byte GAMAL = 12;
    byte HAH = 13;
    byte HAMZA_ON_HEH_GOAL = 14;
    byte HE = 15;
    byte HEH = 16;
    byte HEH_GOAL = 17;
    byte HETH = 18;
    byte KAF = 19;
    byte KAPH = 20;
    byte KNOTTED_HEH = 21;
    byte LAM = 22;
    byte LAMADH = 23;
    byte MEEM = 24;
    byte MIM = 25;
    byte NOON = 26;
    byte NUN = 27;
    byte PE = 28;
    byte QAF = 29;
    byte QAPH = 30;
    byte REH = 31;
    byte REVERSED_PE = 32;
    byte SAD = 33;
    byte SADHE = 34;
    byte SEEN = 35;
    byte SEMKATH = 36;
    byte SHIN = 37;
    byte SWASH_KAF = 38;
    byte TAH = 39;
    byte TAW = 40;
    byte TEH_MARBUTA = 41;
    byte TETH = 42;
    byte WAW = 43;
    byte SYRIAC_WAW = 44;
    byte YEH = 45;
    byte YEH_BARREE = 46;
    byte YEH_WITH_TAIL = 47;
    byte YUDH = 48;
    byte YUDH_HE = 49;
    byte ZAIN = 50;
    byte ZHAIN = 51;
    byte KHAPH = 52;
    byte FE = 53;
    byte LIMIT_JOINING_GROUP = 54;

    // ---- Normalization forms and the bits they are composed of ----

    byte NFD = 0;
    byte NFC = 1;
    byte NFKD = 2;
    byte NFKC = 3;

    int NF_COMPATIBILITY_MASK = 2;
    int NF_COMPOSITION_MASK = 1;

    // ---- Derived properties ----

    byte PropMath = 0;
    byte PropAlphabetic = 1;
    byte PropLowercase = 2;
    byte PropUppercase = 3;
    byte ID_Start = 4;
    byte ID_Continue_NO_Cf = 5;
    byte Mod_ID_Start = 6;
    byte Mod_ID_Continue_NO_Cf = 7;
    byte Missing_Uppercase = 8;
    byte Missing_Lowercase = 9;
    byte Missing_Mixedcase = 10;
    byte FC_NFKC_Closure = 11;
    byte FullCompExclusion = 12;
    byte FullCompInclusion = 13;
    byte QuickNFD = 14;
    byte QuickNFC = 15;
    byte QuickNFKD = 16;
    byte QuickNFKC = 17;
    byte ExpandsOnNFD = 18;
    byte ExpandsOnNFC = 19;
    byte ExpandsOnNFKD = 20;
    byte ExpandsOnNFKC = 21;
    byte GenNFD = 22;
    byte GenNFC = 23;
    byte GenNFKD = 24;
    byte GenNFKC = 25;
    byte DefaultIgnorable = 26;
    byte GraphemeExtend = 27;
    byte GraphemeBase = 28;
    byte FC_NFC_Closure = 29;
    byte Other_Case_Ignorable = 30;
    byte Case_Ignorable = 31;
    byte Type_i = 32;
    byte NFC_Leading = 33;
    byte NFC_TrailingNonZero = 34;
    byte NFC_TrailingZero = 35;
    byte NFC_Resulting = 36;
    byte NFD_UnsafeStart = 37;
    byte NFC_UnsafeStart = 38;
    byte NFKD_UnsafeStart = 39;
    byte NFKC_UnsafeStart = 40;
    byte NFD_Skippable = 41;
    byte NFC_Skippable = 42;
    byte NFKD_Skippable = 43;
    byte NFKC_Skippable = 44;
    byte Case_Sensitive = 45;
    byte DERIVED_PROPERTY_LIMIT = 46;
}

Some files were not shown because too many files have changed in this diff Show More