ICU-7611 delete old copy of Mark's Java unicodetools from before he moved them to the unicode.org repository
X-SVN-Rev: 27924
This commit is contained in:
parent
baed720ac1
commit
a7c0c94a15
@ -1,392 +0,0 @@
|
||||
package com.ibm.text;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.Normalizer;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import java.util.BitSet;
|
||||
import java.util.Set;
|
||||
import java.util.HashSet;
|
||||
import java.util.TreeSet;
|
||||
import java.util.Iterator;
|
||||
import java.text.NumberFormat;
|
||||
import com.ibm.text.utility.FastIntBinarySearch;
|
||||
|
||||
public class TestICU4J {
|
||||
public static void main(String[] args) {
|
||||
String a = UTF16.valueOf(0x10000);
|
||||
String b = Normalizer.normalize("a\u0308", Normalizer.NFC);
|
||||
System.out.println(b);
|
||||
/*
|
||||
System.out.println(UCharacter.getType(0x10FFFF));
|
||||
System.out.println(UCharacter.getName(0x61));
|
||||
*/
|
||||
testUnicodeSetSpeed(Character.TITLECASE_LETTER, 100);
|
||||
testUnicodeSetSpeed(Character.UNASSIGNED, 1);
|
||||
}
|
||||
|
||||
static final boolean SHOW_ERRORS = false;
|
||||
static boolean OPTIMIZATION = true;
|
||||
|
||||
static void testUnicodeSetSpeed(int prop, int ITERATIONS) {
|
||||
NumberFormat numb = NumberFormat.getNumberInstance();
|
||||
NumberFormat percent = NumberFormat.getPercentInstance();
|
||||
double start, delta, oldDelta;
|
||||
int temp = 0;
|
||||
Set s;
|
||||
UnicodeSet us;
|
||||
Iterator it;
|
||||
UnicodeSetIterator uit;
|
||||
|
||||
BitSet bs = new BitSet();
|
||||
System.out.println();
|
||||
System.out.println("Getting characters for property " + prop);
|
||||
int total = 0;
|
||||
for (int cp = 0; cp < 0x10FFFF; ++cp) {
|
||||
if (UCharacter.getType(cp) == prop) {
|
||||
bs.set(cp);
|
||||
++total;
|
||||
}
|
||||
}
|
||||
System.out.println("Total characters: " + numb.format(total));
|
||||
System.out.println("Loop Iterations: " + numb.format(ITERATIONS));
|
||||
System.out.println();
|
||||
|
||||
System.out.println("Testing Add speed");
|
||||
|
||||
s = new TreeSet();
|
||||
start = System.currentTimeMillis();
|
||||
for (int i = 0; i < ITERATIONS; ++i) {
|
||||
s.clear();
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
if (bs.get(cp)) {
|
||||
s.add(new Integer(cp));
|
||||
}
|
||||
}
|
||||
}
|
||||
oldDelta = delta = (System.currentTimeMillis() - start)/ITERATIONS;
|
||||
System.out.println("Set add time: " + numb.format(delta));
|
||||
System.out.println("Total characters: " + numb.format(s.size()));
|
||||
|
||||
us = new UnicodeSet();
|
||||
start = System.currentTimeMillis();
|
||||
for (int i = 0; i < ITERATIONS; ++i) {
|
||||
us.clear();
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
if (bs.get(cp)) {
|
||||
optimizedAdd(us,cp);
|
||||
}
|
||||
}
|
||||
}
|
||||
optimizedDone(us);
|
||||
delta = (System.currentTimeMillis() - start)/ITERATIONS;
|
||||
System.out.println("UnicodeSet add time: " + numb.format(delta) + ", " + percent.format(delta/oldDelta));
|
||||
System.out.println("Total characters: " + numb.format(us.size()) + ", ranges: " + us.getRangeCount());
|
||||
|
||||
System.out.println();
|
||||
System.out.println("Testing Contains speed");
|
||||
|
||||
start = System.currentTimeMillis();
|
||||
for (int i = 0; i < ITERATIONS; ++i) {
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
if (s.contains(new Integer(cp)) != bs.get(cp)) {
|
||||
if (SHOW_ERRORS) System.out.println("Error at: " + info(cp));
|
||||
}
|
||||
}
|
||||
}
|
||||
oldDelta = delta = (System.currentTimeMillis() - start)/ITERATIONS;
|
||||
System.out.println("Set contains time: " + numb.format(delta));
|
||||
|
||||
start = System.currentTimeMillis();
|
||||
for (int i = 0; i < ITERATIONS; ++i) {
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
if (us.contains(cp) != bs.get(cp)) {
|
||||
if (SHOW_ERRORS) System.out.println("Error at: " + info(cp));
|
||||
}
|
||||
}
|
||||
}
|
||||
delta = (System.currentTimeMillis() - start)/ITERATIONS;
|
||||
System.out.println("UnicodeSet contains time: " + numb.format(delta) + ", " + percent.format(delta/oldDelta));
|
||||
|
||||
setupBinary(us);
|
||||
start = System.currentTimeMillis();
|
||||
for (int i = 0; i < ITERATIONS; ++i) {
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
if (binaryContains(cp) != bs.get(cp)) {
|
||||
if (SHOW_ERRORS) System.out.println("Error at: " + info(cp));
|
||||
}
|
||||
}
|
||||
}
|
||||
delta = (System.currentTimeMillis() - start)/ITERATIONS;
|
||||
System.out.println("BINARY UnicodeSet contains time: " + numb.format(delta) + ", " + percent.format(delta/oldDelta));
|
||||
|
||||
System.out.println("Testing Iteration speed");
|
||||
|
||||
start = System.currentTimeMillis();
|
||||
for (int i = 0; i < ITERATIONS; ++i) {
|
||||
it = s.iterator();
|
||||
while (it.hasNext()) {
|
||||
temp += ((Integer)it.next()).intValue();
|
||||
}
|
||||
}
|
||||
oldDelta = delta = (System.currentTimeMillis() - start)/ITERATIONS;
|
||||
System.out.println("Set iteration time: " + numb.format(delta));
|
||||
|
||||
uit = new UnicodeSetIterator(us);
|
||||
start = System.currentTimeMillis();
|
||||
for (int i = 0; i < ITERATIONS; ++i) {
|
||||
uit.reset();
|
||||
while (uit.next()) {
|
||||
temp += uit.codepoint;
|
||||
}
|
||||
}
|
||||
delta = (System.currentTimeMillis() - start)/ITERATIONS;
|
||||
System.out.println("UnicodeSet iteration time: " + numb.format(delta) + ", " + percent.format(delta/oldDelta));
|
||||
|
||||
uit.reset();
|
||||
start = System.currentTimeMillis();
|
||||
while (uit.nextRange()) {
|
||||
System.out.println(info(uit.codepoint, uit.codepointEnd));
|
||||
}
|
||||
}
|
||||
|
||||
static FastIntBinarySearch fibs;
|
||||
|
||||
static void setupBinary(UnicodeSet us) {
|
||||
int[] dummySearch = new int[us.getRangeCount()*2];
|
||||
int dummyLimit = 0;
|
||||
UnicodeSetIterator uit = new UnicodeSetIterator(us);
|
||||
while (uit.nextRange()) {
|
||||
dummySearch[dummyLimit++] = uit.codepoint;
|
||||
dummySearch[dummyLimit++] = uit.codepointEnd+1;
|
||||
}
|
||||
fibs = new FastIntBinarySearch(dummySearch);
|
||||
}
|
||||
|
||||
static boolean binaryContains(int cp) {
|
||||
return ((fibs.findIndex(cp) & 1) != 0); // return true if odd
|
||||
}
|
||||
|
||||
|
||||
static String info(int cp) {
|
||||
return Integer.toString(cp, 16).toUpperCase() + " " + UCharacter.getName(cp);
|
||||
}
|
||||
|
||||
static String info(int cpStart, int cpEnd) {
|
||||
if (cpStart == cpEnd) {
|
||||
return Integer.toString(cpStart, 16).toUpperCase()
|
||||
+ " " + UCharacter.getName(cpStart);
|
||||
}
|
||||
return Integer.toString(cpStart, 16).toUpperCase() + ".." + Integer.toString(cpEnd, 16).toUpperCase()
|
||||
+ " " + UCharacter.getName(cpStart) + ".." + UCharacter.getName(cpEnd);
|
||||
}
|
||||
|
||||
static int first;
|
||||
static int limit = -2;
|
||||
|
||||
static void optimizedAdd(UnicodeSet us, int cp) {
|
||||
if (!OPTIMIZATION) {
|
||||
us.add(cp);
|
||||
return;
|
||||
}
|
||||
if (cp == limit) {
|
||||
++limit;
|
||||
} else {
|
||||
if (limit > 0) {
|
||||
us.add(first, limit - 1);
|
||||
// System.out.println(info(first, limit-1));
|
||||
}
|
||||
first = cp;
|
||||
limit = cp + 1;
|
||||
}
|
||||
}
|
||||
|
||||
static void optimizedDone(UnicodeSet us) {
|
||||
if (!OPTIMIZATION) return;
|
||||
if (limit > 0) {
|
||||
us.add(first, limit - 1);
|
||||
//System.out.println(info(first, limit-1));
|
||||
}
|
||||
limit = -2; // reset to invalid
|
||||
}
|
||||
|
||||
|
||||
public static class UXCharacter {
|
||||
/**
|
||||
* Provides interface for properties in
|
||||
* http://www.unicode.org/Public/UNIDATA/PropertyAliases.txt
|
||||
* and their values in
|
||||
* http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
|
||||
*/
|
||||
|
||||
/**
|
||||
* Tests a particular code point to see if the cited property has the given value.
|
||||
*
|
||||
* Sample: the following are equivalent
|
||||
* <pre>
|
||||
* if (UCharacter.test("LB", "AL", cp)) ...
|
||||
* if (UCharacter.test("line break", "alphabetic", cp)) ...
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
public static boolean test(String propertyName, String propertyValue, int codePoint) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Produces a UnicodeSet of code points that have the given propertyvalue for the given property.
|
||||
* @param set the resulting value. The set is cleared,
|
||||
* then all the code points with the given <property, value> are added.
|
||||
*
|
||||
* Sample: the following are equivalent
|
||||
* <pre>
|
||||
* if (UCharacter.test("WSpace", cp)) ...
|
||||
* if (UCharacter.test("White_Space", cp)) ...
|
||||
* if (UCharacter.test("White_Space", "true", cp)) ...
|
||||
* if (!UCharacter.test("White_Space", "false", cp)) ...
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
public static void getSet(String propertyName, String propertyValue, UnicodeSet set) {
|
||||
// logical implemenation. Real implementation would be way faster!
|
||||
set.clear();
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
if (test(propertyName, propertyValue, cp)) set.add(cp);
|
||||
}
|
||||
}
|
||||
|
||||
// ======================================================
|
||||
// POSSIBLE ADDITIONAL UTILITIES FOR CONVENIENCE OR SPEED
|
||||
// ======================================================
|
||||
|
||||
/**
|
||||
* Tests a particular code point to see if the cited boolean property is true.
|
||||
* @param propertyName the cited property
|
||||
* @param codePoint the particular code point
|
||||
* @return true if the cited property has the given value for the specified code point.
|
||||
*
|
||||
* Sample: the following are equivalent
|
||||
* <pre>
|
||||
* if (UCharacter.test("WSpace", cp)) ...
|
||||
* if (UCharacter.test("White_Space", cp)) ...
|
||||
* if (UCharacter.test("White_Space", "true", cp)) ...
|
||||
* if (!UCharacter.test("White_Space", "false", cp)) ...
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
public static boolean test(String booleanPropertyName, int codePoint) {
|
||||
return test(booleanPropertyName, "true", codePoint);
|
||||
}
|
||||
|
||||
// ===============================================
|
||||
// The following allow access to properties by number, saving a string lookup
|
||||
// on each call.
|
||||
// ===============================================
|
||||
|
||||
|
||||
/**
|
||||
* Gets an index for higher-speed access to properties.
|
||||
*
|
||||
* Sample:
|
||||
* <pre>
|
||||
* int prop = UCharacter.getPropertyIndexIndex("LB");
|
||||
* int value = UCharacter.getValueIndex("LB", "AL");
|
||||
* while (true) {
|
||||
* ...
|
||||
* if (test(prop, value, codePoint)) ...
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
public static int getPropertyIndex(String propertyName) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets maximum property index, used for iterating through properties
|
||||
*
|
||||
*/
|
||||
public static int getMaxPropertyIndex() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static final byte // NAME_STYLE
|
||||
SHORT = 0,
|
||||
DEFAULT = 1,
|
||||
LONG = 2;
|
||||
|
||||
/**
|
||||
* Gets property name
|
||||
*
|
||||
*/
|
||||
public static String getPropertyName(int propertyIndex, byte namestyle) {
|
||||
return "";
|
||||
}
|
||||
|
||||
/*
|
||||
* Tests a particular code point to see if the cited property has the given value.
|
||||
*/
|
||||
public static boolean test(int propertyIndex, String propertyValue, int codePoint) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Produces a UnicodeSet of code points that have the given propertyvalue for the given property.
|
||||
*/
|
||||
public static void getSet(int propertyIndex, String propertyValue, UnicodeSet set) {
|
||||
}
|
||||
|
||||
// ===============================================
|
||||
// The following allow access to enumerated property values by number,
|
||||
// saving a string lookup on each call.
|
||||
// They are only valid for enumerated properties
|
||||
// including the combining character class (0..255).
|
||||
// ===============================================
|
||||
|
||||
/**
|
||||
* Gets an index for higher-speed access to property values.
|
||||
* Only valid for enumerated properties.
|
||||
*/
|
||||
public static int getValueIndex(String propertyName, String propertyValue) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets maximum value index for a given property, used for iterating through property values.
|
||||
* Only valid for enumerated properties.
|
||||
*
|
||||
*/
|
||||
public static int getMaxValueIndex(int propertyIndex) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets property value, corresponding to one of the values passed in
|
||||
*
|
||||
*/
|
||||
public static String getValueName(int propertyIndex, int valueIndex, byte namestyle) {
|
||||
return "";
|
||||
}
|
||||
|
||||
/*
|
||||
* Tests a particular code point to see if the cited property has the given value.
|
||||
*/
|
||||
public static boolean test(int propertyIndex, int valueIndex, int codePoint) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Produces a UnicodeSet of code points that have the given propertyvalue for the given property.
|
||||
*/
|
||||
public static void getSet(int propertyIndex, int valueIndex, UnicodeSet set) {
|
||||
}
|
||||
|
||||
|
||||
/* OPEN ISSUES:
|
||||
- Don't like the names of the functions. Any better options? test => hasValue? hasPropertyValue?
|
||||
- Should getSet really ADD to the set (avoiding the clear?) and be called addProperties?
|
||||
Maybe faster sometimes, but might also be more errorprone.
|
||||
*/
|
||||
|
||||
}
|
||||
}
|
@ -1,66 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/AbbreviatedUnicodeSetIterator.java,v $
|
||||
* $Date: 2004/02/06 18:32:04 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCA;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.Reader;
|
||||
import java.io.PrintWriter;
|
||||
import java.io.FileReader;
|
||||
import java.text.MessageFormat;
|
||||
import java.io.IOException;
|
||||
import com.ibm.text.UCD.Normalizer;
|
||||
import com.ibm.text.UCD.UCD;
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.text.UCD.UnifiedBinaryProperty;
|
||||
import com.ibm.text.UCD.UCDProperty;
|
||||
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
|
||||
public class AbbreviatedUnicodeSetIterator extends UnicodeSetIterator {
|
||||
|
||||
private boolean abbreviated;
|
||||
private int perRange;
|
||||
|
||||
public AbbreviatedUnicodeSetIterator() {
|
||||
super();
|
||||
abbreviated = false;
|
||||
}
|
||||
|
||||
public void reset(UnicodeSet newSet) {
|
||||
reset(newSet, false);
|
||||
}
|
||||
|
||||
public void reset(UnicodeSet newSet, boolean abb) {
|
||||
reset(newSet, abb, 100);
|
||||
}
|
||||
|
||||
public void reset(UnicodeSet newSet, boolean abb, int density) {
|
||||
super.reset(newSet);
|
||||
abbreviated = abb;
|
||||
perRange = newSet.getRangeCount();
|
||||
if (perRange != 0) {
|
||||
perRange = density / perRange;
|
||||
}
|
||||
}
|
||||
|
||||
protected void loadRange(int myRange) {
|
||||
super.loadRange(myRange);
|
||||
if (abbreviated && (endElement > nextElement + perRange)) {
|
||||
endElement = nextElement + perRange;
|
||||
}
|
||||
}
|
||||
}
|
@ -1,256 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/CEList.java,v $
|
||||
* $Date: 2002/07/03 02:15:47 $
|
||||
* $Revision: 1.6 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCA;
|
||||
import com.ibm.text.UCD.*;
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
public final class CEList implements java.lang.Comparable, UCD_Types {
|
||||
int[] contents;
|
||||
int startOffset;
|
||||
int endOffset;
|
||||
int count;
|
||||
|
||||
public CEList (int[] source, int start, int end) {
|
||||
count = end-start;
|
||||
contents = new int[count];
|
||||
System.arraycopy(source, start, contents, 0, count);
|
||||
startOffset = 0;
|
||||
endOffset = count;
|
||||
}
|
||||
|
||||
/**
 * Creates a list holding a private copy of the whole array.
 *
 * @param source array to copy
 */
public CEList(int[] source) {
    this(source, 0, source.length);
}
|
||||
|
||||
/**
 * Creates a view over {@code source[start..end)} WITHOUT copying; the array
 * is shared with the caller.
 *
 * @param source backing array, used as-is
 * @param start  index of the first element of the view
 * @param end    index one past the last element of the view
 * @param spare  unused; only distinguishes this signature from the copying constructor
 */
private CEList(int[] source, int start, int end, boolean spare) {
    count = end - start;
    startOffset = start;
    endOffset = end;
    contents = source;
}
|
||||
|
||||
public CEList append(CEList that) {
|
||||
int[] newContents = new int[count + that.count];
|
||||
System.arraycopy(contents, startOffset, newContents, 0, count);
|
||||
System.arraycopy(that.contents, that.startOffset, newContents, count, that.count);
|
||||
return new CEList(newContents, 0, count + that.count, true);
|
||||
}
|
||||
|
||||
public CEList sub(int start, int end) {
|
||||
return new CEList(contents, start, end, true);
|
||||
}
|
||||
|
||||
public CEList start(int end) {
|
||||
return new CEList(contents, 0, end, true);
|
||||
}
|
||||
|
||||
public CEList end(int start) {
|
||||
return new CEList(contents, start, contents.length, true);
|
||||
}
|
||||
|
||||
public int length() {
|
||||
return count;
|
||||
}
|
||||
|
||||
public int at(int i) {
|
||||
i -= startOffset;
|
||||
if (i < 0 || i >= count) throw new ArrayIndexOutOfBoundsException(i);
|
||||
return contents[i];
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
int result = count;
|
||||
for (int i = startOffset; i < endOffset; ++i) {
|
||||
result *= 37;
|
||||
result += contents[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
try {
|
||||
CEList that = (CEList)other;
|
||||
if (count != that.count) return false;
|
||||
int delta = that.startOffset - startOffset;
|
||||
for (int i = startOffset; i < endOffset; ++i) {
|
||||
if (contents[i] != that.contents[i + delta]) return false;
|
||||
}
|
||||
return true;
|
||||
} catch (Exception e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public int compareTo(Object other) {
|
||||
CEList that = (CEList)other;
|
||||
try {
|
||||
int delta = that.startOffset - startOffset;
|
||||
int min = endOffset;
|
||||
int min2 = that.endOffset - delta;
|
||||
if (min > min2) min = min2;
|
||||
|
||||
for (int i = startOffset; i < min; ++i) {
|
||||
if (contents[i] != that.contents[i + delta]) {
|
||||
if ((contents[i] & 0xFFFFFFFFL)
|
||||
< (that.contents[i + delta] & 0xFFFFFFFFL)) return -1;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (count < that.count) return -1;
|
||||
if (count > that.count) return 1;
|
||||
return 0;
|
||||
} catch (RuntimeException e) {
|
||||
System.out.println("This: " + this + ", that: " + other);
|
||||
System.out.println(startOffset + ", " + endOffset
|
||||
+ ", " + count + ", " + contents.length);
|
||||
System.out.println(that.startOffset + ", " + that.endOffset
|
||||
+ ", " + that.count + ", " + that.contents.length);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
public static byte remap(int ch, byte type, int t) {
|
||||
if (type != CANONICAL) {
|
||||
if (0x3041 <= ch && ch <= 0x3094) t = 0xE; // hiragana
|
||||
else if (0x30A1 <= ch && ch <= 0x30FA) t = 0x11; // katakana
|
||||
}
|
||||
switch (type) {
|
||||
case COMPATIBILITY: t = (t == 8) ? 0xA : 4; break;
|
||||
case COMPAT_FONT: t = (t == 8) ? 0xB : 5; break;
|
||||
case COMPAT_NOBREAK: t = 0x1B; break;
|
||||
case COMPAT_INITIAL: t = 0x17; break;
|
||||
case COMPAT_MEDIAL: t = 0x18; break;
|
||||
case COMPAT_FINAL: t = 0x19; break;
|
||||
case COMPAT_ISOLATED: t = 0x1A; break;
|
||||
case COMPAT_CIRCLE: t = (t == 0x11) ? 0x13 : (t == 8) ? 0xC : 6; break;
|
||||
case COMPAT_SUPER: t = 0x14; break;
|
||||
case COMPAT_SUB: t = 0x15; break;
|
||||
case COMPAT_VERTICAL: t = 0x16; break;
|
||||
case COMPAT_WIDE: t= (t == 8) ? 9 : 3; break;
|
||||
case COMPAT_NARROW: t = (0xFF67 <= ch && ch <= 0xFF6F) ? 0x10 : 0x12; break;
|
||||
case COMPAT_SMALL: t = (t == 0xE) ? 0xE : 0xF; break;
|
||||
case COMPAT_SQUARE: t = (t == 8) ? 0x1D : 0x1C; break;
|
||||
case COMPAT_FRACTION: t = 0x1E; break;
|
||||
}
|
||||
return (byte)t;
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
if (startOffset >= endOffset) return toString(0);
|
||||
|
||||
StringBuffer result = new StringBuffer();
|
||||
for (int i = startOffset; i < endOffset; ++i) {
|
||||
if (i != startOffset) result.append(' ');
|
||||
result.append(toString(contents[i]));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public static String toString(int[] ces, int len) {
|
||||
if (len <= 0) return toString(0);
|
||||
|
||||
StringBuffer result = new StringBuffer();
|
||||
for (int i = 0; i < len; ++i) {
|
||||
if (i != 0) result.append(' ');
|
||||
result.append(toString(ces[i]));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public static String toString(IntStack ces) {
|
||||
if (ces.length() <= 0) return toString(0);
|
||||
|
||||
StringBuffer result = new StringBuffer();
|
||||
for (int i = 0; i < ces.length(); ++i) {
|
||||
if (i != 0) result.append(' ');
|
||||
result.append(toString(ces.get(i)));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public static String toString(int ce) {
|
||||
return "[" + Utility.hex(UCA.getPrimary(ce)) + "."
|
||||
+ Utility.hex(UCA.getSecondary(ce)) + "."
|
||||
+ Utility.hex(UCA.getTertiary(ce)) + "]"
|
||||
// + "(" + NAME3[UCA.getTertiary(ce)] + ")"
|
||||
;
|
||||
}
|
||||
|
||||
static final String[] NAME3 = {
|
||||
"IGNORE", // 0
|
||||
"BLK", // Unused?
|
||||
"MIN",
|
||||
"WIDE",
|
||||
"COMPAT",
|
||||
"FONT",
|
||||
"CIRCLE",
|
||||
"RES-2",
|
||||
"CAP",
|
||||
"WIDECAP",
|
||||
"COMPATCAP",
|
||||
"FONTCAP",
|
||||
"CIRCLECAP",
|
||||
"HIRA-SMALL",
|
||||
"HIRA",
|
||||
"SMALL",
|
||||
"SMALL-NARROW",
|
||||
"KATA",
|
||||
"NARROW",
|
||||
"CIRCLE-KATA",
|
||||
"SUP-MNN",
|
||||
"SUB-MNS",
|
||||
"VERT", // Missing??
|
||||
"AINI",
|
||||
"AMED",
|
||||
"AFIN",
|
||||
"AISO",
|
||||
"NOBREAK", // Missing?
|
||||
"SQUARED",
|
||||
"SQUAREDCAP",
|
||||
"FRACTION",
|
||||
"MAX"
|
||||
};
|
||||
|
||||
// testing
|
||||
|
||||
public static void main(String args[]) throws Exception {
|
||||
/* This: [0241.0020.0004], that: [0F6B.0020.0002]
|
||||
1, 2, 1, 2
|
||||
0, 1, 1, 1
|
||||
*/
|
||||
CEList t1 = new CEList(new int[] {0, 0x02412004});
|
||||
t1 = t1.sub(1,2);
|
||||
CEList t2 = new CEList(new int[] {0x0F6B2002});
|
||||
System.out.println(t1.compareTo(t2));
|
||||
|
||||
|
||||
CEList foo = new CEList(new int[] {0, 1, 2, 3, 4});
|
||||
CEList fuu = new CEList(new int[] {});
|
||||
int cc = foo.compareTo(fuu);
|
||||
System.out.println(cc);
|
||||
|
||||
System.out.println(foo);
|
||||
System.out.println(foo.start(2));
|
||||
System.out.println(foo.end(1));
|
||||
CEList fii = new CEList(new int[] {2, 3});
|
||||
CEList foo2 = foo.sub(2,4);
|
||||
System.out.println(fii.equals(foo2));
|
||||
System.out.println(fii.compareTo(foo2));
|
||||
System.out.println(fii.compareTo(foo));
|
||||
System.out.println(fii.hashCode() == foo2.hashCode());
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -1,826 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Case.java,v $
|
||||
* $Date: 2001/08/31 00:20:40 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCA;
|
||||
|
||||
public final class Case {
|
||||
|
||||
static StringBuffer out = new StringBuffer();
|
||||
|
||||
static String fold(char c) {
|
||||
return fold(String.valueOf(c));
|
||||
}
|
||||
|
||||
static String fold(String in) {
|
||||
synchronized (out) {
|
||||
out.setLength(0);
|
||||
for (int i = 0; i < in.length(); ++i) {
|
||||
char c = in.charAt(i);
|
||||
String f = CF[c];
|
||||
if (f == null) out.append(c);
|
||||
else out.append(f);
|
||||
}
|
||||
return out.toString();
|
||||
}
|
||||
}
|
||||
|
||||
static String[] CF = new String[65536];
|
||||
static {
|
||||
CF[0x0041]="\u0061";
|
||||
CF[0x0042]="\u0062";
|
||||
CF[0x0043]="\u0063";
|
||||
CF[0x0044]="\u0064";
|
||||
CF[0x0045]="\u0065";
|
||||
CF[0x0046]="\u0066";
|
||||
CF[0x0047]="\u0067";
|
||||
CF[0x0048]="\u0068";
|
||||
CF[0x0049]="\u0069";
|
||||
CF[0x004A]="\u006A";
|
||||
CF[0x004B]="\u006B";
|
||||
CF[0x004C]="\u006C";
|
||||
CF[0x004D]="\u006D";
|
||||
CF[0x004E]="\u006E";
|
||||
CF[0x004F]="\u006F";
|
||||
CF[0x0050]="\u0070";
|
||||
CF[0x0051]="\u0071";
|
||||
CF[0x0052]="\u0072";
|
||||
CF[0x0053]="\u0073";
|
||||
CF[0x0054]="\u0074";
|
||||
CF[0x0055]="\u0075";
|
||||
CF[0x0056]="\u0076";
|
||||
CF[0x0057]="\u0077";
|
||||
CF[0x0058]="\u0078";
|
||||
CF[0x0059]="\u0079";
|
||||
CF[0x005A]="\u007A";
|
||||
CF[0x00B5]="\u03BC";
|
||||
CF[0x00C0]="\u00E0";
|
||||
CF[0x00C1]="\u00E1";
|
||||
CF[0x00C2]="\u00E2";
|
||||
CF[0x00C3]="\u00E3";
|
||||
CF[0x00C4]="\u00E4";
|
||||
CF[0x00C5]="\u00E5";
|
||||
CF[0x00C6]="\u00E6";
|
||||
CF[0x00C7]="\u00E7";
|
||||
CF[0x00C8]="\u00E8";
|
||||
CF[0x00C9]="\u00E9";
|
||||
CF[0x00CA]="\u00EA";
|
||||
CF[0x00CB]="\u00EB";
|
||||
CF[0x00CC]="\u00EC";
|
||||
CF[0x00CD]="\u00ED";
|
||||
CF[0x00CE]="\u00EE";
|
||||
CF[0x00CF]="\u00EF";
|
||||
CF[0x00D0]="\u00F0";
|
||||
CF[0x00D1]="\u00F1";
|
||||
CF[0x00D2]="\u00F2";
|
||||
CF[0x00D3]="\u00F3";
|
||||
CF[0x00D4]="\u00F4";
|
||||
CF[0x00D5]="\u00F5";
|
||||
CF[0x00D6]="\u00F6";
|
||||
CF[0x00D8]="\u00F8";
|
||||
CF[0x00D9]="\u00F9";
|
||||
CF[0x00DA]="\u00FA";
|
||||
CF[0x00DB]="\u00FB";
|
||||
CF[0x00DC]="\u00FC";
|
||||
CF[0x00DD]="\u00FD";
|
||||
CF[0x00DE]="\u00FE";
|
||||
CF[0x00DF]="\u0073\u0073";
|
||||
CF[0x0100]="\u0101";
|
||||
CF[0x0102]="\u0103";
|
||||
CF[0x0104]="\u0105";
|
||||
CF[0x0106]="\u0107";
|
||||
CF[0x0108]="\u0109";
|
||||
CF[0x010A]="\u010B";
|
||||
CF[0x010C]="\u010D";
|
||||
CF[0x010E]="\u010F";
|
||||
CF[0x0110]="\u0111";
|
||||
CF[0x0112]="\u0113";
|
||||
CF[0x0114]="\u0115";
|
||||
CF[0x0116]="\u0117";
|
||||
CF[0x0118]="\u0119";
|
||||
CF[0x011A]="\u011B";
|
||||
CF[0x011C]="\u011D";
|
||||
CF[0x011E]="\u011F";
|
||||
CF[0x0120]="\u0121";
|
||||
CF[0x0122]="\u0123";
|
||||
CF[0x0124]="\u0125";
|
||||
CF[0x0126]="\u0127";
|
||||
CF[0x0128]="\u0129";
|
||||
CF[0x012A]="\u012B";
|
||||
CF[0x012C]="\u012D";
|
||||
CF[0x012E]="\u012F";
|
||||
CF[0x0130]="\u0069";
|
||||
CF[0x0131]="\u0069";
|
||||
CF[0x0132]="\u0133";
|
||||
CF[0x0134]="\u0135";
|
||||
CF[0x0136]="\u0137";
|
||||
CF[0x0139]="\u013A";
|
||||
CF[0x013B]="\u013C";
|
||||
CF[0x013D]="\u013E";
|
||||
CF[0x013F]="\u0140";
|
||||
CF[0x0141]="\u0142";
|
||||
CF[0x0143]="\u0144";
|
||||
CF[0x0145]="\u0146";
|
||||
CF[0x0147]="\u0148";
|
||||
CF[0x0149]="\u02BC\u006E";
|
||||
CF[0x014A]="\u014B";
|
||||
CF[0x014C]="\u014D";
|
||||
CF[0x014E]="\u014F";
|
||||
CF[0x0150]="\u0151";
|
||||
CF[0x0152]="\u0153";
|
||||
CF[0x0154]="\u0155";
|
||||
CF[0x0156]="\u0157";
|
||||
CF[0x0158]="\u0159";
|
||||
CF[0x015A]="\u015B";
|
||||
CF[0x015C]="\u015D";
|
||||
CF[0x015E]="\u015F";
|
||||
CF[0x0160]="\u0161";
|
||||
CF[0x0162]="\u0163";
|
||||
CF[0x0164]="\u0165";
|
||||
CF[0x0166]="\u0167";
|
||||
CF[0x0168]="\u0169";
|
||||
CF[0x016A]="\u016B";
|
||||
CF[0x016C]="\u016D";
|
||||
CF[0x016E]="\u016F";
|
||||
CF[0x0170]="\u0171";
|
||||
CF[0x0172]="\u0173";
|
||||
CF[0x0174]="\u0175";
|
||||
CF[0x0176]="\u0177";
|
||||
CF[0x0178]="\u00FF";
|
||||
CF[0x0179]="\u017A";
|
||||
CF[0x017B]="\u017C";
|
||||
CF[0x017D]="\u017E";
|
||||
CF[0x017F]="\u0073";
|
||||
CF[0x0181]="\u0253";
|
||||
CF[0x0182]="\u0183";
|
||||
CF[0x0184]="\u0185";
|
||||
CF[0x0186]="\u0254";
|
||||
CF[0x0187]="\u0188";
|
||||
CF[0x0189]="\u0256";
|
||||
CF[0x018A]="\u0257";
|
||||
CF[0x018B]="\u018C";
|
||||
CF[0x018E]="\u01DD";
|
||||
CF[0x018F]="\u0259";
|
||||
CF[0x0190]="\u025B";
|
||||
CF[0x0191]="\u0192";
|
||||
CF[0x0193]="\u0260";
|
||||
CF[0x0194]="\u0263";
|
||||
CF[0x0196]="\u0269";
|
||||
CF[0x0197]="\u0268";
|
||||
CF[0x0198]="\u0199";
|
||||
CF[0x019C]="\u026F";
|
||||
CF[0x019D]="\u0272";
|
||||
CF[0x019F]="\u0275";
|
||||
CF[0x01A0]="\u01A1";
|
||||
CF[0x01A2]="\u01A3";
|
||||
CF[0x01A4]="\u01A5";
|
||||
CF[0x01A6]="\u0280";
|
||||
CF[0x01A7]="\u01A8";
|
||||
CF[0x01A9]="\u0283";
|
||||
CF[0x01AC]="\u01AD";
|
||||
CF[0x01AE]="\u0288";
|
||||
CF[0x01AF]="\u01B0";
|
||||
CF[0x01B1]="\u028A";
|
||||
CF[0x01B2]="\u028B";
|
||||
CF[0x01B3]="\u01B4";
|
||||
CF[0x01B5]="\u01B6";
|
||||
CF[0x01B7]="\u0292";
|
||||
CF[0x01B8]="\u01B9";
|
||||
CF[0x01BC]="\u01BD";
|
||||
CF[0x01C4]="\u01C6";
|
||||
CF[0x01C5]="\u01C6";
|
||||
CF[0x01C7]="\u01C9";
|
||||
CF[0x01C8]="\u01C9";
|
||||
CF[0x01CA]="\u01CC";
|
||||
CF[0x01CB]="\u01CC";
|
||||
CF[0x01CD]="\u01CE";
|
||||
CF[0x01CF]="\u01D0";
|
||||
CF[0x01D1]="\u01D2";
|
||||
CF[0x01D3]="\u01D4";
|
||||
CF[0x01D5]="\u01D6";
|
||||
CF[0x01D7]="\u01D8";
|
||||
CF[0x01D9]="\u01DA";
|
||||
CF[0x01DB]="\u01DC";
|
||||
CF[0x01DE]="\u01DF";
|
||||
CF[0x01E0]="\u01E1";
|
||||
CF[0x01E2]="\u01E3";
|
||||
CF[0x01E4]="\u01E5";
|
||||
CF[0x01E6]="\u01E7";
|
||||
CF[0x01E8]="\u01E9";
|
||||
CF[0x01EA]="\u01EB";
|
||||
CF[0x01EC]="\u01ED";
|
||||
CF[0x01EE]="\u01EF";
|
||||
CF[0x01F0]="\u006A\u030C";
|
||||
CF[0x01F1]="\u01F3";
|
||||
CF[0x01F2]="\u01F3";
|
||||
CF[0x01F4]="\u01F5";
|
||||
CF[0x01F6]="\u0195";
|
||||
CF[0x01F7]="\u01BF";
|
||||
CF[0x01F8]="\u01F9";
|
||||
CF[0x01FA]="\u01FB";
|
||||
CF[0x01FC]="\u01FD";
|
||||
CF[0x01FE]="\u01FF";
|
||||
CF[0x0200]="\u0201";
|
||||
CF[0x0202]="\u0203";
|
||||
CF[0x0204]="\u0205";
|
||||
CF[0x0206]="\u0207";
|
||||
CF[0x0208]="\u0209";
|
||||
CF[0x020A]="\u020B";
|
||||
CF[0x020C]="\u020D";
|
||||
CF[0x020E]="\u020F";
|
||||
CF[0x0210]="\u0211";
|
||||
CF[0x0212]="\u0213";
|
||||
CF[0x0214]="\u0215";
|
||||
CF[0x0216]="\u0217";
|
||||
CF[0x0218]="\u0219";
|
||||
CF[0x021A]="\u021B";
|
||||
CF[0x021C]="\u021D";
|
||||
CF[0x021E]="\u021F";
|
||||
CF[0x0222]="\u0223";
|
||||
CF[0x0224]="\u0225";
|
||||
CF[0x0226]="\u0227";
|
||||
CF[0x0228]="\u0229";
|
||||
CF[0x022A]="\u022B";
|
||||
CF[0x022C]="\u022D";
|
||||
CF[0x022E]="\u022F";
|
||||
CF[0x0230]="\u0231";
|
||||
CF[0x0232]="\u0233";
|
||||
CF[0x0345]="\u03B9";
|
||||
CF[0x0386]="\u03AC";
|
||||
CF[0x0388]="\u03AD";
|
||||
CF[0x0389]="\u03AE";
|
||||
CF[0x038A]="\u03AF";
|
||||
CF[0x038C]="\u03CC";
|
||||
CF[0x038E]="\u03CD";
|
||||
CF[0x038F]="\u03CE";
|
||||
CF[0x0390]="\u03B9\u0308\u0301";
|
||||
CF[0x0391]="\u03B1";
|
||||
CF[0x0392]="\u03B2";
|
||||
CF[0x0393]="\u03B3";
|
||||
CF[0x0394]="\u03B4";
|
||||
CF[0x0395]="\u03B5";
|
||||
CF[0x0396]="\u03B6";
|
||||
CF[0x0397]="\u03B7";
|
||||
CF[0x0398]="\u03B8";
|
||||
CF[0x0399]="\u03B9";
|
||||
CF[0x039A]="\u03BA";
|
||||
CF[0x039B]="\u03BB";
|
||||
CF[0x039C]="\u03BC";
|
||||
CF[0x039D]="\u03BD";
|
||||
CF[0x039E]="\u03BE";
|
||||
CF[0x039F]="\u03BF";
|
||||
CF[0x03A0]="\u03C0";
|
||||
CF[0x03A1]="\u03C1";
|
||||
CF[0x03A3]="\u03C2";
|
||||
CF[0x03A4]="\u03C4";
|
||||
CF[0x03A5]="\u03C5";
|
||||
CF[0x03A6]="\u03C6";
|
||||
CF[0x03A7]="\u03C7";
|
||||
CF[0x03A8]="\u03C8";
|
||||
CF[0x03A9]="\u03C9";
|
||||
CF[0x03AA]="\u03CA";
|
||||
CF[0x03AB]="\u03CB";
|
||||
CF[0x03B0]="\u03C5\u0308\u0301";
|
||||
CF[0x03C3]="\u03C2";
|
||||
CF[0x03D0]="\u03B2";
|
||||
CF[0x03D1]="\u03B8";
|
||||
CF[0x03D5]="\u03C6";
|
||||
CF[0x03D6]="\u03C0";
|
||||
CF[0x03DA]="\u03DB";
|
||||
CF[0x03DC]="\u03DD";
|
||||
CF[0x03DE]="\u03DF";
|
||||
CF[0x03E0]="\u03E1";
|
||||
CF[0x03E2]="\u03E3";
|
||||
CF[0x03E4]="\u03E5";
|
||||
CF[0x03E6]="\u03E7";
|
||||
CF[0x03E8]="\u03E9";
|
||||
CF[0x03EA]="\u03EB";
|
||||
CF[0x03EC]="\u03ED";
|
||||
CF[0x03EE]="\u03EF";
|
||||
CF[0x03F0]="\u03BA";
|
||||
CF[0x03F1]="\u03C1";
|
||||
CF[0x03F2]="\u03C2";
|
||||
CF[0x0400]="\u0450";
|
||||
CF[0x0401]="\u0451";
|
||||
CF[0x0402]="\u0452";
|
||||
CF[0x0403]="\u0453";
|
||||
CF[0x0404]="\u0454";
|
||||
CF[0x0405]="\u0455";
|
||||
CF[0x0406]="\u0456";
|
||||
CF[0x0407]="\u0457";
|
||||
CF[0x0408]="\u0458";
|
||||
CF[0x0409]="\u0459";
|
||||
CF[0x040A]="\u045A";
|
||||
CF[0x040B]="\u045B";
|
||||
CF[0x040C]="\u045C";
|
||||
CF[0x040D]="\u045D";
|
||||
CF[0x040E]="\u045E";
|
||||
CF[0x040F]="\u045F";
|
||||
CF[0x0410]="\u0430";
|
||||
CF[0x0411]="\u0431";
|
||||
CF[0x0412]="\u0432";
|
||||
CF[0x0413]="\u0433";
|
||||
CF[0x0414]="\u0434";
|
||||
CF[0x0415]="\u0435";
|
||||
CF[0x0416]="\u0436";
|
||||
CF[0x0417]="\u0437";
|
||||
CF[0x0418]="\u0438";
|
||||
CF[0x0419]="\u0439";
|
||||
CF[0x041A]="\u043A";
|
||||
CF[0x041B]="\u043B";
|
||||
CF[0x041C]="\u043C";
|
||||
CF[0x041D]="\u043D";
|
||||
CF[0x041E]="\u043E";
|
||||
CF[0x041F]="\u043F";
|
||||
CF[0x0420]="\u0440";
|
||||
CF[0x0421]="\u0441";
|
||||
CF[0x0422]="\u0442";
|
||||
CF[0x0423]="\u0443";
|
||||
CF[0x0424]="\u0444";
|
||||
CF[0x0425]="\u0445";
|
||||
CF[0x0426]="\u0446";
|
||||
CF[0x0427]="\u0447";
|
||||
CF[0x0428]="\u0448";
|
||||
CF[0x0429]="\u0449";
|
||||
CF[0x042A]="\u044A";
|
||||
CF[0x042B]="\u044B";
|
||||
CF[0x042C]="\u044C";
|
||||
CF[0x042D]="\u044D";
|
||||
CF[0x042E]="\u044E";
|
||||
CF[0x042F]="\u044F";
|
||||
CF[0x0460]="\u0461";
|
||||
CF[0x0462]="\u0463";
|
||||
CF[0x0464]="\u0465";
|
||||
CF[0x0466]="\u0467";
|
||||
CF[0x0468]="\u0469";
|
||||
CF[0x046A]="\u046B";
|
||||
CF[0x046C]="\u046D";
|
||||
CF[0x046E]="\u046F";
|
||||
CF[0x0470]="\u0471";
|
||||
CF[0x0472]="\u0473";
|
||||
CF[0x0474]="\u0475";
|
||||
CF[0x0476]="\u0477";
|
||||
CF[0x0478]="\u0479";
|
||||
CF[0x047A]="\u047B";
|
||||
CF[0x047C]="\u047D";
|
||||
CF[0x047E]="\u047F";
|
||||
CF[0x0480]="\u0481";
|
||||
CF[0x048C]="\u048D";
|
||||
CF[0x048E]="\u048F";
|
||||
CF[0x0490]="\u0491";
|
||||
CF[0x0492]="\u0493";
|
||||
CF[0x0494]="\u0495";
|
||||
CF[0x0496]="\u0497";
|
||||
CF[0x0498]="\u0499";
|
||||
CF[0x049A]="\u049B";
|
||||
CF[0x049C]="\u049D";
|
||||
CF[0x049E]="\u049F";
|
||||
CF[0x04A0]="\u04A1";
|
||||
CF[0x04A2]="\u04A3";
|
||||
CF[0x04A4]="\u04A5";
|
||||
CF[0x04A6]="\u04A7";
|
||||
CF[0x04A8]="\u04A9";
|
||||
CF[0x04AA]="\u04AB";
|
||||
CF[0x04AC]="\u04AD";
|
||||
CF[0x04AE]="\u04AF";
|
||||
CF[0x04B0]="\u04B1";
|
||||
CF[0x04B2]="\u04B3";
|
||||
CF[0x04B4]="\u04B5";
|
||||
CF[0x04B6]="\u04B7";
|
||||
CF[0x04B8]="\u04B9";
|
||||
CF[0x04BA]="\u04BB";
|
||||
CF[0x04BC]="\u04BD";
|
||||
CF[0x04BE]="\u04BF";
|
||||
CF[0x04C1]="\u04C2";
|
||||
CF[0x04C3]="\u04C4";
|
||||
CF[0x04C7]="\u04C8";
|
||||
CF[0x04CB]="\u04CC";
|
||||
CF[0x04D0]="\u04D1";
|
||||
CF[0x04D2]="\u04D3";
|
||||
CF[0x04D4]="\u04D5";
|
||||
CF[0x04D6]="\u04D7";
|
||||
CF[0x04D8]="\u04D9";
|
||||
CF[0x04DA]="\u04DB";
|
||||
CF[0x04DC]="\u04DD";
|
||||
CF[0x04DE]="\u04DF";
|
||||
CF[0x04E0]="\u04E1";
|
||||
CF[0x04E2]="\u04E3";
|
||||
CF[0x04E4]="\u04E5";
|
||||
CF[0x04E6]="\u04E7";
|
||||
CF[0x04E8]="\u04E9";
|
||||
CF[0x04EA]="\u04EB";
|
||||
CF[0x04EC]="\u04ED";
|
||||
CF[0x04EE]="\u04EF";
|
||||
CF[0x04F0]="\u04F1";
|
||||
CF[0x04F2]="\u04F3";
|
||||
CF[0x04F4]="\u04F5";
|
||||
CF[0x04F8]="\u04F9";
|
||||
CF[0x0531]="\u0561";
|
||||
CF[0x0532]="\u0562";
|
||||
CF[0x0533]="\u0563";
|
||||
CF[0x0534]="\u0564";
|
||||
CF[0x0535]="\u0565";
|
||||
CF[0x0536]="\u0566";
|
||||
CF[0x0537]="\u0567";
|
||||
CF[0x0538]="\u0568";
|
||||
CF[0x0539]="\u0569";
|
||||
CF[0x053A]="\u056A";
|
||||
CF[0x053B]="\u056B";
|
||||
CF[0x053C]="\u056C";
|
||||
CF[0x053D]="\u056D";
|
||||
CF[0x053E]="\u056E";
|
||||
CF[0x053F]="\u056F";
|
||||
CF[0x0540]="\u0570";
|
||||
CF[0x0541]="\u0571";
|
||||
CF[0x0542]="\u0572";
|
||||
CF[0x0543]="\u0573";
|
||||
CF[0x0544]="\u0574";
|
||||
CF[0x0545]="\u0575";
|
||||
CF[0x0546]="\u0576";
|
||||
CF[0x0547]="\u0577";
|
||||
CF[0x0548]="\u0578";
|
||||
CF[0x0549]="\u0579";
|
||||
CF[0x054A]="\u057A";
|
||||
CF[0x054B]="\u057B";
|
||||
CF[0x054C]="\u057C";
|
||||
CF[0x054D]="\u057D";
|
||||
CF[0x054E]="\u057E";
|
||||
CF[0x054F]="\u057F";
|
||||
CF[0x0550]="\u0580";
|
||||
CF[0x0551]="\u0581";
|
||||
CF[0x0552]="\u0582";
|
||||
CF[0x0553]="\u0583";
|
||||
CF[0x0554]="\u0584";
|
||||
CF[0x0555]="\u0585";
|
||||
CF[0x0556]="\u0586";
|
||||
CF[0x0587]="\u0565\u0582";
|
||||
CF[0x1E00]="\u1E01";
|
||||
CF[0x1E02]="\u1E03";
|
||||
CF[0x1E04]="\u1E05";
|
||||
CF[0x1E06]="\u1E07";
|
||||
CF[0x1E08]="\u1E09";
|
||||
CF[0x1E0A]="\u1E0B";
|
||||
CF[0x1E0C]="\u1E0D";
|
||||
CF[0x1E0E]="\u1E0F";
|
||||
CF[0x1E10]="\u1E11";
|
||||
CF[0x1E12]="\u1E13";
|
||||
CF[0x1E14]="\u1E15";
|
||||
CF[0x1E16]="\u1E17";
|
||||
CF[0x1E18]="\u1E19";
|
||||
CF[0x1E1A]="\u1E1B";
|
||||
CF[0x1E1C]="\u1E1D";
|
||||
CF[0x1E1E]="\u1E1F";
|
||||
CF[0x1E20]="\u1E21";
|
||||
CF[0x1E22]="\u1E23";
|
||||
CF[0x1E24]="\u1E25";
|
||||
CF[0x1E26]="\u1E27";
|
||||
CF[0x1E28]="\u1E29";
|
||||
CF[0x1E2A]="\u1E2B";
|
||||
CF[0x1E2C]="\u1E2D";
|
||||
CF[0x1E2E]="\u1E2F";
|
||||
CF[0x1E30]="\u1E31";
|
||||
CF[0x1E32]="\u1E33";
|
||||
CF[0x1E34]="\u1E35";
|
||||
CF[0x1E36]="\u1E37";
|
||||
CF[0x1E38]="\u1E39";
|
||||
CF[0x1E3A]="\u1E3B";
|
||||
CF[0x1E3C]="\u1E3D";
|
||||
CF[0x1E3E]="\u1E3F";
|
||||
CF[0x1E40]="\u1E41";
|
||||
CF[0x1E42]="\u1E43";
|
||||
CF[0x1E44]="\u1E45";
|
||||
CF[0x1E46]="\u1E47";
|
||||
CF[0x1E48]="\u1E49";
|
||||
CF[0x1E4A]="\u1E4B";
|
||||
CF[0x1E4C]="\u1E4D";
|
||||
CF[0x1E4E]="\u1E4F";
|
||||
CF[0x1E50]="\u1E51";
|
||||
CF[0x1E52]="\u1E53";
|
||||
CF[0x1E54]="\u1E55";
|
||||
CF[0x1E56]="\u1E57";
|
||||
CF[0x1E58]="\u1E59";
|
||||
CF[0x1E5A]="\u1E5B";
|
||||
CF[0x1E5C]="\u1E5D";
|
||||
CF[0x1E5E]="\u1E5F";
|
||||
CF[0x1E60]="\u1E61";
|
||||
CF[0x1E62]="\u1E63";
|
||||
CF[0x1E64]="\u1E65";
|
||||
CF[0x1E66]="\u1E67";
|
||||
CF[0x1E68]="\u1E69";
|
||||
CF[0x1E6A]="\u1E6B";
|
||||
CF[0x1E6C]="\u1E6D";
|
||||
CF[0x1E6E]="\u1E6F";
|
||||
CF[0x1E70]="\u1E71";
|
||||
CF[0x1E72]="\u1E73";
|
||||
CF[0x1E74]="\u1E75";
|
||||
CF[0x1E76]="\u1E77";
|
||||
CF[0x1E78]="\u1E79";
|
||||
CF[0x1E7A]="\u1E7B";
|
||||
CF[0x1E7C]="\u1E7D";
|
||||
CF[0x1E7E]="\u1E7F";
|
||||
CF[0x1E80]="\u1E81";
|
||||
CF[0x1E82]="\u1E83";
|
||||
CF[0x1E84]="\u1E85";
|
||||
CF[0x1E86]="\u1E87";
|
||||
CF[0x1E88]="\u1E89";
|
||||
CF[0x1E8A]="\u1E8B";
|
||||
CF[0x1E8C]="\u1E8D";
|
||||
CF[0x1E8E]="\u1E8F";
|
||||
CF[0x1E90]="\u1E91";
|
||||
CF[0x1E92]="\u1E93";
|
||||
CF[0x1E94]="\u1E95";
|
||||
CF[0x1E96]="\u0068\u0331";
|
||||
CF[0x1E97]="\u0074\u0308";
|
||||
CF[0x1E98]="\u0077\u030A";
|
||||
CF[0x1E99]="\u0079\u030A";
|
||||
CF[0x1E9A]="\u0061\u02BE";
|
||||
CF[0x1E9B]="\u1E61";
|
||||
CF[0x1EA0]="\u1EA1";
|
||||
CF[0x1EA2]="\u1EA3";
|
||||
CF[0x1EA4]="\u1EA5";
|
||||
CF[0x1EA6]="\u1EA7";
|
||||
CF[0x1EA8]="\u1EA9";
|
||||
CF[0x1EAA]="\u1EAB";
|
||||
CF[0x1EAC]="\u1EAD";
|
||||
CF[0x1EAE]="\u1EAF";
|
||||
CF[0x1EB0]="\u1EB1";
|
||||
CF[0x1EB2]="\u1EB3";
|
||||
CF[0x1EB4]="\u1EB5";
|
||||
CF[0x1EB6]="\u1EB7";
|
||||
CF[0x1EB8]="\u1EB9";
|
||||
CF[0x1EBA]="\u1EBB";
|
||||
CF[0x1EBC]="\u1EBD";
|
||||
CF[0x1EBE]="\u1EBF";
|
||||
CF[0x1EC0]="\u1EC1";
|
||||
CF[0x1EC2]="\u1EC3";
|
||||
CF[0x1EC4]="\u1EC5";
|
||||
CF[0x1EC6]="\u1EC7";
|
||||
CF[0x1EC8]="\u1EC9";
|
||||
CF[0x1ECA]="\u1ECB";
|
||||
CF[0x1ECC]="\u1ECD";
|
||||
CF[0x1ECE]="\u1ECF";
|
||||
CF[0x1ED0]="\u1ED1";
|
||||
CF[0x1ED2]="\u1ED3";
|
||||
CF[0x1ED4]="\u1ED5";
|
||||
CF[0x1ED6]="\u1ED7";
|
||||
CF[0x1ED8]="\u1ED9";
|
||||
CF[0x1EDA]="\u1EDB";
|
||||
CF[0x1EDC]="\u1EDD";
|
||||
CF[0x1EDE]="\u1EDF";
|
||||
CF[0x1EE0]="\u1EE1";
|
||||
CF[0x1EE2]="\u1EE3";
|
||||
CF[0x1EE4]="\u1EE5";
|
||||
CF[0x1EE6]="\u1EE7";
|
||||
CF[0x1EE8]="\u1EE9";
|
||||
CF[0x1EEA]="\u1EEB";
|
||||
CF[0x1EEC]="\u1EED";
|
||||
CF[0x1EEE]="\u1EEF";
|
||||
CF[0x1EF0]="\u1EF1";
|
||||
CF[0x1EF2]="\u1EF3";
|
||||
CF[0x1EF4]="\u1EF5";
|
||||
CF[0x1EF6]="\u1EF7";
|
||||
CF[0x1EF8]="\u1EF9";
|
||||
CF[0x1F08]="\u1F00";
|
||||
CF[0x1F09]="\u1F01";
|
||||
CF[0x1F0A]="\u1F02";
|
||||
CF[0x1F0B]="\u1F03";
|
||||
CF[0x1F0C]="\u1F04";
|
||||
CF[0x1F0D]="\u1F05";
|
||||
CF[0x1F0E]="\u1F06";
|
||||
CF[0x1F0F]="\u1F07";
|
||||
CF[0x1F18]="\u1F10";
|
||||
CF[0x1F19]="\u1F11";
|
||||
CF[0x1F1A]="\u1F12";
|
||||
CF[0x1F1B]="\u1F13";
|
||||
CF[0x1F1C]="\u1F14";
|
||||
CF[0x1F1D]="\u1F15";
|
||||
CF[0x1F28]="\u1F20";
|
||||
CF[0x1F29]="\u1F21";
|
||||
CF[0x1F2A]="\u1F22";
|
||||
CF[0x1F2B]="\u1F23";
|
||||
CF[0x1F2C]="\u1F24";
|
||||
CF[0x1F2D]="\u1F25";
|
||||
CF[0x1F2E]="\u1F26";
|
||||
CF[0x1F2F]="\u1F27";
|
||||
CF[0x1F38]="\u1F30";
|
||||
CF[0x1F39]="\u1F31";
|
||||
CF[0x1F3A]="\u1F32";
|
||||
CF[0x1F3B]="\u1F33";
|
||||
CF[0x1F3C]="\u1F34";
|
||||
CF[0x1F3D]="\u1F35";
|
||||
CF[0x1F3E]="\u1F36";
|
||||
CF[0x1F3F]="\u1F37";
|
||||
CF[0x1F48]="\u1F40";
|
||||
CF[0x1F49]="\u1F41";
|
||||
CF[0x1F4A]="\u1F42";
|
||||
CF[0x1F4B]="\u1F43";
|
||||
CF[0x1F4C]="\u1F44";
|
||||
CF[0x1F4D]="\u1F45";
|
||||
CF[0x1F50]="\u03C5\u0313";
|
||||
CF[0x1F52]="\u03C5\u0313\u0300";
|
||||
CF[0x1F54]="\u03C5\u0313\u0301";
|
||||
CF[0x1F56]="\u03C5\u0313\u0342";
|
||||
CF[0x1F59]="\u1F51";
|
||||
CF[0x1F5B]="\u1F53";
|
||||
CF[0x1F5D]="\u1F55";
|
||||
CF[0x1F5F]="\u1F57";
|
||||
CF[0x1F68]="\u1F60";
|
||||
CF[0x1F69]="\u1F61";
|
||||
CF[0x1F6A]="\u1F62";
|
||||
CF[0x1F6B]="\u1F63";
|
||||
CF[0x1F6C]="\u1F64";
|
||||
CF[0x1F6D]="\u1F65";
|
||||
CF[0x1F6E]="\u1F66";
|
||||
CF[0x1F6F]="\u1F67";
|
||||
CF[0x1F80]="\u1F00\u03B9";
|
||||
CF[0x1F81]="\u1F01\u03B9";
|
||||
CF[0x1F82]="\u1F02\u03B9";
|
||||
CF[0x1F83]="\u1F03\u03B9";
|
||||
CF[0x1F84]="\u1F04\u03B9";
|
||||
CF[0x1F85]="\u1F05\u03B9";
|
||||
CF[0x1F86]="\u1F06\u03B9";
|
||||
CF[0x1F87]="\u1F07\u03B9";
|
||||
CF[0x1F88]="\u1F00\u03B9";
|
||||
CF[0x1F89]="\u1F01\u03B9";
|
||||
CF[0x1F8A]="\u1F02\u03B9";
|
||||
CF[0x1F8B]="\u1F03\u03B9";
|
||||
CF[0x1F8C]="\u1F04\u03B9";
|
||||
CF[0x1F8D]="\u1F05\u03B9";
|
||||
CF[0x1F8E]="\u1F06\u03B9";
|
||||
CF[0x1F8F]="\u1F07\u03B9";
|
||||
CF[0x1F90]="\u1F20\u03B9";
|
||||
CF[0x1F91]="\u1F21\u03B9";
|
||||
CF[0x1F92]="\u1F22\u03B9";
|
||||
CF[0x1F93]="\u1F23\u03B9";
|
||||
CF[0x1F94]="\u1F24\u03B9";
|
||||
CF[0x1F95]="\u1F25\u03B9";
|
||||
CF[0x1F96]="\u1F26\u03B9";
|
||||
CF[0x1F97]="\u1F27\u03B9";
|
||||
CF[0x1F98]="\u1F20\u03B9";
|
||||
CF[0x1F99]="\u1F21\u03B9";
|
||||
CF[0x1F9A]="\u1F22\u03B9";
|
||||
CF[0x1F9B]="\u1F23\u03B9";
|
||||
CF[0x1F9C]="\u1F24\u03B9";
|
||||
CF[0x1F9D]="\u1F25\u03B9";
|
||||
CF[0x1F9E]="\u1F26\u03B9";
|
||||
CF[0x1F9F]="\u1F27\u03B9";
|
||||
CF[0x1FA0]="\u1F60\u03B9";
|
||||
CF[0x1FA1]="\u1F61\u03B9";
|
||||
CF[0x1FA2]="\u1F62\u03B9";
|
||||
CF[0x1FA3]="\u1F63\u03B9";
|
||||
CF[0x1FA4]="\u1F64\u03B9";
|
||||
CF[0x1FA5]="\u1F65\u03B9";
|
||||
CF[0x1FA6]="\u1F66\u03B9";
|
||||
CF[0x1FA7]="\u1F67\u03B9";
|
||||
CF[0x1FA8]="\u1F60\u03B9";
|
||||
CF[0x1FA9]="\u1F61\u03B9";
|
||||
CF[0x1FAA]="\u1F62\u03B9";
|
||||
CF[0x1FAB]="\u1F63\u03B9";
|
||||
CF[0x1FAC]="\u1F64\u03B9";
|
||||
CF[0x1FAD]="\u1F65\u03B9";
|
||||
CF[0x1FAE]="\u1F66\u03B9";
|
||||
CF[0x1FAF]="\u1F67\u03B9";
|
||||
CF[0x1FB2]="\u1F70\u03B9";
|
||||
CF[0x1FB3]="\u03B1\u03B9";
|
||||
CF[0x1FB4]="\u03AC\u03B9";
|
||||
CF[0x1FB6]="\u03B1\u0342";
|
||||
CF[0x1FB7]="\u03B1\u0342\u03B9";
|
||||
CF[0x1FB8]="\u1FB0";
|
||||
CF[0x1FB9]="\u1FB1";
|
||||
CF[0x1FBA]="\u1F70";
|
||||
CF[0x1FBB]="\u1F71";
|
||||
CF[0x1FBC]="\u03B1\u03B9";
|
||||
CF[0x1FBE]="\u03B9";
|
||||
CF[0x1FC2]="\u1F74\u03B9";
|
||||
CF[0x1FC3]="\u03B7\u03B9";
|
||||
CF[0x1FC4]="\u03AE\u03B9";
|
||||
CF[0x1FC6]="\u03B7\u0342";
|
||||
CF[0x1FC7]="\u03B7\u0342\u03B9";
|
||||
CF[0x1FC8]="\u1F72";
|
||||
CF[0x1FC9]="\u1F73";
|
||||
CF[0x1FCA]="\u1F74";
|
||||
CF[0x1FCB]="\u1F75";
|
||||
CF[0x1FCC]="\u03B7\u03B9";
|
||||
CF[0x1FD2]="\u03B9\u0308\u0300";
|
||||
CF[0x1FD3]="\u03B9\u0308\u0301";
|
||||
CF[0x1FD6]="\u03B9\u0342";
|
||||
CF[0x1FD7]="\u03B9\u0308\u0342";
|
||||
CF[0x1FD8]="\u1FD0";
|
||||
CF[0x1FD9]="\u1FD1";
|
||||
CF[0x1FDA]="\u1F76";
|
||||
CF[0x1FDB]="\u1F77";
|
||||
CF[0x1FE2]="\u03C5\u0308\u0300";
|
||||
CF[0x1FE3]="\u03C5\u0308\u0301";
|
||||
CF[0x1FE4]="\u03C1\u0313";
|
||||
CF[0x1FE6]="\u03C5\u0342";
|
||||
CF[0x1FE7]="\u03C5\u0308\u0342";
|
||||
CF[0x1FE8]="\u1FE0";
|
||||
CF[0x1FE9]="\u1FE1";
|
||||
CF[0x1FEA]="\u1F7A";
|
||||
CF[0x1FEB]="\u1F7B";
|
||||
CF[0x1FEC]="\u1FE5";
|
||||
CF[0x1FF2]="\u1F7C\u03B9";
|
||||
CF[0x1FF3]="\u03C9\u03B9";
|
||||
CF[0x1FF4]="\u03CE\u03B9";
|
||||
CF[0x1FF6]="\u03C9\u0342";
|
||||
CF[0x1FF7]="\u03C9\u0342\u03B9";
|
||||
CF[0x1FF8]="\u1F78";
|
||||
CF[0x1FF9]="\u1F79";
|
||||
CF[0x1FFA]="\u1F7C";
|
||||
CF[0x1FFB]="\u1F7D";
|
||||
CF[0x1FFC]="\u03C9\u03B9";
|
||||
CF[0x2126]="\u03C9";
|
||||
CF[0x212A]="\u006B";
|
||||
CF[0x212B]="\u00E5";
|
||||
CF[0x2160]="\u2170";
|
||||
CF[0x2161]="\u2171";
|
||||
CF[0x2162]="\u2172";
|
||||
CF[0x2163]="\u2173";
|
||||
CF[0x2164]="\u2174";
|
||||
CF[0x2165]="\u2175";
|
||||
CF[0x2166]="\u2176";
|
||||
CF[0x2167]="\u2177";
|
||||
CF[0x2168]="\u2178";
|
||||
CF[0x2169]="\u2179";
|
||||
CF[0x216A]="\u217A";
|
||||
CF[0x216B]="\u217B";
|
||||
CF[0x216C]="\u217C";
|
||||
CF[0x216D]="\u217D";
|
||||
CF[0x216E]="\u217E";
|
||||
CF[0x216F]="\u217F";
|
||||
CF[0x24B6]="\u24D0";
|
||||
CF[0x24B7]="\u24D1";
|
||||
CF[0x24B8]="\u24D2";
|
||||
CF[0x24B9]="\u24D3";
|
||||
CF[0x24BA]="\u24D4";
|
||||
CF[0x24BB]="\u24D5";
|
||||
CF[0x24BC]="\u24D6";
|
||||
CF[0x24BD]="\u24D7";
|
||||
CF[0x24BE]="\u24D8";
|
||||
CF[0x24BF]="\u24D9";
|
||||
CF[0x24C0]="\u24DA";
|
||||
CF[0x24C1]="\u24DB";
|
||||
CF[0x24C2]="\u24DC";
|
||||
CF[0x24C3]="\u24DD";
|
||||
CF[0x24C4]="\u24DE";
|
||||
CF[0x24C5]="\u24DF";
|
||||
CF[0x24C6]="\u24E0";
|
||||
CF[0x24C7]="\u24E1";
|
||||
CF[0x24C8]="\u24E2";
|
||||
CF[0x24C9]="\u24E3";
|
||||
CF[0x24CA]="\u24E4";
|
||||
CF[0x24CB]="\u24E5";
|
||||
CF[0x24CC]="\u24E6";
|
||||
CF[0x24CD]="\u24E7";
|
||||
CF[0x24CE]="\u24E8";
|
||||
CF[0x24CF]="\u24E9";
|
||||
CF[0xFB00]="\u0066\u0066";
|
||||
CF[0xFB01]="\u0066\u0069";
|
||||
CF[0xFB02]="\u0066\u006C";
|
||||
CF[0xFB03]="\u0066\u0066\u0069";
|
||||
CF[0xFB04]="\u0066\u0066\u006C";
|
||||
CF[0xFB05]="\u0073\u0074";
|
||||
CF[0xFB06]="\u0073\u0074";
|
||||
CF[0xFB13]="\u0574\u0576";
|
||||
CF[0xFB14]="\u0574\u0565";
|
||||
CF[0xFB15]="\u0574\u056B";
|
||||
CF[0xFB16]="\u057E\u0576";
|
||||
CF[0xFB17]="\u0574\u056D";
|
||||
CF[0xFF21]="\uFF41";
|
||||
CF[0xFF22]="\uFF42";
|
||||
CF[0xFF23]="\uFF43";
|
||||
CF[0xFF24]="\uFF44";
|
||||
CF[0xFF25]="\uFF45";
|
||||
CF[0xFF26]="\uFF46";
|
||||
CF[0xFF27]="\uFF47";
|
||||
CF[0xFF28]="\uFF48";
|
||||
CF[0xFF29]="\uFF49";
|
||||
CF[0xFF2A]="\uFF4A";
|
||||
CF[0xFF2B]="\uFF4B";
|
||||
CF[0xFF2C]="\uFF4C";
|
||||
CF[0xFF2D]="\uFF4D";
|
||||
CF[0xFF2E]="\uFF4E";
|
||||
CF[0xFF2F]="\uFF4F";
|
||||
CF[0xFF30]="\uFF50";
|
||||
CF[0xFF31]="\uFF51";
|
||||
CF[0xFF32]="\uFF52";
|
||||
CF[0xFF33]="\uFF53";
|
||||
CF[0xFF34]="\uFF54";
|
||||
CF[0xFF35]="\uFF55";
|
||||
CF[0xFF36]="\uFF56";
|
||||
CF[0xFF37]="\uFF57";
|
||||
CF[0xFF38]="\uFF58";
|
||||
CF[0xFF39]="\uFF59";
|
||||
CF[0xFF3A]="\uFF5A";
|
||||
// 785 case foldings total
|
||||
}
|
||||
}
|
@ -1,369 +0,0 @@
|
||||
<html><body>
|
||||
<h1
|
||||
>1. Mismatches when NFD is OFF</h1><h2
|
||||
>Date:Mon Jun 03 08:45:38 PDT 2002</h2><h2
|
||||
>File Version:-3.1.1d1</h2><p
|
||||
>Alternate Handling = NON_IGNORABLE</p><table border="1"
|
||||
><caption
|
||||
>Mismatches in UCA-NOD: Plain vs NFC: 4</caption><tr
|
||||
><th
|
||||
>Code</th><th
|
||||
>Type</th><th
|
||||
>CC?</th><th
|
||||
>Key</th></tr><tr
|
||||
><th rowSpan="2" align="right"
|
||||
>F951 CJK COMPATIBILITY IDEOGRAPH-F951<br
|
||||
></br>NFC=964B</th><th
|
||||
>Plain</th><th
|
||||
>n</th><td
|
||||
>[FF41 96FB | 0020 0020 | 0002 0002]</td></tr><tr
|
||||
><th
|
||||
>NFC</th><th
|
||||
>ERROR</th><td
|
||||
>[FF41 964B | 0020 0020 | 0002 0002]</td></tr><tr
|
||||
><th rowSpan="2" align="right"
|
||||
>FB1F HEBREW LIGATURE YIDDISH YOD YOD PATAH<br
|
||||
></br>NFC=05F2 05B7</th><th
|
||||
>Plain</th><th
|
||||
>n</th><td
|
||||
>[0EC0 0EC0 | 0020 0020 00B2 | 0004 0004 001F]</td></tr><tr
|
||||
|
||||
><th
|
||||
>NFC</th><th
|
||||
>Y</th><td
|
||||
>[0EC0 0EC0 | 0020 0020 00B2 | 0004 0004 0002]</td></tr><tr
|
||||
|
||||
><th rowSpan="2" align="right"
|
||||
>FB3A HEBREW LETTER FINAL KAF WITH DAGESH<br
|
||||
></br>NFC=05DA 05BC</th><th
|
||||
>Plain</th><th
|
||||
>n</th><td
|
||||
>[0EC1 | 0020 00B6 | 0019 0019]</td></tr><tr
|
||||
><th
|
||||
>NFC</th><th
|
||||
>Y</th><td
|
||||
>[0EC1 | 0020 00B6 | 0019 0002]</td></tr><tr
|
||||
><th rowSpan="2" align="right"
|
||||
>FB43 HEBREW LETTER FINAL PE WITH DAGESH<br
|
||||
></br>NFC=05E3 05BC</th><th
|
||||
>Plain</th><th
|
||||
>n</th><td
|
||||
>[0EC7 | 0020 00B6 | 0019 0019]</td></tr><tr
|
||||
><th
|
||||
>NFC</th><th
|
||||
>Y</th><td
|
||||
>[0EC7 | 0020 00B6 | 0019 0002]</td></tr></table><br>
|
||||
<h1>2. Differences in Ordering</h1>
|
||||
<p>Codes and names are in the white rows: bold means that the NO-NFD sort key differs from the UCA key.</p>
|
||||
<p>Keys are in the light blue rows: green is the bad key, blue is the UCA key, and black is where they are equal.</p>
|
||||
<table border='1'>
|
||||
<tr><th>File Order</th><th>Code and Decomp</th><th>Key and Decomp-Key</th></tr>
|
||||
<tr><td colspan='3'></td></tr>
|
||||
<tr><td>12573</td><td>F951 CJK COMPATIBILITY IDEOGRAPH-F951<br>&lt;964B&gt; </td><td>
|
||||
<font color='#009900'>[FF41 96FB | 0020 0020 | 0002 0002 | |]</font><br><font color='#000099'>[FF41 964B | 0020 0020 | 0002 0002 | |]</font>
|
||||
</td></tr>
|
||||
<tr><td>12574</td><td>FA09 CJK COMPATIBILITY IDEOGRAPH-FA09<br>&lt;964D&gt; </td><td>
|
||||
[FF41 964D | 0020 0020 | 0002 0002 | |]
|
||||
</td></tr>
|
||||
</table>
|
||||
<h1>3. Primaries Incompatible with Decompositions</h1><table border='1'>
|
||||
<tr><th>Code</th><th>Sort Key</th><th>Decomposed Sort Key</th><th>Name</th></tr>
|
||||
<tr><td>00A8</td><td>[0214]</td><td>[0209]</td><td>DIAERESIS</td></tr>
|
||||
<tr><td>00AF</td><td>[0210]</td><td>[0209]</td><td>MACRON</td></tr>
|
||||
<tr><td>00B4</td><td>[020D]</td><td>[0209]</td><td>ACUTE ACCENT</td></tr>
|
||||
<tr><td>00B8</td><td>[0219]</td><td>[0209]</td><td>CEDILLA</td></tr>
|
||||
<tr><td>02D8</td><td>[0212]</td><td>[0209]</td><td>BREVE</td></tr>
|
||||
<tr><td>02D9</td><td>[0213]</td><td>[0209]</td><td>DOT ABOVE</td></tr>
|
||||
<tr><td>02DA</td><td>[0215]</td><td>[0209]</td><td>RING ABOVE</td></tr>
|
||||
<tr><td>02DB</td><td>[021A]</td><td>[0209]</td><td>OGONEK</td></tr>
|
||||
<tr><td>02DC</td><td>[020E]</td><td>[0209]</td><td>SMALL TILDE</td></tr>
|
||||
<tr><td>02DD</td><td>[0216]</td><td>[0209]</td><td>DOUBLE ACUTE ACCENT</td></tr>
|
||||
<tr><td>037A</td><td>[0C9B]</td><td>[0209]</td><td>GREEK YPOGEGRAMMENI</td></tr>
|
||||
<tr><td>0384</td><td>[020D]</td><td>[0209]</td><td>GREEK TONOS</td></tr>
|
||||
<tr><td>0385</td><td>[0214]</td><td>[0209]</td><td>GREEK DIALYTIKA TONOS</td></tr>
|
||||
<tr><td>0CCB</td><td>[12C4]</td><td>[12C3 12C7]</td><td>KANNADA VOWEL SIGN OO</td></tr>
|
||||
<tr><td>0DDD</td><td>[1353]</td><td>[1352 1346]</td><td>SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA</td></tr>
|
||||
<tr><td>1FBD</td><td>[0217]</td><td>[0209]</td><td>GREEK KORONIS</td></tr>
|
||||
<tr><td>1FBF</td><td>[0217]</td><td>[0209]</td><td>GREEK PSILI</td></tr>
|
||||
<tr><td>1FC0</td><td>[021D]</td><td>[0209]</td><td>GREEK PERISPOMENI</td></tr>
|
||||
<tr><td>1FC1</td><td>[0214]</td><td>[0209]</td><td>GREEK DIALYTIKA AND PERISPOMENI</td></tr>
|
||||
<tr><td>1FCD</td><td>[0217]</td><td>[0209]</td><td>GREEK PSILI AND VARIA</td></tr>
|
||||
<tr><td>1FCE</td><td>[0217]</td><td>[0209]</td><td>GREEK PSILI AND OXIA</td></tr>
|
||||
<tr><td>1FCF</td><td>[0217]</td><td>[0209]</td><td>GREEK PSILI AND PERISPOMENI</td></tr>
|
||||
<tr><td>1FDD</td><td>[0218]</td><td>[0209]</td><td>GREEK DASIA AND VARIA</td></tr>
|
||||
<tr><td>1FDE</td><td>[0218]</td><td>[0209]</td><td>GREEK DASIA AND OXIA</td></tr>
|
||||
<tr><td>1FDF</td><td>[0218]</td><td>[0209]</td><td>GREEK DASIA AND PERISPOMENI</td></tr>
|
||||
<tr><td>1FED</td><td>[0214]</td><td>[0209]</td><td>GREEK DIALYTIKA AND VARIA</td></tr>
|
||||
<tr><td>1FEE</td><td>[0214]</td><td>[0209]</td><td>GREEK DIALYTIKA AND OXIA</td></tr>
|
||||
<tr><td>1FFD</td><td>[020D]</td><td>[0209]</td><td>GREEK OXIA</td></tr>
|
||||
<tr><td>1FFE</td><td>[0218]</td><td>[0209]</td><td>GREEK DASIA</td></tr>
|
||||
<tr><td>2017</td><td>[021C]</td><td>[0209]</td><td>DOUBLE LOW LINE</td></tr>
|
||||
<tr><td>203E</td><td>[0211]</td><td>[0209]</td><td>OVERLINE</td></tr>
|
||||
<tr><td>2047</td><td>[FFC0 A047]</td><td>[024E 024E]</td><td>DOUBLE QUESTION MARK</td></tr>
|
||||
<tr><td>2057</td><td>[FFC0 A057]</td><td>[02B6 02B6 02B6 02B6]</td><td>QUADRUPLE PRIME</td></tr>
|
||||
<tr><td>205F</td><td>[FFC0 A05F]</td><td>[0209]</td><td>MEDIUM MATHEMATICAL SPACE</td></tr>
|
||||
<tr><td>2071</td><td>[FFC0 A071]</td><td>[0AD3]</td><td>SUPERSCRIPT LATIN SMALL LETTER I</td></tr>
|
||||
<tr><td>213D</td><td>[FFC0 A13D]</td><td>[0C93]</td><td>DOUBLE-STRUCK SMALL GAMMA</td></tr>
|
||||
<tr><td>213E</td><td>[FFC0 A13E]</td><td>[0C93]</td><td>DOUBLE-STRUCK CAPITAL GAMMA</td></tr>
|
||||
<tr><td>213F</td><td>[FFC0 A13F]</td><td>[0CA3]</td><td>DOUBLE-STRUCK CAPITAL PI</td></tr>
|
||||
<tr><td>2140</td><td>[FFC0 A140]</td><td>[039E]</td><td>DOUBLE-STRUCK N-ARY SUMMATION</td></tr>
|
||||
<tr><td>2145</td><td>[FFC0 A145]</td><td>[0A49]</td><td>DOUBLE-STRUCK ITALIC CAPITAL D</td></tr>
|
||||
<tr><td>2146</td><td>[FFC0 A146]</td><td>[0A49]</td><td>DOUBLE-STRUCK ITALIC SMALL D</td></tr>
|
||||
<tr><td>2147</td><td>[FFC0 A147]</td><td>[0A65]</td><td>DOUBLE-STRUCK ITALIC SMALL E</td></tr>
|
||||
<tr><td>2148</td><td>[FFC0 A148]</td><td>[0AD3]</td><td>DOUBLE-STRUCK ITALIC SMALL I</td></tr>
|
||||
<tr><td>2149</td><td>[FFC0 A149]</td><td>[0AE7]</td><td>DOUBLE-STRUCK ITALIC SMALL J</td></tr>
|
||||
<tr><td>2A0C</td><td>[FFC0 AA0C]</td><td>[03C2 03C2 03C2 03C2]</td><td>QUADRUPLE INTEGRAL OPERATOR</td></tr>
|
||||
<tr><td>2A74</td><td>[FFC0 AA74]</td><td>[0237 0237 03A4]</td><td>DOUBLE COLON EQUAL</td></tr>
|
||||
<tr><td>2A75</td><td>[FFC0 AA75]</td><td>[03A4 03A4]</td><td>TWO CONSECUTIVE EQUALS SIGNS</td></tr>
|
||||
<tr><td>2A76</td><td>[FFC0 AA76]</td><td>[03A4 03A4 03A4]</td><td>THREE CONSECUTIVE EQUALS SIGNS</td></tr>
|
||||
<tr><td>2ADC</td><td>[FFC0 AADC]</td><td>[FFC0 AADD]</td><td>FORKING</td></tr>
|
||||
<tr><td>309B</td><td>[021E]</td><td>[0209]</td><td>KATAKANA-HIRAGANA VOICED SOUND MARK</td></tr>
|
||||
<tr><td>309C</td><td>[021F]</td><td>[0209]</td><td>KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK</td></tr>
|
||||
<tr><td>309F</td><td>[FFC0 B09F]</td><td>[1946 1948]</td><td>HIRAGANA DIGRAPH YORI</td></tr>
|
||||
<tr><td>30FF</td><td>[FFC0 B0FF]</td><td>[192A 1934]</td><td>KATAKANA DIGRAPH KOTO</td></tr>
|
||||
<tr><td>3251</td><td>[FFC0 B251]</td><td>[0A0D 0A0C]</td><td>CIRCLED NUMBER TWENTY ONE</td></tr>
|
||||
<tr><td>3252</td><td>[FFC0 B252]</td><td>[0A0D 0A0D]</td><td>CIRCLED NUMBER TWENTY TWO</td></tr>
|
||||
<tr><td>3253</td><td>[FFC0 B253]</td><td>[0A0D 0A0E]</td><td>CIRCLED NUMBER TWENTY THREE</td></tr>
|
||||
<tr><td>3254</td><td>[FFC0 B254]</td><td>[0A0D 0A0F]</td><td>CIRCLED NUMBER TWENTY FOUR</td></tr>
|
||||
<tr><td>3255</td><td>[FFC0 B255]</td><td>[0A0D 0A10]</td><td>CIRCLED NUMBER TWENTY FIVE</td></tr>
|
||||
<tr><td>3256</td><td>[FFC0 B256]</td><td>[0A0D 0A11]</td><td>CIRCLED NUMBER TWENTY SIX</td></tr>
|
||||
<tr><td>3257</td><td>[FFC0 B257]</td><td>[0A0D 0A12]</td><td>CIRCLED NUMBER TWENTY SEVEN</td></tr>
|
||||
<tr><td>3258</td><td>[FFC0 B258]</td><td>[0A0D 0A13]</td><td>CIRCLED NUMBER TWENTY EIGHT</td></tr>
|
||||
<tr><td>3259</td><td>[FFC0 B259]</td><td>[0A0D 0A14]</td><td>CIRCLED NUMBER TWENTY NINE</td></tr>
|
||||
<tr><td>325A</td><td>[FFC0 B25A]</td><td>[0A0E 0A0B]</td><td>CIRCLED NUMBER THIRTY</td></tr>
|
||||
<tr><td>325B</td><td>[FFC0 B25B]</td><td>[0A0E 0A0C]</td><td>CIRCLED NUMBER THIRTY ONE</td></tr>
|
||||
<tr><td>325C</td><td>[FFC0 B25C]</td><td>[0A0E 0A0D]</td><td>CIRCLED NUMBER THIRTY TWO</td></tr>
|
||||
<tr><td>325D</td><td>[FFC0 B25D]</td><td>[0A0E 0A0E]</td><td>CIRCLED NUMBER THIRTY THREE</td></tr>
|
||||
<tr><td>325E</td><td>[FFC0 B25E]</td><td>[0A0E 0A0F]</td><td>CIRCLED NUMBER THIRTY FOUR</td></tr>
|
||||
<tr><td>325F</td><td>[FFC0 B25F]</td><td>[0A0E 0A10]</td><td>CIRCLED NUMBER THIRTY FIVE</td></tr>
|
||||
<tr><td>32B1</td><td>[FFC0 B2B1]</td><td>[0A0E 0A11]</td><td>CIRCLED NUMBER THIRTY SIX</td></tr>
|
||||
<tr><td>32B2</td><td>[FFC0 B2B2]</td><td>[0A0E 0A12]</td><td>CIRCLED NUMBER THIRTY SEVEN</td></tr>
|
||||
<tr><td>32B3</td><td>[FFC0 B2B3]</td><td>[0A0E 0A13]</td><td>CIRCLED NUMBER THIRTY EIGHT</td></tr>
|
||||
<tr><td>32B4</td><td>[FFC0 B2B4]</td><td>[0A0E 0A14]</td><td>CIRCLED NUMBER THIRTY NINE</td></tr>
|
||||
<tr><td>32B5</td><td>[FFC0 B2B5]</td><td>[0A0F 0A0B]</td><td>CIRCLED NUMBER FORTY</td></tr>
|
||||
<tr><td>32B6</td><td>[FFC0 B2B6]</td><td>[0A0F 0A0C]</td><td>CIRCLED NUMBER FORTY ONE</td></tr>
|
||||
<tr><td>32B7</td><td>[FFC0 B2B7]</td><td>[0A0F 0A0D]</td><td>CIRCLED NUMBER FORTY TWO</td></tr>
|
||||
<tr><td>32B8</td><td>[FFC0 B2B8]</td><td>[0A0F 0A0E]</td><td>CIRCLED NUMBER FORTY THREE</td></tr>
|
||||
<tr><td>32B9</td><td>[FFC0 B2B9]</td><td>[0A0F 0A0F]</td><td>CIRCLED NUMBER FORTY FOUR</td></tr>
|
||||
<tr><td>32BA</td><td>[FFC0 B2BA]</td><td>[0A0F 0A10]</td><td>CIRCLED NUMBER FORTY FIVE</td></tr>
|
||||
<tr><td>32BB</td><td>[FFC0 B2BB]</td><td>[0A0F 0A11]</td><td>CIRCLED NUMBER FORTY SIX</td></tr>
|
||||
<tr><td>32BC</td><td>[FFC0 B2BC]</td><td>[0A0F 0A12]</td><td>CIRCLED NUMBER FORTY SEVEN</td></tr>
|
||||
<tr><td>32BD</td><td>[FFC0 B2BD]</td><td>[0A0F 0A13]</td><td>CIRCLED NUMBER FORTY EIGHT</td></tr>
|
||||
<tr><td>32BE</td><td>[FFC0 B2BE]</td><td>[0A0F 0A14]</td><td>CIRCLED NUMBER FORTY NINE</td></tr>
|
||||
<tr><td>32BF</td><td>[FFC0 B2BF]</td><td>[0A10 0A0B]</td><td>CIRCLED NUMBER FIFTY</td></tr>
|
||||
<tr><td>F951</td><td>[FF41 96FB]</td><td>[FF41 964B]</td><td>CJK COMPATIBILITY IDEOGRAPH-F951</td></tr>
|
||||
<tr><td>FA30</td><td>[FFC1 FA30]</td><td>[FF40 CFAE]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA30</td></tr>
|
||||
<tr><td>FA31</td><td>[FFC1 FA31]</td><td>[FF40 D0E7]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA31</td></tr>
|
||||
<tr><td>FA32</td><td>[FFC1 FA32]</td><td>[FF40 D14D]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA32</td></tr>
|
||||
<tr><td>FA33</td><td>[FFC1 FA33]</td><td>[FF40 D2C9]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA33</td></tr>
|
||||
<tr><td>FA34</td><td>[FFC1 FA34]</td><td>[FF40 D2E4]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA34</td></tr>
|
||||
<tr><td>FA35</td><td>[FFC1 FA35]</td><td>[FF40 D351]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA35</td></tr>
|
||||
<tr><td>FA36</td><td>[FFC1 FA36]</td><td>[FF40 D59D]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA36</td></tr>
|
||||
<tr><td>FA37</td><td>[FFC1 FA37]</td><td>[FF40 D606]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA37</td></tr>
|
||||
<tr><td>FA38</td><td>[FFC1 FA38]</td><td>[FF40 D668]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA38</td></tr>
|
||||
<tr><td>FA39</td><td>[FFC1 FA39]</td><td>[FF40 D840]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA39</td></tr>
|
||||
<tr><td>FA3A</td><td>[FFC1 FA3A]</td><td>[FF40 D8A8]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA3A</td></tr>
|
||||
<tr><td>FA3B</td><td>[FFC1 FA3B]</td><td>[FF40 DC64]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA3B</td></tr>
|
||||
<tr><td>FA3C</td><td>[FFC1 FA3C]</td><td>[FF40 DC6E]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA3C</td></tr>
|
||||
<tr><td>FA3D</td><td>[FFC1 FA3D]</td><td>[FF40 E094]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA3D</td></tr>
|
||||
<tr><td>FA3E</td><td>[FFC1 FA3E]</td><td>[FF40 E168]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA3E</td></tr>
|
||||
<tr><td>FA3F</td><td>[FFC1 FA3F]</td><td>[FF40 E18E]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA3F</td></tr>
|
||||
<tr><td>FA40</td><td>[FFC1 FA40]</td><td>[FF40 E1F2]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA40</td></tr>
|
||||
<tr><td>FA41</td><td>[FFC1 FA41]</td><td>[FF40 E54F]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA41</td></tr>
|
||||
<tr><td>FA42</td><td>[FFC1 FA42]</td><td>[FF40 E5E2]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA42</td></tr>
|
||||
<tr><td>FA43</td><td>[FFC1 FA43]</td><td>[FF40 E691]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA43</td></tr>
|
||||
<tr><td>FA44</td><td>[FFC1 FA44]</td><td>[FF40 E885]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA44</td></tr>
|
||||
<tr><td>FA45</td><td>[FFC1 FA45]</td><td>[FF40 ED77]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA45</td></tr>
|
||||
<tr><td>FA46</td><td>[FFC1 FA46]</td><td>[FF40 EE1A]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA46</td></tr>
|
||||
<tr><td>FA47</td><td>[FFC1 FA47]</td><td>[FF40 EF22]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA47</td></tr>
|
||||
<tr><td>FA48</td><td>[FFC1 FA48]</td><td>[FF40 F16E]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA48</td></tr>
|
||||
<tr><td>FA49</td><td>[FFC1 FA49]</td><td>[FF40 F22B]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA49</td></tr>
|
||||
<tr><td>FA4A</td><td>[FFC1 FA4A]</td><td>[FF40 F422]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA4A</td></tr>
|
||||
<tr><td>FA4B</td><td>[FFC1 FA4B]</td><td>[FF40 F891]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA4B</td></tr>
|
||||
<tr><td>FA4C</td><td>[FFC1 FA4C]</td><td>[FF40 F93E]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA4C</td></tr>
|
||||
<tr><td>FA4D</td><td>[FFC1 FA4D]</td><td>[FF40 F949]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA4D</td></tr>
|
||||
<tr><td>FA4E</td><td>[FFC1 FA4E]</td><td>[FF40 F948]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA4E</td></tr>
|
||||
<tr><td>FA4F</td><td>[FFC1 FA4F]</td><td>[FF40 F950]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA4F</td></tr>
|
||||
<tr><td>FA50</td><td>[FFC1 FA50]</td><td>[FF40 F956]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA50</td></tr>
|
||||
<tr><td>FA51</td><td>[FFC1 FA51]</td><td>[FF40 F95D]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA51</td></tr>
|
||||
<tr><td>FA52</td><td>[FFC1 FA52]</td><td>[FF40 F98D]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA52</td></tr>
|
||||
<tr><td>FA53</td><td>[FFC1 FA53]</td><td>[FF40 F98E]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA53</td></tr>
|
||||
<tr><td>FA54</td><td>[FFC1 FA54]</td><td>[FF40 FA40]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA54</td></tr>
|
||||
<tr><td>FA55</td><td>[FFC1 FA55]</td><td>[FF40 FA81]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA55</td></tr>
|
||||
<tr><td>FA56</td><td>[FFC1 FA56]</td><td>[FF40 FBC0]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA56</td></tr>
|
||||
<tr><td>FA57</td><td>[FFC1 FA57]</td><td>[FF40 FDF4]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA57</td></tr>
|
||||
<tr><td>FA58</td><td>[FFC1 FA58]</td><td>[FF40 FE09]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA58</td></tr>
|
||||
<tr><td>FA59</td><td>[FFC1 FA59]</td><td>[FF40 FE41]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA59</td></tr>
|
||||
<tr><td>FA5A</td><td>[FFC1 FA5A]</td><td>[FF40 FF72]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA5A</td></tr>
|
||||
<tr><td>FA5B</td><td>[FFC1 FA5B]</td><td>[FF41 8005]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA5B</td></tr>
|
||||
<tr><td>FA5C</td><td>[FFC1 FA5C]</td><td>[FF41 81ED]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA5C</td></tr>
|
||||
<tr><td>FA5D</td><td>[FFC1 FA5D]</td><td>[FF41 8279]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA5D</td></tr>
|
||||
<tr><td>FA5E</td><td>[FFC1 FA5E]</td><td>[FF41 8279]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA5E</td></tr>
|
||||
<tr><td>FA5F</td><td>[FFC1 FA5F]</td><td>[FF41 8457]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA5F</td></tr>
|
||||
<tr><td>FA60</td><td>[FFC1 FA60]</td><td>[FF41 8910]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA60</td></tr>
|
||||
<tr><td>FA61</td><td>[FFC1 FA61]</td><td>[FF41 8996]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA61</td></tr>
|
||||
<tr><td>FA62</td><td>[FFC1 FA62]</td><td>[FF41 8B01]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA62</td></tr>
|
||||
<tr><td>FA63</td><td>[FFC1 FA63]</td><td>[FF41 8B39]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA63</td></tr>
|
||||
<tr><td>FA64</td><td>[FFC1 FA64]</td><td>[FF41 8CD3]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA64</td></tr>
|
||||
<tr><td>FA65</td><td>[FFC1 FA65]</td><td>[FF41 8D08]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA65</td></tr>
|
||||
<tr><td>FA66</td><td>[FFC1 FA66]</td><td>[FF41 8FB6]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA66</td></tr>
|
||||
<tr><td>FA67</td><td>[FFC1 FA67]</td><td>[FF41 9038]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA67</td></tr>
|
||||
<tr><td>FA68</td><td>[FFC1 FA68]</td><td>[FF41 96E3]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA68</td></tr>
|
||||
<tr><td>FA69</td><td>[FFC1 FA69]</td><td>[FF41 97FF]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA69</td></tr>
|
||||
<tr><td>FA6A</td><td>[FFC1 FA6A]</td><td>[FF41 983B]</td><td>CJK COMPATIBILITY IDEOGRAPH-FA6A</td></tr>
|
||||
<tr><td>FC5E</td><td>[]</td><td>[0209]</td><td>ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM</td></tr>
|
||||
<tr><td>FC5F</td><td>[]</td><td>[0209]</td><td>ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM</td></tr>
|
||||
<tr><td>FC60</td><td>[]</td><td>[0209]</td><td>ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM</td></tr>
|
||||
<tr><td>FC61</td><td>[]</td><td>[0209]</td><td>ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM</td></tr>
|
||||
<tr><td>FC62</td><td>[]</td><td>[0209]</td><td>ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM</td></tr>
|
||||
<tr><td>FC63</td><td>[]</td><td>[0209]</td><td>ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM</td></tr>
|
||||
<tr><td>FCF2</td><td>[]</td><td>[020B]</td><td>ARABIC LIGATURE SHADDA WITH FATHA MEDIAL FORM</td></tr>
|
||||
<tr><td>FCF3</td><td>[]</td><td>[020B]</td><td>ARABIC LIGATURE SHADDA WITH DAMMA MEDIAL FORM</td></tr>
|
||||
<tr><td>FCF4</td><td>[]</td><td>[020B]</td><td>ARABIC LIGATURE SHADDA WITH KASRA MEDIAL FORM</td></tr>
|
||||
<tr><td>FDFC</td><td>[FFC1 FDFC]</td><td>[0EF9 0F4A 0ED6 0F2D]</td><td>RIAL SIGN</td></tr>
|
||||
<tr><td>FE49</td><td>[0211]</td><td>[0209]</td><td>DASHED OVERLINE</td></tr>
|
||||
<tr><td>FE4A</td><td>[0211]</td><td>[0209]</td><td>CENTRELINE OVERLINE</td></tr>
|
||||
<tr><td>FE4B</td><td>[0211]</td><td>[0209]</td><td>WAVY OVERLINE</td></tr>
|
||||
<tr><td>FE4C</td><td>[0211]</td><td>[0209]</td><td>DOUBLE WAVY OVERLINE</td></tr>
|
||||
<tr><td>FE70</td><td>[]</td><td>[0209]</td><td>ARABIC FATHATAN ISOLATED FORM</td></tr>
|
||||
<tr><td>FE71</td><td>[]</td><td>[020B]</td><td>ARABIC TATWEEL WITH FATHATAN ABOVE</td></tr>
|
||||
<tr><td>FE72</td><td>[]</td><td>[0209]</td><td>ARABIC DAMMATAN ISOLATED FORM</td></tr>
|
||||
<tr><td>FE74</td><td>[]</td><td>[0209]</td><td>ARABIC KASRATAN ISOLATED FORM</td></tr>
|
||||
<tr><td>FE76</td><td>[]</td><td>[0209]</td><td>ARABIC FATHA ISOLATED FORM</td></tr>
|
||||
<tr><td>FE77</td><td>[]</td><td>[020B]</td><td>ARABIC FATHA MEDIAL FORM</td></tr>
|
||||
<tr><td>FE78</td><td>[]</td><td>[0209]</td><td>ARABIC DAMMA ISOLATED FORM</td></tr>
|
||||
<tr><td>FE79</td><td>[]</td><td>[020B]</td><td>ARABIC DAMMA MEDIAL FORM</td></tr>
|
||||
<tr><td>FE7A</td><td>[]</td><td>[0209]</td><td>ARABIC KASRA ISOLATED FORM</td></tr>
|
||||
<tr><td>FE7B</td><td>[]</td><td>[020B]</td><td>ARABIC KASRA MEDIAL FORM</td></tr>
|
||||
<tr><td>FE7C</td><td>[]</td><td>[0209]</td><td>ARABIC SHADDA ISOLATED FORM</td></tr>
|
||||
<tr><td>FE7D</td><td>[]</td><td>[020B]</td><td>ARABIC SHADDA MEDIAL FORM</td></tr>
|
||||
<tr><td>FE7E</td><td>[]</td><td>[0209]</td><td>ARABIC SUKUN ISOLATED FORM</td></tr>
|
||||
<tr><td>FE7F</td><td>[]</td><td>[020B]</td><td>ARABIC SUKUN MEDIAL FORM</td></tr>
|
||||
<tr><td>FF5F</td><td>[FFC1 FF5F]</td><td>[FFC0 A985]</td><td>FULLWIDTH LEFT WHITE PARENTHESIS</td></tr>
|
||||
<tr><td>FF60</td><td>[FFC1 FF60]</td><td>[FFC0 A986]</td><td>FULLWIDTH RIGHT WHITE PARENTHESIS</td></tr>
|
||||
<tr><td>FFE3</td><td>[0210]</td><td>[0209]</td><td>FULLWIDTH MACRON</td></tr>
|
||||
</table>
|
||||
<h2>4. Secondaries Incompatible with Decompositions</h2><table border='1'>
|
||||
<tr><th>Code</th><th>Sort Key</th><th>Decomposed Sort Key</th><th>Name</th></tr>
|
||||
<tr><td>00A8</td><td>[0214 | 0020]</td><td>[0209 | 0020 0047]</td><td>DIAERESIS</td></tr>
|
||||
<tr><td>00AF</td><td>[0210 | 0020]</td><td>[0209 | 0020 005A]</td><td>MACRON</td></tr>
|
||||
<tr><td>00B4</td><td>[020D | 0020]</td><td>[0209 | 0020 0032]</td><td>ACUTE ACCENT</td></tr>
|
||||
<tr><td>00B8</td><td>[0219 | 0020]</td><td>[0209 | 0020 0055]</td><td>CEDILLA</td></tr>
|
||||
<tr><td>017F</td><td>[0BA7 | 0020 0154]</td><td>[0BA7 | 0020]</td><td>LATIN SMALL LETTER LONG S</td></tr>
|
||||
<tr><td>02D8</td><td>[0212 | 0020]</td><td>[0209 | 0020 0037]</td><td>BREVE</td></tr>
|
||||
<tr><td>02D9</td><td>[0213 | 0020]</td><td>[0209 | 0020 0052]</td><td>DOT ABOVE</td></tr>
|
||||
<tr><td>02DA</td><td>[0215 | 0020]</td><td>[0209 | 0020 0043]</td><td>RING ABOVE</td></tr>
|
||||
<tr><td>02DB</td><td>[021A | 0020]</td><td>[0209 | 0020 0058]</td><td>OGONEK</td></tr>
|
||||
<tr><td>02DC</td><td>[020E | 0020]</td><td>[0209 | 0020 004E]</td><td>SMALL TILDE</td></tr>
|
||||
<tr><td>02DD</td><td>[0216 | 0020]</td><td>[0209 | 0020 004D]</td><td>DOUBLE ACUTE ACCENT</td></tr>
|
||||
<tr><td>037A</td><td>[0C9B | 0020]</td><td>[0209 | 0020 0096]</td><td>GREEK YPOGEGRAMMENI</td></tr>
|
||||
<tr><td>0384</td><td>[020D | 0020]</td><td>[0209 | 0020 0032]</td><td>GREEK TONOS</td></tr>
|
||||
<tr><td>0385</td><td>[0214 | 0020 0032]</td><td>[0209 | 0020 0047 0032]</td><td>GREEK DIALYTIKA TONOS</td></tr>
|
||||
<tr><td>1E9B</td><td>[0BA7 | 0020 0154 0052]</td><td>[0BA7 | 0020 0052]</td><td>LATIN SMALL LETTER LONG S WITH DOT ABOVE</td></tr>
|
||||
<tr><td>1FBD</td><td>[0217 | 0020]</td><td>[0209 | 0020 0022]</td><td>GREEK KORONIS</td></tr>
|
||||
<tr><td>1FBF</td><td>[0217 | 0020]</td><td>[0209 | 0020 0022]</td><td>GREEK PSILI</td></tr>
|
||||
<tr><td>1FC0</td><td>[021D | 0020]</td><td>[0209 | 0020 0045]</td><td>GREEK PERISPOMENI</td></tr>
|
||||
<tr><td>1FC1</td><td>[0214 | 0020 0045]</td><td>[0209 | 0020 0047 0045]</td><td>GREEK DIALYTIKA AND PERISPOMENI</td></tr>
|
||||
<tr><td>1FCD</td><td>[0217 | 0020 0035]</td><td>[0209 | 0020 0022 0035]</td><td>GREEK PSILI AND VARIA</td></tr>
|
||||
<tr><td>1FCE</td><td>[0217 | 0020 0032]</td><td>[0209 | 0020 0022 0032]</td><td>GREEK PSILI AND OXIA</td></tr>
|
||||
<tr><td>1FCF</td><td>[0217 | 0020 0045]</td><td>[0209 | 0020 0022 0045]</td><td>GREEK PSILI AND PERISPOMENI</td></tr>
|
||||
<tr><td>1FDD</td><td>[0218 | 0020 0035]</td><td>[0209 | 0020 002A 0035]</td><td>GREEK DASIA AND VARIA</td></tr>
|
||||
<tr><td>1FDE</td><td>[0218 | 0020 0032]</td><td>[0209 | 0020 002A 0032]</td><td>GREEK DASIA AND OXIA</td></tr>
|
||||
<tr><td>1FDF</td><td>[0218 | 0020 0045]</td><td>[0209 | 0020 002A 0045]</td><td>GREEK DASIA AND PERISPOMENI</td></tr>
|
||||
<tr><td>1FED</td><td>[0214 | 0020 0035]</td><td>[0209 | 0020 0047 0035]</td><td>GREEK DIALYTIKA AND VARIA</td></tr>
|
||||
<tr><td>1FEE</td><td>[0214 | 0020 0032]</td><td>[0209 | 0020 0047 0032]</td><td>GREEK DIALYTIKA AND OXIA</td></tr>
|
||||
<tr><td>1FFD</td><td>[020D | 0020]</td><td>[0209 | 0020 0032]</td><td>GREEK OXIA</td></tr>
|
||||
<tr><td>1FFE</td><td>[0218 | 0020]</td><td>[0209 | 0020 002A]</td><td>GREEK DASIA</td></tr>
|
||||
<tr><td>2017</td><td>[021C | 0020]</td><td>[0209 | 0020 008A]</td><td>DOUBLE LOW LINE</td></tr>
|
||||
<tr><td>203E</td><td>[0211 | 0020]</td><td>[0209 | 0020 005E]</td><td>OVERLINE</td></tr>
|
||||
<tr><td>2047</td><td>[FFC0 A047 | 0020 0020]</td><td>[024E 024E | 0020 0020]</td><td>DOUBLE QUESTION MARK</td></tr>
|
||||
<tr><td>2057</td><td>[FFC0 A057 | 0020 0020]</td><td>[02B6 02B6 02B6 02B6 | 0020 0020 0020 0020]</td><td>QUADRUPLE PRIME</td></tr>
|
||||
<tr><td>205F</td><td>[FFC0 A05F | 0020 0020]</td><td>[0209 | 0020]</td><td>MEDIUM MATHEMATICAL SPACE</td></tr>
|
||||
<tr><td>2071</td><td>[FFC0 A071 | 0020 0020]</td><td>[0AD3 | 0020]</td><td>SUPERSCRIPT LATIN SMALL LETTER I</td></tr>
|
||||
<tr><td>213D</td><td>[FFC0 A13D | 0020 0020]</td><td>[0C93 | 0020]</td><td>DOUBLE-STRUCK SMALL GAMMA</td></tr>
|
||||
<tr><td>213E</td><td>[FFC0 A13E | 0020 0020]</td><td>[0C93 | 0020]</td><td>DOUBLE-STRUCK CAPITAL GAMMA</td></tr>
|
||||
<tr><td>213F</td><td>[FFC0 A13F | 0020 0020]</td><td>[0CA3 | 0020]</td><td>DOUBLE-STRUCK CAPITAL PI</td></tr>
|
||||
<tr><td>2140</td><td>[FFC0 A140 | 0020 0020]</td><td>[039E | 0020]</td><td>DOUBLE-STRUCK N-ARY SUMMATION</td></tr>
|
||||
<tr><td>2145</td><td>[FFC0 A145 | 0020 0020]</td><td>[0A49 | 0020]</td><td>DOUBLE-STRUCK ITALIC CAPITAL D</td></tr>
|
||||
<tr><td>2146</td><td>[FFC0 A146 | 0020 0020]</td><td>[0A49 | 0020]</td><td>DOUBLE-STRUCK ITALIC SMALL D</td></tr>
|
||||
<tr><td>2147</td><td>[FFC0 A147 | 0020 0020]</td><td>[0A65 | 0020]</td><td>DOUBLE-STRUCK ITALIC SMALL E</td></tr>
|
||||
<tr><td>2148</td><td>[FFC0 A148 | 0020 0020]</td><td>[0AD3 | 0020]</td><td>DOUBLE-STRUCK ITALIC SMALL I</td></tr>
|
||||
<tr><td>2149</td><td>[FFC0 A149 | 0020 0020]</td><td>[0AE7 | 0020]</td><td>DOUBLE-STRUCK ITALIC SMALL J</td></tr>
|
||||
<tr><td>2A0C</td><td>[FFC0 AA0C | 0020 0020]</td><td>[03C2 03C2 03C2 03C2 | 0020 0020 0020 0020]</td><td>QUADRUPLE INTEGRAL OPERATOR</td></tr>
|
||||
<tr><td>2A74</td><td>[FFC0 AA74 | 0020 0020]</td><td>[0237 0237 03A4 | 0020 0020 0020]</td><td>DOUBLE COLON EQUAL</td></tr>
|
||||
<tr><td>2A75</td><td>[FFC0 AA75 | 0020 0020]</td><td>[03A4 03A4 | 0020 0020]</td><td>TWO CONSECUTIVE EQUALS SIGNS</td></tr>
|
||||
<tr><td>2A76</td><td>[FFC0 AA76 | 0020 0020]</td><td>[03A4 03A4 03A4 | 0020 0020 0020]</td><td>THREE CONSECUTIVE EQUALS SIGNS</td></tr>
|
||||
<tr><td>309B</td><td>[021E | 0020]</td><td>[0209 | 0020 013D]</td><td>KATAKANA-HIRAGANA VOICED SOUND MARK</td></tr>
|
||||
<tr><td>309C</td><td>[021F | 0020]</td><td>[0209 | 0020 013E]</td><td>KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK</td></tr>
|
||||
<tr><td>309F</td><td>[FFC0 B09F | 0020 0020]</td><td>[1946 1948 | 0020 0020]</td><td>HIRAGANA DIGRAPH YORI</td></tr>
|
||||
<tr><td>30FF</td><td>[FFC0 B0FF | 0020 0020]</td><td>[192A 1934 | 0020 0020]</td><td>KATAKANA DIGRAPH KOTO</td></tr>
|
||||
<tr><td>3251</td><td>[FFC0 B251 | 0020 0020]</td><td>[0A0D 0A0C | 0020 0020]</td><td>CIRCLED NUMBER TWENTY ONE</td></tr>
|
||||
<tr><td>3252</td><td>[FFC0 B252 | 0020 0020]</td><td>[0A0D 0A0D | 0020 0020]</td><td>CIRCLED NUMBER TWENTY TWO</td></tr>
|
||||
<tr><td>3253</td><td>[FFC0 B253 | 0020 0020]</td><td>[0A0D 0A0E | 0020 0020]</td><td>CIRCLED NUMBER TWENTY THREE</td></tr>
|
||||
<tr><td>3254</td><td>[FFC0 B254 | 0020 0020]</td><td>[0A0D 0A0F | 0020 0020]</td><td>CIRCLED NUMBER TWENTY FOUR</td></tr>
|
||||
<tr><td>3255</td><td>[FFC0 B255 | 0020 0020]</td><td>[0A0D 0A10 | 0020 0020]</td><td>CIRCLED NUMBER TWENTY FIVE</td></tr>
|
||||
<tr><td>3256</td><td>[FFC0 B256 | 0020 0020]</td><td>[0A0D 0A11 | 0020 0020]</td><td>CIRCLED NUMBER TWENTY SIX</td></tr>
|
||||
<tr><td>3257</td><td>[FFC0 B257 | 0020 0020]</td><td>[0A0D 0A12 | 0020 0020]</td><td>CIRCLED NUMBER TWENTY SEVEN</td></tr>
|
||||
<tr><td>3258</td><td>[FFC0 B258 | 0020 0020]</td><td>[0A0D 0A13 | 0020 0020]</td><td>CIRCLED NUMBER TWENTY EIGHT</td></tr>
|
||||
<tr><td>3259</td><td>[FFC0 B259 | 0020 0020]</td><td>[0A0D 0A14 | 0020 0020]</td><td>CIRCLED NUMBER TWENTY NINE</td></tr>
|
||||
<tr><td>325A</td><td>[FFC0 B25A | 0020 0020]</td><td>[0A0E 0A0B | 0020 0020]</td><td>CIRCLED NUMBER THIRTY</td></tr>
|
||||
<tr><td>325B</td><td>[FFC0 B25B | 0020 0020]</td><td>[0A0E 0A0C | 0020 0020]</td><td>CIRCLED NUMBER THIRTY ONE</td></tr>
|
||||
<tr><td>325C</td><td>[FFC0 B25C | 0020 0020]</td><td>[0A0E 0A0D | 0020 0020]</td><td>CIRCLED NUMBER THIRTY TWO</td></tr>
|
||||
<tr><td>325D</td><td>[FFC0 B25D | 0020 0020]</td><td>[0A0E 0A0E | 0020 0020]</td><td>CIRCLED NUMBER THIRTY THREE</td></tr>
|
||||
<tr><td>325E</td><td>[FFC0 B25E | 0020 0020]</td><td>[0A0E 0A0F | 0020 0020]</td><td>CIRCLED NUMBER THIRTY FOUR</td></tr>
|
||||
<tr><td>325F</td><td>[FFC0 B25F | 0020 0020]</td><td>[0A0E 0A10 | 0020 0020]</td><td>CIRCLED NUMBER THIRTY FIVE</td></tr>
|
||||
<tr><td>32B1</td><td>[FFC0 B2B1 | 0020 0020]</td><td>[0A0E 0A11 | 0020 0020]</td><td>CIRCLED NUMBER THIRTY SIX</td></tr>
|
||||
<tr><td>32B2</td><td>[FFC0 B2B2 | 0020 0020]</td><td>[0A0E 0A12 | 0020 0020]</td><td>CIRCLED NUMBER THIRTY SEVEN</td></tr>
|
||||
<tr><td>32B3</td><td>[FFC0 B2B3 | 0020 0020]</td><td>[0A0E 0A13 | 0020 0020]</td><td>CIRCLED NUMBER THIRTY EIGHT</td></tr>
|
||||
<tr><td>32B4</td><td>[FFC0 B2B4 | 0020 0020]</td><td>[0A0E 0A14 | 0020 0020]</td><td>CIRCLED NUMBER THIRTY NINE</td></tr>
|
||||
<tr><td>32B5</td><td>[FFC0 B2B5 | 0020 0020]</td><td>[0A0F 0A0B | 0020 0020]</td><td>CIRCLED NUMBER FORTY</td></tr>
|
||||
<tr><td>32B6</td><td>[FFC0 B2B6 | 0020 0020]</td><td>[0A0F 0A0C | 0020 0020]</td><td>CIRCLED NUMBER FORTY ONE</td></tr>
|
||||
<tr><td>32B7</td><td>[FFC0 B2B7 | 0020 0020]</td><td>[0A0F 0A0D | 0020 0020]</td><td>CIRCLED NUMBER FORTY TWO</td></tr>
|
||||
<tr><td>32B8</td><td>[FFC0 B2B8 | 0020 0020]</td><td>[0A0F 0A0E | 0020 0020]</td><td>CIRCLED NUMBER FORTY THREE</td></tr>
|
||||
<tr><td>32B9</td><td>[FFC0 B2B9 | 0020 0020]</td><td>[0A0F 0A0F | 0020 0020]</td><td>CIRCLED NUMBER FORTY FOUR</td></tr>
|
||||
<tr><td>32BA</td><td>[FFC0 B2BA | 0020 0020]</td><td>[0A0F 0A10 | 0020 0020]</td><td>CIRCLED NUMBER FORTY FIVE</td></tr>
|
||||
<tr><td>32BB</td><td>[FFC0 B2BB | 0020 0020]</td><td>[0A0F 0A11 | 0020 0020]</td><td>CIRCLED NUMBER FORTY SIX</td></tr>
|
||||
<tr><td>32BC</td><td>[FFC0 B2BC | 0020 0020]</td><td>[0A0F 0A12 | 0020 0020]</td><td>CIRCLED NUMBER FORTY SEVEN</td></tr>
|
||||
<tr><td>32BD</td><td>[FFC0 B2BD | 0020 0020]</td><td>[0A0F 0A13 | 0020 0020]</td><td>CIRCLED NUMBER FORTY EIGHT</td></tr>
|
||||
<tr><td>32BE</td><td>[FFC0 B2BE | 0020 0020]</td><td>[0A0F 0A14 | 0020 0020]</td><td>CIRCLED NUMBER FORTY NINE</td></tr>
|
||||
<tr><td>32BF</td><td>[FFC0 B2BF | 0020 0020]</td><td>[0A10 0A0B | 0020 0020]</td><td>CIRCLED NUMBER FIFTY</td></tr>
|
||||
<tr><td>FB05</td><td>[0BA7 0BBF | 0020 0154 0020]</td><td>[0BA7 0BBF | 0020 0020]</td><td>LATIN SMALL LIGATURE LONG S T</td></tr>
|
||||
<tr><td>FBA4</td><td>[0F3D | 00CC]</td><td>[0F3D | 0020 00CC]</td><td>ARABIC LETTER HEH WITH YEH ABOVE ISOLATED FORM</td></tr>
|
||||
<tr><td>FBA5</td><td>[0F3D | 00CC]</td><td>[0F3D | 0020 00CC]</td><td>ARABIC LETTER HEH WITH YEH ABOVE FINAL FORM</td></tr>
|
||||
<tr><td>FBB0</td><td>[0F4F | 00CC]</td><td>[0F4F | 0020 00CC]</td><td>ARABIC LETTER YEH BARREE WITH HAMZA ABOVE ISOLATED FORM</td></tr>
|
||||
<tr><td>FBB1</td><td>[0F4F | 00CC]</td><td>[0F4F | 0020 00CC]</td><td>ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM</td></tr>
|
||||
<tr><td>FC5E</td><td>[| 00C8]</td><td>[0209 | 0020 00BE 00C8]</td><td>ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM</td></tr>
|
||||
<tr><td>FC5F</td><td>[| 00C8]</td><td>[0209 | 0020 00C0 00C8]</td><td>ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM</td></tr>
|
||||
<tr><td>FC60</td><td>[| 00C8]</td><td>[0209 | 0020 00C2 00C8]</td><td>ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM</td></tr>
|
||||
<tr><td>FC61</td><td>[| 00C8]</td><td>[0209 | 0020 00C4 00C8]</td><td>ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM</td></tr>
|
||||
<tr><td>FC62</td><td>[| 00C8]</td><td>[0209 | 0020 00C6 00C8]</td><td>ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM</td></tr>
|
||||
<tr><td>FC63</td><td>[| 00C8 00CE]</td><td>[0209 | 0020 00C8 00CE]</td><td>ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM</td></tr>
|
||||
<tr><td>FCF2</td><td>[| 00C8]</td><td>[020B | 0020 00C2 00C8]</td><td>ARABIC LIGATURE SHADDA WITH FATHA MEDIAL FORM</td></tr>
|
||||
<tr><td>FCF3</td><td>[| 00C8]</td><td>[020B | 0020 00C4 00C8]</td><td>ARABIC LIGATURE SHADDA WITH DAMMA MEDIAL FORM</td></tr>
|
||||
<tr><td>FCF4</td><td>[| 00C8]</td><td>[020B | 0020 00C6 00C8]</td><td>ARABIC LIGATURE SHADDA WITH KASRA MEDIAL FORM</td></tr>
|
||||
<tr><td>FD3C</td><td>[0ED6 | 00BD]</td><td>[0ED6 | 0020 00BD]</td><td>ARABIC LIGATURE ALEF WITH FATHATAN FINAL FORM</td></tr>
|
||||
<tr><td>FD3D</td><td>[0ED6 | 00BD]</td><td>[0ED6 | 0020 00BD]</td><td>ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM</td></tr>
|
||||
<tr><td>FDFC</td><td>[FFC1 FDFC | 0020 0020]</td><td>[0EF9 0F4A 0ED6 0F2D | 0020 0020 0020 0020]</td><td>RIAL SIGN</td></tr>
|
||||
<tr><td>FE49</td><td>[0211 | 0020]</td><td>[0209 | 0020 005E]</td><td>DASHED OVERLINE</td></tr>
|
||||
<tr><td>FE4A</td><td>[0211 | 0020]</td><td>[0209 | 0020 005E]</td><td>CENTRELINE OVERLINE</td></tr>
|
||||
<tr><td>FE4B</td><td>[0211 | 0020]</td><td>[0209 | 0020 005E]</td><td>WAVY OVERLINE</td></tr>
|
||||
<tr><td>FE4C</td><td>[0211 | 0020]</td><td>[0209 | 0020 005E]</td><td>DOUBLE WAVY OVERLINE</td></tr>
|
||||
<tr><td>FE70</td><td>[| 00BD]</td><td>[0209 | 0020 00BD]</td><td>ARABIC FATHATAN ISOLATED FORM</td></tr>
|
||||
<tr><td>FE71</td><td>[| 00BD]</td><td>[020B | 0020 00BD]</td><td>ARABIC TATWEEL WITH FATHATAN ABOVE</td></tr>
|
||||
<tr><td>FE72</td><td>[| 00BE]</td><td>[0209 | 0020 00BE]</td><td>ARABIC DAMMATAN ISOLATED FORM</td></tr>
|
||||
<tr><td>FE74</td><td>[| 00C0]</td><td>[0209 | 0020 00C0]</td><td>ARABIC KASRATAN ISOLATED FORM</td></tr>
|
||||
<tr><td>FE76</td><td>[| 00C2]</td><td>[0209 | 0020 00C2]</td><td>ARABIC FATHA ISOLATED FORM</td></tr>
|
||||
<tr><td>FE77</td><td>[| 00C2]</td><td>[020B | 0020 00C2]</td><td>ARABIC FATHA MEDIAL FORM</td></tr>
|
||||
<tr><td>FE78</td><td>[| 00C4]</td><td>[0209 | 0020 00C4]</td><td>ARABIC DAMMA ISOLATED FORM</td></tr>
|
||||
<tr><td>FE79</td><td>[| 00C4]</td><td>[020B | 0020 00C4]</td><td>ARABIC DAMMA MEDIAL FORM</td></tr>
|
||||
<tr><td>FE7A</td><td>[| 00C6]</td><td>[0209 | 0020 00C6]</td><td>ARABIC KASRA ISOLATED FORM</td></tr>
|
||||
<tr><td>FE7B</td><td>[| 00C6]</td><td>[020B | 0020 00C6]</td><td>ARABIC KASRA MEDIAL FORM</td></tr>
|
||||
<tr><td>FE7C</td><td>[| 00C8]</td><td>[0209 | 0020 00C8]</td><td>ARABIC SHADDA ISOLATED FORM</td></tr>
|
||||
<tr><td>FE7D</td><td>[| 00C8]</td><td>[020B | 0020 00C8]</td><td>ARABIC SHADDA MEDIAL FORM</td></tr>
|
||||
<tr><td>FE7E</td><td>[| 00CA]</td><td>[0209 | 0020 00CA]</td><td>ARABIC SUKUN ISOLATED FORM</td></tr>
|
||||
<tr><td>FE7F</td><td>[| 00CA]</td><td>[020B | 0020 00CA]</td><td>ARABIC SUKUN MEDIAL FORM</td></tr>
|
||||
<tr><td>FF5F</td><td>[FFC1 FF5F | 0020 0020]</td><td>[FFC0 A985 | 0020 0020]</td><td>FULLWIDTH LEFT WHITE PARENTHESIS</td></tr>
|
||||
<tr><td>FF60</td><td>[FFC1 FF60 | 0020 0020]</td><td>[FFC0 A986 | 0020 0020]</td><td>FULLWIDTH RIGHT WHITE PARENTHESIS</td></tr>
|
||||
<tr><td>FFE3</td><td>[0210 | 0020]</td><td>[0209 | 0020 005A]</td><td>FULLWIDTH MACRON</td></tr>
|
||||
</table>
|
||||
</body></html>
|
@ -1,742 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/GenOverlap.java,v $
|
||||
* $Date: 2005/04/06 08:48:16 $
|
||||
* $Revision: 1.13 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCA;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
import com.ibm.text.UCD.*;
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
||||
public class GenOverlap implements UCD_Types, UCA_Types {
|
||||
|
||||
/** CEList -> String: filled by addString; a later string with an equal CEList replaces the earlier one. */
static Map completes = new TreeMap();

/** String -> CEList: the reverse direction of {@code completes}, filled by addString. */
static Map back = new HashMap();

/**
 * CEList -> Set of String: keyed by {@code currCEList.start(i)} for 1 <= i < length
 * (presumably the leading i elements -- confirm against CEList); each value holds
 * the strings whose CEList produced that key.
 */
static Map initials = new HashMap();

/** Shared scratch buffer handed to the collator; CELists are built from its first {@code len} slots. */
static int[] ces = new int[50];

/** Collator under examination; assigned by the public entry points. */
static UCA collator;

/** Character database obtained from {@code UCD.make()}. */
static UCD ucd;

/** NFD and NFKD normalizers built for the collator's UCD version. */
static Normalizer nfd, nfkd;
|
||||
|
||||
public static void validateUCA(UCA collatorIn) throws Exception {
|
||||
collator = collatorIn;
|
||||
ucd = UCD.make();
|
||||
|
||||
nfd = new Normalizer(Normalizer.NFD, collatorIn.getUCDVersion());
|
||||
nfkd = new Normalizer(Normalizer.NFKD, collatorIn.getUCDVersion());
|
||||
|
||||
for (int cp = 0x0; cp <= 0x10FFFF; ++cp) {
|
||||
Utility.dot(cp);
|
||||
if (!ucd.isRepresented(cp)) continue;
|
||||
byte decompType = ucd.getDecompositionType(cp);
|
||||
if (decompType >= UCD.COMPATIBILITY) {
|
||||
String decomp = nfkd.normalize(cp);
|
||||
CEList celistDecomp = getCEList(cp, decomp, true, decompType);
|
||||
CEList celistNormal = getCEList(UTF16.valueOf(cp), false);
|
||||
if (!celistNormal.equals(celistDecomp)) {
|
||||
Utility.fixDot();
|
||||
System.out.println();
|
||||
System.out.println(ucd.getCodeAndName(cp));
|
||||
System.out.println(celistNormal);
|
||||
System.out.println(celistDecomp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static void test(UCA collatorIn) throws Exception {
|
||||
collator = collatorIn;
|
||||
|
||||
CEList.main(null);
|
||||
|
||||
System.out.println("# Overlap");
|
||||
System.out.println("# Generated " + Default.getDate());
|
||||
|
||||
ucd = UCD.make();
|
||||
|
||||
nfd = new Normalizer(Normalizer.NFD, collatorIn.getUCDVersion());
|
||||
nfkd = new Normalizer(Normalizer.NFKD, collatorIn.getUCDVersion());
|
||||
|
||||
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
|
||||
|
||||
// store data for faster lookup
|
||||
|
||||
System.out.println("# Gathering Data");
|
||||
int counter = 0;
|
||||
|
||||
int[] lenArray = new int[1];
|
||||
|
||||
while (true) {
|
||||
|
||||
Utility.dot(counter++);
|
||||
String s = cc.next(ces, lenArray);
|
||||
if (s == null) break;
|
||||
int len = lenArray[0];
|
||||
|
||||
CEList currCEList = new CEList(ces, 0, len);
|
||||
addString(s, currCEList);
|
||||
}
|
||||
|
||||
/*
|
||||
for (int cp = 0x10000; cp <= 0x10FFFF; ++cp) {
|
||||
if (!ucd.isRepresented(cp)) continue;
|
||||
byte decompType = ucd.getDecompositionType(cp);
|
||||
if (decompType >= UCD.COMPATIBILITY) {
|
||||
String decomp = nfkd.normalize(cp);
|
||||
CEList celist = getCEList(cp, decomp, true, decompType);
|
||||
addString(decomp, celist);
|
||||
System.out.println("Adding: " + ucd.getCodeAndName(cp) + "\t" + celist);
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
Utility.fixDot();
|
||||
System.out.println("# Completes Count: " + completes.size());
|
||||
System.out.println("# Initials Count: " + initials.size());
|
||||
System.out.println("# Writing Overlaps");
|
||||
|
||||
// simpleList();
|
||||
fullCheck();
|
||||
}
|
||||
|
||||
public static void addString(String s, CEList currCEList) {
|
||||
back.put(s, currCEList);
|
||||
completes.put(currCEList, s);
|
||||
|
||||
for (int i = 1; i < currCEList.length(); ++i) {
|
||||
CEList start = currCEList.start(i);
|
||||
Set bag = (Set) initials.get(start);
|
||||
if (bag == null) {
|
||||
bag = new TreeSet();
|
||||
initials.put(start, bag);
|
||||
}
|
||||
bag.add(s);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void simpleList() {
|
||||
Iterator it = completes.keySet().iterator();
|
||||
int counter = 0;
|
||||
int foundCount = 0;
|
||||
|
||||
while (it.hasNext()) {
|
||||
Utility.dot(counter++);
|
||||
|
||||
// see if the ces for the current element are the start of something else
|
||||
CEList key = (CEList) it.next();
|
||||
String val = (String) completes.get(key);
|
||||
Set probe = (Set) initials.get(key);
|
||||
|
||||
if (probe != null) {
|
||||
Utility.fixDot();
|
||||
foundCount++;
|
||||
System.out.println("Possible Overlap: ");
|
||||
System.out.println(" " + ucd.getCodeAndName(val));
|
||||
System.out.println("\t" + key);
|
||||
|
||||
Iterator it2 = probe.iterator();
|
||||
int count2 = 0;
|
||||
while (it2.hasNext()) {
|
||||
String match = (String) it2.next();
|
||||
CEList ceList = (CEList) back.get(match);
|
||||
System.out.println((count2++) + ". " + ucd.getCodeAndName(match));
|
||||
System.out.println("\t" + ceList);
|
||||
}
|
||||
}
|
||||
}
|
||||
System.out.println("# Found Count: " + foundCount);
|
||||
}
|
||||
|
||||
// When true, matchWhole prints a depth-indented trace of each step to System.out.
static boolean PROGRESS = false;
|
||||
|
||||
static void fullCheck() throws IOException {
|
||||
PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "Overlap.html", Utility.UTF8_WINDOWS);
|
||||
PrintWriter simpleList = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "Overlap.txt", Utility.UTF8_WINDOWS);
|
||||
|
||||
Iterator it = completes.keySet().iterator();
|
||||
int counter = 0;
|
||||
int foundCount = 0;
|
||||
|
||||
String [] goalChars = new String[1];
|
||||
String [] matchChars = new String[1];
|
||||
|
||||
// CEList show = getCEList("\u2034");
|
||||
Utility.writeHtmlHeader(log, "Overlaps");
|
||||
log.print("<table>");
|
||||
|
||||
while (it.hasNext()) {
|
||||
Utility.dot(counter++);
|
||||
CEList key = (CEList) it.next();
|
||||
if (key.length() < 2) continue;
|
||||
|
||||
String val = (String) completes.get(key);
|
||||
goalChars[0] = "";
|
||||
matchChars[0] = "";
|
||||
if (matchWhole(val, key, 0, goalChars, matchChars)) {
|
||||
|
||||
simpleList.println(ucd.getCodeAndName(val));
|
||||
|
||||
goalChars[0] = val + goalChars[0]; // fix first char
|
||||
|
||||
if (!getCEList(goalChars[0]).equals(getCEList(matchChars[0]))) {
|
||||
log.println("<tr><td colspan='6'>WARNING:" + getCEList(matchChars[0]) + "</td></tr>");
|
||||
}
|
||||
foundCount++;
|
||||
log.println("<tr><td>" + val + "</td>");
|
||||
log.println("<td>" + goalChars[0] + "</td>");
|
||||
log.println("<td>" + matchChars[0] + "</td>");
|
||||
log.println("<td>" + ucd.getCodeAndName(goalChars[0]) + "</td>");
|
||||
log.println("<td>" + ucd.getCodeAndName(matchChars[0]) + "</td>");
|
||||
log.println("<td>" + getCEList(goalChars[0]) + "</td></tr>");
|
||||
//log.println("\t" + );
|
||||
}
|
||||
}
|
||||
log.println("</tr></table>Number of Overlapping characters: " + foundCount + "</body>");
|
||||
log.close();
|
||||
simpleList.close();
|
||||
}
|
||||
|
||||
static private CEList getCEList(String s) {
|
||||
return getCEList(s, true);
|
||||
}
|
||||
|
||||
static private CEList getCEList(String s, boolean decomp) {
|
||||
int len = collator.getCEs(s, decomp, ces);
|
||||
return new CEList(ces, 0, len);
|
||||
}
|
||||
|
||||
static private CEList getCEList(int originalChar, String s, boolean decomp, byte type) {
|
||||
int len = collator.getCEs(s, decomp, ces);
|
||||
if (decomp) {
|
||||
for (int i = 0; i < len; ++i) {
|
||||
ces[i] = UCA.makeKey(UCA.getPrimary(ces[i]),
|
||||
UCA.getSecondary(ces[i]),
|
||||
CEList.remap(originalChar, type, UCA.getTertiary(ces[i])));
|
||||
}
|
||||
}
|
||||
return new CEList(ces, 0, len);
|
||||
}
|
||||
|
||||
static boolean matchWhole(String goalStr, CEList goal, int depth, String[] goalChars, String[] otherChars) {
|
||||
|
||||
if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "Trying: " + ucd.getCodeAndName(goalStr) + ", " + goal);
|
||||
|
||||
// to stop infinite loops, we limit the depth to 5
|
||||
if (depth > 5) {
|
||||
if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "stack exhausted");
|
||||
return false;
|
||||
}
|
||||
|
||||
String match;
|
||||
|
||||
// There are 3 possible conditions. Any of which work.
|
||||
|
||||
// To eliminate double matches at the top level, we test depth > 0
|
||||
|
||||
if (depth > 0) {
|
||||
|
||||
// Condition 1.
|
||||
// we have an exact match
|
||||
|
||||
match = (String) completes.get(goal);
|
||||
if (match != null) {
|
||||
if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "Matches Exactly: " + ucd.getCodeAndName(match));
|
||||
otherChars[0] = match + otherChars[0];
|
||||
if (PROGRESS) System.out.println(Utility.repeat(". ", depth)
|
||||
+ ucd.getCode(goalChars[0])
|
||||
+ " / " + ucd.getCode(otherChars[0])
|
||||
);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Condition 2
|
||||
// this whole string matches some initial portion of another string
|
||||
// AND the remainder of that other string also does a matchWhole.
|
||||
// Example: if we get the following, we search for a match to "de"
|
||||
// abc...
|
||||
// abcde
|
||||
// If we find a match, we append to the strings, the string for abc
|
||||
// and the one for abcde
|
||||
|
||||
Set probe = (Set) initials.get(goal);
|
||||
if (probe != null) {
|
||||
Iterator it2 = probe.iterator();
|
||||
while (it2.hasNext()) {
|
||||
match = (String) it2.next();
|
||||
if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "Matches Longer: " + ucd.getCodeAndName(match)
|
||||
+ "\t\tswitching");
|
||||
CEList trail = ((CEList) back.get(match)).end(goal.length());
|
||||
boolean doesMatch = matchWhole(match, trail, depth+1, otherChars, goalChars);
|
||||
if (doesMatch) {
|
||||
otherChars[0] = match + otherChars[0];
|
||||
if (PROGRESS) System.out.println(Utility.repeat(". ", depth)
|
||||
+ ucd.getCode(goalChars[0])
|
||||
+ " / " + ucd.getCode(otherChars[0])
|
||||
);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Condition 3
|
||||
// the first part of this string matches a whole other string
|
||||
// and the remainder of this string also does a matchWhole
|
||||
// Example: if we get the following, we search for a match to "de"
|
||||
// abcde..
|
||||
// abc..
|
||||
// if we find a match
|
||||
|
||||
for (int i = goal.length() - 1; i > 0; --i) {
|
||||
CEList first = goal.start(i);
|
||||
match = (String) completes.get(first);
|
||||
if (match != null) {
|
||||
if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "Matches Shorter: " + ucd.getCodeAndName(match));
|
||||
boolean doesMatch = matchWhole("", goal.end(i), depth+1, goalChars, otherChars);
|
||||
if (doesMatch) {
|
||||
otherChars[0] = match + otherChars[0];
|
||||
if (PROGRESS) System.out.println(Utility.repeat(". ", depth)
|
||||
+ ucd.getCode(goalChars[0])
|
||||
+ " / " + ucd.getCode(otherChars[0])
|
||||
);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// if we get this far, we failed.
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public static void generateRevision (UCA collatorIn) throws Exception {
|
||||
//generateRevision(collatorIn, false);
|
||||
generateRevision(collatorIn, true);
|
||||
}
|
||||
|
||||
public static void generateRevision (UCA collatorIn, boolean doMax) throws Exception {
|
||||
collator = collatorIn;
|
||||
|
||||
CEList.main(null);
|
||||
|
||||
System.out.println("# Generate");
|
||||
System.out.println("# Generated " + Default.getDate());
|
||||
|
||||
ucd = UCD.make();
|
||||
|
||||
nfd = new Normalizer(Normalizer.NFD, collatorIn.getUCDVersion());
|
||||
nfkd = new Normalizer(Normalizer.NFKD, collatorIn.getUCDVersion());
|
||||
|
||||
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
|
||||
|
||||
// store data for faster lookup
|
||||
|
||||
System.out.println("# Gathering Data");
|
||||
int counter = 0;
|
||||
|
||||
int[] lenArray = new int[1];
|
||||
|
||||
Set list = new TreeSet();
|
||||
Map newCollisions = new HashMap();
|
||||
Map oldCollisions = new HashMap();
|
||||
Map newProblems = new TreeMap();
|
||||
Map oldProblems = new TreeMap();
|
||||
|
||||
CEList nullCEList = new CEList(new int[1]);
|
||||
|
||||
while (true) {
|
||||
Utility.dot(counter++);
|
||||
String str = cc.next(ces, lenArray);
|
||||
if (str == null) break;
|
||||
int len = lenArray[0];
|
||||
|
||||
CEList oldList = new CEList(ces, 0, len);
|
||||
|
||||
CEList newList = new CEList(ces,0,0);
|
||||
int cp;
|
||||
for (int i = 0; i < str.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(str, i);
|
||||
if (0xFF3F == cp) {
|
||||
System.out.println("debug");
|
||||
}
|
||||
boolean mashLast = false;
|
||||
if (!nfkd.isNormalized(cp)) {
|
||||
String decomp = nfkd.normalize(cp);
|
||||
String canon = nfd.normalize(cp);
|
||||
len = collator.getCEs(decomp, true, ces);
|
||||
if (!decomp.equals(canon)) {
|
||||
byte type = ucd.getDecompositionType(cp);
|
||||
for (int j = 0; j < len; ++j) {
|
||||
int p = (i == 0 && decomp.length() > 1 && decomp.charAt(0) == ' ' ? 0x20A : UCA.getPrimary(ces[j]));
|
||||
int s = UCA.getSecondary(ces[j]);
|
||||
boolean needsFix = (s != 0x20 && p != 0);
|
||||
if (needsFix) ++len;
|
||||
int t = (doMax && j > 0 ? 0x1F : CEList.remap(cp, type, UCA.getTertiary(ces[j])));
|
||||
if (needsFix) {
|
||||
ces[j++] = UCA.makeKey(p, 0x20, t); // Set Extra
|
||||
System.arraycopy(ces, j, ces, j+1, len - j); // Insert HOLE!
|
||||
p = 0;
|
||||
}
|
||||
ces[j] = UCA.makeKey(p, s, t);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
len = collator.getCEs(UTF16.valueOf(cp), true, ces);
|
||||
}
|
||||
CEList inc = new CEList(ces, 0, len);
|
||||
|
||||
if (cp == 0xFF71 || cp == 0xFF67) {
|
||||
System.out.println(" String: " + ucd.getCodeAndName(cp));
|
||||
System.out.println(" Type: " + ucd.getDecompositionTypeID(cp));
|
||||
System.out.println(" xxx: " + inc);
|
||||
}
|
||||
|
||||
newList = newList.append(inc);
|
||||
|
||||
}
|
||||
if (newList.length() == 0) newList = nullCEList;
|
||||
if (oldList.length() == 0) oldList = nullCEList;
|
||||
|
||||
if (!newList.equals(oldList)) {
|
||||
/*
|
||||
System.out.println("String: " + ucd.getCodeAndName(str));
|
||||
System.out.println("\tOld: " + oldList);
|
||||
System.out.println("\tNew: " + newList);
|
||||
*/
|
||||
list.add(new Pair(newList, new Pair(str, oldList)));
|
||||
}
|
||||
|
||||
// check for collisions
|
||||
if (str.equals("\u206F")) {
|
||||
System.out.println("debug");
|
||||
}
|
||||
Object probe = newCollisions.get(newList);
|
||||
if (probe == null) {
|
||||
newCollisions.put(newList, str);
|
||||
} else {
|
||||
newProblems.put(str, new Pair((String)probe, newList));
|
||||
}
|
||||
|
||||
probe = oldCollisions.get(oldList);
|
||||
if (probe == null) {
|
||||
oldCollisions.put(oldList, str);
|
||||
} else {
|
||||
oldProblems.put(str, new Pair((String)probe, oldList));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Set newKeys = new TreeSet(newProblems.keySet());
|
||||
Set oldKeys = new TreeSet(oldProblems.keySet());
|
||||
Set joint = new TreeSet(newKeys);
|
||||
joint.retainAll(oldKeys);
|
||||
newKeys.removeAll(joint);
|
||||
oldKeys.removeAll(joint);
|
||||
|
||||
PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt"), Utility.UTF8_WINDOWS);
|
||||
Iterator it = list.iterator();
|
||||
int last = -1;
|
||||
while (it.hasNext()) {
|
||||
Utility.dot(counter++);
|
||||
Pair value = (Pair) it.next();
|
||||
CEList newList = (CEList)value.first;
|
||||
int cur = UCA.getPrimary(newList.at(0));
|
||||
if (cur != last) {
|
||||
log.println();
|
||||
last = cur;
|
||||
}
|
||||
Pair v2 = (Pair) value.second;
|
||||
String ss = (String)v2.first;
|
||||
log.println(ucd.getCodeAndName(ss) + "\t\t" + ucd.getDecompositionTypeID(ss.charAt(0)));
|
||||
log.println("\tnew:\t" + value.first);
|
||||
log.println("\told:\t" + v2.second);
|
||||
}
|
||||
|
||||
/*
|
||||
log.println();
|
||||
log.println("New Collisions: " + newKeys.size());
|
||||
it = newKeys.iterator();
|
||||
while (it.hasNext()) {
|
||||
String key = (String) it.next();
|
||||
CEList cel = (CEList) newProblems.get(key);
|
||||
String other = (String) newCollisions.get(cel);
|
||||
log.println(ucd.getCodeAndName(key) + " collides with " + ucd.getCodeAndName(other));
|
||||
log.println("\t" + cel);
|
||||
}
|
||||
|
||||
log.println("Removed Collisions: " + oldKeys.size());
|
||||
it = oldKeys.iterator();
|
||||
while (it.hasNext()) {
|
||||
String key = (String) it.next();
|
||||
CEList cel = (CEList) oldProblems.get(key);
|
||||
String other = (String) oldCollisions.get(cel);
|
||||
log.println(ucd.getCodeAndName(key) + " collides with " + ucd.getCodeAndName(other));
|
||||
log.println("\t" + cel);
|
||||
}
|
||||
*/
|
||||
|
||||
showCollisions(log, "New Collisions:", newKeys, newProblems);
|
||||
showCollisions(log, "Old Collisions:", oldKeys, oldProblems);
|
||||
showCollisions(log, "In Both:", joint, oldProblems);
|
||||
log.close();
|
||||
}
|
||||
|
||||
static void showCollisions(PrintWriter log, String title, Set bad, Map probs) {
|
||||
log.println();
|
||||
log.println(title + bad.size());
|
||||
Iterator it = bad.iterator();
|
||||
Set lister = new TreeSet();
|
||||
|
||||
while (it.hasNext()) {
|
||||
String key = (String) it.next();
|
||||
Pair pair = (Pair) probs.get(key);
|
||||
String other = (String) pair.first;
|
||||
CEList cel = (CEList) pair.second;
|
||||
if (key.equals("\u0001")) {
|
||||
System.out.println("debug");
|
||||
}
|
||||
lister.add(new Pair(cel, ucd.getCodeAndName(key) + ",\t" + ucd.getCodeAndName(other)));
|
||||
}
|
||||
|
||||
it = lister.iterator();
|
||||
int last = -1;
|
||||
while (it.hasNext()) {
|
||||
Pair pair = (Pair) it.next();
|
||||
CEList cel = (CEList) pair.first;
|
||||
int curr = UCA.getPrimary(cel.at(0));
|
||||
if (curr != last) {
|
||||
last = curr;
|
||||
log.println();
|
||||
}
|
||||
log.println("Collision between: " + pair.second);
|
||||
log.println("\t" + pair.first);
|
||||
}
|
||||
log.flush();
|
||||
}
|
||||
|
||||
public static void checkHash(UCA collatorIn) throws Exception {
|
||||
collator = collatorIn;
|
||||
|
||||
System.out.println("# Check Hash");
|
||||
System.out.println("# Generated " + Default.getDate());
|
||||
|
||||
ucd = UCD.make();
|
||||
|
||||
//nfd = new Normalizer(Normalizer.NFD);
|
||||
//nfkd = new Normalizer(Normalizer.NFKD);
|
||||
|
||||
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
|
||||
nfd = new Normalizer(Normalizer.NFD, collatorIn.getUCDVersion());
|
||||
nfkd = new Normalizer(Normalizer.NFKD, collatorIn.getUCDVersion());
|
||||
|
||||
|
||||
int tableLength = 257;
|
||||
/*
|
||||
257 263 269 271 277 281 283 293 307 311 313 317
|
||||
331 337 347 349 353 359 367 373 379 383 389 397
|
||||
401 409 419 421 431 433 439 443 449 457 461 463
|
||||
467 479 487 491 499 503 509 521 523 541 547 557
|
||||
563 569 571 577 587 593 599 601 607 613 617 619
|
||||
631 641 643 647 653 659 661 673 677 683 691 701
|
||||
709 719 727 733 739 743 751 757 761 769 773 787
|
||||
797 809 811 821 823 827 829 839 853 857 859 863
|
||||
877 881 883 887 907 911 919 929 937 941 947 953
|
||||
967 971 977 983 991 997
|
||||
|
||||
*/
|
||||
int [][] collisions = new int[LIMIT_SCRIPT][];
|
||||
BitSet[] repeats = new BitSet[LIMIT_SCRIPT];
|
||||
for (int i = 0; i < collisions.length; ++i) {
|
||||
collisions[i] = new int[tableLength];
|
||||
repeats[i] = new BitSet();
|
||||
}
|
||||
|
||||
int counter = 0;
|
||||
|
||||
int[] lenArray = new int[1];
|
||||
|
||||
if (false) while (true) {
|
||||
|
||||
Utility.dot(counter++);
|
||||
String s = cc.next(ces, lenArray);
|
||||
if (s == null) break;
|
||||
|
||||
if (UTF16.countCodePoint(s) != 1) continue; // skip ligatures
|
||||
int cp = UTF16.charAt(s, 0);
|
||||
if (!nfkd.isNormalized(cp)) continue;
|
||||
|
||||
int script = ucd.getScript(cp);
|
||||
int len = lenArray[0];
|
||||
for (int i = 0; i < len; ++i) {
|
||||
int prim = UCA.getPrimary(ces[i]);
|
||||
int hash = prim % tableLength;
|
||||
if (!repeats[script].get(prim)) {
|
||||
++collisions[script][hash];
|
||||
repeats[script].set(prim);
|
||||
} else {
|
||||
System.out.println("Skipping: " + prim + " in " + ucd.getCodeAndName(cp));
|
||||
}
|
||||
if (!repeats[UNUSED_SCRIPT].get(prim)) {
|
||||
++collisions[UNUSED_SCRIPT][hash];
|
||||
repeats[UNUSED_SCRIPT].set(prim);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
String [] latin = new String[tableLength];
|
||||
for (int i = 0; i < latin.length; ++i) {
|
||||
latin[i] = "";
|
||||
}
|
||||
|
||||
for (int cp = 0; cp < 0x10FFFF; ++cp) {
|
||||
|
||||
Utility.dot(counter++);
|
||||
if (!ucd.isAllocated(cp)) continue;
|
||||
if (!nfkd.isNormalized(cp)) continue;
|
||||
if (ucd.getCategory(cp) == Lu) continue; // don't count case
|
||||
|
||||
String scp = UTF16.valueOf(cp);
|
||||
int len = collator.getCEs(scp, true, ces);
|
||||
int script = ucd.getScript(cp);
|
||||
|
||||
for (int i = 0; i < len; ++i) {
|
||||
int prim = UCA.getPrimary(ces[i]);
|
||||
int hash = prim % tableLength;
|
||||
if (!repeats[script].get(prim)) {
|
||||
++collisions[script][hash];
|
||||
repeats[script].set(prim);
|
||||
if (script == LATIN_SCRIPT) latin[hash] += scp;
|
||||
}
|
||||
if (!repeats[UNUSED_SCRIPT].get(prim)) {
|
||||
++collisions[UNUSED_SCRIPT][hash];
|
||||
repeats[UNUSED_SCRIPT].set(prim);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println("Data Gathered");
|
||||
|
||||
PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "checkstringsearchhash.html", Utility.UTF8_WINDOWS);
|
||||
Utility.writeHtmlHeader(log, "Check Hash");
|
||||
log.println("<h1>Collisions</h1>");
|
||||
log.println("<p>Shows collisions among primary values when hashed to table size = " + tableLength + ".");
|
||||
log.println("Note: All duplicate primarys are removed: all non-colliding values are removed.</p>");
|
||||
log.println("<table><tr><th>Script</th><th>Sum</th><th>Average</th><th>Std Dev.</th></tr>");
|
||||
|
||||
for (byte i = 0; i < collisions.length; ++i) {
|
||||
if (i == UNUSED_SCRIPT) continue;
|
||||
showCollisions(log, ucd.getScriptID_fromIndex(i), collisions[i]);
|
||||
}
|
||||
showCollisions(log, "All", collisions[UNUSED_SCRIPT]);
|
||||
log.println("</table>");
|
||||
|
||||
log.println("<p>Details of collisions for Latin</p>");
|
||||
|
||||
for (int i = 0; i < latin.length; ++i) {
|
||||
if (latin[i].length() < 2) continue;
|
||||
//if (UTF16.countCodePoint(latin[i]) < 2) continue;
|
||||
int cp2;
|
||||
log.println("<table>");
|
||||
for (int j = 0; j < latin[i].length(); j += UTF16.getCharCount(cp2)) {
|
||||
cp2 = UTF16.charAt(latin[i], j);
|
||||
String scp2 = UTF16.valueOf(cp2);
|
||||
CEList clist = collator.getCEList(scp2, true);
|
||||
log.println("<tr><td>" + scp2 + "</td><td>" + clist + "</td><td>" + ucd.getCodeAndName(cp2) + "</td></tr>");
|
||||
}
|
||||
log.println("</table><br>");
|
||||
}
|
||||
|
||||
log.close();
|
||||
}
|
||||
|
||||
static java.text.NumberFormat nf = new java.text.DecimalFormat("#,##0.00");
|
||||
static java.text.NumberFormat nf0 = new java.text.DecimalFormat("#,##0");
|
||||
|
||||
static void showCollisions(PrintWriter log, String title, int[] curr) {
|
||||
|
||||
double sum = 0;
|
||||
int count = 0;
|
||||
for (int j = 0; j < curr.length; ++j) {
|
||||
if (curr[j] == 0) continue;
|
||||
sum += curr[j];
|
||||
++count;
|
||||
}
|
||||
double average = sum / count;
|
||||
|
||||
double sd = 0;
|
||||
for (int j = 0; j < curr.length; ++j) {
|
||||
if (curr[j] == 0) continue;
|
||||
double deviation = curr[j] - average;
|
||||
sd += deviation * deviation;
|
||||
}
|
||||
sd = Math.sqrt(sd / count);
|
||||
|
||||
log.println("<tr><td>" + title
|
||||
+ "</td><td align='right'>" + nf0.format(sum)
|
||||
+ "</td><td align='right'>" + nf.format(average)
|
||||
+ "</td><td align='right'>" + nf.format(sd)
|
||||
+ "</td></tr>");
|
||||
}
|
||||
|
||||
public static void listCyrillic(UCA collatorIn) throws IOException {
|
||||
PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "ListCyrillic.txt", Utility.UTF8_WINDOWS);
|
||||
Set set = new TreeSet(collatorIn);
|
||||
Set set2 = new TreeSet(collatorIn);
|
||||
ucd = UCD.make();
|
||||
|
||||
nfd = new Normalizer(Normalizer.NFD, collatorIn.getUCDVersion());
|
||||
|
||||
for (char i = 0; i < 0xFFFF; ++i) {
|
||||
Utility.dot(i);
|
||||
if (!ucd.isRepresented(i)) continue;
|
||||
if (ucd.getScript(i) != CYRILLIC_SCRIPT) continue;
|
||||
|
||||
String decomp = nfd.normalize(String.valueOf(i));
|
||||
String oldDecomp = decomp;
|
||||
for (int j = 0; j < decomp.length(); ++j) {
|
||||
if (ucd.getCategory(decomp.charAt(j)) == Mn) {
|
||||
decomp = decomp.substring(0,j) + decomp.substring(j+1);
|
||||
}
|
||||
}
|
||||
if (decomp.length() == 0) continue;
|
||||
|
||||
set.add(decomp);
|
||||
if (!decomp.equals(oldDecomp)) set2.add(oldDecomp);
|
||||
}
|
||||
|
||||
Iterator it = set.iterator();
|
||||
while (it.hasNext()) {
|
||||
String s = (String) it.next();
|
||||
String name = ucd.getName(s.charAt(0));
|
||||
Utility.replace(name, "CYRILLIC ", "");
|
||||
log.println("# " + s + " <> XXX ; # " + name);
|
||||
}
|
||||
|
||||
it = set2.iterator();
|
||||
while (it.hasNext()) {
|
||||
String s = (String) it.next();
|
||||
String name = ucd.getName(s.charAt(0));
|
||||
Utility.replace(name, "CYRILLIC ", "");
|
||||
log.println("### " + s + " <> XXX ; # " + name);
|
||||
}
|
||||
|
||||
log.close();
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -1,46 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
|
||||
<html>
|
||||
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
<meta http-equiv="Content-Language" content="en-us">
|
||||
<meta name="VI60_defaultClientScript" content="JavaScript">
|
||||
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
|
||||
<meta name="keywords" content="Unicode Standard, technical reports">
|
||||
<meta name="ProgId" content="FrontPage.Editor.Document">
|
||||
<title>Technical Reports</title>
|
||||
<link rel="stylesheet" type="text/css"
|
||||
href="http://www.unicode.org/webscripts/standard_styles.css">
|
||||
<script language="Javascript" src="http://www.unicode.org/webscripts/commonHeader.js"></script>
|
||||
</head>
|
||||
|
||||
<body text="#330000" topmargin="0" leftmargin="0" marginwidth="0"
|
||||
marginheight="0">
|
||||
|
||||
<form action="http://www.unicode.org/webscripts/POST">
|
||||
<table width="100%" cellpadding="0" cellspacing="0" border="0">
|
||||
<tr>
|
||||
<td colspan="2">
|
||||
<table width="100%" border="0" cellpadding="0" cellspacing="0">
|
||||
<tr>
|
||||
<td class="icon"><img border="0"
|
||||
src="http://www.unicode.org/webscripts/logo60s2.gif"
|
||||
align="middle" alt="[Unicode]" width="34" height="33"> Charts</td>
|
||||
<td class="bar"><a href="http://www.unicode.org" class="bar">Home</a>
|
||||
| <a href="http://www.unicode.org/sitemap/" class="bar">Site Map</a>
|
||||
| <a href="http://www.unicode.org/search" class="bar">Search </a><script language="Javascript" src="http://www.unicode.org/webscripts/commonSearch.js"></script><noscript><a
|
||||
href="http://www.unicode.org/webscripts/quick_links.html"
|
||||
class="bar" target="_blank">Goto</a></noscript></td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="2" class="gray"> </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<h1>Collation Charts</h1>
|
||||
</td>
|
||||
</tr>
|
||||
<tr><td valign="top" class="navCol">
|
@ -1,8 +0,0 @@
|
||||
<hr width="50%">
|
||||
<p align="center"><script language="Javascript" src="http://www.unicode.org/webscripts/lastModified.js"></script>
|
||||
</blockquote>
|
||||
</td>
|
||||
</table>
|
||||
</form>
|
||||
</body>
|
||||
</html>
|
@ -1,438 +0,0 @@
|
||||
package com.ibm.text.UCA;
|
||||
|
||||
import com.ibm.text.UCD.UCD_Types;
|
||||
import com.ibm.text.utility.Utility;
|
||||
|
||||
/**
|
||||
* For generation of Implicit CEs
|
||||
* @author Davis
|
||||
*
|
||||
* Cleaned up so that changes can be made more easily.
|
||||
* Old values:
|
||||
# First Implicit: E26A792D
|
||||
# Last Implicit: E3DC70C0
|
||||
# First CJK: E0030300
|
||||
# Last CJK: E0A9DD00
|
||||
# First CJK_A: E0A9DF00
|
||||
# Last CJK_A: E0DE3100
|
||||
|
||||
*/
|
||||
public class Implicit implements UCD_Types {
|
||||
|
||||
/**
|
||||
* constants
|
||||
*/
|
||||
static final boolean DEBUG = false;
|
||||
|
||||
static final long topByte = 0xFF000000L;
|
||||
static final long bottomByte = 0xFFL;
|
||||
static final long fourBytes = 0xFFFFFFFFL;
|
||||
|
||||
static final int MAX_INPUT = 0x220001; // 2 * Unicode range + 2
|
||||
|
||||
/**
|
||||
* Testing function
|
||||
* @param args ignored
|
||||
*/
|
||||
public static void main(String[] args) {
|
||||
System.out.println("Start");
|
||||
try {
|
||||
Implicit foo = new Implicit(0xE0, 0xE4);
|
||||
|
||||
//int x = foo.getRawImplicit(0xF810);
|
||||
foo.getRawFromImplicit(0xE20303E7);
|
||||
|
||||
int gap4 = foo.getGap4();
|
||||
System.out.println("Gap4: " + gap4);
|
||||
int gap3 = foo.getGap3();
|
||||
int minTrail = foo.getMinTrail();
|
||||
int maxTrail = foo.getMaxTrail();
|
||||
long last = 0;
|
||||
long current;
|
||||
for (int i = 0; i <= MAX_INPUT; ++i) {
|
||||
current = foo.getImplicitFromRaw(i) & fourBytes;
|
||||
|
||||
// check that it round-trips AND that all intervening ones are illegal
|
||||
int roundtrip = foo.getRawFromImplicit((int)current);
|
||||
if (roundtrip != i) {
|
||||
foo.throwError("No roundtrip", i);
|
||||
}
|
||||
if (last != 0) {
|
||||
for (long j = last + 1; j < current; ++j) {
|
||||
roundtrip = foo.getRawFromImplicit((int)j);
|
||||
// raise an error if it *doesn't* find an error
|
||||
if (roundtrip != -1) {
|
||||
foo.throwError("Fails to recognize illegal", j);
|
||||
}
|
||||
}
|
||||
}
|
||||
// now do other consistency checks
|
||||
long lastBottom = last & bottomByte;
|
||||
long currentBottom = current & bottomByte;
|
||||
long lastTop = last & topByte;
|
||||
long currentTop = current & topByte;
|
||||
|
||||
// do some consistency checks
|
||||
/*
|
||||
long gap = current - last;
|
||||
if (currentBottom != 0) { // if we are a 4-byte
|
||||
// gap has to be at least gap4
|
||||
// and gap from minTrail, maxTrail has to be at least gap4
|
||||
if (gap <= gap4) foo.throwError("Failed gap4 between", i);
|
||||
if (currentBottom < minTrail + gap4) foo.throwError("Failed gap4 before", i);
|
||||
if (currentBottom > maxTrail - gap4) foo.throwError("Failed gap4 after", i);
|
||||
} else { // we are a three-byte
|
||||
gap = gap >> 8; // move gap down for comparison.
|
||||
long current3Bottom = (current >> 8) & bottomByte;
|
||||
if (gap <= gap3) foo.throwError("Failed gap3 between ", i);
|
||||
if (current3Bottom < minTrail + gap3) foo.throwError("Failed gap3 before", i);
|
||||
if (current3Bottom > maxTrail - gap3) foo.throwError("Failed gap3 after", i);
|
||||
}
|
||||
*/
|
||||
// print out some values for spot-checking
|
||||
if (lastTop != currentTop || i == 0x10000 || i == 0x110000) {
|
||||
foo.show(i-3);
|
||||
foo.show(i-2);
|
||||
foo.show(i-1);
|
||||
if (i == 0) {
|
||||
// do nothing
|
||||
} else if (lastBottom == 0 && currentBottom != 0) {
|
||||
System.out.println("+ primary boundary, 4-byte CE's below");
|
||||
} else if (lastTop != currentTop) {
|
||||
System.out.println("+ primary boundary");
|
||||
}
|
||||
foo.show(i);
|
||||
foo.show(i+1);
|
||||
foo.show(i+2);
|
||||
System.out.println("...");
|
||||
}
|
||||
last = current;
|
||||
}
|
||||
foo.show(MAX_INPUT-2);
|
||||
foo.show(MAX_INPUT-1);
|
||||
foo.show(MAX_INPUT);
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
System.out.println("End");
|
||||
}
|
||||
}
|
||||
|
||||
private void throwError(String title, int cp) {
|
||||
throw new IllegalArgumentException(title + "\t" + Utility.hex(cp) + "\t" + Utility.hex(getImplicitFromRaw(cp) & fourBytes));
|
||||
}
|
||||
|
||||
private void throwError(String title, long ce) {
|
||||
throw new IllegalArgumentException(title + "\t" + Utility.hex(ce & fourBytes));
|
||||
}
|
||||
|
||||
private void show(int i) {
|
||||
if (i >= 0 && i <= MAX_INPUT) {
|
||||
System.out.println(Utility.hex(i) + "\t" + Utility.hex(getImplicitFromRaw(i) & fourBytes));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Precomputed by constructor
|
||||
*/
|
||||
int final3Multiplier;
|
||||
int final4Multiplier;
|
||||
int final3Count;
|
||||
int final4Count;
|
||||
int medialCount;
|
||||
int min3Primary;
|
||||
int min4Primary;
|
||||
int max4Primary;
|
||||
int minTrail;
|
||||
int maxTrail;
|
||||
int max3Trail;
|
||||
int max4Trail;
|
||||
int min4Boundary;
|
||||
|
||||
public int getGap4() {
|
||||
return final4Multiplier - 1;
|
||||
}
|
||||
|
||||
public int getGap3() {
|
||||
return final3Multiplier - 1;
|
||||
}
|
||||
|
||||
// old comment
|
||||
// we must skip all 00, 01, 02, FF bytes, so most bytes have 252 values
|
||||
// we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
|
||||
// we shift so that HAN all has the same first primary, for compression.
|
||||
// for the 4 byte case, we make the gap as large as we can fit.
|
||||
|
||||
/**
|
||||
* Supply parameters for generating implicit CEs
|
||||
*/
|
||||
public Implicit(int minPrimary, int maxPrimary) {
|
||||
// 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
|
||||
this(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set up to generate implicits.
|
||||
* @param minPrimary
|
||||
* @param maxPrimary
|
||||
* @param minTrail final byte
|
||||
* @param maxTrail final byte
|
||||
* @param gap3 the gap we leave for tailoring for 3-byte forms
|
||||
* @param primaries3count number of 3-byte primarys we can use (normally 1)
|
||||
*/
|
||||
public Implicit(int minPrimary, int maxPrimary, int minTrail, int maxTrail, int gap3, int primaries3count) {
|
||||
if (DEBUG) {
|
||||
System.out.println("minPrimary: " + Utility.hex(minPrimary));
|
||||
System.out.println("maxPrimary: " + Utility.hex(maxPrimary));
|
||||
System.out.println("minTrail: " + Utility.hex(minTrail));
|
||||
System.out.println("maxTrail: " + Utility.hex(maxTrail));
|
||||
System.out.println("gap3: " + Utility.hex(gap3));
|
||||
System.out.println("primaries3count: " + primaries3count);
|
||||
}
|
||||
// some simple parameter checks
|
||||
if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) throw new IllegalArgumentException("bad lead bytes");
|
||||
if (minTrail < 0 || minTrail >= maxTrail || maxTrail > 0xFF) throw new IllegalArgumentException("bad trail bytes");
|
||||
if (primaries3count < 1) throw new IllegalArgumentException("bad three-byte primaries");
|
||||
|
||||
this.minTrail = minTrail;
|
||||
this.maxTrail = maxTrail;
|
||||
|
||||
min3Primary = minPrimary;
|
||||
max4Primary = maxPrimary;
|
||||
// compute constants for use later.
|
||||
// number of values we can use in trailing bytes
|
||||
// leave room for empty values between AND above, e.g. if gap = 2
|
||||
// range 3..7 => +3 -4 -5 -6 -7: so 1 value
|
||||
// range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
|
||||
// range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
|
||||
final3Multiplier = gap3 + 1;
|
||||
final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
|
||||
max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
|
||||
|
||||
// medials can use full range
|
||||
medialCount = (maxTrail - minTrail + 1);
|
||||
// find out how many values fit in each form
|
||||
int threeByteCount = medialCount * final3Count;
|
||||
// now determine where the 3/4 boundary is.
|
||||
// we use 3 bytes below the boundary, and 4 above
|
||||
int primariesAvailable = maxPrimary - minPrimary + 1;
|
||||
int primaries4count = primariesAvailable - primaries3count;
|
||||
|
||||
int min3ByteCoverage = primaries3count * threeByteCount;
|
||||
min4Primary = minPrimary + primaries3count;
|
||||
min4Boundary = min3ByteCoverage;
|
||||
// Now expand out the multiplier for the 4 bytes, and redo.
|
||||
|
||||
int totalNeeded = MAX_INPUT - min4Boundary;
|
||||
int neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
|
||||
if (DEBUG) System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte);
|
||||
|
||||
int neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
|
||||
if (DEBUG) System.out.println("neededPerFinalByte: " + neededPerFinalByte);
|
||||
|
||||
int gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
|
||||
if (DEBUG) System.out.println("expandedGap: " + gap4);
|
||||
if (gap4 < 1) throw new IllegalArgumentException("must have larger gap4s");
|
||||
|
||||
final4Multiplier = gap4 + 1;
|
||||
final4Count = neededPerFinalByte;
|
||||
max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
|
||||
|
||||
if (primaries4count * medialCount * medialCount * final4Count < MAX_INPUT) {
|
||||
throw new IllegalArgumentException("internal error");
|
||||
}
|
||||
if (DEBUG) {
|
||||
System.out.println("final4Count: " + final4Count);
|
||||
for (int counter = 0; counter < final4Count; ++counter) {
|
||||
int value = minTrail + (1 + counter)*final4Multiplier;
|
||||
System.out.println(counter + "\t" + value + "\t" + Utility.hex(value));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static public int divideAndRoundUp(int a, int b) {
|
||||
return 1 + (a-1)/b;
|
||||
}
|
||||
/**
|
||||
* Converts implicit CE into raw integer
|
||||
* @param implicit
|
||||
* @return -1 if illegal format
|
||||
*/
|
||||
public int getRawFromImplicit(int implicit) {
|
||||
int result;
|
||||
int b3 = implicit & 0xFF;
|
||||
implicit >>= 8;
|
||||
int b2 = implicit & 0xFF;
|
||||
implicit >>= 8;
|
||||
int b1 = implicit & 0xFF;
|
||||
implicit >>= 8;
|
||||
int b0 = implicit & 0xFF;
|
||||
|
||||
// simple parameter checks
|
||||
if (b0 < min3Primary || b0 > max4Primary
|
||||
|| b1 < minTrail || b1 > maxTrail) return -1;
|
||||
// normal offsets
|
||||
b1 -= minTrail;
|
||||
|
||||
// take care of the final values, and compose
|
||||
if (b0 < min4Primary) {
|
||||
if (b2 < minTrail || b2 > max3Trail || b3 != 0) return -1;
|
||||
b2 -= minTrail;
|
||||
int remainder = b2 % final3Multiplier;
|
||||
if (remainder != 0) return -1;
|
||||
b0 -= min3Primary;
|
||||
b2 /= final3Multiplier;
|
||||
result = ((b0 * medialCount) + b1) * final3Count + b2;
|
||||
} else {
|
||||
if (b2 < minTrail || b2 > maxTrail
|
||||
|| b3 < minTrail || b3 > max4Trail) return -1;
|
||||
b2 -= minTrail;
|
||||
b3 -= minTrail;
|
||||
int remainder = b3 % final4Multiplier;
|
||||
if (remainder != 0) return -1;
|
||||
b3 /= final4Multiplier;
|
||||
b0 -= min4Primary;
|
||||
result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
|
||||
}
|
||||
// final check
|
||||
if (result < 0 || result > MAX_INPUT) return -1;
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate the implicit CE, from raw integer.
|
||||
* Left shifted to put the first byte at the top of an int.
|
||||
* @param cp code point
|
||||
* @return
|
||||
*/
|
||||
public int getImplicitFromRaw(int cp) {
|
||||
if (cp < 0 || cp > MAX_INPUT) {
|
||||
throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
|
||||
}
|
||||
int last0 = cp - min4Boundary;
|
||||
if (last0 < 0) {
|
||||
int last1 = cp / final3Count;
|
||||
last0 = cp % final3Count;
|
||||
|
||||
int last2 = last1 / medialCount;
|
||||
last1 %= medialCount;
|
||||
|
||||
last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
|
||||
last1 = minTrail + last1; // offset
|
||||
last2 = min3Primary + last2; // offset
|
||||
|
||||
if (last2 >= min4Primary) {
|
||||
throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
|
||||
}
|
||||
|
||||
return (last2 << 24) + (last1 << 16) + (last0 << 8);
|
||||
} else {
|
||||
int last1 = last0 / final4Count;
|
||||
last0 %= final4Count;
|
||||
|
||||
int last2 = last1 / medialCount;
|
||||
last1 %= medialCount;
|
||||
|
||||
int last3 = last2 / medialCount;
|
||||
last2 %= medialCount;
|
||||
|
||||
last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
|
||||
last1 = minTrail + last1; // offset
|
||||
last2 = minTrail + last2; // offset
|
||||
last3 = min4Primary + last3; // offset
|
||||
|
||||
if (last3 > max4Primary) {
|
||||
throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
|
||||
}
|
||||
|
||||
return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Gets an Implicit from a code point. Internally,
|
||||
* swaps (which produces a raw value 0..220000,
|
||||
* then converts raw to implicit.
|
||||
* @param cp
|
||||
* @return
|
||||
*/
|
||||
public int getSwappedImplicit(int cp) {
|
||||
if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
|
||||
|
||||
// Produce Raw value
|
||||
// note, we add 1 so that the first value is always empty!!
|
||||
cp = Implicit.swapCJK(cp) + 1;
|
||||
// we now have a range of numbers from 0 to 220000.
|
||||
|
||||
if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
|
||||
|
||||
return getImplicitFromRaw(cp);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Function used to:
|
||||
* a) collapse the 2 different Han ranges from UCA into one (in the right order), and
|
||||
* b) bump any non-CJK characters by 10FFFF.
|
||||
* The relevant blocks are:
|
||||
* A: 4E00..9FFF; CJK Unified Ideographs
|
||||
* F900..FAFF; CJK Compatibility Ideographs
|
||||
* B: 3400..4DBF; CJK Unified Ideographs Extension A
|
||||
* 20000..XX; CJK Unified Ideographs Extension B (and others later on)
|
||||
* As long as
|
||||
* no new B characters are allocated between 4E00 and FAFF, and
|
||||
* no new A characters are outside of this range,
|
||||
* (very high probability) this simple code will work.
|
||||
* The reordered blocks are:
|
||||
* Block1 is CJK
|
||||
* Block2 is CJK_COMPAT_USED
|
||||
* Block3 is CJK_A
|
||||
* (all contiguous)
|
||||
* Any other CJK gets its normal code point
|
||||
* Any non-CJK gets +10FFFF
|
||||
* When we reorder Block1, we make sure that it is at the very start,
|
||||
* so that it will use a 3-byte form.
|
||||
* Warning: the we only pick up the compatibility characters that are
|
||||
* NOT decomposed, so that block is smaller!
|
||||
*/
|
||||
|
||||
static int NON_CJK_OFFSET = 0x110000;
|
||||
|
||||
static int swapCJK(int i) {
|
||||
|
||||
if (i >= CJK_BASE) {
|
||||
if (i < CJK_LIMIT) return i - CJK_BASE;
|
||||
|
||||
if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET;
|
||||
|
||||
if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE
|
||||
+ (CJK_LIMIT - CJK_BASE);
|
||||
if (i < CJK_B_BASE) return i + NON_CJK_OFFSET;
|
||||
|
||||
if (i < CJK_B_LIMIT) return i; // non-BMP-CJK
|
||||
|
||||
return i + NON_CJK_OFFSET; // non-CJK
|
||||
}
|
||||
if (i < CJK_A_BASE) return i + NON_CJK_OFFSET;
|
||||
|
||||
if (i < CJK_A_LIMIT) return i - CJK_A_BASE
|
||||
+ (CJK_LIMIT - CJK_BASE)
|
||||
+ (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
|
||||
return i + NON_CJK_OFFSET; // non-CJK
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return
|
||||
*/
|
||||
public int getMinTrail() {
|
||||
return minTrail;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return
|
||||
*/
|
||||
public int getMaxTrail() {
|
||||
return maxTrail;
|
||||
}
|
||||
|
||||
}
|
@ -1,175 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $
|
||||
* $Date: 2005/04/06 15:15:43 $
|
||||
* $Revision: 1.20 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCA;
|
||||
import java.io.File;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.CanonicalIterator;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.text.UCD.*;
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
|
||||
public class Main {
|
||||
//static final String UCDVersion = "4.0.0";
|
||||
static final String[] ICU_FILES = {"writeCollationValidityLog", "writeFractionalUCA",
|
||||
"WriteRules", "WriteRulesXML", "writeconformance", "writeconformanceshifted",
|
||||
"short",
|
||||
"WriteRules", "WriteRulesXML", "writeconformance", "writeconformanceshifted",
|
||||
"noCE", "short",
|
||||
"WriteRules",
|
||||
"collationChart"
|
||||
};
|
||||
|
||||
public static void main(String args[]) throws Exception {
|
||||
// NOTE: so far, we don't need to build the UCA with anything but the latest versions.
|
||||
// A few changes would need to be made to the code to do older versions.
|
||||
try {
|
||||
|
||||
if (args.length == 0) args = new String[] {"?"}; // force the help comment
|
||||
boolean shortPrint = false;
|
||||
boolean noCE = false;
|
||||
|
||||
for (int i = 0; i < args.length; ++i) {
|
||||
String arg = args[i];
|
||||
System.out.println("OPTION: " + arg);
|
||||
if (arg.charAt(0) == '#') return; // skip rest of line
|
||||
|
||||
if (arg.equalsIgnoreCase("ICU")) {
|
||||
args = Utility.append(ICU_FILES, Utility.subarray(args, i+1));
|
||||
i = -1;
|
||||
continue;
|
||||
}
|
||||
if (arg.equalsIgnoreCase("version")) {
|
||||
Default.setUCD(args[++i]); // get next arg
|
||||
continue;
|
||||
}
|
||||
if (WriteCollationData.collator == null) {
|
||||
System.out.println("Building UCA");
|
||||
String file = Utility.searchDirectory(new File(UCD_Types.BASE_DIR + "UCA\\" + Default.ucdVersion() + "\\"), "allkeys", true, ".txt");
|
||||
WriteCollationData.collator = new UCA(file, Default.ucdVersion());
|
||||
System.out.println("Built version " + WriteCollationData.collator.getDataVersion()
|
||||
+ "/ucd: " + WriteCollationData.collator.getUCDVersion());
|
||||
|
||||
System.out.println("Building UCD data");
|
||||
WriteCollationData.ucd = UCD.make(WriteCollationData.collator.getUCDVersion());
|
||||
|
||||
}
|
||||
if (arg.equalsIgnoreCase("GenOverlap")) GenOverlap.test(WriteCollationData.collator);
|
||||
else if (arg.equalsIgnoreCase("validateUCA")) GenOverlap.validateUCA(WriteCollationData.collator);
|
||||
//else if (arg.equalsIgnoreCase("writeNonspacingDifference")) WriteCollationData.writeNonspacingDifference();
|
||||
|
||||
else if (arg.equalsIgnoreCase("collationChart")) WriteCharts.collationChart(WriteCollationData.collator);
|
||||
else if (arg.equalsIgnoreCase("scriptChart")) WriteCharts.scriptChart();
|
||||
else if (arg.equalsIgnoreCase("normalizationChart")) WriteCharts.normalizationChart();
|
||||
else if (arg.equalsIgnoreCase("caseChart")) WriteCharts.caseChart();
|
||||
else if (arg.equalsIgnoreCase("indexChart")) WriteCharts.indexChart();
|
||||
else if (arg.equalsIgnoreCase("special")) WriteCharts.special();
|
||||
|
||||
else if (arg.equalsIgnoreCase("writeCompositionChart")) WriteCharts.writeCompositionChart();
|
||||
|
||||
else if (arg.equalsIgnoreCase("CheckHash")) GenOverlap.checkHash(WriteCollationData.collator);
|
||||
else if (arg.equalsIgnoreCase("generateRevision")) GenOverlap.generateRevision(WriteCollationData.collator);
|
||||
else if (arg.equalsIgnoreCase("listCyrillic")) GenOverlap.listCyrillic(WriteCollationData.collator);
|
||||
|
||||
else if (arg.equalsIgnoreCase("WriteRules")) WriteCollationData.writeRules(WriteCollationData.WITHOUT_NAMES, shortPrint, noCE);
|
||||
// else if (arg.equalsIgnoreCase("WriteRulesWithNames")) WriteCollationData.writeRules(WriteCollationData.WITH_NAMES);
|
||||
else if (arg.equalsIgnoreCase("WriteRulesXML")) WriteCollationData.writeRules(WriteCollationData.IN_XML, shortPrint, noCE);
|
||||
else if (arg.equalsIgnoreCase("checkDisjointIgnorables")) WriteCollationData.checkDisjointIgnorables();
|
||||
else if (arg.equalsIgnoreCase("writeContractions")) WriteCollationData.writeContractions();
|
||||
else if (arg.equalsIgnoreCase("writeFractionalUCA")) WriteCollationData.writeFractionalUCA("FractionalUCA");
|
||||
else if (arg.equalsIgnoreCase("writeConformance")) WriteCollationData.writeConformance("CollationTest_NON_IGNORABLE", UCA.NON_IGNORABLE, shortPrint);
|
||||
else if (arg.equalsIgnoreCase("writeConformanceSHIFTED")) WriteCollationData.writeConformance("CollationTest_SHIFTED", UCA.SHIFTED, shortPrint);
|
||||
else if (arg.equalsIgnoreCase("testCompatibilityCharacters")) WriteCollationData.testCompatibilityCharacters();
|
||||
else if (arg.equalsIgnoreCase("writeCollationValidityLog")) WriteCollationData.writeCollationValidityLog();
|
||||
else if (arg.equalsIgnoreCase("writeCaseExceptions")) WriteCollationData.writeCaseExceptions();
|
||||
else if (arg.equalsIgnoreCase("writeJavascriptInfo")) WriteCollationData.writeJavascriptInfo();
|
||||
else if (arg.equalsIgnoreCase("writeCaseFolding")) WriteCollationData.writeCaseFolding();
|
||||
else if (arg.equalsIgnoreCase("javatest")) WriteCollationData.javatest();
|
||||
else if (arg.equalsIgnoreCase("short")) shortPrint = !shortPrint;
|
||||
else if (arg.equalsIgnoreCase("noCE")) noCE = !noCE;
|
||||
|
||||
else if (arg.equalsIgnoreCase("checkCanonicalIterator")) checkCanonicalIterator();
|
||||
|
||||
|
||||
else if (arg.equalsIgnoreCase("writeAllocation")) WriteCharts.writeAllocation();
|
||||
// else if (arg.equalsIgnoreCase("probe")) Probe.test();
|
||||
|
||||
|
||||
else {
|
||||
System.out.println();
|
||||
System.out.println("UNKNOWN OPTION (" + arg + "): must be one of the following (case-insensitive)");
|
||||
System.out.println("\tWriteRulesXML, WriteRulesWithNames, WriteRules,");
|
||||
System.out.println("\tcheckDisjointIgnorables, writeContractions,");
|
||||
System.out.println("\twriteFractionalUCA, writeConformance, writeConformanceSHIFTED, testCompatibilityCharacters,");
|
||||
System.out.println("\twriteCollationValidityLog, writeCaseExceptions, writeJavascriptInfo, writeCaseFolding");
|
||||
System.out.println("\tjavatest, hex (used for conformance)");
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
System.out.println("Done");
|
||||
|
||||
/*
|
||||
String s = WriteCollationData.collator.getSortKey("\u1025\u102E", UCA.NON_IGNORABLE, true);
|
||||
System.out.println(Utility.hex("\u0595\u0325") + ", " + WriteCollationData.collator.toString(s));
|
||||
String t = WriteCollationData.collator.getSortKey("\u0596\u0325", UCA.NON_IGNORABLE, true);
|
||||
System.out.println(Utility.hex("\u0596\u0325") + ", " + WriteCollationData.collator.toString(t));
|
||||
|
||||
|
||||
Normalizer foo = new Normalizer(Normalizer.NFKD);
|
||||
char x = '\u1EE2';
|
||||
System.out.println(Utility.hex(x) + " " + ucd.getName(x));
|
||||
String nx = foo.normalize(x);
|
||||
for (int i = 0; i < nx.length(); ++i) {
|
||||
char c = nx.charAt(i);
|
||||
System.out.println(ucd.getCanonicalClass(c));
|
||||
}
|
||||
System.out.println(Utility.hex(nx, " ") + " " + ucd.getName(nx));
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static void checkCanonicalIterator() {
|
||||
|
||||
int firstImplicit = WriteCollationData.getImplicitPrimary(UCD_Types.CJK_BASE);
|
||||
System.out.println("UCD_Types.CJK_BASE: " + Utility.hex(UCD_Types.CJK_BASE));
|
||||
System.out.println("first implicit: " + Utility.hex((long)(firstImplicit & 0xFFFFFFFFL)));
|
||||
|
||||
CanonicalIterator it = new CanonicalIterator("");
|
||||
String[] tests = new String[] {"\uF900", "\u00C5d\u0307\u0327"};
|
||||
for (int j = 0; j < tests.length; ++j) {
|
||||
System.out.println(Default.ucd().getCodeAndName(tests[j]));
|
||||
it.setSource(tests[j]);
|
||||
String ss;
|
||||
for (int i = 0; (ss = it.next()) != null; ++i) {
|
||||
System.out.println(i + "\t" + Default.ucd().getCodeAndName(ss));
|
||||
}
|
||||
}
|
||||
// verify that nothing breaks
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
int cat = UCharacter.getType(i);
|
||||
if (cat == UCharacter.UNASSIGNED || cat == UCharacter.PRIVATE_USE || cat == UCharacter.SURROGATE) continue;
|
||||
String s = UTF16.valueOf(i);
|
||||
try {
|
||||
it.setSource(s);
|
||||
} catch (RuntimeException e) {
|
||||
System.out.println("Failure with U+" + Utility.hex(i));
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,67 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/RuleComparator.java,v $
|
||||
* $Date: 2001/08/31 00:20:40 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCA;
|
||||
|
||||
import com.ibm.text.UCD.*;
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
/**
 * Orders strings by plain UTF-16 code unit comparison: the first differing
 * code unit decides, and when one string is a prefix of the other the
 * shorter string sorts first.
 */
public final class RuleComparator implements java.util.Comparator {

    /**
     * Compares two strings code unit by code unit.
     *
     * @param s the first value; must be a {@code String}
     * @param t the second value; must be a {@code String}
     * @return -1, 0 or 1 as {@code s} sorts before, equal to or after {@code t}
     * @throws ClassCastException if either argument is not a {@code String}
     * @throws NullPointerException if either argument is null
     */
    public int compare(Object s, Object t) {
        String ss = (String)s;
        String tt = (String)t;

        // Only the shared prefix can hold a differing code unit; staying
        // inside it means charAt can never go out of range.
        int shared = Math.min(ss.length(), tt.length());
        for (int i = 0; i < shared; ++i) {
            char cs = ss.charAt(i);
            char ct = tt.charAt(i);
            if (cs != ct) return cs < ct ? -1 : 1;
        }

        // One string is a prefix of the other (or they are equal).
        if (ss.length() > tt.length()) return 1;
        if (ss.length() < tt.length()) return -1;
        return 0;
    }
}
|
File diff suppressed because it is too large
Load Diff
@ -1,336 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA_Data.java,v $
|
||||
* $Date: 2006/06/08 18:16:40 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCA;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.Reader;
|
||||
import java.io.PrintWriter;
|
||||
import java.io.FileReader;
|
||||
import java.text.MessageFormat;
|
||||
import java.io.IOException;
|
||||
import com.ibm.text.UCD.Normalizer;
|
||||
import com.ibm.text.UCD.UCD;
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
public class UCA_Data implements UCA_Types {
|
||||
static final boolean DEBUG = false;
|
||||
static final boolean DEBUG_SHOW_ADD = false;
|
||||
static final boolean lessThan410 = false;
|
||||
|
||||
private Normalizer toD;
|
||||
private UCD ucd;
|
||||
|
||||
public UCA_Data(Normalizer toD, UCD ucd) {
|
||||
this.toD = toD;
|
||||
this.ucd = ucd;
|
||||
}
|
||||
|
||||
/**
|
||||
* The collation element data is stored a couple of different structures.
|
||||
* First is collationElements, which generally contains the 32-bit CE corresponding
|
||||
* to the data. It is directly indexed by character code.<br>
|
||||
* For brevity in the implementation, we just use a flat array.
|
||||
* A real implementation would use a multi-stage table, as described in TUS Section 5.
|
||||
* table of simple collation elements, indexed by char.<br>
|
||||
* Exceptional cases: expanding, contracting, unsupported are handled as described below.
|
||||
*/
|
||||
private int[] collationElements = new int[65536];
|
||||
|
||||
/**
|
||||
* Although a single character can expand into multiple CEs, we don't want to burden
|
||||
* the normal case with the storage. So, they get a special value in the collationElements
|
||||
* array. This value has a distinct primary weight, followed by an index into a separate
|
||||
* table called expandingTable. All of the CEs in that table, up to a TERMINATOR value
|
||||
* will be used for the expansion. The implementation is as a stack; this just makes it
|
||||
* easy to generate.
|
||||
*/
|
||||
private IntStack expandingTable = new IntStack(3600); // initial number is from compKeys
|
||||
|
||||
/**
|
||||
* For now, this is just a simple mapping of strings to collation elements.
|
||||
* The implementation depends on the contracting characters being "completed",
|
||||
* so that it can be efficiently determined when to stop looking.
|
||||
*/
|
||||
private Map contractingTable = new TreeMap();
|
||||
|
||||
{
|
||||
// clear some tables
|
||||
for (int i = 0; i < collationElements.length; ++i) {
|
||||
collationElements[i] = UNSUPPORTED_FLAG;
|
||||
}
|
||||
// preload with parts
|
||||
for (char i = 0xD800; i < 0xDC00; ++i) {
|
||||
collationElements[i] = CONTRACTING;
|
||||
addToContractingTable(String.valueOf(i), UNSUPPORTED_FLAG);
|
||||
}
|
||||
checkConsistency();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the type of the CE
|
||||
*/
|
||||
public byte getCEType(int ch) {
|
||||
if (ch > 0xFFFF) ch = UTF16.getLeadSurrogate(ch); // first if expands
|
||||
|
||||
int ce = collationElements[ch];
|
||||
if (ce == UNSUPPORTED_FLAG) {
|
||||
|
||||
// Special check for Han, Hangul
|
||||
if (ucd.isHangulSyllable(ch)) return HANGUL_CE;
|
||||
|
||||
if (ucd.isCJK_BASE(ch)) return CJK_CE;
|
||||
if (ucd.isCJK_AB(ch)) return CJK_AB_CE;
|
||||
|
||||
// special check for unsupported surrogate pair, 20 1/8 bits
|
||||
//if (0xD800 <= ch && ch <= 0xDFFF) {
|
||||
// return SURROGATE_CE;
|
||||
//}
|
||||
return UNSUPPORTED_CE;
|
||||
}
|
||||
if (ce == CONTRACTING) return CONTRACTING_CE;
|
||||
if ((ce & EXPANDING_MASK) == EXPANDING_MASK) return EXPANDING_CE;
|
||||
return NORMAL_CE;
|
||||
}
|
||||
|
||||
public void add(String source, IntStack ces) {
|
||||
add(new StringBuffer(source), ces);
|
||||
}
|
||||
|
||||
public void add(StringBuffer source, IntStack ces) {
|
||||
|
||||
if (DEBUG_SHOW_ADD) {
|
||||
System.out.println("Adding: " + ucd.getCodeAndName(source.toString()) + CEList.toString(ces));
|
||||
}
|
||||
if (source.length() < 1 || ces.length() < 1) {
|
||||
throw new IllegalArgumentException("String or CEs too short");
|
||||
}
|
||||
|
||||
int ce;
|
||||
if (ces.length() == 1) {
|
||||
ce = ces.get(0);
|
||||
} else {
|
||||
ce = EXPANDING_MASK | expandingTable.getTop();
|
||||
expandingTable.append(ces);
|
||||
expandingTable.append(TERMINATOR);
|
||||
}
|
||||
|
||||
// assign CE(s) to char(s)
|
||||
char value = source.charAt(0);
|
||||
//if (value == 0x10000) System.out.print("DEBUG2: " + source);
|
||||
|
||||
if (source.length() > 1) {
|
||||
addToContractingTable(source, ce);
|
||||
if (collationElements[value] == UNSUPPORTED_FLAG) {
|
||||
collationElements[value] = CONTRACTING; // mark special
|
||||
} else if (collationElements[value] != CONTRACTING) {
|
||||
// move old value to contracting table!
|
||||
//contractingTable.put(String.valueOf(value), new Integer(collationElements[value]));
|
||||
addToContractingTable(String.valueOf(value), collationElements[value]);
|
||||
collationElements[value] = CONTRACTING; // signal we must look up in table
|
||||
}
|
||||
} else if (collationElements[value] == CONTRACTING) {
|
||||
// must add old value to contracting table!
|
||||
addToContractingTable(source, ce);
|
||||
//contractingTable.put(source, new Integer(ce));
|
||||
} else {
|
||||
collationElements[source.charAt(0)] = ce; // normal
|
||||
}
|
||||
//if (DEBUG) checkConsistency();
|
||||
}
|
||||
|
||||
boolean isCompletelyIgnoreable(int cp) {
|
||||
int ce = collationElements[cp < UTF16.SUPPLEMENTARY_MIN_VALUE ? cp : UTF16.getLeadSurrogate(cp)];
|
||||
if (ce == 0) return true;
|
||||
if (ce != CONTRACTING) return false;
|
||||
Object newValue = contractingTable.get(UTF16.valueOf(cp));
|
||||
if (newValue == null) return false;
|
||||
return ((Integer)newValue).intValue() == 0;
|
||||
}
|
||||
|
||||
// returns new pos, fills in result.
|
||||
public int get(char ch, StringBuffer decompositionBuffer, int index, IntStack result) {
|
||||
int ce = collationElements[ch];
|
||||
|
||||
if (ce == CONTRACTING) {
|
||||
// Contracting is probably the most interesting (read "tricky") part
|
||||
// of the algorithm.
|
||||
// First get longest substring that is in the contracting table.
|
||||
// For simplicity, we use a hash table for contracting.
|
||||
// There are much better optimizations,
|
||||
// but they take a more complicated build algorithm than we want to show here.
|
||||
// NOTE: We are guaranteed that the first code unit is in the contracting table because
|
||||
// of the build process.
|
||||
String probe = String.valueOf(ch);
|
||||
Object value = contractingTable.get(probe);
|
||||
if (value == null) throw new IllegalArgumentException("Missing value for " + Utility.hex(ch));
|
||||
|
||||
// complete the first character, if part of supplementary
|
||||
if (UTF16.isLeadSurrogate(ch) && index < decompositionBuffer.length()) {
|
||||
char ch2 = decompositionBuffer.charAt(index);
|
||||
String newProbe = probe + ch2;
|
||||
Object newValue = contractingTable.get(newProbe);
|
||||
if (newValue != null) {
|
||||
probe = newProbe;
|
||||
value = newValue;
|
||||
index++;
|
||||
}
|
||||
}
|
||||
|
||||
// We loop, trying to add successive CODE UNITS to the longest substring.
|
||||
int cp2;
|
||||
while (index < decompositionBuffer.length()) {
|
||||
//char ch2 = decompositionBuffer.charAt(index);
|
||||
cp2 = UTF16.charAt(decompositionBuffer, index);
|
||||
int increment = UTF16.getCharCount(cp2);
|
||||
|
||||
// CHECK if last char was completely ignorable
|
||||
if (lessThan410 && isCompletelyIgnoreable(cp2)) {
|
||||
index += increment; // just skip char don't set probe, value
|
||||
continue;
|
||||
}
|
||||
|
||||
// see whether the current string plus the next char are in
|
||||
// the contracting table.
|
||||
String newProbe = probe + UTF16.valueOf(cp2);
|
||||
Object newValue = contractingTable.get(newProbe);
|
||||
if (newValue == null) break; // stop if not in table.
|
||||
|
||||
// We succeeded--so update our new values, and set index
|
||||
// and quaternary to indicate that we swallowed another character.
|
||||
probe = newProbe;
|
||||
value = newValue;
|
||||
index += increment;
|
||||
}
|
||||
|
||||
// Now, see if we can add any combining marks
|
||||
short lastCan = 0;
|
||||
int increment;
|
||||
for (int i = index; i < decompositionBuffer.length(); i += increment) {
|
||||
// We only take certain characters. They have to be accents,
|
||||
// and they have to not be blocked.
|
||||
// Unlike above, if we don't find a match (and it was an accent!)
|
||||
// then we don't stop, we continue looping.
|
||||
cp2 = UTF16.charAt(decompositionBuffer, i);
|
||||
increment = UTF16.getCharCount(cp2);
|
||||
short can = toD.getCanonicalClass(cp2);
|
||||
if (can == 0) break; // stop with any zero (non-accent)
|
||||
if (can == lastCan) continue; // blocked if same class as last
|
||||
lastCan = can; // remember for next time
|
||||
|
||||
// CHECK if last char was completely ignorable. If so, skip it.
|
||||
if (lessThan410 && isCompletelyIgnoreable(cp2)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Now see if we can successfully add it onto our string
|
||||
// and find it in the contracting table.
|
||||
String newProbe = probe + UTF16.valueOf(cp2);
|
||||
Object newValue = contractingTable.get(newProbe);
|
||||
if (newValue == null) continue;
|
||||
|
||||
// We succeeded--so update our new values, remove the char, and update
|
||||
// quaternary to indicate that we swallowed another character.
|
||||
probe = newProbe;
|
||||
value = newValue;
|
||||
decompositionBuffer.setCharAt(i,'\u0000'); // zero char
|
||||
if (increment == 2) {
|
||||
// WARNING: we had a supplementary character. zero BOTH parts
|
||||
decompositionBuffer.setCharAt(i+1,'\u0000'); // zero char
|
||||
}
|
||||
}
|
||||
|
||||
// we are all done, and can extract the CE from the last value set.
|
||||
ce = ((Integer)value).intValue();
|
||||
|
||||
}
|
||||
|
||||
// if the CE is not expanding) we are done.
|
||||
if ((ce & EXPANDING_MASK) != EXPANDING_MASK) {
|
||||
result.push(ce);
|
||||
} else {
|
||||
// expanding, so copy list of items onto stack
|
||||
int ii = ce & EXCEPTION_INDEX_MASK; // get index
|
||||
// copy onto stack from index until reach TERMINATOR
|
||||
while (true) {
|
||||
ce = expandingTable.get(ii++);
|
||||
if (ce == TERMINATOR) break;
|
||||
result.push(ce);
|
||||
}
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
private void addToContractingTable(Object s, int ce) {
|
||||
if (s == null) {
|
||||
throw new IllegalArgumentException("String can't be null");
|
||||
}
|
||||
contractingTable.put(s.toString(), new Integer(ce));
|
||||
}
|
||||
|
||||
void checkConsistency() {
|
||||
// at this point, we have to guarantee that the contractingTable is CLOSED
|
||||
// e.g. if a substring of length n is in the table, then the first n-1 characters
|
||||
// are also!!
|
||||
|
||||
// First check consistency. the CE for a value is CONTRACTING if and only if there is a contraction starting
|
||||
// with that value.
|
||||
|
||||
UnicodeSet ceSet = new UnicodeSet();
|
||||
for (int i = 0; i < collationElements.length; ++i) {
|
||||
if (collationElements[i] == CONTRACTING) ceSet.add(i);
|
||||
}
|
||||
UnicodeSet ceSet2 = new UnicodeSet();
|
||||
Iterator enum1 = contractingTable.keySet().iterator();
|
||||
while (enum1.hasNext()) {
|
||||
String sequence = (String)enum1.next();
|
||||
ceSet2.add(sequence.charAt(0));
|
||||
}
|
||||
|
||||
if (!ceSet.equals(ceSet2)) {
|
||||
System.out.println("In both: " + new UnicodeSet(ceSet).retainAll(ceSet2).toPattern(true));
|
||||
System.out.println("CONTRACTING but not in table: " + new UnicodeSet(ceSet).removeAll(ceSet2).toPattern(true));
|
||||
System.out.println("In table but not CONTRACTING: " + new UnicodeSet(ceSet2).removeAll(ceSet).toPattern(true));
|
||||
throw new IllegalArgumentException("Inconsistent data");
|
||||
}
|
||||
|
||||
/*
|
||||
0FB2 0F71 ; [.124E.0020.0002.0FB2][.125F.0020.0002.0F71] # TIBETAN SUBJOINED LETTER RA + TIBETAN VOWEL SIGN AA
|
||||
0FB3 0F71 ; [.1250.0020.0002.0FB3][.125F.0020.0002.0F71] # TIBETAN SUBJOINED LETTER LA + TIBETAN VOWEL SIGN AA
|
||||
int[] temp1 = int[20];
|
||||
int[] temp2 = int[20];
|
||||
int[] temp3 = int[20];
|
||||
getCEs("\u0fb2", true, temp1);
|
||||
getCEs("\u0fb3", true, temp2);
|
||||
getCEs("\u0f71", true, temp3);
|
||||
add("\u0FB2\u0F71", concat(temp1, temp3));
|
||||
*/
|
||||
|
||||
}
|
||||
|
||||
Iterator getContractions() {
|
||||
return contractingTable.keySet().iterator();
|
||||
}
|
||||
|
||||
int getContractionCount() {
|
||||
return contractingTable.size();
|
||||
}
|
||||
|
||||
boolean contractionTableContains(String s) {
|
||||
return contractingTable.get(s) != null;
|
||||
}
|
||||
|
||||
}
|
@ -1,98 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA_Types.java,v $
|
||||
* $Date: 2005/04/06 08:48:17 $
|
||||
* $Revision: 1.7 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCA;
|
||||
import com.ibm.text.UCD.*;
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
public interface UCA_Types {
|
||||
/**
|
||||
* Version of the UCA tables to use
|
||||
*/
|
||||
//private static final String VERSION = "-3.0.1d3"; // ""; // "-2.1.9d7";
|
||||
//public static final String UCA_BASE = "4.1.0"; // "3.1.1"; // ; // ""; // "-2.1.9d7";
|
||||
//public static final String VERSION = "-" + UCA_BASE; // + "d6" ""; // "-2.1.9d7";
|
||||
public static final String ALLFILES = "allkeys"; // null if not there
|
||||
|
||||
public static final String BASE_UCA_GEN_DIR = UCD_Types.GEN_DIR + "collation" + "\\";
|
||||
public static final char LEVEL_SEPARATOR = '\u0000';
|
||||
/**
|
||||
* Expanding characters are marked with a exception bit combination
|
||||
* in the collationElement table.
|
||||
* This means that they map to more than one CE, which is looked up in
|
||||
* the expansionTable by index.
|
||||
*/
|
||||
static final int EXPANDING_MASK = 0xFFFF0000; // marks expanding range start
|
||||
|
||||
/**
|
||||
* This mask is used to get the index from an EXPANDING exception.
|
||||
* The contracting characters can also make use of this in a future optimization.
|
||||
*/
|
||||
static final int EXCEPTION_INDEX_MASK = 0x0000FFFF;
|
||||
|
||||
/**
|
||||
* Contracting characters are marked with a exception bit combination
|
||||
* in the collationElement table.
|
||||
* This means that they are the first character of a contraction, and need
|
||||
* to be looked up (with following characters) in the contractingTable.<br>
|
||||
* This isn't a MASK since there is exactly one value.
|
||||
*/
|
||||
static final int CONTRACTING = 0xFFFE0000;
|
||||
|
||||
static final int UNSUPPORTED_FLAG = 0xFFFD0000;
|
||||
|
||||
|
||||
/**
|
||||
* Used to composed Hangul and Han characters
|
||||
*/
|
||||
|
||||
static final int NEUTRAL_SECONDARY = 0x20;
|
||||
static final int NEUTRAL_TERTIARY = 0x02;
|
||||
|
||||
/** Enum for alternate handling */
|
||||
public static final byte SHIFTED = 0, ZEROED = 1, NON_IGNORABLE = 2, SHIFTED_TRIMMED = 3, LAST = 3;
|
||||
|
||||
/**
|
||||
* Used to terminate a list of CEs
|
||||
*/
|
||||
public static final int TERMINATOR = 0xFFFFFFFF; // CE that marks end of string
|
||||
|
||||
/**
|
||||
* Any unsupported characters (those not in the UCA data tables)
|
||||
* are marked with a exception bit combination
|
||||
* so that they can be treated specially.<br>
|
||||
* There are at least 34 values, so that we can use a range for surrogates
|
||||
* However, we do add to the first weight if we have surrogate pairs!
|
||||
*/
|
||||
static final int UNSUPPORTED_CJK_BASE = 0xFB40;
|
||||
static final int UNSUPPORTED_CJK_AB_BASE = 0xFB80;
|
||||
static final int UNSUPPORTED_OTHER_BASE = 0xFBC0;
|
||||
|
||||
static final int UNSUPPORTED_BASE = UNSUPPORTED_CJK_BASE;
|
||||
static final int UNSUPPORTED_LIMIT = UNSUPPORTED_OTHER_BASE + 0x40;
|
||||
|
||||
|
||||
/**
|
||||
* Special char value that means failed or terminated
|
||||
*/
|
||||
static final char NOT_A_CHAR = '\uFFFF';
|
||||
|
||||
/**
|
||||
* CEType
|
||||
*/
|
||||
static final byte NORMAL_CE = 0, CONTRACTING_CE = 1, EXPANDING_CE = 2,
|
||||
CJK_CE = 3, CJK_AB_CE = 4, HANGUL_CE = 5, UNSUPPORTED_CE = 7,
|
||||
FIXED_CE = 3;
|
||||
// SURROGATE_CE = 6,
|
||||
|
||||
}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,62 +0,0 @@
|
||||
<html>
|
||||
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
|
||||
<meta name="ProgId" content="FrontPage.Editor.Document">
|
||||
<title>Chart Instructions</title>
|
||||
<style>
|
||||
|
||||
<!--
|
||||
|
||||
th { background-color: #eeeeee }
|
||||
-->
|
||||
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<h1>Instructions</h1>
|
||||
<p>The Case Charts provide charts of the characters in Unicode that differ from
|
||||
at least one of their case forms (lower, title, upper, or fold).</p>
|
||||
<blockquote>
|
||||
<p><i>To properly view these charts, your browser should be reasonably recent
|
||||
so it handles Unicode and cascading style sheets, and you should install a
|
||||
Unicode font and configure your browser to use it.</i></p>
|
||||
</blockquote>
|
||||
<p><b>Notes:</b></p>
|
||||
<ul>
|
||||
<li>The index pages are ordered by the following:
|
||||
<ul>
|
||||
<li>By script, unless the script is COMMON or INHERITED</li>
|
||||
<li>By general category, in the latter two cases</li>
|
||||
<li>If characters have a decomposition containing a cased character, but
|
||||
do not have a case mapping (lower, title, upper, or fold), then they are
|
||||
listed in NoCaseMapping.</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>Within each chart page, the code points are sorted by lowercased <a href="http://www.unicode.org/unicode/reports/tr15/" target="_top">NFKD</a>,
|
||||
to place related characters next to one another.</li>
|
||||
<li>To help pick out cells visually, the more interesting ones have a light
|
||||
blue background. The other cells have grayed-out text.
|
||||
<ul>
|
||||
<li>The more interesting ones are:
|
||||
<ul>
|
||||
<li><i>lower: </i>if different than the character</li>
|
||||
<li><i>title: </i>if different than upper</li>
|
||||
<li><i>upper: </i>if different than the character</li>
|
||||
<li><i>fold: </i>if different than lower</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>If your browser supports tooltips, then hovering your mouse over cells
|
||||
will show the names of the characters.</li>
|
||||
<li>For more information, see <a href="http://www.unicode.org/unicode/reports/tr21/" target="_top">UAX
|
||||
#21: Case Mappings</a>.</li>
|
||||
</ul>
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
@ -1,35 +0,0 @@
|
||||
<!doctype HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"><html>
|
||||
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
<meta http-equiv="Content-Language" content="en-us">
|
||||
<meta name="keywords" content="Basic">
|
||||
<title>Case Chart</title>
|
||||
<style><!--
|
||||
p { font-size: 90% }
|
||||
--></style>
|
||||
<base target="main">
|
||||
<link rel="stylesheet" type="text/css"
|
||||
href="http://www.unicode.org/webscripts/standard_styles.css">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<table width="100%" cellpadding="0" cellspacing="0" border="0">
|
||||
<tr>
|
||||
<td colspan="2">
|
||||
<table width="100%" border="0" cellpadding="0" cellspacing="0">
|
||||
<tr>
|
||||
<td class="icon"><a href="http://www.unicode.org/"><img border="0"
|
||||
src="http://www.unicode.org/webscripts/logo60s2.gif" align="middle"
|
||||
alt="[Unicode]" width="34" height="33"></a> <a class="bar"
|
||||
href="http://www.unicode.org/unicode/faq/"><font size="3">Charts</font></a>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
<div class="body">
|
||||
<!-- BEGIN CONTENTS -->
|
||||
<h2 align="center">Case Chart</h2>
|
||||
<p align="center"><a href="help.html">Help</a>
|
@ -1,21 +0,0 @@
|
||||
td { border: 1px solid #0000FF; color: #000000; background-color: #FFFFFF;
|
||||
font-size: 120%; text-align: Center; vertical-align: top; width: 48px }
|
||||
td.p { color: #000000; background-color: #7777FF }
|
||||
td.s { color: #000000; background-color: #BBBBFF }
|
||||
td.t { color: #000000; background-color: #DDDDFF }
|
||||
td.q { color: #000000; background-color: #FFFFFF }
|
||||
td.ep { color: #000000; background-color: #FF5555 }
|
||||
td.es { color: #000000; background-color: #FF7777 }
|
||||
td.et { color: #000000; background-color: #FF9999 }
|
||||
td.eq { color: #000000; background-color: #FFBBBB }
|
||||
th { vertical-align: top; font-weight: bold }
|
||||
th.x { vertical-align: top; font-weight: normal; text-align: Left }
|
||||
tt { font-size: 50% }
|
||||
|
||||
td.name { text-align: left; vertical-align: middle; width: 96% }
|
||||
body { background-color: #FFFFFF; }
|
||||
|
||||
td.g { font-size: 120%; text-align: Center; width: 72px; color: #808080; }
|
||||
td.n { font-size: 120%; text-align: Center; width: 72px; color: #000000; background-color: #CCCCFF; }
|
||||
td.z { font-size: 120%; text-align: Center; width: 72px; font-weight: bold; background-color: #EEEEEE; }
|
||||
td.h { font-size: 120%; text-align: Left; color: #000000; background-color: #EEEEEE; }
|
@ -1,125 +0,0 @@
|
||||
<html>
|
||||
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
<meta http-equiv="Content-Language" content="en-us">
|
||||
<link rel="stylesheet" href="charts.css" type="text/css">
|
||||
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
|
||||
<meta name="ProgId" content="FrontPage.Editor.Document">
|
||||
<title>UCA Chart Help</title>
|
||||
<base target="main">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<h2 align="center">UCA Chart Help</h2>
|
||||
<p>This set of charts shows the Unicode Collation Algorithm values for Unicode
|
||||
characters. The characters are arranged in the following groups:</p>
|
||||
<table cellspacing="0" cellpadding="4">
|
||||
<tr>
|
||||
<th align="left"><i>Null</i></th>
|
||||
<th class="x">Completely ignorable (primary, secondary and tertiary levels)<br>
|
||||
These include control codes and various formatting codes.</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th align="left"><i>Ignorable</i></th>
|
||||
<th class="x">Ignorable at a primary level, but not at a secondary or
|
||||
tertiary level.<br>
|
||||
These include most accents and diacritics.</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th align="left"><i>Variable</i></th>
|
||||
<th class="x">Characters that may be set to ignorable by a programmatic
|
||||
switch.<br>
|
||||
These include spaces, punctuation marks, and most symbols.</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th align="left"><i>Common</i></th>
|
||||
<th class="x">Characters that are none of the above, but not considered
|
||||
letters.<br>
|
||||
These include numbers, currency symbols, etc.</th>
|
||||
<tr>
|
||||
<th align="left"><i>Letters</i></th>
|
||||
<th class="x">According to script</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th align="left"><i>Unsupported</i></th>
|
||||
<th class="x">Not explicitly supported in this version of UCA; uses
|
||||
code-point order</th>
|
||||
</tr>
|
||||
</table>
|
||||
<p>The characters* within each group are arranged in cells. The color of the
|
||||
cell indicates the strength of the difference between that character and the <i>previous</i>
|
||||
character in the chart, as follows.</p>
|
||||
<table cellspacing="0" cellpadding="4">
|
||||
<tr>
|
||||
<th colspan="2"><font size="3"><u>No Expansion</u></font>
|
||||
<th rowspan="5">
|
||||
<th colspan="2"><font size="3"><u>Expansion</u></font>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="p">a<br>
|
||||
<tt>0061</tt></td>
|
||||
<th class="x">Primary difference
|
||||
<td class="ep">dz<br>
|
||||
<tt>01F3</tt></td>
|
||||
<th class="x">Primary difference</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="s">á<br>
|
||||
<tt>00E1</tt></td>
|
||||
<th class="x">Secondary Difference</th>
|
||||
<td class="es">DZ<br>
|
||||
<tt>01F1</tt></td>
|
||||
<th class="x">Secondary Difference</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="t">A<br>
|
||||
<tt>0041</tt></td>
|
||||
<th class="x">Tertiary difference</th>
|
||||
<td class="et">Dz<br>
|
||||
<tt>01F2</tt></td>
|
||||
<th class="x">Tertiary difference</th>
|
||||
<tr>
|
||||
<td class="q">Å<br>
|
||||
<tt>212B</tt></td>
|
||||
<th class="x">Quaternary difference<br>
|
||||
or no difference</th>
|
||||
<td class="eq"> </td>
|
||||
<th class="x">Quaternary difference<br>
|
||||
or no difference</th>
|
||||
</tr>
|
||||
</table>
|
||||
<blockquote>
|
||||
<p align="left"><b>Note: </b>If tool-tips are enabled in your browser, then if
|
||||
you pause the mouse over any cell, you will see the name of the character and
|
||||
a representation of the sort key. In this representation, the separators
|
||||
between the weight levels are represented with "|".</p>
|
||||
</blockquote>
|
||||
<table>
|
||||
<tr>
|
||||
<th>*</th>
|
||||
<th class="x">In some cases, the UCA data table also includes contractions.<br>
|
||||
They can be recognized by the multiple code point numbers, as in the
|
||||
following:</th>
|
||||
<td class="p">ஔ<br>
|
||||
<tt>0B92 0BD7</tt></td>
|
||||
</tr>
|
||||
</table>
|
||||
<h3><b>Notes</b></h3>
|
||||
<ul>
|
||||
<li>The UCA results are versioned <i>both</i> by the version of the UCA <i>and</i>
|
||||
by the version of The Unicode Standard used to process the data.</li>
|
||||
<li>These charts only provide one of the alternatives for handling variable
|
||||
characters (punctuation), whereby these characters are <b>non-ignorable.</b></li>
|
||||
<li>Characters from large blocks, such as CJK-Ideographs, Hangul Syllables,
|
||||
Private Use Area, etc. are represented by a sampling.</li>
|
||||
<li>Some unassigned code points, noncharacters and other edge cases are also
|
||||
added to the list for comparison.</li>
|
||||
<li>For more information, see <a href="http://www.unicode.org/unicode/reports/tr10/" target="_top">UTS
|
||||
#10: Unicode Collation Algorithm</a>.</li>
|
||||
</ul>
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
@ -1,21 +0,0 @@
|
||||
<html>
|
||||
|
||||
<head>
|
||||
<title>%%%</title>
|
||||
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
|
||||
<meta name="ProgId" content="FrontPage.Editor.Document">
|
||||
</head>
|
||||
|
||||
<frameset cols="192,*">
|
||||
<frame name="header" src="index_list.html" target="main" scrolling="auto">
|
||||
<frame name="main" src="help.html" target="main" scrolling="auto">
|
||||
<noframes>
|
||||
<body>
|
||||
|
||||
<p>This page uses frames, but your browser doesn't support them.</p>
|
||||
|
||||
</body>
|
||||
</noframes>
|
||||
</frameset>
|
||||
|
||||
</html>
|
@ -1,37 +0,0 @@
|
||||
<!doctype HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"><html>
|
||||
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
<meta http-equiv="Content-Language" content="en-us">
|
||||
<meta name="keywords" content="Basic">
|
||||
<title>%%%</title>
|
||||
<style><!--
|
||||
p { font-size: 90%; text-align: Center }
|
||||
--></style>
|
||||
<link rel="stylesheet" type="text/css"
|
||||
href="http://www.unicode.org/webscripts/standard_styles.css">
|
||||
<base target='main'>
|
||||
</head>
|
||||
|
||||
<body class="navColTable">
|
||||
<table width="120%" cellpadding="0" cellspacing="0" border="0">
|
||||
<tr>
|
||||
<td colspan="2">
|
||||
<table width="100%" border="0" cellpadding="0" cellspacing="0">
|
||||
<tr>
|
||||
<td class="icon"><a href="http://www.unicode.org/" target='_top'><img border="0"
|
||||
src="http://www.unicode.org/webscripts/logo60s2.gif" align="middle"
|
||||
alt="[Unicode]" width="34" height="33"></a> <a class="bar"
|
||||
href="http://www.unicode.org/charts/" target='_top'><font size="3">Charts</font></a>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="2" class="gray"> </td>
|
||||
</tr>
|
||||
</table>
|
||||
<div class="body">
|
||||
<!-- BEGIN CONTENTS -->
|
||||
<h2 align="center">%%%</h2>
|
||||
<p><a href="help.html">Help</a>
|
@ -1,55 +0,0 @@
|
||||
<html>
|
||||
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
|
||||
<meta name="ProgId" content="FrontPage.Editor.Document">
|
||||
<meta name="COPYRIGHT" content=
|
||||
"Copyright (c) 2002-2006 IBM Corporation and others. All Rights Reserved.">
|
||||
<title>Chart Instructions</title>
|
||||
<style>
|
||||
|
||||
<!--
|
||||
|
||||
th { background-color: #eeeeee }
|
||||
-->
|
||||
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<h1>Instructions</h1>
|
||||
<p>The Name charts provide an index to Unicode names. Each word in each Unicode
|
||||
character name is extracted, and used as an index for the characters. </p>
|
||||
<blockquote>
|
||||
<p><i>To properly view these charts, your browser should be reasonably recent
|
||||
so it handles Unicode and cascading style sheets, and you should install a
|
||||
Unicode font and configure your browser to use it.</i></p>
|
||||
</blockquote>
|
||||
<p><b>Notes:</b></p>
|
||||
<ul>
|
||||
<li>To keep the charts from becoming too large, a 'stop-list' of words is
|
||||
omitted. These are:
|
||||
<ul>
|
||||
<li>AND, CAPITAL, CHARACTER, COMPATIBILITY, LETTER, SMALL, WITH</li>
|
||||
<li>All script names</li>
|
||||
<li>All words containing a digit</li>
|
||||
<li>All Hangul Syllables</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>Unlike some of the other charts, tool-tips to reveal the names are not
|
||||
included (for compactness). However, if you want to know the name of any
|
||||
particular characters:
|
||||
<ul>
|
||||
<li>Copy the character from the cell.</li>
|
||||
<li>Go to <a href="http://demo.icu-project.org/icu-bin/translit">http://demo.icu-project.org/icu-bin/translit</a></li>
|
||||
<li>Paste in under <b>Input 1</b></li>
|
||||
<li>Select <b>Output 1</b>: Any - Name</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
@ -1,61 +0,0 @@
|
||||
<html>
|
||||
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
|
||||
<meta name="ProgId" content="FrontPage.Editor.Document">
|
||||
<title>Chart Instructions</title>
|
||||
<style>
|
||||
|
||||
<!--
|
||||
|
||||
th { background-color: #eeeeee }
|
||||
-->
|
||||
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<h1>Instructions</h1>
|
||||
<p>The Normalization Charts provide charts of the characters in Unicode that
|
||||
differ from at least one of their normalization forms (C, D, KC, KD).</p>
|
||||
<blockquote>
|
||||
<p><i>To properly view these charts, your browser should be reasonably recent
|
||||
so it handles Unicode and cascading style sheets, and you should install a
|
||||
Unicode font and configure your browser to use it.</i></p>
|
||||
</blockquote>
|
||||
<p><b>Notes:</b></p>
|
||||
<ul>
|
||||
<li>The index pages are ordered by the following:
|
||||
<ul>
|
||||
<li>By script, unless the script is COMMON or INHERITED</li>
|
||||
<li>By general category, in the latter two cases</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>Within each chart page, the code points are sorted by folded <a href="http://www.unicode.org/unicode/reports/tr15/" target="_top">NFKD</a>,
|
||||
to place related characters next to one another.</li>
|
||||
<li>To keep the size of the Hangul chart manageable, characters U+AD00..U+D6FF
|
||||
(관..훿) are omitted.</li>
|
||||
<li>To help pick out cells visually, the more interesting ones have a light
|
||||
blue background. The other cells have grayed-out text.
|
||||
<ul>
|
||||
<li>The more interesting ones are:
|
||||
<ul>
|
||||
<li><i>C: </i>if different than the character</li>
|
||||
<li><i>D: </i>if different than C</li>
|
||||
<li><i>KC: </i>if different than C</li>
|
||||
<li><i>KD: </i>if different than KC and D</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>If your browser supports tool-tips, then hovering your mouse over cells
|
||||
will show the names of the characters.</li>
|
||||
<li>For more information, see <a href="http://www.unicode.org/unicode/reports/tr15/" target="_top">UAX
|
||||
#15: Unicode Normalization Forms</a>.</li>
|
||||
</ul>
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
@ -1,35 +0,0 @@
|
||||
<!doctype HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"><html>
|
||||
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
<meta http-equiv="Content-Language" content="en-us">
|
||||
<meta name="keywords" content="Basic">
|
||||
<title>Normalization Chart</title>
|
||||
<style><!--
|
||||
p { font-size: 90% }
|
||||
--></style>
|
||||
<base target="main">
|
||||
<link rel="stylesheet" type="text/css"
|
||||
href="http://www.unicode.org/webscripts/standard_styles.css">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<table width="100%" cellpadding="0" cellspacing="0" border="0">
|
||||
<tr>
|
||||
<td colspan="2">
|
||||
<table width="100%" border="0" cellpadding="0" cellspacing="0">
|
||||
<tr>
|
||||
<td class="icon"><a href="http://www.unicode.org/"><img border="0"
|
||||
src="http://www.unicode.org/webscripts/logo60s2.gif" align="middle"
|
||||
alt="[Unicode]" width="34" height="33"></a> <a class="bar"
|
||||
href="http://www.unicode.org/unicode/faq/"><font size="3">Charts</font></a>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
<div class="body">
|
||||
<!-- BEGIN CONTENTS -->
|
||||
<h2 align="center">Normalization Chart</h2>
|
||||
<p align="center"><a href="help.html">Help</a>
|
@ -1,31 +0,0 @@
|
||||
<html>
|
||||
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
|
||||
<meta name="ProgId" content="FrontPage.Editor.Document">
|
||||
<title>Chart Instructions</title>
|
||||
<style>
|
||||
|
||||
<!--
|
||||
|
||||
th { background-color: #eeeeee }
|
||||
-->
|
||||
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<h1>Instructions</h1>
|
||||
<p>The Script charts provide an index to Unicode characters by script.</p>
|
||||
<blockquote>
|
||||
<p><i>To properly view these charts, your browser should be reasonably recent
|
||||
so it handles Unicode and cascading style sheets, and you should install a
|
||||
Unicode font and configure your browser to use it.</i></p>
|
||||
</blockquote>
|
||||
<p>Where the script = Common, the General Category is used in the index instead.</p>
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
@ -1,35 +0,0 @@
|
||||
<!doctype HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"><html>
|
||||
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
<meta http-equiv="Content-Language" content="en-us">
|
||||
<meta name="keywords" content="Basic">
|
||||
<title>Script Chart</title>
|
||||
<style><!--
|
||||
p { font-size: 90% }
|
||||
--></style>
|
||||
<base target="main">
|
||||
<link rel="stylesheet" type="text/css"
|
||||
href="http://www.unicode.org/webscripts/standard_styles.css">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<table width="100%" cellpadding="0" cellspacing="0" border="0">
|
||||
<tr>
|
||||
<td colspan="2">
|
||||
<table width="100%" border="0" cellpadding="0" cellspacing="0">
|
||||
<tr>
|
||||
<td class="icon"><a href="http://www.unicode.org/"><img border="0"
|
||||
src="http://www.unicode.org/webscripts/logo60s2.gif" align="middle"
|
||||
alt="[Unicode]" width="34" height="33"></a> <a class="bar"
|
||||
href="http://www.unicode.org/unicode/faq/"><font size="3">Charts</font></a>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
<div class="body">
|
||||
<!-- BEGIN CONTENTS -->
|
||||
<h2 align="center">Script Chart</h2>
|
||||
<p align="center"><a href="help.html">Help</a>
|
@ -1,6 +0,0 @@
|
||||
#
|
||||
# Note: The casing of block names is not normative.
|
||||
# For example, "Basic Latin" and "BASIC LATIN" are equivalent.
|
||||
#
|
||||
# Format:
|
||||
# Start Code..End Code; Block Name
|
@ -1,657 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/BuildNames.java,v $
|
||||
* $Date: 2004/03/11 19:03:18 $
|
||||
* $Revision: 1.9 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.IOException;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
||||
//import com.ibm.text.unicode.UInfo;
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
//import java.text.*;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
|
||||
public class BuildNames implements UCD_Types {
|
||||
|
||||
static final boolean DEBUG = false;
|
||||
|
||||
/**
 * Command-line entry point: runs {@code collectWords()}. The arguments are not read.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if {@code collectWords()} fails with an I/O error
 */
public static void main(String[] args) throws IOException {
|
||||
collectWords();
|
||||
}
|
||||
|
||||
// Frequency of each single word (String -> Count); filled by stash().
static Map words = new TreeMap(new LengthFirstComparator());
|
||||
// Frequency of each pair of adjacent words, keyed as "previous/word".
static Map doubleWords = new TreeMap(new LengthFirstComparator());
|
||||
// Frequency of each run of three adjacent words, joined with "/".
static Map tripleWords = new TreeMap(new LengthFirstComparator());
|
||||
// Frequency of each run of four adjacent words, joined with "/".
static Map quadWords = new TreeMap(new LengthFirstComparator());
|
||||
// Every distinct short name seen by collectWords().
// NOTE(review): ordering comes from LengthFirstComparator, which is defined elsewhere -- presumably shorter strings first; confirm.
static Set lines = new TreeSet(new LengthFirstComparator());
|
||||
// Per-character usage counts, indexed directly by char value (so only chars 0..127 fit).
static int[] letters = new int[128];
|
||||
|
||||
/** Mutable integer box used as the value type of the word-frequency maps. */
static class Count {
|
||||
Count(int count) {this.count = count;}
|
||||
// Current tally; incremented in place by addWord().
int count;
|
||||
}
|
||||
|
||||
// Sliding window over the three most recently stashed words; stash() shifts it on every call
// and uses it to build the two-, three- and four-word keys.
static String lastWord = "";
|
||||
static String preLastWord = "";
|
||||
static String prePreLastWord = "";
|
||||
|
||||
static void addWord(String word, Map words) {
|
||||
Count count = (Count) words.get(word);
|
||||
if (count == null) {
|
||||
count = new Count(0);
|
||||
words.put(word, count);
|
||||
}
|
||||
count.count++;
|
||||
}
|
||||
|
||||
static void stash(String word, int position) {
|
||||
addWord(word, words);
|
||||
|
||||
// doubles
|
||||
|
||||
if (position > 0) {
|
||||
addWord(lastWord + "/" + word, doubleWords);
|
||||
}
|
||||
|
||||
if (position > 1) {
|
||||
addWord(preLastWord + "/" + lastWord + "/" + word, tripleWords);
|
||||
}
|
||||
|
||||
if (position > 2) {
|
||||
addWord(prePreLastWord + "/" + preLastWord + "/" + lastWord + "/" + word, quadWords);
|
||||
}
|
||||
|
||||
prePreLastWord = preLastWord;
|
||||
preLastWord = lastWord;
|
||||
lastWord = word;
|
||||
|
||||
for (int i = 0; i < word.length(); ++i) {
|
||||
letters[word.charAt(i)]++;
|
||||
}
|
||||
}
|
||||
|
||||
static String transform(String line) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
boolean changed = false;
|
||||
for (int i = 0; i < line.length(); ++i) {
|
||||
char c = line.charAt(i);
|
||||
|
||||
if (c == '-' || c == '<' || c == '>') {
|
||||
if (result.length() > 0 && result.charAt(result.length()-1) != ' ') result.append(' ');
|
||||
result.append(c);
|
||||
if (i + 1 < line.length() && line.charAt(i+1) != ' ') result.append(' ');
|
||||
changed = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ('a' <= c && c <= 'z') {
|
||||
result.append((char)(c - 'a' + 'A'));
|
||||
changed = true;
|
||||
continue;
|
||||
}
|
||||
if ('0' <= c && c <= '9') {
|
||||
result.append('*').append((char)(c - '0' + 'A'));
|
||||
changed = true;
|
||||
continue;
|
||||
}
|
||||
result.append(c);
|
||||
}
|
||||
if (!changed) return line;
|
||||
return result.toString().trim();
|
||||
}
|
||||
|
||||
static void printWords(Map words) {
|
||||
System.out.println();
|
||||
System.out.println("Finding largest");
|
||||
System.out.println();
|
||||
|
||||
Map biggest = new TreeMap();
|
||||
Iterator it = words.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
String word = (String) it.next();
|
||||
Count count = (Count) words.get(word);
|
||||
biggest.put(new Integer(-count.count * word.length()), word); // make it negative just to reverse the sort
|
||||
}
|
||||
|
||||
it = biggest.keySet().iterator();
|
||||
int counter = 0;
|
||||
while (it.hasNext()) {
|
||||
if (counter++ > 50) break;
|
||||
Integer key = (Integer) it.next();
|
||||
String word = (String) biggest.get(key);
|
||||
System.out.println(word + ":\t" + (-key.intValue()));
|
||||
}
|
||||
}
|
||||
|
||||
static void collectWords() throws IOException {
|
||||
|
||||
String fname = "ShortNames.txt";
|
||||
System.out.println("Writing " + fname);
|
||||
PrintWriter log = Utility.openPrintWriter(fname, Utility.LATIN1_WINDOWS);
|
||||
|
||||
System.out.println("Gathering data");
|
||||
//Counter counter = new Counter();
|
||||
String[] parts = new String[100];
|
||||
//int total = 0;
|
||||
int used = 0;
|
||||
int sum = 0;
|
||||
int longSum = 0;
|
||||
|
||||
for (int cp = 0; cp < 0x10FFFF; ++cp) {
|
||||
if (!Default.ucd().isAllocated(cp)) continue;
|
||||
if (Default.ucd().hasComputableName(cp)) continue;
|
||||
Utility.dot(cp);
|
||||
String name;
|
||||
|
||||
if (Default.ucd().isRepresented(cp)) {
|
||||
name = Default.ucd().getName(cp, SHORT);
|
||||
log.println(Utility.hex(cp) + " " + name);
|
||||
String backName = Utility.replace(name, UCD_Names.NAME_ABBREVIATIONS, false);
|
||||
if (!name.equals(backName)) {
|
||||
System.out.println("Failed to recreate: " + name + ", " + backName);
|
||||
}
|
||||
}
|
||||
|
||||
// check the string, and its decomposition. This is just to get a good count.
|
||||
|
||||
String str = UTF16.valueOf(cp);
|
||||
if (false && !Default.nfkd().isNormalized(cp)) {
|
||||
str += Default.nfkd().normalize(cp);
|
||||
}
|
||||
|
||||
int cp2;
|
||||
for (int i = 0; i < str.length(); i += UTF16.getCharCount(cp2)) {
|
||||
cp2 = UTF16.charAt(str, i);
|
||||
name = Default.ucd().getName(cp2, SHORT);
|
||||
if (name == null) continue;
|
||||
//name = transform(name);
|
||||
|
||||
sum += name.length();
|
||||
longSum += Default.ucd().getName(cp2).length();
|
||||
used++;
|
||||
|
||||
// replace numbers & letters
|
||||
|
||||
int len = Utility.split(name, ' ', parts);
|
||||
for (int j = 0; j < len; ++j) {
|
||||
stash(parts[j], j);
|
||||
}
|
||||
|
||||
lines.add(name);
|
||||
}
|
||||
}
|
||||
log.close();
|
||||
Utility.fixDot();
|
||||
//System.out.println("Overhead: " + (lastLink - used) + ", " + ((lastLink - used) * 100 / used) + "%");
|
||||
//System.out.println("Strings: " + sum + ", " + (lastLink*4));
|
||||
System.out.println("Short Names sum: " + sum + ", average: " + (sum + 0.0)/used);
|
||||
System.out.println("Long Names sum: " + longSum + ", average: " + (longSum + 0.0)/used);
|
||||
System.out.println("Savings: " + (1 - (sum+0.0)/longSum));
|
||||
|
||||
|
||||
printWords(words);
|
||||
printWords(doubleWords);
|
||||
printWords(tripleWords);
|
||||
printWords(quadWords);
|
||||
|
||||
if (true) return;
|
||||
|
||||
System.out.println();
|
||||
System.out.println("Compacting Words");
|
||||
System.out.println();
|
||||
Iterator it = words.keySet().iterator();
|
||||
|
||||
int i = 0;
|
||||
while (it.hasNext()) {
|
||||
String s = (String) it.next();
|
||||
int test = CompactName.addWord(s);
|
||||
String round = CompactName.stringFromToken(test);
|
||||
boolean goesRound = round.equals(s);
|
||||
if (false || !goesRound) System.out.println("Compacting: '" + s + "': " + i++ + "(" + CompactName.lastToken + ")"
|
||||
+ (goesRound ? ": NO RT: '" + round + "'" : ""));
|
||||
}
|
||||
|
||||
System.out.println();
|
||||
System.out.println("Compacting Lines");
|
||||
System.out.println();
|
||||
CompactName.startLines();
|
||||
it = lines.iterator();
|
||||
i = 0;
|
||||
while (it.hasNext()) {
|
||||
String s = (String) it.next();
|
||||
if (s.equals("< BELL >")) {
|
||||
System.out.println("DEBUG");
|
||||
}
|
||||
int test = CompactName.addLine(s);
|
||||
String round = CompactName.stringFromToken(test);
|
||||
boolean goesRound = round.equals(s);
|
||||
if (false || !goesRound) System.out.println("Compacting: '" + s + "': " + i++ + "(" + CompactName.lastToken + ")"
|
||||
+ (!goesRound ? ": NO RT: '" + round + "'" : ""));
|
||||
}
|
||||
|
||||
/*System.out.println("Printing Compact Forms");
|
||||
for (int i = 0; i < CompactName.lastToken; ++i) {
|
||||
String s = CompactName.stringFromToken(i);
|
||||
System.out.println(i + ": '" + s + "'");
|
||||
}*/
|
||||
|
||||
System.out.println("Strings: " + sum
|
||||
+ ", " + (CompactName.spacedMinimum*4)
|
||||
+ ", " + (CompactName.lastToken*4)
|
||||
);
|
||||
|
||||
}
|
||||
/*
|
||||
Set stuff = new TreeSet();
|
||||
for (int i = 0; i < letters.length; ++i) {
|
||||
if (letters[i] != 0) {
|
||||
stuff.add(new Integer((letters[i] << 8) + i));
|
||||
}
|
||||
}
|
||||
|
||||
it = stuff.iterator();
|
||||
while (it.hasNext()) {
|
||||
int in = ((Integer) it.next()).intValue();
|
||||
System.out.println((char)(in & 0xFF) + ":\t" + String.valueOf(in >> 8));
|
||||
}
|
||||
int r = addString(name);
|
||||
if (!DEBUG && !rname.equals(name)) {
|
||||
System.out.println("\tNo Round Trip: '" + rname + "'");
|
||||
}
|
||||
*/
|
||||
|
||||
// Token lookup tables. stringToInt maps an already-encoded string to its link index.
static Map stringToInt = new HashMap();
static Map intToString = new HashMap();

// remap[ch] is the small token number for an allowed character, or 0 if ch is not allowed.
static final int[] remap = new int['Z'+1];
// One more than the largest character token; also the radix used to pack two tokens.
static final int maxToken;

static {
    // Tokens are handed out in this order, starting at 1:
    // space, '-', '>', '<', then A-Z, then 0-9.
    final String alphabet = " -><ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
    for (int k = 0; k < alphabet.length(); ++k) {
        remap[alphabet.charAt(k)] = k + 1;
    }
    maxToken = alphabet.length() + 1;
}

// Inverse of remap: unmap[token] is the one-character string for that token, "" for token 0.
static final String[] unmap = new String[maxToken];
static {
    unmap[0] = "";
    for (int ch = 0; ch < remap.length; ++ch) {
        final int token = remap[ch];
        if (token == 0) continue;
        unmap[token] = String.valueOf((char) ch);
    }
}
|
||||
|
||||
// Link table: each entry packs a lead token in the high 16 bits and a trail token in the low 16 bits.
static int[] links = new int[40000];
|
||||
static final int linkStart = 0;
|
||||
// Index of the next free slot in links.
static int lastLink = 0;
|
||||
// Token values above this bound (after masking off 0x8000) encode one or two literal
// characters instead of an index into links; maxToken * maxToken values are reserved for them.
static final int LITERAL_BOUND = 0x7FFF - maxToken * maxToken;
|
||||
|
||||
/**
 * Tells whether a token lies in the literal range, i.e. above LITERAL_BOUND once the
 * 0x8000 trailing-space flag is masked off. Note that -1 also tests as literal, because
 * its low 15 bits are all set.
 */
static boolean isLiteral(int i) {
|
||||
return (i & 0x7FFF) > LITERAL_BOUND;
|
||||
}
|
||||
|
||||
static String lookup(int i) {
|
||||
String result;
|
||||
boolean trailingSpace = false;
|
||||
if ((i & 0x8000) != 0) {
|
||||
i ^= 0x8000;
|
||||
trailingSpace = true;
|
||||
}
|
||||
if (i > LITERAL_BOUND) {
|
||||
i = i - LITERAL_BOUND;
|
||||
int first = i / maxToken;
|
||||
int second = i % maxToken;
|
||||
result = unmap[first] + unmap[second];
|
||||
} else {
|
||||
int value = links[i];
|
||||
int lead = value >>> 16;
|
||||
int trail = value & 0xFFFF;
|
||||
//if (DEBUG) System.out.println("lead: " + lead + ", trail: " + trail);
|
||||
result = lookup(lead) + lookup(trail);
|
||||
}
|
||||
if (trailingSpace) result += ' ';
|
||||
if (DEBUG) System.out.println("token: " + i + " => '" + result + "'");
|
||||
return result;
|
||||
}
|
||||
|
||||
static int getInt(String s) {
|
||||
if (s.length() < 3) {
|
||||
if (s.length() == 0) return 0;
|
||||
int first = s.charAt(0);
|
||||
int second = s.length() > 1 ? s.charAt(1) : 0;
|
||||
return LITERAL_BOUND + (remap[first] * maxToken + remap[second]);
|
||||
}
|
||||
Object in = stringToInt.get(s);
|
||||
if (in == null) return -1;
|
||||
return ((Integer)in).intValue();
|
||||
}
|
||||
|
||||
static int putString(String s, int lead, int trail) {
|
||||
Object in = stringToInt.get(s);
|
||||
if (in != null) throw new IllegalArgumentException();
|
||||
int value = (lead << 16) + (trail & 0xFFFF);
|
||||
int result = lastLink;
|
||||
links[lastLink++] = value;
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("'" + s + "', link[" + result + "] = lead: " + lead + ", trail: " + trail);
|
||||
String roundTrip = lookup(result);
|
||||
if (!roundTrip.equals(s)) {
|
||||
System.out.println("\t*** No Round Trip: '" + roundTrip + "'");
|
||||
}
|
||||
}
|
||||
stringToInt.put(s, new Integer(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
// s cannot have a trailing space. Must be <,>,-,SPACE,0-9,A-Z
|
||||
static int addString(String s) {
|
||||
int result = getInt(s);
|
||||
if (result != -1) return result;
|
||||
int limit = s.length() - 1;
|
||||
int bestLen = 0;
|
||||
int best_i = 0;
|
||||
int bestSpaceLen = 0;
|
||||
int bestSpace_i = 0;
|
||||
int lastSpace = -1;
|
||||
int spaceBits;
|
||||
int endOfFirst;
|
||||
|
||||
// invariant. We break after a space if there is one.
|
||||
|
||||
for (int i = 1; i < limit; ++i) {
|
||||
char c = s.charAt(i-1);
|
||||
spaceBits = 0;
|
||||
endOfFirst = i;
|
||||
if (c == ' ') {
|
||||
lastSpace = i;
|
||||
endOfFirst--;
|
||||
spaceBits = 0x8000;
|
||||
}
|
||||
|
||||
String firstPart = s.substring(0, endOfFirst);
|
||||
String lastPart = s.substring(i);
|
||||
if (firstPart.equals("<START OF ")) {
|
||||
System.out.println("HUH");
|
||||
}
|
||||
int lead = getInt(firstPart);
|
||||
int trail = getInt(lastPart);
|
||||
if (lead >= 0 && trail >= 0) { // if both match, return immediately with pair
|
||||
if (DEBUG) System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "")
|
||||
+ "' # '" + lastPart + "' MATCH BOTH");
|
||||
return putString(s, spaceBits | lead, trail);
|
||||
}
|
||||
if (!isLiteral(lead)) {
|
||||
if (i > bestLen) {
|
||||
bestLen = i;
|
||||
best_i = i;
|
||||
}
|
||||
if (i > bestSpaceLen && c == ' ') {
|
||||
bestSpaceLen = i;
|
||||
bestSpace_i = i + 1;
|
||||
}
|
||||
}
|
||||
int end_i = s.length() - i;
|
||||
if (!isLiteral(trail)) {
|
||||
if (end_i > bestLen) {
|
||||
bestLen = end_i;
|
||||
best_i = i;
|
||||
}
|
||||
if (end_i > bestSpaceLen && c == ' ') {
|
||||
bestSpaceLen = end_i;
|
||||
bestSpace_i = i + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (lastSpace >= 0) {
|
||||
bestLen = bestSpaceLen;
|
||||
best_i = bestSpace_i;
|
||||
}
|
||||
|
||||
spaceBits = 0;
|
||||
|
||||
if (bestLen > 0) { // if one matches, recurse -- and return pair
|
||||
endOfFirst = best_i;
|
||||
if (lastSpace > 0) {
|
||||
--endOfFirst;
|
||||
spaceBits = 0x8000;
|
||||
}
|
||||
String firstPart = s.substring(0, endOfFirst);
|
||||
String lastPart = s.substring(best_i);
|
||||
int lead = getInt(firstPart);
|
||||
int trail = getInt(lastPart);
|
||||
if (lead >= 0) {
|
||||
if (DEBUG) System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "")
|
||||
+ "' # '" + lastPart + "' MATCH FIRST");
|
||||
return putString(s, spaceBits | lead, addString(lastPart));
|
||||
} else {
|
||||
if (DEBUG) System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "")
|
||||
+ "' # '" + lastPart + "' MATCH SECOND");
|
||||
return putString(s, spaceBits | addString(firstPart), trail);
|
||||
}
|
||||
}
|
||||
// otherwise, we failed to find anything. Then break before the last word, if there is one
|
||||
// otherwise break in the middle (but at even value)
|
||||
|
||||
|
||||
if (lastSpace >= 0) {
|
||||
best_i = lastSpace;
|
||||
endOfFirst = lastSpace - 1;
|
||||
spaceBits = 0x8000;
|
||||
} else {
|
||||
endOfFirst = best_i = ((s.length() + 1) / 4) * 2;
|
||||
}
|
||||
String firstPart = s.substring(0, endOfFirst);
|
||||
String lastPart = s.substring(best_i);
|
||||
if (DEBUG) System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "")
|
||||
+ "' # '" + lastPart + "' FALLBACK");
|
||||
return putString(s, spaceBits | addString(firstPart), addString(lastPart));
|
||||
}
|
||||
|
||||
/*
|
||||
static int addCompression(String s) {
|
||||
Object in = stringToInt.get(s);
|
||||
if (in != null) return ((Integer) in).intValue();
|
||||
// find best match, recursively
|
||||
int bestBreak = -1;
|
||||
boolean pickFirst = false;
|
||||
for (int i = 1; i < s.length() - 1; ++i) {
|
||||
char c = s.charAt(i);
|
||||
if (c == ' ' || c == '-') {
|
||||
Object pos1 = stringToInt.get(s.substring(0,i+1));
|
||||
//Object pos23 = stringToInt.get(s..substring(i));
|
||||
|
||||
|
||||
if (pos2 >= 0 && pos3 >= 0) {
|
||||
fullToCompressed.put(value, new Integer(index + reserved));
|
||||
continue main;
|
||||
}
|
||||
if (pos2 >= 0) {
|
||||
if (k > bestBreak) {
|
||||
bestBreak = k;
|
||||
pickFirst = true;
|
||||
}
|
||||
} else if (pos3 >= 0) {
|
||||
if (value.length() - k > bestBreak) {
|
||||
bestBreak = k;
|
||||
pickFirst = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void gatherData() throws IOException {
|
||||
System.out.println("Gathering data");
|
||||
Counter counter = new Counter();
|
||||
String[] parts = new String[100];
|
||||
String[] parts2 = new String[100];
|
||||
int total = 0;
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
//if ((i & 0xFF) == 0) System.out.println(Utility.hex(i));
|
||||
if (!ucd.isRepresented(i)) continue;
|
||||
String s = ucd.getName(i);
|
||||
total += s.length();
|
||||
int len = Utility.split(s, ' ', parts);
|
||||
for (int j = 0; j < len; ++j) {
|
||||
if (parts[j].indexOf('-') >= 0) {
|
||||
// hyphen stuff
|
||||
int len2 = Utility.split(parts[j], '-', parts2);
|
||||
for (int k = 0; k < len2; ++k) {
|
||||
if (k == len2 - 1) {
|
||||
counter.add(parts2[k] + '-');
|
||||
} else {
|
||||
counter.add(parts2[k] + " ");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// normal
|
||||
counter.add(parts[j] + " ");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println("Sorting data");
|
||||
Map m = counter.extract();
|
||||
|
||||
System.out.println("Printing data");
|
||||
|
||||
PrintWriter log = new PrintWriter(
|
||||
new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
new FileOutputStream(GEN_DIR + "NameCompression.txt")),
|
||||
32*1024));
|
||||
|
||||
log.println("total: " + total);
|
||||
|
||||
Iterator it = m.keySet().iterator();
|
||||
|
||||
String mondo = "";
|
||||
int i = 0;
|
||||
int strTotal = 0;
|
||||
|
||||
int index = 0;
|
||||
Map fullToCompressed = new HashMap();
|
||||
|
||||
String mondoIndex = "";
|
||||
|
||||
main:
|
||||
while (it.hasNext()) {
|
||||
index++;
|
||||
if ((i & 255) == 0) System.out.println("#" + i);
|
||||
Counter.RWInteger key = (Counter.RWInteger) it.next();
|
||||
String value = (String)m.get(key);
|
||||
log.println(i++ + ": " + key + ": \"" + value + "\"");
|
||||
strTotal += value.length();
|
||||
|
||||
|
||||
// first 128 are the highest frequency, inc. space
|
||||
|
||||
if (index < 128 - SINGLES) {
|
||||
mondo += value;
|
||||
fullToCompressed.put(value, new String((char)(index + reserved)));
|
||||
continue;
|
||||
}
|
||||
|
||||
int pos = mondo.indexOf(value);
|
||||
if (pos >= 0) {
|
||||
// try splitting!
|
||||
|
||||
int bestBreak = -1;
|
||||
boolean pickFirst = false;
|
||||
if (value.length() > 2) for (int k = 1; k < value.length()-1; ++k) {
|
||||
int pos2 = mondo.indexOf(value.substring(0,k) + " ");
|
||||
int pos3 = mondo.indexOf(value.substring(k));
|
||||
if (pos2 >= 0 && pos3 >= 0) {
|
||||
fullToCompressed.put(value, new Integer(index + reserved));
|
||||
continue main;
|
||||
}
|
||||
if (pos2 >= 0) {
|
||||
if (k > bestBreak) {
|
||||
bestBreak = k;
|
||||
pickFirst = true;
|
||||
}
|
||||
} else if (pos3 >= 0) {
|
||||
if (value.length() - k > bestBreak) {
|
||||
bestBreak = k;
|
||||
pickFirst = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (bestBreak > 0) {
|
||||
if (pickFirst) {
|
||||
mondo += value.substring(bestBreak);
|
||||
} else {
|
||||
mondo += value.substring(0, bestBreak) + " ";
|
||||
}
|
||||
} else {
|
||||
mondo += value;
|
||||
}
|
||||
}
|
||||
|
||||
// high bit on, means 2 bytes, look in array
|
||||
}
|
||||
|
||||
log.println("strTotal: " + strTotal);
|
||||
log.println("mondo: " + mondo.length());
|
||||
|
||||
int k = 80;
|
||||
for (; k < mondo.length(); k += 80) {
|
||||
log.println(mondo.substring(k-80, k));
|
||||
}
|
||||
log.println(mondo.substring(k-80)); // last line
|
||||
|
||||
log.close();
|
||||
}
|
||||
|
||||
static int indexOf(StringBuffer target, String source) {
|
||||
int targetLen = target.length() - source.length();
|
||||
main:
|
||||
for (int i = 0; i <= targetLen; ++i) {
|
||||
for (int j = 0; j < source.length(); ++j) {
|
||||
if (target.charAt(i) != source.charAt(j)) continue main;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
static final int SINGLES = 26 + 10 + 2;
|
||||
*/
|
||||
|
||||
/*
|
||||
static String decode(int x) {
|
||||
if (x < SINGLES) {
|
||||
if (x < 26) return String.valueOf(x + 'A');
|
||||
if (x < 36) return String.valueOf(x - 26 + '0');
|
||||
if (x == 36) return "-";
|
||||
return " ";
|
||||
}
|
||||
if (x < binaryLimit) {
|
||||
x =
|
||||
*/
|
||||
}
|
@ -1,47 +0,0 @@
|
||||
#
|
||||
# Case Folding Properties
|
||||
#
|
||||
# This file is a supplement to the UnicodeData file.
|
||||
# It provides a case folding mapping generated from the Unicode Character Database.
|
||||
# If all characters are mapped according to the full mapping below, then
|
||||
# case differences (according to UnicodeData.txt and SpecialCasing.txt)
|
||||
# are eliminated.
|
||||
#
|
||||
# The data supports both implementations that require simple case foldings
|
||||
# (where string lengths don't change), and implementations that allow full case folding
|
||||
# (where string lengths may grow). Note that where they can be supported, the
|
||||
# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match.
|
||||
#
|
||||
# All code points not listed in this file map to themselves.
|
||||
#
|
||||
# NOTE: case folding does not preserve normalization formats!
|
||||
#
|
||||
# For information on case folding, see
|
||||
# UTR #21 Case Mappings, at http://www.unicode.org/unicode/reports/tr21/
|
||||
#
|
||||
# ================================================================================
|
||||
# Format
|
||||
# ================================================================================
|
||||
# The entries in this file are in the following machine-readable format:
|
||||
#
|
||||
# <code>; <status>; <mapping>; # <name>
|
||||
#
|
||||
# The status field is:
|
||||
# C: common case folding, common mappings shared by both simple and full mappings.
|
||||
# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
|
||||
# S: simple case folding, mappings to single characters where different from F.
|
||||
# T: special case for uppercase I and dotted uppercase I
|
||||
# - For non-Turkic languages, this mapping is normally not used.
|
||||
# - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
|
||||
# Note that the Turkic mappings do not maintain canonical equivalence without additional processing.
|
||||
# See the discussions of case mapping in the Unicode Standard for more information.
|
||||
#
|
||||
# Usage:
|
||||
# A. To do a simple case folding, use the mappings with status C + S.
|
||||
# B. To do a full case folding, use the mappings with status C + F.
|
||||
#
|
||||
# The mappings with status T can be used or omitted depending on the desired case-folding
|
||||
# behavior. (The default option is to exclude them.)
|
||||
#
|
||||
# =================================================================
|
||||
|
@ -1,47 +0,0 @@
|
||||
#
|
||||
# This file is used to test (1) case conversion, (2) case detection,
|
||||
# and (3) case-insensitive matching.
|
||||
# (1) is represented below by function names such as toLower(),
|
||||
# (2) is represented below by function names such as isLower().
|
||||
# (3) is represented below by the function name equalsCaseInsensitive().
|
||||
# (The actual function names will vary depending on software language and/or library.)
|
||||
#
|
||||
# The test cases also check whether canonical equivalence is preserved
|
||||
# by these functions.
|
||||
#
|
||||
# Format:
|
||||
# <src> ; <lower> ; <upper> ; <title> ; <fold> (# <comment>)?
|
||||
#
|
||||
# Test:
|
||||
#
|
||||
# A. For each line:
|
||||
# 1. Verify the following equalities:
|
||||
# lower == toLower(src)
|
||||
# upper == toUpper(src)
|
||||
# title == toTitle(src)
|
||||
# fold == toFold(src)
|
||||
# 2. Verify that all of the following are true:
|
||||
# isLower(toLower(lower))
|
||||
# isUpper(toUpper(upper))
|
||||
# isTitle(toTitle(title))
|
||||
# isFold(toFold(fold))
|
||||
# 3. Verify that all of the following are true:
|
||||
# equalsCaseInsensitive(src, lower)
|
||||
# equalsCaseInsensitive(src, upper)
|
||||
# equalsCaseInsensitive(src, title)
|
||||
# equalsCaseInsensitive(src, fold)
|
||||
#
|
||||
# B. For each code point that is NOT listed as a src:
|
||||
# 1. Verify the following equalities:
|
||||
# src == toLower(src) == toUpper(src) == toTitle(src) == toFold(src)
|
||||
# 2. Verify that all of the following are true:
|
||||
# isLower(toLower(lower))
|
||||
# isUpper(toUpper(upper))
|
||||
# isTitle(toTitle(title))
|
||||
# isFold(toFold(fold))
|
||||
# 3. Verify that all of the following are true:
|
||||
# equalsCaseInsensitive(src, lower)
|
||||
# equalsCaseInsensitive(src, upper)
|
||||
# equalsCaseInsensitive(src, title)
|
||||
# equalsCaseInsensitive(src, fold)
|
||||
#
|
@ -1,25 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Charts.java,v $
|
||||
* $Date: 2002/10/05 01:28:58 $
|
||||
* $Revision: 1.1 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import java.io.*;
|
||||
|
||||
import java.util.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
|
||||
/**
 * Placeholder class: it declares no fields, constructors, or methods.
 */
public class Charts {}
|
@ -1,351 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/CheckCollator.java,v $
|
||||
* $Date: 2002/08/09 23:56:24 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
// http://java.sun.com/j2se/1.3/docs/guide/intl/encoding.doc.html
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
import java.text.NumberFormat;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* This is a quick and dirty program to get some idea of collation performance, comparing old Java to new stuff.
|
||||
*/
|
||||
abstract public class CheckCollator {
|
||||
static final String PREFIX = "C:\\ICUInternal\\icu4c\\collation-perf-data\\TestNames_";
|
||||
static final boolean DO_RAW = false;
|
||||
|
||||
static final NumberFormat nf = NumberFormat.getInstance();
|
||||
static final NumberFormat percent = NumberFormat.getPercentInstance();
|
||||
static {
|
||||
nf.setMaximumFractionDigits(2);
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
|
||||
// later, drive off of args
|
||||
|
||||
// choices are: Asian, Chinese, Japanese, Japanese_h, Japanese_k, Korean, Latin, Russian, Thai
|
||||
//test(Locale.KOREAN, "Korean");
|
||||
test(Locale.ENGLISH, "Latin");
|
||||
test(Locale.FRENCH, "Latin");
|
||||
test(Locale.JAPANESE, "Japanese");
|
||||
}
|
||||
|
||||
public static void test(Locale loc, String name) throws IOException {
|
||||
|
||||
System.out.println();
|
||||
System.out.println("Testing " + loc.getDisplayName() + ", file: " + name);
|
||||
System.out.println();
|
||||
|
||||
// get test data
|
||||
|
||||
String fileName = PREFIX + name + ".txt";
|
||||
|
||||
FileInputStream fis = new FileInputStream(fileName);
|
||||
InputStreamReader isr = new InputStreamReader(fis, "UnicodeLittle");
|
||||
BufferedReader br = new BufferedReader(isr, 32*1024);
|
||||
|
||||
int counter = 0;
|
||||
|
||||
ArrayList list = new ArrayList();
|
||||
while (true) {
|
||||
String line = Utility.readDataLine(br);
|
||||
if (line == null) break;
|
||||
if (line.length() == 0) continue;
|
||||
Utility.dot(counter++);
|
||||
list.add(line);
|
||||
}
|
||||
System.out.println("Read " + counter + " lines in file");
|
||||
|
||||
int limit = 800; // put a limit on it to save time
|
||||
|
||||
// pump it up if there aren't very many
|
||||
while (list.size() < limit) {
|
||||
list.addAll(list);
|
||||
}
|
||||
|
||||
int size = list.size();
|
||||
|
||||
|
||||
// later, adjust these so we always get a reasonble number of tries
|
||||
|
||||
int extraIterations = 200;
|
||||
if (size > limit) size = limit;
|
||||
|
||||
String[] tests = new String [size];
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
tests[i] = (String) list.get(i);
|
||||
}
|
||||
|
||||
// get collators
|
||||
|
||||
com.ibm.icu.text.Collator newCol = com.ibm.icu.text.Collator.getInstance(loc);
|
||||
java.text.Collator oldCol = java.text.Collator.getInstance(loc);
|
||||
|
||||
|
||||
double startTime, endTime;
|
||||
double delta, oldDelta;
|
||||
String probe;
|
||||
|
||||
|
||||
// load classes at least once before starting
|
||||
|
||||
newCol.compare("a", "b");
|
||||
oldCol.compare("a", "b");
|
||||
|
||||
// ================================================
|
||||
// check sort key size
|
||||
|
||||
int stringSize = 0, newSize = 0, oldSize = 0;
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
stringSize += tests[i].length() * 2;
|
||||
byte[] newKey = newCol.getCollationKey(tests[i]).toByteArray();
|
||||
newSize += newKey.length;
|
||||
byte[] oldKey = oldCol.getCollationKey(tests[i]).toByteArray();
|
||||
oldSize += oldKey.length;
|
||||
}
|
||||
delta = stringSize/(size + 0.0);
|
||||
System.out.println("string size: " + nf.format(delta) + " bytes per key");
|
||||
System.out.println();
|
||||
|
||||
delta = oldDelta = (oldSize/(size + 0.0));
|
||||
System.out.println("old sortkey size: " + nf.format(delta) + " bytes per key ");
|
||||
delta = (newSize/(size + 0.0));
|
||||
System.out.println("new sortkey size: " + nf.format(delta) + " bytes per key " + percent.format(delta/oldDelta));
|
||||
System.out.println();
|
||||
|
||||
// ================================================
|
||||
// Sort Key: old time
|
||||
|
||||
// get overhead time
|
||||
counter = 0;
|
||||
startTime = System.currentTimeMillis();
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
for (int j = 0; j < size; ++j) {
|
||||
counter++;
|
||||
}
|
||||
}
|
||||
endTime = System.currentTimeMillis();
|
||||
double overhead = (1000*(endTime - startTime) / counter);
|
||||
System.out.println("overhead: " + nf.format((endTime - startTime) / counter) + " micros");
|
||||
|
||||
counter = 0;
|
||||
startTime = System.currentTimeMillis();
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
probe = tests[i];
|
||||
for (int k = 0; k < extraIterations; ++k) {
|
||||
oldCol.getCollationKey(probe);
|
||||
counter++;
|
||||
}
|
||||
}
|
||||
endTime = System.currentTimeMillis();
|
||||
oldDelta = delta = (1000*(endTime - startTime) / counter) - overhead;
|
||||
System.out.println("Old sort key time: " + nf.format(delta)
|
||||
+ " micros (" + counter + " iterations)");
|
||||
|
||||
// Sort Key: new time
|
||||
|
||||
counter = 0;
|
||||
startTime = System.currentTimeMillis();
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
probe = tests[i];
|
||||
for (int k = 0; k < extraIterations; ++k) {
|
||||
newCol.getCollationKey(probe);
|
||||
counter++;
|
||||
}
|
||||
}
|
||||
endTime = System.currentTimeMillis();
|
||||
delta = (1000*(endTime - startTime) / counter) - overhead;
|
||||
System.out.println("New sort key time: " + nf.format(delta)
|
||||
+ " micros (" + counter + " iterations) " + percent.format(delta/oldDelta));
|
||||
System.out.println();
|
||||
|
||||
// ================================================
|
||||
// Raw Compare
|
||||
|
||||
if (DO_RAW) {
|
||||
// get overhead time
|
||||
counter = 0;
|
||||
startTime = System.currentTimeMillis();
|
||||
int opt = 0; // to keep the compiler from optimizing out
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
probe = tests[i];
|
||||
for (int j = 0; j < size; ++j) {
|
||||
opt ^= probe.compareTo(tests[j]);
|
||||
counter++;
|
||||
}
|
||||
}
|
||||
endTime = System.currentTimeMillis();
|
||||
overhead = (1000*(endTime - startTime) / counter);
|
||||
System.out.println("overhead: " + nf.format((endTime - startTime) / counter) + " micros");
|
||||
|
||||
// Raw Compare: old time
|
||||
|
||||
counter = 0;
|
||||
startTime = System.currentTimeMillis();
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
probe = tests[i];
|
||||
for (int j = 0; j < size; ++j) {
|
||||
opt ^= oldCol.compare(probe, tests[j]);
|
||||
counter++;
|
||||
}
|
||||
}
|
||||
endTime = System.currentTimeMillis();
|
||||
oldDelta = delta = (1000*(endTime - startTime) / counter) - overhead;
|
||||
System.out.println("Old raw compare time: " + nf.format(delta)
|
||||
+ " micros (" + counter + " iterations)");
|
||||
|
||||
// Raw Compare: new time
|
||||
|
||||
counter = 0;
|
||||
startTime = System.currentTimeMillis();
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
probe = tests[i];
|
||||
for (int j = 0; j < size; ++j) {
|
||||
opt ^= newCol.compare(probe, tests[j]);
|
||||
counter++;
|
||||
}
|
||||
}
|
||||
endTime = System.currentTimeMillis();
|
||||
delta = (1000*(endTime - startTime) / counter) - overhead;
|
||||
System.out.println("New raw compare time: " + nf.format(delta)
|
||||
+ " micros (" + counter + " iterations) " + percent.format(delta/oldDelta));
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
// ================================================
|
||||
// Binary Search
|
||||
// note: I don't worry about getting the binary search precisely right, since I just want to
|
||||
// see which strings would get compared.
|
||||
|
||||
// overhead
|
||||
|
||||
int iterations = (size * extraIterations);
|
||||
startTime = System.currentTimeMillis();
|
||||
Arrays.sort(tests);
|
||||
int opt2 = 0; // keep from optimizing out
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
probe = tests[i];
|
||||
for (int k = 0; k < extraIterations; ++k) {
|
||||
opt2 ^= Arrays.binarySearch(tests, probe);
|
||||
}
|
||||
}
|
||||
endTime = System.currentTimeMillis();
|
||||
overhead = delta = (1000*(endTime - startTime) / iterations);
|
||||
System.out.println("Overhead: " + nf.format(delta)
|
||||
+ " micros (" + iterations + " iterations)");
|
||||
|
||||
// old time
|
||||
|
||||
startTime = System.currentTimeMillis();
|
||||
Arrays.sort(tests, oldCol);
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
probe = tests[i];
|
||||
for (int k = 0; k < extraIterations; ++k) {
|
||||
opt2 ^= Arrays.binarySearch(tests, probe, oldCol);
|
||||
}
|
||||
}
|
||||
endTime = System.currentTimeMillis();
|
||||
oldDelta = delta = (1000*(endTime - startTime) / iterations) - overhead;
|
||||
System.out.println("Old binary search time: " + nf.format(delta)
|
||||
+ " micros (" + iterations + " iterations)");
|
||||
|
||||
|
||||
// new time
|
||||
|
||||
Arrays.sort(tests, newCol);
|
||||
|
||||
startTime = System.currentTimeMillis();
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
probe = tests[i];
|
||||
for (int k = 0; k < extraIterations; ++k) {
|
||||
opt2 ^= Arrays.binarySearch(tests, probe, newCol);
|
||||
}
|
||||
}
|
||||
endTime = System.currentTimeMillis();
|
||||
delta = (1000*(endTime - startTime) / iterations) - overhead;
|
||||
System.out.println("New binary search time: " + nf.format(delta)
|
||||
+ " micros (" + iterations + " iterations) " + percent.format(delta/oldDelta));
|
||||
System.out.println();
|
||||
|
||||
// ================================================
|
||||
// Sort
|
||||
|
||||
String[] sortTests = (String[]) tests.clone();
|
||||
extraIterations = 5;
|
||||
iterations = (size * extraIterations);
|
||||
|
||||
// overhead
|
||||
|
||||
startTime = System.currentTimeMillis();
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
for (int k = 0; k < extraIterations; ++k) {
|
||||
System.arraycopy(tests, 0, sortTests, 0, tests.length); // copy array
|
||||
Arrays.sort(sortTests);
|
||||
}
|
||||
}
|
||||
endTime = System.currentTimeMillis();
|
||||
overhead = delta = (1000*(endTime - startTime) / iterations);
|
||||
System.out.println("overhead: " + nf.format(delta)
|
||||
+ " micros (" + iterations + " iterations)");
|
||||
|
||||
// old time
|
||||
|
||||
startTime = System.currentTimeMillis();
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
for (int k = 0; k < extraIterations; ++k) {
|
||||
System.arraycopy(tests, 0, sortTests, 0, tests.length); // copy array
|
||||
Arrays.sort(sortTests, oldCol);
|
||||
}
|
||||
}
|
||||
endTime = System.currentTimeMillis();
|
||||
oldDelta = delta = (1000*(endTime - startTime) / iterations) - overhead;
|
||||
System.out.println("Old sort time: " + nf.format(delta)
|
||||
+ " micros (" + iterations + " iterations)");
|
||||
|
||||
// new time
|
||||
|
||||
startTime = System.currentTimeMillis();
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
for (int k = 0; k < extraIterations; ++k) {
|
||||
System.arraycopy(tests, 0, sortTests, 0, tests.length); // copy array
|
||||
Arrays.sort(sortTests, newCol);
|
||||
}
|
||||
}
|
||||
endTime = System.currentTimeMillis();
|
||||
delta = (1000*(endTime - startTime) / iterations) - overhead;
|
||||
System.out.println("New sort time: " + nf.format(delta)
|
||||
+ " micros (" + iterations + " iterations) " + percent.format(delta/oldDelta));
|
||||
|
||||
}
|
||||
}
|
@ -1,327 +0,0 @@
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.UnicodeLabel;
|
||||
import com.ibm.icu.dev.test.util.UnicodeProperty;
|
||||
import com.ibm.icu.dev.test.util.ICUPropertyFactory;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.text.Collator;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import com.ibm.text.utility.Utility;
|
||||
|
||||
public class CheckICU {
|
||||
static final BagFormatter bf = new BagFormatter();
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
System.out.println("Start");
|
||||
test();
|
||||
System.out.println("End");
|
||||
}
|
||||
|
||||
static UnicodeSet itemFailures;
|
||||
static ICUPropertyFactory icuFactory;
|
||||
static ToolUnicodePropertySource toolFactory;
|
||||
|
||||
static class ReplaceLabel extends UnicodeLabel {
|
||||
UnicodeProperty p;
|
||||
ReplaceLabel(UnicodeProperty p) {
|
||||
this.p = p;
|
||||
}
|
||||
public String getValue(int codepoint, boolean isShort) {
|
||||
// TODO Auto-generated method stub
|
||||
return p.getValue(codepoint, isShort).replace('_',' ');
|
||||
}
|
||||
public int getMaxWidth(boolean v) {
|
||||
return p.getMaxWidth(v);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static void test() throws IOException {
|
||||
checkAvailable();
|
||||
if (true) return;
|
||||
checkUCD();
|
||||
itemFailures = new UnicodeSet();
|
||||
icuFactory = ICUPropertyFactory.make();
|
||||
toolFactory = ToolUnicodePropertySource.make("4.0.0");
|
||||
|
||||
String[] quickList = {
|
||||
// "Canonical_Combining_Class",
|
||||
// "Script", "Bidi_Mirroring_Glyph", "Case_Folding",
|
||||
//"Numeric_Value"
|
||||
};
|
||||
for (int i = 0; i < quickList.length; ++i) {
|
||||
testProperty(quickList[i], -1);
|
||||
}
|
||||
if (quickList.length > 0) return;
|
||||
|
||||
Collection availableTool = toolFactory.getAvailableNames();
|
||||
|
||||
Collection availableICU = icuFactory.getAvailableNames();
|
||||
System.out.println(showDifferences("Property Aliases", "ICU", availableICU, "Tool", availableTool));
|
||||
Collection common = new TreeSet(availableICU);
|
||||
common.retainAll(availableTool);
|
||||
|
||||
for (int j = UnicodeProperty.BINARY; j < UnicodeProperty.LIMIT_TYPE; ++j) {
|
||||
System.out.println();
|
||||
System.out.println(UnicodeProperty.getTypeName(j));
|
||||
Iterator it = common.iterator();
|
||||
while (it.hasNext()) {
|
||||
String prop = (String)it.next();
|
||||
testProperty(prop, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static void checkAvailable() {
|
||||
//generateFile("4.0.0", "DerivedCombiningClass");
|
||||
//generateFile("4.0.0", "DerivedCoreProperties");
|
||||
ULocale[] locales = Collator.getAvailableULocales();
|
||||
|
||||
System.out.println("Collation");
|
||||
System.out.println("Possible keyword=values pairs:");
|
||||
{
|
||||
String[] keywords = Collator.getKeywords();
|
||||
for (int i = 0; i < Collator.getKeywords().length; ++i) {
|
||||
String[] values = Collator.getKeywordValues(keywords[i]);
|
||||
for (int j = 0; j < values.length; ++j) {
|
||||
System.out.println("\t" + keywords[i] + "=" + values[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
System.out.println("Differing Collators:");
|
||||
Set testSet = new HashSet(Arrays.asList(new String[] {
|
||||
"nl", "de", "de_DE", "zh_TW"
|
||||
}));
|
||||
for (int k = 0; k < locales.length; ++k) {
|
||||
if (!testSet.contains(locales[k].toString())) continue;
|
||||
showCollationVariants(locales[k]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static void showCollationVariants(ULocale locale) {
|
||||
String[] keywords = Collator.getKeywords();
|
||||
System.out.println(locale.getDisplayName(ULocale.ENGLISH) + " [" + locale + "]");
|
||||
for (int i = 0; i < Collator.getKeywords().length; ++i) {
|
||||
ULocale base = Collator.getFunctionalEquivalent(keywords[i],
|
||||
locale
|
||||
//new ULocale(locale + "@" + keywords[i] + "=standard")
|
||||
);
|
||||
if (true) System.out.println("\"" + base + "\" == Collator.getFunctionalEquivalent(\"" + keywords[i] + "\", \"" + locale + "\");");
|
||||
String[] values = Collator.getKeywordValues(keywords[i]);
|
||||
for (int j = 0; j < Collator.getKeywordValues(keywords[i]).length; ++j) {
|
||||
ULocale other = Collator.getFunctionalEquivalent(keywords[i],
|
||||
new ULocale(locale + "@" + keywords[i] + "=" + values[j]));
|
||||
if (true) System.out.println(
|
||||
"\"" + other
|
||||
+ "\" == Collator.getFunctionalEquivalent(\"" + keywords[i]
|
||||
+ "\", new ULocale(\""
|
||||
+ locale + "@" + keywords[i] + "=" + values[j] + "\");");
|
||||
// HACK: commented line should work but doesn't
|
||||
if (!other.equals(base)) {
|
||||
//if (other.toString().indexOf("@") >= 0) {
|
||||
System.out.println("\t" + keywords[i] + "=" + values[j] + "; \t" + base + "; \t" + other);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sample code that prints out the variants that 'make a difference' for a given locale.
|
||||
* To iterate through the locales, use Collator.getVariant
|
||||
*/
|
||||
private static void showCollationVariants2(ULocale locale) {
|
||||
String[] keywords = Collator.getKeywords();
|
||||
System.out.println(locale.getDisplayName(ULocale.ENGLISH) + " [" + locale + "]");
|
||||
for (int i = 0; i < Collator.getKeywords().length; ++i) {
|
||||
ULocale base = Collator.getFunctionalEquivalent(keywords[i], locale);
|
||||
String[] values = Collator.getKeywordValues(keywords[i]);
|
||||
for (int j = 0; j < Collator.getKeywordValues(keywords[i]).length; ++j) {
|
||||
ULocale other = Collator.getFunctionalEquivalent(keywords[i],
|
||||
new ULocale(locale + "@" + keywords[i] + "=" + values[j]));
|
||||
if (!other.equals(base)) {
|
||||
System.out.println("\t" + keywords[i] + "=" + values[j] + "; \t" + base + "; \t" + other);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void checkUCD() throws IOException {
|
||||
UCD myUCD = UCD.make("4.0.0");
|
||||
Normalizer nfc = new Normalizer(Normalizer.NFC, "4.0.0");
|
||||
UnicodeSet leading = new UnicodeSet();
|
||||
UnicodeSet trailing = new UnicodeSet();
|
||||
UnicodeSet starter = new UnicodeSet();
|
||||
for (int i = 0; i <= 0x10FFFF; ++i) {
|
||||
if (myUCD.getCombiningClass(i) == 0) starter.add(i);
|
||||
if (nfc.isTrailing(i)) trailing.add(i);
|
||||
if (nfc.isLeading(i)) leading.add(i);
|
||||
}
|
||||
PrintWriter pw = bf.openUTF8Writer(UCD_Types.GEN_DIR, "Trailing.txt");
|
||||
pw.println("+Trailing+Starter");
|
||||
bf.showSetNames(pw, new UnicodeSet(trailing).retainAll(starter));
|
||||
pw.println("+Trailing-Starter");
|
||||
bf.showSetNames(pw, new UnicodeSet(trailing).removeAll(starter));
|
||||
pw.println("-Trailing-Starter");
|
||||
bf.showSetNames(pw, new UnicodeSet(trailing).complement().removeAll(starter));
|
||||
pw.println("+Trailing+Leading");
|
||||
bf.showSetNames(pw, new UnicodeSet(trailing).retainAll(leading));
|
||||
pw.println("+Trailing-Leading");
|
||||
bf.showSetNames(pw, new UnicodeSet(trailing).removeAll(leading));
|
||||
pw.close();
|
||||
}
|
||||
/*
|
||||
* int icuType;
|
||||
int toolType;
|
||||
Collection icuAliases;
|
||||
Collection toolAliases;
|
||||
String firstDiffICU;
|
||||
String firstDiffTool;
|
||||
String firstDiffCP;
|
||||
String icuProp;
|
||||
String toolProp;
|
||||
|
||||
*/
|
||||
|
||||
private static void testProperty(String prop, int typeFilter) {
|
||||
UnicodeProperty icuProp = icuFactory.getProperty(prop);
|
||||
int icuType = icuProp.getType();
|
||||
|
||||
if (typeFilter >= 0 && icuType != typeFilter) return;
|
||||
|
||||
System.out.println();
|
||||
System.out.println("Testing: " + prop);
|
||||
UnicodeProperty toolProp = toolFactory.getProperty(prop);
|
||||
|
||||
int toolType = toolProp.getType();
|
||||
if (icuType != toolType) {
|
||||
System.out.println("FAILURE Type: ICU: " + UnicodeProperty.getTypeName(icuType)
|
||||
+ "\tTool: " + UnicodeProperty.getTypeName(toolType));
|
||||
}
|
||||
|
||||
Collection icuAliases = icuProp.getNameAliases(new ArrayList());
|
||||
Collection toolAliases = toolProp.getNameAliases(new ArrayList());
|
||||
System.out.println(showDifferences("Aliases", "ICU", icuAliases, "Tool", toolAliases));
|
||||
|
||||
icuAliases = icuProp.getAvailableValues(new ArrayList());
|
||||
toolAliases = toolProp.getAvailableValues(new ArrayList());
|
||||
System.out.println(showDifferences("Value Aliases", "ICU", icuAliases, "Tool", toolAliases));
|
||||
|
||||
// TODO do property value aliases
|
||||
itemFailures.clear();
|
||||
String firstDiffICU = null, firstDiffTool = null, firstDiffCP = null;
|
||||
for (int i = 0; i <= 0x10FFFF; ++i) {
|
||||
/*if (i == 0x0237) {
|
||||
System.out.println();
|
||||
}
|
||||
*/
|
||||
String icuValue = icuProp.getValue(i);
|
||||
String toolValue = toolProp.getValue(i);
|
||||
if (!equals(icuValue, toolValue)) {
|
||||
itemFailures.add(i);
|
||||
if (firstDiffCP == null) {
|
||||
firstDiffICU = icuValue;
|
||||
firstDiffTool = toolValue;
|
||||
firstDiffCP = Utility.hex(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (itemFailures.size() != 0) {
|
||||
System.out.println("FAILURE " + itemFailures.size() + " Differences: ");
|
||||
System.out.println(itemFailures.toPattern(true));
|
||||
if (firstDiffICU != null) firstDiffICU = bf.hex.transliterate(firstDiffICU);
|
||||
if (firstDiffTool != null) firstDiffTool = bf.hex.transliterate(firstDiffTool);
|
||||
System.out.println(firstDiffCP
|
||||
+ "\tICU: <" + firstDiffICU
|
||||
+ ">\tTool: <" + firstDiffTool + ">");
|
||||
}
|
||||
System.out.println("done");
|
||||
|
||||
// do values later, and their aliases
|
||||
/*
|
||||
System.out.println("-Values");
|
||||
UnicodeSet
|
||||
System.out.println(showDifferences("ICU", availableICU, "Tool", availableTool));
|
||||
*/
|
||||
}
|
||||
|
||||
static boolean equals(Object a, Object b) {
|
||||
if (a == null) return b == null;
|
||||
return a.equals(b);
|
||||
}
|
||||
|
||||
static public String showDifferences(
|
||||
String title,
|
||||
String name1,
|
||||
Collection set1,
|
||||
String name2,
|
||||
Collection set2) {
|
||||
|
||||
Collection temp = new TreeSet(set1);
|
||||
temp.retainAll(set2);
|
||||
|
||||
if (set1.size() == temp.size()) {
|
||||
return title + ": " + name1 + " == " + name2 + ": " + bf.join(set1);
|
||||
}
|
||||
|
||||
StringBuffer result = new StringBuffer();
|
||||
result.append(title + "\tFAILURE\r\n");
|
||||
result.append("\t" + name1 + " = " + bf.join(set1) + "\r\n");
|
||||
result.append("\t" + name2 + " = " + bf.join(set2) + "\r\n");
|
||||
|
||||
// damn'd collection doesn't have a clone, so
|
||||
// we go with Set, even though that
|
||||
// may not preserve order and duplicates
|
||||
if (temp.size() != 0) {
|
||||
result.append("\t" + name2 + " & " + name1 + ":\r\n");
|
||||
result.append("\t" + bf.join(temp));
|
||||
result.append("\r\n");
|
||||
}
|
||||
|
||||
|
||||
temp.clear();
|
||||
temp.addAll(set1);
|
||||
temp.removeAll(set2);
|
||||
if (temp.size() != 0) {
|
||||
result.append("\t" + name1 + " - " + name2 + ":\r\n");
|
||||
result.append("\t" + bf.join(temp));
|
||||
result.append("\r\n");
|
||||
}
|
||||
|
||||
temp.clear();
|
||||
temp.addAll(set2);
|
||||
temp.removeAll(set1);
|
||||
if (temp.size() != 0) {
|
||||
result.append("\t" + name2 + " - " + name1 + ":\r\n");
|
||||
result.append("\t" + bf.join(temp));
|
||||
result.append("\r\n");
|
||||
}
|
||||
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -1,81 +0,0 @@
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.text.DecimalFormat;
|
||||
import com.ibm.icu.text.NumberFormat;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.text.utility.Pair;
|
||||
import com.ibm.text.utility.Utility;
|
||||
|
||||
public class ChineseFrequency {
|
||||
static final String DICT_DIR = "C:\\DATA\\dict\\";
|
||||
static NumberFormat percent = new DecimalFormat("0.000000%");
|
||||
static NumberFormat percent3 = new DecimalFormat("000.000000%");
|
||||
static NumberFormat number = new DecimalFormat("#,##0");
|
||||
|
||||
static class InverseCompareTo implements Comparator {
|
||||
public int compare(Object o1, Object o2) {
|
||||
return -((Comparable)o1).compareTo(o2);
|
||||
}
|
||||
}
|
||||
|
||||
public static void test() throws IOException{
|
||||
Set freq_char = new TreeSet(new InverseCompareTo());
|
||||
BufferedReader br = BagFormatter.openUTF8Reader(DICT_DIR, "kHYPLCDPF.txt");
|
||||
double grandTotal = 0.0;
|
||||
while (true) {
|
||||
String line = br.readLine();
|
||||
if (line == null) break;
|
||||
String[] pieces = Utility.split(line,'\t');
|
||||
int cp = Integer.parseInt(pieces[0],16);
|
||||
String[] says = Utility.split(pieces[1],',');
|
||||
long total = 0;
|
||||
for (int i = 0; i < says.length; ++i) {
|
||||
int start = says[i].indexOf('(');
|
||||
int end = says[i].indexOf(')');
|
||||
long count = Long.parseLong(says[i].substring(start+1, end));
|
||||
total += count;
|
||||
}
|
||||
grandTotal += total;
|
||||
freq_char.add(new Pair(new Long(total), new Integer(cp)));
|
||||
}
|
||||
br.close();
|
||||
PrintWriter pw = BagFormatter.openUTF8Writer(DICT_DIR,"kHYPLCDPF_frequency.txt");
|
||||
pw.write("\uFEFF");
|
||||
pw.println("No.\tPercentage\tAccummulated\tHex\tChar");
|
||||
|
||||
Iterator it = freq_char.iterator();
|
||||
int counter = 0;
|
||||
double cummulative = 0;
|
||||
double cummulativePercentage = 0;
|
||||
while (it.hasNext()) {
|
||||
Pair item = (Pair)it.next();
|
||||
Long total = (Long) item.first;
|
||||
Integer cp = (Integer) item.second;
|
||||
double current = total.longValue();
|
||||
cummulative += current;
|
||||
double percentage = current / grandTotal;
|
||||
cummulativePercentage += percentage;
|
||||
pw.println(
|
||||
++counter
|
||||
//+ "\t" + number.format(current)
|
||||
//+ "\t" + number.format(cummulative)
|
||||
+ "\t" + percent.format(percentage)
|
||||
+ "\t" + percent3.format(cummulativePercentage)
|
||||
+ "\t" + Integer.toHexString(cp.intValue()).toUpperCase()
|
||||
+ "\t" + UTF16.valueOf(cp.intValue()));
|
||||
}
|
||||
//pw.println("Grand total: " + (long)grandTotal);
|
||||
pw.close();
|
||||
}
|
||||
}
|
@ -1,106 +0,0 @@
|
||||
package com.ibm.text.UCD;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.text.utility.*;
|
||||
import java.util.*;
|
||||
|
||||
// Enumerated properties will be IntCodePointProperty.
|
||||
// The string values they return will be the property value names.
|
||||
// Binary properties are Enumerated properties. They return 0 or 1
|
||||
|
||||
abstract public class CodePointProperty {
|
||||
// styles for names and string values
|
||||
static final byte SHORT = 0, DEFAULT = 1, LONG = 2, NORMAL_LIMIT = 3;
|
||||
|
||||
// gets the property name
|
||||
abstract public String getName(byte style);
|
||||
|
||||
// value may also be numeric, etc, but this returns string equivalent.
|
||||
abstract public String getValue(int codePoint, byte style);
|
||||
|
||||
// returns true if the code point has the value
|
||||
// works with any style that getValue takes
|
||||
abstract public boolean hasValue(int codePoint, String value);
|
||||
|
||||
// returns the set of all code points with that value.
|
||||
// same effect as using hasValue one by one, but faster internal implementation
|
||||
abstract public UnicodeSet getSet(String value);
|
||||
|
||||
// returns a list of all possible values
|
||||
// logically the same as looping from 0..10FFFF with getValue and getStyleLimit,
|
||||
// and throwing out duplicates, but much faster.
|
||||
static Iterator getAllValues(byte style) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// gets top value style available for this property
|
||||
public byte getStyleLimit(byte style) {
|
||||
return NORMAL_LIMIT;
|
||||
}
|
||||
|
||||
// returns true if the value is known to be uniform over a type.
|
||||
// this is used for various optimizations, especially for Cn & Co
|
||||
public boolean isUniformOverCategory(byte generalCategory) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// subclasses
|
||||
|
||||
static abstract public class IntCodePointProperty extends CodePointProperty {
|
||||
abstract int getNumericValue(int codePoint);
|
||||
abstract int getMaxValue();
|
||||
abstract int getMinValue();
|
||||
static Iterator getAllNumericValues() {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
static abstract public class DoubleCodePointProperty extends CodePointProperty {
|
||||
abstract double getNumericValue(int codePoint);
|
||||
abstract double getMaxValue();
|
||||
abstract double getMinValue();
|
||||
static Iterator getAllNumericValues() {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
// registration and lookup
|
||||
|
||||
// register a new property
|
||||
static void register(CodePointProperty newProp) {
|
||||
//...
|
||||
}
|
||||
|
||||
// finds a registered property by name
|
||||
static CodePointProperty getInstance(String name) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// returns a list of all registered properties
|
||||
static Iterator getAllRegistered() {
|
||||
return null;
|
||||
}
|
||||
|
||||
// UnicodeSet would use these internally to handle properties. That is, when
|
||||
// it encountered ... [:name=value:] ...
|
||||
// it would do:
|
||||
// CodePointProperty x = getInstance(name);
|
||||
// if (x != null) doError(name, value);
|
||||
// UnicodeSet s = x.getSet(value);
|
||||
// and then use s.
|
||||
|
||||
// open issue: we could have a property like: contains("dot")
|
||||
// in that case, we would register "contains" as the 'base' name,
|
||||
// but allow lookup with string parameters ("dot")
|
||||
// Maybe just adding:
|
||||
|
||||
public boolean hasParameters() {
|
||||
return false;
|
||||
}
|
||||
public void setParameters(String parameters) {}
|
||||
public String getParameters() {
|
||||
return null;
|
||||
}
|
||||
|
||||
// that way we could have [[:letter:]&[:contains(dot):]]
|
||||
|
||||
}
|
@ -1,273 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/CompactName.java,v $
|
||||
* $Date: 2001/08/31 00:30:17 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
import java.text.*;
|
||||
|
||||
/**
 * Experimental tokenizer that represents strings as int tokens.
 *
 * Two kinds of token exist:
 * <ul>
 * <li><b>Literal</b> tokens have bit 0x8000 set and pack up to three
 *     characters, 5 bits each, using {@link #compactMap}.</li>
 * <li><b>Table</b> tokens are indices into {@link #tokenList}; each entry
 *     stores a (lead, trail) pair of tokens whose strings are concatenated.
 *     Entries at or above {@link #spacedMinimum} are joined with a space.</li>
 * </ul>
 * All state is static and unsynchronized.
 */
public class CompactName {

    static final boolean DEBUG = false;

    /** Bit that marks a token as a packed literal rather than a table index. */
    private static final int LITERAL_FLAG = 0x8000;

    public static void main(String[] args) throws IOException {

        int test = tokenFromString("ABZ");
        String ss = stringFromToken(test);
        System.out.println(ss);

        CompactName.addWord("ABSOLUTEISM");

        for (int i = 0; i < CompactName.lastToken; ++i) {
            String s = CompactName.stringFromToken(i);
            System.out.println(s);
        }
    }

    /** Maps a character (index below 128) to its 5-bit code; 0 means "none". */
    static final char[] compactMap = new char[128];
    /** Inverse of {@link #compactMap}: 5-bit code to character. */
    static final char[] compactUnmap = new char[128];

    static {
        // code 0 is reserved for "no character"; A-Z get 1..26,
        // then '-', '>', '<', '*' get 27..30
        char counter = 0;
        compactMap[0] = counter++;
        for (int i = 'A'; i <= 'Z'; ++i) {
            compactMap[i] = counter++;
        }
        compactMap['-'] = counter++;
        compactMap['>'] = counter++;
        compactMap['<'] = counter++;
        compactMap['*'] = counter++;

        compactUnmap[0] = 0;
        for (char i = 0; i < compactUnmap.length; ++i) {
            int x = compactMap[i];
            if (x != 0) compactUnmap[x] = i;
        }
    }

    /*
    static String expand(String s) {
        StringBuffer result = new StringBuffer();
        for (int i = 0; i < s.length(); ++i) {
            int m = s.charAt(i);
            if (m == 31 && i < s.length() + 1) {
                m = 31 + s.charAt(++i);
            }
            result.append(compactUnmap[m]);
        }
        return result.toString();
    }

    static String compact(String s) {
        StringBuffer result = new StringBuffer();
        for (int i = 0; i < s.length(); ++i) {
            int m = compactMap[s.charAt(i)];
            if (m >= 31) {
                result.append((char)31);
                m -= 31;
            }
            result.append(m);
        }
        return result.toString();
    }
    */

    /** String to table-token index (Integer). */
    static Map string_token = new HashMap();
    static Map token_string = new HashMap();

    /** Table entries: lead token in the high 16 bits, trail token in the low 16. */
    static int[] tokenList = new int[40000];
    static final int tokenStart = 0;
    /** Number of table entries in use; also the next index to be assigned. */
    static int lastToken = 0;

    /** Table tokens at or above this index are decoded with a space between parts. */
    static int spacedMinimum = Integer.MAX_VALUE;

    /**
     * Tests bit 0x8000. Note that -1 (the "not found" result of
     * {@link #tokenFromString}) also has this bit set, which the callers in
     * {@link #addWord} rely on.
     */
    static boolean isLiteral(int i) {
        return (i & LITERAL_FLAG) != 0;
    }

    /**
     * Appends a new (lead, trail) table entry and registers it for {@code s}.
     *
     * @param s     the string the new token stands for
     * @param lead  token for the first part
     * @param trail token for the second part (only the low 16 bits are stored)
     * @return the index of the new table token
     * @throws IllegalArgumentException if {@code s} already has a table token
     * @throws IllegalStateException if no further table index is available:
     *         indices must stay below 0x8000, otherwise they would be
     *         indistinguishable from literal tokens
     */
    static int addTokenForString(String s, int lead, int trail) {
        Object in = string_token.get(s);
        if (in != null) throw new IllegalArgumentException("already has a token: '" + s + "'");
        if (lastToken >= LITERAL_FLAG || lastToken >= tokenList.length) {
            throw new IllegalStateException("token table full, cannot add '" + s + "'");
        }
        int value = (lead << 16) + (trail & 0xFFFF);
        int result = lastToken;
        tokenList[lastToken++] = value;

        if (DEBUG) {
            System.out.println("'" + s + "', tokenList[" + result + "] = lead: " + lead + ", trail: " + trail);
            String roundTrip = stringFromToken(result);
            if (!roundTrip.equals(s)) {
                System.out.println("\t*** No Round Trip: '" + roundTrip + "'");
            }
        }
        string_token.put(s, new Integer(result));
        return result;
    }

    /**
     * Decodes a token back into its string.
     *
     * @param i a literal token or the index of an existing table token
     * @return the decoded string
     * @throws IllegalArgumentException if {@code i} is negative, or is a
     *         table index that has not been assigned yet
     */
    static String stringFromToken(int i) {
        if (i < 0) {
            // e.g. the -1 "not found" marker; it must not be decoded as a literal
            throw new IllegalArgumentException("bad token: " + i);
        }
        String result;
        if ((i & LITERAL_FLAG) != 0) {
            char first = compactUnmap[(i >> 10) & 0x1F];
            char second = compactUnmap[(i >> 5) & 0x1F];
            char third = compactUnmap[i & 0x1F];
            result = String.valueOf(first);
            if (second != 0) result += String.valueOf(second);
            if (third != 0) result += String.valueOf(third);
        } else if (i >= lastToken) {
            // slot lastToken itself is still unassigned; reading it would
            // decode entry 0 twice or recurse forever on an empty table
            throw new IllegalArgumentException("bad token: " + i);
        } else {
            int value = tokenList[i];
            int lead = value >>> 16;
            int trail = value & 0xFFFF;
            if (i >= spacedMinimum) result = stringFromToken(lead) + ' ' + stringFromToken(trail);
            else result = stringFromToken(lead) + stringFromToken(trail);
        }
        if (DEBUG) System.out.println("token: " + i + " => '" + result + "'");
        return result;
    }

    /**
     * Looks up the token for a string without adding anything.
     *
     * Strings of one to three characters are always packed into a literal
     * token; characters without an entry in {@link #compactMap} are packed
     * as code 0. Longer strings are looked up in {@link #string_token}.
     *
     * @param s non-empty string; for strings of at most three characters
     *          every character must be below 128
     * @return the token, or -1 if a longer string has no table token
     */
    static int tokenFromString(String s) {
        if (s.length() <= 3) {
            int first = compactMap[s.charAt(0)];
            int second = compactMap[s.length() > 1 ? s.charAt(1) : 0];
            int third = compactMap[s.length() > 2 ? s.charAt(2) : 0];
            return LITERAL_FLAG + (first << 10) + (second << 5) + third;
        }
        Object in = string_token.get(s);
        if (in == null) return -1;
        return ((Integer) in).intValue();
    }

    /**
     * Returns the token for a word, creating table entries as needed.
     *
     * Split points are tried from the right. If both halves already have
     * tokens the pair is stored at once. Otherwise the split whose known
     * table token covers the most characters is used and the other half is
     * added recursively. Failing that, the word is cut at a multiple of three
     * and both halves are added recursively.
     *
     * @param s the word
     * @return the (possibly new) token for {@code s}
     */
    static int addWord(String s) {

        int result = tokenFromString(s);
        if (result != -1) return result;
        int bestLen = 0;
        int best_i = 0;

        int limit = s.length() - 1;

        for (int i = limit; i >= 1; --i) {

            String firstPart = s.substring(0, i);
            String lastPart = s.substring(i);

            int lead = tokenFromString(firstPart);
            int trail = tokenFromString(lastPart);

            if (lead >= 0 && trail >= 0) { // if both match, return immediately with pair
                if (DEBUG) show(s, firstPart, lastPart, "MATCH BOTH");
                return addTokenForString(s, lead, trail);
            }
            // only existing table tokens count here: -1 and literals both
            // have the literal bit set and are skipped
            if (!isLiteral(lead)) {
                if (i > bestLen) {
                    bestLen = i;
                    best_i = i;
                }
            }
            if (!isLiteral(trail)) {
                int end_i = s.length() - i;
                if (end_i > bestLen) {
                    bestLen = end_i;
                    best_i = i;
                }
            }
        }
        if (bestLen > 0) { // if one matches, recurse -- and return pair
            String firstPart = s.substring(0, best_i);
            String lastPart = s.substring(best_i);
            int lead = tokenFromString(firstPart);
            int trail = tokenFromString(lastPart);
            if (lead >= 0) {
                if (DEBUG) show(s, firstPart, lastPart, "MATCH FIRST");
                return addTokenForString(s, lead, addWord(lastPart));
            } else {
                if (DEBUG) show(s, firstPart, lastPart, "MATCH SECOND");
                return addTokenForString(s, addWord(firstPart), trail);
            }
        }

        // break at multiple of 3

        best_i = ((s.length() + 1) / 6) * 3;
        String firstPart = s.substring(0, best_i);
        String lastPart = s.substring(best_i);
        if (DEBUG) show(s, firstPart, lastPart, "Fallback");
        return addTokenForString(s, addWord(firstPart), addWord(lastPart));
    }

    /** Debug helper: prints how {@code s} was split. */
    static void show(String s, String firstPart, String lastPart, String comment) {
        System.out.println((s) + " => '" + (firstPart)
            + "' # '" + (lastPart) + "' " + comment);
    }

    /**
     * Marks the switch from words to lines: every table token added from now
     * on is decoded with a space between its two parts.
     */
    static void startLines() {
        spacedMinimum = lastToken;
    }

    /**
     * Returns the token for a space-separated line, creating table entries
     * as needed. The line is only ever split at spaces, so every word longer
     * than three characters must already have a token (see {@link #addWord}).
     *
     * @param s the line
     * @return the (possibly new) token for {@code s}
     * @throws IllegalArgumentException if some part with no space in it has
     *         no token
     */
    static int addLine(String s) {

        int result = tokenFromString(s);
        if (result != -1) return result;
        int bestLen = 0;
        int best_i = 0;

        int limit = s.length() - 2;

        for (int i = limit; i >= 1; --i) {
            char c = s.charAt(i);
            if (c != ' ') continue;

            String firstPart = s.substring(0, i);
            String lastPart = s.substring(i + 1);

            int lead = tokenFromString(firstPart);
            int trail = tokenFromString(lastPart);

            if (lead >= 0 && trail >= 0) { // if both match, return immediately with pair
                if (DEBUG) show(s, firstPart, lastPart, "MATCH BOTH");
                return addTokenForString(s, lead, trail);
            }
            if (i > bestLen) {
                bestLen = i;
                best_i = i;
            }

            int end_i = s.length() - i - 1;
            if (end_i > bestLen) {
                bestLen = end_i;
                best_i = i;
            }
        }
        if (bestLen > 0) { // if one matches, recurse -- and return pair
            String firstPart = s.substring(0, best_i);
            String lastPart = s.substring(best_i + 1);
            int lead = tokenFromString(firstPart);
            int trail = tokenFromString(lastPart);
            if (lead >= 0) {
                if (DEBUG) show(s, firstPart, lastPart, "MATCH FIRST");
                return addTokenForString(s, lead, addLine(lastPart));
            } else {
                if (DEBUG) show(s, firstPart, lastPart, "MATCH SECOND");
                int leadToken = addLine(firstPart);
                // Unlike addWord, the split here may have neither half known;
                // resolve the trail too instead of storing the -1 marker.
                int trailToken = (trail >= 0) ? trail : addLine(lastPart);
                return addTokenForString(s, leadToken, trailToken);
            }
        }

        System.out.println("SHOULD HAVE MATCHED!!");
        throw new IllegalArgumentException("SHOULD HAVE MATCHED!! " + s);
    }
}
|
@ -1,387 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Compare14652.java,v $
|
||||
* $Date: 2004/02/07 01:01:16 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
// quick and dirty function for grabbing contents of ISO 14652 file
|
||||
|
||||
public class Compare14652 implements UCD_Types {
|
||||
|
||||
static final boolean oldVersion = false;
|
||||
|
||||
public static UnicodeSet getSet(int prop, byte propValue) {
|
||||
return UnifiedBinaryProperty.make(prop | propValue).getSet();
|
||||
}
|
||||
|
||||
static UnicodeSet
|
||||
titleSet = getSet(CATEGORY, Lt),
|
||||
combiningSet = getSet(CATEGORY, Mc)
|
||||
.addAll(getSet(CATEGORY, Me))
|
||||
.addAll(getSet(CATEGORY, Mn)),
|
||||
zSet = getSet(CATEGORY, Zs)
|
||||
.addAll(getSet(CATEGORY, Zl))
|
||||
.addAll(getSet(CATEGORY, Zp)),
|
||||
pSet = getSet(CATEGORY, Pd)
|
||||
.addAll(getSet(CATEGORY, Ps))
|
||||
.addAll(getSet(CATEGORY, Pe))
|
||||
.addAll(getSet(CATEGORY, Pc))
|
||||
.addAll(getSet(CATEGORY, Po))
|
||||
.addAll(getSet(CATEGORY, Pi))
|
||||
.addAll(getSet(CATEGORY, Pf)),
|
||||
sSet = getSet(CATEGORY, Sm)
|
||||
.addAll(getSet(CATEGORY, Sc))
|
||||
.addAll(getSet(CATEGORY, Sk))
|
||||
.addAll(getSet(CATEGORY, So)),
|
||||
noSet = getSet(CATEGORY, No),
|
||||
csSet = getSet(CATEGORY, Cs),
|
||||
cfSet = getSet(CATEGORY, Cf),
|
||||
cnSet = getSet(CATEGORY, Cn),
|
||||
circled = getSet(DECOMPOSITION_TYPE, COMPAT_CIRCLE),
|
||||
whitespaceSet = getSet(BINARY_PROPERTIES, White_space),
|
||||
alphaSet = getSet(DERIVED, PropAlphabetic).addAll(combiningSet),
|
||||
lowerSet = getSet(DERIVED, PropLowercase).addAll(titleSet).removeAll(circled),
|
||||
upperSet = getSet(DERIVED, PropUppercase).addAll(titleSet).removeAll(circled),
|
||||
digitSet = getSet(CATEGORY, Nd),
|
||||
xdigitSet = new UnicodeSet("[a-fA-F\uFF21-\uFF26\uFF41-\uFF46]").addAll(digitSet),
|
||||
spaceSet = whitespaceSet.size() == 0 ? zSet : whitespaceSet,
|
||||
controlSet = getSet(CATEGORY, Cc),
|
||||
punctSet = new UnicodeSet(pSet).addAll(sSet),
|
||||
graphSet = new UnicodeSet(0,0x10ffff)
|
||||
.removeAll(controlSet)
|
||||
//.removeAll(getSet(CATEGORY, Cf))
|
||||
.removeAll(csSet)
|
||||
.removeAll(cnSet)
|
||||
.removeAll(zSet),
|
||||
// Cc, Cf, Cs, Cn, Z
|
||||
blankSet = new UnicodeSet(spaceSet).removeAll(new UnicodeSet("[\\u000A-\\u000D\\u0085]"))
|
||||
.removeAll(getSet(CATEGORY, Zl))
|
||||
.removeAll(getSet(CATEGORY, Zp));
|
||||
|
||||
|
||||
static class Prop {
|
||||
String name;
|
||||
UnicodeSet contents = new UnicodeSet();
|
||||
String guess = "???";
|
||||
UnicodeSet guessContents = new UnicodeSet();
|
||||
|
||||
String wsname = whitespaceSet.size() == 0 ? "gc=Z" : "Whitespace";
|
||||
|
||||
Prop(String name) {
|
||||
this.name = name;
|
||||
if (name.equals("alpha")) {
|
||||
guess = "Alphabetic + gc=M";
|
||||
guessContents = alphaSet;
|
||||
} else if (name.equals("lower")) {
|
||||
guess = "Lowercase + gc=Lt - dt=circle";
|
||||
guessContents = lowerSet;
|
||||
} else if (name.equals("upper")) {
|
||||
guess = "Uppercase + gc=Lt - dt=circle";
|
||||
guessContents = upperSet;
|
||||
} else if (name.equals("digit")) {
|
||||
guess = "gc=Nd";
|
||||
guessContents = digitSet;
|
||||
} else if (name.equals("xdigit")) {
|
||||
guess = "gc=Nd+a..f (upper/lower,normal/fullwidth)";
|
||||
guessContents = xdigitSet;
|
||||
} else if (name.equals("space")) {
|
||||
guess = wsname;
|
||||
guessContents = spaceSet;
|
||||
//Utility.showSetNames("Whitespace", spaceSet, true, Default.ucd);
|
||||
} else if (name.equals("cntrl")) {
|
||||
guess = "gc=Cc";
|
||||
guessContents = controlSet;
|
||||
} else if (name.equals("punct")) {
|
||||
guess = "gc=P,S";
|
||||
guessContents = punctSet;
|
||||
} else if (name.equals("graph")) {
|
||||
guess = "All - gc=Cc, Cs, Cn, or Z";
|
||||
guessContents = graphSet;
|
||||
} else if (name.equals("blank")) {
|
||||
guess = wsname + " - (LF,VT,FF,CR,NEL + gc=Zl,Zp)";
|
||||
guessContents = blankSet;
|
||||
} else if (name.equals("ISO_14652_class \"combining\"")) {
|
||||
guess = "gc=M";
|
||||
guessContents = combiningSet;
|
||||
}
|
||||
|
||||
|
||||
/*upper
|
||||
lower
|
||||
alpha
|
||||
digit
|
||||
outdigit
|
||||
space
|
||||
cntrl
|
||||
punct
|
||||
graph
|
||||
xdigit
|
||||
blank
|
||||
toupper
|
||||
tolower
|
||||
*/
|
||||
}
|
||||
|
||||
void show(PrintWriter pw) {
|
||||
if (name.equals("ISO_14652_LC_CTYPE")) return;
|
||||
if (name.equals("ISO_14652_toupper")) return;
|
||||
if (name.equals("ISO_14652_tolower")) return;
|
||||
if (name.equals("ISO_14652_outdigit")) return;
|
||||
if (name.equals("ISO_14652_outdigit")) return;
|
||||
if (name.startsWith("ISO_14652_class")) return;
|
||||
|
||||
pw.println();
|
||||
pw.println("**************************************************");
|
||||
pw.println(name);
|
||||
pw.println("**************************************************");
|
||||
Utility.showSetDifferences(pw, name, contents, guess, guessContents, false, true, null, Default.ucd());
|
||||
//pw.println(props[i].contents);
|
||||
}
|
||||
}
|
||||
|
||||
static Prop[] props = new Prop[100];
|
||||
static int propCount = 0;
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
|
||||
String version = Default.ucd().getVersion();
|
||||
PrintWriter log = Utility.openPrintWriter("Diff14652_" + version + ".txt", Utility.UTF8_WINDOWS);
|
||||
try {
|
||||
log.write('\uFEFF');
|
||||
log.print("Version: " + version);
|
||||
|
||||
if (false) {
|
||||
UnicodeSet ID = getSet(DERIVED, ID_Start).addAll(getSet(DERIVED, ID_Continue_NO_Cf));
|
||||
UnicodeSet XID = getSet(DERIVED, Mod_ID_Start).addAll(getSet(DERIVED, Mod_ID_Continue_NO_Cf));
|
||||
UnicodeSet alphanumSet = new UnicodeSet(alphaSet).addAll(digitSet).addAll(getSet(CATEGORY, Pc));
|
||||
|
||||
Utility.showSetDifferences("ID", ID, "XID", XID, false, Default.ucd());
|
||||
Utility.showSetDifferences("ID", ID, "Alphabetic+Digit+Pc", alphanumSet, false, Default.ucd());
|
||||
}
|
||||
|
||||
BufferedReader br = Utility.openReadFile("C:\\DATA\\ISO14652_CTYPE.txt", Utility.LATIN1);
|
||||
while (true) {
|
||||
String line = br.readLine();
|
||||
if (line == null) break;
|
||||
line = line.trim();
|
||||
if (line.length() == 0) continue;
|
||||
if (line.charAt(line.length() - 1) == '/') {
|
||||
line = line.substring(0, line.length() - 1);
|
||||
}
|
||||
line = line.trim();
|
||||
if (line.length() == 0) continue;
|
||||
|
||||
char ch = line.charAt(0);
|
||||
if (ch == '%') continue;
|
||||
if (ch == '(') continue;
|
||||
if (ch == '<') {
|
||||
addItems(line, props[propCount-1].contents);
|
||||
} else {
|
||||
// new property
|
||||
System.out.println(line);
|
||||
if (line.equals("width")) break;
|
||||
props[propCount] = new Prop(line);
|
||||
props[propCount].name = "ISO_14652_" + line;
|
||||
props[propCount].contents = new UnicodeSet();
|
||||
propCount++;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < propCount; ++i) props[i].show(log);
|
||||
|
||||
log.println();
|
||||
log.println("**************************************************");
|
||||
log.println("Checking POSIX requirements for inclusion and disjointness.");
|
||||
log.println("**************************************************");
|
||||
log.println();
|
||||
/*
|
||||
alpha, digit, punct, cntrl are all disjoint
|
||||
space, cntrl, blank are pairwise disjoint with any of alpha, digit, xdigit
|
||||
alpha includes upper, lower
|
||||
graph includes alpha, digit, punct
|
||||
print includes graph
|
||||
xdigit includes digit
|
||||
*/
|
||||
Prop
|
||||
alpha = getProp("ISO_14652_alpha"),
|
||||
upper = getProp("ISO_14652_upper"),
|
||||
lower = getProp("ISO_14652_lower"),
|
||||
graph = getProp("ISO_14652_graph"),
|
||||
//print = getProp("ISO_14652_print"),
|
||||
punct = getProp("ISO_14652_punct"),
|
||||
digit = getProp("ISO_14652_digit"),
|
||||
xdigit = getProp("ISO_14652_xdigit"),
|
||||
space = getProp("ISO_14652_space"),
|
||||
blank = getProp("ISO_14652_blank"),
|
||||
cntrl = getProp("ISO_14652_cntrl");
|
||||
|
||||
checkDisjoint(log, new Prop[] {alpha, digit, punct, cntrl});
|
||||
|
||||
Prop [] l1 = new Prop[] {space, cntrl, blank};
|
||||
Prop [] l2 = new Prop[] {alpha, digit, xdigit};
|
||||
for (int i = 0; i < l1.length; ++i) {
|
||||
for (int j = i + 1; j < l2.length; ++j) {
|
||||
checkDisjoint(log, l1[i], l2[j]);
|
||||
}
|
||||
}
|
||||
checkIncludes(log, alpha, upper);
|
||||
checkIncludes(log, alpha, lower);
|
||||
checkIncludes(log, graph, alpha);
|
||||
checkIncludes(log, graph, digit);
|
||||
checkIncludes(log, graph, punct);
|
||||
//checkIncludes(log, print, graph);
|
||||
checkIncludes(log, xdigit, digit);
|
||||
|
||||
|
||||
// possibly alpha, digit, punct, cntrl, space cover the !(Cn,Cs)
|
||||
|
||||
UnicodeSet trRemainder = new UnicodeSet(cnSet)
|
||||
.complement()
|
||||
.removeAll(csSet)
|
||||
.removeAll(digit.contents)
|
||||
.removeAll(punct.contents)
|
||||
.removeAll(alpha.contents)
|
||||
.removeAll(cntrl.contents)
|
||||
.removeAll(space.contents);
|
||||
Utility.showSetNames(log, "TR Remainder: ", trRemainder, false, false, Default.ucd());
|
||||
|
||||
UnicodeSet propRemainder = new UnicodeSet(cnSet)
|
||||
.complement()
|
||||
.removeAll(csSet)
|
||||
//.removeAll(noSet)
|
||||
//.removeAll(cfSet)
|
||||
.removeAll(digit.guessContents)
|
||||
.removeAll(punct.guessContents)
|
||||
.removeAll(alpha.guessContents)
|
||||
.removeAll(cntrl.guessContents)
|
||||
.removeAll(space.guessContents);
|
||||
Utility.showSetNames(log, "Prop Remainder: ", propRemainder, false, false, Default.ucd());
|
||||
|
||||
/*
|
||||
checkDisjoint(new Prop[] {alpha, digit, punct, cntrl});
|
||||
UnicodeSet remainder = cnSet.complement();
|
||||
UnicodeSet guessRemainder = new UnicodeSet(remainder);
|
||||
for (int i = 0; i < list.length; ++i) {
|
||||
for (int j = i + 1; j < list.length; ++j) {
|
||||
compare(log, list[i].name, list[i].contents, list[j].name, list[j].contents);
|
||||
compare(log, list[i].guess, list[i].guessContents, list[j].guess, list[j].guessContents);
|
||||
}
|
||||
remainder.removeAll(list[i].contents);
|
||||
guessRemainder.removeAll(list[i].guessContents);
|
||||
}
|
||||
if (remainder.size() != 0) {
|
||||
log.println();
|
||||
log.println("Incomplete (TR): " + remainder);
|
||||
}
|
||||
if (guessRemainder.size() != 0) {
|
||||
log.println();
|
||||
log.println("Incomplete (Prop): " + guessRemainder);
|
||||
}
|
||||
*/
|
||||
|
||||
} finally {
|
||||
log.close();
|
||||
}
|
||||
}
|
||||
|
||||
static void checkDisjoint(PrintWriter log, Prop[] list) {
|
||||
for (int i = 0; i < list.length; ++i) {
|
||||
for (int j = i + 1; j < list.length; ++j) {
|
||||
checkDisjoint(log, list[i], list[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void checkDisjoint(PrintWriter log, Prop prop1, Prop prop2) {
|
||||
checkDisjoint(log, prop1.name, prop1.contents, prop2.name, prop2.contents);
|
||||
checkDisjoint(log, prop1.guess, prop1.guessContents, prop2.guess, prop2.guessContents);
|
||||
}
|
||||
|
||||
static void checkDisjoint(PrintWriter log, String name, UnicodeSet set, String name2, UnicodeSet set2) {
|
||||
if (set.containsSome(set2)) {
|
||||
log.println();
|
||||
log.println("Fails test: " + name + " disjoint-with " + name2);
|
||||
UnicodeSet diff = new UnicodeSet(set).retainAll(set2);
|
||||
Utility.showSetNames(log, "", diff, false, false, Default.ucd());
|
||||
}
|
||||
}
|
||||
|
||||
static void checkIncludes(PrintWriter log, Prop prop1, Prop prop2) {
|
||||
checkIncludes(log, prop1.name, prop1.contents, prop2.name, prop2.contents);
|
||||
checkIncludes(log, prop1.guess, prop1.guessContents, prop2.guess, prop2.guessContents);
|
||||
}
|
||||
|
||||
static void checkIncludes(PrintWriter log, String name, UnicodeSet set, String name2, UnicodeSet set2) {
|
||||
if (!set.containsAll(set2)) {
|
||||
log.println();
|
||||
log.println("Fails test:" + name + " includes " + name2);
|
||||
UnicodeSet diff = new UnicodeSet(set2).removeAll(set);
|
||||
Utility.showSetNames(log, "", diff, false, false, Default.ucd());
|
||||
}
|
||||
}
|
||||
|
||||
static String[] pieces = new String[100];
|
||||
|
||||
// example: <U1F48>..<U1F4D>;<U1F59>;<U1F5B>;<U1F5D>;<U1F5F>;<U1F68>..<U1F6F>;/
|
||||
static void addItems(String line, UnicodeSet contents) {
|
||||
int len = Utility.split(line, ';', pieces);
|
||||
for (int i = 0; i < len; ++i) {
|
||||
String piece = pieces[i].trim();
|
||||
if (piece.length() == 0) continue;
|
||||
if (piece.equals("<0>")) continue;
|
||||
int start, end;
|
||||
int rangePoint = piece.indexOf("..");
|
||||
if (rangePoint >= 0) {
|
||||
start = parse(piece.substring(0,rangePoint));
|
||||
end = parse(piece.substring(rangePoint+2));
|
||||
} else {
|
||||
start = end = parse(piece);
|
||||
}
|
||||
contents.add(start, end);
|
||||
}
|
||||
}
|
||||
|
||||
static int parse(String piece) {
|
||||
if (!piece.startsWith("<U") || !piece.endsWith(">")) {
|
||||
throw new IllegalArgumentException("Bogus code point: " + piece);
|
||||
}
|
||||
return Integer.parseInt(piece.substring(2,piece.length()-1), 16);
|
||||
}
|
||||
|
||||
static Prop getProp(String name) {
|
||||
//System.out.println("Searching for: " + name);
|
||||
for (int i = 0; i < propCount; ++i) {
|
||||
//System.out.println("Checking: " + props[i].name);
|
||||
if (props[i].name.equals(name)) {
|
||||
return props[i];
|
||||
}
|
||||
}
|
||||
//System.out.println("Missed");
|
||||
return null;
|
||||
}
|
||||
|
||||
// oddities:
|
||||
// extra space after ';' <U0300>..<U036F>; <U20D0>..<U20FF>; <UFE20>..<UFE2F>;/
|
||||
// <0>?? <0>;<U0BE7>..<U0BEF>;/
|
||||
// <U202C>; <U202D>;<U202E>; <UFEFF> : 0;/
|
||||
// % "print" is by default "graph", and the <space> character
|
||||
// print is odd, since it includes space but not other spaces.
|
||||
// alnum not defined.
|
||||
|
||||
}
|
@ -1,473 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/CompareProperties.java,v $
|
||||
* $Date: 2004/02/12 08:23:15 $
|
||||
* $Revision: 1.5 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
import java.text.NumberFormat;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
|
||||
public class CompareProperties implements UCD_Types {
|
||||
|
||||
static final boolean DO_DISJOINT = false;
|
||||
|
||||
static CompareProperties me = null;
|
||||
|
||||
static void partition() throws IOException {
|
||||
if (me == null) me = new CompareProperties();
|
||||
me.printPartition();
|
||||
}
|
||||
|
||||
static void statistics() throws IOException {
|
||||
UnicodeSet a = new UnicodeSet("[abc]");
|
||||
UnicodeSet empty = new UnicodeSet();
|
||||
System.out.println(a.containsAll(empty));
|
||||
System.out.println(empty.containsAll(a));
|
||||
System.out.println(empty.containsAll(new UnicodeSet()));
|
||||
if (me == null) me = new CompareProperties();
|
||||
me.printStatistics();
|
||||
}
|
||||
|
||||
public final class BitSetComparator implements Comparator {
|
||||
public int compare(Object o1, Object o2) {
|
||||
BitSet bs1 = (BitSet) o1;
|
||||
BitSet bs2 = (BitSet) o2;
|
||||
int count2 = bs1.size() > bs2.size() ? bs1.size() : bs2.size();
|
||||
for (int i = 0; i < count2; ++i) {
|
||||
if (bs1.get(i)) {
|
||||
if (!bs2.get(i)) {
|
||||
return 1;
|
||||
}
|
||||
} else if (bs2.get(i)) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
*
|
||||
* @author Davis
|
||||
*
|
||||
* Reverses the order of a comparison, for getting a list in reverse order
|
||||
*/
|
||||
public static class InverseComparator implements Comparator {
|
||||
private Comparator other;
|
||||
public InverseComparator(Comparator other) {
|
||||
this.other = other;
|
||||
}
|
||||
public int compare(Object a, Object b) {
|
||||
return other.compare(b, a);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
*
|
||||
* @author Davis
|
||||
*
|
||||
* Reverses the order of a comparison, for getting a list in reverse order
|
||||
*/
|
||||
public static class MethodComparator implements Comparator {
|
||||
public int compare(Object a, Object b) {
|
||||
return ((Comparable)a).compareTo(b);
|
||||
}
|
||||
}
|
||||
|
||||
public final static class UnicodeSetComparator implements Comparator {
|
||||
/**
|
||||
* Compares two UnicodeSets, producing a transitive ordering.
|
||||
* The ordering is based on the first codepoint that differs between them.
|
||||
* @return -1 if first set contains the first different code point
|
||||
* 1 if the second set does.
|
||||
* 0 if there is no difference.
|
||||
* If compareTo were added to UnicodeSet, this can be optimized to use list[i].
|
||||
* @author Davis
|
||||
*
|
||||
*/
|
||||
public int compare(Object o1, Object o2) {
|
||||
UnicodeSetIterator it1 = new UnicodeSetIterator((UnicodeSet) o1);
|
||||
UnicodeSetIterator it2 = new UnicodeSetIterator((UnicodeSet) o2);
|
||||
while (it1.nextRange()) {
|
||||
if (!it2.nextRange()) return -1; // first has range while second exhausted
|
||||
if (it1.codepoint < it2.codepoint) return -1; // first has code point not in second
|
||||
if (it1.codepoint > it2.codepoint) return 1;
|
||||
if (it1.codepointEnd < it2.codepointEnd) return 1; // second has codepoint not in first
|
||||
if (it1.codepointEnd > it2.codepointEnd) return -1;
|
||||
}
|
||||
if (it2.nextRange()) return 1; // second has range while first is exhausted
|
||||
return 0; // otherwise we ran out in both of them, so equal
|
||||
}
|
||||
}
|
||||
|
||||
boolean isPartitioned = false;
|
||||
|
||||
UCDProperty[] props = new UCDProperty[500];
|
||||
UnicodeSet[] sets = new UnicodeSet[500];
|
||||
int count = 0;
|
||||
BitSet[] disjoints = new BitSet[500];
|
||||
BitSet[] contains = new BitSet[500];
|
||||
BitSet[] isin = new BitSet[500];
|
||||
BitSet[] equals = new BitSet[500];
|
||||
|
||||
Map map = new TreeMap(new BitSetComparator());
|
||||
|
||||
// Instance initializer; runs for every new CompareProperties.
{
    // Order matters: getProperties() fills props/sets and count, which
    // fillPropertyValues() then reads.
    getProperties();
    fillPropertyValues();
    Utility.fixDot();
}
|
||||
|
||||
private void fillPropertyValues() {
|
||||
BitSet probe = new BitSet();
|
||||
int total = 0;
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
Utility.dot(cp);
|
||||
int cat = Default.ucd().getCategory(cp);
|
||||
// if (cat == UNASSIGNED || cat == PRIVATE_USE || cat == SURROGATE) continue;
|
||||
if (!Default.ucd().isAllocated(cp)) continue;
|
||||
|
||||
for (int i = 0; i < count; ++i) {
|
||||
UCDProperty up = props[i];
|
||||
boolean iProp = up.hasValue(cp);
|
||||
if (iProp) {
|
||||
probe.set(i);
|
||||
sets[i].add(cp);
|
||||
} else {
|
||||
probe.clear(i);
|
||||
}
|
||||
}
|
||||
|
||||
++total;
|
||||
UnicodeSet value = (UnicodeSet) map.get(probe);
|
||||
if (value == null) {
|
||||
value = new UnicodeSet();
|
||||
map.put(probe.clone(), value);
|
||||
// Utility.fixDot();
|
||||
// System.out.println("Set Size: " + map.size() + ", total: " + total + ", " + Default.ucd.getCodeAndName(cp));
|
||||
}
|
||||
value.add(cp);
|
||||
}
|
||||
}
|
||||
|
||||
private void getProperties() {
|
||||
for (int i = 0; i < LIMIT_ENUM; ++i) { // || iType == SCRIPT
|
||||
int iType = i & 0xFF00;
|
||||
if (iType == AGE || iType == JOINING_GROUP || iType == COMBINING_CLASS) continue;
|
||||
if (i == 0x0900) {
|
||||
System.out.println("debug");
|
||||
}
|
||||
UCDProperty up = UnifiedBinaryProperty.make(i, Default.ucd());
|
||||
if (up == null) continue;
|
||||
if (up.getValueType() < BINARY_PROP) {
|
||||
System.out.println("\tSkipping " + up.getName() + "; value varies");
|
||||
continue;
|
||||
}
|
||||
if (!up.isStandard()) {
|
||||
System.out.println("\tSkipping " + getPropName(up) + "; not standard");
|
||||
continue;
|
||||
}
|
||||
if (up.getName(LONG).startsWith("Other_")) {
|
||||
System.out.println("\tSkipping " + getPropName(up) + "; contributory");
|
||||
continue;
|
||||
}
|
||||
if (up.isDefaultValue() || up.skipInDerivedListing()) {
|
||||
System.out.println("\tSkipping " + getPropName(up) + "; default value");
|
||||
continue;
|
||||
}
|
||||
// System.out.println(Utility.hex(i) + " " + up.getName(LONG) + "(" + up.getName(SHORT) + ")");
|
||||
// System.out.println("\t" + up.getValue(LONG) + "(" + up.getValue(SHORT) + ")");
|
||||
sets[count] = new UnicodeSet();
|
||||
disjoints[count] = new BitSet();
|
||||
equals[count] = new BitSet();
|
||||
contains[count] = new BitSet();
|
||||
isin[count] = new BitSet();
|
||||
props[count++] = up;
|
||||
System.out.println(Utility.hex(i) + " " + (count - 1) + " " + getPropName(count - 1));
|
||||
}
|
||||
System.out.println("props: " + count);
|
||||
}
|
||||
|
||||
public void printPartition() throws IOException {
|
||||
System.out.println("Set Size: " + map.size());
|
||||
PrintWriter output = Utility.openPrintWriter("Partition"
|
||||
+ UnicodeDataFile.getFileSuffix(true), Utility.LATIN1_WINDOWS);
|
||||
|
||||
Iterator it = map.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
BitSet probe2 = (BitSet) it.next();
|
||||
UnicodeSet value = (UnicodeSet) map.get(probe2);
|
||||
output.println();
|
||||
output.println(value);
|
||||
output.println("Size: " + value.size());
|
||||
for (int i = 0; i < count; ++i) {
|
||||
if (!probe2.get(i)) continue;
|
||||
output.print(" " + getPropName(i));
|
||||
}
|
||||
output.println();
|
||||
}
|
||||
output.println("Count: " + map.keySet().size());
|
||||
output.close();
|
||||
}
|
||||
|
||||
static final NumberFormat percent = NumberFormat.getPercentInstance(Locale.ENGLISH);
|
||||
|
||||
public void printStatistics() throws IOException {
|
||||
System.out.println("Set Size: " + map.size());
|
||||
PrintWriter output = Utility.openPrintWriter("Statistics"
|
||||
+ UnicodeDataFile.getFileSuffix(true), Utility.LATIN1_WINDOWS);
|
||||
|
||||
System.out.println("Finding disjoints/contains");
|
||||
for (int i = 0; i < count; ++i) {
|
||||
System.out.println(getPropName(i));
|
||||
for (int j = 0; j < count; ++j) {
|
||||
if (j == i) continue;
|
||||
if (i == 1 && j == 2) {
|
||||
System.out.println("debug");
|
||||
}
|
||||
if (sets[i].containsNone(sets[j])) {
|
||||
disjoints[i].set(j);
|
||||
} else if (sets[i].equals(sets[j])) {
|
||||
equals[i].set(j);
|
||||
} else if (sets[i].containsAll(sets[j])) {
|
||||
contains[i].set(j);
|
||||
} else if (sets[j].containsAll(sets[i])) {
|
||||
isin[i].set(j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println("Removing non-maximal sets");
|
||||
// a set is non-maximal if it is contained in one of the other sets
|
||||
// so remove anything that is contained in one of the items
|
||||
if (false) {
|
||||
BitSet[] tempContains = new BitSet[count];
|
||||
for (int i = 0; i < count; ++i) {
|
||||
System.out.println(getPropName(i));
|
||||
tempContains[i] = (BitSet) contains[i]; // worry about collisions
|
||||
BitSet b = contains[i];
|
||||
for (int j = 0; j < b.size(); ++j) {
|
||||
if (b.get(j)) tempContains[i].andNot(contains[j]);
|
||||
}
|
||||
b = disjoints[i]; // don't worry
|
||||
for (int j = 0; j < b.size(); ++j) {
|
||||
if (b.get(j)) b.andNot(contains[j]);
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < count; ++i) {
|
||||
contains[i] = tempContains[i];
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println("Printing disjoints & contains");
|
||||
// a set is non-maximal if it is contained in one of the other sets
|
||||
// so remove anything that is contained in one of the items
|
||||
List remainder = new ArrayList();
|
||||
Map m = new TreeMap(); // new UnicodeSetComparator()
|
||||
for (int i = 0; i < count; ++i) {
|
||||
m.put(getPropName(i), new Integer(i)); // sets[i]
|
||||
}
|
||||
Iterator it = m.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Object key = it.next();
|
||||
int index = ((Integer)m.get(key)).intValue();
|
||||
boolean haveName = printBitSet(output, index, "EQUALS: ", equals[index], false);
|
||||
haveName = printBitSet(output, index, "CONTAINS: ", contains[index], haveName);
|
||||
haveName = printBitSet(output, index, "IS CONTAINED IN: ", isin[index], haveName);
|
||||
if (DO_DISJOINT) {
|
||||
printBitSet(output, index, "IS DISJOINT WITH: ", disjoints[index], haveName);
|
||||
}
|
||||
if (!haveName) remainder.add(getPropName(index));
|
||||
}
|
||||
it = remainder.iterator();
|
||||
output.println();
|
||||
output.print("NONE OF THE ABOVE: ");
|
||||
boolean first = true;
|
||||
while (it.hasNext()) {
|
||||
Object key = it.next();
|
||||
if (!first) output.print(", ");
|
||||
first = false;
|
||||
output.print(key);
|
||||
}
|
||||
output.println();
|
||||
output.close();
|
||||
}
|
||||
|
||||
private boolean printBitSet(PrintWriter output, int index, String title, BitSet b, boolean haveName) {
|
||||
if (!b.isEmpty()) {
|
||||
if (!haveName) {
|
||||
output.println();
|
||||
output.println(getPropName(index));
|
||||
haveName = true;
|
||||
}
|
||||
output.print(title);
|
||||
Set ss = new TreeSet();
|
||||
for (int j = 0; j < b.size(); ++j) {
|
||||
if (b.get(j)) {
|
||||
ss.add(getPropName(j));
|
||||
}
|
||||
}
|
||||
Iterator it = ss.iterator();
|
||||
boolean first = true;
|
||||
while (it.hasNext()) {
|
||||
if (!first) output.print(", ");
|
||||
first = false;
|
||||
output.print(it.next());
|
||||
}
|
||||
output.println();
|
||||
output.flush();
|
||||
}
|
||||
return haveName;
|
||||
}
|
||||
|
||||
/*
|
||||
UnicodeSet a_b = new UnicodeSet();
|
||||
UnicodeSet ab = new UnicodeSet();
|
||||
UnicodeSet _ab = new UnicodeSet();
|
||||
*/
|
||||
/*
|
||||
a_b.set(sets[i]).removeAll(sets[j]);
|
||||
ab.set(sets[i]).retainAll(sets[j]);
|
||||
_ab.set(sets[j]).removeAll(sets[i]);
|
||||
// we are interested in cases where a contains b or is contained by b
|
||||
// contain = _ab = 0
|
||||
// is contained == a_b = 0
|
||||
// is disjoint == ab == 0
|
||||
// is equal == contains & iscontained
|
||||
double total = a_b.size() + ab.size() + _ab.size();
|
||||
double limit = total*0.03;
|
||||
boolean gotName = showDiff(output, "C", j, a_b, total, limit, false);
|
||||
gotName = showDiff(output, "D", j, ab, total, limit, gotName);
|
||||
gotName = showDiff(output, "S", j, _ab, total, limit, gotName);
|
||||
if (gotName) output.println();
|
||||
*/
|
||||
|
||||
private boolean showDiff(PrintWriter output, String title, int propIndex, UnicodeSet a_b,
|
||||
double total, double limit, boolean gotName) {
|
||||
if (0 < a_b.size() && a_b.size() < limit) {
|
||||
if (!gotName) {
|
||||
gotName = true;
|
||||
output.print("\t" + getPropName(propIndex));
|
||||
}
|
||||
output.print("\t" + title + percent.format(a_b.size()/total));
|
||||
}
|
||||
return gotName;
|
||||
}
|
||||
|
||||
private String getPropName(int propertyIndex) {
|
||||
return getPropName(props[propertyIndex]);
|
||||
}
|
||||
|
||||
private String getPropName(UCDProperty ubp) {
|
||||
return Utility.getUnskeleton(ubp.getFullName(LONG), true);
|
||||
}
|
||||
|
||||
public static void listDifferences() throws IOException {
|
||||
|
||||
PrintWriter output = Utility.openPrintWriter("PropertyDifferences" + UnicodeDataFile.getFileSuffix(true), Utility.LATIN1_UNIX);
|
||||
output.println("# Listing of relationships among properties, suitable for analysis by spreadsheet");
|
||||
output.println("# Generated for " + Default.ucd().getVersion());
|
||||
output.println(UnicodeDataFile.generateDateLine());
|
||||
output.println("# P1 P2 R(P1,P2) C(P1&P2) C(P1-P2) C(P2-P1)");
|
||||
|
||||
|
||||
for (int i = 1; i < UCD_Types.LIMIT_ENUM; ++i) {
|
||||
int iType = i & 0xFF00;
|
||||
if (iType == UCD_Types.JOINING_GROUP || iType == UCD_Types.AGE || iType == UCD_Types.COMBINING_CLASS || iType == UCD_Types.SCRIPT) continue;
|
||||
UCDProperty upi = UnifiedBinaryProperty.make(i, Default.ucd());
|
||||
if (upi == null) continue;
|
||||
if (!upi.isStandard()) {
|
||||
System.out.println("Skipping " + upi.getName() + "; not standard");
|
||||
continue;
|
||||
}
|
||||
if (upi.getValueType() < UCD_Types.BINARY_PROP) {
|
||||
System.out.println("Skipping " + upi.getName() + "; value varies");
|
||||
continue;
|
||||
}
|
||||
|
||||
String iNameShort = upi.getFullName(UCD_Types.SHORT);
|
||||
String iNameLong = upi.getFullName(UCD_Types.LONG);
|
||||
|
||||
System.out.println();
|
||||
System.out.println();
|
||||
System.out.println(iNameLong);
|
||||
output.println("#" + iNameLong);
|
||||
|
||||
int last = -1;
|
||||
for (int j = i+1; j < UCD_Types.LIMIT_ENUM; ++j) {
|
||||
int jType = j & 0xFF00;
|
||||
if (jType == UCD_Types.JOINING_GROUP || jType == UCD_Types.AGE || jType == UCD_Types.COMBINING_CLASS || jType == UCD_Types.SCRIPT
|
||||
|| (jType == iType && jType != UCD_Types.BINARY_PROPERTIES)) continue;
|
||||
UCDProperty upj = UnifiedBinaryProperty.make(j, Default.ucd());
|
||||
if (upj == null) continue;
|
||||
if (!upj.isStandard()) continue;
|
||||
if (upj.getValueType() < UCD_Types.BINARY_PROP) continue;
|
||||
|
||||
|
||||
if ((j >> 8) != last) {
|
||||
last = j >> 8;
|
||||
System.out.println();
|
||||
System.out.print("\t" + UCD_Names.SHORT_UNIFIED_PROPERTIES[last]);
|
||||
output.flush();
|
||||
output.println("#\t" + UCD_Names.SHORT_UNIFIED_PROPERTIES[last]);
|
||||
} else {
|
||||
System.out.print('.');
|
||||
}
|
||||
System.out.flush();
|
||||
|
||||
int bothCount = 0, i_jPropCount = 0, j_iPropCount = 0, iCount = 0, jCount = 0;
|
||||
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
int cat = Default.ucd().getCategory(cp);
|
||||
if (cat == UCD_Types.UNASSIGNED || cat == UCD_Types.PRIVATE_USE || cat == UCD_Types.SURROGATE) continue;
|
||||
if (!Default.ucd().isAllocated(cp)) continue;
|
||||
|
||||
boolean iProp = upi.hasValue(cp);
|
||||
boolean jProp = upj.hasValue(cp);
|
||||
|
||||
if (jProp) ++jCount;
|
||||
if (iProp) {
|
||||
++iCount;
|
||||
if (jProp) ++bothCount;
|
||||
else ++i_jPropCount;
|
||||
} else if (jProp) ++j_iPropCount;
|
||||
}
|
||||
if (iCount == 0 || jCount == 0) continue;
|
||||
|
||||
String jNameShort = upj.getFullName(UCD_Types.SHORT);
|
||||
//String jNameLong = ubp.getFullID(j, LONG);
|
||||
|
||||
String rel = bothCount == 0 ? "DISJOINT"
|
||||
: i_jPropCount == 0 && j_iPropCount == 0 ? "EQUALS"
|
||||
: i_jPropCount == 0 ? "CONTAINS" // depends on reverse output
|
||||
: j_iPropCount == 0 ? "CONTAINS"
|
||||
: "OVERLAPS";
|
||||
|
||||
if (j_iPropCount > i_jPropCount) {
|
||||
// reverse output
|
||||
output.println(jNameShort + "\t" + iNameShort + "\t" + rel
|
||||
+ "\t" + bothCount + "\t" + j_iPropCount + "\t" + i_jPropCount);
|
||||
} else {
|
||||
output.println(iNameShort + "\t" + jNameShort + "\t" + rel
|
||||
+ "\t" + bothCount + "\t" + i_jPropCount + "\t" + j_iPropCount);
|
||||
}
|
||||
}
|
||||
}
|
||||
output.close();
|
||||
}
|
||||
}
|
@ -1,908 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $
|
||||
* $Date: 2006/04/05 22:12:44 $
|
||||
* $Revision: 1.18 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
import java.util.*;
|
||||
import java.text.NumberFormat;
|
||||
import java.io.*;
|
||||
|
||||
|
||||
/** Simple program to merge UCD files into XML. Not yet documented!!
|
||||
* @author Mark Davis
|
||||
*/
|
||||
|
||||
public final class ConvertUCD implements UCD_Types {
|
||||
public static final boolean SHOW = false;
|
||||
public static final boolean DEBUG = false;
|
||||
static final boolean SHOW_SAMPLE = false;
|
||||
|
||||
|
||||
int major;
|
||||
int minor;
|
||||
int update;
|
||||
|
||||
String version;
|
||||
|
||||
// varies by version
|
||||
/*
|
||||
public static final String BASE_DIR11 = DATA_DIR + "\\Versions\\";
|
||||
public static final String BASE_DIR20 = DATA_DIR + "\\Versions\\";
|
||||
public static final String BASE_DIR21 = DATA_DIR + "\\Versions\\";
|
||||
public static final String BASE_DIR30 = DATA_DIR + "\\Update 3.0.1\\";
|
||||
public static final String BASE_DIR31 = DATA_DIR + "\\3.1-Update\\";
|
||||
*/
|
||||
|
||||
//public static final String blocksnamePlain = "Blocks.txt";
|
||||
//public static final String blocksname31 = "Blocks-4d2.beta";
|
||||
|
||||
/** First item is file name, rest are field names (skipping character).
|
||||
* "OMIT" is special -- means don't record
|
||||
*/
|
||||
|
||||
static String[][] labelList = {
|
||||
// Labels for the incoming files. Labels MUST match field order in file.
|
||||
// IMPORTANT - defaults of form y-=x must occur after x is encountered!
|
||||
// The one exception is "st", which is handled specially.
|
||||
// So file order is important.
|
||||
//*
|
||||
// 01CA;LATIN CAPITAL LETTER NJ;Lu;0; L; <compat> 004E 004A; ; ; ;N ;LATIN CAPITAL LETTER N J; ; ;01CC;01CB
|
||||
// n gc cc bc dm dd dv nv bm on cm, uc lc tc
|
||||
{"UnicodeData", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
|
||||
//{"ExtraProperties", "xp"},
|
||||
{"PropList", "binary"},
|
||||
|
||||
//{"ExtraProperties", "xp"},
|
||||
|
||||
{"EastAsianWidth", "ea", "OMIT"},
|
||||
{"LineBreak", "lb", "OMIT"},
|
||||
{"SpecialCasing", "*sl", "*st", "*su", "sc"},
|
||||
{"CompositionExclusions", "ce"},
|
||||
{"CaseFolding", "OMIT", "*fc"},
|
||||
{"ArabicShaping", "OMIT", "jt", "jg"},
|
||||
{"BidiMirroring", "*bg"},
|
||||
{"Scripts", "sn"},
|
||||
//{"Jamo", "jn"},
|
||||
//{"Scripts-1d4", "RANGE", "sn"},
|
||||
//{"Age", "*sn"},
|
||||
//*/
|
||||
/*
|
||||
//*/
|
||||
};
|
||||
static HashMap isHex = new HashMap();
|
||||
static HashMap defaults = new HashMap();
|
||||
|
||||
static {
|
||||
for (int j = 0; j < labelList.length; ++j) {
|
||||
String[] labels = labelList[j];
|
||||
|
||||
for (int i = 1; i < labels.length; ++i) {
|
||||
boolean hex = false;
|
||||
String def = null;
|
||||
//char appendChar = '\u0000';
|
||||
|
||||
// pull off "*": hex interpretation
|
||||
if (labels[i].charAt(0) == '*') { // HEX value
|
||||
hex = true;
|
||||
labels[i] = labels[i].substring(1);
|
||||
}
|
||||
|
||||
/*
|
||||
// pull off "$": append duplicates
|
||||
if (labels[i].charAt(0) == '$') { // HEX value
|
||||
appendChar = labels[i].charAt(1);
|
||||
labels[i] = labels[i].substring(2);
|
||||
}
|
||||
|
||||
// pull off default values
|
||||
int pos = labels[i].indexOf('-');
|
||||
if (pos >= 0) {
|
||||
def = labels[i].substring(pos+1);
|
||||
labels[i] = labels[i].substring(0,pos);
|
||||
}
|
||||
*/
|
||||
// store results
|
||||
// we do this after all processing, so that the label is clean!!
|
||||
|
||||
if (hex) isHex.put(labels[i], "");
|
||||
//if (appendChar != 0) appendDuplicates.put(labels[i], String.valueOf(appendChar));
|
||||
defaults.put(labels[i], def);
|
||||
}
|
||||
}
|
||||
}
|
||||
/*
|
||||
static String[][] labelList31 = {
|
||||
// Labels for the incoming files. Labels MUST match field order in file.
|
||||
// IMPORTANT - defaults of form y-=x must occur after x is encountered!
|
||||
// The one exception is "st", which is handled specially.
|
||||
// So file order is important.
|
||||
//*
|
||||
// 01CA;LATIN CAPITAL LETTER NJ;Lu;0; L; <compat> 004E 004A; ; ; ;N ;LATIN CAPITAL LETTER N J; ; ;01CC;01CB
|
||||
// n gc cc bc dm dd dv nv bm on cm, uc lc tc
|
||||
{"UnicodeData-3.1.0d8.beta", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
|
||||
{"PropList-3.1.0d5.beta", "binary"},
|
||||
|
||||
{"ExtraProperties", "xp"},
|
||||
|
||||
{"EastAsianWidth-4d7.beta", "ea", "OMIT"},
|
||||
{"LineBreak-6d6.beta", "lb", "OMIT"},
|
||||
{"SpecialCasing-4d1.beta", "*sl", "*st", "*su", "sc"},
|
||||
{"CompositionExclusions-3d6.beta", "ce"},
|
||||
{"CaseFolding-3d4.beta", "OMIT", "*fc"},
|
||||
{"ArabicShaping", "OMIT", "jt", "jg"},
|
||||
{"BidiMirroring", "*bg"},
|
||||
{"Scripts-3.1.0d4.beta", "sn"},
|
||||
//{"Scripts-1d4", "RANGE", "sn"},
|
||||
//{"Age", "*sn"},
|
||||
//*/
|
||||
/*
|
||||
{"Jamo", "jn"},
|
||||
//
|
||||
};
|
||||
/*
|
||||
{"UnicodeData-3.1.0d8.beta", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
|
||||
{"ExtraProperties", "xp"},
|
||||
|
||||
{"EastAsianWidth-4d7.beta", "ea", "OMIT"},
|
||||
{"LineBreak-6d6.beta", "lb", "OMIT"},
|
||||
{"SpecialCasing-4d1.beta", "*sl", "*st", "*su", "sc"},
|
||||
{"CompositionExclusions-3d6.beta", "ce"},
|
||||
{"CaseFolding-3d4.beta", "OMIT", "*fc"},
|
||||
{"PropList-3.1.0d2.beta", "PROP", "OMIT"},
|
||||
{"ArabicShaping", "OMIT", "jt", "jg"},
|
||||
{"BidiMirroring", "*bg"},
|
||||
{"Scripts-1d4", "sn"},
|
||||
//{"Scripts-1d4", "RANGE", "sn"},
|
||||
//{"Age", "*sn"},
|
||||
//*/
|
||||
/*
|
||||
{"Jamo", "jn"},
|
||||
//
|
||||
|
||||
//"NamesList-3.1.0d1.beta"
|
||||
|
||||
static String[][] labelList30 = {
|
||||
// Labels for the incoming files. Labels MUST match field order in file.
|
||||
// IMPORTANT - defaults of form y-=x must occur after x is encountered!
|
||||
// The one exception is "st", which is handled specially.
|
||||
// So file order is important.
|
||||
//*
|
||||
{"UnicodeData", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
|
||||
{"CompositionExclusions", "ce"},
|
||||
{"EastAsianWidth", "ea", "OMIT"},
|
||||
{"LineBreak", "lb", "OMIT"},
|
||||
{"SpecialCasing", "*sl", "*st", "*su", "sc"},
|
||||
{"CaseFolding", "OMIT", "*fc"},
|
||||
{"ArabicShaping", "OMIT", "jt", "jg"},
|
||||
{"BidiMirroring", "*bg"},
|
||||
/*
|
||||
{"Jamo", "jn"},
|
||||
{"PropList.alpha", "RANGE", "OMIT"},
|
||||
//
|
||||
};
|
||||
|
||||
static String[][] labelList11 = {
|
||||
{"UnicodeData-1.1", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
|
||||
};
|
||||
|
||||
static String[][] labelList20 = {
|
||||
{"UnicodeData-2.0", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
|
||||
};
|
||||
|
||||
static String[][] labelList21 = {
|
||||
{"UnicodeData-2.1", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
|
||||
};
|
||||
*/
|
||||
|
||||
// handles
|
||||
public static final String blocksname = "Blocks";
|
||||
//public static final String[][] labelList;
|
||||
public static final boolean NEWPROPS = true;
|
||||
|
||||
/*
|
||||
static {
|
||||
switch (major*10 + minor) {
|
||||
case 31:
|
||||
blocksname = blocksname31;
|
||||
labelList = labelList31;
|
||||
break;
|
||||
case 30:
|
||||
blocksname = blocksnamePlain;
|
||||
labelList = labelList30;
|
||||
break;
|
||||
case 21:
|
||||
blocksname = blocksnamePlain;
|
||||
labelList = labelList21;
|
||||
break;
|
||||
case 20:
|
||||
blocksname = blocksnamePlain;
|
||||
labelList = labelList20;
|
||||
break;
|
||||
default:
|
||||
blocksname = blocksnamePlain;
|
||||
labelList = labelList11;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
*/
|
||||
static final String dataFilePrefix = "UCD_Data";
|
||||
|
||||
|
||||
// MAIN!!
|
||||
|
||||
public static void main (String[] args) throws Exception {
|
||||
System.out.println("Building binary version of UCD");
|
||||
|
||||
log = new PrintWriter(new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
new FileOutputStream(GEN_DIR + "UCD-log.txt"),
|
||||
"UTF8"),
|
||||
32*1024));
|
||||
log.write("\uFEFF"); // BOM
|
||||
|
||||
try {
|
||||
for (int i = 0; i < args.length; ++i) {
|
||||
String version = args[i];
|
||||
if (version.length() == 0) version = UCD.latestVersion;
|
||||
|
||||
new ConvertUCD().toJava(version);
|
||||
}
|
||||
} finally {
|
||||
log.close();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
static void toXML() throws Exception {
|
||||
// Blocks is special
|
||||
// Unihan is special
|
||||
// collect all the other .txt files in the directory
|
||||
if (false) readBlocks();
|
||||
if (true) for (int i = 0; i < labelList.length; ++i) {
|
||||
readSemi(labelList[i]);
|
||||
} else {
|
||||
readSemi(labelList[0]); // TESTING ONLY
|
||||
}
|
||||
writeXML();
|
||||
}
|
||||
*/
|
||||
|
||||
void toJava(String version) throws Exception {
|
||||
this.version = version;
|
||||
String[] parts = new String[3];
|
||||
Utility.split(version, '.', parts);
|
||||
major = Integer.parseInt(parts[0]);
|
||||
minor = Integer.parseInt(parts[1]);
|
||||
update = Integer.parseInt(parts[2]);
|
||||
System.out.println("Building " + version);
|
||||
// Blocks is special
|
||||
// Unihan is special
|
||||
// collect all the other .txt files in the directory
|
||||
if (false) readBlocks();
|
||||
if (true) for (int i = 0; i < labelList.length; ++i) {
|
||||
readSemi(labelList[i]);
|
||||
} else {
|
||||
readSemi(labelList[0]); // TESTING ONLY
|
||||
}
|
||||
|
||||
Iterator it = charData.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Object key = it.next();
|
||||
UData value = (UData) charData.get(key);
|
||||
value.compact();
|
||||
}
|
||||
|
||||
/*
|
||||
UData ud;
|
||||
ud = getEntry(0x5e);
|
||||
System.out.println("SPOT-CHECK: 5e: " + ud);
|
||||
|
||||
ud = getEntry(0x130);
|
||||
System.out.println("SPOT-CHECK: 130: " + ud);
|
||||
|
||||
ud = getEntry(0x1f6);
|
||||
System.out.println("SPOT-CHECK: 1f6: " + ud);
|
||||
|
||||
ud = getEntry(0x2A6D6);
|
||||
System.out.println("SPOT-CHECK: 2A6D6: " + ud);
|
||||
|
||||
ud = getEntry(0xFFFF);
|
||||
System.out.println("SPOT-CHECK: FFFF: " + ud);
|
||||
*/
|
||||
|
||||
writeJavaData();
|
||||
}
|
||||
|
||||
static PrintWriter log;
|
||||
//static String directory = BASE_DIR;
|
||||
//static Map appendDuplicates = new HashMap();
|
||||
|
||||
/** First item in labels is file name, rest are field names (skipping character).
|
||||
* "OMIT" is special -- means don't record
|
||||
*/
|
||||
|
||||
|
||||
List blockData = new LinkedList();
|
||||
|
||||
void readBlocks() throws Exception {
|
||||
System.out.println("Reading 'Blocks'");
|
||||
BufferedReader input = Utility.openUnicodeFile(blocksname, version, true, Utility.LATIN1);
|
||||
String line = "";
|
||||
try {
|
||||
String[] parts = new String[20];
|
||||
for (int lineNumber = 1; ; ++lineNumber) {
|
||||
line = input.readLine();
|
||||
if (line == null) break;
|
||||
if (SHOW && (lineNumber % 500) == 0) System.out.println("//" + lineNumber + ": '" + line + "'");
|
||||
|
||||
//String original = line;
|
||||
String comment = "";
|
||||
int commentPos = line.indexOf('#');
|
||||
if (commentPos >= 0) {
|
||||
comment = line.substring(commentPos+1).trim();
|
||||
line = line.substring(0, commentPos);
|
||||
}
|
||||
line = line.trim();
|
||||
if (line.length() == 0) continue;
|
||||
|
||||
int count = Utility.split(line,';',parts);
|
||||
if (count != 3) throw new ChainException("Bad count in Blocks", null);
|
||||
blockData.add(new String[] {Utility.fromHex(parts[0]), Utility.fromHex(parts[1]), parts[2].trim()});
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
System.out.println("Exception at: " + line);
|
||||
throw e;
|
||||
} finally {
|
||||
input.close();
|
||||
}
|
||||
}
|
||||
|
||||
Set properties = new TreeSet();
|
||||
|
||||
void readSemi(String[] labels) throws Exception {
|
||||
System.out.println();
|
||||
System.out.println("Reading '" + labels[0] + "'");
|
||||
if (major < 3 || (major == 3 && minor < 1)) {
|
||||
if (labels[0] == "PropList") {
|
||||
System.out.println("SKIPPING old format of Proplist for " + version);
|
||||
return;
|
||||
}
|
||||
}
|
||||
String tempVersion = version;
|
||||
if (version.equals(UCD.latestVersion)) tempVersion = "";
|
||||
BufferedReader input = Utility.openUnicodeFile(labels[0], tempVersion, true, Utility.LATIN1);
|
||||
if (input == null) {
|
||||
System.out.println("COULDN'T OPEN: " + labels[0]);
|
||||
return;
|
||||
}
|
||||
boolean showedSemi = false;
|
||||
boolean showedShort = false;
|
||||
String line = "";
|
||||
|
||||
try {
|
||||
String[] parts = new String[20];
|
||||
for (int lineNumber = 1; ; ++lineNumber) {
|
||||
try {
|
||||
line = input.readLine();
|
||||
if (line == null) break;
|
||||
if (SHOW && (lineNumber % 500) == 0) System.out.println("//" + lineNumber + ": '" + line + "'");
|
||||
|
||||
String original = line;
|
||||
String comment = "";
|
||||
int commentPos = line.indexOf('#');
|
||||
if (commentPos >= 0) {
|
||||
comment = line.substring(commentPos+1).trim();
|
||||
line = line.substring(0, commentPos);
|
||||
}
|
||||
line = line.trim();
|
||||
if (line.length() == 0) continue;
|
||||
|
||||
int count = Utility.split(line,';',parts);
|
||||
|
||||
if (false && parts[0].equals("2801")) {
|
||||
System.out.println("debug?");
|
||||
}
|
||||
|
||||
// fix malformed or simple lists.
|
||||
|
||||
if (count != labels.length) {
|
||||
if (count == labels.length + 1 && parts[count-1].equals("")) {
|
||||
if (!showedSemi) System.out.println("Extra semicolon in: " + original);
|
||||
showedSemi = true;
|
||||
} else if (count == 1) { // fix simple list
|
||||
++count;
|
||||
parts[1] = "Y";
|
||||
} else if (count < labels.length) {
|
||||
if (!showedShort) System.out.println("Line shorter than labels: " + original);
|
||||
showedShort = true;
|
||||
for (int i = count; i < labels.length; ++i) {
|
||||
parts[i] = "";
|
||||
}
|
||||
} else {
|
||||
throw new ChainException("wrong count: {0}",
|
||||
new Object[] {new Integer(line), new Integer(count)});
|
||||
}
|
||||
}
|
||||
|
||||
// store char
|
||||
// first field is always character OR range. May be UTF-32
|
||||
int cpTop;
|
||||
int cpStart;
|
||||
int ddot = parts[0].indexOf(".");
|
||||
if (ddot >= 0) {
|
||||
cpStart = UTF32.char32At(Utility.fromHex(parts[0].substring(0,ddot)),0);
|
||||
cpTop = UTF32.char32At(Utility.fromHex(parts[0].substring(ddot+2)),0);
|
||||
// System.out.println(Utility.hex(cpStart) + " ... " + Utility.hex(cpTop));
|
||||
} else {
|
||||
cpStart = UTF32.char32At(Utility.fromHex(parts[0]),0);
|
||||
cpTop = cpStart;
|
||||
if (labels[1].equals("RANGE")) UTF32.char32At(Utility.fromHex(parts[1]),0);
|
||||
}
|
||||
|
||||
// properties first
|
||||
if (labels[1].equals("PROP")) {
|
||||
String prop = parts[2].trim();
|
||||
// FIX!!
|
||||
boolean skipLetters = false;
|
||||
if (prop.equals("Alphabetic")) {
|
||||
prop = "Other_Alphabetic";
|
||||
skipLetters = true;
|
||||
}
|
||||
// END FIX!!
|
||||
properties.add(prop);
|
||||
if (Utility.find(prop, UCD_Names.DeletedProperties, true) == -1) { // only undeleted
|
||||
int end = UTF32.char32At(Utility.fromHex(parts[1]),0);
|
||||
if (end == 0) end = cpStart;
|
||||
|
||||
for (int j = cpStart; j <= end; ++j) {
|
||||
if (j != UCD.mapToRepresentative(j, Integer.MAX_VALUE)) continue;
|
||||
if (skipLetters && getEntry(cpStart).isLetter()) continue;
|
||||
appendCharProperties(j, prop);
|
||||
}
|
||||
}
|
||||
} else { // not range!
|
||||
String val = "";
|
||||
String lastVal;
|
||||
|
||||
for (int i = 1; i < labels.length; ++i) {
|
||||
String key = labels[i];
|
||||
lastVal = val;
|
||||
if (isHex.get(key) != null) {
|
||||
val = Utility.fromHex(parts[i]);
|
||||
} else {
|
||||
val = parts[i].trim();
|
||||
}
|
||||
if (key.equals("OMIT")) continue; // do after val, so lastVal is correct
|
||||
if (key.equals("RANGE")) continue; // do after val, so lastVal is correct
|
||||
if (val.equals("")) continue; // skip empty values, they mean default
|
||||
|
||||
for (int cps = cpStart; cps <= cpTop; ++cps) {
|
||||
if (UCD.mapToRepresentative(cps, Integer.MAX_VALUE) != cps) continue; // skip condensed ranges
|
||||
|
||||
if (key.equals("binary")) {
|
||||
appendCharProperties(cps, val);
|
||||
} else if (key.equals("fc")) {
|
||||
UData data = getEntry(cps);
|
||||
String type = parts[i-1].trim();
|
||||
if (type.equals("F") || type.equals("C") || type.equals("E") || type.equals("L")) {
|
||||
data.fullCaseFolding = val;
|
||||
//System.out.println("*<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val));
|
||||
}
|
||||
if (type.equals("S") || type.equals("C") || type.equals("L")) {
|
||||
data.simpleCaseFolding = val;
|
||||
//System.out.println("<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val));
|
||||
}
|
||||
if (type.equals("I")) {
|
||||
data.simpleCaseFolding = val;
|
||||
setBinaryProperty(cps, CaseFoldTurkishI);
|
||||
if (DEBUG) System.out.println("SPOT-CHECK: <" + parts[i-1] + "> Setting "
|
||||
+ Utility.hex(cps) + ": " + Utility.hex(val));
|
||||
}
|
||||
} else if (labels[0].equals("SpecialCasing") // special handling for special casing
|
||||
&& labels[4].equals("sc")
|
||||
&& parts[4].trim().length() > 0) {
|
||||
if (i < 4) {
|
||||
if (DEBUG) System.out.println("Got special: " + Utility.hex(cps) + ", "
|
||||
+ Utility.hex(key) + ":" + Utility.hex(val));
|
||||
addCharData(cps, "sc", parts[4].trim() + ":" + key + ":" + val);
|
||||
}
|
||||
} else {
|
||||
/*if (key.equals("sn")) { // SKIP UNDEFINED!!
|
||||
UData data = getEntryIfExists(cps);
|
||||
if (data == null || data.generalCategory == Cn) continue;
|
||||
}
|
||||
*/
|
||||
addCharData(cps, key, val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
System.err.println("*Exception at: " + line + ", " + e.getMessage());
|
||||
//System.err.println(e.getMessage());
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
System.out.println("Exception at: " + line + ", " + e.getMessage());
|
||||
throw e;
|
||||
} finally {
|
||||
input.close();
|
||||
}
|
||||
//printValues("JOINING_TYPE", jtSet);
|
||||
//printValues("JOINING_GROUP", jgSet);
|
||||
}
|
||||
|
||||
static void printValues(String title, Set s) {
|
||||
Iterator it = s.iterator();
|
||||
System.out.println("public static String[] " + title + " = {");
|
||||
while (it.hasNext()) {
|
||||
String value = (String) it.next();
|
||||
System.out.println(" \"" + value + "\",");
|
||||
}
|
||||
System.out.println("};");
|
||||
it = s.iterator();
|
||||
System.out.println("public static byte ");
|
||||
int count = 0;
|
||||
while (it.hasNext()) {
|
||||
String value = (String) it.next();
|
||||
System.out.println(" " + value.replace(' ', '-').toUpperCase() + " = " + (count++) + ",");
|
||||
}
|
||||
System.out.println(" LIMIT_" + title + " = " + count);
|
||||
System.out.println(";");
|
||||
}
|
||||
|
||||
// The character table: maps an Integer code point to its UData record (see getEntry).
Map charData = new TreeMap();
|
||||
|
||||
/*
|
||||
static void writeXML() throws IOException {
|
||||
System.out.println("Writing 'UCD-Main.xml'");
|
||||
BufferedWriter output = new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
new FileOutputStream(UCD.BIN_DIR + "UCD_Data.xml"),
|
||||
"UTF8"),
|
||||
32*1024);
|
||||
|
||||
try {
|
||||
// write header
|
||||
|
||||
output.write("<?xml version='1.0' encoding='utf-8'?>\r\n");
|
||||
output.write("<UnicodeCharacterDatabase>\r\n");
|
||||
output.write(" <!-- IMPORTANT: see UCD-Notes.html for information on the format. This file CANNOT be read correctly without that information. -->\r\n");
|
||||
output.write(" <unicode version='" + major + "' minor='" + minor + "' update='" + update + "'/>\r\n");
|
||||
output.write(" <fileVersion status='DRAFT' date='" + new Date() + "'/>\r\n");
|
||||
|
||||
// write blocks
|
||||
|
||||
Iterator it = blockData.iterator();
|
||||
while (it.hasNext()) {
|
||||
String[] block = (String[]) it.next();
|
||||
output.write(" <block start='" + Utility.quoteXML(block[0])
|
||||
+ "' end='" + Utility.quoteXML(block[1])
|
||||
+ "' name='" + Utility.quoteXML(block[2])
|
||||
+ "'/>\r\n" );
|
||||
}
|
||||
|
||||
// write char data
|
||||
|
||||
it = charData.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Integer cc = (Integer) it.next();
|
||||
output.write(" <e c='" + Utility.quoteXML(cc.intValue()) + "'");
|
||||
/*
|
||||
UData data = (UData) charData.get(cc);
|
||||
Iterator dataIt = data.keySet().iterator();
|
||||
while (dataIt.hasNext()) {
|
||||
String label = (String) dataIt.next();
|
||||
if (label.equals("c")) continue; // already wrote it.
|
||||
if (label.equals("fc")) {
|
||||
String fc = getResolved(data, "fc");
|
||||
String lc = getResolved(data, "lc");
|
||||
if (!fc.equals(lc) && !lc.equals(cc)) log.println("FC " + fc.length() + ": " + toString(cc));
|
||||
}
|
||||
String value = Utility.quoteXML((String) data.get(label));
|
||||
output.write(" " + label + "='" + value + "'");
|
||||
}
|
||||
*//*
|
||||
output.write("/>\r\n");
|
||||
}
|
||||
|
||||
// write footer
|
||||
|
||||
output.write("</UnicodeCharacterDatabase>\r\n");
|
||||
} finally {
|
||||
output.close();
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
void writeJavaData() throws IOException {
|
||||
Iterator it = charData.keySet().iterator();
|
||||
int codePoint = -1;
|
||||
System.out.println("Writing " + dataFilePrefix + version);
|
||||
DataOutputStream dataOut = new DataOutputStream(
|
||||
new BufferedOutputStream(
|
||||
new FileOutputStream(UCD.BIN_DIR + dataFilePrefix + version + ".bin"),
|
||||
128*1024));
|
||||
|
||||
// write header
|
||||
dataOut.writeByte(BINARY_FORMAT);
|
||||
dataOut.writeByte(major);
|
||||
dataOut.writeByte(minor);
|
||||
dataOut.writeByte(update);
|
||||
long millis = System.currentTimeMillis();
|
||||
dataOut.writeLong(millis);
|
||||
dataOut.writeInt(charData.size());
|
||||
System.out.println("Data Size: " + NumberFormat.getInstance().format(charData.size()));
|
||||
int count = 0;
|
||||
|
||||
// write records
|
||||
try {
|
||||
// write char data
|
||||
|
||||
while (it.hasNext()) {
|
||||
Object cc = (Object) it.next();
|
||||
//codePoint = UTF32.char32At(cc,0);
|
||||
if (DEBUG) System.out.println(Utility.hex(cc));
|
||||
|
||||
UData uData = (UData) charData.get(cc);
|
||||
if (false && uData.name == null) {
|
||||
System.out.println("Warning: NULL name\r\n" + uData);
|
||||
System.out.println();
|
||||
}
|
||||
if (false && uData.codePoint == 0x2801) {
|
||||
System.out.println("SPOT-CHECK: " + uData);
|
||||
}
|
||||
uData.writeBytes(dataOut);
|
||||
count++;
|
||||
if (DEBUG) System.out.println("Setting2");
|
||||
}
|
||||
System.out.println("Wrote Data " + count);
|
||||
} catch (Exception e) {
|
||||
throw new ChainException("Bad data write {0}", new Object [] {Utility.hex(codePoint)}, e);
|
||||
} finally {
|
||||
dataOut.close();
|
||||
}
|
||||
}
|
||||
|
||||
//static String[] xsSplit = new String[40];
|
||||
|
||||
// Cache a little bit for speed
|
||||
int getEntryCodePoint = -1;
|
||||
UData getEntryUData = null;
|
||||
|
||||
UData getEntryIfExists(int cp) {
|
||||
if (cp == getEntryCodePoint) return getEntryUData;
|
||||
Integer cc = new Integer(cp);
|
||||
UData charEntry = (UData) charData.get(cc);
|
||||
if (charEntry == null) return null;
|
||||
getEntryCodePoint = cp;
|
||||
getEntryUData = charEntry;
|
||||
return charEntry;
|
||||
}
|
||||
|
||||
/* Get entry in table for cc
|
||||
*/
|
||||
UData getEntry(int cp) {
|
||||
if (cp == getEntryCodePoint) return getEntryUData;
|
||||
Integer cc = new Integer(cp);
|
||||
UData charEntry = (UData) charData.get(cc);
|
||||
if (charEntry == null) {
|
||||
charEntry = new UData(cp);
|
||||
charData.put(cc, charEntry);
|
||||
//charEntry.put("c", cc);
|
||||
}
|
||||
getEntryCodePoint = cp;
|
||||
getEntryUData = charEntry;
|
||||
return charEntry;
|
||||
}
|
||||
/** Adds the character data. Signals duplicates with an exception
|
||||
*/
|
||||
|
||||
void setBinaryProperty(int cp, int binProp) {
|
||||
UData charEntry = getEntry(cp);
|
||||
charEntry.binaryProperties |= (1L << binProp);
|
||||
}
|
||||
|
||||
void appendCharProperties(int cp, String key) {
|
||||
int ind;
|
||||
//if (true || NEWPROPS) {
|
||||
ind = Utility.lookup(key, UCD_Names.BP, true);
|
||||
/*} else {
|
||||
ind = Utility.lookup(key, UCD_Names.BP_OLD);
|
||||
}
|
||||
*/
|
||||
//charEntry.binaryProperties |= (1 << ind);
|
||||
setBinaryProperty(cp, ind);
|
||||
}
|
||||
|
||||
Set jtSet = new TreeSet();
|
||||
Set jgSet = new TreeSet();
|
||||
|
||||
/** Adds the character data. Signals duplicates with an exception
|
||||
*/
|
||||
void addCharData(int cp, String key, String value) {
|
||||
//if (cp < 10) System.out.println("A: " + Utility.hex(cp) + ", " + key + ", " + Utility.quoteJavaString(value));
|
||||
UData charEntry = getEntry(cp);
|
||||
//if (cp < 10) System.out.println(" " + charEntry);
|
||||
|
||||
if (SHOW_SAMPLE && cp == 0x221) {
|
||||
System.out.println("Sample: " + cp + ", " + key + ", " + value);
|
||||
System.out.println(charEntry);
|
||||
}
|
||||
|
||||
if (key.equals("bm")) {
|
||||
if (value.equals("Y")) charEntry.binaryProperties |= 1;
|
||||
} else if (key.equals("ce")) {
|
||||
charEntry.binaryProperties |= 2;
|
||||
} else if (key.equals("on")) {
|
||||
if (charEntry.name.charAt(0) == '<') {
|
||||
charEntry.name = '<' + value + '>';
|
||||
}
|
||||
} else if (key.equals("dm")) {
|
||||
charEntry.decompositionType = CANONICAL;
|
||||
if (value.charAt(0) == '<') {
|
||||
int pos = value.indexOf('>');
|
||||
String dType = value.substring(1,pos);
|
||||
if (major < 2) if (dType.charAt(0) == '+') dType = dType.substring(1);
|
||||
value = value.substring(pos+1);
|
||||
setField(charEntry, "dt", dType);
|
||||
}
|
||||
// FIX OLD
|
||||
if (major < 2) {
|
||||
int oldStyle = value.indexOf('<');
|
||||
if (oldStyle > 0) {
|
||||
value = value.substring(0,oldStyle);
|
||||
}
|
||||
oldStyle = value.indexOf('{');
|
||||
if (oldStyle > 0) {
|
||||
value = value.substring(0,oldStyle);
|
||||
}
|
||||
}
|
||||
setField(charEntry, key, Utility.fromHex(value));
|
||||
|
||||
// fix the numeric fields to be more sensible
|
||||
} else if (key.equals("dd")) {
|
||||
if (charEntry.numericType < UCD_Types.DECIMAL) {
|
||||
charEntry.numericType = UCD_Types.DECIMAL;
|
||||
}
|
||||
setField(charEntry, "nv", value);
|
||||
} else if (key.equals("dv")) {
|
||||
if (charEntry.numericType < UCD_Types.DIGIT) {
|
||||
charEntry.numericType = UCD_Types.DIGIT;
|
||||
}
|
||||
setField(charEntry, "nv", value);
|
||||
} else if (key.equals("nv")) {
|
||||
if (charEntry.numericType < UCD_Types.NUMERIC) {
|
||||
charEntry.numericType = UCD_Types.NUMERIC;
|
||||
}
|
||||
setField(charEntry, "nv", value);
|
||||
/*} else if (key.equals("jt")) {
|
||||
jtSet.add(value);
|
||||
} else if (key.equals("jg")) {
|
||||
jgSet.add(value);
|
||||
*/
|
||||
} else {
|
||||
setField(charEntry, key, value);
|
||||
}
|
||||
if (SHOW_SAMPLE && cp == 0x221) {
|
||||
System.out.println("Sample Result:");
|
||||
System.out.println(charEntry);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public void setField(UData uData, String fieldName, String fieldValue) {
|
||||
try {
|
||||
if (fieldName.equals("n")) {
|
||||
uData.name = fieldValue;
|
||||
} else if (fieldName.equals("dm")) {
|
||||
uData.decompositionMapping = fieldValue;
|
||||
} else if (fieldName.equals("bg")) {
|
||||
uData.bidiMirror = fieldValue;
|
||||
} else if (fieldName.equals("uc")) {
|
||||
uData.simpleUppercase = fieldValue;
|
||||
} else if (fieldName.equals("lc")) {
|
||||
uData.simpleLowercase = fieldValue;
|
||||
} else if (fieldName.equals("tc")) {
|
||||
uData.simpleTitlecase = fieldValue;
|
||||
|
||||
} else if (fieldName.equals("su")) {
|
||||
uData.fullUppercase = fieldValue;
|
||||
} else if (fieldName.equals("sl")) {
|
||||
if (DEBUG) System.out.println("Setting full lowercase to " + Utility.hex(fieldValue) + uData);
|
||||
uData.fullLowercase = fieldValue;
|
||||
} else if (fieldName.equals("st")) {
|
||||
uData.fullTitlecase = fieldValue;
|
||||
|
||||
} else if (fieldName.equals("sc")) {
|
||||
if (uData.specialCasing.length() > 0) {
|
||||
uData.specialCasing += ";";
|
||||
}
|
||||
uData.specialCasing += fieldValue;
|
||||
|
||||
} else if (fieldName.equals("xp")) {
|
||||
uData.binaryProperties |= 1L << Utility.lookup(fieldValue, UCD_Names.BP, true);
|
||||
//UCD_Names.BP_OLD
|
||||
|
||||
} else if (fieldName.equals("gc")) {
|
||||
uData.generalCategory = Utility.lookup(fieldValue, UCD_Names.GENERAL_CATEGORY, true);
|
||||
// if (major >= 5 && uData.script == Unknown_Script
|
||||
// && uData.generalCategory != Cn
|
||||
// && uData.generalCategory != Cs
|
||||
// && uData.generalCategory != Co) {
|
||||
// uData.script = COMMON_SCRIPT;
|
||||
// System.out.println("Resetting to Common Script: " + Utility.hex(uData.codePoint));
|
||||
// }
|
||||
} else if (fieldName.equals("bc")) {
|
||||
uData.bidiClass = Utility.lookup(fieldValue, UCD_Names.BIDI_CLASS, true);
|
||||
} else if (fieldName.equals("dt")) {
|
||||
if (major < 2) {
|
||||
if (fieldValue.equals("no-break")) fieldValue = "noBreak";
|
||||
else if (fieldValue.equals("circled")) fieldValue = "circle";
|
||||
else if (fieldValue.equals("sup")) fieldValue = "super";
|
||||
else if (fieldValue.equals("break")) fieldValue = "compat";
|
||||
else if (fieldValue.equals("font variant")) fieldValue = "font";
|
||||
else if (fieldValue.equals("no-join")) fieldValue = "compat";
|
||||
else if (fieldValue.equals("join")) fieldValue = "compat";
|
||||
}
|
||||
uData.decompositionType = Utility.lookup(fieldValue, UCD_Names.LONG_DECOMPOSITION_TYPE, true);
|
||||
} else if (fieldName.equals("nt")) {
|
||||
uData.numericType = Utility.lookup(fieldValue, UCD_Names.LONG_NUMERIC_TYPE, true);
|
||||
|
||||
} else if (fieldName.equals("ea")) {
|
||||
uData.eastAsianWidth = Utility.lookup(fieldValue, UCD_Names.EAST_ASIAN_WIDTH, true);
|
||||
} else if (fieldName.equals("lb")) {
|
||||
uData.lineBreak = Utility.lookup(fieldValue, UCD_Names.LINE_BREAK, true);
|
||||
|
||||
} else if (fieldName.equals("sn")) {
|
||||
uData.script = Utility.lookup(fieldValue, UCD_Names.LONG_SCRIPT, true);
|
||||
|
||||
} else if (fieldName.equals("jt")) {
|
||||
uData.joiningType = Utility.lookup(fieldValue, UCD_Names.JOINING_TYPE, true);
|
||||
} else if (fieldName.equals("jg")) {
|
||||
byte temp = (byte)Utility.find(fieldValue, UCD_Names.OLD_JOINING_GROUP, true);
|
||||
if (temp != -1) uData.joiningGroup = temp;
|
||||
else uData.joiningGroup = Utility.lookup(fieldValue, UCD_Names.JOINING_GROUP, true);
|
||||
|
||||
} else if (fieldName.equals("nv")) {
|
||||
if (major < 2) {
|
||||
if (fieldValue.equals("-")) return;
|
||||
}
|
||||
uData.numericValue = Utility.doubleFrom(fieldValue);
|
||||
} else if (fieldName.equals("cc")) {
|
||||
uData.combiningClass = (byte)Utility.intFrom(fieldValue);
|
||||
if (uData.combiningClass == 9 && major >= 5) {
|
||||
System.out.println("setting Grapheme_Link " + Utility.hex(uData.codePoint) + "\t" + uData.name);
|
||||
uData.binaryProperties |= (1<<GraphemeLink);
|
||||
System.out.println(uData);
|
||||
}
|
||||
} else if (fieldName.equals("bp")) {
|
||||
uData.binaryProperties = (byte)Utility.longFrom(fieldValue);
|
||||
// if (major >= 5 && (uData.binaryProperties & 1<<Noncharacter_Code_Point) != 0) {
|
||||
// uData.script = Unknown_Script;
|
||||
// }
|
||||
System.out.println("Resetting: " + uData);
|
||||
} else {
|
||||
throw new IllegalArgumentException("Unknown fieldName");
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new ChainException(
|
||||
"Bad field name= \"{0}\", value= \"{1}\"", new Object[] {fieldName, fieldValue}, e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -1,93 +0,0 @@
|
||||
package com.ibm.text.UCD;
|
||||
import com.ibm.text.utility.*;
|
||||
import java.util.Date;
|
||||
import java.text.DateFormat;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.TimeZone;
|
||||
|
||||
|
||||
public final class Default implements UCD_Types {
|
||||
|
||||
private static String ucdVersion = UCD.latestVersion;
|
||||
private static UCD ucd;
|
||||
private static Normalizer nfc;
|
||||
private static Normalizer nfd;
|
||||
private static Normalizer nfkc;
|
||||
private static Normalizer nfkd;
|
||||
private static Normalizer[] nf = new Normalizer[4];
|
||||
private static String year;
|
||||
|
||||
public static void setUCD(String version) {
|
||||
ucdVersion = version;
|
||||
setUCD();
|
||||
}
|
||||
|
||||
private static boolean inRecursiveCall = false;
|
||||
private static void setUCD() {
|
||||
if (inRecursiveCall) {
|
||||
throw new IllegalArgumentException("Recursive call to setUCD");
|
||||
}
|
||||
inRecursiveCall = true;
|
||||
ucd = UCD.make(ucdVersion);
|
||||
nfd = nf[NFD] = new Normalizer(Normalizer.NFD, ucdVersion());
|
||||
nfc = nf[NFC] = new Normalizer(Normalizer.NFC, ucdVersion());
|
||||
nfkd = nf[NFKD] = new Normalizer(Normalizer.NFKD, ucdVersion());
|
||||
nfkc = nf[NFKC] = new Normalizer(Normalizer.NFKC, ucdVersion());
|
||||
System.out.println("Loaded UCD" + ucd().getVersion() + " " + (new Date(ucd().getDate())));
|
||||
inRecursiveCall = false;
|
||||
}
|
||||
|
||||
static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd', 'HH:mm:ss' GMT'");
|
||||
static DateFormat yearFormat = new SimpleDateFormat("yyyy");
|
||||
|
||||
static {
|
||||
myDateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
|
||||
year = yearFormat.format(new Date());
|
||||
}
|
||||
|
||||
public static String getDate() {
|
||||
return myDateFormat.format(new Date());
|
||||
}
|
||||
|
||||
public static String getYear() {
|
||||
return year;
|
||||
}
|
||||
|
||||
public static String ucdVersion() {
|
||||
if (ucd == null) setUCD();
|
||||
return ucdVersion;
|
||||
}
|
||||
|
||||
public static UCD ucd() {
|
||||
if (ucd == null) setUCD();
|
||||
return ucd;
|
||||
}
|
||||
public static Normalizer nfc() {
|
||||
if (ucd == null) setUCD();
|
||||
return nfc;
|
||||
}
|
||||
public static Normalizer nfd() {
|
||||
if (ucd == null) setUCD();
|
||||
return nfd;
|
||||
}
|
||||
public static Normalizer nfkc() {
|
||||
if (ucd == null) setUCD();
|
||||
return nfkc;
|
||||
}
|
||||
public static Normalizer nfkd() {
|
||||
if (ucd == null) setUCD();
|
||||
return nfkd;
|
||||
}
|
||||
public static Normalizer nf(int index) {
|
||||
if (ucd == null) setUCD();
|
||||
return nf[index];
|
||||
}
|
||||
|
||||
/**
|
||||
* @param lineValue
|
||||
*/
|
||||
public static void setYear(String lineValue) {
|
||||
year = lineValue;
|
||||
}
|
||||
|
||||
}
|
@ -1,29 +0,0 @@
|
||||
#
|
||||
# Unicode Character Database: Derived Property Data
|
||||
# This file shows when various code points were first assigned in Unicode.
|
||||
#
|
||||
# Caution: When using the Age *property*, all assigned code points
|
||||
# in each version are included, not just the newly assigned code points.
|
||||
# For more information, see http://www.unicode.org/reports/tr18/
|
||||
#
|
||||
# Notes:
|
||||
#
|
||||
# - The term 'assigned' means that a previously reserved code point was assigned
|
||||
# to be a character (graphic, format, control, or private-use);
|
||||
# a noncharacter code point; or a surrogate code point.
|
||||
# For more information, see The Unicode Standard Section 2.4
|
||||
#
|
||||
# - Versions are only tracked from 1.1 onwards, since version 1.0
|
||||
# predated changes required by the ISO 10646 merger.
|
||||
#
|
||||
# - The Hangul Syllables that were removed from 2.0 are not included in the 1.1 listing.
|
||||
#
|
||||
# - The supplementary private use code points and the non-character code points
|
||||
# were assigned in version 2.0, but not specifically listed in the UCD
|
||||
# until versions 3.0 and 3.1 respectively.
|
||||
#
|
||||
# - Contiguous ranges are broken into separate lines where they would cross code point
|
||||
# types: graphic, format, control, private-use, surrogate, noncharacter
|
||||
#
|
||||
# For details on the contents of each version, see
|
||||
# http://www.unicode.org/versions/enumeratedversions.html.
|
@ -1,982 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
|
||||
* $Date: 2004/03/11 19:03:17 $
|
||||
* $Revision: 1.26 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import java.util.*;
|
||||
import java.io.PrintWriter;
|
||||
|
||||
public final class DerivedProperty implements UCD_Types {
|
||||
|
||||
UCD ucdData;
|
||||
Normalizer nfc;
|
||||
Normalizer nfd;
|
||||
Normalizer nfkc;
|
||||
Normalizer nfkd;
|
||||
Normalizer[] nf = new Normalizer[4];
|
||||
UnicodeSet XID_Start_Set = new UnicodeSet();
|
||||
UnicodeSet XID_Continue_Set = new UnicodeSet();
|
||||
|
||||
// ADD CONSTANT to UCD_TYPES
|
||||
|
||||
static public UCDProperty make(int derivedPropertyID) {
|
||||
return make(derivedPropertyID, Default.ucd());
|
||||
}
|
||||
|
||||
static public UCDProperty make(int derivedPropertyID, UCD ucd) {
|
||||
if (derivedPropertyID < 0 || derivedPropertyID >= DERIVED_PROPERTY_LIMIT) return null;
|
||||
DerivedProperty dp = getCached(ucd);
|
||||
return dp.dprops[derivedPropertyID];
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
|
||||
static Map cache = new HashMap();
|
||||
static UCD lastUCD = null;
|
||||
static DerivedProperty lastValue = null;
|
||||
|
||||
private static DerivedProperty getCached(UCD ucd) {
|
||||
if (ucd.equals(lastUCD)) return lastValue;
|
||||
DerivedProperty dp = (DerivedProperty) cache.get(ucd);
|
||||
if (dp == null) {
|
||||
dp = new DerivedProperty(ucd);
|
||||
cache.put(ucd, dp);
|
||||
}
|
||||
lastUCD = ucd;
|
||||
lastValue = dp;
|
||||
return dp;
|
||||
}
|
||||
|
||||
/*
|
||||
public String getHeader(int propNumber) {
|
||||
UnicodeProperty dp = dprops[propNumber];
|
||||
if (dp != null) return dp.getHeader();
|
||||
else return "Unimplemented!!";
|
||||
}
|
||||
|
||||
public String getName(int propNumber, byte style) {
|
||||
UnicodeProperty dp = dprops[propNumber];
|
||||
if (dp != null) return dp.getName(style);
|
||||
else return "Unimplemented!!";
|
||||
}
|
||||
|
||||
public String getValue(int cp, int propNumber) {
|
||||
UnicodeProperty dp = dprops[propNumber];
|
||||
if (dp != null) return dp.getValue(cp);
|
||||
else return "Unimplemented!!";
|
||||
}
|
||||
|
||||
public boolean isTest(int propNumber) {
|
||||
if (!isDefined(propNumber)) return false;
|
||||
return dprops[propNumber].isTest();
|
||||
}
|
||||
|
||||
public boolean hasProperty(int cp, int propNumber) {
|
||||
if (!isDefined(propNumber)) return false;
|
||||
return dprops[propNumber].hasProperty(cp);
|
||||
}
|
||||
|
||||
public boolean valueVaries(int propNumber) {
|
||||
return dprops[propNumber].valueVaries();
|
||||
}
|
||||
/*
|
||||
public String getValue(int cp, int propNumber) {
|
||||
return dprops[propNumber].getValue(int cp);
|
||||
}
|
||||
*/
|
||||
// Derived properties of this instance, indexed by derived-property ID (see make).
private UCDProperty[] dprops = new UCDProperty[50];
|
||||
|
||||
static final String[] CaseNames = {
|
||||
"Uppercase",
|
||||
"Lowercase",
|
||||
"Mixedcase"};
|
||||
|
||||
class ExDProp extends UCDProperty {
|
||||
Normalizer nfx;
|
||||
ExDProp(int i) {
|
||||
type = DERIVED_NORMALIZATION;
|
||||
nfx = nf[i];
|
||||
name = "Expands_On_" + nfx.getName();
|
||||
shortName = "XO_" + nfx.getName();
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated according to UAX #15."
|
||||
+ "\r\n# Characters whose normalized length is not one."
|
||||
+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
|
||||
+ "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";
|
||||
}
|
||||
public boolean hasValue(int cp) {
|
||||
if (ucdData.getDecompositionType(cp) == NONE) return false;
|
||||
String norm = nfx.normalize(cp);
|
||||
if (UTF16.countCodePoint(norm) != 1) return true;
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
class NF_UnsafeStartProp extends UCDProperty {
|
||||
Normalizer nfx;
|
||||
//int prop;
|
||||
|
||||
NF_UnsafeStartProp(int i) {
|
||||
isStandard = false;
|
||||
type = DERIVED_NORMALIZATION;
|
||||
nfx = nf[i];
|
||||
name = nfx.getName() + "_UnsafeStart";
|
||||
shortName = nfx.getName() + "_SS";
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated according to UAX #15."
|
||||
+ "\r\n# Characters that are cc==0, BUT which may interact with previous characters."
|
||||
;
|
||||
}
|
||||
public boolean hasValue(int cp) {
|
||||
if (ucdData.getCombiningClass(cp) != 0) return false;
|
||||
String norm = nfx.normalize(cp);
|
||||
int first = UTF16.charAt(norm, 0);
|
||||
if (ucdData.getCombiningClass(first) != 0) return true;
|
||||
if (nfx.isComposition()
|
||||
&& dprops[NFC_TrailingZero].hasValue(first)) return true; // 1,3 == composing
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
class HangulSyllableType extends UnicodeProperty {
|
||||
Normalizer nfx;
|
||||
//int prop;
|
||||
|
||||
HangulSyllableType(int i) {
|
||||
isStandard = false;
|
||||
type = DERIVED_NORMALIZATION;
|
||||
nfx = nf[i];
|
||||
name = nfx.getName() + "_UnsafeStart";
|
||||
shortName = nfx.getName() + "_SS";
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated according to UAX #15."
|
||||
+ "\r\n# Characters that are cc==0, BUT which may interact with previous characters."
|
||||
;
|
||||
}
|
||||
public boolean hasValue(int cp) {
|
||||
if (ucdData.getCombiningClass(cp) != 0) return false;
|
||||
String norm = nfx.normalize(cp);
|
||||
int first = UTF16.charAt(norm, 0);
|
||||
if (ucdData.getCombiningClass(first) != 0) return true;
|
||||
if (nfx.isComposition()
|
||||
&& dprops[NFC_TrailingZero].hasValue(first)) return true; // 1,3 == composing
|
||||
return false;
|
||||
}
|
||||
};
|
||||
*/
|
||||
|
||||
|
||||
class NFC_Prop extends UCDProperty {
|
||||
BitSet bitset;
|
||||
boolean filter = false;
|
||||
boolean keepNonZero = true;
|
||||
|
||||
NFC_Prop(int i) {
|
||||
isStandard = false;
|
||||
type = DERIVED_NORMALIZATION;
|
||||
BitSet[] bitsets = new BitSet[3];
|
||||
switch(i) {
|
||||
case NFC_Leading: bitsets[0] = bitset = new BitSet(); break;
|
||||
case NFC_Resulting: bitsets[2] = bitset = new BitSet(); break;
|
||||
case NFC_TrailingZero: keepNonZero = false; // FALL THRU
|
||||
case NFC_TrailingNonZero: bitsets[1] = bitset = new BitSet(); break;
|
||||
}
|
||||
filter = bitsets[1] != null;
|
||||
nfc.getCompositionStatus(bitsets[0], bitsets[1], bitsets[2]);
|
||||
|
||||
name = Names[i-NFC_Leading];
|
||||
shortName = SNames[i-NFC_Leading];
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# " + Description[i-NFC_Leading]
|
||||
+ "\r\n# NFKC characters are the same, after subtracting the NFKD = NO values."
|
||||
+ "\r\n# Generated according to UAX #15."
|
||||
+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
|
||||
+ "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";
|
||||
}
|
||||
public boolean hasValue(int cp) {
|
||||
boolean result = bitset.get(cp);
|
||||
if (result && filter) {
|
||||
result = (ucdData.getCombiningClass(cp) != 0) == keepNonZero;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
final String[] Names = {"NFC_Leading", "NFC_TrailingNonZero", "NFC_TrailingZero", "NFC_Resulting"};
|
||||
final String[] SNames = {"NFC_L", "NFC_TNZ", "NFC_TZ", "NFC_R"};
|
||||
final String[] Description = {
|
||||
"Characters that can combine with following characters in NFC",
|
||||
"Characters that can combine with previous characters in NFC, and have non-zero combining class",
|
||||
"Characters that can combine with previous characters in NFC, and have zero combining class",
|
||||
"Characters that can result from a combination of other characters in NFC",
|
||||
};
|
||||
};
|
||||
|
||||
class GenDProp extends UCDProperty {
|
||||
Normalizer nfx;
|
||||
Normalizer nfComp = null;
|
||||
|
||||
GenDProp (int i) {
|
||||
isStandard = false;
|
||||
setValueType(STRING_PROP);
|
||||
type = DERIVED_NORMALIZATION;
|
||||
nfx = nf[i];
|
||||
name = nfx.getName();
|
||||
String compName = "the character itself";
|
||||
|
||||
if (i == NFKC || i == NFD) {
|
||||
name += "-NFC";
|
||||
nfComp = nfc;
|
||||
compName = "NFC for the character";
|
||||
} else if (i == NFKD) {
|
||||
name += "-NFD";
|
||||
nfComp = nfd;
|
||||
compName = "NFD for the character";
|
||||
}
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Lists characters in normalized form " + nfx.getName() + "."
|
||||
+ "\r\n# Only those characters whith normalized forms are DIFFERENT from " + compName + " are listed!"
|
||||
+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
|
||||
+ "\r\n# It is NOT sufficient to replace characters one-by-one with these results!";
|
||||
}
|
||||
|
||||
int cacheCp = 0;
|
||||
String cacheStr = "";
|
||||
|
||||
public String getValue(int cp, byte style) {
|
||||
if (cacheCp == cp) return cacheStr;
|
||||
cacheCp = cp;
|
||||
cacheStr = "";
|
||||
|
||||
if (ucdData.getDecompositionType(cp) != NONE) {
|
||||
String cps = UTF32.valueOf32(cp);
|
||||
String comp = cps;
|
||||
if (nfComp != null) {
|
||||
comp = nfComp.normalize(comp);
|
||||
}
|
||||
String normal = nfx.normalize(cps);
|
||||
if (!comp.equals(normal)) {
|
||||
String norm = Utility.hex(normal);
|
||||
String pad = Utility.repeat(" ", 14-norm.length());
|
||||
cacheStr = name + "; " + norm + pad;
|
||||
}
|
||||
}
|
||||
|
||||
return cacheStr;
|
||||
//if (cp >= 0xAC00 && cp <= 0xD7A3) return true;
|
||||
//System.out.println(Utility.hex(cps) + " => " + Utility.hex(nf[i-4].normalize(cps)));
|
||||
} // default
|
||||
public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
|
||||
};
|
||||
|
||||
class CaseDProp extends UCDProperty {
|
||||
byte val;
|
||||
CaseDProp (int i) {
|
||||
type = DERIVED_CORE;
|
||||
isStandard = false;
|
||||
val = (i == Missing_Uppercase ? Lu : i == Missing_Lowercase ? Ll : Lt);
|
||||
name = "Possible_Missing_" + CaseNames[i-Missing_Uppercase];
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: NFKD has >0 " + CaseNames[i-Missing_Uppercase] + ", no other cases";
|
||||
}
|
||||
public boolean hasValue(int cp) {
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
if (cat == val
|
||||
|| val != Lt && ucdData.getBinaryProperty(cp, Other_Uppercase)) return false;
|
||||
byte xCat = getDecompCat(cp);
|
||||
if (xCat == val) return true;
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
class QuickDProp extends UCDProperty {
|
||||
String NO;
|
||||
String MAYBE;
|
||||
Normalizer nfx;
|
||||
QuickDProp (int i) {
|
||||
//setValueType((i == NFC || i == NFKC) ? ENUMERATED_PROP : BINARY_PROP);
|
||||
setValueType(ENUMERATED_PROP);
|
||||
type = DERIVED_NORMALIZATION;
|
||||
nfx = nf[i];
|
||||
NO = nfx.getName() + "_NO";
|
||||
MAYBE = nfx.getName() + "_MAYBE";
|
||||
name = nfx.getName() + "_QuickCheck";
|
||||
shortName = nfx.getName() + "_QC";
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from computing decomposibles"
|
||||
+ ((i == NFC || i == NFKC)
|
||||
? " (and characters that may compose with previous ones)" : "");
|
||||
}
|
||||
|
||||
public String getValue(int cp, byte style) {
|
||||
if (!nfx.isNormalized(cp)) return NO;
|
||||
else if (nfx.isTrailing(cp)) return MAYBE;
|
||||
else return "";
|
||||
}
|
||||
|
||||
public String getListingValue(int cp) {
|
||||
return getValue(cp, LONG);
|
||||
}
|
||||
|
||||
public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
|
||||
};
|
||||
|
||||
private DerivedProperty(UCD ucd) {
|
||||
ucdData = ucd;
|
||||
|
||||
// One Normalizer per form, built against this UCD's version; each is kept
// both in the nf[] table and in its named field.
nf[NFD] = new Normalizer(Normalizer.NFD, ucdData.getVersion());
nfd = nf[NFD];
nf[NFC] = new Normalizer(Normalizer.NFC, ucdData.getVersion());
nfc = nf[NFC];
nf[NFKD] = new Normalizer(Normalizer.NFKD, ucdData.getVersion());
nfkd = nf[NFKD];
nf[NFKC] = new Normalizer(Normalizer.NFKC, ucdData.getVersion());
nfkc = nf[NFKC];

// Families of per-form properties; each is constructed with a zero-based form index.
for (int form = 0; form <= ExpandsOnNFKC - ExpandsOnNFD; ++form) {
    dprops[ExpandsOnNFD + form] = new ExDProp(form);
}

for (int form = 0; form <= GenNFKC - GenNFD; ++form) {
    dprops[GenNFD + form] = new GenDProp(form);
}

// NFC_Prop takes the property index itself, not an offset.
for (int prop = NFC_Leading; prop <= NFC_Resulting; ++prop) {
    dprops[prop] = new NFC_Prop(prop);
}

for (int form = 0; form <= NFKC_UnsafeStart - NFD_UnsafeStart; ++form) {
    dprops[NFD_UnsafeStart + form] = new NF_UnsafeStartProp(form);
}
|
||||
|
||||
// ID_Start: delegates to the UCD's identifier-start test.
dprops[ID_Start] = new UCDProperty() {
    {
        name = "ID_Start";
        shortName = "IDS";
        type = DERIVED_CORE;
        String title = "# Derived Property: " + name;
        header = title
            + "\r\n# Characters that can start an identifier."
            + "\r\n# Generated from Lu+Ll+Lt+Lm+Lo+Nl+Other_ID_Start";
    }

    public boolean hasValue(int cp) {
        boolean isStart = ucdData.isIdentifierStart(cp);
        return isStart;
    }
};
|
||||
|
||||
// ID_Continue (without Cf): delegates to the UCD's identifier-continue test.
dprops[ID_Continue_NO_Cf] = new UCDProperty() {
    {
        type = DERIVED_CORE;
        name = "ID_Continue";
        shortName = "IDC";
        String title = "# Derived Property: " + name;
        header = title
            + "\r\n# Characters that can continue an identifier."
            + "\r\n# Generated from: ID_Start + Mn+Mc+Nd+Pc + Other_ID_Continue"
            + "\r\n# NOTE: Cf characters should be filtered out.";
    }

    public boolean hasValue(int cp) {
        boolean isContinue = ucdData.isIdentifierContinue_NO_Cf(cp);
        return isContinue;
    }
};
|
||||
|
||||
// Scratch buffer reused for every NFKD expansion below.
StringBuffer tempBuf = new StringBuffer();

// special hack for middle dot
XID_Continue_Set.add(0x00B7);

// Build XID_Start_Set / XID_Continue_Set: start from the plain identifier
// status of each assigned code point and reconcile it with the status of
// its NFKD expansion.
// The range is inclusive of U+10FFFF (the original stopped one short).
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
    // skip cases that can't matter
    if (!ucdData.isAssigned(cp)) continue;

    // find out normal status: 0 = neither, 1 = start, 2 = continue only
    int status = 0;
    if (ucdData.isIdentifierStart(cp)) status = 1;
    else if (ucdData.isIdentifierContinue_NO_Cf(cp)) status = 2;

    if (status != 0 && !nfkd.isNormalized(cp)) {
        // now find out NFKD status
        // if it is <start><extend>*, then it is start
        // else if it is <extend>*, then it is extend
        // else it is nothing
        int status2 = 0;
        tempBuf.setLength(0);
        nfkd.normalize(UTF32.valueOf32(cp), tempBuf);

        // Step by the width of the character just read (cp2). The original
        // stepped by the width of cp, which walks the buffer incorrectly when
        // cp and its expansion mix BMP and supplementary characters.
        int cp2;
        for (int i = 0; i < tempBuf.length(); i += UTF32.count16(cp2)) {
            cp2 = UTF32.char32At(tempBuf, i);
            if (i == 0) {
                if (ucdData.isIdentifierStart(cp2)) status2 = 1;
                else if (ucdData.isIdentifierContinue_NO_Cf(cp2)) status2 = 2;
                else {
                    status2 = 0;
                    break;
                }
            } else if (!ucdData.isIdentifierContinue_NO_Cf(cp2) && cp2 != 0xB7) {
                status2 = 0;
                break;
            }
        }

        // Now see if the statuses are compatible.
        if (status != status2) {
            if (status2 == 0) status = 0;
            else if (status2 > status) status = status2;
        }
    }

    if (status == 1) XID_Start_Set.add(cp);
    if (status != 0) XID_Continue_Set.add(cp);
}
|
||||
|
||||
// XID_Start: membership in the set computed by the closure loop above.
dprops[Mod_ID_Start] = new UCDProperty() {
    {
        name = "XID_Start";
        shortName = "XIDS";
        type = DERIVED_CORE;
        String title = "# Derived Property: " + name;
        header = title
            + "\r\n# ID_Start modified for closure under NFKx"
            + "\r\n# Modified as described in UAX #15"
            + "\r\n# NOTE: Does NOT remove the non-NFKx characters."
            + "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
    }

    public boolean hasValue(int cp) {
        boolean member = XID_Start_Set.contains(cp);
        return member;
    }
};
|
||||
|
||||
// XID_Continue: membership in the set computed by the closure loop above.
dprops[Mod_ID_Continue_NO_Cf] = new UCDProperty() {
    {
        name = "XID_Continue";
        shortName = "XIDC";
        type = DERIVED_CORE;
        String title = "# Derived Property: " + name;
        header = title
            + "\r\n# Mod_ID_Continue modified for closure under NFKx"
            + "\r\n# Modified as described in UAX #15"
            + "\r\n# NOTE: Cf characters should be filtered out."
            + "\r\n# NOTE: Does NOT remove the non-NFKx characters."
            + "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
    }

    public boolean hasValue(int cp) {
        boolean member = XID_Continue_Set.contains(cp);
        return member;
    }
};
|
||||
|
||||
// Math: category Sm or the Math_Property binary property.
dprops[PropMath] = new UCDProperty() {
    {
        type = DERIVED_CORE;
        name = "Math";
        shortName = name;
        String title = "# Derived Property: " + name;
        header = title + "\r\n# Generated from: Sm + Other_Math";
    }

    public boolean hasValue(int cp) {
        return ucdData.getCategory(cp) == Sm
            || ucdData.getBinaryProperty(cp,Math_Property);
    }
};
|
||||
|
||||
// Alphabetic: any letter category, Nl, or Other_Alphabetic.
dprops[PropAlphabetic] = new UCDProperty() {
    {
        type = DERIVED_CORE;
        name = "Alphabetic";
        shortName = "Alpha";
        String title = "# Derived Property: " + name;
        header = title + "\r\n# Generated from: Lu+Ll+Lt+Lm+Lo+Nl + Other_Alphabetic";
    }

    public boolean hasValue(int cp) {
        byte cat = ucdData.getCategory(cp);
        boolean letterLike = cat == Lu || cat == Ll || cat == Lt
            || cat == Lm || cat == Lo || cat == Nl;
        return letterLike || ucdData.getBinaryProperty(cp, Other_Alphabetic);
    }
};
|
||||
|
||||
// Lowercase: category Ll or Other_Lowercase.
dprops[PropLowercase] = new UCDProperty() {
    {
        type = DERIVED_CORE;
        name = "Lowercase";
        shortName = "Lower";
        String title = "# Derived Property: " + name;
        header = title + "\r\n# Generated from: Ll + Other_Lowercase";
    }

    public boolean hasValue(int cp) {
        return ucdData.getCategory(cp) == Ll
            || ucdData.getBinaryProperty(cp, Other_Lowercase);
    }
};
|
||||
|
||||
// Uppercase: category Lu or Other_Uppercase.
dprops[PropUppercase] = new UCDProperty() {
    {
        type = DERIVED_CORE;
        name = "Uppercase";
        shortName = "Upper";
        String title = "# Derived Property: " + name;
        header = title + "\r\n# Generated from: Lu + Other_Uppercase";
    }

    public boolean hasValue(int cp) {
        return ucdData.getCategory(cp) == Lu
            || ucdData.getBinaryProperty(cp, Other_Uppercase);
    }
};
|
||||
|
||||
for (int i = Missing_Uppercase; i <= Missing_Mixedcase; ++i) {
|
||||
dprops[i] = new CaseDProp(i);
|
||||
}
|
||||
|
||||
/*
|
||||
(3) Singleton Decompositions: characters that can be derived from the UnicodeData file by
|
||||
including all characters whose canonical decomposition consists of a single character.
|
||||
(4) Non-Starter Decompositions: characters that can be derived from the UnicodeData
|
||||
file by including all characters whose canonical decomposition consists of a sequence
|
||||
of characters, the first of which has a non-zero combining class.
|
||||
*/
|
||||
// Full_Composition_Exclusion: represented characters with a canonical
// decomposition for which isCompEx(cp) is true.
dprops[FullCompExclusion] = new UCDProperty() {
    {
        type = DERIVED_NORMALIZATION;
        name = "Full_Composition_Exclusion";
        shortName = "Comp_Ex";
        defaultPropertyStyle = SHORT;
        defaultValueStyle = SHORT;
        String title = "# Derived Property: " + name;
        header = title
            + "\r\n# Generated from: Composition Exclusions + Singletons + Non-Starter Decompositions";
    }

    public boolean hasValue(int cp) {
        if (!ucdData.isRepresented(cp)) return false;
        if (ucdData.getDecompositionType(cp) != CANONICAL) return false;
        return isCompEx(cp);
    }
};
|
||||
|
||||
// Full_Composition_Inclusion (non-standard): represented characters with a
// canonical decomposition that are NOT excluded from composition.
dprops[FullCompInclusion] = new UCDProperty() {
    {
        isStandard = false;
        type = DERIVED_NORMALIZATION;
        name = "Full_Composition_Inclusion";
        shortName = "Comp_In";
        defaultValueStyle = defaultPropertyStyle = SHORT;
        header = "# Derived Property: " + name
            + ": Full Composition Inclusion"
            + "\r\n# characters with Canonical Decompositions MINUS Full Composition Exclusion";
    }

    /**
     * True for represented characters that have a canonical decomposition
     * and for which isCompEx(cp) is false.
     */
    public boolean hasValue(int cp) {
        if (!ucdData.isRepresented(cp)) return false;
        byte dtype = ucdData.getDecompositionType(cp);
        if (dtype != CANONICAL) return false;

        // The header defines this set as canonical decomposables MINUS the
        // exclusions. The original returned isCompEx(cp), which made this
        // property identical to Full_Composition_Exclusion.
        return !isCompEx(cp);
    }
};
|
||||
|
||||
// FC_NFKC_Closure: lists c = NFKC(Fold(b)) where b = NFKC(Fold(cp)), when c differs from b.
dprops[FC_NFKC_Closure] = new UCDProperty() {
    {
        type = DERIVED_NORMALIZATION;
        setValueType(STRING_PROP);
        name = "FC_NFKC_Closure";
        shortName = "FC_NFKC";
        String title = "# Derived Property: " + name;
        header = title
            + "\r\n# Generated from computing: b = NFKC(Fold(a)); c = NFKC(Fold(b));"
            + "\r\n# Then if (c != b) add the mapping from a to c to the set of"
            + "\r\n# mappings that constitute the FC_NFKC_Closure list"
            + "\r\n# Uses the full case folding from CaseFolding.txt, without the T option.";
    }

    public String getValue(int cp, byte style) {
        if (!ucdData.isRepresented(cp)) return "";
        String once = nfkc.normalize(fold(cp));
        String twice = nfkc.normalize(fold(once));
        // Stable after one round: nothing to list.
        return twice.equals(once) ? "" : "FNC; " + Utility.hex(twice);
    }

    public boolean hasValue(int cp) {
        return getValue(cp).length() != 0;
    }
};
|
||||
|
||||
// FC_NFC_Closure (non-standard): lists c = NFC(Fold(b)) where b = NFC(Fold(cp)), when c differs from b.
dprops[FC_NFC_Closure] = new UCDProperty() {
    {
        type = DERIVED_NORMALIZATION;
        isStandard = false;
        name = "FC_NFC_Closure";
        setValueType(STRING_PROP);
        shortName = "FC_NFC";
        String title = "# Derived Property: " + name;
        header = title
            + "\r\n# Generated from computing: b = NFC(Fold(a)); c = NFC(Fold(b));"
            + "\r\n# Then if (c != b) add the mapping from a to c to the set of"
            + "\r\n# mappings that constitute the FC_NFC_Closure list"
            + "\r\n# Uses the full case folding from CaseFolding.txt, without the T option.";
    }

    public String getValue(int cp, byte style) {
        if (!ucdData.isRepresented(cp)) return "";
        String once = nfc.normalize(fold(cp));
        String twice = nfc.normalize(fold(once));
        // Stable after one round: nothing to list.
        return twice.equals(once) ? "" : "FN; " + Utility.hex(twice);
    }

    public boolean hasValue(int cp) {
        return getValue(cp).length() != 0;
    }
};
|
||||
|
||||
for (int i = QuickNFD; i <= QuickNFKC; ++i) {
|
||||
dprops[i] = new QuickDProp(i - QuickNFD);
|
||||
}
|
||||
|
||||
// Default_Ignorable_Code_Point; the definition depends on whether the UCD's
// composite version is above 0x040000.
dprops[DefaultIgnorable] = new UCDProperty() {
    {
        type = DERIVED_CORE;
        name = "Default_Ignorable_Code_Point";
        hasUnassigned = true;
        shortName = "DI";
        // The header is computed on demand in getHeader().
        header = null;
    }

    public String getHeader() {
        String title = "# Derived Property: " + name;
        if (ucdData.getCompositeVersion() <= 0x040000) {
            return title
                + "\r\n# Generated from (Other_Default_Ignorable_Code_Point + Cf + Cc + Cs) - White_Space";
        }
        return title
            + "\r\n# Generated from (Other_Default_Ignorable_Code_Point + Variation_Selector"
            + "\r\n# + Noncharacter_Code_Point + Cf + Cc + Cs) - White_Space"
            + "\r\n# - U+FFF9..U+FFFB// INTERLINEAR ANNOTATION characters";
    }

    public boolean hasValue(int cp) {
        if (ucdData.getBinaryProperty(cp, White_space)) return false;
        if (ucdData.getBinaryProperty(cp, Other_Default_Ignorable_Code_Point)) return true;

        boolean newerRules = ucdData.getCompositeVersion() > 0x040000;

        // Newer versions drop the interlinear annotation characters.
        if (newerRules && cp >= 0xFFF9 && cp <= 0xFFFB) return false;

        byte cat = ucdData.getCategory(cp);
        if (cat == Cf || cat == Cs || cat == Cc) return true;

        // The remaining sources only apply to the newer definition.
        if (!newerRules) return false;

        return ucdData.getBinaryProperty(cp, Variation_Selector)
            || ucdData.getBinaryProperty(cp, Noncharacter_Code_Point);
    }
};
|
||||
|
||||
// Case_Sensitive (non-standard): every character appearing on either side of
// a case mapping. The set is built lazily on the first hasValue() call and a
// diagnostic log is written to Case_Sensitive_Log.txt while building.
dprops[Case_Sensitive] = new UCDProperty() {
    {
        type = DERIVED_CORE;
        isStandard = false;
        name = "Case_Sensitive";
        hasUnassigned = false;
        shortName = "CS";
        header = "# Derived Property: " + name
            + "\r\n# Generated from all characters that are either on the right or left side of a case mapping";
    }

    // Lazily built result set; null until the cache has been built successfully.
    UnicodeSet case_sensitive = null;
    // Scratch set reused by addCase.
    UnicodeSet tempSet = new UnicodeSet();
    // Lowercase + Uppercase + Lt, used only for the diagnostic log.
    UnicodeSet cased = null;
    // Diagnostic log; only open while the cache is being built.
    PrintWriter log;

    /**
     * Applies one case mapping to cps; if the result differs, adds all
     * characters of both strings to case_sensitive, logging any newly added
     * characters that are not in the cased set.
     */
    private void addCase(String cps, byte c1, byte c2) {
        String temp = ucdData.getCase(cps, c1, c2);
        if (temp.equals(cps)) return;

        tempSet.clear();
        tempSet.addAll(cps);
        tempSet.addAll(temp);
        if (!case_sensitive.containsAll(tempSet)) {
            tempSet.removeAll(case_sensitive);
            if (!cased.containsAll(tempSet)) {
                log.println();
                log.println("Adding " + tempSet + " because of: ");
                log.println("\t" + ucdData.getCodeAndName(cps));
                log.println("=>\t" + ucdData.getCodeAndName(temp));
            }
            case_sensitive.addAll(tempSet);
        }
    }

    public boolean hasValue(int cp) {
        if (case_sensitive == null) buildCache();
        return case_sensitive.contains(cp);
    }

    /**
     * Builds case_sensitive and writes the diagnostic log. On failure the
     * partially built set is discarded, so a later call retries instead of
     * answering from incomplete data, and the log writer is always closed.
     */
    private void buildCache() {
        try {
            log = Utility.openPrintWriter("Case_Sensitive_Log.txt", Utility.UTF8_UNIX);

            System.out.println("Building Case-Sensitive cache");
            case_sensitive = new UnicodeSet();
            cased = DerivedProperty.make(PropLowercase, ucdData).getSet()
                .addAll(DerivedProperty.make(PropUppercase, ucdData).getSet())
                .addAll(UnifiedBinaryProperty.make(CATEGORY | Lt).getSet());

            // Inclusive of U+10FFFF (the original loop stopped one short).
            for (int c = 0; c <= 0x10FFFF; ++c) {
                Utility.dot(c);
                // skip cases that can't matter
                if (!ucdData.isAssigned(c)) continue;

                String cps = UTF16.valueOf(c);
                addCase(cps, FULL, LOWER);
                addCase(cps, FULL, UPPER);
                addCase(cps, FULL, TITLE);
                addCase(cps, FULL, FOLD);
                addCase(cps, SIMPLE, LOWER);
                addCase(cps, SIMPLE, UPPER);
                addCase(cps, SIMPLE, TITLE);
                addCase(cps, SIMPLE, FOLD);
            }
            Utility.fixDot();

            UnicodeSet temp;
            log.println("Cased, but not Case_Sensitive");
            temp = new UnicodeSet().addAll(cased).removeAll(case_sensitive);
            Utility.showSetNames(log, "", temp, false, false, ucdData);

            log.println("Case_Sensitive, but not Cased");
            temp = new UnicodeSet().addAll(case_sensitive).removeAll(cased);
            Utility.showSetNames(log, "", temp, false, false, ucdData);

            log.println("Both Case_Sensitive, and Cased");
            temp = new UnicodeSet().addAll(case_sensitive).retainAll(cased);
            log.println(temp);
            System.out.println("Done Building Case-Sensitive cache");
        } catch (Exception e) {
            // Do not leave a half-built set behind as if it were complete.
            case_sensitive = null;
            throw new ChainException("internal error", null, e);
        } finally {
            if (log != null) {
                log.close();
                log = null;
            }
        }
    }
};
|
||||
|
||||
// Other_Case_Ignorable (non-standard): a fixed list of three code points.
dprops[Other_Case_Ignorable] = new UCDProperty() {
    {
        name = "Other_Case_Ignorable";
        shortName = "OCI";
        isStandard = false;
        header = "# Binary Property";
    }

    public boolean hasValue(int cp) {
        // 0027 ; Other_Case_Ignorable # Po APOSTROPHE
        // 00AD ; Other_Case_Ignorable # Pd SOFT HYPHEN
        // 2019 ; Other_Case_Ignorable # Pf RIGHT SINGLE QUOTATION MARK
        return cp == 0x27 || cp == 0x2019 || cp == 0xAD;
    }
};
|
||||
|
||||
// DSoft_Dotted (non-standard): soft-dotted letters plus characters whose
// canonical decomposition contains only soft-dotted zero-class characters
// and no class-230 marks.
dprops[Type_i] = new UCDProperty() {
    {
        type = DERIVED_CORE;
        isStandard = false;
        name = "DSoft_Dotted";
        shortName = "DSDot";
        String title = "# Derived Property: " + name;
        header = title
            + "\r\n# Generated from: all characters whose canonical decompositions end with a combining character sequence that"
            + "\r\n# - starts with i or j"
            + "\r\n# - has no combining marks above"
            + "\r\n# - has no combining marks with zero canonical combining class";
    }

    public boolean hasValue(int cp) {
        if (hasSoftDot(cp)) return true;
        if (nfkd.isNormalized(cp)) return false;

        String decomp = nfd.normalize(cp);
        boolean sawSoftDottedBase = false;
        // Walk the decomposition from the end towards the start.
        int i = decomp.length();
        while (--i >= 0) {
            int ch = UTF16.charAt(decomp, i);
            int cc = ucdData.getCombiningClass(ch);
            if (cc == 230) return false;
            if (cc != 0) continue;
            if (!hasSoftDot(ch)) return false;
            sawSoftDottedBase = true;
        }
        return sawSoftDottedBase;
    }

    /** True for the five hard-coded soft-dotted letters. */
    boolean hasSoftDot(int ch) {
        switch (ch) {
            case 'i': case 'j': case 0x0268: case 0x0456: case 0x0458:
                return true;
            default:
                return false;
        }
    }
};
|
||||
|
||||
// Case_Ignorable (non-standard): Lm, Cf, Mn, Me, or Other_Case_Ignorable.
dprops[Case_Ignorable] = new UCDProperty() {
    {
        name = "Case_Ignorable";
        isStandard = false;
        shortName = "CI";
        String title = "# Derived Property: " + name;
        header = title + "\r\n# Generated from: Other_Case_Ignorable + Lm + Mn + Me + Cf";
    }

    public boolean hasValue(int cp) {
        byte cat = ucdData.getCategory(cp);
        boolean byCategory = cat == Lm || cat == Cf || cat == Mn || cat == Me;
        return byCategory || dprops[Other_Case_Ignorable].hasValue(cp);
    }
};
|
||||
|
||||
/*
|
||||
GraphemeExtend = 27,
|
||||
GraphemeBase = 28,
|
||||
# GraphemeExtend := Me + Mn + Mc + Other_GraphemeExtend - GraphemeLink
|
||||
# GraphemeBase :=
|
||||
|
||||
*/
|
||||
// Grapheme_Extend: Me, Mn, or Other_GraphemeExtend.
dprops[GraphemeExtend] = new UCDProperty() {
    {
        type = DERIVED_CORE;
        name = "Grapheme_Extend";
        shortName = "Gr_Ext";
        String title = "# Derived Property: " + name;
        header = title
            + "\r\n# Generated from: Me + Mn + Other_Grapheme_Extend"
            + "\r\n# Note: depending on an application's interpretation of Co (private use),"
            + "\r\n# they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither.";
    }

    public boolean hasValue(int cp) {
        byte cat = ucdData.getCategory(cp);
        boolean enclosingOrNonspacing = cat == Me || cat == Mn;
        return enclosingOrNonspacing || ucdData.getBinaryProperty(cp,Other_GraphemeExtend);
    }
};
|
||||
|
||||
// Grapheme_Base: everything except Cc, Cf, Cs, Co, Cn, Zl, Zp and Grapheme_Extend.
dprops[GraphemeBase] = new UCDProperty() {
    {
        type = DERIVED_CORE;
        name = "Grapheme_Base";
        shortName = "Gr_Base";
        String title = "# Derived Property: " + name;
        header = title
            + "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend"
            + "\r\n# Note: depending on an application's interpretation of Co (private use),"
            + "\r\n# they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither.";
    }

    public boolean hasValue(int cp) {
        byte cat = ucdData.getCategory(cp);
        boolean excludedCategory = cat == Cc || cat == Cf || cat == Cs || cat == Co
            || cat == Cn || cat == Zl || cat == Zp;
        if (excludedCategory) return false;
        return !dprops[GraphemeExtend].hasValue(cp);
    }
};
|
||||
|
||||
// Give every binary property the same three value labels.
for (int i = 0; i < dprops.length; ++i) {
    UCDProperty up = dprops[i];
    if (up != null && up.getValueType() == BINARY_PROP) {
        up.setValue(NUMBER, "1");
        up.setValue(SHORT, "T");
        up.setValue(LONG, "True");
    }
}
|
||||
}
|
||||
|
||||
byte getDecompCat(int cp) {
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
if (cat == Lu
|
||||
|| ucdData.getBinaryProperty(cp, Other_Uppercase)) return Lu;
|
||||
if (cat == Ll
|
||||
|| ucdData.getBinaryProperty(cp, Other_Lowercase)) return Ll;
|
||||
if (cat == Lt || cat == Lo || cat == Lm || cat == Nl) return cat;
|
||||
|
||||
// if (true) throw new IllegalArgumentException("FIX nf[2]");
|
||||
|
||||
if (nf[NFKD].isNormalized(cp)) return Lo;
|
||||
|
||||
String norm = nf[NFKD].normalize(cp);
|
||||
int cp2;
|
||||
boolean gotUpper = false;
|
||||
boolean gotLower = false;
|
||||
boolean gotTitle = false;
|
||||
for (int i = 0; i < norm.length(); i += UTF32.count16(cp2)) {
|
||||
cp2 = UTF32.char32At(norm, i);
|
||||
byte catx = ucdData.getCategory(cp2);
|
||||
boolean upx = ucdData.getBinaryProperty(cp, Other_Uppercase);
|
||||
boolean lowx = ucdData.getBinaryProperty(cp, Other_Lowercase);
|
||||
if (catx == Ll || lowx || cp2 == 0x345) gotLower = true;
|
||||
if (catx == Lu || upx) gotUpper = true;
|
||||
if (catx == Lt) gotTitle = true;
|
||||
}
|
||||
if (gotLower && !gotUpper && !gotTitle) return Ll;
|
||||
if (!gotLower && gotUpper && !gotTitle) return Lu;
|
||||
if (gotLower || gotUpper || gotTitle) return Lt;
|
||||
return cat;
|
||||
}
|
||||
|
||||
boolean isCompEx(int cp) {
|
||||
if (ucdData.getBinaryProperty(cp, CompositionExclusion)) return true;
|
||||
String decomp = ucdData.getDecompositionMapping(cp);
|
||||
if (UTF32.length32(decomp) == 1) return true;
|
||||
int first = UTF32.char32At(decomp,0);
|
||||
if (ucdData.getCombiningClass(first) != 0) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
String fold(int cp) {
|
||||
return ucdData.getCase(cp, FULL, FOLD);
|
||||
}
|
||||
|
||||
String fold(String s) {
|
||||
return ucdData.getCase(s, FULL, FOLD);
|
||||
}
|
||||
|
||||
public static void test() {
|
||||
/*
|
||||
DerivedProperty dprop = new DerivedProperty(Default.ucd);
|
||||
for (int j = 0; j < LIMIT; ++j) {
|
||||
System.out.println();
|
||||
System.out.println(j + "\t" + dprop.getName(j));
|
||||
System.out.println(dprop.getHeader(j));
|
||||
}
|
||||
*/
|
||||
|
||||
for (int cp = 0xA0; cp < 0xFF; ++cp) {
|
||||
System.out.println();
|
||||
System.out.println(Default.ucd().getCodeAndName(cp));
|
||||
for (int j = 0; j < DERIVED_PROPERTY_LIMIT; ++j) {
|
||||
String prop = make(j, Default.ucd()).getValue(cp);
|
||||
if (prop.length() != 0) System.out.println("\t" + prop);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,118 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java,v $
|
||||
* $Date: 2006/06/09 21:21:20 $
|
||||
* $Revision: 1.13 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
final class DerivedPropertyLister extends PropertyLister {
|
||||
static final boolean BRIDGE = false;
|
||||
|
||||
//static int enum = 0;
|
||||
|
||||
//private int propMask;
|
||||
//private DerivedProperty dprop;
|
||||
private UCDProperty uprop;
|
||||
int width;
|
||||
boolean varies;
|
||||
|
||||
public DerivedPropertyLister(UCD ucd, int propMask, PrintWriter output) {
|
||||
//this.propMask = propMask;
|
||||
this.output = output;
|
||||
this.ucdData = ucd;
|
||||
// this.dprop = new DerivedProperty(ucd);
|
||||
uprop = DerivedProperty.make(propMask, ucd);
|
||||
varies = uprop.getValueType() < BINARY_PROP;
|
||||
|
||||
width = super.minPropertyWidth();
|
||||
switch (propMask) {
|
||||
case DerivedProperty.GenNFD: case DerivedProperty.GenNFC: case DerivedProperty.GenNFKD: case DerivedProperty.GenNFKC:
|
||||
alwaysBreaks = true;
|
||||
break;
|
||||
case DerivedProperty.FC_NFKC_Closure:
|
||||
alwaysBreaks = true;
|
||||
width = 21;
|
||||
break;
|
||||
case DerivedProperty.QuickNFC: case DerivedProperty.QuickNFKC:
|
||||
width = 11;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
public String headerString() {
|
||||
return uprop.getHeader();
|
||||
}
|
||||
|
||||
public String valueName(int cp) {
|
||||
return uprop.getListingValue(cp);
|
||||
}
|
||||
|
||||
//public String optionalComment(int cp) {
|
||||
// return super.optionalComment(cp) + " [" + ucdData.getCodeAndName(computedValue) + "]";
|
||||
//}
|
||||
|
||||
|
||||
public int minPropertyWidth() {
|
||||
return width;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
public String optionalComment(int cp) {
|
||||
String id = ucdData.getCategoryID(cp);
|
||||
if (UCD.mainCategoryMask(ucdData.getCategory(cp)) == LETTER_MASK) return id.substring(0,1) + "*";
|
||||
return id;
|
||||
}
|
||||
*/
|
||||
/*
|
||||
public String optionalName(int cp) {
|
||||
if ((propMask & 0xFF00) == DECOMPOSITION_TYPE) {
|
||||
return Utility.hex(ucdData.getDecompositionMapping(cp));
|
||||
} else {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
String last;
|
||||
|
||||
public byte status(int cp) {
|
||||
if (!uprop.hasUnassigned() && !ucdData.isAssigned(cp)) return EXCLUDE;
|
||||
if (!varies) {
|
||||
return uprop.hasValue(cp) ? INCLUDE : EXCLUDE;
|
||||
}
|
||||
String prop = uprop.getValue(cp);
|
||||
if (prop.length() == 0) return EXCLUDE;
|
||||
if (prop.equals(last)) return INCLUDE;
|
||||
last = prop;
|
||||
return BREAK;
|
||||
}
|
||||
|
||||
/*
|
||||
static Map computedValue = new HashMap();
|
||||
static String getComputedValue(int cp) {
|
||||
return (String) computedValue.get(new Integer(cp));
|
||||
}
|
||||
static void setComputedValue(int cp, String value) {
|
||||
computedValue.put(new Integer(cp), value);
|
||||
}
|
||||
static String lastValue = "";
|
||||
static String currentValue = "";
|
||||
|
||||
StringBuffer foldBuffer = new StringBuffer();
|
||||
|
||||
*/
|
||||
}
|
||||
|
@ -1,158 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java,v $
|
||||
* $Date: 2004/02/06 18:30:22 $
|
||||
* $Revision: 1.9 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import java.io.*;
|
||||
|
||||
class DiffPropertyLister extends PropertyLister {
|
||||
private UCD oldUCD;
|
||||
private UnicodeSet set = new UnicodeSet();
|
||||
private static final int NOPROPERTY = -1;
|
||||
|
||||
public DiffPropertyLister(String oldUCDName, String newUCDName, PrintWriter output, int property) {
|
||||
this.output = output;
|
||||
this.ucdData = UCD.make(newUCDName);
|
||||
if (property != NOPROPERTY) newProp = DerivedProperty.make(property, ucdData);
|
||||
|
||||
if (oldUCDName != null) {
|
||||
this.oldUCD = UCD.make(oldUCDName);
|
||||
if (property != NOPROPERTY) oldProp = DerivedProperty.make(property, oldUCD);
|
||||
}
|
||||
breakByCategory = property != NOPROPERTY;
|
||||
useKenName = false;
|
||||
usePropertyComment = false;
|
||||
}
|
||||
|
||||
public DiffPropertyLister(String oldUCDName, String newUCDName, PrintWriter output) {
|
||||
this(oldUCDName, newUCDName, output, NOPROPERTY);
|
||||
}
|
||||
|
||||
public UnicodeSet getSet() {
|
||||
return set;
|
||||
}
|
||||
|
||||
public String valueName(int cp) {
|
||||
return major_minor_only(ucdData.getVersion());
|
||||
}
|
||||
|
||||
/*
|
||||
public String optionalName(int cp) {
|
||||
if ((propMask & 0xFF00) == DECOMPOSITION_TYPE) {
|
||||
return Utility.hex(ucdData.getDecompositionMapping(cp));
|
||||
} else {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
UCDProperty newProp = null;
|
||||
UCDProperty oldProp = null;
|
||||
String value = "";
|
||||
|
||||
public String optionalComment(int cp) {
|
||||
String normal = super.optionalComment(cp);
|
||||
if (oldUCD != null && breakByCategory) {
|
||||
byte modCat = oldUCD.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : 0);
|
||||
normal = oldUCD.getModCatID_fromIndex(modCat) + "/" + normal;
|
||||
}
|
||||
return normal;
|
||||
}
|
||||
|
||||
|
||||
byte getModCat(int cp) {
|
||||
byte result = ucdData.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : -1);
|
||||
//System.out.println(breakByCategory + ", " + ucdData.getModCatID_fromIndex(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
public byte status(int cp) {
|
||||
if (newProp == null) {
|
||||
if (ucdData.isAllocated(cp) && (oldUCD == null || !oldUCD.isAllocated(cp))) {
|
||||
set.add(cp);
|
||||
return INCLUDE;
|
||||
} else {
|
||||
return EXCLUDE;
|
||||
}
|
||||
}
|
||||
|
||||
// just look at property differences among allocated characters
|
||||
|
||||
if (!ucdData.isAllocated(cp)) return EXCLUDE;
|
||||
if (!oldUCD.isAllocated(cp)) return EXCLUDE;
|
||||
|
||||
String val = newProp.getValue(cp);
|
||||
String oldVal = oldProp.getValue(cp);
|
||||
if (!oldVal.equals(val)) {
|
||||
set.add(cp);
|
||||
return INCLUDE;
|
||||
}
|
||||
return EXCLUDE;
|
||||
|
||||
/*if (cp == 0xFFFF) {
|
||||
System.out.println("# " + Utility.hex(cp));
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
public String headerString() {
|
||||
String result;
|
||||
if (oldUCD != null) {
|
||||
result = "# Differences between "
|
||||
+ major_minor_only(ucdData.getVersion())
|
||||
+ " and "
|
||||
+ major_minor_only(oldUCD.getVersion());
|
||||
} else {
|
||||
result = "# Designated as of "
|
||||
+ major_minor_only(ucdData.getVersion())
|
||||
+ " [excluding removed Hangul Syllables]";
|
||||
}
|
||||
//System.out.println("hs: " + result);
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
public int print() {
|
||||
String status;
|
||||
if (oldUCD != null) {
|
||||
status = "# Differences between " + ucdData.getVersion() + " and " + oldUCD.getVersion();
|
||||
} else {
|
||||
status = "# Allocated as of " + ucdData.getVersion();
|
||||
}
|
||||
output.println();
|
||||
output.println();
|
||||
output.println(status);
|
||||
output.println();
|
||||
System.out.println(status);
|
||||
int count = super.print();
|
||||
output.println();
|
||||
if (oldUCD != null) {
|
||||
output.println("# Total " + count + " new code points allocated in " + ucdData.getVersion());
|
||||
} else {
|
||||
output.println("# Total " + count + " code points allocated in " + ucdData.getVersion());
|
||||
}
|
||||
|
||||
output.println();
|
||||
return count;
|
||||
}
|
||||
*/
|
||||
|
||||
private String major_minor_only(String s) {
|
||||
if (newProp != null) return s;
|
||||
|
||||
return s.substring(0, s.lastIndexOf('.'));
|
||||
}
|
||||
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,624 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
|
||||
* $Date: 2006/04/05 22:12:45 $
|
||||
* $Revision: 1.18 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
public class GenerateCaseFolding implements UCD_Types {
|
||||
public static boolean DEBUG = false;
|
||||
public static boolean COMMENT_DIFFS = false; // ON if we want a comment on mappings != lowercase
|
||||
public static boolean PICK_SHORT = false; // picks short value for SIMPLE if in FULL, changes weighting
|
||||
public static boolean NF_CLOSURE = false; // when true, each closure also includes the NFD/NFC/NFKD/NFKC forms of its members
|
||||
static final int CHECK_CHAR = 0x130; // for debugging, change to actual character, otherwise -1
|
||||
|
||||
// PICK_SHORT & NF_CLOSURE = false for old style
|
||||
|
||||
|
||||
/*public static void main(String[] args) throws java.io.IOException {
|
||||
makeCaseFold(arg[0]);
|
||||
//getAge();
|
||||
}
|
||||
*/
|
||||
|
||||
static PrintWriter log;
|
||||
|
||||
|
||||
public static void makeCaseFold(boolean normalized) throws java.io.IOException {
|
||||
PICK_SHORT = NF_CLOSURE = normalized;
|
||||
|
||||
log = Utility.openPrintWriter("CaseFoldingLog" + UnicodeDataFile.getFileSuffix(true), Utility.LATIN1_UNIX);
|
||||
System.out.println("Writing Log: " + "CaseFoldingLog" + UnicodeDataFile.getFileSuffix(true));
|
||||
|
||||
System.out.println("Making Full Data");
|
||||
Map fullData = getCaseFolding(true, NF_CLOSURE, "");
|
||||
Utility.fixDot();
|
||||
|
||||
System.out.println("Making Simple Data");
|
||||
Map simpleData = getCaseFolding(false, NF_CLOSURE, "");
|
||||
// write the data
|
||||
|
||||
System.out.println("Making Turkish Full Data");
|
||||
Map fullDataTurkish = getCaseFolding(true, NF_CLOSURE, "tr");
|
||||
Utility.fixDot();
|
||||
|
||||
System.out.println("Making Simple Data");
|
||||
Map simpleDataTurkish = getCaseFolding(false, NF_CLOSURE, "tr");
|
||||
// write the data
|
||||
|
||||
Utility.fixDot();
|
||||
System.out.println("Writing");
|
||||
String filename = "CaseFolding";
|
||||
if (normalized) filename += "-Normalized";
|
||||
String directory = "DerivedData/";
|
||||
UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader(directory, filename);
|
||||
PrintWriter out = fc.out;
|
||||
|
||||
/*
|
||||
PrintWriter out = new PrintWriter(
|
||||
new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
new FileOutputStream(directory + fileRoot + GenerateData.getFileSuffix()),
|
||||
"UTF8"),
|
||||
4*1024));
|
||||
*/
|
||||
|
||||
for (int ch = 0; ch <= 0x10FFFF; ++ch) {
|
||||
Utility.dot(ch);
|
||||
|
||||
if (!charsUsed.get(ch)) continue;
|
||||
|
||||
String rFull = (String)fullData.get(UTF32.valueOf32(ch));
|
||||
String rSimple = (String)simpleData.get(UTF32.valueOf32(ch));
|
||||
String rFullTurkish = (String)fullDataTurkish.get(UTF32.valueOf32(ch));
|
||||
String rSimpleTurkish = (String)simpleDataTurkish.get(UTF32.valueOf32(ch));
|
||||
if (rFull == null && rSimple == null && rFullTurkish == null && rSimpleTurkish == null) continue;
|
||||
|
||||
if (rFull != null && rFull.equals(rSimple)
|
||||
|| (PICK_SHORT && UTF16.countCodePoint(rFull) == 1)) {
|
||||
String type = "C";
|
||||
if (ch == 0x49) {
|
||||
drawLine(out, ch, "C", "i");
|
||||
drawLine(out, ch, "T", "\u0131");
|
||||
} else if (ch == 0x130) {
|
||||
drawLine(out, ch, "F", "i\u0307");
|
||||
drawLine(out, ch, "T", "i");
|
||||
} else if (ch == 0x131) {
|
||||
// do nothing
|
||||
//drawLine(out, ch, "I", "i");
|
||||
} else {
|
||||
drawLine(out, ch, type, rFull);
|
||||
}
|
||||
} else {
|
||||
if (rFull != null) {
|
||||
drawLine(out, ch, "F", rFull);
|
||||
}
|
||||
if (rSimple != null) {
|
||||
drawLine(out, ch, "S", rSimple);
|
||||
}
|
||||
}
|
||||
if (rFullTurkish != null && !rFullTurkish.equals(rFull)) {
|
||||
drawLine(out, ch, "T", rFullTurkish);
|
||||
}
|
||||
if (rSimpleTurkish != null && !rSimpleTurkish.equals(rSimple)) {
|
||||
drawLine(out, ch, "t", rSimpleTurkish);
|
||||
}
|
||||
}
|
||||
fc.close();
|
||||
log.close();
|
||||
}
|
||||
|
||||
/* Goal is following (with no entries for 0131 or 0069)
|
||||
|
||||
0049; C; 0069; # LATIN CAPITAL LETTER I
|
||||
0049; T; 0131; # LATIN CAPITAL LETTER I
|
||||
|
||||
0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
*/
|
||||
|
||||
static void drawLine(PrintWriter out, int ch, String type, String result) {
|
||||
String comment = "";
|
||||
if (COMMENT_DIFFS) {
|
||||
String lower = Default.ucd().getCase(UTF16.valueOf(ch), FULL, LOWER);
|
||||
if (!lower.equals(result)) {
|
||||
String upper = Default.ucd().getCase(UTF16.valueOf(ch), FULL, UPPER);
|
||||
String lower2 = Default.ucd().getCase(UTF16.valueOf(ch), FULL, LOWER);
|
||||
if (lower.equals(lower2)) {
|
||||
comment = "[Diff " + Utility.hex(lower, " ") + "] ";
|
||||
} else {
|
||||
Utility.fixDot();
|
||||
System.out.println("PROBLEM WITH: " + Default.ucd().getCodeAndName(ch));
|
||||
comment = "[DIFF " + Utility.hex(lower, " ") + ", " + Utility.hex(lower2, " ") + "] ";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
out.println(Utility.hex(ch)
|
||||
+ "; " + type
|
||||
+ "; " + Utility.hex(result, " ")
|
||||
+ "; # " + comment + Default.ucd().getName(ch));
|
||||
}
|
||||
|
||||
static int probeCh = 0x01f0;
|
||||
static String shower = UTF16.valueOf(probeCh);
|
||||
|
||||
static Map getCaseFolding(boolean full, boolean nfClose, String condition) throws java.io.IOException {
|
||||
Map data = new TreeMap();
|
||||
Map repChar = new TreeMap();
|
||||
//String option = "";
|
||||
|
||||
// get the equivalence classes
|
||||
|
||||
for (int ch = 0; ch <= 0x10FFFF; ++ch) {
|
||||
Utility.dot(ch);
|
||||
//if ((ch & 0x3FF) == 0) System.out.println(Utility.hex(ch));
|
||||
if (!Default.ucd().isRepresented(ch)) continue;
|
||||
getClosure(ch, data, full, nfClose, condition);
|
||||
}
|
||||
|
||||
// get the representative characters
|
||||
|
||||
Iterator it = data.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
String s = (String) it.next();
|
||||
Set set = (Set) data.get(s);
|
||||
show = set.contains(shower);
|
||||
if (show) {
|
||||
Utility.fixDot();
|
||||
System.out.println(toString(set));
|
||||
}
|
||||
|
||||
// Pick the best available representative
|
||||
|
||||
String rep = null;
|
||||
int repGood = 0;
|
||||
String dup = null;
|
||||
Iterator it2 = set.iterator();
|
||||
while (it2.hasNext()) {
|
||||
String s2 = (String)it2.next();
|
||||
int s2Good = goodness(s2, full, condition);
|
||||
if (s2Good > repGood) {
|
||||
rep = s2;
|
||||
repGood = s2Good;
|
||||
dup = null;
|
||||
} else if (s2Good == repGood) {
|
||||
dup = s2;
|
||||
}
|
||||
}
|
||||
if (rep == null) {
|
||||
Utility.fixDot();
|
||||
System.err.println("No representative for: " + toString(set));
|
||||
} else if ((repGood & (NFC_FORMAT | ISLOWER)) != (NFC_FORMAT | ISLOWER)) {
|
||||
String message = "";
|
||||
if ((repGood & NFC_FORMAT) == 0) {
|
||||
message += " [NOT NFC FORMAT]";
|
||||
}
|
||||
if ((repGood & ISLOWER) == 0) {
|
||||
message += " [NOT LOWERCASE]";
|
||||
}
|
||||
Utility.fixDot();
|
||||
log.println("Non-Optimal Representative " + message);
|
||||
log.println(" Rep:\t" + Default.ucd().getCodeAndName(rep));
|
||||
log.println(" Set:\t" + toString(set,true, true));
|
||||
}
|
||||
|
||||
log.println();
|
||||
log.println();
|
||||
log.println(rep + "\t#" + Default.ucd().getName(rep));
|
||||
|
||||
// Add it for all the elements of the set
|
||||
|
||||
it2 = set.iterator();
|
||||
while (it2.hasNext()) {
|
||||
String s2 = (String)it2.next();
|
||||
if (s2.equals(rep)) continue;
|
||||
|
||||
log.println(s2 + "\t#" + Default.ucd().getName(s2));
|
||||
|
||||
if (UTF16.countCodePoint(s2) == 1) {
|
||||
repChar.put(UTF32.getCodePointSubstring(s2,0), rep);
|
||||
charsUsed.set(UTF16.charAt(s2, 0));
|
||||
}
|
||||
}
|
||||
}
|
||||
return repChar;
|
||||
}
|
||||
|
||||
static BitSet charsUsed = new BitSet();
|
||||
static boolean show = false;
|
||||
static final int NFC_FORMAT = 64;
|
||||
static final int ISLOWER = 128;
|
||||
|
||||
static int goodness(String s, boolean full, String condition) {
|
||||
if (s == null) return 0;
|
||||
int result = 32-s.length();
|
||||
if (!PICK_SHORT) {
|
||||
result = s.length();
|
||||
}
|
||||
if (!full) result <<= 8;
|
||||
String low = lower(upper(s, full, condition), full, condition);
|
||||
if (s.equals(low)) result |= ISLOWER;
|
||||
else if (PICK_SHORT && Default.nfd().normalize(s).equals(Default.nfd().normalize(low))) result |= ISLOWER;
|
||||
|
||||
if (s.equals(Default.nfc().normalize(s))) result |= NFC_FORMAT;
|
||||
|
||||
if (show) {
|
||||
Utility.fixDot();
|
||||
System.out.println(Utility.hex(result) + ", " + Default.ucd().getCodeAndName(s));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
static HashSet temp = new HashSet();
|
||||
static void normalize(HashSet set) {
|
||||
temp.clear();
|
||||
temp.addAll(set);
|
||||
set.clear();
|
||||
Iterator it = temp.iterator();
|
||||
while (it.hasNext()) {
|
||||
String s = (String) it.next();
|
||||
String s2 = KC.normalize(s);
|
||||
set.add(s);
|
||||
data2.put(s,set);
|
||||
if (!s.equals(s2)) {
|
||||
set.add(s2);
|
||||
data2.put(s2,set);
|
||||
System.err.println("Adding " + Utility.hex(s) + " by " + Utility.hex(s2));
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
String
|
||||
String lower1 = Default.ucd.getLowercase(ch);
|
||||
String lower2 = Default.ucd.toLowercase(ch,option);
|
||||
|
||||
char ch2 = Default.ucd.getLowercase(Default.ucd.getUppercase(ch).charAt(0)).charAt(0);
|
||||
//String lower1 = String.valueOf(Default.ucd.getLowercase(ch));
|
||||
//String lower = Default.ucd.toLowercase(ch2,option);
|
||||
String upper = Default.ucd.toUppercase(ch2,option);
|
||||
String lowerUpper = Default.ucd.toLowercase(upper,option);
|
||||
//String title = Default.ucd.toTitlecase(ch2,option);
|
||||
//String lowerTitle = Default.ucd.toLowercase(upper,option);
|
||||
|
||||
if (ch != ch2 || lowerUpper.length() != 1 || ch != lowerUpper.charAt(0)) { //
|
||||
output.println(Utility.hex(ch)
|
||||
+ "; " + (lowerUpper.equals(lower1) ? "L" : lowerUpper.equals(lower2) ? "S" : "E")
|
||||
+ "; " + Utility.hex(lowerUpper," ")
|
||||
+ ";\t#" + Default.ucd.getName(ch)
|
||||
);
|
||||
//if (!lowerUpper.equals(lower)) {
|
||||
// output.println("Warning1: " + Utility.hex(lower) + " " + Default.ucd.getName(lower));
|
||||
//}
|
||||
//if (!lowerUpper.equals(lowerTitle)) {
|
||||
// output.println("Warning2: " + Utility.hex(lowerTitle) + " " + Default.ucd.getName(lowerTitle));
|
||||
//}
|
||||
}
|
||||
*/
|
||||
|
||||
static void getClosure(int ch, Map data, boolean full, boolean nfClose, String condition) {
|
||||
String charStr = UTF32.valueOf32(ch);
|
||||
String lowerStr = lower(charStr, full, condition);
|
||||
String titleStr = title(charStr, full, condition);
|
||||
String upperStr = upper(charStr, full, condition);
|
||||
if (charStr.equals(lowerStr) && charStr.equals(upperStr) && charStr.equals(titleStr)) return;
|
||||
if (DEBUG) System.err.println("Closure for " + Utility.hex(ch));
|
||||
|
||||
// make new set
|
||||
Set set = new TreeSet();
|
||||
set.add(charStr);
|
||||
data.put(charStr, set);
|
||||
|
||||
// add cases to get started
|
||||
add(set, lowerStr, data);
|
||||
add(set, upperStr, data);
|
||||
add(set, titleStr, data);
|
||||
|
||||
// close it
|
||||
main:
|
||||
while (true) {
|
||||
Iterator it = set.iterator();
|
||||
while (it.hasNext()) {
|
||||
String s = (String) it.next();
|
||||
// do funny stuff since we can't modify set while iterating
|
||||
// We don't do this because if the source is not normalized, we don't want to normalize
|
||||
if (nfClose) {
|
||||
if (add(set, Default.nfd().normalize(s), data)) continue main;
|
||||
if (add(set, Default.nfc().normalize(s), data)) continue main;
|
||||
if (add(set, Default.nfkd().normalize(s), data)) continue main;
|
||||
if (add(set, Default.nfkc().normalize(s), data)) continue main;
|
||||
}
|
||||
if (add(set, lower(s, full, condition), data)) continue main;
|
||||
if (add(set, title(s, full, condition), data)) continue main;
|
||||
if (add(set, upper(s, full, condition), data)) continue main;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static String lower(String s, boolean full, String condition) {
|
||||
String result = lower2(s,full, condition);
|
||||
return result.replace('\u03C2', '\u03C3'); // HACK for lower
|
||||
}
|
||||
|
||||
// These functions are no longer necessary, since Default.ucd is parameterized,
|
||||
// but it's not worth changing
|
||||
|
||||
static String lower2(String s, boolean full, String condition) {
|
||||
/*if (!full) {
|
||||
if (s.length() != 1) return s;
|
||||
return Default.ucd.getCase(UTF32.char32At(s,0), SIMPLE, LOWER);
|
||||
}
|
||||
*/
|
||||
return Default.ucd().getCase(s, full ? FULL : SIMPLE, LOWER, condition);
|
||||
}
|
||||
|
||||
static String upper(String s, boolean full, String condition) {
|
||||
/* if (!full) {
|
||||
if (s.length() != 1) return s;
|
||||
return Default.ucd.getCase(UTF32.char32At(s,0), FULL, UPPER);
|
||||
}
|
||||
*/
|
||||
return Default.ucd().getCase(s, full ? FULL : SIMPLE, UPPER, condition);
|
||||
}
|
||||
|
||||
static String title(String s, boolean full, String condition) {
|
||||
/*if (!full) {
|
||||
if (s.length() != 1) return s;
|
||||
return Default.ucd.getCase(UTF32.char32At(s,0), FULL, TITLE);
|
||||
}
|
||||
*/
|
||||
return Default.ucd().getCase(s, full ? FULL : SIMPLE, TITLE, condition);
|
||||
}
|
||||
|
||||
static boolean add(Set set, String s, Map data) {
|
||||
if (set.contains(s)) return false;
|
||||
set.add(s);
|
||||
if (DEBUG) System.err.println("adding: " + toString(set));
|
||||
Set other = (Set) data.get(s);
|
||||
if (other != null && other != set) { // merge
|
||||
// make all the items in set point to merged set
|
||||
Iterator it = other.iterator();
|
||||
while (it.hasNext()) {
|
||||
data.put(it.next(), set);
|
||||
}
|
||||
set.addAll(other);
|
||||
}
|
||||
if (DEBUG) System.err.println("done adding: " + toString(set));
|
||||
return true;
|
||||
}
|
||||
|
||||
static String toString(Set set) {
|
||||
return toString(set, false, false);
|
||||
}
|
||||
|
||||
static String toString(Set set, boolean name, boolean crtab) {
|
||||
String result = "{";
|
||||
Iterator it2 = set.iterator();
|
||||
boolean first = true;
|
||||
while (it2.hasNext()) {
|
||||
String s2 = (String) it2.next();
|
||||
if (!first) {
|
||||
if (crtab) {
|
||||
result += ";\r\n\t";
|
||||
} else {
|
||||
result += "; ";
|
||||
}
|
||||
}
|
||||
first = false;
|
||||
if (name) {
|
||||
result += Default.ucd().getCodeAndName(s2);
|
||||
} else {
|
||||
result += Utility.hex(s2, " ");
|
||||
}
|
||||
}
|
||||
return result + "}";
|
||||
}
|
||||
|
||||
static boolean specialNormalizationDiffers(int ch) {
|
||||
if (ch == 0x00DF) return true; // es-zed
|
||||
return !Default.nfkd().isNormalized(ch);
|
||||
}
|
||||
|
||||
static String specialNormalization(String s) {
|
||||
if (s.equals("\u00DF")) return "ss";
|
||||
return Default.nfkd().normalize(s);
|
||||
}
|
||||
|
||||
static boolean isExcluded(int ch) {
|
||||
// if (ch == 0x130) return true; // skip LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
if (ch == 0x0132 || ch == 0x0133) return true; // skip IJ, ij
|
||||
if (ch == 0x037A) return true; // skip GREEK YPOGEGRAMMENI
|
||||
if (0x249C <= ch && ch <= 0x24B5) return true; // skip PARENTHESIZED LATIN SMALL LETTER A..
|
||||
if (0x20A8 <= ch && ch <= 0x217B) return true; // skip Rupee..
|
||||
|
||||
byte type = Default.ucd().getDecompositionType(ch);
|
||||
if (type == COMPAT_SQUARE) return true;
|
||||
//if (type == COMPAT_UNSPECIFIED) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
static void generateSpecialCasing(boolean normalize) throws IOException {
|
||||
Map sorted = new TreeMap();
|
||||
|
||||
String suffix2 = "";
|
||||
if (normalize) suffix2 = "-Normalized";
|
||||
|
||||
PrintWriter log = Utility.openPrintWriter("SpecialCasingExceptions"
|
||||
+ suffix2 + UnicodeDataFile.getFileSuffix(true), Utility.LATIN1_UNIX);
|
||||
|
||||
for (int ch = 0; ch <= 0x10FFFF; ++ch) {
|
||||
Utility.dot(ch);
|
||||
if (!Default.ucd().isRepresented(ch)) continue;
|
||||
if (!specialNormalizationDiffers(ch)) continue;
|
||||
|
||||
String lower = Default.nfc().normalize(Default.ucd().getCase(ch, SIMPLE, LOWER));
|
||||
String upper = Default.nfc().normalize(Default.ucd().getCase(ch, SIMPLE, UPPER));
|
||||
String title = Default.nfc().normalize(Default.ucd().getCase(ch, SIMPLE, TITLE));
|
||||
|
||||
String chstr = UTF16.valueOf(ch);
|
||||
|
||||
String decomp = specialNormalization(chstr);
|
||||
String flower = Default.nfc().normalize(Default.ucd().getCase(decomp, SIMPLE, LOWER));
|
||||
String fupper = Default.nfc().normalize(Default.ucd().getCase(decomp, SIMPLE, UPPER));
|
||||
String ftitle = Default.nfc().normalize(Default.ucd().getCase(decomp, SIMPLE, TITLE));
|
||||
|
||||
String base = decomp;
|
||||
String blower = specialNormalization(lower);
|
||||
String bupper = specialNormalization(upper);
|
||||
String btitle = specialNormalization(title);
|
||||
|
||||
if (true) {
|
||||
flower = Default.nfc().normalize(flower);
|
||||
fupper = Default.nfc().normalize(fupper);
|
||||
ftitle = Default.nfc().normalize(ftitle);
|
||||
base = Default.nfc().normalize(base);
|
||||
blower = Default.nfc().normalize(blower);
|
||||
bupper = Default.nfc().normalize(bupper);
|
||||
btitle = Default.nfc().normalize(btitle);
|
||||
}
|
||||
|
||||
if (ch == CHECK_CHAR) {
|
||||
System.out.println("Code: " + Default.ucd().getCodeAndName(ch));
|
||||
System.out.println("Decomp: " + Default.ucd().getCodeAndName(decomp));
|
||||
System.out.println("Base: " + Default.ucd().getCodeAndName(base));
|
||||
System.out.println("SLower: " + Default.ucd().getCodeAndName(lower));
|
||||
System.out.println("FLower: " + Default.ucd().getCodeAndName(flower));
|
||||
System.out.println("BLower: " + Default.ucd().getCodeAndName(blower));
|
||||
System.out.println("STitle: " + Default.ucd().getCodeAndName(title));
|
||||
System.out.println("FTitle: " + Default.ucd().getCodeAndName(ftitle));
|
||||
System.out.println("BTitle: " + Default.ucd().getCodeAndName(btitle));
|
||||
System.out.println("SUpper: " + Default.ucd().getCodeAndName(upper));
|
||||
System.out.println("FUpper: " + Default.ucd().getCodeAndName(fupper));
|
||||
System.out.println("BUpper: " + Default.ucd().getCodeAndName(bupper));
|
||||
}
|
||||
|
||||
// presumably if there is a single code point, it would already be in the simple mappings
|
||||
|
||||
if (UTF16.countCodePoint(flower) == 1 && UTF16.countCodePoint(fupper) == 1
|
||||
&& UTF16.countCodePoint(title) == 1) {
|
||||
if (ch == CHECK_CHAR) System.out.println("Skipping single code point: " + Default.ucd().getCodeAndName(ch));
|
||||
continue;
|
||||
}
|
||||
|
||||
// if there is no change from the base, skip
|
||||
|
||||
if (flower.equals(base) && fupper.equals(base) && ftitle.equals(base)) {
|
||||
if (ch == CHECK_CHAR) System.out.println("Skipping equals base: " + Default.ucd().getCodeAndName(ch));
|
||||
continue;
|
||||
}
|
||||
|
||||
// fix special cases
|
||||
// if (flower.equals(blower) && fupper.equals(bupper) && ftitle.equals(btitle)) continue;
|
||||
if (flower.equals(blower)) flower = lower;
|
||||
if (fupper.equals(bupper)) fupper = upper;
|
||||
if (ftitle.equals(btitle)) ftitle = title;
|
||||
|
||||
// if there are no changes from the original, or the expanded original, skip
|
||||
|
||||
if (flower.equals(lower) && fupper.equals(upper) && ftitle.equals(title)) {
|
||||
if (ch == CHECK_CHAR) System.out.println("Skipping unchanged: " + Default.ucd().getCodeAndName(ch));
|
||||
continue;
|
||||
}
|
||||
|
||||
String name = Default.ucd().getName(ch);
|
||||
|
||||
int order = name.equals("LATIN SMALL LETTER SHARP S") ? 1
|
||||
: ch == 0x130 ? 2
|
||||
: name.indexOf("ARMENIAN SMALL LIGATURE") >= 0 ? 4
|
||||
: name.indexOf("LIGATURE") >= 0 ? 3
|
||||
: name.indexOf("GEGRAMMENI") < 0 ? 5
|
||||
: UTF16.countCodePoint(ftitle) == 1 ? 6
|
||||
: UTF16.countCodePoint(fupper) == 2 ? 7
|
||||
: 8;
|
||||
|
||||
if (ch == CHECK_CHAR) System.out.println("Order: " + order + " for " + Default.ucd().getCodeAndName(ch));
|
||||
|
||||
// HACK
|
||||
boolean denormalize = !normalize && order != 6 && order != 7;
|
||||
|
||||
String mapping = Utility.hex(ch)
|
||||
+ "; " + Utility.hex(flower.equals(base) ? chstr : denormalize ? Default.nfd().normalize(flower) : flower)
|
||||
+ "; " + Utility.hex(ftitle.equals(base) ? chstr : denormalize ? Default.nfd().normalize(ftitle) : ftitle)
|
||||
+ "; " + Utility.hex(fupper.equals(base) ? chstr : denormalize ? Default.nfd().normalize(fupper) : fupper)
|
||||
+ "; # " + Default.ucd().getName(ch);
|
||||
|
||||
// special exclusions
|
||||
if (isExcluded(ch)) {
|
||||
log.println("# " + mapping);
|
||||
} else {
|
||||
int x = ch;
|
||||
if (ch == 0x01F0) x = 0x03B1; // HACK to reorder the same
|
||||
sorted.put(new Integer((order << 24) | x), mapping);
|
||||
}
|
||||
}
|
||||
log.close();
|
||||
|
||||
System.out.println("Writing");
|
||||
//String newFile = "DerivedData/SpecialCasing" + suffix2 + UnicodeDataFile.getFileSuffix(true);
|
||||
//PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
|
||||
UnicodeDataFile udf = UnicodeDataFile.openAndWriteHeader("DerivedData/", "SpecialCasing" + suffix2);
|
||||
PrintWriter out = udf.out;
|
||||
|
||||
/* String[] batName = {""};
|
||||
String mostRecent = UnicodeDataFile.generateBat("DerivedData/", "SpecialCasing", suffix2 + UnicodeDataFile.getFileSuffix(true), batName);
|
||||
out.println("# SpecialCasing" + UnicodeDataFile.getFileSuffix(false));
|
||||
out.println(UnicodeDataFile.generateDateLine());
|
||||
out.println("#");
|
||||
*/
|
||||
//Utility.appendFile("com/ibm/text/UCD/SpecialCasingHeader.txt", Utility.UTF8, out);
|
||||
|
||||
Iterator it = sorted.keySet().iterator();
|
||||
int lastOrder = -1;
|
||||
while (it.hasNext()) {
|
||||
Integer key = (Integer) it.next();
|
||||
String line = (String) sorted.get(key);
|
||||
int order = key.intValue() >> 24;
|
||||
if (order != lastOrder) {
|
||||
lastOrder = order;
|
||||
out.println();
|
||||
boolean skipLine = false;
|
||||
switch(order) {
|
||||
case 1:
|
||||
out.println("# The German es-zed is special--the normal mapping is to SS.");
|
||||
out.println("# Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>))");
|
||||
break;
|
||||
case 2:
|
||||
out.println("# Preserve canonical equivalence for I with dot. Turkic is handled below.");
|
||||
break;
|
||||
case 3: out.println("# Ligatures"); break;
|
||||
case 4: skipLine = true; break;
|
||||
case 5: out.println("# No corresponding uppercase precomposed character"); break;
|
||||
case 6: Utility.appendFile("com/ibm/text/UCD/SpecialCasingIota.txt", Utility.UTF8, out); break;
|
||||
case 7: out.println("# Some characters with YPOGEGRAMMENI also have no corresponding titlecases"); break;
|
||||
case 8: skipLine = true; break;
|
||||
}
|
||||
if (!skipLine) out.println();
|
||||
}
|
||||
out.println(line);
|
||||
}
|
||||
Utility.appendFile("com/ibm/text/UCD/SpecialCasingFooter.txt", Utility.UTF8, out);
|
||||
udf.close();
|
||||
//Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
|
||||
}
|
||||
}
|
@ -1,93 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseTest.java,v $
|
||||
* $Date: 2004/02/07 01:01:15 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
abstract public class GenerateCaseTest implements UCD_Types {
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
System.out.println("Remember to add length marks (half & full) and other punctuation for sentence, with FF61");
|
||||
|
||||
PrintWriter out = Utility.openPrintWriter("CaseTest.txt", Utility.UTF8_WINDOWS);
|
||||
|
||||
out.println("# CaseTest");
|
||||
out.println("# Generated: " + Default.getDate() + ", MED");
|
||||
Utility.appendFile("CaseTestHeader.txt", Utility.LATIN1, out);
|
||||
|
||||
for (int cp = 0; cp < 0x10FFFF; ++cp) {
|
||||
Utility.dot(cp);
|
||||
if (!Default.ucd().isAllocated(cp)) continue;
|
||||
if (Default.ucd().isHangulSyllable(cp)) continue;
|
||||
byte cat = Default.ucd().getCategory(cp);
|
||||
if (cp == PRIVATE_USE) continue;
|
||||
|
||||
String lower = Default.ucd().getCase(cp, FULL, LOWER);
|
||||
String upper = Default.ucd().getCase(cp, FULL, UPPER);
|
||||
String title = Default.ucd().getCase(cp, FULL, TITLE);
|
||||
String fold = Default.ucd().getCase(cp, FULL, FOLD);
|
||||
if (lower.equals(upper)
|
||||
&& lower.equals(title)
|
||||
&& lower.equals(fold)) continue;
|
||||
|
||||
String s = UTF16.valueOf(cp);
|
||||
write(out, s, true);
|
||||
|
||||
// if (cp == '\u0345') continue; // don't add combining for this special case
|
||||
|
||||
s = s + testChar;
|
||||
|
||||
String s2 = Default.nfd().normalize(s);
|
||||
|
||||
String lower1 = Default.nfc().normalize(Default.ucd().getCase(s2, FULL, LOWER));
|
||||
String upper1 = Default.nfc().normalize(Default.ucd().getCase(s2, FULL, UPPER));
|
||||
String title1 = Default.nfc().normalize(Default.ucd().getCase(s2, FULL, TITLE));
|
||||
String fold1 = Default.nfc().normalize(Default.ucd().getCase(s2, FULL, FOLD));
|
||||
|
||||
if (lower1.equals(Default.nfc().normalize(lower+testChar))
|
||||
&& upper1.equals(Default.nfc().normalize(upper+testChar))
|
||||
&& title1.equals(Default.nfc().normalize(title+testChar))
|
||||
&& fold1.equals(Default.nfc().normalize(fold+testChar))
|
||||
) continue;
|
||||
|
||||
write(out, s, true);
|
||||
}
|
||||
out.println("# total lines: " + counter);
|
||||
out.close();
|
||||
}
|
||||
|
||||
static final char testChar = '\u0316';
|
||||
static int counter = 0;
|
||||
|
||||
static void write(PrintWriter out, String ss, boolean doComment) {
|
||||
String s = Default.nfd().normalize(ss);
|
||||
String lower = Default.nfc().normalize(Default.ucd().getCase(s, FULL, LOWER));
|
||||
String upper = Default.nfc().normalize(Default.ucd().getCase(s, FULL, UPPER));
|
||||
String title = Default.nfc().normalize(Default.ucd().getCase(s, FULL, TITLE));
|
||||
String fold = Default.nfc().normalize(Default.ucd().getCase(s, FULL, FOLD));
|
||||
out.println(Utility.hex(ss) + "; "
|
||||
+ Utility.hex(lower) + "; "
|
||||
+ Utility.hex(upper) + "; "
|
||||
+ Utility.hex(title) + "; "
|
||||
+ Utility.hex(fold)
|
||||
+ (doComment ? "\t# " + Default.ucd().getName(ss) : "")
|
||||
);
|
||||
counter++;
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,777 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java,v $
|
||||
* $Date: 2004/04/17 18:21:39 $
|
||||
* $Revision: 1.5 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
public class GenerateLineBreakTest implements UCD_Types {
|
||||
|
||||
// COMMON STUFF for Hangul
|
||||
static final byte hNot = -1, hL = 0, hV = 1, hT = 2, hLV = 3, hLVT = 4, hLIMIT = 5;
|
||||
static final String[] hNames = {"L", "V", "T", "LV", "LVT"};
|
||||
|
||||
static byte getHangulType(int cp) {
|
||||
if (Default.ucd().isLeadingJamo(cp)) return hL;
|
||||
if (Default.ucd().isVowelJamo(cp)) return hV;
|
||||
if (Default.ucd().isTrailingJamo(cp)) return hT;
|
||||
if (Default.ucd().isHangulSyllable(cp)) {
|
||||
if (Default.ucd().isDoubleHangul(cp)) return hLV;
|
||||
return hLVT;
|
||||
}
|
||||
return hNot;
|
||||
}
|
||||
|
||||
//============================
|
||||
|
||||
// Label of the rule most recently tested by isBreak(); isBreak() sets it as a side effect
// and callers read it immediately after the call.
protected String rule;
|
||||
// Prefix of the generated file names ("<fileName>BreakTest.html", "<fileName>Test.txt", ...).
protected String fileName = "Line";
|
||||
|
||||
// all the other items are supplied in UCD_TYPES
// Extra pseudo-classes appended after the regular line-break values: the five Hangul
// syllable types, then LB_SUP (returned by getType() for code points above U+FFFF).
// LB2_LIMIT is one past LB_SUP.
|
||||
static byte LB_L = LB_LIMIT + hL, LB_V = LB_LIMIT + hV, LB_T = LB_LIMIT + hT,
|
||||
LB_LV = LB_LIMIT + hLV, LB_LVT = LB_LIMIT + hLVT, LB_SUP = LB_LIMIT + hLIMIT,
|
||||
LB2_LIMIT = (byte)(LB_SUP + 1);
|
||||
|
||||
// First allocated code point found for each type, indexed by type value; filled by findSamples().
String[] samples = new String[100];
|
||||
|
||||
|
||||
// Order in which types are listed in the generated table and test files.
byte[] TypeOrder = {
|
||||
LB_OP, LB_CL, LB_QU, LB_GL, LB_NS, LB_EX, LB_SY, LB_IS, LB_PR, LB_PO,
|
||||
LB_NU, LB_AL, LB_ID, LB_IN, LB_HY, LB_BA, LB_BB, LB_B2, LB_ZW, LB_CM,
|
||||
// missing from Pair Table
|
||||
LB_SP, LB_BK, LB_CR, LB_LF,
|
||||
// resolved types below
|
||||
LB_CB, LB_AI, LB_SA, LB_SG, LB_XX,
|
||||
// 3 JAMO CLASSES, plus supplementary
|
||||
LB_L, LB_V, LB_T, LB_LV, LB_LVT, LB_SUP
|
||||
};
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
|
||||
new GenerateLineBreakTest().run();
|
||||
|
||||
new GenerateWordBreakTest().run();
|
||||
}
|
||||
|
||||
// stuff that subclasses need to override
|
||||
public void run() throws IOException {
|
||||
findSamples();
|
||||
|
||||
// test individual cases
|
||||
//printLine(out, samples[LB_ZW], "", samples[LB_CL]);
|
||||
//printLine(out, samples[LB_ZW], " ", samples[LB_CL]);
|
||||
|
||||
PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest.html", Utility.UTF8_WINDOWS);
|
||||
out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title>"
|
||||
+ fileName + "</title></head>");
|
||||
out.println("<body bgcolor='#FFFFFF'><h3>Current (fixed only for consistency):</h3>");
|
||||
|
||||
|
||||
|
||||
generateTable(out, false);
|
||||
out.println("<h3>Recommended:</h3>");
|
||||
generateTable(out, true);
|
||||
out.println("</body></html>");
|
||||
out.close();
|
||||
|
||||
String[] testCase = new String[50];
|
||||
// do main test
|
||||
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
out = Utility.openPrintWriter(fileName + (k == 0 ? "Test_SHORT.txt" : "Test.txt"), Utility.LATIN1_WINDOWS);
|
||||
int counter = 0;
|
||||
|
||||
out.println("# Default " + fileName + " Break Test");
|
||||
out.println("# Generated: " + Default.getDate() + ", MED");
|
||||
out.println("#");
|
||||
out.println("# Format:");
|
||||
out.println("# <string> (# <comment>)? ");
|
||||
out.println("# <string> contains hex Unicode code points, with ");
|
||||
out.println("#\t" + BREAK + " wherever there is a break opportunity, and ");
|
||||
out.println("#\t" + NOBREAK + " wherever there is not.");
|
||||
out.println("# <comment> the format can change, but currently it shows:");
|
||||
out.println("#\t- the sample character name");
|
||||
out.println("#\t- (x) the line_break property* for the sample character");
|
||||
out.println("#\t- [x] the rule that determines whether there is a break or not");
|
||||
out.println("#");
|
||||
out.println("# Samples:");
|
||||
out.println("# The test currently takes all pairs of linebreak types*,");
|
||||
out.println("# picks a sample for each type, and generates three strings: ");
|
||||
out.println("#\t- the pair alone");
|
||||
out.println("#\t- the pair alone with an imbeded space");
|
||||
out.println("#\t- the pair alone with embedded combining marks");
|
||||
out.println("# The sample for each type is simply the first code point (above NULL)");
|
||||
out.println("# with that property.");
|
||||
out.println("# * Note:");
|
||||
out.println("#\t- SG is omitted");
|
||||
out.println("#\t- 3 different Jamo characters and a supplementary character are added");
|
||||
out.println("#\t The syllable types for the Jamo (L, V, T) are displayed in comments");
|
||||
out.println("#\t instead of the linebreak property");
|
||||
out.println("# These samples may be extended in the future.");
|
||||
out.println("#");
|
||||
|
||||
for (int ii = 0; ii < getLimit(); ++ii) {
|
||||
int i = TypeOrder[ii];
|
||||
if (i == LB_SG) continue;
|
||||
String before = samples[i];
|
||||
|
||||
for (int jj = 0; jj < getLimit(); ++jj) {
|
||||
Utility.dot(counter);
|
||||
int j = TypeOrder[jj];
|
||||
if (j == LB_SG) continue;
|
||||
String after = samples[j];
|
||||
// do line straight
|
||||
int len = genTestItems(before, after, testCase);
|
||||
for (int q = 0; q < len; ++q) {
|
||||
printLine(out, testCase[q], k != 0 && q == 0, false);
|
||||
++counter;
|
||||
}
|
||||
}
|
||||
}
|
||||
out.println("# Lines: " + counter);
|
||||
out.close();
|
||||
}
|
||||
}
|
||||
|
||||
// stuff that subclasses need to override
|
||||
public int genTestItems(String before, String after, String[] results) {
|
||||
results[0] = before + after;
|
||||
results[1] = before + " " + after;
|
||||
results[2] = before + "\u0301\u0308" + after;
|
||||
return 3;
|
||||
}
|
||||
|
||||
// stuff that subclasses need to override
|
||||
boolean skipType(byte type) {
|
||||
return type == LB_AI || type == LB_SA || type == LB_SG || type == LB_XX;
|
||||
}
|
||||
|
||||
// stuff that subclasses need to override
|
||||
public String getTypeID(int cp) {
|
||||
byte result = getType(cp);
|
||||
if (result == LB_SUP) return "SUP";
|
||||
if (result >= LB_LIMIT) return hNames[result - LB_LIMIT];
|
||||
return Default.ucd().getLineBreakID_fromIndex(result);
|
||||
}
|
||||
|
||||
// stuff that subclasses need to override
|
||||
public byte getType(int cp) {
|
||||
if (cp > 0xFFFF) return LB_SUP;
|
||||
byte result = getHangulType(cp);
|
||||
if (result != hNot) return (byte)(result + LB_LIMIT);
|
||||
return Default.ucd().getLineBreak(cp);
|
||||
}
|
||||
|
||||
// Number of TypeOrder entries that run() iterates over when writing the test files.
public int getLimit() {
|
||||
return LB2_LIMIT;
|
||||
}
|
||||
|
||||
// Number of TypeOrder entries that generateTable() iterates over when building the table.
public int getTableLimit() {
|
||||
return LB_SUP; // skip last;
|
||||
}
|
||||
|
||||
|
||||
public void generateTable(PrintWriter out, boolean recommended) {
|
||||
String width = "width='" + (100 / (getTableLimit() + 1)) + "%'";
|
||||
out.print("<table border='1' cellspacing='0'><tr><th " + width + "></th>");
|
||||
byte type;
|
||||
for (int i = 0; i < getTableLimit(); ++i) {
|
||||
type = TypeOrder[i];
|
||||
if (skipType(type)) continue;
|
||||
|
||||
String h = getTypeID(samples[TypeOrder[i]]);
|
||||
out.print("<th " + width + ">" + h + "</th>");
|
||||
}
|
||||
out.print("</tr>");
|
||||
String[] rule = new String[1];
|
||||
String[] rule2 = new String[1];
|
||||
for (int i = 0; i < getTableLimit(); ++i) {
|
||||
type = TypeOrder[i];
|
||||
if (skipType(type)) continue;
|
||||
|
||||
String before = samples[type];
|
||||
String line = "<tr><th>" + getTypeID(before) + "</th>";
|
||||
for (int j = 0; j < getTableLimit(); ++j) {
|
||||
type = TypeOrder[j];
|
||||
if (skipType(type)) continue;
|
||||
|
||||
String after = samples[type];
|
||||
String t = getTableEntry(before, after, recommended, rule);
|
||||
String background = "";
|
||||
String t2 = getTableEntry(before, after, !recommended, rule2);
|
||||
if (!t.equals(t2)) {
|
||||
if (t.equals(NOBREAK)) {
|
||||
background = " bgcolor='#CCFFFF'";
|
||||
} else {
|
||||
background = " bgcolor='#FFFF00'";
|
||||
}
|
||||
} else if (t.equals(NOBREAK)) {
|
||||
background = " bgcolor='#CCCCFF'";
|
||||
}
|
||||
line += "<th title='" + rule[0] + "'" + background + ">" + t + "</th>";
|
||||
}
|
||||
out.println(line + "</tr>");
|
||||
}
|
||||
out.println("</table>");
|
||||
}
|
||||
|
||||
public String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
|
||||
String t = "_";
|
||||
boolean spaceBreak = isBreak(before + " " + after, before.length() + 1, recommended);
|
||||
String spaceRule = rule;
|
||||
|
||||
boolean spaceBreak2 = isBreak(before + " " + after, before.length(), recommended);
|
||||
String spaceRule2 = rule;
|
||||
|
||||
boolean normalBreak = isBreak(before + after, before.length(), recommended);
|
||||
String normalRule = rule;
|
||||
|
||||
if (!normalBreak) {
|
||||
if (!spaceBreak && !spaceBreak2) {
|
||||
t = "^";
|
||||
rule = spaceRule.equals(normalRule) ? normalRule : spaceRule + "/" + normalRule;
|
||||
if (!spaceRule2.equals(normalRule) && !spaceRule2.equals(spaceRule)) {
|
||||
rule += "/" + spaceRule2;
|
||||
}
|
||||
} else {
|
||||
t = "%";
|
||||
rule = normalRule;
|
||||
}
|
||||
}
|
||||
ruleOut[0] = rule;
|
||||
return t;
|
||||
}
|
||||
|
||||
static final String BREAK = "\u00F7";
|
||||
static final String NOBREAK = "\u00D7";
|
||||
|
||||
public void printLine(PrintWriter out, String source, boolean comments, boolean recommended) {
|
||||
int cp;
|
||||
StringBuffer string = new StringBuffer();
|
||||
StringBuffer comment = new StringBuffer("\t# ");
|
||||
String status = isBreak(source, 0, recommended) ? BREAK : NOBREAK;
|
||||
string.append(status);
|
||||
comment.append(' ').append(status).append(" [").append(rule).append(']');
|
||||
|
||||
for (int offset = 0; offset < source.length(); offset += UTF16.getCharCount(cp)) {
|
||||
|
||||
cp = UTF16.charAt(source, offset);
|
||||
if (string.length() > 0) {
|
||||
string.append(' ');
|
||||
comment.append(' ');
|
||||
}
|
||||
|
||||
string.append(Utility.hex(cp));
|
||||
comment.append(Default.ucd().getName(cp) + " (" + getTypeID(cp) + ")");
|
||||
|
||||
status = isBreak(source, offset + UTF16.getCharCount(cp), recommended) ? BREAK : NOBREAK;
|
||||
string.append(' ').append(status);
|
||||
comment.append(' ').append(status).append(" [").append(rule).append(']');
|
||||
}
|
||||
|
||||
if (comments) string.append(comment);
|
||||
out.println(string);
|
||||
}
|
||||
|
||||
public void findSamples() {
|
||||
for (int i = 1; i <= 0x10FFFF; ++i) {
|
||||
if (!Default.ucd().isAllocated(i)) continue;
|
||||
if (0xD800 <= i && i <= 0xDFFF) continue;
|
||||
if(i == 0x1100) {
|
||||
System.out.print("here");
|
||||
}
|
||||
byte lb = getType(i);
|
||||
if (samples[lb] == null) {
|
||||
samples[lb] = UTF16.valueOf(i);
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < TypeOrder.length; ++i) {
|
||||
String sample = samples[i];
|
||||
System.out.println(getTypeID(sample) + ":\t" + Default.ucd().getCodeAndName(sample));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public String getTypeID(String s) {
|
||||
if (s == null) return "<null>";
|
||||
if (s.length() == 1) return getTypeID(s.charAt(0));
|
||||
StringBuffer result = new StringBuffer();
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
cp = UTF32.char32At(s, i);
|
||||
if (i > 0) result.append(" ");
|
||||
result.append(getTypeID(cp));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public int findLastNon(String source, int offset, byte notLBType, boolean recommended) {
|
||||
int cp;
|
||||
for (int i = offset-1; i >= 0; i -= UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(source, i);
|
||||
byte f = getResolvedType(cp, recommended);
|
||||
if (f != notLBType) return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
public byte getResolvedType (int cp, boolean recommended) {
|
||||
// LB 1 Assign a line break category to each character of the input.
|
||||
// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
|
||||
byte result = getType(cp);
|
||||
switch (result) {
|
||||
case LB_AI: result = LB_AI; break;
|
||||
// case LB_CB: result = LB_ID; break;
|
||||
case LB_SA: result = LB_AL; break;
|
||||
// case LB_SG: result = LB_XX; break; Surrogates; will never occur
|
||||
case LB_XX: result = LB_AL; break;
|
||||
}
|
||||
if (recommended) {
|
||||
if (getHangulType(cp) != hNot) {
|
||||
result = LB_ID;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public boolean onCodepointBoundary(String s, int offset) {
|
||||
if (offset < 0 || offset > s.length()) return false;
|
||||
if (offset == 0 || offset == s.length()) return true;
|
||||
if (UTF16.isLeadSurrogate(s.charAt(offset-1))
|
||||
&& UTF16.isTrailSurrogate(s.charAt(offset))) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// find out whether there is a break at offset
|
||||
// WARNING: as a side effect, sets "rule"
|
||||
|
||||
public boolean isBreak(String source, int offset, boolean recommended) {
|
||||
|
||||
// LB 1 Assign a line break category to each character of the input.
|
||||
// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
|
||||
// this is taken care of in the getResolvedType function
|
||||
|
||||
// LB 2a Never break at the start of text
|
||||
|
||||
rule="2a";
|
||||
if (offset <= 0) return false;
|
||||
|
||||
// LB 2b Always break at the end of text
|
||||
|
||||
rule="2b";
|
||||
if (offset >= source.length()) return true;
|
||||
|
||||
|
||||
// UTF-16: never break in the middle of a code point
|
||||
if (!onCodepointBoundary(source, offset)) return false;
|
||||
|
||||
|
||||
// now get the character before and after, and their types
|
||||
|
||||
|
||||
int cpBefore = UTF16.charAt(source, offset-1);
|
||||
int cpAfter = UTF16.charAt(source, offset);
|
||||
|
||||
byte before = getResolvedType(cpBefore, recommended);
|
||||
byte after = getResolvedType(cpAfter, recommended);
|
||||
|
||||
|
||||
rule="3a";
|
||||
// Always break after hard line breaks (but never between CR and LF).
|
||||
// CR ^ LF
|
||||
if (before == LB_CR && after == LB_LF) return false;
|
||||
if (before == LB_BK || before == LB_LF || before == LB_CR) return true;
|
||||
|
||||
//LB 3b Don’t break before hard line breaks.
|
||||
rule="3b";
|
||||
if (after == LB_BK || after == LB_LF | after == LB_CR) return false;
|
||||
|
||||
// LB 4 Don’t break before spaces or zero-width space.
|
||||
// × SP
|
||||
// × ZW
|
||||
|
||||
rule="4";
|
||||
if (after == LB_SP || after == LB_ZW) return false;
|
||||
|
||||
// LB 5 Break after zero-width space.
|
||||
// ZW ÷
|
||||
rule="5";
|
||||
if (before == LB_ZW) return true;
|
||||
|
||||
// LB 6 Don’t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos.
|
||||
rule="6";
|
||||
if (after == LB_CM) return false;
|
||||
|
||||
if (before == LB_L && (after == LB_L || after == LB_V || after == LB_LV || after == LB_LVT)) return false;
|
||||
|
||||
if ((before == LB_LV || before == LB_V) && (after == LB_V || after == LB_T)) return false;
|
||||
|
||||
if ((before == LB_LVT || before == LB_T) && (after == LB_T)) return false;
|
||||
|
||||
boolean setBase = false;
|
||||
if (before == LB_CM) {
|
||||
setBase = true;
|
||||
int backOffset = findLastNon(source, offset, LB_CM, recommended);
|
||||
if (backOffset < 0) {
|
||||
before = LB_ID;
|
||||
} else {
|
||||
before = getResolvedType(UTF16.charAt(source, backOffset), recommended);
|
||||
}
|
||||
}
|
||||
|
||||
// LB 7 In all of the following rules, if a space is the base character for a combining mark,
|
||||
// the space is changed to type ID. In other words, break before SP CM* in the same cases as
|
||||
// one would break before an ID.
|
||||
rule="7";
|
||||
if (setBase && before == LB_SP) before = LB_ID;
|
||||
|
||||
// LB 8 Don’t break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces.
|
||||
// × CL, × EX, × IS, × SY
|
||||
rule="8";
|
||||
if (after == LB_CL || after == LB_EX || after == LB_SY | after == LB_IS) return false;
|
||||
|
||||
|
||||
// find the last non-space character; we will need it
|
||||
byte lastNonSpace = before;
|
||||
if (lastNonSpace == LB_SP) {
|
||||
int backOffset = findLastNon(source, offset, LB_CM, recommended);
|
||||
if (backOffset >= 0) {
|
||||
lastNonSpace = getResolvedType(UTF16.charAt(source, backOffset), recommended);
|
||||
}
|
||||
}
|
||||
|
||||
// LB 9 Don’t break after ‘[’, even after spaces.
|
||||
// OP SP* ×
|
||||
rule="9";
|
||||
if (lastNonSpace == LB_OP) return false;
|
||||
|
||||
// LB 10 Don’t break within ‘”[’, , even with intervening spaces.
|
||||
// QU SP* × OP
|
||||
rule="10";
|
||||
if (lastNonSpace == LB_QU && after == LB_OP) return false;
|
||||
|
||||
// LB 11 Don’t break within ‘]h’, even with intervening spaces.
|
||||
// CL SP* × NS
|
||||
rule="11";
|
||||
if (lastNonSpace == LB_CL && after == LB_NS) return false;
|
||||
|
||||
// LB 11a Don’t break within ‘——’, even with intervening spaces.
|
||||
// B2 × B2
|
||||
rule="11a";
|
||||
if (lastNonSpace == LB_B2 && after == LB_B2) return false;
|
||||
|
||||
|
||||
if (recommended) {
|
||||
// LB 13 Don’t break before or after NBSP or WORD JOINER
|
||||
// × GL
|
||||
// GL ×
|
||||
|
||||
rule="11b";
|
||||
if (after == LB_GL || before == LB_GL) return false;
|
||||
}
|
||||
|
||||
// [Note: by this time, all of the "X" in the table are accounted for. We can safely break after spaces.]
|
||||
|
||||
rule="12";
|
||||
// LB 12 Break after spaces
|
||||
// SP ÷
|
||||
|
||||
if (before == LB_SP) return true;
|
||||
|
||||
if (!recommended) {
|
||||
// LB 13 Don’t break before or after NBSP or WORD JOINER
|
||||
// × GL
|
||||
// GL ×
|
||||
|
||||
rule="13";
|
||||
if (after == LB_GL || before == LB_GL) return false;
|
||||
}
|
||||
|
||||
rule="14";
|
||||
// LB 14 Don’t break before or after ‘”’
|
||||
// × QU
|
||||
// QU ×
|
||||
if (before == LB_QU || after == LB_QU) return false;
|
||||
|
||||
// LB 15 Don’t break before hyphen-minus, other hyphens, fixed-width spaces,
|
||||
// small kana and other non- starters, or after acute accents:
|
||||
// × BA
|
||||
// × HY
|
||||
// × NS
|
||||
// BB ×
|
||||
|
||||
if (recommended) {
|
||||
// LB 14a Break before and after CB
|
||||
// CB ÷
|
||||
// ÷ CB
|
||||
if (before == LB_CB || after == LB_CB) return true;
|
||||
|
||||
}
|
||||
|
||||
rule="15";
|
||||
if (after == LB_NS) return false;
|
||||
if (after == LB_HY) return false;
|
||||
if (after == LB_BA) return false;
|
||||
if (before == LB_BB) return false;
|
||||
|
||||
if (!recommended) {
|
||||
// LB 15b Break after hyphen-minus, and before acute accents:
|
||||
// HY ÷
|
||||
// ÷ BB
|
||||
|
||||
rule="15b";
|
||||
if (before == LB_HY) return true;
|
||||
if (after == LB_BB) return true;
|
||||
}
|
||||
|
||||
// LB 16 Don’t break between two ellipses, or between letters or numbers and ellipsis:
|
||||
// AL × IN
|
||||
// ID × IN
|
||||
// IN × IN
|
||||
// NU × IN
|
||||
// Examples: ’9...’, ‘a...’, ‘H...’
|
||||
rule="16";
|
||||
if ((before == LB_NU || before == LB_AL || before == LB_ID) && after == LB_IN) return false;
|
||||
if (before == LB_IN && after == LB_IN) return false;
|
||||
|
||||
// Don't break alphanumerics.
|
||||
// LB 17 Don’t break within ‘a9’, ‘3a’, or ‘H%’
|
||||
// ID × PO
|
||||
// AL × NU
|
||||
// NU × AL
|
||||
// Numbers are of the form PR ? ( OP | HY ) ? NU (NU | IS) * CL ? PO ?
|
||||
// Examples: $(12.35) 2,1234 (12)¢ 12.54¢
|
||||
// This is approximated with the following rules. (Some cases already handled above,
|
||||
// like ‘9,’, ‘[9’.)
|
||||
rule="17";
|
||||
if (before == LB_ID && after == LB_PO) return false;
|
||||
if (before == LB_AL && after == LB_NU) return false;
|
||||
if (before == LB_NU && after == LB_AL) return false;
|
||||
|
||||
// LB 18 Don’t break between the following pairs of classes.
|
||||
// CL × PO
|
||||
// HY × NU
|
||||
// IS × NU
|
||||
// NU × NU
|
||||
// NU × PO
|
||||
// PR × AL
|
||||
// PR × HY
|
||||
// PR × ID
|
||||
// PR × NU
|
||||
// PR × OP
|
||||
// SY × NU
|
||||
// Example pairs: ‘$9’, ‘$[’, ‘$-‘, ‘-9’, ‘/9’, ‘99’, ‘,9’, ‘9%’ ‘]%’
|
||||
|
||||
rule="18";
|
||||
if (before == LB_CL && after == LB_PO) return false;
|
||||
if (before == LB_HY && after == LB_NU) return false;
|
||||
if (before == LB_IS && after == LB_NU) return false;
|
||||
if (before == LB_NU && after == LB_NU) return false;
|
||||
if (before == LB_NU && after == LB_PO) return false;
|
||||
|
||||
if (before == LB_PR && after == LB_AL) return false;
|
||||
if (before == LB_PR && after == LB_HY) return false;
|
||||
if (before == LB_PR && after == LB_ID) return false;
|
||||
if (before == LB_PR && after == LB_NU) return false;
|
||||
if (before == LB_PR && after == LB_OP) return false;
|
||||
|
||||
if (before == LB_SY && after == LB_NU) return false;
|
||||
|
||||
if (recommended) {
|
||||
// LB 15b Break after hyphen-minus, and before acute accents:
|
||||
// HY ÷
|
||||
// ÷ BB
|
||||
|
||||
rule="18b";
|
||||
if (before == LB_HY) return true;
|
||||
if (after == LB_BB) return true;
|
||||
}
|
||||
|
||||
// LB 19 Don’t break between alphabetics (“at”)
|
||||
// AL × AL
|
||||
|
||||
rule="19";
|
||||
if (before == LB_AL && after == LB_AL) return false;
|
||||
|
||||
// LB 20 Break everywhere else
|
||||
// ALL ÷
|
||||
// ÷ ALL
|
||||
|
||||
rule="20";
|
||||
return true;
|
||||
}
|
||||
|
||||
static class GenerateWordBreakTest extends GenerateLineBreakTest {
|
||||
|
||||
static final byte CR = 0, LF = 1, Control = 2, Extend = 3, Link = 4, CGJ = 5, Base = 6, LetterBase = 7, Other = 8,
|
||||
oLIMIT = 9, // RESET THIS IF LIST ABOVE CHANGES!
|
||||
L = oLIMIT + hL, V = oLIMIT + hV, T = oLIMIT + hT, LV = oLIMIT + hLV, LVT = oLIMIT + hLVT,
|
||||
LIMIT = LVT + 1;
|
||||
|
||||
static final String[] Names = {"CR", "LF", "CTL", "Extend", "Link", "CGJ", "Base", "LetterBase", "Other" };
|
||||
|
||||
static UCDProperty extendProp = UnifiedBinaryProperty.make(DERIVED | GraphemeExtend);
|
||||
static UCDProperty baseProp = UnifiedBinaryProperty.make(DERIVED | GraphemeBase);
|
||||
static UCDProperty linkProp = UnifiedBinaryProperty.make(BINARY_PROPERTIES | GraphemeLink);
|
||||
|
||||
{
|
||||
fileName = "Word";
|
||||
TypeOrder = new byte[LIMIT];
|
||||
for (byte i = 0; i < TypeOrder.length; ++i) {
|
||||
TypeOrder[i] = i;
|
||||
}
|
||||
}
|
||||
|
||||
boolean skipType(byte type) {
|
||||
return false;
|
||||
}
|
||||
|
||||
public int getLimit() {
|
||||
return LIMIT;
|
||||
}
|
||||
|
||||
public int getTableLimit() {
|
||||
return LIMIT;
|
||||
}
|
||||
|
||||
// stuff that subclasses need to override
|
||||
public int genTestItems(String before, String after, String[] results) {
|
||||
results[0] = before + after;
|
||||
return 1;
|
||||
}
|
||||
|
||||
public String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
|
||||
boolean normalBreak = isBreak(before + after, before.length(), recommended);
|
||||
String normalRule = rule;
|
||||
ruleOut[0] = rule;
|
||||
return normalBreak ? BREAK : NOBREAK;
|
||||
}
|
||||
|
||||
// stuff that subclasses need to override
|
||||
public String getTypeID(int cp) {
|
||||
byte type = getType(cp);
|
||||
if (type >= oLIMIT) return hNames[type - oLIMIT];
|
||||
return Names[type];
|
||||
}
|
||||
|
||||
// stuff that subclasses need to override
|
||||
public byte getType(int cp) {
|
||||
// single characters
|
||||
if (cp == 0xA) return LF;
|
||||
if (cp == 0xD) return CR;
|
||||
if (cp == 0x034F) return CGJ;
|
||||
if (cp == 0x2028 || cp == 0x2029) return Control;
|
||||
|
||||
// Hangul
|
||||
byte result = getHangulType(cp);
|
||||
if (result != hNot) return (byte)(result + oLIMIT);
|
||||
|
||||
// other properties
|
||||
// category based
|
||||
byte cat = Default.ucd().getCategory(cp);
|
||||
if (cat == Cc) return Control;
|
||||
if (cat == Cf) return Extend;
|
||||
if (((1<<cat) & LETTER_MASK) != 0) return LetterBase;
|
||||
|
||||
// other binary properties
|
||||
if (linkProp.hasValue(cp)) return Link;
|
||||
if (extendProp.hasValue(cp)) return Extend;
|
||||
if (baseProp.hasValue(cp)) return Base;
|
||||
|
||||
return Other;
|
||||
}
|
||||
|
||||
public byte getResolvedType(int cp, boolean recommended) {
|
||||
return getType(cp);
|
||||
}
|
||||
|
||||
public boolean isBreak(String source, int offset, boolean recommended) {
|
||||
rule="1";
|
||||
if (offset < 0 || offset > source.length()) return false;
|
||||
if (offset == 0) return true;
|
||||
|
||||
rule = "2";
|
||||
if (offset == source.length()) return true;
|
||||
|
||||
// UTF-16: never break in the middle of a code point
|
||||
if (!onCodepointBoundary(source, offset)) return false;
|
||||
|
||||
// now get the character before and after, and their types
|
||||
|
||||
|
||||
int cpBefore = UTF16.charAt(source, offset-1);
|
||||
int cpAfter = UTF16.charAt(source, offset);
|
||||
|
||||
byte before = getResolvedType(cpBefore, recommended);
|
||||
byte after = getResolvedType(cpAfter, recommended);
|
||||
|
||||
rule = "3";
|
||||
if (before == CR && after == LF) return false;
|
||||
|
||||
rule = "4";
|
||||
if (before == CR || before == LF || before == Control
|
||||
|| after == Control || after == LF || after == CR) return true;
|
||||
|
||||
rule = "6";
|
||||
if (before == L && (after == L || after == V || after == LV || after == LVT)) return false;
|
||||
|
||||
rule = "7";
|
||||
if ((before == LV || before == V) && (after == V || after == T)) return false;
|
||||
|
||||
rule = "8";
|
||||
if ((before == LVT || before == T) && (after == T)) return false;
|
||||
|
||||
rule = "9";
|
||||
if (after == Extend) return false;
|
||||
|
||||
if (recommended) {
|
||||
if (after == Link || after == CGJ) return false;
|
||||
} else {
|
||||
|
||||
// Do not break around a CGJ.
|
||||
rule = "10";
|
||||
if (before == CGJ && (after == Base
|
||||
|| after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT)) return false;
|
||||
rule = "11";
|
||||
if (after == CGJ) return false;
|
||||
|
||||
// Do not break between linking characters and letters, or before linking characters. This provides for Indic graphemes, where virama (halant) will link character clusters together.
|
||||
|
||||
rule = "12";
|
||||
//Link Extend* × LetterBase (12)
|
||||
if (after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT) {
|
||||
int backOffset = findLastNon(source, offset, Extend, recommended);
|
||||
if (backOffset >= 0) {
|
||||
byte last = getResolvedType(UTF16.charAt(source, backOffset), recommended);
|
||||
if (last == Link) return false;
|
||||
}
|
||||
}
|
||||
|
||||
rule = "13";
|
||||
if (after == Link) return false;
|
||||
}
|
||||
|
||||
// Otherwise break after all characters.
|
||||
rule = "14";
|
||||
return true;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
@ -1,125 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateNamedSequences.java,v $
|
||||
* $Date: 2006/04/05 22:12:45 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
public final class GenerateNamedSequences implements UCD_Types {
|
||||
|
||||
static final boolean DEBUG = false;
|
||||
|
||||
static public String showVarGlyphs(String code0, String code1, String shape, String description) {
|
||||
if (DEBUG) System.out.println(code0 + ", " + code1 + ", [" + shape + "]");
|
||||
|
||||
String abbShape = "";
|
||||
if (shape.length() != 0) {
|
||||
abbShape = '-' + shape.substring(0,4);
|
||||
if (description.indexOf("feminine") >= 0) abbShape += "fem";
|
||||
}
|
||||
|
||||
return "<img alt='U+" + code0 + "+U+" + code1 + "/" + shape
|
||||
+ "' src='http://www.unicode.org/cgi-bin/varglyph?24-" +code0 + "-" + code1 + abbShape + "'>";
|
||||
}
|
||||
|
||||
/*
|
||||
# Field 0: the variation sequence
|
||||
# Field 1: the description of the desired appearance
|
||||
# Field 2: where the appearance is only different in in particular shaping environments
|
||||
# this field lists them. The possible values are: isolated, initial, medial, final.
|
||||
# If more than one is present, there are spaces between them.
|
||||
*/
|
||||
static public void generate() throws IOException {
|
||||
|
||||
|
||||
// read the data and compose the table
|
||||
|
||||
String table = "<table><tr><th width='10%'>Rep Glyph</th><th>Hex Sequence</th><th>Name</th><th>Copyable</th></tr>";
|
||||
|
||||
String[] splits = new String[4];
|
||||
String[] codes = new String[20];
|
||||
String[] shapes = new String[4];
|
||||
|
||||
BufferedReader in = Utility.openUnicodeFile("NamedSequences", Default.ucdVersion(), true, Utility.LATIN1);
|
||||
Transliterator unicodexml = Transliterator.getInstance("hex/xml");
|
||||
while (true) {
|
||||
String line = Utility.readDataLine(in);
|
||||
if (line == null) break;
|
||||
line = line.trim();
|
||||
if (line.length() == 0) continue;
|
||||
|
||||
int count = Utility.split(line, ';', splits);
|
||||
String name = splits[0];
|
||||
int codeCount = Utility.split(splits[1], ' ', codes);
|
||||
StringBuffer codeBuffer = new StringBuffer();
|
||||
for (int i = 0; i < codeCount; ++i) {
|
||||
UTF16.append(codeBuffer, Integer.parseInt(codes[i],16));
|
||||
}
|
||||
String codeWithHyphens = splits[1].replaceAll("\\s", "-");
|
||||
String codeAlt = "U+" + splits[1].replaceAll("\\s", " U+");
|
||||
String codeString = unicodexml.transliterate(codeBuffer.toString());
|
||||
|
||||
// <img alt="03E2" src="http://www.unicode.org/cgi-bin/refglyph?24-03E2" style="vertical-align:middle">
|
||||
|
||||
//table += "<tr><td><img alt='U+" + codes[0] + "' src='http://www.unicode.org/cgi-bin/refglyph?24-" + codes[0] + "'></td>\n";
|
||||
String imageName = "images/U" + codeWithHyphens + ".gif";
|
||||
if (splits[1].compareTo("1780") >= 0 && splits[1].compareTo("1800") < 0) {
|
||||
String codeNoSpaces2 = splits[1].replaceAll("\\s", "");
|
||||
imageName = "http://www.unicode.org/reports/tr28/images/" + codeNoSpaces2 + ".gif";
|
||||
}
|
||||
table += "<tr>"
|
||||
+ "<td class='copy'><img alt='(" + codeAlt + ")' src='" + imageName + "'><br><tt>"
|
||||
+ splits[1] + "</tt></td>"
|
||||
+ "<td>" + splits[1] + "</td>"
|
||||
+ "</td><td>" + name + "</td>"
|
||||
+ "<td class='copy'>" + codeString + "</td>"
|
||||
+ "</tr>\n";
|
||||
System.out.println(splits[1] + "\t" + codeString);
|
||||
}
|
||||
in.close();
|
||||
table += "</table>";
|
||||
|
||||
// now write out the results
|
||||
|
||||
String directory = "DerivedData/";
|
||||
String filename = directory + "NamedSequences" + UnicodeDataFile.getHTMLFileSuffix(true);
|
||||
PrintWriter out = Utility.openPrintWriter(filename, Utility.LATIN1_UNIX);
|
||||
/*
|
||||
String[] batName = {""};
|
||||
String mostRecent = UnicodeDataFile.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName);
|
||||
|
||||
String version = Default.ucd().getVersion();
|
||||
int lastDot = version.lastIndexOf('.');
|
||||
String updateDirectory = version.substring(0,lastDot) + "-Update";
|
||||
int updateV = version.charAt(version.length()-1) - '0';
|
||||
if (updateV != 0) updateDirectory += (char)('1' + updateV);
|
||||
if (DEBUG) System.out.println("updateDirectory: " + updateDirectory);
|
||||
*/
|
||||
|
||||
String[] replacementList = {
|
||||
"@revision@", Default.ucd().getVersion(),
|
||||
//"@updateDirectory@", updateDirectory,
|
||||
"@date@", Default.getDate(),
|
||||
"@table@", table};
|
||||
|
||||
Utility.appendFile("com/ibm/text/UCD/NamedSequences-Template.html", Utility.UTF8, out, replacementList);
|
||||
|
||||
out.close();
|
||||
//Utility.renameIdentical(mostRecent, Utility.getOutputName(filename), batName[0]);
|
||||
}
|
||||
}
|
@ -1,136 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateStandardizedVariants.java,v $
|
||||
* $Date: 2006/04/05 22:12:44 $
|
||||
* $Revision: 1.7 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
public final class GenerateStandardizedVariants implements UCD_Types {
|
||||
|
||||
static final boolean DEBUG = false;
|
||||
|
||||
static public String showVarGlyphs(String code0, String code1, String shape, String description) {
|
||||
if (DEBUG) System.out.println(code0 + ", " + code1 + ", [" + shape + "]");
|
||||
|
||||
String abbShape = "";
|
||||
if (shape.length() != 0) {
|
||||
abbShape = '-' + shape.substring(0,4);
|
||||
if (description.indexOf("feminine") >= 0) abbShape += "fem";
|
||||
}
|
||||
|
||||
return "<img alt='U+" + code0 + "+U+" + code1 + "/" + shape
|
||||
+ "' src='http://www.unicode.org/cgi-bin/varglyph?24-" +code0 + "-" + code1 + abbShape + "'>";
|
||||
}
|
||||
|
||||
/*
|
||||
# Field 0: the variation sequence
|
||||
# Field 1: the description of the desired appearance
|
||||
# Field 2: where the appearance is only different in in particular shaping environments
|
||||
# this field lists them. The possible values are: isolated, initial, medial, final.
|
||||
# If more than one is present, there are spaces between them.
|
||||
*/
|
||||
static public void generate() throws IOException {
|
||||
|
||||
|
||||
// read the data and compose the table
|
||||
|
||||
String table = "<table><tr><th>Rep Glyph</th><th>Character Sequence</th><th>Context</th><th width='10%'>Alt Glyph</th><th>Description of variant appearance</th></tr>";
|
||||
|
||||
String[] splits = new String[4];
|
||||
String[] codes = new String[2];
|
||||
String[] shapes = new String[4];
|
||||
|
||||
BufferedReader in = Utility.openUnicodeFile("StandardizedVariants", Default.ucdVersion(), true, Utility.LATIN1);
|
||||
while (true) {
|
||||
String line = Utility.readDataLine(in);
|
||||
if (line == null) break;
|
||||
if (line.length() == 0) continue;
|
||||
|
||||
int count = Utility.split(line, ';', splits);
|
||||
int codeCount = Utility.split(splits[0], ' ', codes);
|
||||
int code = Utility.codePointFromHex(codes[0]);
|
||||
|
||||
// <img alt="03E2" src="http://www.unicode.org/cgi-bin/refglyph?24-03E2" style="vertical-align:middle">
|
||||
|
||||
table += "<tr><td><img alt='U+" + codes[0] + "' src='http://www.unicode.org/cgi-bin/refglyph?24-" + codes[0] + "'></td>\n";
|
||||
table += "<td>" + splits[0] + "</td>\n";
|
||||
|
||||
String shape = splits[2].trim();
|
||||
if (shape.equals("all")) shape = "";
|
||||
|
||||
table += "<td>" + Utility.replace(shape, " ", "<br>") + "</td>\n";
|
||||
|
||||
// http://www.unicode.org/cgi-bin/varglyph?24-1820-180B-fina
|
||||
// http://www.unicode.org/cgi-bin/varglyph?24-222A-FE00
|
||||
|
||||
table += "<td>";
|
||||
if (shape.length() == 0) {
|
||||
table += showVarGlyphs(codes[0], codes[1], "", "");
|
||||
} else {
|
||||
int shapeCount = Utility.split(shape, ' ', shapes);
|
||||
for (int i = 0; i < shapeCount; ++i) {
|
||||
if (i != 0) table += " ";
|
||||
table += showVarGlyphs(codes[0], codes[1], shapes[i], splits[1]);
|
||||
}
|
||||
}
|
||||
table += "</td>\n";
|
||||
|
||||
table += "<td>" + Default.ucd().getName(code) + " " + splits[1] + "</td>\n";
|
||||
table += "</tr>";
|
||||
}
|
||||
in.close();
|
||||
table += "</table>";
|
||||
|
||||
// now write out the results
|
||||
|
||||
String directory = "DerivedData/";
|
||||
String filename = directory + "StandardizedVariants" + UnicodeDataFile.getHTMLFileSuffix(true);
|
||||
PrintWriter out = Utility.openPrintWriter(filename, Utility.LATIN1_UNIX);
|
||||
//String[] batName = {""};
|
||||
//String mostRecent = UnicodeDataFile.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName);
|
||||
|
||||
String version = Default.ucd().getVersion();
|
||||
int lastDot = version.lastIndexOf('.');
|
||||
String updateDirectory;
|
||||
String partialFilename;
|
||||
if (version.compareTo("4.1.0") < 0) {
|
||||
updateDirectory = version.substring(0,lastDot) + "-Update";
|
||||
int updateV = version.charAt(version.length()-1) - '0';
|
||||
if (updateV != 0) updateDirectory += (char)('1' + updateV);
|
||||
if (DEBUG) System.out.println("updateDirectory: " + updateDirectory);
|
||||
partialFilename = "StandardizedVariants-" + Default.ucd().getVersion();
|
||||
} else if (version.compareTo("4.1.0") == 0) {
|
||||
updateDirectory = version.substring(0,lastDot) + "/ucd";
|
||||
partialFilename = "StandardizedVariants";
|
||||
} else {
|
||||
updateDirectory = version + "/ucd";
|
||||
partialFilename = "StandardizedVariants";
|
||||
}
|
||||
|
||||
|
||||
String[] replacementList = {
|
||||
"@revision@", Default.ucd().getVersion(),
|
||||
"@updateDirectory@", updateDirectory,
|
||||
"@filename@", partialFilename,
|
||||
"@date@", Default.getDate(),
|
||||
"@table@", table};
|
||||
|
||||
Utility.appendFile("com/ibm/text/UCD/StandardizedVariants-Template.html", Utility.UTF8, out, replacementList);
|
||||
|
||||
out.close();
|
||||
//Utility.renameIdentical(mostRecent, Utility.getOutputName(filename), batName[0]);
|
||||
}
|
||||
}
|
@ -1,516 +0,0 @@
|
||||
/*
|
||||
* Created on May 3, 2005
|
||||
* Copyright (C) 2004-2005, Unicode, Inc., International Business Machines Corporation, and others.
|
||||
* For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
*/
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.TransliteratorUtilities;
|
||||
import com.ibm.icu.dev.test.util.UnicodeLabel;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap.Composer;
|
||||
import com.ibm.icu.impl.CollectionUtilities;
|
||||
import com.ibm.icu.text.Collator;
|
||||
import com.ibm.icu.text.IDNA;
|
||||
import com.ibm.icu.text.StringPrepParseException;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
import com.ibm.icu.text.UTF16.StringComparator;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import com.ibm.text.UCD.GenerateHanTransliterator.MultiComparator;
|
||||
import com.ibm.text.UCD.TestData.RegexMatcher;
|
||||
import com.ibm.text.utility.Utility;
|
||||
|
||||
|
||||
class GenerateStringPrep implements UCD_Types {
|
||||
|
||||
public static void main (String[] args) throws IOException {
|
||||
//checkChars(false);
|
||||
new GenerateStringPrep().genStringPrep();
|
||||
System.out.println("Done");
|
||||
}
|
||||
|
||||
// Per-script character sets, indexed by script code; filled in by genStringPrep().
UnicodeSet[] coreChars = new UnicodeSet[100];
// Code points that are not NFD-normalized; filled in by genStringPrep().
UnicodeSet decomposable = new UnicodeSet();
// Code point -> category label for characters flagged for review by showCodes().
UnicodeMap suspect = new UnicodeMap();

// Property sources: "" (presumably the default/latest version -- confirm) and Unicode 3.2.0.
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
ToolUnicodePropertySource ups32 = ToolUnicodePropertySource.make("3.2.0");

UnicodeSet xid_continue = ups.getSet("XID_Continue=true");

// Extra characters treated as word characters in addition to XID_Continue.
UnicodeSet wordChars = new UnicodeSet();
{
    if (false) {
        // Disabled experiment: modifier letters with General_Category=Sk.
        wordChars.addAll(ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher()));
        wordChars.retainAll(ups.getSet("gc=Sk"));
    }
    String extraWordChars =
          "[\\u0027 \\u002D \\u002E \\u003A \\u00B7 \\u058A \\u05F3"
        + " \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0"
        + " \\u055A \\u02B9 \\u02BA]";
    wordChars.addAll(new UnicodeSet(extraWordChars));
}

// Pattern_Syntax characters that are not word characters.
UnicodeSet patternProp = ups.getSet("Pattern_Syntax=true");
{
    patternProp.removeAll(wordChars);
}

// Everything except NFKC_Quickcheck=NO.
UnicodeSet isNFKC = ups.getSet("NFKC_Quickcheck=NO");
{
    isNFKC.complement();
}

// gc=Me plus gc=Mn, minus default-ignorable code points.
UnicodeSet non_spacing = new UnicodeSet(ups.getSet("gc=Me"));
{
    non_spacing.addAll(ups.getSet("gc=Mn"));
    non_spacing.removeAll(ups.getSet("Default_Ignorable_Code_Point=true"));
}

// Complement of XID_Continue, minus the extra word characters.
UnicodeSet not_xid_continue = new UnicodeSet(xid_continue);
{
    not_xid_continue.complement();
    not_xid_continue.removeAll(wordChars);
}

UCD ucd = Default.ucd();
|
||||
|
||||
// English collator used to order characters in the reports.
static Collator uca0 = Collator.getInstance(ULocale.ENGLISH);
static {
    // Configure the shared collator once, at class-initialization time.
    // As an instance initializer this only ran when an object was constructed,
    // leaving the static comparator at default strength until then.
    uca0.setStrength(Collator.IDENTICAL);
}

// Comparator built from uca0 followed by UTF16.StringComparator
// (presumably the second acts as a tie-breaker -- see MultiComparator).
static GenerateHanTransliterator.MultiComparator uca
    = new GenerateHanTransliterator.MultiComparator(new Comparator[] {
        uca0, new UTF16.StringComparator()});

// Right-to-left (AL or R) and left-to-right characters; used by printlnSet()
// to decide whether a cell gets dir='rtl'.
UnicodeSet bidiR = new UnicodeSet(
    "[[:Bidi_Class=AL:][:Bidi_Class=R:]]");

UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]");
// Characters whose full uppercase mapping is themselves; filled in by genStringPrep().
UnicodeSet hasNoUpper = new UnicodeSet();
// hasNoUpper minus wordChars; assigned by genStringPrep().
UnicodeSet hasNoUpperMinus = new UnicodeSet();
BagFormatter bf = new BagFormatter();
UnicodeSet inIDN = new UnicodeSet();
// Characters whose full case folding is themselves; filled in by genStringPrep().
UnicodeSet isCaseFolded = new UnicodeSet();
|
||||
|
||||
void genStringPrep() throws IOException {
|
||||
//showScriptToBlock();
|
||||
bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
|
||||
bf.setUnicodePropertyFactory(ups);
|
||||
//bf.setValueSource(UnicodeLabel.NULL);
|
||||
if (false) {
|
||||
|
||||
System.out.println("word chars: " + bf.showSetNames(wordChars));
|
||||
System.out.println("pat: " + bf.showSetNames(patternProp));
|
||||
System.out.println("xid: " + bf.showSetNames(not_xid_continue));
|
||||
}
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
Utility.dot(cp);
|
||||
int cat = Default.ucd().getCategory(cp);
|
||||
if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
|
||||
if (!Default.nfd().isNormalized(cp)) decomposable.add(cp);
|
||||
// get IDNA
|
||||
int idnaType = getIDNAType(cp);
|
||||
idnaTypeSet[idnaType].add(cp);
|
||||
|
||||
String str = UTF16.valueOf(cp);
|
||||
if (str.equals(ucd.getCase(str, FULL, UPPER))) hasNoUpper.add(cp);
|
||||
if (str.equals(ucd.getCase(str, FULL, FOLD))) isCaseFolded.add(cp);
|
||||
|
||||
// scripts
|
||||
int script = ucd.getScript(cp);
|
||||
if (coreChars[script] == null)
|
||||
coreChars[script] = new UnicodeSet();
|
||||
coreChars[script].add(cp);
|
||||
}
|
||||
// fix characters with no uppercase
|
||||
hasNoUpperMinus = new UnicodeSet(hasNoUpper).removeAll(wordChars);
|
||||
System.out.println(bf.showSetNames(hasNoUpper));
|
||||
|
||||
Utility.fixDot();
|
||||
PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html");
|
||||
PrintWriter htmlOut2 = BagFormatter.openUTF8Writer(GEN_DIR, "script-chars.html");
|
||||
PrintWriter textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.txt");
|
||||
textOut.println('\uFEFF');
|
||||
textOut.println("For documentation, see idn-chars.html");
|
||||
|
||||
Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut,
|
||||
new String[] {"%date%", Default.getDate()});
|
||||
/*
|
||||
out
|
||||
.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
|
||||
out.println("<title>IDN Characters</title><style>");
|
||||
out.println("<!--");
|
||||
out.println(".script { font-size: 150%; background-color: #CCCCCC }");
|
||||
out.println(".Atomic { background-color: #CCCCFF }");
|
||||
out.println(".Atomic-no-uppercase { background-color: #CCFFCC }");
|
||||
out.println(".Non-XID { background-color: #FFCCCC }");
|
||||
out.println(".Decomposable { background-color: #FFFFCC }");
|
||||
out.println(".Pattern_Syntax { background-color: #FFCCFF }");
|
||||
|
||||
out.println("th { text-align: left }");
|
||||
out.println("-->");
|
||||
out.println("</style></head><body><table>");
|
||||
*/
|
||||
htmlOut.println("<table border='1' cellpadding='2' cellspacing='0'>");
|
||||
htmlOut2.println("<html><body><table border='1' cellpadding='2' cellspacing='0'>");
|
||||
|
||||
for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) {
|
||||
if (scriptCode == COMMON_SCRIPT
|
||||
|| scriptCode == INHERITED_SCRIPT)
|
||||
continue;
|
||||
showCodes(htmlOut, textOut, scriptCode, htmlOut2);
|
||||
}
|
||||
showCodes(htmlOut, textOut, COMMON_SCRIPT, htmlOut2);
|
||||
showCodes(htmlOut, textOut, INHERITED_SCRIPT, htmlOut2);
|
||||
|
||||
showCodes(htmlOut, textOut, non_spacing);
|
||||
htmlOut.println("</table></body></html>");
|
||||
htmlOut.close();
|
||||
htmlOut2.println("</table></body></html>");
|
||||
htmlOut2.close();
|
||||
bf.setMergeRanges(false);
|
||||
|
||||
textOut.println();
|
||||
textOut.println("# *** ADDITIONAL WORD CHARACTERS ***");
|
||||
textOut.println();
|
||||
bf.setValueSource("word-chars");
|
||||
bf.showSetNames(textOut, wordChars);
|
||||
|
||||
textOut.println();
|
||||
textOut.println("# *** FOR REVIEW ***");
|
||||
bf.setLabelSource(UnicodeLabel.NULL);
|
||||
for (Iterator it = new TreeSet(suspect.getAvailableValues()).iterator(); it.hasNext();) {
|
||||
textOut.println();
|
||||
String value = (String)it.next();
|
||||
bf.setValueSource(value);
|
||||
bf.showSetNames(textOut, suspect.getSet(value));
|
||||
}
|
||||
textOut.close();
|
||||
textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn_vs_cfnfkcid.txt");
|
||||
bf = new BagFormatter();
|
||||
bf.setUnicodePropertyFactory(ups);
|
||||
textOut.println();
|
||||
textOut.println("# *** Comparison of IDN with CF_NFKC_ID (case-folded, NFKC, XID), U3.2 only ***");
|
||||
UnicodeSet U32 = ups32.getSet("gc=cn").complement();
|
||||
UnicodeSet CF_NFKC_ID = new UnicodeSet(xid_continue).retainAll(isNFKC).retainAll(isCaseFolded).retainAll(U32);
|
||||
bf.showSetDifferences(textOut, "CF_NFKC_ID", CF_NFKC_ID, "IDN", idnaTypeSet[OK]);
|
||||
textOut.close();
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private void showScriptToBlock() {
|
||||
UnicodeMap scripts = ToolUnicodePropertySource.make("").getProperty("script").getUnicodeMap();
|
||||
UnicodeMap blocks = ToolUnicodePropertySource.make("").getProperty("block").getUnicodeMap();
|
||||
UnicodeMap.Composer myCompose = new UnicodeMap.Composer() {
|
||||
public Object compose(int codePoint, Object a, Object b) {
|
||||
return a + "\t" + b;
|
||||
}
|
||||
};
|
||||
UnicodeMap sb = ((UnicodeMap)scripts.cloneAsThawed()).composeWith(blocks, myCompose);
|
||||
for (Iterator it = sb.getAvailableValues(new TreeSet()).iterator(); it.hasNext();) {
|
||||
System.out.println(it.next());
|
||||
}
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
|
||||
Map scriptToGif = CollectionUtilities.asMap(script_to_gif);
|
||||
|
||||
static String[][] script_to_gif = {
|
||||
|
||||
{"Common","common.gif"}, //Miscellaneous_Symbols
|
||||
{"Inherited","combiningdiacritics.gif"}, //Combining_Diacritical_Marks
|
||||
{"Arabic","arabic.gif"}, //Arabic
|
||||
{"Armenian","armenian.gif"}, //Armenian
|
||||
{"Bengali","bengali.gif"}, //Bengali
|
||||
{"Bopomofo","bopomofo.gif"}, //Bopomofo
|
||||
{"Braille","braillesymbols.gif"}, //Braille_Patterns
|
||||
{"Buginese","buginese.gif"}, //Buginese
|
||||
{"Buhid","buhid.gif"}, //Buhid
|
||||
{"Canadian_Aboriginal","canadiansyllabics.gif"}, //Unified_Canadian_Aboriginal_Syllabics
|
||||
{"Cherokee","cherokee.gif"}, //Cherokee
|
||||
{"Coptic","coptic.gif"}, //Coptic
|
||||
{"Cypriot","cypriot.gif"}, //Cypriot_Syllabary
|
||||
{"Cyrillic","cyrillic.gif"}, //Cyrillic
|
||||
{"Deseret","deseret.gif"}, //Deseret
|
||||
{"Devanagari","devanagari.gif"}, //Devanagari
|
||||
{"Ethiopic","ethiopic.gif"}, //Ethiopic
|
||||
{"Georgian","georgian.gif"}, //Georgian
|
||||
{"Glagolitic","glagolitic.gif"}, //Glagolitic
|
||||
{"Gothic","gothic.gif"}, //Gothic
|
||||
{"Greek","greek.gif"}, //Greek_and_Coptic
|
||||
{"Gujarati","gujarati.gif"}, //Gujarati
|
||||
{"Gurmukhi","gurmukhi.gif"}, //Gurmukhi
|
||||
{"Han","cjkideographcompat.gif"}, //CJK_Compatibility_Ideographs
|
||||
{"Han","kangxiradicals.gif"}, //Kangxi_Radicals
|
||||
{"Hangul","hangulsyllables.gif"}, //Hangul_Syllables
|
||||
{"Hanunoo","hanunoo.gif"}, //Hanunoo
|
||||
{"Hebrew","hebrew.gif"}, //Hebrew
|
||||
{"Hiragana","hiragana.gif"}, //Hiragana
|
||||
{"Kannada","kannada.gif"}, //Kannada
|
||||
{"Katakana","katakana.gif"}, //Katakana
|
||||
{"Kharoshthi","kharoshthi.gif"}, //Kharoshthi
|
||||
{"Khmer","khmer.gif"}, //Khmer
|
||||
{"Lao","lao.gif"}, //Lao
|
||||
{"Latin","latin.gif"}, //Basic_Latin
|
||||
{"Limbu","limbu.gif"}, //Limbu
|
||||
{"Linear_B","linearbsyllabary.gif"}, //Linear_B_Syllabary
|
||||
{"Malayalam","malayalam.gif"}, //Malayalam
|
||||
{"Mongolian","mongolian.gif"}, //Mongolian
|
||||
{"Myanmar","myanmar.gif"}, //Myanmar
|
||||
{"New_Tai_Lue","newtailu.gif"}, //New_Tai_Lue
|
||||
{"Ogham","ogham.gif"}, //Ogham
|
||||
{"Old_Italic","olditalic.gif"}, //Old_Italic
|
||||
{"Old_Persian","oldpersiancuneiform.gif"}, //Old_Persian
|
||||
{"Oriya","oriya.gif"}, //Oriya
|
||||
{"Osmanya","osmanya.gif"}, //Osmanya
|
||||
{"Runic","runic.gif"}, //Runic
|
||||
{"Shavian","shavian.gif"}, //Shavian
|
||||
{"Sinhala","sinhala.gif"}, //Sinhala
|
||||
{"Syloti_Nagri","silotinagri.gif"}, //Syloti_Nagri
|
||||
{"Syriac","syriac.gif"}, //Syriac
|
||||
{"Tagalog","tagalog.gif"}, //Tagalog
|
||||
{"Tagbanwa","tagbanwa.gif"}, //Tagbanwa
|
||||
{"Tai_Le","taile.gif"}, //Tai_Le
|
||||
{"Tamil","tamil.gif"}, //Tamil
|
||||
{"Telugu","telugu.gif"}, //Telugu
|
||||
{"Thaana","thaana.gif"}, //Thaana
|
||||
{"Thai","thai.gif"}, //Thai
|
||||
{"Tibetan","tibetan.gif"}, //Tibetan
|
||||
{"Tifinagh","tifinagh.gif"}, //Tifinagh
|
||||
{"Ugaritic","ugaritic.gif"}, //Ugaritic
|
||||
{"Yi","yi.gif"}, //Yi_Syllables
|
||||
|
||||
};
|
||||
|
||||
// IDNA classifications returned by getIDNAType(); IDNA_TYPE_LIMIT is the number of classes.
static final int OK = 0;
static final int DELETED = 1;
static final int ILLEGAL = 2;
static final int REMAPPED = 3;
static final int IDNA_TYPE_LIMIT = 4;

// One set of code points per IDNA classification, indexed by the constants above.
UnicodeSet[] idnaTypeSet = new UnicodeSet[IDNA_TYPE_LIMIT];
{
    int slot = 0;
    while (slot < idnaTypeSet.length) {
        idnaTypeSet[slot] = new UnicodeSet();
        ++slot;
    }
}
|
||||
/**
|
||||
*
|
||||
*/
|
||||
static public int getIDNAType(int cp) {
|
||||
inbuffer.setLength(0);
|
||||
UTF16.append(inbuffer, cp);
|
||||
try {
|
||||
intermediate = IDNA.convertToASCII(inbuffer,
|
||||
IDNA.DEFAULT); // USE_STD3_RULES
|
||||
if (intermediate.length() == 0)
|
||||
return DELETED;
|
||||
outbuffer = IDNA.convertToUnicode(intermediate,
|
||||
IDNA.USE_STD3_RULES);
|
||||
} catch (StringPrepParseException e) {
|
||||
return ILLEGAL;
|
||||
} catch (Exception e) {
|
||||
System.out.println("Failure at: " + Utility.hex(cp));
|
||||
return ILLEGAL;
|
||||
}
|
||||
if (!TestData.equals(inbuffer, outbuffer))
|
||||
return REMAPPED;
|
||||
return OK;
|
||||
}
|
||||
static StringBuffer inbuffer = new StringBuffer();
|
||||
static StringBuffer intermediate, outbuffer;
|
||||
|
||||
UnicodeSet lowercase = new UnicodeSet("[:Lowercase:]");
|
||||
|
||||
/**
|
||||
* @param htmlOut
|
||||
* @param textOut TODO
|
||||
* @param scriptCode
|
||||
* @param htmlOut2 TODO
|
||||
* @param ucd
|
||||
* @param coreChars
|
||||
* @param decompChars
|
||||
*/
|
||||
private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode, PrintWriter htmlOut2) {
|
||||
if (coreChars[scriptCode] == null) return;
|
||||
String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode);
|
||||
script = Utility.getUnskeleton(script.toLowerCase(),true);
|
||||
System.out.println(script);
|
||||
|
||||
htmlOut.println();
|
||||
String scriptLine = "<tr><th class='script'><img src='images/" + ((String)scriptToGif.get(script)).toLowerCase()
|
||||
+ "'> Script: " + script + "</th></tr>";
|
||||
htmlOut.println(scriptLine);
|
||||
htmlOut2.println(scriptLine);
|
||||
textOut.println();
|
||||
textOut.println("#*** Script: " + script + " ***");
|
||||
UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
|
||||
|
||||
UnicodeSet deleted = extract(idnaTypeSet[DELETED], core);
|
||||
UnicodeSet illegal = extract(idnaTypeSet[ILLEGAL], core);
|
||||
UnicodeSet remapped = extract(idnaTypeSet[REMAPPED], core);
|
||||
|
||||
UnicodeSet remappedIsNFKC = extract(isNFKC, remapped);
|
||||
UnicodeSet remappedIsNFKCDecomp = extract(decomposable, remappedIsNFKC);
|
||||
|
||||
UnicodeSet decomp = extract(decomposable, core);
|
||||
UnicodeSet pattern = extract(patternProp, core);
|
||||
UnicodeSet non_id = extract(not_xid_continue, core);
|
||||
|
||||
UnicodeSet bicameralNoupper = new UnicodeSet();
|
||||
if (!hasNoUpper.containsAll(core)) {
|
||||
bicameralNoupper = extract(hasNoUpperMinus, core);
|
||||
}
|
||||
|
||||
UnicodeSet foo = new UnicodeSet(bicameralNoupper).addAll(non_id);
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(foo); it.next(); ) {
|
||||
String cat = Default.ucd().getCategoryID(it.codepoint);
|
||||
String name = Default.ucd().getName(it.codepoint);
|
||||
if (name.indexOf("MUSICAL SYMBOL") >= 0
|
||||
|| name.indexOf("DINGBA") >= 0
|
||||
|| name.indexOf("RADICAL ") >= 0
|
||||
) cat = "XX";
|
||||
suspect.put(it.codepoint, cat);
|
||||
}
|
||||
|
||||
if (core.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic", core, scriptCode, uca);
|
||||
if (bicameralNoupper.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", bicameralNoupper, scriptCode, uca);
|
||||
if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode, uca);
|
||||
if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode, uca);
|
||||
if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "NFD-Decomposable", decomp, scriptCode, uca);
|
||||
|
||||
if (remappedIsNFKC.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Atomic", remappedIsNFKC, scriptCode, uca);
|
||||
if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-NFD-Decomposable", remappedIsNFKCDecomp, scriptCode, uca);
|
||||
if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Compat", remapped, scriptCode, uca);
|
||||
if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode, uca);
|
||||
if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Prohibited", illegal, scriptCode, uca);
|
||||
}
|
||||
|
||||
private void showCodes(PrintWriter htmlOut, PrintWriter textOut, UnicodeSet uset) throws IOException {
|
||||
String script = Default.ucd().getScriptID_fromIndex((byte) INHERITED_SCRIPT);
|
||||
script = Utility.getUnskeleton(script.toLowerCase(),true);
|
||||
String scriptLine = "<tr><th class='script'><img src='images/"
|
||||
+ ((String)scriptToGif.get(script)).toLowerCase()
|
||||
+ "'> Script: " + script + "</th></tr>";
|
||||
htmlOut.println(scriptLine);
|
||||
UnicodeMap m = getPositions();
|
||||
|
||||
for (Iterator it = m.getAvailableValues(new TreeSet(uca)).iterator(); it.hasNext(); ) {
|
||||
String type = (String) it.next();
|
||||
UnicodeSet current = m.getSet(type).retainAll(non_spacing);
|
||||
if (current.size() == 0) continue;
|
||||
printlnSet(htmlOut, textOut, script, "Visible_Combining_Marks_" + type, current, INHERITED_SCRIPT, positionComparator);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @throws IOException
|
||||
*
|
||||
*/
|
||||
private UnicodeMap getPositions() throws IOException {
|
||||
UnicodeMap result = new UnicodeMap();
|
||||
BufferedReader in = bf.openUTF8Reader("C:\\DATA\\confusables\\", "positions.txt");
|
||||
String type="Undetermined";
|
||||
while (true) {
|
||||
String line = Utility.readDataLine(in);
|
||||
if (line == null) break;
|
||||
if (line.length() == 0) continue;
|
||||
if (line.startsWith("@")) {
|
||||
type = line.substring(1);
|
||||
continue;
|
||||
}
|
||||
String[] pieces = Utility.split(line, ';');
|
||||
String code = Utility.fromHex(pieces[0]);
|
||||
result.put(UTF16.charAt(code,0), type);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static Comparator positionComparator = new Comparator() {
|
||||
public int compare(Object o1, Object o2) {
|
||||
String s1 = (String)o1;
|
||||
String s2 = (String)o2;
|
||||
return Default.ucd().getName(s1).compareTo(Default.ucd().getName(s2));
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private UnicodeSet extract(UnicodeSet other, UnicodeSet core) {
|
||||
UnicodeSet decomp = new UnicodeSet(core).retainAll(other);
|
||||
core.removeAll(decomp);
|
||||
return decomp;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param htmlOut
|
||||
* @param textOut TODO
|
||||
* @param script TODO
|
||||
* @param unicodeset
|
||||
* @param scriptCode
|
||||
* @param comparator TODO
|
||||
* @param uca
|
||||
*/
|
||||
private void printlnSet(PrintWriter htmlOut, PrintWriter textOut,
|
||||
String script, String title, UnicodeSet unicodeset, int scriptCode, Comparator comparator) {
|
||||
if (unicodeset == null)
|
||||
return;
|
||||
int size = unicodeset.size();
|
||||
String dir = unicodeset.containsSome(bidiR)
|
||||
&& unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
|
||||
htmlOut.println("<tr><th class='" + title + "'><a href='#" +
|
||||
title + "'>" + title + "</a> ("
|
||||
+ TestData.nf.format(size) + ")</th></tr>");
|
||||
htmlOut.print("<tr><td class='" + title + "'" + dir + ">");
|
||||
// <a href="#Atomic">categorization</a>
|
||||
textOut.println();
|
||||
textOut.println("# " + title);
|
||||
bf.setValueSource(script + " ; " + title);
|
||||
UnicodeSetIterator usi = new UnicodeSetIterator();
|
||||
if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) {
|
||||
usi.reset(unicodeset);
|
||||
while (usi.nextRange()) {
|
||||
if (usi.codepoint == usi.codepointEnd) {
|
||||
htmlOut.print(formatCode(UTF16
|
||||
.valueOf(usi.codepoint)));
|
||||
} else {
|
||||
htmlOut.print(formatCode(UTF16
|
||||
.valueOf(usi.codepoint))
|
||||
+ ".. "
|
||||
+ formatCode(UTF16
|
||||
.valueOf(usi.codepointEnd)));
|
||||
}
|
||||
}
|
||||
bf.showSetNames(textOut, unicodeset);
|
||||
} else {
|
||||
Set reordered = new TreeSet(comparator);
|
||||
usi.reset(unicodeset);
|
||||
while (usi.next()) {
|
||||
String x = usi.getString();
|
||||
boolean foo = reordered.add(x);
|
||||
if (!foo)
|
||||
throw new IllegalArgumentException("Collision with "
|
||||
+ Default.ucd().getCodeAndName(x));
|
||||
}
|
||||
for (Iterator it = reordered.iterator(); it.hasNext();) {
|
||||
Object key = it.next();
|
||||
htmlOut.print(formatCode((String)key));
|
||||
}
|
||||
bf.showSetNames(textOut, reordered);
|
||||
}
|
||||
htmlOut.println("</td></tr>");
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string
|
||||
* @return
|
||||
*/
|
||||
private String formatCode(String string) {
|
||||
int cat = ucd.getCategory(UTF16.charAt(string,0));
|
||||
String pad = "\u00A0", pad1 = pad;
|
||||
if (cat == Me || cat == Mn) {
|
||||
pad = "\u00A0\u00A0";
|
||||
pad1 = "\u00A0\u00A0\u25cc";
|
||||
}
|
||||
return "<span title='" + ucd.getCodeAndName(string) + "'>"
|
||||
+ pad1
|
||||
+ TransliteratorUtilities.toHTMLControl.transliterate(string)
|
||||
+ pad
|
||||
+ "</span> ";
|
||||
}
|
||||
}
|
@ -1,74 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks-old.java,v $
|
||||
* $Date: 2005/03/04 02:50:26 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import java.io.*;
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import java.util.*;
|
||||
|
||||
public class GenerateThaiBreaks {
|
||||
public static void main(String [] args) throws IOException {
|
||||
|
||||
BufferedReader br = new BufferedReader(
|
||||
new InputStreamReader(
|
||||
new FileInputStream("\\icu4j\\src\\data\\thai6.ucs"), "UnicodeLittle"));
|
||||
try {
|
||||
Main.setUCD();
|
||||
UnicodeSet ignorables = new UnicodeSet("[:M:]");
|
||||
ignorables.retain(0x0E00, 0x0E7F); // just Thai block
|
||||
ignorables.add(0x0E40, 0x0E44); // add logical order exception
|
||||
ignorables.add(0, ' '); // add controls
|
||||
ignorables.add('.');
|
||||
|
||||
UnicodeSet initials = new UnicodeSet();
|
||||
UnicodeSet finals = new UnicodeSet();
|
||||
UnicodeSet medials = new UnicodeSet();
|
||||
while (true) {
|
||||
String line = br.readLine();
|
||||
if (line == null) break;
|
||||
int end;
|
||||
|
||||
// find final consonant
|
||||
for (int i = line.length() - 1; ; --i) {
|
||||
char c = line.charAt(i);
|
||||
if (!ignorables.contains(c)) {
|
||||
finals.add(c);
|
||||
end = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
boolean haveFirst = false;
|
||||
for (int i = 0; i < end; ++i) {
|
||||
char c = line.charAt(i);
|
||||
if (ignorables.contains(c)) continue;
|
||||
if (!haveFirst) {
|
||||
initials.add(c);
|
||||
haveFirst = true;
|
||||
} else {
|
||||
medials.add(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
initials.removeAll(medials);
|
||||
finals.removeAll(medials);
|
||||
Utility.showSetNames("initials: ", initials, false, Main.ucd);
|
||||
Utility.showSetNames("finals: ", finals, false, Main.ucd);
|
||||
Utility.showSetNames("medials: ", medials, false, Main.ucd);
|
||||
} finally {
|
||||
br.close();
|
||||
}
|
||||
}
|
||||
}
|
@ -1,135 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java,v $
|
||||
* $Date: 2006/09/24 23:32:44 $
|
||||
* $Revision: 1.5 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import java.io.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
//import com.ibm.text.utility;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.text.utility.Utility;
|
||||
//import java.util.*;
|
||||
|
||||
public class GenerateThaiBreaks {
|
||||
public static void main(String [] args) throws IOException {
|
||||
|
||||
BufferedReader br = new BufferedReader(
|
||||
new InputStreamReader(
|
||||
new FileInputStream("c:\\icu4j\\src\\com\\ibm\\icu\\dev\\data\\thai6.ucs"), "UnicodeLittle"));
|
||||
PrintWriter out = null;
|
||||
|
||||
try {
|
||||
|
||||
UnicodeSet ignorables = new UnicodeSet();
|
||||
/* new UnicodeSet(0xE30, 0xE3A);
|
||||
ignorables.add(0x0E40, 0x0E44); // add logical order exception
|
||||
ignorables.add(0x0E47, 0x0E4E);
|
||||
*/
|
||||
ignorables.add(0, ' '); // add controls
|
||||
ignorables.add('.');
|
||||
|
||||
|
||||
UnicodeSet initials = new UnicodeSet();
|
||||
UnicodeSet finals = new UnicodeSet();
|
||||
UnicodeSet medials = new UnicodeSet();
|
||||
|
||||
char[] buffer = new char[100];
|
||||
|
||||
while (true) {
|
||||
String line = br.readLine();
|
||||
if (line == null) break;
|
||||
int end = 0;
|
||||
|
||||
// find 'real' characters
|
||||
for (int i = 0; i < line.length(); ++i) {
|
||||
char c = line.charAt(i);
|
||||
if (ignorables.contains(c)) continue;
|
||||
buffer[end++] = c;
|
||||
}
|
||||
String temp = new String(buffer, 0, end);
|
||||
|
||||
if (temp.length() <= 1) {
|
||||
initials.add(temp);
|
||||
finals.add(temp);
|
||||
continue;
|
||||
}
|
||||
|
||||
initials.add(temp.substring(0,1));
|
||||
//initials.add(temp.substring(0,2));
|
||||
finals.add(temp.substring(temp.length()-1));
|
||||
//finals.add(temp.substring(temp.length()-1));
|
||||
|
||||
for (int i = 1; i < temp.length() - 1; ++i) {
|
||||
//medials.add(temp.substring(i, i+2));
|
||||
medials.add(temp.substring(i, i+1));
|
||||
}
|
||||
//medials.add(temp.substring(temp.length() - 2, temp.length() - 1));
|
||||
}
|
||||
|
||||
System.out.println("initials size: " + initials.size());
|
||||
System.out.println("finals size: " + finals.size());
|
||||
System.out.println("medials size: " + medials.size());
|
||||
|
||||
//out = Utility.openPrintWriter("ThaiData.txt", Utility.UTF8_WINDOWS);
|
||||
// out.write('\uFEFF');
|
||||
|
||||
UnicodeSet marks = new UnicodeSet("[[\u0e00-\u0e7f]&[[:mn:][:me:]]]");
|
||||
finals.addAll(marks);
|
||||
|
||||
UnicodeSet all = new UnicodeSet(initials).addAll(medials).addAll(finals);
|
||||
|
||||
UnicodeSet missingThai = new UnicodeSet("[[\u0e00-\u0e7f]-[:Cn:]]").removeAll(all);
|
||||
|
||||
System.out.println("Never occur: " + missingThai.toPattern(true));
|
||||
Utility.showSetNames("", missingThai, true, Default.ucd());
|
||||
System.out.println();
|
||||
|
||||
UnicodeSet neverInitial = new UnicodeSet(all).removeAll(initials);
|
||||
UnicodeSet neverFinal = new UnicodeSet(all).removeAll(finals);
|
||||
|
||||
System.out.println("Never initial: " + neverInitial.toPattern(true));
|
||||
Utility.showSetNames("", neverInitial, true, Default.ucd());
|
||||
System.out.println();
|
||||
|
||||
System.out.println("Never final: " + neverFinal.toPattern(true));
|
||||
Utility.showSetNames("", neverFinal, true, Default.ucd());
|
||||
System.out.println();
|
||||
|
||||
initials.removeAll(medials);
|
||||
finals.removeAll(medials);
|
||||
|
||||
System.out.println("initials size: " + initials.size());
|
||||
System.out.println("finals size: " + finals.size());
|
||||
|
||||
System.out.println("Only Initials" + initials.toPattern(true));
|
||||
Utility.showSetNames("", initials, true, Default.ucd());
|
||||
System.out.println();
|
||||
|
||||
System.out.println("Only Finals" + finals.toPattern(true));
|
||||
Utility.showSetNames("", finals, true, Default.ucd());
|
||||
} finally {
|
||||
br.close();
|
||||
if (out != null) out.close();
|
||||
}
|
||||
}
|
||||
|
||||
static class MyBreaker implements Utility.Breaker {
|
||||
public String get(Object current, Object old) {
|
||||
if (old == null || UTF16.charAt(current.toString(), 0) == UTF16.charAt(old.toString(), 0)) {
|
||||
return current.toString() + "(" + Default.ucd().getCode(current.toString().substring(1)) + "))";
|
||||
} else {
|
||||
return "\r\n" + current + "(" + Default.ucd().getCode(current.toString()) + "))";
|
||||
}
|
||||
}
|
||||
public boolean filter(Object current) { return true; }
|
||||
}
|
||||
}
|
@ -1,177 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/IANANames.java,v $
|
||||
* $Date: 2002/10/05 01:28:58 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
|
||||
|
||||
import java.util.*;
|
||||
import java.text.NumberFormat;
|
||||
import java.io.*;
|
||||
|
||||
public class IANANames implements UCD_Types {
|
||||
private Map aliasToBase = new TreeMap();
|
||||
private Map aliasToComment = new TreeMap();
|
||||
private Map aliasToLine = new TreeMap();
|
||||
|
||||
public static void testSensitivity() throws IOException {
|
||||
IANANames iNames = new IANANames();
|
||||
Map m = new HashMap();
|
||||
Iterator it = iNames.getIterator();
|
||||
UnicodeSet removed = new UnicodeSet();
|
||||
int maxLength = 0;
|
||||
while (it.hasNext()) {
|
||||
String alias = (String) it.next();
|
||||
if (maxLength < alias.length()) maxLength = alias.length();
|
||||
if (alias.length() > 40) System.out.println("Name >40: " + alias);
|
||||
if (alias.indexOf(')') >= 0 || alias.indexOf('(') >= 0) System.out.println("Illegal tag: " + alias);
|
||||
String skeleton = removeNonAlphanumeric(alias, removed);
|
||||
String other = (String) m.get(skeleton);
|
||||
if (other != null) {
|
||||
String base = iNames.getBase(alias);
|
||||
String otherBase = iNames.getBase(other);
|
||||
if (!base.equals(otherBase)) {
|
||||
System.out.println("Collision between: " + alias + " (" + base + ") and "
|
||||
+ other + " (" + otherBase + ")");
|
||||
} else {
|
||||
System.out.println("Alias Variant: " + alias + " and " + other + " (" + base + ")");
|
||||
}
|
||||
} else {
|
||||
m.put(skeleton, alias);
|
||||
}
|
||||
}
|
||||
System.out.println("Max Length: " + maxLength);
|
||||
|
||||
System.out.println("Characters removed: ");
|
||||
UnicodeSetIterator usi = new UnicodeSetIterator(removed);
|
||||
while (usi.next()) {
|
||||
char c = (char) usi.codepoint; // safe, can't be supplementary
|
||||
System.out.println("0x" + usi.codepoint + "\t'" + c + "'\t" + UCharacter.getName(usi.codepoint));
|
||||
}
|
||||
}
|
||||
|
||||
public IANANames() throws IOException {
|
||||
BufferedReader in = Utility.openReadFile(BASE_DIR + "IANA\\character-sets.txt", Utility.LATIN1);
|
||||
try {
|
||||
boolean atStart = true;
|
||||
String lastName = "";
|
||||
int counter = 0;
|
||||
while (true) {
|
||||
String line = in.readLine();
|
||||
if (line == null) break;
|
||||
counter++;
|
||||
if (atStart) {
|
||||
if (line.startsWith("-------------")) atStart = false;
|
||||
continue;
|
||||
}
|
||||
if (line.trim().length() == 0) continue;
|
||||
|
||||
if (line.startsWith("Name:") || line.startsWith("Alias:")) {
|
||||
lastName = add(line, lastName, counter);
|
||||
} else if (line.startsWith("Source:") || line.startsWith("MIBenum:")
|
||||
|| line.startsWith(" ")) {
|
||||
continue;
|
||||
} else if (line.equals("REFERENCES")) {
|
||||
break;
|
||||
} else {
|
||||
System.out.println("Unknown Line: " + line);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
in.close();
|
||||
}
|
||||
}
|
||||
|
||||
private String add(String line, String baseName, int counter) {
|
||||
// extract the alias, doing a little validity check
|
||||
int pos = line.indexOf(": ");
|
||||
if (pos < 0) throw new IllegalArgumentException("Bad line: " + counter + " '" + line + "'");
|
||||
String alias = line.substring(pos+2).trim();
|
||||
|
||||
// get comment
|
||||
String comment = null;
|
||||
pos = alias.indexOf(' ');
|
||||
if (pos >= 0) {
|
||||
comment = alias.substring(pos).trim();
|
||||
alias = alias.substring(0, pos);
|
||||
}
|
||||
|
||||
// reset the baseName if we are a name
|
||||
if (line.startsWith("Name:")) {
|
||||
baseName = alias;
|
||||
}
|
||||
|
||||
// store
|
||||
if (!alias.equals("None")) {
|
||||
if (false) {
|
||||
if (baseName.equals(alias)) System.out.println();
|
||||
System.out.println("Adding " + alias + "\t=> " + baseName + (comment != null ? "\t(" + comment + ")" : ""));
|
||||
}
|
||||
// check if it is stored already
|
||||
String oldbaseName = (String) aliasToBase.get(alias);
|
||||
if (oldbaseName != null) {
|
||||
System.out.println("Duplicate alias (" + alias + ", " + oldbaseName + ", " + baseName + "): "
|
||||
+ counter + " '" + line + "'");
|
||||
}
|
||||
aliasToBase.put(alias, baseName);
|
||||
if (comment != null) aliasToComment.put(alias, comment);
|
||||
aliasToLine.put(alias, comment);
|
||||
}
|
||||
return baseName;
|
||||
}
|
||||
|
||||
public Iterator getIterator() {
|
||||
return aliasToBase.keySet().iterator();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the name for this alias, or "" if there is none
|
||||
*/
|
||||
public String getBase(String alias) {
|
||||
return (String) aliasToBase.get(alias);
|
||||
}
|
||||
|
||||
public static String removeNonAlphanumeric(String s, UnicodeSet removed) {
|
||||
s = s.toUpperCase(Locale.ENGLISH); // can't have Turkish!
|
||||
StringBuffer result = new StringBuffer();
|
||||
boolean removedZero = false;
|
||||
for (int i = 0; i < s.length(); ++i) {
|
||||
char c = s.charAt(i);
|
||||
if (c == '0') {
|
||||
char cLast = result.length() > 0 ? result.charAt(result.length() - 1) : '0';
|
||||
if ('0' <= cLast && cLast <= '9') {
|
||||
result.append(c);
|
||||
} else {
|
||||
if (!removed.contains(c)) {
|
||||
System.out.println("Removed '" + c + "' from " + s + " => " + result);
|
||||
removed.add(c);
|
||||
}
|
||||
removedZero = true;
|
||||
}
|
||||
} else if (('A' <= c && c <= 'Z') || ('0' <= c && c <= '9')) {
|
||||
result.append(c);
|
||||
} else {
|
||||
if (!removed.contains(c)) {
|
||||
System.out.println("Removed '" + c + "' from " + s + " => " + result);
|
||||
removed.add(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
//if (removedZero) System.out.println("Removed 0 from " + s + " => " + result);
|
||||
return result.toString();
|
||||
}
|
||||
}
|
@ -1,142 +0,0 @@
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.impl.PrettyPrinter;
|
||||
import com.ibm.icu.text.IDNA;
|
||||
import com.ibm.icu.text.StringPrepParseException;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.text.utility.Utility;
|
||||
|
||||
public class IDNTester {
|
||||
static StringBuffer inbuffer = new StringBuffer();
|
||||
static StringBuffer intermediate, outbuffer;
|
||||
static final int OK = 0, DELETED = 1, ILLEGAL = 2, REMAPPED = 3, IDNA_TYPE_LIMIT = 4;
|
||||
static UnicodeSet IDNInputOnly = new UnicodeSet();
|
||||
static UnicodeSet IDNOutput = new UnicodeSet();
|
||||
static boolean initialized = false;
|
||||
static UnicodeSet IDInputOnly32 = new UnicodeSet();
|
||||
static UnicodeSet IDOutput32 = new UnicodeSet();
|
||||
static UnicodeSet IDInputOnly50 = new UnicodeSet();
|
||||
static UnicodeSet IDOutput50 = new UnicodeSet();
|
||||
static PrettyPrinter pp = new PrettyPrinter();
|
||||
static PrintWriter pw;
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
initialize();
|
||||
pw = BagFormatter.openUTF8Writer(Utility.GEN_DIR, "idnCount.html");
|
||||
pw.println("<html><body>");
|
||||
showSet("IDN InputOnly: ", IDNInputOnly);
|
||||
showSet("IDN Output: ", IDNOutput);
|
||||
showSet("ID InputOnly, U3.2: ", IDInputOnly32);
|
||||
showSet("ID Output, U3.2: ", IDOutput32);
|
||||
|
||||
showSet("IDN Output - ID Output, U3.2: ", new UnicodeSet(IDNOutput).removeAll(IDOutput32));
|
||||
showSet("IDN Output & ID Output, U3.2: ", new UnicodeSet(IDNOutput).retainAll(IDOutput32));
|
||||
showSet("ID Output - IDN Output, U3.2: ", new UnicodeSet(IDOutput32).removeAll(IDNOutput));
|
||||
|
||||
showSet("ID InputOnly, U5.0: ", IDInputOnly50);
|
||||
showSet("ID Output, U5.0: ", IDOutput50);
|
||||
showSet("ID Output, U5.0 - U3.2: ", new UnicodeSet(IDOutput50).removeAll(IDOutput32));
|
||||
|
||||
pw.println("</body></html>");
|
||||
|
||||
pw.close();
|
||||
}
|
||||
|
||||
public static void showSet(String title, UnicodeSet set) {
|
||||
pw.println("<h2>" + title + set.size() + "</h2>" + "<p>" + pp.toPattern(set) + "</p>");
|
||||
pw.println();
|
||||
}
|
||||
|
||||
static UnicodeSet getIDNInput() {
|
||||
if (!initialized) initialize();
|
||||
return IDNInputOnly;
|
||||
}
|
||||
|
||||
static UnicodeSet getIDNOutput() {
|
||||
if (!initialized) initialize();
|
||||
return IDNInputOnly;
|
||||
}
|
||||
|
||||
private static void initialize() {
|
||||
UnicodeSet oddballs = new UnicodeSet("[\u034F \u180B-\u180D \uFE00-\uFE0F _]");
|
||||
UCD U32 = UCD.make("3.2.0");
|
||||
Normalizer nfkc32 = new Normalizer(Normalizer.NFKC, "3.2.0");
|
||||
UCDProperty xid32 = DerivedProperty.make(UCD.Mod_ID_Continue_NO_Cf,U32);
|
||||
UnicodeSet IDInput32 = xid32.getSet();
|
||||
IDInput32.add('-').removeAll(oddballs);
|
||||
|
||||
UCD U50 = UCD.make("5.0.0");
|
||||
Normalizer nfkc50 = new Normalizer(Normalizer.NFKC, "5.0.0");
|
||||
UCDProperty xid50 = DerivedProperty.make(UCD.Mod_ID_Continue_NO_Cf,U50);
|
||||
UnicodeSet IDInput50 = xid50.getSet();
|
||||
IDInput50.add('-').removeAll(oddballs);
|
||||
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if ((i & 0xFFF) == 0) {
|
||||
System.out.println(i);
|
||||
System.out.flush();
|
||||
}
|
||||
int type = getIDNAType(i);
|
||||
if (type == OK) {
|
||||
IDNOutput.add(i);
|
||||
} else if (type != ILLEGAL) {
|
||||
IDNInputOnly.add(i);
|
||||
}
|
||||
if (IDInput32.contains(i)) {
|
||||
splitSet(IDInputOnly32, IDOutput32, U32, nfkc32, i);
|
||||
}
|
||||
if (IDInput50.contains(i)) {
|
||||
splitSet(IDInputOnly50, IDOutput50, U50, nfkc50, i);
|
||||
}
|
||||
}
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
private static void splitSet(UnicodeSet inputOnlySet, UnicodeSet outputSet, UCD ucd, Normalizer nfkc, int i) {
|
||||
if (i < 0x7F) {
|
||||
outputSet.add(i);
|
||||
return;
|
||||
}
|
||||
String v = UTF16.valueOf(i);
|
||||
String s = ucd.getCase(i, UCD.FULL, UCD.FOLD);
|
||||
if (s.equals(v)) {
|
||||
s = nfkc.normalize(s);
|
||||
if (s.equals(v)) {
|
||||
s = ucd.getCase(s, UCD.FULL, UCD.FOLD);
|
||||
if (s.equals(v)) {
|
||||
outputSet.add(i);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
inputOnlySet.add(i);
|
||||
}
|
||||
|
||||
static public int getIDNAType(int cp) {
|
||||
if (cp == '-') return OK;
|
||||
inbuffer.setLength(0);
|
||||
UTF16.append(inbuffer, cp);
|
||||
try {
|
||||
intermediate = IDNA.convertToASCII(inbuffer,
|
||||
IDNA.DEFAULT); // USE_STD3_RULES
|
||||
if (intermediate.length() == 0)
|
||||
return DELETED;
|
||||
outbuffer = IDNA.convertToUnicode(intermediate,
|
||||
IDNA.USE_STD3_RULES);
|
||||
} catch (StringPrepParseException e) {
|
||||
return ILLEGAL;
|
||||
} catch (Exception e) {
|
||||
System.out.println("Failure at: " + Utility.hex(cp));
|
||||
return ILLEGAL;
|
||||
}
|
||||
if (!TestData.equals(inbuffer, outbuffer))
|
||||
return REMAPPED;
|
||||
return OK;
|
||||
}
|
||||
|
||||
}
|
@ -1,37 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/IntMap.java,v $
|
||||
* $Date: 2003/03/18 00:10:47 $
|
||||
* $Revision: 1.1 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
 * Map from {@code int} keys to objects that remembers the smallest and largest
 * key ever stored, so that lookups outside that range return immediately
 * without touching the hash table.
 */
public class IntMap {
    /** Smallest key stored so far; {@code Integer.MAX_VALUE} while empty. */
    int lowest = Integer.MAX_VALUE;
    /** Largest key stored so far; {@code Integer.MIN_VALUE} while empty. */
    int highest = Integer.MIN_VALUE;
    /** Backing store, keyed by the boxed int. */
    HashMap<Integer, Object> store = new HashMap<Integer, Object>();

    /**
     * Looks up a key.
     *
     * @param key the key to look up
     * @return the value stored under {@code key}, or {@code null} if there is none
     */
    public Object get(int key) {
        // Fast rejection: nothing outside [lowest, highest] has ever been stored.
        if (key < lowest || key > highest) return null;
        return store.get(Integer.valueOf(key));
    }

    /**
     * Stores a value, replacing any value previously stored under the same key.
     *
     * @param key the key
     * @param value the value to associate with {@code key}
     */
    public void put(int key, Object value) {
        if (key < lowest) lowest = key;
        if (key > highest) highest = key;
        // Integer.valueOf replaces the deprecated boxing constructor.
        store.put(Integer.valueOf(key), value);
    }

    /**
     * @return the number of distinct keys stored
     */
    public int size() {
        return store.size();
    }
}
|
||||
|
@ -1,92 +0,0 @@
|
||||
Show [[:block=tamil:] & [:age=3.2:] - [:age=3.1:]]
|
||||
Show [[:block=tamil:] & [:age=4.0:] - [:age=3.2:]]
|
||||
Show [[:block=tamil:] & [:age=4.1:] - [:age=4.0:]]
|
||||
Show [[:block=tamil:] & [:age=5.0:] - [:age=4.1:]]
|
||||
|
||||
Stop
|
||||
|
||||
Show [[:NFKCQuickCheck=No:] & [$gc:Lm]]
|
||||
|
||||
Stop
|
||||
|
||||
[$Name: $gc:Sk]
|
||||
[$Name: $gc:Lm]
|
||||
|
||||
Show [[$whitespace] - [$gc:zs]]
|
||||
Show [[$gc:zs] - [$whitespace]]
|
||||
|
||||
Let $letter = [$gc:Lu $gc:Ll $gc:Lt $gc:Lo $gc:Lm];
|
||||
Let $number = [$gc:Nd $gc:Nl $gc:No]
|
||||
Let $mark = [$gc:mn $gc:me $gc:mc]
|
||||
Let $LMN = [$letter $number $mark]
|
||||
Let $gcAllPunctuation = [$gc:Open_Punctuation $gc:Close_Punctuation $gc:Dash_Punctuation $gc:Connector_Punctuation $gc:Other_Punctuation $gc:Initial_Punctuation $gc:Final_Punctuation]
|
||||
Let $gcAllSymbols = [$gc:Currency_Symbol $gc:Modifier_Symbol $gc:Math_Symbol $gc:Other_Symbol]
|
||||
Let $nfc = [^$NFC_Quick_Check:No]
|
||||
|
||||
Show $nfc
|
||||
|
||||
Show [$alphabetic - [$mark $letter $number]]
|
||||
|
||||
|
||||
Let $oldCJK = [\u1100-\u11FF \u3040-\u30FF \u3130-\u318F \u31F0-\u31FF \u3400-\u4DBF \u4E00-\u9FFF \uAC00-\uD7AF \uF900-\uFAFF \uFF65-\uFFDC]
|
||||
|
||||
Show [$oldCJK & $gc:cn]
|
||||
|
||||
Let $fixedOld = [$oldCJK-$gc:cn]
|
||||
|
||||
|
||||
#List the non-alphabetic old items
|
||||
#Show [$oldCJK-$gc:cn-$alphabetic]
|
||||
|
||||
#Check for differences
|
||||
#Test $fixedOld = $trialNew
|
||||
|
||||
#ShowEach $mark
|
||||
|
||||
Let $uax29_outliers = [\u3031-\u3035 \u309B-\u309C \u30A0 \u30FC \uFF70 \uFF9E-\uFF9F]
|
||||
Let $other_outliers = [\u3099-\u309A \u3006 \u303C \u302A-\u302E \u302F \U000E0100-\U000E01EF]
|
||||
|
||||
# ==========================================
|
||||
|
||||
# Outliers from UAX29
|
||||
Show $uax29_outliers
|
||||
|
||||
# Additional outliers
|
||||
Show $other_outliers
|
||||
|
||||
# Take the 5 CJK scripts
|
||||
Let $trialScripts = [$script:hani $script:hang $script:kana $script:hira $script:bopo]
|
||||
|
||||
# Remove the non-LMN
|
||||
Let $trialNewBase = [$trialScripts & $LMN]
|
||||
|
||||
# Add the outliers
|
||||
Let $trialNew = [$trialNewBase $uax29_outliers $other_outliers]
|
||||
|
||||
# Show our result
|
||||
Show $trialNew
|
||||
|
||||
# As a double-check, show script characters we're tossing
|
||||
Show [$trialScripts - $trialNew]
|
||||
|
||||
# Compare snippets stuff
|
||||
Let $guessClose = [$lb:QU $lb:Close_Punctuation]
|
||||
Let $__closing_punc = ["')>\]`\}\u00AB\u00BB\u2018\u2019\u201C\u201D\u2039\u203A\u207E\u208E\u27E7\u27E9\u27EB\u2984\u2986\u2988\u298A\u298C\u298E\u2990\u2992\u2994\u2996\u2998\u29D9\u29DB\u29FD\u3009\u300B\u300D\u300F\u3011\u3015\u3017\u3019\u301B\u301E\u301F\uFD3F\uFE42\uFE44\uFE5A\uFE5C\uFF02\uFF07\uFF09\uFF3D\uFF5D\uFF63]
|
||||
|
||||
$guessClose = $__closing_punc
|
||||
|
||||
Let $guessClose = [$gc:pf $gc:pe $gc:pi]
|
||||
$guessClose = $__closing_punc
|
||||
|
||||
Let $guessTerm = [$sb:aterm $sb:sterm]
|
||||
$guessTerm = [? ? !?? ? ? ? ? ??? ? ? ? ? ? ? ? .?? <20> ? ? ? ? ? ? ? ?? ? ? ? ? ? ? ?]
|
||||
|
||||
Let $__issymotherr = [\u00A6\u00A7\u06FD\u06FE\u0F01-\u0F03\u0F13-\u0F17\u0F1A-\u0F1F\u0FBE-\u0FC5\u0FC7-\u0FCC\u2100\u2101\u2104-\u2106\u2108\u2109\u2117\u2118\u211E-\u2121\u2195-\u2199\u219C-\u219F\u21A1\u21A2\u21A4\u21A5\u21A7-\u21AD\u21AF-\u21CD\u21D0\u21D1\u21D5-\u21F3\u2300-\u2307\u230C-\u231F\u2322-\u2328\u232B-\u237B\u237D-\u239A\u2400-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u25B6\u25B8-\u25C0\u25C2-\u25F7\u2600-\u2613\u2619-\u266E\u2670\u2671\u2701-\u2704\u2706-\u2709\u270C-\u2727\u2729-\u274B\u274F-\u2752\u2758-\u275E\u2761-\u2794\u2798-\u27AF\u27B1-\u27BE\u2800-\u28FF\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFB\u3012\u3013\u3036\u3037\u303E\u303F\u3190\u3191\u3196-\u319F\u3200-\u321C\u322A-\u3243\u3260-\u327B\u328A-\u32B0\u32C0-\u32CB\u32D0-\u32FE\u3300-\u3376\u337B-\u33DD\u33E0-\u33FE\uA490-\uA4A1\uA4A4-\uA4B3\uA4B5-\uA4C0\uA4C2-\uA4C4\uFFED\uFFEE\uFFFC\uFFFD]
|
||||
Let $__issymothers = [\u00B6\u0482\u06E9\u09FA\u0B70\u0F34\u0F36\u0F38\u0FCF\u2114\u2123\u2125\u2127\u2129\u212E\u2132\u213A\u21D3\u220E\u2617\u274D\u2756\u3004\u3020\u327F\uA4C6\uFFE4\uFFE8]
|
||||
|
||||
Let $symOther = [$__issymotherr $__issymothers]
|
||||
|
||||
$symOther = $gcAllSymbols
|
||||
|
||||
|
||||
[$symOther & $nfc] = [$gcAllSymbols & $nfc]
|
@ -1,18 +0,0 @@
|
||||
package com.ibm.text.UCD;
|
||||
/**
 * Placeholder for a tool that would list the characters that are "complete"
 * for a given normalization form. Nothing is implemented yet; the intended
 * criteria are that a character is
 * <ol>
 *   <li>not decomposed by the normalization form, and</li>
 *   <li>of combining class 0,</li>
 * </ol>
 * and, for the composing forms (NFC or NFKC), additionally that it
 * <ol start="3">
 *   <li>can never compose with a previous character,</li>
 *   <li>can never compose with a following character, and</li>
 *   <li>can never change if another character is added. Example: a-breve might
 *       satisfy the first four conditions, but if you add an ogonek it changes
 *       to a-ogonek + breve.</li>
 * </ol>
 */
public class ListNFComplete {

    /**
     * Entry point; currently does nothing.
     *
     * @param args ignored
     */
    public static void main (String[] args) {
        // Not implemented. The only sketch so far was constructing an NFD normalizer:
        //Normalizer nfd = new Normalizer(Normalizer.NFD);
    }
}
|
@ -1,327 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MLStreamWriter.java,v $
|
||||
* $Date: 2003/04/25 01:39:15 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
import com.ibm.text.UCD.*;
|
||||
|
||||
public class MLStreamWriter extends Writer {
|
||||
public static final String copyright =
|
||||
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
|
||||
|
||||
public MLStreamWriter (PrintWriter output, boolean HTML) {
|
||||
out = output;
|
||||
isHTML = HTML;
|
||||
}
|
||||
|
||||
public MLStreamWriter (PrintWriter output) {
|
||||
this(output,true);
|
||||
}
|
||||
|
||||
public MLStreamWriter el(String elementName) {
|
||||
closeIfOpen();
|
||||
print('<', AFTER);
|
||||
print(elementName, elementName.equals("!--") ? AFTER+FORCE : AFTER);
|
||||
stack.add(elementName);
|
||||
inElement = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
private MLStreamWriter closeIfOpen() {
|
||||
if (inElement && !"!--".equals(stack.get(stack.size()-1))) {
|
||||
print('>',BEFORE+FORCE);
|
||||
}
|
||||
inElement = false;
|
||||
return this;
|
||||
}
|
||||
|
||||
final public MLStreamWriter cel(String elementName) {
|
||||
return cl().tx(elementName);
|
||||
}
|
||||
|
||||
public MLStreamWriter at(String attributeName, String attributeValue) {
|
||||
if (!inElement) {
|
||||
throw new IllegalArgumentException("attribute \"" + attributeName + "\" not in element");
|
||||
}
|
||||
print(' ', BOTH);
|
||||
print(attributeName, AFTER);
|
||||
print('=', AFTER);
|
||||
print('"');
|
||||
print(quoted(attributeValue));
|
||||
print('"', AFTER);
|
||||
return this;
|
||||
}
|
||||
|
||||
public MLStreamWriter at(String attributeName, int value) {
|
||||
return at(attributeName, String.valueOf(value));
|
||||
}
|
||||
|
||||
public MLStreamWriter CR() {
|
||||
closeIfOpen();
|
||||
out.println();
|
||||
return this;
|
||||
}
|
||||
|
||||
/*public MLStreamWriter comment() {
|
||||
closeIfOpen();
|
||||
print("<!--");
|
||||
CR();
|
||||
return this;
|
||||
}
|
||||
|
||||
public MLStreamWriter endComment() {
|
||||
print("-->");
|
||||
return this;
|
||||
}
|
||||
*/
|
||||
|
||||
public MLStreamWriter tx(String text) {
|
||||
closeIfOpen();
|
||||
print(quoted(text));
|
||||
return this;
|
||||
}
|
||||
|
||||
final public MLStreamWriter tx(char text) {
|
||||
return tx(String.valueOf(text));
|
||||
}
|
||||
|
||||
final public MLStreamWriter tx(int text) {
|
||||
return tx(String.valueOf(text));
|
||||
}
|
||||
|
||||
final public MLStreamWriter tx16(String text) {
|
||||
return tx(hex(text));
|
||||
}
|
||||
|
||||
final public MLStreamWriter tx16(char text) {
|
||||
return tx(hex(text));
|
||||
}
|
||||
|
||||
final public MLStreamWriter tx16(int text) {
|
||||
return tx(hex(text));
|
||||
}
|
||||
|
||||
public MLStreamWriter cl(String closingElement) {
|
||||
closeIfOpen();
|
||||
String lastElement = (String)stack.remove(stack.size()-1);
|
||||
if (closingElement != null && !closingElement.equals(lastElement)) {
|
||||
throw new IllegalArgumentException("mismatch when closing \"" + closingElement
|
||||
+ "\", current active element is \"" + lastElement + "\"");
|
||||
}
|
||||
if (lastElement.equals("!--")) {// hack for XML/HTML
|
||||
print("-->",BEFORE+FORCE);
|
||||
} else {
|
||||
print("</");
|
||||
print(lastElement);
|
||||
print('>',BEFORE);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
final public MLStreamWriter cl() {
|
||||
return cl(null);
|
||||
}
|
||||
|
||||
public MLStreamWriter closeAllElements() {
|
||||
for (int i = stack.size()-1; i >= 0; --i) {
|
||||
cl(null);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
// stream stuff
|
||||
|
||||
public void write(char[] source, int start, int len) {
|
||||
closeIfOpen();
|
||||
// later make more efficient!!
|
||||
out.print(quoted(new String(source, start, len)));
|
||||
}
|
||||
|
||||
public void close() {
|
||||
closeAllElements();
|
||||
out.close();
|
||||
}
|
||||
|
||||
public void flush() {
|
||||
out.flush();
|
||||
}
|
||||
|
||||
// Utility methods
|
||||
|
||||
final public MLStreamWriter cell(String ch, String type, String codepoint, String cat) {
|
||||
if (codepoint == null) codepoint = ch;
|
||||
int dotpos = type.indexOf('.');
|
||||
if (dotpos == -1) el(type);
|
||||
else {
|
||||
el(type.substring(0,dotpos));
|
||||
at("class",type.substring(dotpos+1));
|
||||
}
|
||||
/*
|
||||
if (color == -1) {
|
||||
el("th");
|
||||
} else {
|
||||
el("td");
|
||||
if (color != 0xFFFFFF) {
|
||||
at("bgcolor","#"+hex(color,6));
|
||||
}
|
||||
}
|
||||
*/
|
||||
tx(ch).el("br").el("tt").tx16(codepoint);
|
||||
if (cat != null) tx(" ").tx(cat);
|
||||
cl().cl().cl();
|
||||
return this;
|
||||
}
|
||||
|
||||
final public MLStreamWriter cell(String ch) {
|
||||
return cell(ch,"td",null,null);
|
||||
}
|
||||
|
||||
final public MLStreamWriter cell(String ch, String type) {
|
||||
return cell(ch,type,null,null);
|
||||
}
|
||||
|
||||
final public MLStreamWriter cell(String ch, String type, String codepoint) {
|
||||
return cell(ch,type,codepoint,null);
|
||||
}
|
||||
|
||||
static public String hex(int i, int width) {
|
||||
String result = Long.toString(i & 0xFFFFFFFFL, 16).toUpperCase();
|
||||
return "00000000".substring(result.length(),width) + result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Supplies a zero-padded hex representation of an integer (without 0x)
|
||||
*/
|
||||
static public String hex(int i) {
|
||||
return hex(i,8);
|
||||
}
|
||||
|
||||
/**
|
||||
* Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u)
|
||||
*/
|
||||
static public String hex(char i) {
|
||||
return hex(i,4);
|
||||
}
|
||||
|
||||
/**
|
||||
* Supplies a zero-padded hex representation of a Unicode String (without 0x, \\u)
|
||||
*@param sep can be used to give a sequence, e.g. hex("ab", ",") gives "0061,0062"
|
||||
*/
|
||||
static public String hex(String s, String sep) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
for (int i = 0; i < s.length(); ++i) {
|
||||
if (i != 0) result.append(sep);
|
||||
result.append(hex(s.charAt(i)));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
static public String hex(String s) {
|
||||
return hex(s," ");
|
||||
}
|
||||
|
||||
|
||||
public void author(String name, String url) {
|
||||
el("font").at("size","-3").tx("[").el("a").at("href",url).tx(name).cl("a").el("script").el("!--");
|
||||
tx("document.write(', ', document.lastModified);");
|
||||
cl("!--").cl("script").tx("]").cl("font");
|
||||
}
|
||||
|
||||
// ================== PRIVATES =================
|
||||
|
||||
PrintWriter out;
|
||||
boolean isHTML;
|
||||
ArrayList stack = new ArrayList();
|
||||
boolean inElement = false;
|
||||
Normalizer formC = new Normalizer(Normalizer.NFC, "");
|
||||
int len;
|
||||
int maxLineLength = 60;
|
||||
// later, add better line end management, indenting
|
||||
|
||||
static final int NONE=0, BEFORE=1, AFTER=2, BOTH=3, FORCE = 4; // chosen for bits!!
|
||||
|
||||
final void print(String s) {
|
||||
print(s,NONE);
|
||||
}
|
||||
|
||||
final void print(char c) {
|
||||
print(c,NONE);
|
||||
}
|
||||
|
||||
final void print(String s, int doesBreak) {
|
||||
if ((doesBreak & BEFORE) != 0) tryBreak(s.length(), doesBreak);
|
||||
len += s.length();
|
||||
out.print(s);
|
||||
if ((doesBreak & AFTER) != 0) tryBreak(0, doesBreak);
|
||||
}
|
||||
|
||||
final void print(char c, int doesBreak) {
|
||||
if ((doesBreak & BEFORE) != 0) tryBreak(1, doesBreak);
|
||||
++len;
|
||||
out.print(c);
|
||||
if ((doesBreak & AFTER) != 0) tryBreak(0, doesBreak);
|
||||
}
|
||||
|
||||
void tryBreak(int toAdd, int doesBreak) {
|
||||
if ((doesBreak & FORCE) != 0 || (len + toAdd) > maxLineLength) {
|
||||
out.println();
|
||||
len = stack.size();
|
||||
for (int i = 0; i < len; ++i) out.print(' ');
|
||||
}
|
||||
}
|
||||
|
||||
public String quoted(String source) {
|
||||
source = formC.normalize(source);
|
||||
StringBuffer result = new StringBuffer();
|
||||
for (int i = 0; i < source.length(); ++i) {
|
||||
char ch = source.charAt(i);
|
||||
switch(ch) {
|
||||
case '\'':
|
||||
if (!isHTML) {
|
||||
result.append("'");
|
||||
} else {
|
||||
result.append(ch);
|
||||
}
|
||||
break;
|
||||
case '\"':
|
||||
result.append(""");
|
||||
break;
|
||||
case '<':
|
||||
result.append("<");
|
||||
break;
|
||||
case '&':
|
||||
result.append("&");
|
||||
break;
|
||||
case '>':
|
||||
result.append(">");
|
||||
break;
|
||||
case '\n': case '\r': case '\t':
|
||||
result.append(ch);
|
||||
break;
|
||||
default: if (ch < ' ' // do surrogates later
|
||||
|| ch >= '\u007F' && ch <= '\u009F'
|
||||
|| ch >= '\uD800' && ch <= '\uDFFF'
|
||||
|| ch >= '\uFFFE') {
|
||||
result.append('\uFFFD');
|
||||
} else {
|
||||
result.append(ch);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
}
|
@ -1,350 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
|
||||
* $Date: 2006/04/05 22:12:44 $
|
||||
* $Revision: 1.37 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import java.util.Date;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
public final class Main implements UCD_Types {
|
||||
|
||||
static final String classPrefix = "com.ibm.text.UCD.";
|
||||
|
||||
static final String[] CORE_FILES = {
|
||||
"CaseFolding",
|
||||
"CompositionExclusions",
|
||||
"DerivedCoreProperties",
|
||||
"DerivedNormalizationProps",
|
||||
"NormalizationTest",
|
||||
"PropertyAliases",
|
||||
"PropList",
|
||||
"Scripts",
|
||||
"SpecialCasing",
|
||||
"HangulSyllableType",
|
||||
"DerivedAge",
|
||||
"StandardizedVariants",
|
||||
"HangulSyllableType",
|
||||
//"OtherDerivedProperties",
|
||||
};
|
||||
|
||||
static final String[] EXTRACTED_FILES = {
|
||||
"DerivedBidiClass",
|
||||
"DerivedBinaryProperties",
|
||||
"DerivedCombiningClass",
|
||||
"DerivedDecompositionType",
|
||||
"DerivedEastAsianWidth",
|
||||
"DerivedGeneralCategory",
|
||||
"DerivedJoiningGroup",
|
||||
"DerivedJoiningType",
|
||||
"DerivedLineBreak",
|
||||
"DerivedNumericType",
|
||||
"DerivedNumericValues",
|
||||
};
|
||||
|
||||
static final String[] ALL_FILES = {
|
||||
"Core", "Extracted"
|
||||
};
|
||||
|
||||
public static void main (String[] args) throws Exception {
|
||||
System.out.println("*** Start *** " + Default.getDate());
|
||||
|
||||
try {
|
||||
for (int i = 0; i < args.length; ++i) {
|
||||
|
||||
long mask = 0;
|
||||
|
||||
String arg = args[i];
|
||||
if (arg.charAt(0) == '#') return; // skip rest of line
|
||||
|
||||
Utility.fixDot();
|
||||
System.out.println();
|
||||
System.out.println("** Argument: " + args[i] + " ** " + Default.getDate());
|
||||
|
||||
// Expand string arguments
|
||||
|
||||
if (arg.equalsIgnoreCase("ALL")) {
|
||||
args = Utility.append(ALL_FILES, Utility.subarray(args, i+1));
|
||||
i = -1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (arg.equalsIgnoreCase("CORE")) {
|
||||
args = Utility.append(CORE_FILES, Utility.subarray(args, i+1));
|
||||
i = -1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (arg.equalsIgnoreCase("EXTRACTED")) {
|
||||
args = Utility.append(EXTRACTED_FILES, Utility.subarray(args, i+1));
|
||||
i = -1;
|
||||
continue;
|
||||
}
|
||||
|
||||
// make sure the UCD is set up
|
||||
|
||||
if (arg.equalsIgnoreCase("version")) {
|
||||
Default.setUCD(args[++i]);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Now handle other options
|
||||
|
||||
if (arg.equalsIgnoreCase("verify")) {
|
||||
VerifyUCD.verify();
|
||||
VerifyUCD.checkCanonicalProperties();
|
||||
VerifyUCD.CheckCaseFold();
|
||||
VerifyUCD.checkAgainstUInfo();
|
||||
|
||||
} else if (arg.equalsIgnoreCase("build")) ConvertUCD.main(new String[]{Default.ucdVersion()});
|
||||
else if (arg.equalsIgnoreCase("statistics")) VerifyUCD.statistics();
|
||||
else if (arg.equalsIgnoreCase("NFSkippable")) NFSkippable.main(null);
|
||||
else if (arg.equalsIgnoreCase("diffIgnorable")) VerifyUCD.diffIgnorable();
|
||||
else if (arg.equalsIgnoreCase("generateXML")) VerifyUCD.generateXML();
|
||||
else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed();
|
||||
else if (arg.equalsIgnoreCase("onetime")) VerifyUCD.oneTime();
|
||||
else if (arg.equalsIgnoreCase("verifyNormalizationStability")) VerifyUCD.verifyNormalizationStability();
|
||||
|
||||
else if (arg.equalsIgnoreCase("definitionTransliterator")) GenerateHanTransliterator.main(0);
|
||||
else if (arg.equalsIgnoreCase("romajiTransliterator")) GenerateHanTransliterator.main(1);
|
||||
else if (arg.equalsIgnoreCase("pinYinTransliterator")) GenerateHanTransliterator.main(2);
|
||||
else if (arg.equalsIgnoreCase("hanproperties")) GenerateHanTransliterator.readUnihan();
|
||||
|
||||
else if (arg.equalsIgnoreCase("fixChineseOverrides")) GenerateHanTransliterator.fixChineseOverrides();
|
||||
|
||||
|
||||
|
||||
else if (arg.equalsIgnoreCase("compareBlueberry")) VerifyUCD.compareBlueberry();
|
||||
|
||||
else if (arg.equalsIgnoreCase("testenum")) SampleEnum.test();
|
||||
|
||||
else if (arg.equalsIgnoreCase("quicktest")) QuickTest.test();
|
||||
else if (arg.equalsIgnoreCase("TernaryStore")) TernaryStore.test();
|
||||
|
||||
else if (arg.equalsIgnoreCase("checkBIDI")) VerifyUCD.checkBIDI();
|
||||
else if (arg.equalsIgnoreCase("Buildnames")) BuildNames.main(null);
|
||||
else if (arg.equalsIgnoreCase("TestNormalization")) TestNormalization.main(null);
|
||||
|
||||
|
||||
else if (arg.equalsIgnoreCase("binary")) FastBinarySearch.test();
|
||||
|
||||
else if (arg.equalsIgnoreCase("GenerateCaseTest")) GenerateCaseTest.main(null);
|
||||
else if (arg.equalsIgnoreCase("checkDecompFolding")) VerifyUCD.checkDecompFolding();
|
||||
|
||||
else if (arg.equalsIgnoreCase("breaktest")) GenerateBreakTest.main(null);
|
||||
else if (arg.equalsIgnoreCase("checkcollator")) CheckCollator.main(null);
|
||||
|
||||
//else if (arg.equalsIgnoreCase("genSplit")) GenerateData.genSplit();
|
||||
else if (arg.equalsIgnoreCase("iana")) IANANames.testSensitivity();
|
||||
|
||||
else if (arg.equalsIgnoreCase("testDerivedProperties")) DerivedProperty.test();
|
||||
else if (arg.equalsIgnoreCase("checkCase")) VerifyUCD.checkCase();
|
||||
else if (arg.equalsIgnoreCase("checkCase3")) VerifyUCD.checkCase3();
|
||||
else if (arg.equalsIgnoreCase("checkCaseLong")) VerifyUCD.checkCase2(true);
|
||||
else if (arg.equalsIgnoreCase("checkCaseShort")) VerifyUCD.checkCase2(false);
|
||||
else if (arg.equalsIgnoreCase("checkCanonicalProperties")) VerifyUCD.checkCanonicalProperties();
|
||||
else if (arg.equalsIgnoreCase("CheckCaseFold")) VerifyUCD.CheckCaseFold();
|
||||
else if (arg.equalsIgnoreCase("genIDN")) VerifyUCD.genIDN();
|
||||
else if (arg.equalsIgnoreCase("VerifyIDN")) VerifyUCD.VerifyIDN();
|
||||
else if (arg.equalsIgnoreCase("NFTest")) VerifyUCD.NFTest();
|
||||
else if (arg.equalsIgnoreCase("test1")) VerifyUCD.test1();
|
||||
//else if (arg.equalsIgnoreCase("TrailingZeros")) GenerateData.genTrailingZeros();
|
||||
else if (arg.equalsIgnoreCase("GenerateThaiBreaks")) GenerateThaiBreaks.main(null);
|
||||
|
||||
else if (arg.equalsIgnoreCase("TestData")) TestData.main(new String[]{args[++i]});
|
||||
else if (arg.equalsIgnoreCase("MakeUnicodeFiles")) MakeUnicodeFiles.main(new String[]{});
|
||||
|
||||
//else if (arg.equalsIgnoreCase("checkAgainstUInfo")) checkAgainstUInfo();
|
||||
else if (arg.equalsIgnoreCase("checkScripts")) VerifyUCD.checkScripts();
|
||||
else if (arg.equalsIgnoreCase("IdentifierTest")) VerifyUCD.IdentifierTest();
|
||||
else if (arg.equalsIgnoreCase("BuildNames")) BuildNames.main(null);
|
||||
else if (arg.equalsIgnoreCase("JavascriptProperties")) WriteJavaScriptInfo.assigned();
|
||||
else if (arg.equalsIgnoreCase("TestDirectoryIterator")) DirectoryIterator.test();
|
||||
//else if (arg.equalsIgnoreCase("checkIdentical")) GenerateData.handleIdentical();
|
||||
else if (arg.equalsIgnoreCase("testnameuniqueness")) TestNameUniqueness.checkNameList();
|
||||
//else if (arg.equalsIgnoreCase("checkDifferences")) GenerateData.checkDifferences("3.2.0");
|
||||
|
||||
else if (arg.equalsIgnoreCase("Compare14652")) Compare14652.main(null);
|
||||
|
||||
|
||||
//else if (arg.equalsIgnoreCase("NormalizationCharts")) ChartGenerator.writeNormalizationCharts();
|
||||
|
||||
|
||||
/*else if (arg.equalsIgnoreCase("writeNormalizerTestSuite"))
|
||||
GenerateData.writeNormalizerTestSuite("NormalizationTest-3.1.1d1.txt");
|
||||
*/
|
||||
// EXTRACTED PROPERTIES
|
||||
/*
|
||||
else if (arg.equalsIgnoreCase("DerivedBidiClass")) {
|
||||
GenerateData.generateVerticalSlice(BIDI_CLASS, BIDI_CLASS+NEXT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/extracted/", "DerivedBidiClass");
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedBinaryProperties")) {
|
||||
GenerateData.generateVerticalSlice(BINARY_PROPERTIES, BINARY_PROPERTIES+1, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/extracted/", "DerivedBinaryProperties" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedCombiningClass")) {
|
||||
GenerateData.generateVerticalSlice(COMBINING_CLASS, COMBINING_CLASS+NEXT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/extracted/", "DerivedCombiningClass" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedDecompositionType")) {
|
||||
GenerateData.generateVerticalSlice(DECOMPOSITION_TYPE, DECOMPOSITION_TYPE+NEXT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/extracted/", "DerivedDecompositionType" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedEastAsianWidth")) {
|
||||
GenerateData.generateVerticalSlice(EAST_ASIAN_WIDTH, EAST_ASIAN_WIDTH+NEXT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/extracted/", "DerivedEastAsianWidth" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedGeneralCategory")) {
|
||||
GenerateData.generateVerticalSlice(CATEGORY, CATEGORY+NEXT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/extracted/", "DerivedGeneralCategory" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedJoiningGroup")) {
|
||||
GenerateData.generateVerticalSlice(JOINING_GROUP, JOINING_GROUP+NEXT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/extracted/", "DerivedJoiningGroup" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedJoiningType")) {
|
||||
GenerateData.generateVerticalSlice(JOINING_TYPE, JOINING_TYPE+NEXT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/extracted/", "DerivedJoiningType" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedLineBreak")) {
|
||||
GenerateData.generateVerticalSlice(LINE_BREAK, LINE_BREAK+NEXT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/extracted/", "DerivedLineBreak" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedNumericType")) {
|
||||
GenerateData.generateVerticalSlice(NUMERIC_TYPE, NUMERIC_TYPE+NEXT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/extracted/", "DerivedNumericType" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("HangulSyllableType")) {
|
||||
GenerateData.generateVerticalSlice(HANGUL_SYLLABLE_TYPE,HANGUL_SYLLABLE_TYPE+NEXT_ENUM, GenerateData.HEADER_EXTEND,
|
||||
"DerivedData/", "HangulSyllableType" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedNumericValues")) {
|
||||
GenerateData.generateVerticalSlice(LIMIT_ENUM, LIMIT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/extracted/", "DerivedNumericValues" );
|
||||
}
|
||||
*/
|
||||
else if (arg.equalsIgnoreCase("StandardizedVariants")) {
|
||||
GenerateStandardizedVariants.generate();
|
||||
|
||||
// OTHER STANDARD PROPERTIES
|
||||
|
||||
} else if (arg.equalsIgnoreCase("CaseFolding")) {
|
||||
GenerateCaseFolding.makeCaseFold(true);
|
||||
GenerateCaseFolding.makeCaseFold(false);
|
||||
|
||||
} else if (arg.equalsIgnoreCase("SpecialCasing")) {
|
||||
GenerateCaseFolding.generateSpecialCasing(true);
|
||||
GenerateCaseFolding.generateSpecialCasing(false);
|
||||
|
||||
/* } else if (arg.equalsIgnoreCase("CompositionExclusions")) {
|
||||
GenerateData.generateCompExclusions();
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedAge")) {
|
||||
GenerateData.generateAge("DerivedData/", "DerivedAge");
|
||||
|
||||
} else if (arg.equalsIgnoreCase("backwardsCompat")) {
|
||||
GenerateData.backwardsCompat("DerivedData/extracted/", "Compatibility_ID_START",
|
||||
new int[] {ID_Start, ID_Continue_NO_Cf, Mod_ID_Start, Mod_ID_Continue_NO_Cf});
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedCoreProperties")) {
|
||||
GenerateData.generateDerived(DERIVED_CORE, true, GenerateData.HEADER_DERIVED, "DerivedData/", "DerivedCoreProperties");
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedNormalizationProps")) {
|
||||
GenerateData.generateDerived(DERIVED_NORMALIZATION, true, GenerateData.HEADER_DERIVED, "DerivedData/",
|
||||
"DerivedNormalizationProps" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("NormalizationTest")) {
|
||||
GenerateData.writeNormalizerTestSuite("DerivedData/", "NormalizationTest");
|
||||
|
||||
} else if (arg.equalsIgnoreCase("PropertyAliases")) {
|
||||
GenerateData.generatePropertyAliases();
|
||||
|
||||
} else if (arg.equalsIgnoreCase("PropList")) {
|
||||
GenerateData.generateVerticalSlice(BINARY_PROPERTIES + White_space, BINARY_PROPERTIES + NEXT_ENUM,
|
||||
GenerateData.HEADER_EXTEND, "DerivedData/", "PropList");
|
||||
|
||||
} else if (arg.equalsIgnoreCase("Scripts")) {
|
||||
GenerateData.generateVerticalSlice(SCRIPT+1, SCRIPT + NEXT_ENUM,
|
||||
GenerateData.HEADER_SCRIPTS, "DerivedData/", "Scripts");
|
||||
// OTHER TESTING
|
||||
|
||||
} else if (arg.equalsIgnoreCase("OtherDerivedProperties")) {
|
||||
//mask = Utility.setBits(0, NFC_Leading, NFC_Resulting);
|
||||
GenerateData.generateDerived((byte)(ALL & ~DERIVED_CORE & ~DERIVED_NORMALIZATION), false, GenerateData.HEADER_DERIVED, "OtherData/", "OtherDerivedProperties");
|
||||
|
||||
} else if (arg.equalsIgnoreCase("AllBinary")) {
|
||||
GenerateData.generateVerticalSlice(BINARY_PROPERTIES, BINARY_PROPERTIES + NEXT_ENUM,
|
||||
GenerateData.HEADER_EXTEND, "OtherDerived/", "AllBinary");
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedGeneralCategoryTEST")) {
|
||||
GenerateData.generateVerticalSlice(CATEGORY+29, CATEGORY+32, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/", "DerivedGeneralCategory" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("listDifferences")) {
|
||||
CompareProperties.listDifferences();
|
||||
|
||||
} else if (arg.equalsIgnoreCase("partition")) {
|
||||
CompareProperties.partition();
|
||||
|
||||
} else if (arg.equalsIgnoreCase("propertyStatistics")) {
|
||||
CompareProperties.statistics();
|
||||
|
||||
} else if (arg.equalsIgnoreCase("listAccents")) {
|
||||
GenerateData.listCombiningAccents();
|
||||
|
||||
} else if (arg.equalsIgnoreCase("listGreekVowels")) {
|
||||
GenerateData.listGreekVowels();
|
||||
|
||||
} else if (arg.equalsIgnoreCase("listKatakana")) {
|
||||
GenerateData.listKatakana();
|
||||
*/
|
||||
/*
|
||||
} else if (arg.equalsIgnoreCase("DerivedFullNormalization")) {
|
||||
mask = Utility.setBits(0, DerivedProperty.GenNFD, DerivedProperty.GenNFKC);
|
||||
GenerateData.generateDerived(mask, GenerateData.HEADER_DERIVED, "DerivedData/", "DerivedFullNormalization" );
|
||||
} else if (arg.equalsIgnoreCase("caseignorable")) {
|
||||
mask = Utility.setBits(0, DerivedProperty.Other_Case_Ignorable, DerivedProperty.Type_i);
|
||||
GenerateData.generateDerived(mask, GenerateData.HEADER_DERIVED, "OtherData/", "CaseIgnorable" );
|
||||
} else if (arg.equalsIgnoreCase("nfunsafestart")) {
|
||||
mask = Utility.setBits(0, NFD_UnsafeStart, NFKC_UnsafeStart);
|
||||
GenerateData.generateDerived(mask, GenerateData.HEADER_DERIVED, "OtherData/", "NFUnsafeStart");
|
||||
*/
|
||||
|
||||
} else {
|
||||
CallArgs.call(new String[]{arg}, classPrefix);
|
||||
}
|
||||
|
||||
|
||||
//checkHoffman("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F");
|
||||
//checkHoffman("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD");
|
||||
|
||||
|
||||
//GenerateData.generateDerived(Utility.setBits(0, DerivedProperty.PropMath, DerivedProperty.Mod_ID_Continue_NO_Cf),
|
||||
// GenerateData.HEADER_DERIVED, "DerivedData/", "DerivedPropData2" );
|
||||
//GenerateData.generateVerticalSlice(SCRIPT, SCRIPT+1, "ScriptCommon" );
|
||||
//listStrings("LowerCase" , 0,0);
|
||||
//GenerateData.generateVerticalSlice(0, LIMIT_ENUM, SKIP_SPECIAL, PROPLIST1, "DerivedData/", "DerivedPropData1" );
|
||||
|
||||
// AGE stuff
|
||||
//UCD ucd = UCD.make();
|
||||
//System.out.println(ucd.getAgeID(0x61));
|
||||
//System.out.println(ucd.getAgeID(0x2FA1D));
|
||||
|
||||
//
|
||||
}
|
||||
} finally {
|
||||
System.out.println("*** Done *** " + Default.getDate());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -1,506 +0,0 @@
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.BitSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.TransliteratorUtilities;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
import com.ibm.icu.dev.test.util.UnicodePropertySource;
|
||||
import com.ibm.icu.text.Collator;
|
||||
import com.ibm.icu.text.Replaceable;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import com.ibm.text.utility.Utility;
|
||||
import com.ibm.text.utility.Utility.Encoding;
|
||||
|
||||
public class MakeNamesChart {
|
||||
|
||||
static int lastCodePoint = -1;
|
||||
static boolean lastCodePointIsOld = false;
|
||||
static int lastDecompType = UCD.NONE;
|
||||
|
||||
static final String chartPrefix = "c_";
|
||||
static final String namePrefix = "n_";
|
||||
|
||||
static UnicodeSet skipChars;// = new UnicodeSet("[[:gc=cn:]-[:noncharactercodepoint:]]");
|
||||
static UnicodeSet rtl;// = new UnicodeSet("[[:bidiclass=r:][:bidiclass=al:]]");
|
||||
static UnicodeSet usePicture;// = new UnicodeSet("[[:whitespace:][:defaultignorablecodepoint:]]");
|
||||
|
||||
static UCD ucd41;
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
//ConvertUCD.main(new String[]{"5.0.0"});
|
||||
BlockInfo blockInfo = new BlockInfo("5.0.0", "NamesList.txt");
|
||||
// http://www.unicode.org/~book/incoming/kenfiles/U50M051010.lst
|
||||
Default.setUCD("5.0.0");
|
||||
ucd41 = UCD.make("4.1.0");
|
||||
ToolUnicodePropertySource up = ToolUnicodePropertySource.make("5.0.0");
|
||||
skipChars = new UnicodeSet(up.getSet("gc=cn")).removeAll(up.getSet("gc=cn"));
|
||||
//"[[:gc=cn:]-[:noncharactercodepoint:]]");
|
||||
rtl = new UnicodeSet(up.getSet("bidiclass=r")).addAll(up.getSet("bidiclass=al"));// "[[:bidiclass=r:][:bidiclass=al:]]");
|
||||
usePicture = new UnicodeSet(up.getSet("whitespace=true")).addAll(up.getSet("defaultignorablecodepoint=true"));// new UnicodeSet("[[:whitespace:][:defaultignorablecodepoint:]]");
|
||||
|
||||
List nameList = new ArrayList();
|
||||
ArrayList lines = new ArrayList();
|
||||
UnicodeSet collectedCodePoints = new UnicodeSet();
|
||||
BitSet nameListNew = new BitSet();
|
||||
|
||||
int limit = Integer.MAX_VALUE;
|
||||
for (int count = 0; count < limit; ++count) {
|
||||
if (!blockInfo.next(lines)) break;
|
||||
String firstLine = (String)lines.get(0);
|
||||
if (firstLine.startsWith("@@@")) continue;
|
||||
String[] lineParts = firstLine.split("\t");
|
||||
String fileName = lineParts[1] + ".html";
|
||||
nameList.add(firstLine);
|
||||
System.out.println();
|
||||
System.out.println("file: " + chartPrefix + fileName);
|
||||
PrintWriter out = BagFormatter.openUTF8Writer("C:/DATA/GEN/charts/namelist/", chartPrefix + fileName);
|
||||
out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title>" +
|
||||
TransliteratorUtilities.toHTML.transliterate(getHeading(lineParts[2])) +
|
||||
"</title><link rel='stylesheet' type='text/css' href='namelist.css'>" +
|
||||
"<base target='names'></head><body>");
|
||||
|
||||
// header
|
||||
out.println("<table class='headerTable'><tr><td class='headerLeft'>" +
|
||||
lineParts[1] +
|
||||
" <a href='help.html'>help</a></td><td class='headerCenter'>" +
|
||||
getHeading(lineParts[2]) +
|
||||
"</td><td class='headerRight'><a href='mainList.html'>index</a> " +
|
||||
lineParts[3] +
|
||||
"</td></tr></table>");
|
||||
|
||||
if ("Unassigned".equals(lineParts[2])) {
|
||||
System.out.println("debug");
|
||||
}
|
||||
// first pass through and collect all the code points
|
||||
collectedCodePoints.clear();
|
||||
for (int i = 1; i < lines.size(); ++i) {
|
||||
String line = (String)lines.get(i);
|
||||
int cp1 = line.charAt(0);
|
||||
if (cp1 != '@' && cp1 != '\t') {
|
||||
int cp = Integer.parseInt(line.split("\t")[0],16);
|
||||
collectedCodePoints.add(cp);
|
||||
}
|
||||
}
|
||||
collectedCodePoints.removeAll(skipChars);
|
||||
if (collectedCodePoints.size() == 0) {
|
||||
out.println("<p align='center'>No Names List</p>");
|
||||
} else {
|
||||
out.println("<div align='center'><table class='chart'><tr>");
|
||||
int counter = 0;
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(collectedCodePoints); it.next();) {
|
||||
if ((counter % 16) == 0 && counter != 0) {
|
||||
out.println("</tr><tr>");
|
||||
}
|
||||
String tdclass = "cell";
|
||||
if (counter < 16) tdclass = "cellw";
|
||||
if (it.codepoint == 0x242) {
|
||||
System.out.println("debug");
|
||||
}
|
||||
boolean isNew = isNew(it.codepoint);
|
||||
if (isNew) tdclass += "new";
|
||||
String hexcp = Utility.hex(it.codepoint, 4);
|
||||
String title = "";
|
||||
String name = Default.ucd().getName(it.codepoint);
|
||||
if (name != null) title = " title='" + TransliteratorUtilities.toHTML.transliterate(name.toLowerCase()) + "'";
|
||||
out.println("<td class='" + tdclass + "'"
|
||||
+ title
|
||||
+ ">\u00A0"
|
||||
+ showChar(it.codepoint) + "\u00A0<br><tt><a href='" + namePrefix + fileName + "#"+ hexcp + "'>" +
|
||||
hexcp + "</a></tt></td>");
|
||||
counter++;
|
||||
}
|
||||
if (counter > 16) {
|
||||
counter &= 0xF;
|
||||
if (counter != 0) for (; counter < 16; ++counter) out.println("<td class='cell'>\u00A0</td>");
|
||||
out.println("</tr></table></div>");
|
||||
}
|
||||
}
|
||||
out.close();
|
||||
out = BagFormatter.openUTF8Writer("C:/DATA/GEN/charts/namelist/", namePrefix + fileName);
|
||||
out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>" +
|
||||
"<link rel='stylesheet' type='text/css' href='namelist.css'></head><body>");
|
||||
|
||||
// now do the characters
|
||||
boolean inTable = false;
|
||||
for (int i = 1; i < lines.size(); ++i) {
|
||||
String line = (String)lines.get(i);
|
||||
try {
|
||||
if (line.startsWith("@")) {
|
||||
finishItem(out);
|
||||
if (inTable) {
|
||||
out.println("</table>");
|
||||
inTable = false;
|
||||
}
|
||||
if (line.startsWith("@+")) {
|
||||
line = line.substring(2).trim();
|
||||
out.println("<p class='comment'>"
|
||||
+ line
|
||||
+ "</p>");
|
||||
} else {
|
||||
line = line.substring(1).trim();
|
||||
out.println("<h2>"
|
||||
+ line
|
||||
+ "</h2>");
|
||||
}
|
||||
} else {
|
||||
if (!inTable) {
|
||||
out.println("<table>");
|
||||
inTable = true;
|
||||
}
|
||||
//String line2 = lineParts[1];
|
||||
if (line.startsWith("\t")) {
|
||||
String body = line.trim();
|
||||
if (false && line.indexOf(body) != 1) {
|
||||
System.out.println("Format error: too much inital whitespace: <" + line + ">");
|
||||
}
|
||||
char firstChar = body.charAt(0);
|
||||
switch (firstChar) {
|
||||
case '*': body = "\u2022 " + body.substring(2); break;
|
||||
case ':': body = checkCanonical(lastCodePoint, body); break;
|
||||
case '#': body = checkCompatibility(lastCodePoint, body); break;
|
||||
case 'x': body = getOther(body); break;
|
||||
case '=': break;
|
||||
default: throw new IllegalArgumentException("Huh? " + body);
|
||||
}
|
||||
out.println("<tr><td>\u00A0</td><td>\u00A0</td><td>"
|
||||
+ maybeNameStyle(showTextConvertingHex(body, firstChar != '='), firstChar == '=')
|
||||
+ "</td></tr>");
|
||||
} else {
|
||||
finishItem(out);
|
||||
lineParts = line.split("\t");
|
||||
String x = lineParts[0];
|
||||
lastCodePoint = Integer.parseInt(x,16);
|
||||
boolean lastCodePointIsNew = isNew(lastCodePoint);
|
||||
if (lastCodePointIsNew) nameListNew.set(nameList.size()-1, true);
|
||||
out.println("<tr><td"
|
||||
+ (lastCodePointIsNew ? " class='new'" : "")
|
||||
+ "><code><a name='" + x + "'>" + x + "</a></code></td><td>\u00A0"
|
||||
+ showChar(lastCodePoint) + "\u00A0</td><td"
|
||||
+ (lastCodePointIsNew ? " class='new'" : "") + ">"
|
||||
+ nameStyle(showTextConvertingHex(lineParts[1], false)) + "</td></tr>");
|
||||
lastDecompType = Default.ucd().getDecompositionType(lastCodePoint);
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw (IllegalArgumentException) new IllegalArgumentException("Error on line: " + line)
|
||||
.initCause(e);
|
||||
}
|
||||
}
|
||||
finishItem(out);
|
||||
out.close();
|
||||
}
|
||||
blockInfo.in.close();
|
||||
PrintWriter out = BagFormatter.openUTF8Writer("C:/DATA/GEN/charts/namelist/", "mainList.html");
|
||||
out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>" +
|
||||
"<title>Main List</title><link rel='stylesheet' type='text/css' href='namelist.css'>" +
|
||||
"<base target='chart'></head><body><table>");
|
||||
for (int i = 0; i < nameList.size(); ++i) {
|
||||
String line = (String) nameList.get(i);
|
||||
String[] lineParts = line.split("\t");
|
||||
String fileName = lineParts[1] + ".html";
|
||||
out.println("<tr><td><code>" + lineParts[1] +
|
||||
"</code></td><td"
|
||||
+ (nameListNew.get(i) ? " class='new'" : "")
|
||||
+ "><a href='" + chartPrefix + fileName + "'>" + getHeading(lineParts[2]) + "</a></td><td><code>" +
|
||||
lineParts[3] +"</code></td></tr>");
|
||||
}
|
||||
out.println("</table></body></html>");
|
||||
out.close();
|
||||
BagFormatter bf = new BagFormatter();
|
||||
//System.out.println(bf.showSetDifferences("Has name in decomps", hasName, "Has no name in decomps", hasNoName));
|
||||
System.out.println("Name differences: Canonical");
|
||||
showNameDifferences(hasNameCan, hasNoNameCan);
|
||||
System.out.println("Name differences: Compatibility");
|
||||
showNameDifferences(hasNameComp, hasNoNameComp);
|
||||
// System.out.println("Characters with names in decomps: " + hasName.toPattern(true));
|
||||
// System.out.println("Characters without names in decomps: " + hasNoName.toPattern(true));
|
||||
// System.out.println("Characters sometimes with, sometimes without names in decomps: " + both.toPattern(true));
|
||||
System.out.println("Done");
|
||||
}
|
||||
|
||||
private static boolean isNew(int codepoint) {
|
||||
return Default.ucd().isAllocated(codepoint) && !ucd41.isAllocated(codepoint);
|
||||
}
|
||||
|
||||
private static void showNameDifferences(Map hasName, Map hasNoName) {
|
||||
Set both = new TreeSet(hasNoName.keySet());
|
||||
both.retainAll(hasName.keySet());
|
||||
//hasNoName.removeAll(both);
|
||||
//hasName.removeAll(both);
|
||||
for (Iterator it = both.iterator(); it.hasNext();) {
|
||||
String decomp = (String) it.next();
|
||||
System.out.println();
|
||||
System.out.println("decomp: " + Utility.hex(decomp));
|
||||
System.out.println("Has name in: " + Utility.hex((String)hasName.get(decomp)));
|
||||
System.out.println("Has no name in: " + Utility.hex((String)hasNoName.get(decomp)));
|
||||
}
|
||||
System.out.println("Count: " + both.size());
|
||||
}
|
||||
|
||||
/**
 * Confusables source consulted by finishItem. Remains null when it cannot be
 * loaded, in which case the failure is only reported on standard error.
 */
static TestIdentifiers ti;
static {
    TestIdentifiers loaded = null;
    try {
        loaded = new TestIdentifiers("L");
    } catch (IOException ioe) {
        // NOTE(review): the error is not propagated; later uses of ti will fail
        ioe.printStackTrace();
    }
    ti = loaded;
}
|
||||
|
||||
private static void finishItem(PrintWriter out) {
|
||||
if (lastCodePoint < 0) return;
|
||||
if (lastDecompType != UCD.NONE) {
|
||||
System.out.println("Alert: missing decomp for " + Utility.hex(lastCodePoint));
|
||||
}
|
||||
String str = UTF16.valueOf(lastCodePoint);
|
||||
String upper = showForm(out, str, null, null, Default.ucd().getCase(str,UCD.FULL,UCD.UPPER), "\u2191");
|
||||
showForm(out, str, upper, null, Default.ucd().getCase(str,UCD.FULL,UCD.TITLE), "\u2195");
|
||||
String lower = showForm(out, str, null, null, Default.ucd().getCase(str,UCD.FULL,UCD.LOWER), "\u2193");
|
||||
showForm(out, lower, null, null, Default.ucd().getCase(str,UCD.FULL,UCD.FOLD), "\u2194");
|
||||
|
||||
String dc = Default.ucd().getDecompositionMapping(lastCodePoint);
|
||||
String nfd = showForm(out, dc, str, null, Default.nfd().normalize(lastCodePoint), "\u21DB");
|
||||
//String nfc = showForm(out, dc, null, Default.nfc().normalize(lastCodePoint), "\u21DB");
|
||||
String nfkd = showForm(out, dc, str, nfd, Default.nfkd().normalize(lastCodePoint), "\u21DD");
|
||||
|
||||
if (nfkd.equals(str)) {
|
||||
Set s = ti.getConfusables(lastCodePoint, "MA");
|
||||
if (s.size() > 1) {
|
||||
sortedSet.clear();
|
||||
for (Iterator it = s.iterator(); it.hasNext();) {
|
||||
sortedSet.add(Default.nfkd().normalize((String)it.next()));
|
||||
}
|
||||
sortedSet.remove(nfkd); // remove me
|
||||
for (Iterator it = sortedSet.iterator(); it.hasNext();) {
|
||||
String other = (String)it.next();
|
||||
if (nfkd.equals(Default.nfkd().normalize(other))) continue;
|
||||
out.println("<tr><td>\u00A0</td><td>\u00A0</td><td class='conf'>\u279F\u00A0"
|
||||
+ showTextConvertingHex(Utility.hex(other, 4, " + "), true)
|
||||
+ " "
|
||||
+ Default.ucd().getName(other, UCD.NORMAL, " + ").toLowerCase()
|
||||
// maybeNameStyle(showTextConvertingHex(upper, firstChar != '='), firstChar == '=')
|
||||
+ "</td></tr>");
|
||||
}
|
||||
}
|
||||
}
|
||||
lastCodePoint = -1;
|
||||
}
|
||||
|
||||
static Set sortedSet = new TreeSet(Collator.getInstance(ULocale.ENGLISH));
|
||||
|
||||
private static String showForm(PrintWriter out, String str, String str2, String str3, String transformed, String symbol) {
|
||||
if (!transformed.equals(str) && !transformed.equals(str2) && !transformed.equals(str3)) {
|
||||
out.println("<tr><td>\u00A0</td><td>\u00A0</td><td class='c'>" + symbol + "\u00A0"
|
||||
+ showTextConvertingHex(Utility.hex(transformed, 4, " + "), true)
|
||||
+ (UTF16.countCodePoint(transformed) != 1 ? "" :
|
||||
" " + Default.ucd().getName(transformed, UCD.NORMAL, " + ").toLowerCase())
|
||||
// maybeNameStyle(showTextConvertingHex(upper, firstChar != '='), firstChar == '=')
|
||||
+ "</td></tr>");
|
||||
}
|
||||
return transformed;
|
||||
}
|
||||
|
||||
static public String getHeading(String name) {
|
||||
int pos = name.lastIndexOf(" (");
|
||||
if (pos < 0) return name;
|
||||
return name.substring(0, pos);
|
||||
}
|
||||
|
||||
private static String maybeNameStyle(String string, boolean b) {
|
||||
if (b && string.equals(string.toUpperCase(Locale.ENGLISH))) return nameStyle(string);
|
||||
return string;
|
||||
}
|
||||
|
||||
|
||||
private static String nameStyle(String string) {
|
||||
// TODO Auto-generated method stub
|
||||
String result = "<i>" + Default.ucd().getCase(string, UCD.FULL, UCD.TITLE) + "</i>";
|
||||
// if it has any &xxx;, then restore them.
|
||||
int position = 0;
|
||||
while (true) {
|
||||
if (!escapeMatch.reset(result).find(position)) break;
|
||||
int start = escapeMatch.start();
|
||||
position = escapeMatch.end();
|
||||
result = result.substring(0,start)
|
||||
+ result.substring(start, position).toLowerCase()
|
||||
+ result.substring(position);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static Matcher escapeMatch = Pattern.compile("\\&[A-Z][a-z]*\\;").matcher("");
|
||||
|
||||
private static String showTextConvertingHex(String body, boolean addCharToHex) {
|
||||
body = TransliteratorUtilities.toHTML.transliterate(body);
|
||||
if (addCharToHex) {
|
||||
int position = 0;
|
||||
while (position < body.length()) {
|
||||
if (!findHex.reset(body).find(position)) break;
|
||||
position = findHex.end();
|
||||
int start = findHex.start();
|
||||
int len = position - start;
|
||||
if (len < 4 || len > 6) continue;
|
||||
int cp = Integer.parseInt(findHex.group(),16);
|
||||
if (cp > 0x10FFFF) continue;
|
||||
String insert = "\u00A0" + showChar(cp);
|
||||
String beginning = body.substring(0,start)
|
||||
+ "<code>" + body.substring(start, position) + "</code>"
|
||||
+ insert;
|
||||
body = beginning + body.substring(position);
|
||||
position = beginning.length();
|
||||
}
|
||||
}
|
||||
return body;
|
||||
}
|
||||
|
||||
static Matcher pointer = Pattern.compile("x \\((.*) - ([0-9A-F]+)\\)").matcher("");
|
||||
static Matcher pointer2 = Pattern.compile("x ([0-9A-F]{4,6})").matcher("");
|
||||
// Matches any non-empty run of characters from 0-9 and A-F (upper case only).
static Matcher findHex = Pattern.compile("[0-9A-F]+").matcher("");
|
||||
|
||||
private static String getOther(String body) {
|
||||
// of form: x (hyphenation point - 2027)
|
||||
// => arrow 2027 X hyphenation point
|
||||
int cp;
|
||||
String name = null;
|
||||
if (pointer.reset(body).matches()) {
|
||||
cp = Integer.parseInt(pointer.group(2),16);
|
||||
name = pointer.group(1);
|
||||
String name2 = Default.ucd().getName(cp);
|
||||
if (name2 == null) name2 = "<not a character>";
|
||||
if (!name.equalsIgnoreCase(name2)) {
|
||||
System.out.println("Mismatch in name for " + body + " in " + Utility.hex(lastCodePoint));
|
||||
System.out.println("\tName is: " + name2);
|
||||
}
|
||||
} else if (pointer2.reset(body).matches()) {
|
||||
cp = Integer.parseInt(pointer2.group(1),16);
|
||||
// name = UCharacter.getName(cp).toLowerCase();
|
||||
// System.out.println("Irregular format: " + body);
|
||||
} else {
|
||||
throw new IllegalArgumentException("Bad format: " + body);
|
||||
}
|
||||
return "\u2192 " + Utility.hex(cp,4) /*+ " " + showChar(cp)*/ + (name != null ? " " + name : "");
|
||||
}
|
||||
|
||||
static String showChar(int cp) {
|
||||
if (usePicture.contains(cp)) {
|
||||
int rep = '\u2588';
|
||||
if (cp <= 0x20) rep = 0x2400 + cp;
|
||||
else if (cp == 0x7F) rep = 0x2421;
|
||||
return "<span class='inv'>" + (char)rep + "</span>";
|
||||
//String hex = Utility.hex(cp);
|
||||
//return "<img alt='" + hex + "' src='http://www.unicode.org/cgi-bin/refglyph?24-" + hex + "'>";
|
||||
}
|
||||
|
||||
int type = Default.ucd().getCategory(cp);
|
||||
if (type == UCD.Cn || type == UCD.Co || type == UCD.Cs) {
|
||||
return "\u2588";
|
||||
}
|
||||
String result = TransliteratorUtilities.toHTML.transliterate(UTF16.valueOf(cp));
|
||||
if (type == UCD.Me || type == UCD.Mn) {
|
||||
result = "\u25CC" + result;
|
||||
} else if (rtl.contains(cp)) {
|
||||
result = "\u200E" + result + "\u200E";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
//static final UnicodeSet noname = new UnicodeSet("[[:ascii:][:ideographic:]]");
|
||||
// Filled by checkCanonical: decomposition mapping -> character, for entries
// whose annotation equals the hex form of the mapping alone.
static final Map hasNoNameCan = new TreeMap();
|
||||
// Filled by checkCanonical: decomposition mapping -> character, for entries
// whose annotation equals the hex form followed by the lower-cased name.
static final Map hasNameCan = new TreeMap();
|
||||
// Filled by checkCompatibility: decomposition mapping -> character, for
// entries whose annotation equals the expected text without a name.
static final Map hasNoNameComp = new TreeMap();
|
||||
// Filled by checkCompatibility: decomposition mapping -> character, for
// entries whose annotation equals the expected text followed by the name.
static final Map hasNameComp = new TreeMap();
|
||||
|
||||
private static String checkCanonical(int codePoint, String body) {
|
||||
body = body.substring(2);
|
||||
if (lastDecompType != UCD.CANONICAL) {
|
||||
System.out.println("Mismatching Decomposition Type: " + body + " in " + Utility.hex(codePoint));
|
||||
}
|
||||
String lastDecomp = Default.ucd().getDecompositionMapping(lastCodePoint);
|
||||
String hexed = Utility.hex(lastDecomp, 4, " ");
|
||||
String hexed2 = hexed;
|
||||
if (UTF16.countCodePoint(lastDecomp) == 1) {
|
||||
hexed2 += " " + Default.ucd().getName(lastDecomp).toLowerCase();
|
||||
}
|
||||
if (hexed.equalsIgnoreCase(body)) {
|
||||
hasNoNameCan.put(lastDecomp, UTF16.valueOf(codePoint));
|
||||
} else if (hexed2.equalsIgnoreCase(body)) {
|
||||
hasNameCan.put(lastDecomp, UTF16.valueOf(codePoint));
|
||||
} else {
|
||||
System.out.println("Mismatching Decomposition: " + body + " in " + Utility.hex(codePoint));
|
||||
System.out.println("\tShould be: " + hexed);
|
||||
}
|
||||
lastDecompType = UCD.NONE;
|
||||
return "\u2261 " + body;
|
||||
}
|
||||
|
||||
private static String checkCompatibility(int codePoint, String body) {
|
||||
body = body.substring(2);
|
||||
if (lastDecompType <= UCD.CANONICAL) {
|
||||
System.out.println("Mismatching Decomposition Type: " + body + " in " + Utility.hex(codePoint));
|
||||
}
|
||||
String lastDecomp = Default.ucd().getDecompositionMapping(lastCodePoint);
|
||||
String hexed = Utility.hex(lastDecomp, 4, " ");
|
||||
if (lastDecompType != UCD.COMPAT_UNSPECIFIED) {
|
||||
String lastDecompID = Default.ucd().getDecompositionTypeID(lastCodePoint);
|
||||
hexed = "<" + lastDecompID + "> " + hexed;
|
||||
}
|
||||
String hexed2 = hexed;
|
||||
if (UTF16.countCodePoint(lastDecomp) == 1) {
|
||||
hexed2 += " " + Default.ucd().getName(lastDecomp).toLowerCase();
|
||||
}
|
||||
if (hexed.equalsIgnoreCase(body)) {
|
||||
hasNoNameComp.put(lastDecomp, UTF16.valueOf(codePoint));
|
||||
} else if (hexed2.equalsIgnoreCase(body)) {
|
||||
hasNameComp.put(lastDecomp, UTF16.valueOf(codePoint));
|
||||
} else {
|
||||
System.out.println("Mismatching Decomposition: " + body + " in " + Utility.hex(codePoint));
|
||||
System.out.println("\tShould be: " + hexed);
|
||||
}
|
||||
lastDecompType = UCD.NONE;
|
||||
return "\u2248 " + body;
|
||||
}
|
||||
|
||||
static class BlockInfo {
|
||||
BufferedReader in;
|
||||
String lastLine;
|
||||
BlockInfo (String version, String filename) throws IOException {
|
||||
in = Utility.openUnicodeFile(filename, version, true, Utility.LATIN1_WINDOWS);
|
||||
//in = BagFormatter.openUTF8Reader(dir, filename);
|
||||
}
|
||||
boolean next(List inout) throws IOException {
|
||||
inout.clear();
|
||||
if (lastLine != null) {
|
||||
inout.add(lastLine);
|
||||
lastLine = null;
|
||||
}
|
||||
while (true) {
|
||||
String line = in.readLine();
|
||||
if (line == null) break;
|
||||
if (line.startsWith("@@\t")) {
|
||||
lastLine = line;
|
||||
break;
|
||||
}
|
||||
inout.add(line);
|
||||
}
|
||||
return inout.size() > 0;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
@ -1,405 +0,0 @@
|
||||
Generate: .*BreakTest.*
|
||||
DeltaVersion: 17
|
||||
CopyrightYear: 2006
|
||||
|
||||
File: auxiliary/GraphemeBreakProperty
|
||||
Property: Grapheme_Cluster_Break
|
||||
Format: skipValue=Other
|
||||
|
||||
File: auxiliary/WordBreakProperty
|
||||
Property: Word_Break
|
||||
Format: skipValue=Other
|
||||
|
||||
File: auxiliary/SentenceBreakProperty
|
||||
Property: Sentence_Break
|
||||
Format: skipValue=Other
|
||||
|
||||
File: auxiliary/GraphemeBreakTest
|
||||
Property: SPECIAL
|
||||
|
||||
File: auxiliary/WordBreakTest
|
||||
Property: SPECIAL
|
||||
|
||||
File: auxiliary/LineBreakTest
|
||||
Property: SPECIAL
|
||||
|
||||
File: auxiliary/SentenceBreakTest
|
||||
Property: SPECIAL
|
||||
|
||||
File: Blocks
|
||||
Property: Block
|
||||
# Note: When comparing block names, casing, whitespace, hyphens,
|
||||
# and underbars are ignored.
|
||||
# For example, "Latin Extended-A" and "latin extended a" are equivalent.
|
||||
# For more information on the comparison of property values,
|
||||
# see UCD.html.
|
||||
Format: valueList skipUnassigned=No_Block
|
||||
|
||||
File: CaseFolding
|
||||
Property: SPECIAL
|
||||
|
||||
File: DerivedAge
|
||||
Property: Age
|
||||
Format: nameStyle=none noLabel skipValue=unassigned
|
||||
|
||||
Value: 1.1
|
||||
# Assigned as of Unicode 1.1.0 (June, 1993)
|
||||
# [excluding removed Hangul Syllables]
|
||||
|
||||
Value: 2.0
|
||||
# Newly assigned in Unicode 2.0.0 (July, 1996)
|
||||
|
||||
Value: 2.1
|
||||
# Newly assigned in Unicode 2.1.2 (May, 1998)
|
||||
|
||||
Value: 3.0
|
||||
# Newly assigned in Unicode 3.0.0 (September, 1999)
|
||||
|
||||
Value: 3.1
|
||||
# Newly assigned in Unicode 3.1.0 (March, 2001)
|
||||
|
||||
Value: 3.2
|
||||
# Newly assigned in Unicode 3.2.0 (March, 2002)
|
||||
|
||||
Value: 4.0
|
||||
# Newly assigned in Unicode 4.0.0 (April, 2003)
|
||||
|
||||
Value: 4.1
|
||||
# Newly assigned in Unicode 4.1.0 (March, 2005)
|
||||
|
||||
Value: 5.0
|
||||
# Newly assigned in Unicode 5.0.0 (July, 2006)
|
||||
|
||||
File: extracted/DerivedBidiClass
|
||||
Property: Bidi_Class
|
||||
# Bidi Class (listing UnicodeData.txt, field 4: see UCD.html)
|
||||
# Unlike other properties, unassigned code points in blocks
|
||||
# reserved for right-to-left scripts are given either types R or AL.
|
||||
# The unassigned characters that default to R are:
|
||||
# Hebrew, Cypriot_Syllabary, Kharoshthi, and the ranges \u07C0-\u08FF
|
||||
# \uFB1D-\uFB4F \U00010840-\U000109FF \U00010A60-\U00010FFF
|
||||
# The unassigned characters that default to AL are:
|
||||
# Arabic, Syriac, Arabic_Supplement, Thaana, Arabic_Presentation_Forms_A,
|
||||
# Arabic_Presentation_Forms_B, minus the Noncharacter_Code_Points
|
||||
# For all other cases:
|
||||
Format: valueStyle=short skipUnassigned=Left_To_Right
|
||||
|
||||
File: extracted/DerivedBinaryProperties
|
||||
Property: Bidi_Mirrored
|
||||
# Bidi_Mirrored (listing UnicodeData.txt, field 9: see UCD.html)
|
||||
|
||||
File: extracted/DerivedCombiningClass
|
||||
Property: Canonical_Combining_Class
|
||||
# Combining Class (listing UnicodeData.txt, field 3: see UCD.html)
|
||||
Format: nameStyle=none valueStyle=short skipUnassigned=Not_Reordered
|
||||
|
||||
File: DerivedCoreProperties
|
||||
Property: Math
|
||||
# Derived Property: Math
|
||||
# Generated from: Sm + Other_Math
|
||||
|
||||
Property: Alphabetic
|
||||
# Derived Property: Alphabetic
|
||||
# Generated from: Lu+Ll+Lt+Lm+Lo+Nl + Other_Alphabetic
|
||||
|
||||
|
||||
Property: Lowercase
|
||||
# Derived Property: Lowercase
|
||||
# Generated from: Ll + Other_Lowercase
|
||||
|
||||
|
||||
Property: Uppercase
|
||||
# Derived Property: Uppercase
|
||||
# Generated from: Lu + Other_Uppercase
|
||||
|
||||
|
||||
Property: ID_Start
|
||||
# Derived Property: ID_Start
|
||||
# Characters that can start an identifier.
|
||||
# Generated from Lu+Ll+Lt+Lm+Lo+Nl+Other_ID_Start
|
||||
# NOTE: See UAX #31 for more information
|
||||
|
||||
Property: ID_Continue
|
||||
# Derived Property: ID_Continue
|
||||
# Characters that can continue an identifier.
|
||||
# Generated from: ID_Start + Mn+Mc+Nd+Pc + Other_ID_Continue
|
||||
# NOTE: See UAX #31 for more information
|
||||
|
||||
|
||||
Property: XID_Start
|
||||
# Derived Property: XID_Start
|
||||
# ID_Start modified for closure under NFKx
|
||||
# Modified as described in UAX #15
|
||||
# NOTE: Does NOT remove the non-NFKx characters.
|
||||
# Merely ensures that if isIdentifier(string) then isIdentifier(NFKx(string))
|
||||
# NOTE: See UAX #31 for more information
|
||||
|
||||
Property: XID_Continue
|
||||
# Derived Property: XID_Continue
|
||||
# Mod_ID_Continue modified for closure under NFKx
|
||||
# Modified as described in UAX #15
|
||||
# NOTE: Cf characters should be filtered out.
|
||||
# NOTE: Does NOT remove the non-NFKx characters.
|
||||
# Merely ensures that if isIdentifier(string) then isIdentifier(NFKx(string))
|
||||
# NOTE: See UAX #31 for more information
|
||||
|
||||
Property: Default_Ignorable_Code_Point
|
||||
# Derived Property: Default_Ignorable_Code_Point
|
||||
# Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs + Noncharacters
|
||||
# - White_Space - FFF9..FFFB (Annotation Characters)
|
||||
|
||||
Property: Grapheme_Extend
|
||||
# Derived Property: Grapheme_Extend
|
||||
# Generated from: Me + Mn + Other_Grapheme_Extend
|
||||
# Note: depending on an application's interpretation of Co (private use),
|
||||
# they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither.
|
||||
|
||||
|
||||
Property: Grapheme_Base
|
||||
# Derived Property: Grapheme_Base
|
||||
# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend
|
||||
# Note: depending on an application's interpretation of Co (private use),
|
||||
# they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither.
|
||||
|
||||
Property: Grapheme_Link
|
||||
# Derived Property: Grapheme_Link (deprecated)
|
||||
# Generated from: Canonical_Combining_Class=Virama
|
||||
# Use Canonical_Combining_Class=Virama directly instead
|
||||
|
||||
File: extracted/DerivedDecompositionType
|
||||
Property: Decomposition_Type
|
||||
Format: skipValue=None
|
||||
# Decomposition_Type (from UnicodeData.txt, field 5: see UCD.html)
|
||||
|
||||
File: extracted/DerivedEastAsianWidth
|
||||
Property: East_Asian_Width
|
||||
Format: valueStyle=short skipUnassigned=Neutral
|
||||
# East_Asian_Width (listing EastAsianWidth.txt, field 1)
|
||||
|
||||
File: extracted/DerivedGeneralCategory
|
||||
Property: General_Category
|
||||
Format: valueStyle=short noLabel
|
||||
|
||||
File: extracted/DerivedJoiningGroup
|
||||
Property: Joining_Group
|
||||
# Joining Group (listing ArabicShaping.txt, field 3)
|
||||
Format: skipValue=No_Joining_Group
|
||||
|
||||
File: extracted/DerivedJoiningType
|
||||
Property: Joining_Type
|
||||
# Type T is derived, as described in ArabicShaping.txt
|
||||
Format: valueStyle=short skipValue=Non_Joining
|
||||
|
||||
File: extracted/DerivedLineBreak
|
||||
Property: Line_Break
|
||||
Format: valueStyle=short skipUnassigned=Unknown
|
||||
|
||||
File: DerivedNormalizationProps
|
||||
|
||||
Property: FC_NFKC_Closure
|
||||
# Derived Property: FC_NFKC_Closure
|
||||
# Generated from computing: b = NFKC(Fold(a)); c = NFKC(Fold(b));
|
||||
# Then if (c != b) add the mapping from a to c to the set of
|
||||
# mappings that constitute the FC_NFKC_Closure list
|
||||
# Uses the full case folding from CaseFolding.txt, without the T option.
|
||||
Format: nameStyle=short
|
||||
|
||||
|
||||
Property: Full_Composition_Exclusion
|
||||
# Derived Property: Full_Composition_Exclusion
|
||||
# Generated from: Composition Exclusions + Singletons + Non-Starter Decompositions
|
||||
|
||||
|
||||
Property: NFD_QuickCheck
|
||||
# Derived Property: NFD_QuickCheck
|
||||
# Generated from computing decomposibles
|
||||
Format: nameStyle=short valueStyle=short skipValue=Yes
|
||||
|
||||
|
||||
Property: NFC_QuickCheck
|
||||
# Derived Property: NFC_QuickCheck
|
||||
# Generated from computing decomposibles (and characters that may compose with previous ones)
|
||||
Format: nameStyle=short valueStyle=short skipValue=Yes
|
||||
|
||||
Property: NFKD_QuickCheck
|
||||
# Derived Property: NFKD_QuickCheck
|
||||
# Generated from computing decomposibles
|
||||
Format: nameStyle=short valueStyle=short skipValue=Yes
|
||||
|
||||
|
||||
Property: NFKC_QuickCheck
|
||||
# Derived Property: NFKC_QuickCheck
|
||||
# Generated from computing decomposibles (and characters that may compose with previous ones)
|
||||
Format: nameStyle=short valueStyle=short skipValue=Yes
|
||||
|
||||
Property: Expands_On_NFD
|
||||
# Derived Property: Expands_On_NFD
|
||||
# Generated according to UAX #15.
|
||||
# Characters whose normalized length is not one.
|
||||
# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact.
|
||||
# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!
|
||||
|
||||
|
||||
Property: Expands_On_NFC
|
||||
# Derived Property: Expands_On_NFC
|
||||
# Generated according to UAX #15.
|
||||
# Characters whose normalized length is not one.
|
||||
# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact.
|
||||
# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!
|
||||
|
||||
|
||||
Property: Expands_On_NFKD
|
||||
# Derived Property: Expands_On_NFKD
|
||||
# Generated according to UAX #15.
|
||||
# Characters whose normalized length is not one.
|
||||
# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact.
|
||||
# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!
|
||||
|
||||
|
||||
Property: Expands_On_NFKC
|
||||
# Derived Property: Expands_On_NFKC
|
||||
# Generated according to UAX #15.
|
||||
# Characters whose normalized length is not one.
|
||||
# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact.
|
||||
# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!
|
||||
|
||||
|
||||
File: extracted/DerivedNumericType
|
||||
Property: Numeric_Type
|
||||
# Numeric Type (from UnicodeData.txt, field 6/7/8 plus Unihan.txt: see UCD.html)
|
||||
Format: skipValue=None
|
||||
|
||||
File: extracted/DerivedNumericValues
|
||||
Property: Numeric_Value
|
||||
# Numeric Values (from UnicodeData.txt, field 6/7/8)
|
||||
# WARNING: Certain values, such as 0.16666667, are repeating fractions
|
||||
# Although they are only printed with a limited number of decimal places
|
||||
# in this file, they should be expressed to the limits of the precision
|
||||
# available when used.
|
||||
Format: sortNumeric
|
||||
|
||||
File: HangulSyllableType
|
||||
Property: Hangul_Syllable_Type
|
||||
Format: valueStyle=short skipValue=Not_Applicable
|
||||
|
||||
File: NormalizationTest
|
||||
Property: SPECIAL
|
||||
|
||||
File: PropList
|
||||
|
||||
Property: White_Space
|
||||
|
||||
Property: Bidi_Control
|
||||
|
||||
Property: Join_Control
|
||||
|
||||
Property: Dash
|
||||
|
||||
Property: Hyphen
|
||||
|
||||
Property: Quotation_Mark
|
||||
|
||||
Property: Terminal_Punctuation
|
||||
|
||||
Property: Other_Math
|
||||
|
||||
Property: Hex_Digit
|
||||
|
||||
Property: ASCII_Hex_Digit
|
||||
|
||||
Property: Other_Alphabetic
|
||||
|
||||
Property: Ideographic
|
||||
|
||||
Property: Diacritic
|
||||
|
||||
Property: Extender
|
||||
|
||||
Property: Other_Lowercase
|
||||
|
||||
Property: Other_Uppercase
|
||||
|
||||
Property: Noncharacter_Code_Point
|
||||
|
||||
Property: Other_Grapheme_Extend
|
||||
|
||||
Property: IDS_Binary_Operator
|
||||
|
||||
Property: IDS_Trinary_Operator
|
||||
|
||||
Property: Radical
|
||||
|
||||
Property: Unified_Ideograph
|
||||
|
||||
Property: Other_Default_Ignorable_Code_Point
|
||||
|
||||
Property: Deprecated
|
||||
|
||||
Property: Soft_Dotted
|
||||
|
||||
Property: Logical_Order_Exception
|
||||
|
||||
Property: Other_ID_Start
|
||||
|
||||
Property: Other_ID_Continue
|
||||
|
||||
Property: STerm
|
||||
|
||||
Property: Variation_Selector
|
||||
Property: Pattern_White_Space
|
||||
Property: Pattern_Syntax
|
||||
|
||||
|
||||
File: PropertyAliases
|
||||
Property: SPECIAL
|
||||
|
||||
File: PropertyValueAliases
|
||||
Property: SPECIAL
|
||||
|
||||
File: Scripts
|
||||
Property: Script
|
||||
Format: nameStyle=none skipValue=Unknown
|
||||
|
||||
File: SpecialCasing
|
||||
Property: SPECIAL
|
||||
|
||||
File: StandardizedVariants
|
||||
Property: SPECIAL
|
||||
|
||||
File: NamedSequences
|
||||
Property: SPECIAL
|
||||
|
||||
HackName: noBreak
|
||||
HackName: Arabic_Presentation_Forms-A
|
||||
HackName: Arabic_Presentation_Forms-B
|
||||
HackName: CJK_Symbols_and_Punctuation
|
||||
HackName: Combining_Diacritical_Marks_for_Symbols
|
||||
HackName: Enclosed_CJK_Letters_and_Months
|
||||
HackName: Greek_and_Coptic
|
||||
HackName: Halfwidth_and_Fullwidth_Forms
|
||||
HackName: Latin-1_Supplement
|
||||
HackName: Latin_Extended-A
|
||||
HackName: Latin_Extended-B
|
||||
HackName: Miscellaneous_Mathematical_Symbols-A
|
||||
HackName: Miscellaneous_Mathematical_Symbols-B
|
||||
HackName: Miscellaneous_Symbols_and_Arrows
|
||||
HackName: Superscripts_and_Subscripts
|
||||
HackName: Supplemental_Arrows-A
|
||||
HackName: Supplemental_Arrows-B
|
||||
HackName: Supplementary_Private_Use_Area-A
|
||||
HackName: Supplementary_Private_Use_Area-B
|
||||
HackName: Canadian-Aboriginal
|
||||
#HackName: Old-Italic
|
||||
|
||||
FinalComments
|
||||
Note that PropertyAliases sorts by the long name, while PropertyValueAliases
|
||||
sorts by the short name
|
||||
ArabicShaping
|
||||
BidiMirroring
|
||||
CompositionExclusions
|
||||
EastAsianWidth
|
||||
LineBreak
|
||||
StandardizedVariants
|
||||
UnicodeData
|
||||
|
||||
|
@ -1,50 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MyFloatLister.java,v $
|
||||
* $Date: 2004/03/11 19:03:17 $
|
||||
* $Revision: 1.6 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import java.io.*;
|
||||
import java.text.NumberFormat;
|
||||
import java.util.Locale;
|
||||
|
||||
class MyFloatLister extends PropertyLister {
|
||||
private double propMask;
|
||||
NumberFormat nf = NumberFormat.getNumberInstance(Locale.US);
|
||||
|
||||
public MyFloatLister(UCD ucd, double f, PrintWriter output) {
|
||||
this.propMask = f;
|
||||
this.output = output;
|
||||
this.ucdData = ucd;
|
||||
nf.setGroupingUsed(false);
|
||||
nf.setMaximumFractionDigits(8);
|
||||
nf.setMinimumFractionDigits(1);
|
||||
}
|
||||
|
||||
public String valueName(int cp) {
|
||||
return nf.format(ucdData.getNumericValue(cp));
|
||||
}
|
||||
|
||||
public String optionalName(int cp) {
|
||||
return ucdData.getNumericTypeID(cp);
|
||||
}
|
||||
|
||||
public byte status(int cp) {
|
||||
//if ((cp & 0xFFF) == 0) System.out.println("# " + Utility.hex(cp));
|
||||
if (false && !ucdData.isRepresented(cp)) {
|
||||
if (ucdData.mapToRepresentative(cp, ucdData.getCompositeVersion()) != cp) return PropertyLister.CONTINUE;
|
||||
return PropertyLister.CONTINUE;
|
||||
}
|
||||
if (ucdData.getCategory(cp) == Cn) return PropertyLister.CONTINUE;
|
||||
return ucdData.getNumericValue(cp) == propMask ? INCLUDE : EXCLUDE;
|
||||
}
|
||||
}
|
||||
|
@ -1,123 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MyPropertyLister.java,v $
|
||||
* $Date: 2004/02/18 03:08:59 $
|
||||
* $Revision: 1.12 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import java.io.*;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
final class MyPropertyLister extends PropertyLister {
|
||||
|
||||
static final boolean BRIDGE = false;
|
||||
|
||||
private int propMask;
|
||||
|
||||
private boolean isDefaultValue = false;
|
||||
|
||||
private UCDProperty up;
|
||||
|
||||
public MyPropertyLister(UCD ucd, int propMask, PrintWriter output) {
|
||||
this.propMask = propMask;
|
||||
this.output = output;
|
||||
this.ucdData = ucd;
|
||||
up = UnifiedBinaryProperty.make(propMask, ucd);
|
||||
if (propMask < COMBINING_CLASS) usePropertyComment = false; // skip gen cat
|
||||
isDefaultValue = up.isDefaultValue();
|
||||
}
|
||||
|
||||
public String headerString() {
|
||||
int main = (propMask & 0xFF00);
|
||||
if (main == COMBINING_CLASS) {
|
||||
String s = UCD.getCombiningClassID_fromIndex((short)(propMask & 0xFF), LONG);
|
||||
if (s.charAt(0) <= '9') s = "Other Combining Class";
|
||||
return "# " + s;
|
||||
} else if (main == BINARY_PROPERTIES) {
|
||||
return "";
|
||||
} else if (main == JOINING_GROUP) {
|
||||
return "";
|
||||
} else {
|
||||
return "";
|
||||
/*
|
||||
String shortID = up.getName(SHORT);
|
||||
String longID = up.getName(LONG);
|
||||
return "# ???? " + shortID + (shortID.equals(longID) ? "" : "\t(" + longID + ")");
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
public String valueName(int cp) {
|
||||
if (up.getValueType() == BINARY_PROP) return up.getName();
|
||||
return up.getValue(cp);
|
||||
}
|
||||
|
||||
public String missingValueName() {
|
||||
return up.getValue(NORMAL);
|
||||
}
|
||||
|
||||
public String optionalComment(int cp) {
|
||||
if (propMask < COMBINING_CLASS) return ""; // skip gen cat
|
||||
int cat = ucdData.getCategory(cp);
|
||||
if (cat == Lt || cat == Ll || cat == Lu) return "L&";
|
||||
return ucdData.getCategoryID(cp);
|
||||
}
|
||||
|
||||
/*
|
||||
public String optionalName(int cp) {
|
||||
if ((propMask & 0xFF00) == DECOMPOSITION_TYPE) {
|
||||
return Utility.hex(ucdData.getDecompositionMapping(cp));
|
||||
} else {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
public byte status(int cp) {
|
||||
//if (cp == 0xFFFF) {
|
||||
// System.out.println("# " + Utility.hex(cp));
|
||||
//}
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
//if (cp == 0x0385) {
|
||||
// System.out.println(Utility.hex(firstRealCp));
|
||||
//}
|
||||
|
||||
if (isDefaultValue
|
||||
&& cat == Cn
|
||||
&& propMask != (BINARY_PROPERTIES | Noncharacter_Code_Point)
|
||||
&& propMask != (BINARY_PROPERTIES | Other_Default_Ignorable_Code_Point)
|
||||
&& propMask != (CATEGORY | Cn)) {
|
||||
if (BRIDGE) return CONTINUE;
|
||||
else return EXCLUDE;
|
||||
}
|
||||
|
||||
boolean inSet = up.hasValue(cp);
|
||||
|
||||
/*
|
||||
if (cp >= 0x1D400 && cp <= 0x1D7C9 && cat != Cn) {
|
||||
if (propMask == (SCRIPT | LATIN_SCRIPT)) inSet = cp <= 0x1D6A3;
|
||||
else if (propMask == (SCRIPT | GREEK_SCRIPT)) inSet = cp > 0x1D6A3;
|
||||
}
|
||||
*/
|
||||
/* HACK
|
||||
1D400;MATHEMATICAL BOLD CAPITAL A;Lu;0;L;<font> 0041;;;;N;;;;;
|
||||
1D6A3;MATHEMATICAL MONOSPACE SMALL Z;Ll;0;L;<font> 007A;;;;N;;;;;
|
||||
1D6A8;MATHEMATICAL BOLD CAPITAL ALPHA;Lu;0;L;<font> 0391;;;;N;;;;;
|
||||
1D7C9;MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL;Ll;0;L;<font> 03D6;;;;N;;;;;
|
||||
*/
|
||||
|
||||
if (!inSet) return EXCLUDE;
|
||||
return INCLUDE;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -1,20 +0,0 @@
|
||||
package com.ibm.text.UCD;
|
||||
/**
 * Placeholder for a "skippable under normalization" test.
 *
 * Intended to find all the characters that are
 * <ol>
 * <li>not decomposed by the normalization form, and</li>
 * <li>of combining class 0,</li>
 * </ol>
 * and, for NFC or NFKC, additionally
 * <ol start="3">
 * <li>can never compose with a previous character,</li>
 * <li>can never compose with a following character,</li>
 * <li>can never change if another character is added.</li>
 * </ol>
 * Example: a-breve might satisfy the first four, but adding an ogonek changes
 * it to a-ogonek + breve.
 *
 * Nothing of this is implemented here yet.
 */
public class NFCSkippable {

    /**
     * Stub.
     *
     * @param cp the code point to test (ignored)
     * @return always {@code false}
     */
    public boolean is(int cp) {
        return false;
    }

    /** Does nothing. */
    public static void main(String[] args) {
    }
}
|
@ -1,301 +0,0 @@
|
||||
package com.ibm.text.UCD;
|
||||
import com.ibm.icu.impl.CollectionUtilities;
|
||||
import com.ibm.icu.text.Collator;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
import java.util.BitSet;
|
||||
import com.ibm.text.utility.*;
|
||||
import java.io.PrintWriter;
|
||||
|
||||
|
||||
public final class NFSkippable extends UCDProperty {
|
||||
|
||||
static final boolean DEBUG = false;
|
||||
|
||||
private Normalizer nf;
|
||||
private Normalizer nfd;
|
||||
private UCD ucd;
|
||||
private boolean composes;
|
||||
private int[] realTrailers = new int[100];
|
||||
private int realTrailerCount = 0;
|
||||
|
||||
/**
 * Builds the "skippable" property for one normalization form.
 *
 * For composing forms (NFC, NFKC) every code point is scanned once to collect
 * those that {@code nf.isTrailing} reports and that are not non-lead jamo;
 * {@code hasValue} later probes candidates against this list.
 *
 * @param normalizerMode the normalization form, as understood by {@code Normalizer}
 * @param inputUCD       the character database to query
 */
public NFSkippable(byte normalizerMode, UCD inputUCD) {
    isStandard = false;
    this.ucd = inputUCD;
    nf = new Normalizer(normalizerMode, ucd.getVersion());
    name = nf.getName() + "_Skippable";
    shortName = nf.getName() + "_Skip";
    header = "# Derived Property: " + name
        + "\r\n# Generated according to UAX #15."
        + "\r\n# Characters that don't interact with any others in this normalization form."
        + "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
        + "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";

    nfd = new Normalizer(Normalizer.NFD, ucd.getVersion());
    composes = normalizerMode == Normalizer.NFC || normalizerMode == Normalizer.NFKC;

    // preprocess to find possible trailers
    if (composes) {
        for (int cp2 = 0; cp2 <= 0x10FFFF; ++cp2) {
            if (!nf.isTrailing(cp2)) {
                continue;
            }
            if (ucd.isNonLeadJamo(cp2)) {
                continue;
            }
            // The backing array starts with a fixed capacity; grow it instead
            // of failing with ArrayIndexOutOfBoundsException when a Unicode
            // version has more trailing characters than that.
            if (realTrailerCount == realTrailers.length) {
                int[] grown = new int[realTrailers.length * 2];
                System.arraycopy(realTrailers, 0, grown, 0, realTrailerCount);
                realTrailers = grown;
            }
            realTrailers[realTrailerCount++] = cp2;
        }
    }
    Utility.fixDot();
}
|
||||
|
||||
/** A skippable character is<br>
|
||||
* a) unassigned, or ALL of the following:<br>
|
||||
* b) of combining class 0.<br>
|
||||
* c) not decomposed by this normalization form.<br>
|
||||
* AND if NFC or NFKC, <br>
|
||||
* d) can never compose with a previous character.<br>
|
||||
* e) can never compose with a following character.<br>
|
||||
* f) can never change if another character is added.
|
||||
* Example: a-breve might satisfy all but f, but if you
|
||||
* add an ogonek it changes to a-ogonek + breve
|
||||
*/
|
||||
|
||||
String cause = "";
|
||||
|
||||
public boolean hasValue(int cp) {
|
||||
// quick check on some special classes
|
||||
if (DEBUG) cause = "\t\tunassigned";
|
||||
if (!ucd.isAssigned(cp)) return true;
|
||||
|
||||
if (DEBUG) cause = "\t\tnf differs";
|
||||
if (!nf.isNormalized(cp)) return false;
|
||||
|
||||
if (DEBUG) cause = "\t\tnon-zero cc";
|
||||
if (ucd.getCombiningClass(cp) != 0) return false;
|
||||
|
||||
if (DEBUG) cause = "";
|
||||
if (!composes) return true;
|
||||
|
||||
// now special checks for composing normalizers
|
||||
if (DEBUG) cause = "\t\tleading";
|
||||
if (nf.isLeading(cp)) return false;
|
||||
|
||||
if (DEBUG) cause = "\t\ttrailing";
|
||||
if (nf.isTrailing(cp)) return false;
|
||||
|
||||
// OPTIMIZATION -- careful
|
||||
// If there is no NFD decomposition, then this character's accents can't be
|
||||
// "displaced", so we don't have to test further
|
||||
|
||||
if (DEBUG) cause = "\t\tno decomp";
|
||||
if (nfd.isNormalized(cp)) return true;
|
||||
|
||||
// OPTIMIZATION -- careful
|
||||
// Hangul syllables are skippable IFF they are isLeadingJamoComposition
|
||||
if (ucd.isHangulSyllable(cp)) return !ucd.isLeadingJamoComposition(cp);
|
||||
|
||||
// We now see if adding another character causes a problem.
|
||||
// brute force for now!!
|
||||
// We do skip the trailing Jamo, since those never displace!
|
||||
|
||||
StringBuffer base = new StringBuffer(UTF16.valueOf(cp));
|
||||
int baseLen = base.length();
|
||||
for (int i = 0; i < realTrailerCount; ++i) {
|
||||
base.setLength(baseLen); // shorten if needed
|
||||
base.append(UTF16.valueOf(realTrailers[i]));
|
||||
String probe = base.toString();
|
||||
String result = nf.normalize(probe);
|
||||
if (!result.equals(probe)) {
|
||||
if (DEBUG) cause = "\t\tinteracts with " + ucd.getCodeAndName(realTrailers[i]);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// passed the sieve, so we are ok
|
||||
if (DEBUG) cause = "";
|
||||
return true;
|
||||
}
|
||||
|
||||
// both the following should go into UTF16
|
||||
|
||||
public static String replace(String source, int toReplace, int replacement) {
|
||||
if (0 <= toReplace && toReplace <= 0xFFFF
|
||||
&& 0 <= replacement && replacement <= 0xFFFF) {
|
||||
return source.replace((char)toReplace, (char)replacement);
|
||||
}
|
||||
return replace(source, UTF16.valueOf(toReplace), UTF16.valueOf(replacement));
|
||||
}
|
||||
|
||||
public static String replace(String source, String toReplace, String replacement) {
|
||||
int pos = 0;
|
||||
StringBuffer result = new StringBuffer(source.length());
|
||||
while (true) {
|
||||
int newPos = source.indexOf(toReplace, pos);
|
||||
if (newPos >= 0) {
|
||||
result.append(source.substring(pos, newPos));
|
||||
result.append(replacement);
|
||||
pos = newPos + toReplace.length();
|
||||
} else if (pos != 0) {
|
||||
result.append(source.substring(pos));
|
||||
return result.toString();
|
||||
} else {
|
||||
return source; // no change necessary
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void writeStringInPieces(PrintWriter pw, String s, String term) {
|
||||
int start;
|
||||
int end;
|
||||
int lineLen = 64;
|
||||
for (start = 0; ; start = end) {
|
||||
if (start == 0) pw.print("\t \"");
|
||||
else pw.print("\t+ \"");
|
||||
end = s.length();
|
||||
if (end > start + lineLen) end = start + lineLen;
|
||||
|
||||
// if we have a slash in the last 5 characters, backup
|
||||
|
||||
int lastSlash = s.lastIndexOf('\\', end);
|
||||
if (lastSlash >= end-5) end = lastSlash;
|
||||
|
||||
// backup if we broke on a \
|
||||
|
||||
while (end > start && s.charAt(end-1) == '\\') --end;
|
||||
|
||||
pw.print(s.substring(start, end));
|
||||
if (end == s.length()) {
|
||||
pw.println('"' + term);
|
||||
break;
|
||||
} else {
|
||||
pw.println('"');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void testWriteStringInPieces() {
|
||||
String test =
|
||||
"[^\\u00C0-\\u00C5\\u00C7-\\u00CF\\u00D1-\\u00D6\\u00D9-\\u00DD"
|
||||
+ "\\u00E0-\\u00E5\\u00E7-\\u00EF\\u00F1-\\u00F6\\u00F9-\\u00FD\\u00F"
|
||||
+ "F-\\u010F\\u0112-\\u0125\\u0128-\\u0130\\u0134-\\u0137\\u0139-"
|
||||
+ "\\u013E\\u0143-\\u0148\\u014C-\\u0151\\u0154-\\u0165\\u0168-\\u017"
|
||||
+ "E\\u01A0-\\u01A1\\u01AF-\\u01B0\\u01CD-\\u01DC\\u01DE-\\u01E3\\u"
|
||||
+ "01E6-\\u01F0\\u01F4-\\u01F5\\u01F8-\\u021B\\u021E-\\u021F\\u0226";
|
||||
PrintWriter pw = new PrintWriter(System.out);
|
||||
writeStringInPieces(pw,test,"");
|
||||
writeStringInPieces(pw,replace(test, "\\", "\\\\"),"");
|
||||
|
||||
pw.flush();
|
||||
}
|
||||
|
||||
// Highest code point (inclusive) that generateSet scans.
static int limit = 0x10FFFF; // full version = 10ffff, for testing may use smaller
|
||||
|
||||
public static void main (String[] args) throws java.io.IOException {
|
||||
|
||||
|
||||
PrintWriter out = Utility.openPrintWriter("NFSafeSets.txt", Utility.UTF8_WINDOWS);
|
||||
out.println(Utility.BOM);
|
||||
out.println("NFSafeSets");
|
||||
out.println("Version: " + Default.ucd().getVersion());
|
||||
out.println("Date: " + Default.getDate());
|
||||
out.println();
|
||||
|
||||
for (int mode = NFD_UnsafeStart; mode <= NFKC_UnsafeStart; ++mode) {
|
||||
UCDProperty up = DerivedProperty.make(mode, Default.ucd());
|
||||
generateSet(out, "UNSAFE[" + Normalizer.getName((byte)(mode-NFD_UnsafeStart)) + "]", up);
|
||||
}
|
||||
|
||||
for (byte mode = NFD; mode <= NFKC; ++mode) {
|
||||
NFSkippable skipper = new NFSkippable(mode, Default.ucd());
|
||||
generateSet(out, "SKIPPABLE[" + Normalizer.getName(mode) + "]", skipper);
|
||||
}
|
||||
System.out.println("Done");
|
||||
out.close();
|
||||
}
|
||||
|
||||
// Root-locale collator; in the visible code it is only passed to the disabled pretty-print branch of generateSet.
static Collator UCA = Collator.getInstance(ULocale.ROOT);
|
||||
|
||||
static void generateSet(PrintWriter out, String label, UCDProperty up) {
|
||||
System.out.println("Generating: " + up.getName(NORMAL));
|
||||
UnicodeSet result = new UnicodeSet();
|
||||
for (int cp = 0; cp <= limit; ++cp) {
|
||||
Utility.dot(cp);
|
||||
if (up.hasValue(cp)) result.add(cp);
|
||||
}
|
||||
Utility.fixDot();
|
||||
|
||||
String rSet = result.toPattern(true);
|
||||
rSet = replace(rSet, "\\U", "\\\\U");
|
||||
rSet = replace(rSet, "\\u", "\\\\u");
|
||||
out.println(label + " = new UnicodeSet(");
|
||||
writeStringInPieces(out, rSet, ", false);");
|
||||
|
||||
if (true) {
|
||||
rSet = result.toPattern(false);
|
||||
} else {
|
||||
rSet = CollectionUtilities.prettyPrint(result, true, null, null, UCA, UCA);
|
||||
}
|
||||
|
||||
out.println("/*Unicode: ");
|
||||
writeStringInPieces(out, rSet, "*/");
|
||||
out.println();
|
||||
out.flush();
|
||||
System.out.println("Done");
|
||||
}
|
||||
|
||||
/*
|
||||
// DerivedProperty dp = new DerivedProperty(UCD.make(version));
|
||||
|
||||
System.out.println(skipper.getName(NORMAL));
|
||||
|
||||
UnicodeSet result = new UnicodeSet();
|
||||
for (int cp = 0; cp <= limit; ++cp) {
|
||||
Utility.dot(cp);
|
||||
if (skipper.hasProperty(cp)) result.add(cp);
|
||||
}
|
||||
Utility.fixDot();
|
||||
|
||||
String rSet = result.toPattern(true);
|
||||
rSet = replace(rSet, "\\U", "\\\\U");
|
||||
out.println("\tSKIPPABLE[" + skipper.getName(NORMAL)
|
||||
+ "] = new UnicodeSet(");
|
||||
writeStringInPieces(out, rSet, ", false);");
|
||||
out.println();
|
||||
|
||||
rSet = result.toPattern(false);
|
||||
out.println("/*Unicode: ");
|
||||
*/
|
||||
//writeStringInPieces(out, rSet, "*/");
|
||||
/*out.println();
|
||||
out.flush();
|
||||
|
||||
if (false) {
|
||||
NFSkippable skipper = new NFSkippable(Normalizer.NFC,"");
|
||||
NFSkippable skipper2 = new NFSkippable(Normalizer.NFKC,"");
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
if (cp > 0xFF) {
|
||||
if (!skipper.ucd.isAssigned(cp)) continue;
|
||||
byte cat = skipper.ucd.getCategory(cp);
|
||||
if (cat == PRIVATE_USE || cat == SURROGATE) continue;
|
||||
if (skipper.ucd.getCombiningClass(cp) != 0) continue;
|
||||
if (!skipper.nf.isNormalized(cp)) continue;
|
||||
if ((cp < 0xAC00 || cp > 0xAE00)
|
||||
&& cp != skipper.ucd.mapToRepresentative(cp, false)) continue;
|
||||
}
|
||||
|
||||
if (skipper2.hasProperty(cp) == skipper.hasProperty(cp)) continue;
|
||||
|
||||
String status = (skipper.hasProperty(cp) ? " SKIPc " : "NOSKIPc ")
|
||||
+ (skipper2.hasProperty(cp) ? " SKIPkc " : "NOSKIPkc ");
|
||||
System.out.println(status
|
||||
+ skipper.ucd.getCodeAndName(cp)
|
||||
+ skipper.cause);
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
}
|
@ -1,153 +0,0 @@
|
||||
<!doctype HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
<html>
|
||||
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
<meta http-equiv="Content-Language" content="en-us">
|
||||
<meta name="GENERATOR" content="Microsoft FrontPage 5.0">
|
||||
<meta name="ProgId" content="FrontPage.Editor.Document">
|
||||
<meta name="keywords" content="unicode, named sequences">
|
||||
<meta name="description" content="Describes and displays named sequences">
|
||||
<title>Named Sequences</title>
|
||||
<link rel="stylesheet" type="text/css" href="http://www.unicode.org/reports/reports.css">
|
||||
<style>
|
||||
<!--
|
||||
.copy { text-align: center; font-size: 150% }
|
||||
th, td { vertical-align: middle }
|
||||
tt { font-size: 8pt }
|
||||
table { padding: 2pt }
|
||||
-->
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body bgcolor="#ffffff">
|
||||
|
||||
<table class="header">
|
||||
<tr>
|
||||
<td class="icon"><a href="http://www.unicode.org">
|
||||
<img align="middle" alt="[Unicode]" border="0" src="http://www.unicode.org/webscripts/logo60s2.gif" width="34" height="33"></a> <a class="bar" href="http://www.unicode.org/ucd">Unicode
|
||||
Character Database</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="gray"> </td>
|
||||
</tr>
|
||||
</table>
|
||||
<div style="margin:1em">
|
||||
<table border="1" cellpadding="0" cellspacing="1" style="border-collapse: collapse" bordercolor="#111111" width="100%" id="AutoNumber1">
|
||||
<tr>
|
||||
<td width="100%">
|
||||
<p style="text-align: right">L2-XXX</p>
|
||||
<p><i>To: UTC<br>
|
||||
From: Mark Davis<br>
|
||||
Date: 2005-04-28</i></p>
|
||||
<p><i>One of the original ideas for Unicode 4.1.0 was to produce a NamedSequences.html,
|
||||
following the pattern of StandardizedVariants.html. This document was generated along those
|
||||
lines, but not added into U4.1.0. My suggestion instead is to add this file (with suitable
|
||||
style modifications, of course) as a chart someplace accessible under
|
||||
<a href="http://unicode.org/charts/">http://unicode.org/charts/</a>.</i></p>
|
||||
<p><i>Alternatively, we could also combine this with the StandardizedVariants.html to provide
|
||||
a unified chart of sequences, again someplace under <a href="http://unicode.org/charts/">
|
||||
http://unicode.org/charts/</a>.</i></p>
|
||||
<p><i><b>Note:</b> we don't have some of the glyphs quite right yet, but it should be
|
||||
sufficient for discussing the format. One of the innovations is having a separate column of
|
||||
text for copy&amp;paste; that needs discussion also.</i></td>
|
||||
</tr>
|
||||
</table>
|
||||
<h1><i><font color="#990000"> PROPOSED WORKING DRAFT<br>
|
||||
</font></i>Named Sequences</h1>
|
||||
<table class="wide">
|
||||
<tr>
|
||||
<td valign="top" width="144">Revision</td>
|
||||
<td valign="top">@revision@</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">Authors</td>
|
||||
<td valign="top">Members of the Editorial Committee</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">Date</td>
|
||||
<td valign="top">@date@</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">This Version</td>
|
||||
<td valign="top">
|
||||
<a href="http://www.unicode.org/Public/@updateDirectory@/NamedSequences-@revision@.html">
|
||||
http://www.unicode.org/Public/@updateDirectory@/NamedSequences-@revision@.html</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">Previous Version</td>
|
||||
<td valign="top">n/a</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">Latest Version</td>
|
||||
<td valign="top">n/a</td>
|
||||
</tr>
|
||||
</table>
|
||||
<h3><br>
|
||||
<i>Summary</i></h3>
|
||||
<blockquote>
|
||||
<p>This file provides a visual display of the named sequences derived from NamedSequences.txt. <i>The
|
||||
proposal is to add this, </i></p>
|
||||
</blockquote>
|
||||
<h3><i>Status</i></h3>
|
||||
<blockquote>
|
||||
<p><i>The file and the files described herein are part of the
|
||||
<a href="http://www.unicode.org/ucd">Unicode Character Database</a> (UCD) and are governed by
|
||||
the <a href="#Terms of Use">UCD Terms of Use</a> stated at the end.</i></p>
|
||||
</blockquote>
|
||||
<hr width="50%">
|
||||
<h2>Introduction</h2>
|
||||
<p>The tables here exhaustively list the valid, registered named sequences. The columns include a
|
||||
representative glyph, the sequence of code points in hex, and the name of the sequence. In
|
||||
addition, there is a last column entitled <i>Copyable</i>, which contains the literal text forming
|
||||
the sequence. That text can be copied and pasted elsewhere. The display of the text in this
|
||||
column is up to the capabilities of the browser and the set of available fonts. For more
|
||||
information, see <a href="http://www.unicode.org/help/display_problems.html">Display Problems?</a>.</p>
|
||||
<blockquote>
|
||||
<p><a name="fonts"><b>Note: </b></a>The representative glyphs used to show the named sequences
|
||||
are often derived from different physical fonts than the representative glyphs in the standard.
|
||||
They may therefore exhibit minor differences in size, proportion, style, or weight.</p>
|
||||
</blockquote>
|
||||
<p>@table@</p>
|
||||
<hr width="50%">
|
||||
<h2>UCD <a name="Terms of Use">Terms of Use</a></h2>
|
||||
<h3><i>Disclaimer</i></h3>
|
||||
<blockquote>
|
||||
<p><i>The Unicode Character Database is provided as is by Unicode, Inc. No claims are made as to
|
||||
fitness for any particular purpose. No warranties of any kind are expressed or implied. The
|
||||
recipient agrees to determine applicability of information provided. If this file has been
|
||||
purchased on magnetic or optical media from Unicode, Inc., the sole remedy for any claim will be
|
||||
exchange of defective media within 90 days of receipt.</i></p>
|
||||
<p><i>This disclaimer is applicable for all other data files accompanying the Unicode Character
|
||||
Database, some of which have been compiled by the Unicode Consortium, and some of which have
|
||||
been supplied by other sources.</i></p>
|
||||
</blockquote>
|
||||
<h3><i>Limitations on Rights to Redistribute This Data</i></h3>
|
||||
<blockquote>
|
||||
<p><i>Recipient is granted the right to make copies in any form for internal distribution and to
|
||||
freely use the information supplied in the creation of products supporting the Unicode<sup>TM</sup>
|
||||
Standard. The files in the Unicode Character Database can be redistributed to third parties or
|
||||
other organizations (whether for profit or not) as long as this notice and the disclaimer notice
|
||||
are retained. Information can be extracted from these files and used in documentation or
|
||||
programs, as long as there is an accompanying notice indicating the source.</i></p>
|
||||
</blockquote>
|
||||
<hr width="50%">
|
||||
<div align="center">
|
||||
<center>
|
||||
<table cellspacing="0" cellpadding="0" border="0">
|
||||
<tr>
|
||||
<td><a href="http://www.unicode.org/unicode/copyright.html">
|
||||
<img src="http://www.unicode.org/img/hb_notice.gif" border="0" alt="Access to Copyright and terms of use" width="216" height="50"></a></td>
|
||||
</tr>
|
||||
</table>
|
||||
<script language="Javascript" type="text/javascript" src="http://www.unicode.org/webscripts/lastModified.js">
|
||||
</script>
|
||||
</center>
|
||||
</div>
|
||||
<blockquote>
|
||||
</blockquote>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
@ -1,32 +0,0 @@
|
||||
#
|
||||
# Normalization Test Suite
|
||||
# Format:
|
||||
#
|
||||
# Columns (c1, c2,...) are separated by semicolons
|
||||
# Comments are indicated with hash marks
|
||||
#
|
||||
# CONFORMANCE:
|
||||
# 1. The following invariants must be true for all conformant implementations
|
||||
#
|
||||
# NFC
|
||||
# c2 == NFC(c1) == NFC(c2) == NFC(c3)
|
||||
# c4 == NFC(c4) == NFC(c5)
|
||||
#
|
||||
# NFD
|
||||
# c3 == NFD(c1) == NFD(c2) == NFD(c3)
|
||||
# c5 == NFD(c4) == NFD(c5)
|
||||
#
|
||||
# NFKC
|
||||
# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
|
||||
#
|
||||
# NFKD
|
||||
# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
|
||||
#
|
||||
# 2. For every code point X assigned in this version of Unicode that is not specifically
|
||||
# listed in Part 1, the following invariants must be true for all conformant
|
||||
# implementations:
|
||||
#
|
||||
# X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)
|
||||
#
|
||||
@Part0 # Specific cases
|
||||
#
|
@ -1,665 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
|
||||
* $Date: 2006/09/24 23:32:44 $
|
||||
* $Revision: 1.18 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
import com.sun.java_cup.internal.internal_error;
|
||||
|
||||
|
||||
/**
|
||||
* Implements Unicode Normalization Forms C, D, KC, KD.<br>
|
||||
* See UTR#15 for details.<br>
|
||||
* Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.<br>
|
||||
* The Unicode Consortium makes no expressed or implied warranty of any
|
||||
* kind, and assumes no liability for errors or omissions.
|
||||
* No liability is assumed for incidental and consequential damages
|
||||
* in connection with or arising out of the use of the information here.
|
||||
* @author Mark Davis
|
||||
*/
|
||||
|
||||
public final class Normalizer implements UCD_Types {
|
||||
public static final String copyright =
|
||||
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
|
||||
|
||||
// When true, internalCompose prints a trace line for each code point it examines.
public static boolean SHOW_PROGRESS = false;
|
||||
|
||||
/**
|
||||
* Create a normalizer for a given form.
|
||||
*/
|
||||
public Normalizer(byte form, String unicodeVersion) {
|
||||
this.form = form;
|
||||
this.composition = (form & NF_COMPOSITION_MASK) != 0;
|
||||
this.compatibility = (form & NF_COMPATIBILITY_MASK) != 0;
|
||||
this.data = getData(unicodeVersion);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a normalizer for a given form.
|
||||
*/
|
||||
// public Normalizer(byte form) {
|
||||
// this(form,"");
|
||||
//}
|
||||
|
||||
/**
|
||||
* Return string name
|
||||
*/
|
||||
public static String getName(byte form) {
|
||||
return UCD_Names.NF_NAME[form];
|
||||
}
|
||||
|
||||
/**
|
||||
* Return string name
|
||||
*/
|
||||
public String getName() {
|
||||
return getName(form);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return string name
|
||||
*/
|
||||
public String getUCDVersion() {
|
||||
return data.getUCDVersion();
|
||||
}
|
||||
|
||||
/**
|
||||
* Does compose?
|
||||
*/
|
||||
public boolean isComposition() {
|
||||
return composition;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does compose?
|
||||
*/
|
||||
public boolean isCompatibility() {
|
||||
return compatibility;
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalizes text according to the chosen form,
|
||||
* replacing contents of the target buffer.
|
||||
* @param source the original text, unnormalized
|
||||
* @param target the resulting normalized text
|
||||
*/
|
||||
public StringBuffer normalize(String source, StringBuffer target) {
|
||||
|
||||
// First decompose the source into target,
|
||||
// then compose if the form requires.
|
||||
|
||||
if (source.length() != 0) {
|
||||
internalDecompose(source, target, true, compatibility);
|
||||
if (composition) {
|
||||
internalCompose(target);
|
||||
}
|
||||
}
|
||||
return target;
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalizes text according to the chosen form,
|
||||
* replacing contents of the target buffer.
|
||||
* @param source the original text, unnormalized
|
||||
* @param target the resulting normalized text
|
||||
*/
|
||||
public boolean isFCD(String source) {
|
||||
if (source.length() == 0) return true;
|
||||
StringBuffer noReorder = new StringBuffer();
|
||||
StringBuffer reorder = new StringBuffer();
|
||||
|
||||
internalDecompose(source, noReorder, false, false);
|
||||
internalDecompose(source, reorder, true, false);
|
||||
|
||||
return reorder.toString().equals(noReorder.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalizes text according to the chosen form
|
||||
* @param source the original text, unnormalized
|
||||
* @return target the resulting normalized text
|
||||
*/
|
||||
public String normalize(String source) {
|
||||
return normalize(source, new StringBuffer()).toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalizes text according to the chosen form
|
||||
* @param newLocaleID the original text, unnormalized
|
||||
* @return target the resulting normalized text
|
||||
*/
|
||||
public String normalize(int cp) {
|
||||
return normalize(UTF16.valueOf(cp));
|
||||
}
|
||||
|
||||
/**
|
||||
private StringBuffer hasDecompositionBuffer = new StringBuffer();
|
||||
|
||||
public boolean hasDecomposition(int cp) {
|
||||
hasDecompositionBuffer.setLength(0);
|
||||
normalize(UTF16.valueOf(cp), hasDecompositionBuffer);
|
||||
if (hasDecompositionBuffer.length() != 1) return true;
|
||||
return cp != hasDecompositionBuffer.charAt(0);
|
||||
}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Does a quick check to see if the string is in the current form. Checks canonical order and
|
||||
* isAllowed().
|
||||
* @param newLocaleID source text
|
||||
* @return YES, NO, MAYBE
|
||||
*/
|
||||
/*
|
||||
public static final int NO = 0, YES = 1, MAYBE = -1;
|
||||
|
||||
public int quickCheck(String source) {
|
||||
short lastCanonicalClass = 0;
|
||||
int result = YES;
|
||||
for (int i = 0; i < source.length(); ++i) {
|
||||
char ch = source.charAt(i);
|
||||
short canonicalClass = data.getCanonicalClass(ch);
|
||||
if (lastCanonicalClass > canonicalClass && canonicalClass != 0) {
|
||||
return NO;
|
||||
}
|
||||
int check = isAllowed(ch);
|
||||
if (check == NO) return NO;
|
||||
if (check == MAYBE) result = MAYBE;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find whether the given character is allowed in the current form.
|
||||
* @return YES, NO, MAYBE
|
||||
*/
|
||||
/*
|
||||
public int isAllowed(char ch) {
|
||||
if (composition) {
|
||||
if (compatibility) {
|
||||
if (data.isCompatibilityExcluded(ch)) {
|
||||
return NO;
|
||||
}
|
||||
} else {
|
||||
if (data.isExcluded(ch)) {
|
||||
return NO;
|
||||
}
|
||||
}
|
||||
if (data.isTrailing(ch)) {
|
||||
return MAYBE;
|
||||
}
|
||||
} else { // decomposition: both NFD and NFKD
|
||||
if (data.normalizationDiffers(compatibility,ch)) return NO;
|
||||
}
|
||||
return YES;
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility: Gets the combining class of a character from the
|
||||
* Unicode Character Database. Only a byte is needed, but since they are signed in Java
|
||||
* return an int to forestall problems.
|
||||
* @param ch the source character
|
||||
* @return value from 0 to 255
|
||||
*/
|
||||
|
||||
public short getCanonicalClass(int ch) {
|
||||
return data.getCanonicalClass(ch);
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility: Checks whether there is a recursive decomposition of a character from the
|
||||
* Unicode Character Database. It is compatibility or canonical according to the particular
|
||||
* normalizer.
|
||||
* @param ch the source character
|
||||
*/
|
||||
public boolean isNormalized(int ch) {
|
||||
return !data.normalizationDiffers(ch, composition, compatibility);
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility: Checks whether there is a recursive decomposition of a character from the
|
||||
* Unicode Character Database. It is compatibility or canonical according to the particular
|
||||
* normalizer.
|
||||
* @param ch the source character
|
||||
*/
|
||||
public boolean isNormalized(String s) {
|
||||
if (UTF16.countCodePoint(s) > 1) {
|
||||
return !data.normalizationDiffers(UTF16.charAt(s,0), composition, compatibility);
|
||||
}
|
||||
return s.equals(normalize(s)); // TODO: OPTIMIZE LATER
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility: Gets recursive decomposition of a character from the
|
||||
* Unicode Character Database.
|
||||
* @param compatibility If false selects the recursive
|
||||
* canonical decomposition, otherwise selects
|
||||
* the recursive compatibility AND canonical decomposition.
|
||||
* @param ch the source character
|
||||
* @param buffer buffer to be filled with the decomposition
|
||||
*/
|
||||
public void getRecursiveDecomposition(char ch, StringBuffer buffer) {
|
||||
data.getRecursiveDecomposition(ch, buffer, compatibility);
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility: Gets composition mapping.
|
||||
* @return IntEnumeration with the pair -> value mapping, where the
|
||||
* pair is firstChar << 16 | secondChar.
|
||||
* Will need to be fixed for surrogates.
|
||||
*/
|
||||
|
||||
public void getCompositionStatus(BitSet leading, BitSet trailing, BitSet resulting) {
|
||||
Iterator it = data.compTable.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Long key = (Long)it.next();
|
||||
Integer result = (Integer)data.compTable.get(key);
|
||||
long keyLong = key.longValue();
|
||||
if (leading != null) leading.set((int)(keyLong >>> 32));
|
||||
if (trailing != null) trailing.set((int)keyLong);
|
||||
if (resulting != null) resulting.set(result.intValue());
|
||||
}
|
||||
for (int i = UCD.LBase; i < UCD.TLimit; ++i) {
|
||||
if (leading != null && UCD.isLeadingJamo(i)) leading.set(i); // set all initial Jamo (that form syllables)
|
||||
if (trailing != null && UCD.isNonLeadJamo(i)) trailing.set(i); // set all final Jamo (that form syllables)
|
||||
}
|
||||
if (leading != null) {
|
||||
for (int i = UCD.SBase; i < UCD.SLimit; ++i) {
|
||||
if (UCD.isDoubleHangul(i)) leading.set(i); // set all two-Jamo syllables
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isTrailing(int cp) {
|
||||
return this.composition ? data.isTrailing(cp) : false;
|
||||
}
|
||||
|
||||
public boolean isLeading(int cp) {
|
||||
return this.composition ? data.isLeading(cp) : false;
|
||||
}
|
||||
|
||||
public int getComposition(int first, int second) {
|
||||
return data.getPairwiseComposition(first, second);
|
||||
}
|
||||
|
||||
// ======================================
|
||||
// PRIVATES
|
||||
// ======================================
|
||||
|
||||
/**
|
||||
* The current form.
|
||||
*/
|
||||
private byte form; // form code given to the constructor; getName() looks its name up
|
||||
private boolean composition; // true when the NF_COMPOSITION_MASK bit of form is set
|
||||
private boolean compatibility; // true when the NF_COMPATIBILITY_MASK bit of form is set
|
||||
private UnicodeMap substituteMapping; // optional per-code-point replacement text used by internalDecompose; may be null
|
||||
|
||||
/**
|
||||
* Decomposes text, either canonical or compatibility,
|
||||
* replacing contents of the target buffer.
|
||||
* @param form the normalization form. If NF_COMPATIBILITY_MASK
|
||||
* bit is on in this byte, then selects the recursive
|
||||
* compatibility decomposition, otherwise selects
|
||||
* the recursive canonical decomposition.
|
||||
* @param source the original text, unnormalized
|
||||
* @param target the resulting normalized text
|
||||
*/
|
||||
private void internalDecompose(String source, StringBuffer target, boolean reorder, boolean compat) {
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
int ch32;
|
||||
for (int i = 0; i < source.length(); i += UTF16.getCharCount(ch32)) {
|
||||
buffer.setLength(0);
|
||||
ch32 = UTF16.charAt(source, i);
|
||||
String sub = substituteMapping == null ? null : (String) substituteMapping.getValue(ch32);
|
||||
if (sub != null) {
|
||||
buffer.append(sub);
|
||||
} else {
|
||||
data.getRecursiveDecomposition(ch32, buffer, compat);
|
||||
}
|
||||
|
||||
// add all of the characters in the decomposition.
|
||||
// (may be just the original character, if there was
|
||||
// no decomposition mapping)
|
||||
|
||||
int ch;
|
||||
for (int j = 0; j < buffer.length(); j += UTF16.getCharCount(ch)) {
|
||||
ch = UTF16.charAt(buffer, j);
|
||||
int chClass = data.getCanonicalClass(ch);
|
||||
int k = target.length(); // insertion point
|
||||
if (chClass != 0 && reorder) {
|
||||
|
||||
// bubble-sort combining marks as necessary
|
||||
|
||||
int ch2;
|
||||
for (; k > 0; k -= UTF16.getCharCount(ch2)) {
|
||||
ch2 = UTF16.charAt(target, k-1);
|
||||
if (data.getCanonicalClass(ch2) <= chClass) break;
|
||||
}
|
||||
}
|
||||
target.insert(k, UTF16.valueOf(ch));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Composes text in place. Target must already
|
||||
* have been decomposed.
|
||||
* Uses UTF16, which is a utility class for supplementary character support in Java.
|
||||
* @param target input: decomposed text.
|
||||
* output: the resulting normalized text.
|
||||
*/
|
||||
private void internalCompose(StringBuffer target) {
|
||||
int starterPos = 0;
|
||||
int starterCh = UTF16.charAt(target,0);
|
||||
int compPos = UTF16.getCharCount(starterCh); // length of last composition
|
||||
int lastClass = data.getCanonicalClass(starterCh);
|
||||
if (lastClass != 0) lastClass = 256; // fix for strings staring with a combining mark
|
||||
int oldLen = target.length();
|
||||
|
||||
// Loop on the decomposed characters, combining where possible
|
||||
|
||||
int ch;
|
||||
for (int decompPos = compPos; decompPos < target.length(); decompPos += UTF16.getCharCount(ch)) {
|
||||
ch = UTF16.charAt(target, decompPos);
|
||||
if (SHOW_PROGRESS) System.out.println(Utility.hex(target)
|
||||
+ ", decompPos: " + decompPos
|
||||
+ ", compPos: " + compPos
|
||||
+ ", ch: " + Utility.hex(ch)
|
||||
);
|
||||
int chClass = data.getCanonicalClass(ch);
|
||||
int composite = data.getPairwiseComposition(starterCh, ch);
|
||||
if (composite != data.NOT_COMPOSITE
|
||||
&& (lastClass < chClass || lastClass == 0)) {
|
||||
UTF16.setCharAt(target, starterPos, composite);
|
||||
// we know that we will only be replacing non-supplementaries by non-supplementaries
|
||||
// so we don't have to adjust the decompPos
|
||||
starterCh = composite;
|
||||
} else {
|
||||
if (chClass == 0) {
|
||||
starterPos = compPos;
|
||||
starterCh = ch;
|
||||
}
|
||||
lastClass = chClass;
|
||||
UTF16.setCharAt(target, compPos, ch);
|
||||
if (target.length() != oldLen) { // MAY HAVE TO ADJUST!
|
||||
System.out.println("ADJUSTING: " + Utility.hex(target));
|
||||
decompPos += target.length() - oldLen;
|
||||
oldLen = target.length();
|
||||
}
|
||||
compPos += UTF16.getCharCount(ch);
|
||||
}
|
||||
}
|
||||
target.setLength(compPos);
|
||||
}
|
||||
|
||||
static class Stub {
|
||||
private UCD ucd;
|
||||
private HashMap compTable = new HashMap();
|
||||
private BitSet isSecond = new BitSet();
|
||||
private BitSet isFirst = new BitSet();
|
||||
private BitSet canonicalRecompose = new BitSet();
|
||||
private BitSet compatibilityRecompose = new BitSet();
|
||||
static final int NOT_COMPOSITE = 0xFFFF;
|
||||
|
||||
Stub(String version) {
|
||||
ucd = UCD.make(version);
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if (!ucd.isAssigned(i)) continue;
|
||||
if (ucd.isPUA(i)) continue;
|
||||
if (ucd.isNonLeadJamo(i)) isSecond.set(i);
|
||||
if (ucd.isLeadingJamoComposition(i)) isFirst.set(i);
|
||||
byte dt = ucd.getDecompositionType(i);
|
||||
if (dt != CANONICAL) continue;
|
||||
if (!ucd.getBinaryProperty(i, CompositionExclusion)) {
|
||||
try {
|
||||
String s = ucd.getDecompositionMapping(i);
|
||||
int len = UTF16.countCodePoint(s);
|
||||
if (len != 2) {
|
||||
if (len > 2) {
|
||||
if (ucd.getVersion().compareTo("3.0.0") >= 0) {
|
||||
throw new IllegalArgumentException("BAD LENGTH: " + len + ucd.toString(i));
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
int a = UTF16.charAt(s, 0);
|
||||
if (ucd.getCombiningClass(a) != 0) continue;
|
||||
isFirst.set(a);
|
||||
|
||||
int b = UTF16.charAt(s, UTF16.getCharCount(a));
|
||||
isSecond.set(b);
|
||||
|
||||
// have a recomposition, so set the bit
|
||||
canonicalRecompose.set(i);
|
||||
|
||||
// set the compatibility recomposition bit
|
||||
// ONLY if the component characters
|
||||
// don't compatibility decompose
|
||||
if (ucd.getDecompositionType(a) <= CANONICAL
|
||||
&& ucd.getDecompositionType(b) <= CANONICAL) {
|
||||
compatibilityRecompose.set(i);
|
||||
}
|
||||
|
||||
long key = (((long)a)<<32) | b;
|
||||
|
||||
/*if (i == '\u1E0A' || key == 0x004400000307) {
|
||||
System.out.println(Utility.hex(s));
|
||||
System.out.println(Utility.hex(i));
|
||||
System.out.println(Utility.hex(key));
|
||||
}*/
|
||||
compTable.put(new Long(key), new Integer(i));
|
||||
} catch (Exception e) {
|
||||
throw new ChainException("Error: {0}", new Object[]{ucd.toString(i)}, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
// process compatibilityRecompose
|
||||
// have to do this afterwards, since we don't know whether the pieces
|
||||
// are allowable until we have processed all the characters
|
||||
/*
|
||||
Iterator it = compTable.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Long key = (Long)it.next();
|
||||
int cp = compTable.get(key);
|
||||
long keyLong = key.longValue();
|
||||
int first = (int)(keyLong >>> 32);
|
||||
int second = (int)keyLong;
|
||||
if (ucd.
|
||||
*/
|
||||
}
|
||||
|
||||
String getUCDVersion() {
|
||||
return ucd.getVersion();
|
||||
}
|
||||
|
||||
/*
|
||||
Problem: differs: true, call: false U+0385 GREEK DIALYTIKA TONOS
|
||||
Problem: differs: true, call: false U+03D3 GREEK UPSILON WITH ACUTE AND HOOK SYMBOL
|
||||
Problem: differs: true, call: false U+03D4 GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL
|
||||
Problem: differs: true, call: false U+1E9B LATIN SMALL LETTER LONG S WITH DOT ABOVE
|
||||
Problem: differs: true, call: false U+1FC1 GREEK DIALYTIKA AND PERISPOMENI
|
||||
Problem: differs: true, call: false U+1FCD GREEK PSILI AND VARIA
|
||||
Problem: differs: true, call: false U+1FCE GREEK PSILI AND OXIA
|
||||
Problem: differs: true, call: false U+1FCF GREEK PSILI AND PERISPOMENI
|
||||
Problem: differs: true, call: false U+1FDD GREEK DASIA AND VARIA
|
||||
Problem: differs: true, call: false U+1FDE GREEK DASIA AND OXIA
|
||||
Problem: differs: true, call: false U+1FDF GREEK DASIA AND PERISPOMENI
|
||||
Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
|
||||
*/
|
||||
|
||||
short getCanonicalClass(int cp) {
|
||||
return ucd.getCombiningClass(cp);
|
||||
}
|
||||
|
||||
boolean isTrailing(int cp) {
|
||||
return isSecond.get(cp);
|
||||
}
|
||||
|
||||
boolean isLeading(int cp) {
|
||||
return isFirst.get(cp);
|
||||
}
|
||||
|
||||
boolean normalizationDiffers(int cp, boolean composition, boolean compat) {
|
||||
byte dt = ucd.getDecompositionType(cp);
|
||||
if (!composition) {
|
||||
if (compat) return dt >= CANONICAL;
|
||||
else return dt == CANONICAL;
|
||||
} else {
|
||||
// almost the same, except that we add back in the characters
|
||||
// that RECOMPOSE
|
||||
if (compat) return dt >= CANONICAL && !compatibilityRecompose.get(cp);
|
||||
else return dt == CANONICAL && !canonicalRecompose.get(cp);
|
||||
}
|
||||
}
|
||||
|
||||
public void getRecursiveDecomposition(int cp, StringBuffer buffer, boolean compat) {
|
||||
byte dt = ucd.getDecompositionType(cp);
|
||||
// we know we decompose all CANONICAL, plus > CANONICAL if compat is TRUE.
|
||||
if (dt == CANONICAL || dt > CANONICAL && compat) {
|
||||
String s = ucd.getDecompositionMapping(cp);
|
||||
if (s.equals(UTF16.valueOf(cp))) {
|
||||
System.out.println("fix");
|
||||
}
|
||||
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(s, i);
|
||||
getRecursiveDecomposition(cp, buffer, compat);
|
||||
}
|
||||
} else {
|
||||
UTF16.append(buffer, cp);
|
||||
}
|
||||
}
|
||||
|
||||
int getPairwiseComposition(int starterCh, int ch) {
|
||||
int hangulPoss = UCD.composeHangul(starterCh, ch);
|
||||
if (hangulPoss != 0xFFFF) return hangulPoss;
|
||||
Object obj = compTable.get(new Long((((long)starterCh)<<32) | ch));
|
||||
if (obj == null) return 0xFFFF;
|
||||
return ((Integer)obj).intValue();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Contains normalization data from the Unicode Character Database.
|
||||
* use false for the minimal set, true for the real set.
|
||||
*/
|
||||
private Stub data;
|
||||
|
||||
private static HashMap versionCache = new HashMap();
|
||||
|
||||
private static Stub getData (String version) {
|
||||
if (version.length() == 0) version = UCD.latestVersion;
|
||||
Stub result = (Stub)versionCache.get(version);
|
||||
if (result == null) {
|
||||
result = new Stub(version);
|
||||
versionCache.put(version, result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public UnicodeMap getSubstituteMapping() {
|
||||
return substituteMapping;
|
||||
}
|
||||
|
||||
public Normalizer setSubstituteMapping(UnicodeMap substituteMapping) {
|
||||
this.substituteMapping = substituteMapping;
|
||||
return this;
|
||||
}
|
||||
|
||||
static UnicodeMap spacingMap;;
|
||||
public void setSpacingSubstitute() {
|
||||
if (spacingMap == null) {
|
||||
makeSpacingMap();
|
||||
}
|
||||
setSubstituteMapping(spacingMap);
|
||||
}
|
||||
|
||||
private void makeSpacingMap() {
|
||||
spacingMap = new UnicodeMap();
|
||||
StringBuffer b = new StringBuffer();
|
||||
main:
|
||||
for (int i = 0; i <= 0x10FFFF; ++i) {
|
||||
boolean compat = data.ucd.getDecompositionType(i) >= data.ucd.CANONICAL;
|
||||
if (!compat) continue;
|
||||
b.setLength(0);
|
||||
data.getRecursiveDecomposition(i, b, true);
|
||||
if (b.length() == 1) continue;
|
||||
char firstChar = b.charAt(0);
|
||||
if (firstChar != 0x20 && firstChar != '\u0640') continue;
|
||||
// if rest are just Mn or Me marks, then add to substitute mapping
|
||||
int cp;
|
||||
for (int j = 1; j < b.length(); j += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(b,j);
|
||||
int cat = data.ucd.getCategory(cp);
|
||||
if (cat != data.ucd.Mn && cat != data.ucd.Me) continue main;
|
||||
}
|
||||
spacingMap.put(i, UTF16.valueOf(i));
|
||||
}
|
||||
String[][] specials = {
|
||||
{"[\\u0384\\u1FFD]", "\u00B4"},
|
||||
{"[\\uFFE3]", "\u00AF"},
|
||||
{"[\\uFE49-\\uFE4C]", "\u203E"},
|
||||
{"[\\u1FED]", "\u00A8\u0300"},
|
||||
{"[\\u1FEE\\u0385]", "\u00A8\u0301"},
|
||||
{"[\\u1FC1]", "\u00A8\u0342"},
|
||||
{"[\\u1FBD]", "\u1FBF"},
|
||||
{"[\\u1FCD]", "\u1FBF\u0300"},
|
||||
{"[\\u1FCE]", "\u1FBF\u0301"},
|
||||
{"[\\u1FCF]", "\u1FBF\u0342"},
|
||||
{"[\\u1FDD]", "\u1FFE\u0300"},
|
||||
{"[\\u1FDE]", "\u1FFE\u0301"},
|
||||
{"[\\u1FDF]", "\u1FFE\u0342"},
|
||||
{"[\\uFC5E]", "\uFE72\u0651"},
|
||||
{"[\\uFC5F]", "\uFE74\u0651"},
|
||||
{"[\\uFC60]", "\uFE76\u0651"},
|
||||
{"[\\uFC61]", "\uFE78\u0651"},
|
||||
{"[\\uFC62]", "\uFE7A\u0651"},
|
||||
{"[\\uFC63]", "\uFE7C\u0670"},
|
||||
{"[\\uFCF2]", "\uFE77\u0651"},
|
||||
{"[\\uFCF3]", "\uFE79\u0651"},
|
||||
{"[\\uFCF4]", "\uFE7B\u0651"},
|
||||
};
|
||||
int count = 0;
|
||||
UnicodeSet mappedChars = spacingMap.keySet();
|
||||
for (int i = 0; i < specials.length; ++i) {
|
||||
UnicodeSet source = new UnicodeSet(specials[i][0]);
|
||||
if (!mappedChars.containsAll(source)) {
|
||||
throw new InternalError("Remapping character that doesn't need it!" + source);
|
||||
}
|
||||
spacingMap.putAll(source, specials[i][1]);
|
||||
count += source.size();
|
||||
}
|
||||
spacingMap.freeze();
|
||||
}
|
||||
|
||||
/**
|
||||
* Just accessible for testing.
|
||||
*/
|
||||
/*
|
||||
boolean isExcluded (char ch) {
|
||||
return data.isExcluded(ch);
|
||||
}
|
||||
|
||||
/**
|
||||
* Just accessible for testing.
|
||||
*/
|
||||
/*
|
||||
String getRawDecompositionMapping (char ch) {
|
||||
return data.getRawDecompositionMapping(ch);
|
||||
}
|
||||
//*/
|
||||
}
|
@ -1,349 +0,0 @@
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
import com.ibm.text.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
|
||||
/**
|
||||
* Implements Unicode Normalization Forms C, D, KC, KD.<br>
|
||||
* See UTR#15 for details.<br>
|
||||
* Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.<br>
|
||||
* The Unicode Consortium makes no expressed or implied warranty of any
|
||||
* kind, and assumes no liability for errors or omissions.
|
||||
* No liability is assumed for incidental and consequential damages
|
||||
* in connection with or arising out of the use of the information here.
|
||||
* @author Mark Davis
|
||||
*/
|
||||
|
||||
public class NormalizerSample implements UCD_Types {
|
||||
static final String copyright = "Copyright (C) 2001, IBM Corp. and Unicode Inc. All Rights Reserved.";
|
||||
|
||||
public static boolean SHOW_PROGRESS = false;
|
||||
|
||||
/**
|
||||
* Create a normalizer for a given form.
|
||||
*/
|
||||
public NormalizerSample(byte form, String unicodeVersion) {
|
||||
this.composition = (form & COMPOSITION_MASK) != 0;
|
||||
this.compatibility = (form & COMPATIBILITY_MASK) != 0;
|
||||
this.data = getData(unicodeVersion);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a normalizer for a given form.
|
||||
*/
|
||||
public NormalizerSample(byte form) {
|
||||
this(form,"");
|
||||
}
|
||||
|
||||
/**
|
||||
* Masks for the form selector
|
||||
*/
|
||||
public static final byte
|
||||
COMPATIBILITY_MASK = 1,
|
||||
COMPOSITION_MASK = 2;
|
||||
|
||||
/**
|
||||
* Normalization Form Selector
|
||||
*/
|
||||
public static final byte
|
||||
NFD = 0 ,
|
||||
NFKD = COMPATIBILITY_MASK,
|
||||
NFC = COMPOSITION_MASK,
|
||||
NFKC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);
|
||||
|
||||
/**
|
||||
* Normalizes text according to the chosen form,
|
||||
* replacing contents of the target buffer.
|
||||
* @param source the original text, unnormalized
|
||||
* @param target the resulting normalized text
|
||||
*/
|
||||
public StringBuffer normalize(String source, StringBuffer target) {
|
||||
|
||||
// First decompose the source into target,
|
||||
// then compose if the form requires.
|
||||
|
||||
if (source.length() != 0) {
|
||||
internalDecompose(source, target);
|
||||
if (composition) {
|
||||
internalCompose(target);
|
||||
}
|
||||
}
|
||||
return target;
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalizes text according to the chosen form
|
||||
* @param source the original text, unnormalized
|
||||
* @return target the resulting normalized text
|
||||
*/
|
||||
public String normalize(String source) {
|
||||
return normalize(source, new StringBuffer()).toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalizes text according to the chosen form
|
||||
* @param newLocaleID the original text, unnormalized
|
||||
* @return target the resulting normalized text
|
||||
*/
|
||||
public String normalize(int cp) {
|
||||
return normalize(UTF16.valueOf(cp));
|
||||
}
|
||||
|
||||
/**
|
||||
*/
|
||||
private StringBuffer hasDecompositionBuffer = new StringBuffer();
|
||||
|
||||
public boolean hasDecomposition(int cp) {
|
||||
hasDecompositionBuffer.setLength(0);
|
||||
normalize(UTF16.valueOf(cp), hasDecompositionBuffer);
|
||||
if (hasDecompositionBuffer.length() != 1) return true;
|
||||
return cp != hasDecompositionBuffer.charAt(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility: Checks whether there is a recursive decomposition of a character from the
|
||||
* Unicode Character Database. It is compatibility or canonical according to the particular
|
||||
* normalizer.
|
||||
* @param ch the source character
|
||||
*/
|
||||
public boolean normalizationDiffers(int ch) {
|
||||
return data.normalizationDiffers(ch, composition, compatibility);
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility: Gets recursive decomposition of a character from the
|
||||
* Unicode Character Database.
|
||||
* @param compatibility If false selects the recursive
|
||||
* canonical decomposition, otherwise selects
|
||||
* the recursive compatibility AND canonical decomposition.
|
||||
* @param ch the source character
|
||||
* @param buffer buffer to be filled with the decomposition
|
||||
*/
|
||||
public void getRecursiveDecomposition(char ch, StringBuffer buffer) {
|
||||
data.getRecursiveDecomposition(ch, buffer, compatibility);
|
||||
}
|
||||
|
||||
|
||||
// ======================================
|
||||
// PRIVATES
|
||||
// ======================================
|
||||
|
||||
/**
|
||||
* The current form.
|
||||
*/
|
||||
private boolean composition;
|
||||
private boolean compatibility;
|
||||
|
||||
/**
|
||||
* Decomposes text, either canonical or compatibility,
|
||||
* replacing contents of the target buffer.
|
||||
* @param form the normalization form. If COMPATIBILITY_MASK
|
||||
* bit is on in this byte, then selects the recursive
|
||||
* compatibility decomposition, otherwise selects
|
||||
* the recursive canonical decomposition.
|
||||
* @param source the original text, unnormalized
|
||||
* @param target the resulting normalized text
|
||||
*/
|
||||
private void internalDecompose(String source, StringBuffer target) {
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
int ch32;
|
||||
for (int i = 0; i < source.length(); i += UTF16.getCharCount(ch32)) {
|
||||
buffer.setLength(0);
|
||||
ch32 = UTF16.charAt(source, i);
|
||||
data.getRecursiveDecomposition(ch32, buffer, compatibility);
|
||||
|
||||
// add all of the characters in the decomposition.
|
||||
// (may be just the original character, if there was
|
||||
// no decomposition mapping)
|
||||
|
||||
int ch;
|
||||
for (int j = 0; j < buffer.length(); j += UTF16.getCharCount(ch)) {
|
||||
ch = UTF16.charAt(buffer, j);
|
||||
int chClass = data.getCanonicalClass(ch);
|
||||
int k = target.length(); // insertion point
|
||||
if (chClass != 0) {
|
||||
|
||||
// bubble-sort combining marks as necessary
|
||||
|
||||
int ch2;
|
||||
for (; k > 0; k -= UTF16.getCharCount(ch2)) {
|
||||
ch2 = UTF16.charAt(target, k-1);
|
||||
if (data.getCanonicalClass(ch2) <= chClass) break;
|
||||
}
|
||||
}
|
||||
target.insert(k, UTF16.valueOf(ch));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Composes text in place. Target must already
|
||||
* have been decomposed.
|
||||
* Uses UTF16, which is a utility class for supplementary character support in Java.
|
||||
* @param target input: decomposed text.
|
||||
* output: the resulting normalized text.
|
||||
*/
|
||||
private void internalCompose(StringBuffer target) {
|
||||
int starterPos = 0;
|
||||
int starterCh = UTF16.charAt(target,0);
|
||||
int compPos = UTF16.getCharCount(starterCh); // length of last composition
|
||||
int lastClass = data.getCanonicalClass(starterCh);
|
||||
if (lastClass != 0) lastClass = 256; // fix for strings staring with a combining mark
|
||||
int oldLen = target.length();
|
||||
|
||||
// Loop on the decomposed characters, combining where possible
|
||||
|
||||
int ch;
|
||||
for (int decompPos = compPos; decompPos < target.length(); decompPos += UTF16.getCharCount(ch)) {
|
||||
ch = UTF16.charAt(target, decompPos);
|
||||
if (SHOW_PROGRESS) System.out.println(Utility.hex(target)
|
||||
+ ", decompPos: " + decompPos
|
||||
+ ", compPos: " + compPos
|
||||
+ ", ch: " + Utility.hex(ch)
|
||||
);
|
||||
int chClass = data.getCanonicalClass(ch);
|
||||
int composite = data.getPairwiseComposition(starterCh, ch);
|
||||
if (composite != data.NOT_COMPOSITE
|
||||
&& (lastClass < chClass || lastClass == 0)) {
|
||||
UTF16.setCharAt(target, starterPos, composite);
|
||||
// we know that we will only be replacing non-supplementaries by non-supplementaries
|
||||
// so we don't have to adjust the decompPos
|
||||
starterCh = composite;
|
||||
} else {
|
||||
if (chClass == 0) {
|
||||
starterPos = compPos;
|
||||
starterCh = ch;
|
||||
}
|
||||
lastClass = chClass;
|
||||
UTF16.setCharAt(target, compPos, ch);
|
||||
if (target.length() != oldLen) { // MAY HAVE TO ADJUST!
|
||||
System.out.println("ADJUSTING: " + Utility.hex(target));
|
||||
decompPos += target.length() - oldLen;
|
||||
oldLen = target.length();
|
||||
}
|
||||
compPos += UTF16.getCharCount(ch);
|
||||
}
|
||||
}
|
||||
target.setLength(compPos);
|
||||
}
|
||||
|
||||
// The following class makes use of the UCD class, which accesses data in the Unicode Character Database
|
||||
|
||||
static class Stub {
|
||||
private UCD ucd;
|
||||
private HashMap compTable = new HashMap();
|
||||
private BitSet isSecond = new BitSet();
|
||||
private BitSet canonicalRecompose = new BitSet();
|
||||
private BitSet compatibilityRecompose = new BitSet();
|
||||
static final int NOT_COMPOSITE = 0xFFFF;
|
||||
|
||||
Stub(String version) {
|
||||
ucd = UCD.make(version);
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if (!ucd.isAssigned(i)) continue;
|
||||
if (ucd.isPUA(i)) continue;
|
||||
if (ucd.isNonLeadJamo(i)) isSecond.set(i);
|
||||
byte dt = ucd.getDecompositionType(i);
|
||||
if (dt != CANONICAL) continue;
|
||||
if (!ucd.getBinaryProperty(i, CompositionExclusion)) {
|
||||
try {
|
||||
String s = ucd.getDecompositionMapping(i);
|
||||
int len = UTF16.countCodePoint(s);
|
||||
if (len != 2) {
|
||||
if (len > 2) throw new IllegalArgumentException("BAD LENGTH: " + len + ucd.toString(i));
|
||||
continue;
|
||||
}
|
||||
int a = UTF16.charAt(s, 0);
|
||||
if (ucd.getCombiningClass(a) != 0) continue;
|
||||
|
||||
int b = UTF16.charAt(s, UTF16.getCharCount(a));
|
||||
isSecond.set(b);
|
||||
|
||||
// have a recomposition, so set the bit
|
||||
canonicalRecompose.set(i);
|
||||
|
||||
// set the compatibility recomposition bit
|
||||
// ONLY if the component characters
|
||||
// don't compatibility decompose
|
||||
if (ucd.getDecompositionType(a) <= CANONICAL
|
||||
&& ucd.getDecompositionType(b) <= CANONICAL) {
|
||||
compatibilityRecompose.set(i);
|
||||
}
|
||||
|
||||
long key = (((long)a)<<32) | b;
|
||||
|
||||
compTable.put(new Long(key), new Integer(i));
|
||||
} catch (Exception e) {
|
||||
throw new ChainException("Error: {0}", new Object[]{ucd.toString(i)}, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
short getCanonicalClass(int cp) {
|
||||
return ucd.getCombiningClass(cp);
|
||||
}
|
||||
|
||||
boolean isTrailing(int cp) {
|
||||
return isSecond.get(cp);
|
||||
}
|
||||
|
||||
boolean normalizationDiffers(int cp, boolean composition, boolean compatibility) {
|
||||
byte dt = ucd.getDecompositionType(cp);
|
||||
if (!composition) {
|
||||
if (compatibility) return dt >= CANONICAL;
|
||||
else return dt == CANONICAL;
|
||||
} else {
|
||||
// almost the same, except that we add back in the characters
|
||||
// that RECOMPOSE
|
||||
if (compatibility) return dt >= CANONICAL && !compatibilityRecompose.get(cp);
|
||||
else return dt == CANONICAL && !canonicalRecompose.get(cp);
|
||||
}
|
||||
}
|
||||
|
||||
public void getRecursiveDecomposition(int cp, StringBuffer buffer, boolean compatibility) {
|
||||
byte dt = ucd.getDecompositionType(cp);
|
||||
// we know we decompose all CANONICAL, plus > CANONICAL if compatibility is TRUE.
|
||||
if (dt == CANONICAL || dt > CANONICAL && compatibility) {
|
||||
String s = ucd.getDecompositionMapping(cp);
|
||||
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(s, i);
|
||||
getRecursiveDecomposition(cp, buffer, compatibility);
|
||||
}
|
||||
} else {
|
||||
UTF16.append(buffer, cp);
|
||||
}
|
||||
}
|
||||
|
||||
int getPairwiseComposition(int starterCh, int ch) {
|
||||
int hangulPoss = UCD.composeHangul(starterCh, ch);
|
||||
if (hangulPoss != 0xFFFF) return hangulPoss;
|
||||
Object obj = compTable.get(new Long((((long)starterCh)<<32) | ch));
|
||||
if (obj == null) return 0xFFFF;
|
||||
return ((Integer)obj).intValue();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Contains normalization data from the Unicode Character Database.
|
||||
* use false for the minimal set, true for the real set.
|
||||
*/
|
||||
private Stub data;
|
||||
|
||||
private static HashMap versionCache = new HashMap();
|
||||
|
||||
private static Stub getData (String version) {
|
||||
if (version.length() == 0) version = UCD.latestVersion;
|
||||
Stub result = (Stub)versionCache.get(version);
|
||||
if (result == null) {
|
||||
result = new Stub(version);
|
||||
versionCache.put(version, result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
@ -1,109 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/OldUnicodeMap.java,v $
|
||||
* $Date: 2005/03/04 02:50:26 $
|
||||
* $Revision: 1.1 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* Class that maps from codepoints to an index, and optionally a label.
|
||||
*/
|
||||
public class OldUnicodeMap {
|
||||
UnicodeSet[] sets = new UnicodeSet[50];
|
||||
String[] labels = new String[50];
|
||||
int count = 0;
|
||||
|
||||
public int add(String label, UnicodeSet set) {
|
||||
return add(label, set, false, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add set
|
||||
*@param removeOld true: remove any collisions from sets already in the map
|
||||
* if false, remove any collisions from this set
|
||||
*@param signal: print a warning when collisions occur
|
||||
*/
|
||||
public int add(String label, UnicodeSet set, boolean removeOld, boolean signal) {
|
||||
// remove from any preceding!!
|
||||
for (int i = 0; i < count; ++i) {
|
||||
if (!set.containsSome(sets[i])) continue;
|
||||
if (signal) showOverlap(label, set, i);
|
||||
if (removeOld) {
|
||||
sets[i] = sets[i].removeAll(set);
|
||||
} else {
|
||||
set = set.removeAll(sets[i]);
|
||||
}
|
||||
}
|
||||
sets[count] = set;
|
||||
labels[count++] = label;
|
||||
return (short)(count - 1);
|
||||
}
|
||||
|
||||
public void showOverlap(String label, UnicodeSet set, int i) {
|
||||
UnicodeSet delta = new UnicodeSet(set).retainAll(sets[i]);
|
||||
System.out.println("Warning! Overlap with " + label + " and " + labels[i]
|
||||
+ ": " + delta);
|
||||
}
|
||||
|
||||
public int getIndex(int codepoint) {
|
||||
for (int i = count - 1; i >= 0; --i) {
|
||||
if (sets[i].contains(codepoint)) return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
public int getIndexFromLabel(String label) {
|
||||
for (int i = count - 1; i >= 0; --i) {
|
||||
if (labels[i].equalsIgnoreCase(label)) return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
public String getLabel(int codepoint) {
|
||||
return getLabelFromIndex(getIndex(codepoint));
|
||||
}
|
||||
|
||||
public String getLabelFromIndex(int index) {
|
||||
if (index < 0 || index >= count) return null;
|
||||
return labels[index];
|
||||
}
|
||||
|
||||
public UnicodeSet getSetFromIndex(int index) {
|
||||
if (index < 0 || index >= count) return null;
|
||||
return new UnicodeSet(sets[index]); // protect from changes
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return count;
|
||||
}
|
||||
|
||||
public int setLabel(int index, String label) {
|
||||
labels[index] = label;
|
||||
return index;
|
||||
}
|
||||
|
||||
public int put(int codepoint, int index) {
|
||||
if (sets[index] == null) {
|
||||
sets[index] = new UnicodeSet();
|
||||
if (index >= count) count = index + 1;
|
||||
}
|
||||
sets[index].add(codepoint);
|
||||
return index;
|
||||
}
|
||||
|
||||
}
|
@ -1,76 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ProcessUnihan.java,v $
|
||||
* $Date: 2005/03/04 02:50:26 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import java.io.*;
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import java.util.*;
|
||||
|
||||
// stub file, ignore
|
||||
|
||||
/**
 * Placeholder class with no members. The block comment inside keeps an abandoned
 * draft of a Unihan reader; none of it is compiled.
 */
public final class ProcessUnihan {
    /*

    static final boolean TESTING = false;
    static int type;

    public static void main() {
        try {
            type = 0;
            System.out.println("Starting");
            process();
        } catch (Exception e) {
            System.out.println("Exception: " + e);
        }
    }

    static PrintWriter out;
    static PrintWriter err;

    static int count;
    static int oldLine;

    static Map map = new HashMap();
    static Map tags = new HashMap();

    static void process() throws java.io.IOException {
        int lineCounter = 0;
        String[] parts = new String[3];

        //out = Utility.openPrintWriter("Transliterate_Han_English.txt");
        //err = Utility.openPrintWriter("Transliterate_Han_English.log.txt");

        BufferedReader in = Utility.openUnicodeFile("Unihan", "3.2.0", true, Utility.UTF8);
        while (true) {
            Utility.dot(++lineCounter);

            String line = in.readLine();
            if (line == null) break;
            int commentPos = line.indexOf('#');
            if (commentPos >= 0) line = line.substring(0,commentPos);
            line = line.trim();
            if (line.length() == 0) continue;
            int count = Utility.split(line, '#', parts);

            int code = Integer.parseInt(parts[0].substring(2), 16);
            Byte itag = (Byte) tags.get("a");
            if (itag == null) {}
            String tag = parts[1];
            String value = parts[2];
            if (tags.containsKey(tag)) {}
        }
    }
    */
}
|
||||
|
@ -1,41 +0,0 @@
|
||||
#
|
||||
# This file contains aliases for properties used in the UCD.
|
||||
# These names can be used for XML formats of UCD data, for regular-expression
|
||||
# property tests, and other programmatic textual descriptions of Unicode data.
|
||||
# For information on which properties are normative, see UCD.html.
|
||||
#
|
||||
# The names may be translated in appropriate environments, and additional
|
||||
# aliases may be useful.
|
||||
#
|
||||
# FORMAT
|
||||
#
|
||||
# Each line has two or more fields, separated by semicolons.
|
||||
#
|
||||
# First Field: The first field is an abbreviated name for the property.
|
||||
#
|
||||
# Second Field: The second field is a long name
|
||||
#
|
||||
# The above are the preferred aliases. Other aliases may be listed in additional fields.
|
||||
#
|
||||
# Loose matching should be applied to all property names and property values, with
|
||||
# the exception of String Property values. With loose matching of property names and
|
||||
# values, the case distinctions, whitespace, and '_' are ignored. For Numeric Property
|
||||
# values, numeric equivalencies are applied: thus "01.00" is equivalent to "1".
|
||||
#
|
||||
# NOTE: Property value names are NOT unique across properties. For example:
|
||||
#
|
||||
# AL means Arabic Letter for the Bidi_Class property, and
|
||||
# AL means Alpha_Left for the Combining_Class property, and
|
||||
# AL means Alphabetic for the Line_Break property.
|
||||
#
|
||||
# In addition, some property names may be the same as some property value names.
|
||||
# For example:
|
||||
#
|
||||
# sc means the Script property, and
|
||||
# Sc means the General_Category property value Currency_Symbol (Sc)
|
||||
#
|
||||
# The combination of property value and property name is, however, unique.
|
||||
#
|
||||
# For more information, see UTS #18: Regular Expression Guidelines
|
||||
# ================================================
|
||||
|
@ -1,248 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/PropertyLister.java,v $
|
||||
* $Date: 2003/03/19 17:30:56 $
|
||||
* $Revision: 1.11 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.*;
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import java.text.NumberFormat;
|
||||
|
||||
|
||||
abstract public class PropertyLister implements UCD_Types {
|
||||
|
||||
static final boolean COMPRESS_NAMES = false;
|
||||
static final boolean DROP_INDICATORS = true;
|
||||
|
||||
|
||||
protected UCD ucdData;
|
||||
protected PrintWriter output;
|
||||
protected boolean showOnConsole;
|
||||
protected boolean usePropertyComment = true;
|
||||
protected boolean breakByCategory = true;
|
||||
protected int firstRealCp = -2;
|
||||
protected int lastRealCp = -2;
|
||||
protected boolean alwaysBreaks = false; // set to true if property only breaks
|
||||
protected boolean commentOut = false;
|
||||
protected boolean useKenName = true; // set to false to get meaningful names
|
||||
private UnicodeSet set = new UnicodeSet();
|
||||
|
||||
public static final byte INCLUDE = 0, BREAK = 1, CONTINUE = 2, EXCLUDE = 3;
|
||||
|
||||
/**
|
||||
* @return status. Also have access to firstRealCp, lastRealCp
|
||||
*/
|
||||
abstract public byte status(int cp);
|
||||
|
||||
public String headerString() {
|
||||
return "";
|
||||
}
|
||||
|
||||
public String valueName(int cp) {
|
||||
return "";
|
||||
}
|
||||
|
||||
public String missingValueName() {
|
||||
return "";
|
||||
}
|
||||
|
||||
public String optionalName(int cp) {
|
||||
return "";
|
||||
}
|
||||
|
||||
public String optionalComment(int cp) {
|
||||
if (!usePropertyComment) return "";
|
||||
return ucdData.getModCatID_fromIndex(getModCat(cp));
|
||||
}
|
||||
|
||||
public int minPropertyWidth() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
public void format(int startCp, int endCp, int realCount) {
|
||||
try {
|
||||
set.add(startCp, endCp);
|
||||
String prop = valueName(startCp);
|
||||
String opt = "";
|
||||
String optCom = "";
|
||||
String commentSep = " # ";
|
||||
if (commentOut) commentSep = "";
|
||||
|
||||
if (prop.length() > 0) prop = "; " + prop;
|
||||
opt = optionalName(startCp);
|
||||
if (opt.length() > 0) opt = "; " + opt;
|
||||
optCom = optionalComment(startCp);
|
||||
if (optCom.length() > 0) optCom += " ";
|
||||
|
||||
String startName = getKenName(startCp);
|
||||
String line;
|
||||
String pgap = Utility.repeat(" ", minPropertyWidth() - prop.length() - opt.length());
|
||||
if (startCp != endCp) {
|
||||
String endName = getKenName(endCp);
|
||||
int bridge = endCp - startCp + 1 - realCount;
|
||||
String count = (bridge == 0) ? "" + realCount : realCount + "/" + bridge;
|
||||
String countStr = Utility.repeat(" ", 3-count.length()) + "[" + count + "] ";
|
||||
String gap = Utility.repeat(" ", 12 - width(startCp) - width(endCp));
|
||||
|
||||
line = Utility.hex(startCp,4) + ".." + Utility.hex(endCp,4) + gap
|
||||
+ prop + opt + pgap + commentSep + optCom
|
||||
+ countStr;
|
||||
if (startName.length() != 0 || endName.length() != 0) {
|
||||
int com = 0;
|
||||
if (COMPRESS_NAMES) com = commonInitialWords(startName, endName);
|
||||
if (com == 0) {
|
||||
line += startName + ".." + endName;
|
||||
} else {
|
||||
line += startName.substring(0,com)
|
||||
+ "(" + startName.substring(com) + ".." + endName.substring(com) + ")";
|
||||
}
|
||||
}
|
||||
} else {
|
||||
String gap = alwaysBreaks
|
||||
? Utility.repeat(" ", 6 - width(startCp))
|
||||
: Utility.repeat(" ", 14 - width(startCp));
|
||||
String gap2 = alwaysBreaks
|
||||
? " "
|
||||
: " ";
|
||||
line = Utility.hex(startCp,4) + gap
|
||||
+ prop + opt + pgap + commentSep + optCom + gap2
|
||||
+ startName;
|
||||
}
|
||||
if (commentOut) {
|
||||
line = "# " + line;
|
||||
}
|
||||
output.println(line);
|
||||
if (showOnConsole) System.out.println(line);
|
||||
} catch (Exception e) {
|
||||
throw new ChainException("Format error {0}, {1}",
|
||||
new Object[]{new Integer(startCp), new Integer(endCp)}, e);
|
||||
}
|
||||
}
|
||||
|
||||
int width(int cp) {
|
||||
return cp <= 0xFFFF ? 4
|
||||
: cp <= 0xFFFFF ? 5
|
||||
: 6;
|
||||
}
|
||||
|
||||
String getKenName(int cp) {
|
||||
String result = ucdData.getName(cp);
|
||||
if (!useKenName) return result;
|
||||
if (result == null) return "";
|
||||
if (DROP_INDICATORS && result.charAt(0) == '<') {
|
||||
if (cp < 0xFF) return "<control>";
|
||||
return "";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
byte getModCat(int cp) {
|
||||
byte result = ucdData.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : 0);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return common initial substring length ending with SPACE or HYPHEN-MINUS. 0 if there is none
|
||||
*/
|
||||
public static int commonInitialWords(String a, String b) {
|
||||
if (a.length() > b.length()) {
|
||||
String temp = a;
|
||||
a = b;
|
||||
b = temp;
|
||||
}
|
||||
int lastSpace = 0;
|
||||
for (int i = 0; i < a.length(); ++i) {
|
||||
char ca = a.charAt(i);
|
||||
char cb = b.charAt(i);
|
||||
if (ca != cb) return lastSpace;
|
||||
if (ca == ' ' || ca == '-') lastSpace = i + 1;
|
||||
}
|
||||
if (b.length() == a.length() || b.charAt(a.length()) == ' ' || b.charAt(a.length()) == '-') {
|
||||
lastSpace = a.length();
|
||||
}
|
||||
return lastSpace;
|
||||
}
|
||||
|
||||
public int print() {
|
||||
set.clear();
|
||||
int count = 0;
|
||||
firstRealCp = -1;
|
||||
byte firstRealCpCat = -1;
|
||||
lastRealCp = -1;
|
||||
int realRangeCount = 0;
|
||||
|
||||
String header = headerString();
|
||||
if (header.length() != 0) {
|
||||
// System.out.println(header);
|
||||
output.println(header);
|
||||
output.println();
|
||||
}
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
byte s = status(cp);
|
||||
if (alwaysBreaks && s == INCLUDE) s = BREAK;
|
||||
if (s == INCLUDE && firstRealCp != -1) {
|
||||
if (getModCat(cp) != firstRealCpCat) s = BREAK;
|
||||
}
|
||||
|
||||
switch(s) {
|
||||
case CONTINUE:
|
||||
break; // do nothing
|
||||
case INCLUDE:
|
||||
if (firstRealCp == -1) {
|
||||
firstRealCp = cp;
|
||||
firstRealCpCat = getModCat(firstRealCp);
|
||||
}
|
||||
lastRealCp = cp;
|
||||
count++;
|
||||
realRangeCount++;
|
||||
break;
|
||||
case BREAK:
|
||||
if (firstRealCp != -1) {
|
||||
format(firstRealCp, lastRealCp, realRangeCount);
|
||||
}
|
||||
lastRealCp = firstRealCp = cp;
|
||||
firstRealCpCat = getModCat(firstRealCp);
|
||||
|
||||
realRangeCount = 1;
|
||||
count++;
|
||||
break;
|
||||
case EXCLUDE:
|
||||
if (firstRealCp != -1) {
|
||||
format(firstRealCp, lastRealCp, realRangeCount);
|
||||
firstRealCp = -1;
|
||||
realRangeCount = 0;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (firstRealCp != -1) {
|
||||
format(firstRealCp, lastRealCp, realRangeCount);
|
||||
}
|
||||
|
||||
if (count == 0) {
|
||||
output.println("# No values for " + missingValueName());
|
||||
System.out.println("ZERO COUNT for " + missingValueName());
|
||||
}
|
||||
NumberFormat nf = NumberFormat.getInstance();
|
||||
nf.setMaximumFractionDigits(0);
|
||||
nf.setGroupingUsed(false);
|
||||
output.println();
|
||||
output.println("# Total code points: " + nf.format(count));
|
||||
output.println();
|
||||
//System.out.println(headerString());
|
||||
//System.out.println(set.toPattern(true));
|
||||
return count;
|
||||
}
|
||||
|
||||
}
|
@ -1,49 +0,0 @@
|
||||
#
|
||||
# This file contains aliases for property values used in the UCD.
|
||||
# These names can be used for XML formats of UCD data, for regular-expression
|
||||
# property tests, and other programmatic textual descriptions of Unicode data.
|
||||
# For information on which properties are normative, see UCD.html.
|
||||
#
|
||||
# The names may be translated in appropriate environments, and additional
|
||||
# aliases may be useful.
|
||||
#
|
||||
# FORMAT
|
||||
#
|
||||
# Each line describes a property value name.
|
||||
# This consists of three or more fields, separated by semicolons.
|
||||
#
|
||||
# First Field: The first field describes the property for which that
|
||||
# property value name is used.
|
||||
#
|
||||
# Second Field: The second field is an abbreviated name.
|
||||
# If there is no abbreviated name available, the field is marked with "n/a".
|
||||
#
|
||||
# Third Field: The third field is a long name.
|
||||
#
|
||||
# In the case of ccc, there are 4 fields. The second field is numeric, third
|
||||
# is abbreviated, and fourth is long.
|
||||
#
|
||||
# The above are the preferred aliases. Other aliases may be listed in additional fields.
|
||||
#
|
||||
# Loose matching should be applied to all property names and property values, with
|
||||
# the exception of String Property values. With loose matching of property names and
|
||||
# values, the case distinctions, whitespace, and '_' are ignored. For Numeric Property
|
||||
# values, numeric equivalencies are applied: thus "01.00" is equivalent to "1".
|
||||
#
|
||||
# NOTE: Property value names are NOT unique across properties. For example:
|
||||
#
|
||||
# AL means Arabic Letter for the Bidi_Class property, and
|
||||
#     AL means Above_Left for the Canonical_Combining_Class property, and
|
||||
# AL means Alphabetic for the Line_Break property.
|
||||
#
|
||||
# In addition, some property names may be the same as some property value names.
|
||||
# For example:
|
||||
#
|
||||
# sc means the Script property, and
|
||||
# Sc means the General_Category property value Currency_Symbol (Sc)
|
||||
#
|
||||
# The combination of property value and property name is, however, unique.
|
||||
#
|
||||
# For more information, see UTS #18: Regular Expression Guidelines
|
||||
# ================================================
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,266 +0,0 @@
|
||||
package com.ibm.text.UCD;
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
public class ScriptExceptions {
|
||||
|
||||
public static UnicodeSet getExceptions() {
|
||||
UnicodeSet contents = new UnicodeSet();
|
||||
// "FAIL: " => "contents.add(0x"
|
||||
// ";" => ");//"
|
||||
// ".." => ", 0x"
|
||||
|
||||
|
||||
|
||||
contents.add(0x005E);// COMMON # (Sk) CIRCUMFLEX ACCENT
|
||||
contents.add(0x0060);// COMMON # (Sk) GRAVE ACCENT
|
||||
contents.add(0x00A8);// COMMON # (Sk) DIAERESIS
|
||||
contents.add(0x00AF);// COMMON # (Sk) MACRON
|
||||
contents.add(0x00B4);// COMMON # (Sk) ACUTE ACCENT
|
||||
contents.add(0x00B8);// COMMON # (Sk) CEDILLA
|
||||
contents.add(0x02B9, 0x02BA);// COMMON # (Sk) MODIFIER LETTER PRIME, 0xMODIFIER LETTER DOUBLE PRIME
|
||||
contents.add(0x02C2, 0x02CF);// COMMON # (Sk) MODIFIER LETTER LEFT ARROWHEAD, 0xMODIFIER LETTER LOW ACUTE ACCENT
|
||||
contents.add(0x02D2, 0x02DF);// COMMON # (Sk) MODIFIER LETTER CENTRED RIGHT HALF RING, 0xMODIFIER LETTER CROSS ACCENT
|
||||
contents.add(0x02E5, 0x02ED);// COMMON # (Sk) MODIFIER LETTER EXTRA-HIGH TONE BAR, 0xMODIFIER LETTER UNASPIRATED
|
||||
contents.add(0x0374, 0x0375);// COMMON # (Sk) GREEK NUMERAL SIGN, 0xGREEK LOWER NUMERAL SIGN
|
||||
contents.add(0x0384, 0x0385);// COMMON # (Sk) GREEK TONOS, 0xGREEK DIALYTIKA TONOS
|
||||
contents.add(0x1FBD);// COMMON # (Sk) GREEK KORONIS
|
||||
contents.add(0x1FBF, 0x1FC1);// COMMON # (Sk) GREEK PSILI, 0xGREEK DIALYTIKA AND PERISPOMENI
|
||||
contents.add(0x1FCD, 0x1FCF);// COMMON # (Sk) GREEK PSILI AND VARIA, 0xGREEK PSILI AND PERISPOMENI
|
||||
contents.add(0x1FDD, 0x1FDF);// COMMON # (Sk) GREEK DASIA AND VARIA, 0xGREEK DASIA AND PERISPOMENI
|
||||
contents.add(0x1FED, 0x1FEF);// COMMON # (Sk) GREEK DIALYTIKA AND VARIA, 0xGREEK VARIA
|
||||
contents.add(0x1FFD, 0x1FFE);// COMMON # (Sk) GREEK OXIA, 0xGREEK DASIA
|
||||
contents.add(0x309B, 0x309C);// COMMON # (Sk) KATAKANA-HIRAGANA VOICED SOUND MARK, 0xKATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
|
||||
contents.add(0xFF3E);// COMMON # (Sk) FULLWIDTH CIRCUMFLEX ACCENT
|
||||
contents.add(0xFF40);// COMMON # (Sk) FULLWIDTH GRAVE ACCENT
|
||||
contents.add(0xFFE3);// COMMON # (Sk) FULLWIDTH MACRON
|
||||
|
||||
contents.add(0x0640);// COMMON # (Lm) ARABIC TATWEEL
|
||||
|
||||
contents.add(0x3006);// COMMON # (Lo) IDEOGRAPHIC CLOSING MARK
|
||||
contents.add(0x303C);// COMMON # (Lo) MASU MARK
|
||||
|
||||
contents.add(0x2135, 0x2138);// COMMON # (Lo) ALEF SYMBOL..DALET SYMBOL
|
||||
contents.add(0x1714);// TAGALOG # (Mn) TAGALOG SIGN VIRAMA
|
||||
|
||||
contents.add(0x1734);// HANUNOO # (Mn) HANUNOO SIGN PAMUDPOD
|
||||
//contents.add(0x0F3E, 0x0F3F);// COMMON # (Mc) TIBETAN SIGN YAR TSHES, 0xTIBETAN SIGN MAR TSHES
|
||||
|
||||
contents.add(0x2071);// COMMON # (LC) SUPERSCRIPT LATIN SMALL LETTER I
|
||||
contents.add(0x2102);// COMMON # (LC) DOUBLE-STRUCK CAPITAL C
|
||||
contents.add(0x2107);// COMMON # (LC) EULER CONSTANT
|
||||
contents.add(0x210A, 0x2113);// COMMON # (LC) SCRIPT SMALL G, 0xSCRIPT SMALL L
|
||||
contents.add(0x2115);// COMMON # (LC) DOUBLE-STRUCK CAPITAL N
|
||||
contents.add(0x2119, 0x211D);// COMMON # (LC) DOUBLE-STRUCK CAPITAL P, 0xDOUBLE-STRUCK CAPITAL R
|
||||
contents.add(0x2124);// COMMON # (LC) DOUBLE-STRUCK CAPITAL Z
|
||||
contents.add(0x2128);// COMMON # (LC) BLACK-LETTER CAPITAL Z
|
||||
contents.add(0x212C, 0x212D);// COMMON # (LC) SCRIPT CAPITAL B, 0xBLACK-LETTER CAPITAL C
|
||||
contents.add(0x212F, 0x2131);// COMMON # (LC) SCRIPT SMALL E, 0xSCRIPT CAPITAL F
|
||||
contents.add(0x2133, 0x2134);// COMMON # (LC) SCRIPT CAPITAL M, 0xSCRIPT SMALL O
|
||||
contents.add(0x2139);// COMMON # (LC) INFORMATION SOURCE
|
||||
contents.add(0x213D, 0x213F);// COMMON # (LC) DOUBLE-STRUCK SMALL GAMMA, 0xDOUBLE-STRUCK CAPITAL PI
|
||||
contents.add(0x2145, 0x2149);// COMMON # (LC) DOUBLE-STRUCK ITALIC CAPITAL D, 0xDOUBLE-STRUCK ITALIC SMALL J
|
||||
contents.add(0x1D400, 0x1D454);// COMMON # (LC) MATHEMATICAL BOLD CAPITAL A, 0xMATHEMATICAL ITALIC SMALL G
|
||||
contents.add(0x1D456, 0x1D49C);// COMMON # (LC) MATHEMATICAL ITALIC SMALL I, 0xMATHEMATICAL SCRIPT CAPITAL A
|
||||
contents.add(0x1D49E, 0x1D49F);// COMMON # (LC) MATHEMATICAL SCRIPT CAPITAL C, 0xMATHEMATICAL SCRIPT CAPITAL D
|
||||
contents.add(0x1D4A2);// COMMON # (LC) MATHEMATICAL SCRIPT CAPITAL G
|
||||
contents.add(0x1D4A5, 0x1D4A6);// COMMON # (LC) MATHEMATICAL SCRIPT CAPITAL J, 0xMATHEMATICAL SCRIPT CAPITAL K
|
||||
contents.add(0x1D4A9, 0x1D4AC);// COMMON # (LC) MATHEMATICAL SCRIPT CAPITAL N, 0xMATHEMATICAL SCRIPT CAPITAL Q
|
||||
contents.add(0x1D4AE, 0x1D4B9);// COMMON # (LC) MATHEMATICAL SCRIPT CAPITAL S, 0xMATHEMATICAL SCRIPT SMALL D
|
||||
contents.add(0x1D4BB);// COMMON # (LC) MATHEMATICAL SCRIPT SMALL F
|
||||
contents.add(0x1D4BD, 0x1D4C0);// COMMON # (LC) MATHEMATICAL SCRIPT SMALL H, 0xMATHEMATICAL SCRIPT SMALL K
|
||||
contents.add(0x1D4C2, 0x1D4C3);// COMMON # (LC) MATHEMATICAL SCRIPT SMALL M, 0xMATHEMATICAL SCRIPT SMALL N
|
||||
contents.add(0x1D4C5, 0x1D505);// COMMON # (LC) MATHEMATICAL SCRIPT SMALL P, 0xMATHEMATICAL FRAKTUR CAPITAL B
|
||||
contents.add(0x1D507, 0x1D50A);// COMMON # (LC) MATHEMATICAL FRAKTUR CAPITAL D, 0xMATHEMATICAL FRAKTUR CAPITAL G
|
||||
contents.add(0x1D50D, 0x1D514);// COMMON # (LC) MATHEMATICAL FRAKTUR CAPITAL J, 0xMATHEMATICAL FRAKTUR CAPITAL Q
|
||||
contents.add(0x1D516, 0x1D51C);// COMMON # (LC) MATHEMATICAL FRAKTUR CAPITAL S, 0xMATHEMATICAL FRAKTUR CAPITAL Y
|
||||
contents.add(0x1D51E, 0x1D539);// COMMON # (LC) MATHEMATICAL FRAKTUR SMALL A, 0xMATHEMATICAL DOUBLE-STRUCK CAPITAL B
|
||||
contents.add(0x1D53B, 0x1D53E);// COMMON # (LC) MATHEMATICAL DOUBLE-STRUCK CAPITAL D, 0xMATHEMATICAL DOUBLE-STRUCK CAPITAL G
|
||||
contents.add(0x1D540, 0x1D544);// COMMON # (LC) MATHEMATICAL DOUBLE-STRUCK CAPITAL I, 0xMATHEMATICAL DOUBLE-STRUCK CAPITAL M
|
||||
contents.add(0x1D546);// COMMON # (LC) MATHEMATICAL DOUBLE-STRUCK CAPITAL O
|
||||
contents.add(0x1D54A, 0x1D550);// COMMON # (LC) MATHEMATICAL DOUBLE-STRUCK CAPITAL S, 0xMATHEMATICAL DOUBLE-STRUCK CAPITAL Y
|
||||
contents.add(0x1D552, 0x1D6A3);// COMMON # (LC) MATHEMATICAL DOUBLE-STRUCK SMALL A, 0xMATHEMATICAL MONOSPACE SMALL Z
|
||||
contents.add(0x1D6A8, 0x1D6C0);// COMMON # (LC) MATHEMATICAL BOLD CAPITAL ALPHA, 0xMATHEMATICAL BOLD CAPITAL OMEGA
|
||||
contents.add(0x1D6C2, 0x1D6DA);// COMMON # (LC) MATHEMATICAL BOLD SMALL ALPHA, 0xMATHEMATICAL BOLD SMALL OMEGA
|
||||
contents.add(0x1D6DC, 0x1D6FA);// COMMON # (LC) MATHEMATICAL BOLD EPSILON SYMBOL, 0xMATHEMATICAL ITALIC CAPITAL OMEGA
|
||||
contents.add(0x1D6FC, 0x1D714);// COMMON # (LC) MATHEMATICAL ITALIC SMALL ALPHA, 0xMATHEMATICAL ITALIC SMALL OMEGA
|
||||
contents.add(0x1D716, 0x1D734);// COMMON # (LC) MATHEMATICAL ITALIC EPSILON SYMBOL, 0xMATHEMATICAL BOLD ITALIC CAPITAL OMEGA
|
||||
contents.add(0x1D736, 0x1D74E);// COMMON # (LC) MATHEMATICAL BOLD ITALIC SMALL ALPHA, 0xMATHEMATICAL BOLD ITALIC SMALL OMEGA
|
||||
contents.add(0x1D750, 0x1D76E);// COMMON # (LC) MATHEMATICAL BOLD ITALIC EPSILON SYMBOL, 0xMATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA
|
||||
contents.add(0x1D770, 0x1D788);// COMMON # (LC) MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA, 0xMATHEMATICAL SANS-SERIF BOLD SMALL OMEGA
|
||||
contents.add(0x1D78A, 0x1D7A8);// COMMON # (LC) MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL, 0xMATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA
|
||||
contents.add(0x1D7AA, 0x1D7C2);// COMMON # (LC) MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA, 0xMATHEMATICAL SANS-SERIF BOLD IT ALIC SMALL OMEGA
|
||||
contents.add(0x1D7C4, 0x1D7C9);// COMMON # (LC) MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL, 0xMATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL
|
||||
|
||||
|
||||
|
||||
|
||||
contents.add(0x02BB, 0x02C1);// COMMON # (0xLm) MODIFIER LETTER TURNED COMMA, 0xMODIFIER LETTER REVERSED GLOTTAL STOP
|
||||
contents.add(0x02D0, 0x02D1);// COMMON # (0xLm) MODIFIER LETTER TRIANGULAR COLON, 0xMODIFIER LETTER HALF TRIANGULAR COLON
|
||||
contents.add(0x02EE);// COMMON # (0xLm) MODIFIER LETTER DOUBLE APOSTROPHE
|
||||
contents.add(0x3031, 0x3035);// COMMON # (0xLm) VERTICAL KANA REPEAT MARK, 0xVERTICAL KANA REPEAT MARK LOWER HALF
|
||||
contents.add(0x30FC);// COMMON # (0xLm) KATAKANA-HIRAGANA PROLONGED SOUND MARK
|
||||
contents.add(0xFF70);// COMMON # (0xLm) HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
|
||||
contents.add(0xFF9E, 0xFF9F);// COMMON # (0xLm) HALFWIDTH KATAKANA VOICED SOUND MARK, 0xHALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
|
||||
|
||||
contents.add(0x0483, 0x0486);// CYRILLIC # (0xMn) COMBINING CYRILLIC TITLO, 0xCOMBINING CYRILLIC PSILI PNEUMATA
|
||||
|
||||
contents.add(0x0711);// SYRIAC # (0xMn) SYRIAC LETTER SUPERSCRIPT ALAPH
|
||||
contents.add(0x0730, 0x074A);// SYRIAC # (0xMn) SYRIAC PTHAHA ABOVE, 0xSYRIAC BARREKH
|
||||
|
||||
contents.add(0x07A6, 0x07B0);// THAANA # (0xMn) THAANA ABAFILI, 0xTHAANA SUKUN
|
||||
|
||||
contents.add(0x0901, 0x0902);// DEVANAGARI # (0xMn) DEVANAGARI SIGN CANDRABINDU, 0xDEVANAGARI SIGN ANUSVARA
|
||||
contents.add(0x093C);// DEVANAGARI # (0xMn) DEVANAGARI SIGN NUKTA
|
||||
contents.add(0x0941, 0x0948);// DEVANAGARI # (0xMn) DEVANAGARI VOWEL SIGN U, 0xDEVANAGARI VOWEL SIGN AI
|
||||
contents.add(0x094D);// DEVANAGARI # (0xMn) DEVANAGARI SIGN VIRAMA
|
||||
contents.add(0x0951, 0x0954);// DEVANAGARI # (0xMn) DEVANAGARI STRESS SIGN UDATTA, 0xDEVANAGARI ACUTE ACCENT
|
||||
contents.add(0x0962, 0x0963);// DEVANAGARI # (0xMn) DEVANAGARI VOWEL SIGN VOCALIC L, 0xDEVANAGARI VOWEL SIGN VOCALIC LL
|
||||
|
||||
contents.add(0x0981);// BENGALI # (0xMn) BENGALI SIGN CANDRABINDU
|
||||
contents.add(0x09BC);// BENGALI # (0xMn) BENGALI SIGN NUKTA
|
||||
contents.add(0x09C1, 0x09C4);// BENGALI # (0xMn) BENGALI VOWEL SIGN U, 0xBENGALI VOWEL SIGN VOCALIC RR
|
||||
contents.add(0x09CD);// BENGALI # (0xMn) BENGALI SIGN VIRAMA
|
||||
contents.add(0x09E2, 0x09E3);// BENGALI # (0xMn) BENGALI VOWEL SIGN VOCALIC L, 0xBENGALI VOWEL SIGN VOCALIC LL
|
||||
|
||||
contents.add(0x0A02);// GURMUKHI # (0xMn) GURMUKHI SIGN BINDI
|
||||
contents.add(0x0A3C);// GURMUKHI # (0xMn) GURMUKHI SIGN NUKTA
|
||||
contents.add(0x0A41, 0x0A42);// GURMUKHI # (0xMn) GURMUKHI VOWEL SIGN U, 0xGURMUKHI VOWEL SIGN UU
|
||||
contents.add(0x0A47, 0x0A48);// GURMUKHI # (0xMn) GURMUKHI VOWEL SIGN EE, 0xGURMUKHI VOWEL SIGN AI
|
||||
contents.add(0x0A4B, 0x0A4D);// GURMUKHI # (0xMn) GURMUKHI VOWEL SIGN OO, 0xGURMUKHI SIGN VIRAMA
|
||||
contents.add(0x0A70, 0x0A71);// GURMUKHI # (0xMn) GURMUKHI TIPPI, 0xGURMUKHI ADDAK
|
||||
|
||||
contents.add(0x0A81, 0x0A82);// GUJARATI # (0xMn) GUJARATI SIGN CANDRABINDU, 0xGUJARATI SIGN ANUSVARA
|
||||
contents.add(0x0ABC);// GUJARATI # (0xMn) GUJARATI SIGN NUKTA
|
||||
contents.add(0x0AC1, 0x0AC5);// GUJARATI # (0xMn) GUJARATI VOWEL SIGN U, 0xGUJARATI VOWEL SIGN CANDRA E
|
||||
contents.add(0x0AC7, 0x0AC8);// GUJARATI # (0xMn) GUJARATI VOWEL SIGN E, 0xGUJARATI VOWEL SIGN AI
|
||||
contents.add(0x0ACD);// GUJARATI # (0xMn) GUJARATI SIGN VIRAMA
|
||||
|
||||
contents.add(0x0B01);// ORIYA # (0xMn) ORIYA SIGN CANDRABINDU
|
||||
contents.add(0x0B3C);// ORIYA # (0xMn) ORIYA SIGN NUKTA
|
||||
contents.add(0x0B3F);// ORIYA # (0xMn) ORIYA VOWEL SIGN I
|
||||
contents.add(0x0B41, 0x0B43);// ORIYA # (0xMn) ORIYA VOWEL SIGN U, 0xORIYA VOWEL SIGN VOCALIC R
|
||||
contents.add(0x0B4D);// ORIYA # (0xMn) ORIYA SIGN VIRAMA
|
||||
contents.add(0x0B56);// ORIYA # (0xMn) ORIYA AI LENGTH MARK
|
||||
|
||||
contents.add(0x0B82);// TAMIL # (0xMn) TAMIL SIGN ANUSVARA
|
||||
contents.add(0x0BC0);// TAMIL # (0xMn) TAMIL VOWEL SIGN II
|
||||
contents.add(0x0BCD);// TAMIL # (0xMn) TAMIL SIGN VIRAMA
|
||||
|
||||
contents.add(0x0C3E, 0x0C40);// TELUGU # (0xMn) TELUGU VOWEL SIGN AA, 0xTELUGU VOWEL SIGN II
|
||||
contents.add(0x0C46, 0x0C48);// TELUGU # (0xMn) TELUGU VOWEL SIGN E, 0xTELUGU VOWEL SIGN AI
|
||||
contents.add(0x0C4A, 0x0C4D);// TELUGU # (0xMn) TELUGU VOWEL SIGN O, 0xTELUGU SIGN VIRAMA
|
||||
contents.add(0x0C55, 0x0C56);// TELUGU # (0xMn) TELUGU LENGTH MARK, 0xTELUGU AI LENGTH MARK
|
||||
|
||||
contents.add(0x0CBF);// KANNADA # (0xMn) KANNADA VOWEL SIGN I
|
||||
contents.add(0x0CC6);// KANNADA # (0xMn) KANNADA VOWEL SIGN E
|
||||
contents.add(0x0CCC, 0x0CCD);// KANNADA # (0xMn) KANNADA VOWEL SIGN AU, 0xKANNADA SIGN VIRAMA
|
||||
|
||||
contents.add(0x0D41, 0x0D43);// MALAYALAM # (0xMn) MALAYALAM VOWEL SIGN U, 0xMALAYALAM VOWEL SIGN VOCALIC R
|
||||
contents.add(0x0D4D);// MALAYALAM # (0xMn) MALAYALAM SIGN VIRAMA
|
||||
|
||||
contents.add(0x0DCA);// SINHALA # (0xMn) SINHALA SIGN AL-LAKUNA
|
||||
contents.add(0x0DD2, 0x0DD4);// SINHALA # (0xMn) SINHALA VOWEL SIGN KETTI IS-PILLA, 0xSINHALA VOWEL SIGN KETTI PAA-PILLA
|
||||
contents.add(0x0DD6);// SINHALA # (0xMn) SINHALA VOWEL SIGN DIGA PAA-PILLA
|
||||
|
||||
contents.add(0x0E31);// THAI # (0xMn) THAI CHARACTER MAI HAN-AKAT
|
||||
contents.add(0x0E34, 0x0E3A);// THAI # (0xMn) THAI CHARACTER SARA I, 0xTHAI CHARACTER PHINTHU
|
||||
contents.add(0x0E47, 0x0E4E);// THAI # (0xMn) THAI CHARACTER MAITAIKHU, 0xTHAI CHARACTER YAMAKKAN
|
||||
|
||||
contents.add(0x0EB1);// LAO # (0xMn) LAO VOWEL SIGN MAI KAN
|
||||
contents.add(0x0EB4, 0x0EB9);// LAO # (0xMn) LAO VOWEL SIGN I, 0xLAO VOWEL SIGN UU
|
||||
contents.add(0x0EBB, 0x0EBC);// LAO # (0xMn) LAO VOWEL SIGN MAI KON, 0xLAO SEMIVOWEL SIGN LO
|
||||
contents.add(0x0EC8, 0x0ECD);// LAO # (0xMn) LAO TONE MAI EK, 0xLAO NIGGAHITA
|
||||
|
||||
contents.add(0x0F18, 0x0F19);// TIBETAN # (0xMn) TIBETAN ASTROLOGICAL SIGN -KHYUD PA, 0xTIBETAN ASTROLOGICAL SIGN SDONG TSHUGS
|
||||
contents.add(0x0F35);// TIBETAN # (0xMn) TIBETAN MARK NGAS BZUNG NYI ZLA
|
||||
contents.add(0x0F37);// TIBETAN # (0xMn) TIBETAN MARK NGAS BZUNG SGOR RTAGS
|
||||
contents.add(0x0F39);// TIBETAN # (0xMn) TIBETAN MARK TSA -PHRU
|
||||
contents.add(0x0F71, 0x0F7E);// TIBETAN # (0xMn) TIBETAN VOWEL SIGN AA, 0xTIBETAN SIGN RJES SU NGA RO
|
||||
contents.add(0x0F80, 0x0F84);// TIBETAN # (0xMn) TIBETAN VOWEL SIGN REVERSED I, 0xTIBETAN MARK HALANTA
|
||||
contents.add(0x0F86, 0x0F87);// TIBETAN # (0xMn) TIBETAN SIGN LCI RTAGS, 0xTIBETAN SIGN YANG RTAGS
|
||||
contents.add(0x0F90, 0x0F97);// TIBETAN # (0xMn) TIBETAN SUBJOINED LETTER KA, 0xTIBETAN SUBJOINED LETTER JA
|
||||
contents.add(0x0F99, 0x0FBC);// TIBETAN # (0xMn) TIBETAN SUBJOINED LETTER NYA, 0xTIBETAN SUBJOINED LETTER FIXED-FORM RA
|
||||
contents.add(0x0FC6);// TIBETAN # (0xMn) TIBETAN SYMBOL PADMA GDAN
|
||||
|
||||
contents.add(0x102D, 0x1030);// MYANMAR # (0xMn) MYANMAR VOWEL SIGN I, 0xMYANMAR VOWEL SIGN UU
|
||||
contents.add(0x1032);// MYANMAR # (0xMn) MYANMAR VOWEL SIGN AI
|
||||
contents.add(0x1036, 0x1037);// MYANMAR # (0xMn) MYANMAR SIGN ANUSVARA, 0xMYANMAR SIGN DOT BELOW
|
||||
contents.add(0x1039);// MYANMAR # (0xMn) MYANMAR SIGN VIRAMA
|
||||
contents.add(0x1058, 0x1059);// MYANMAR # (0xMn) MYANMAR VOWEL SIGN VOCALIC L, 0xMYANMAR VOWEL SIGN VOCALIC LL
|
||||
|
||||
contents.add(0x17B7, 0x17BD);// KHMER # (0xMn) KHMER VOWEL SIGN I, 0xKHMER VOWEL SIGN UA
|
||||
contents.add(0x17C6);// KHMER # (0xMn) KHMER SIGN NIKAHIT
|
||||
contents.add(0x17C9, 0x17D3);// KHMER # (0xMn) KHMER SIGN MUUSIKATOAN, 0xKHMER SIGN BATHAMASAT
|
||||
|
||||
contents.add(0x18A9);// MONGOLIAN # (0xMn) MONGOLIAN LETTER ALI GALI DAGALGA
|
||||
|
||||
contents.add(0x1712, 0x1713);// TAGALOG # (0xMn) TAGALOG VOWEL SIGN I, 0xTAGALOG VOWEL SIGN U
|
||||
|
||||
contents.add(0x1732, 0x1733);// HANUNOO # (0xMn) HANUNOO VOWEL SIGN I, 0xHANUNOO VOWEL SIGN U
|
||||
|
||||
contents.add(0x1752, 0x1753);// BUHID # (0xMn) BUHID VOWEL SIGN I, 0xBUHID VOWEL SIGN U
|
||||
|
||||
contents.add(0x1772, 0x1773);// TAGBANWA # (0xMn) TAGBANWA VOWEL SIGN I, 0xTAGBANWA VOWEL SIGN U
|
||||
|
||||
//contents.add(0x1D165, 0x1D166);// COMMON # (0xMc) MUSICAL SYMBOL COMBINING STEM, 0xMUSICAL SYMBOL COMBINING SPRECHGESANG STEM
|
||||
//contents.add(0x1D16D, 0x1D172);// COMMON # (0xMc) MUSICAL SYMBOL COMBINING AUGMENTATION DOT, 0xMUSICAL SYMBOL COMBINING FLAG-5
|
||||
contents.add(0x0966, 0x096F);// DEVANAGARI # (0xNd) DEVANAGARI DIGIT ZERO, 0xDEVANAGARI DIGIT NINE
|
||||
|
||||
contents.add(0x09E6, 0x09EF);// BENGALI # (0xNd) BENGALI DIGIT ZERO, 0xBENGALI DIGIT NINE
|
||||
|
||||
contents.add(0x0A66, 0x0A6F);// GURMUKHI # (0xNd) GURMUKHI DIGIT ZERO, 0xGURMUKHI DIGIT NINE
|
||||
|
||||
contents.add(0x0AE6, 0x0AEF);// GUJARATI # (0xNd) GUJARATI DIGIT ZERO, 0xGUJARATI DIGIT NINE
|
||||
|
||||
contents.add(0x0B66, 0x0B6F);// ORIYA # (0xNd) ORIYA DIGIT ZERO, 0xORIYA DIGIT NINE
|
||||
|
||||
contents.add(0x0BE7, 0x0BEF);// TAMIL # (0xNd) TAMIL DIGIT ONE, 0xTAMIL DIGIT NINE
|
||||
|
||||
contents.add(0x0C66, 0x0C6F);// TELUGU # (0xNd) TELUGU DIGIT ZERO, 0xTELUGU DIGIT NINE
|
||||
|
||||
contents.add(0x0CE6, 0x0CEF);// KANNADA # (0xNd) KANNADA DIGIT ZERO, 0xKANNADA DIGIT NINE
|
||||
|
||||
contents.add(0x0D66, 0x0D6F);// MALAYALAM # (0xNd) MALAYALAM DIGIT ZERO, 0xMALAYALAM DIGIT NINE
|
||||
|
||||
contents.add(0x0E50, 0x0E59);// THAI # (0xNd) THAI DIGIT ZERO, 0xTHAI DIGIT NINE
|
||||
|
||||
contents.add(0x0ED0, 0x0ED9);// LAO # (0xNd) LAO DIGIT ZERO, 0xLAO DIGIT NINE
|
||||
|
||||
contents.add(0x0F20, 0x0F29);// TIBETAN # (0xNd) TIBETAN DIGIT ZERO, 0xTIBETAN DIGIT NINE
|
||||
|
||||
contents.add(0x1040, 0x1049);// MYANMAR # (0xNd) MYANMAR DIGIT ZERO, 0xMYANMAR DIGIT NINE
|
||||
|
||||
contents.add(0x1369, 0x1371);// ETHIOPIC # (0xNd) ETHIOPIC DIGIT ONE, 0xETHIOPIC DIGIT NINE
|
||||
|
||||
contents.add(0x17E0, 0x17E9);// KHMER # (0xNd) KHMER DIGIT ZERO, 0xKHMER DIGIT NINE
|
||||
|
||||
contents.add(0x1810, 0x1819);// MONGOLIAN # (0xNd) MONGOLIAN DIGIT ZERO, 0xMONGOLIAN DIGIT NINE
|
||||
|
||||
contents.add(0x16EE, 0x16F0);// RUNIC # (0xNl) RUNIC ARLAUG SYMBOL, 0xRUNIC BELGTHOR SYMBOL
|
||||
|
||||
contents.add(0x3007);// HAN # (0xNl) IDEOGRAPHIC NUMBER ZERO
|
||||
contents.add(0x3021, 0x3029);// HAN # (0xNl) HANGZHOU NUMERAL ONE, 0xHANGZHOU NUMERAL NINE
|
||||
contents.add(0x3038, 0x303A);// HAN # (0xNl) HANGZHOU NUMERAL TEN, 0xHANGZHOU NUMERAL THIRTY
|
||||
|
||||
contents.add(0x1034A);// GOTHIC # (0xNl) GOTHIC LETTER NINE HUNDRED
|
||||
|
||||
contents.add(0x0BF0, 0x0BF2);// TAMIL # (0xNo) TAMIL NUMBER TEN, 0xTAMIL NUMBER ONE THOUSAND
|
||||
|
||||
contents.add(0x0F2A, 0x0F33);// TIBETAN # (0xNo) TIBETAN DIGIT HALF ONE, 0xTIBETAN DIGIT HALF ZERO
|
||||
|
||||
contents.add(0x1372, 0x137C);// ETHIOPIC # (0xNo) ETHIOPIC NUMBER TEN, 0xETHIOPIC NUMBER TEN THOUSAND
|
||||
|
||||
contents.add(0x2E80, 0x2E99);// HAN # (0xSo) CJK RADICAL REPEAT, 0xCJK RADICAL RAP
|
||||
contents.add(0x2E9B, 0x2EF3);// HAN # (0xSo) CJK RADICAL CHOKE, 0xCJK RADICAL C-SIMPLIFIED TURTLE
|
||||
contents.add(0x2F00, 0x2FD5);// HAN # (0xSo) KANGXI RADICAL ONE, 0xKANGXI RADICAL FLUTE
|
||||
|
||||
contents.add(0xA490, 0xA4A1);// YI # (0xSo) YI RADICAL QOT, 0xYI RADICAL GA
|
||||
contents.add(0xA4A4, 0xA4B3);// YI # (0xSo) YI RADICAL DDUR, 0xYI RADICAL JO
|
||||
contents.add(0xA4B5, 0xA4C0);// YI # (0xSo) YI RADICAL JJY, 0xYI RADICAL SHAT
|
||||
contents.add(0xA4C2, 0xA4C4);// YI # (0xSo) YI RADICAL SHOP, 0xYI RADICAL ZZIET
|
||||
contents.add(0xA4C6);// YI # (0xSo) YI RADICAL KE
|
||||
return contents;
|
||||
}
|
||||
}
|
@ -1,25 +0,0 @@
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import com.ibm.icu.dev.test.util.UnicodeProperty;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class ScriptTimeline {
|
||||
public static void main(String[] args) {
|
||||
String[] versions = { "2.0.0", "2.1.2", "3.0.0", "3.1.0", "3.2.0", "4.0.0", "4.1.0", "5.0.0" };
|
||||
for (int s = 0; s < UScript.CODE_LIMIT; ++s) {
|
||||
String scriptName = UScript.getName(s);
|
||||
UnicodeSet chars = new UnicodeSet().applyPropertyAlias("script", scriptName);
|
||||
if (chars.size() == 0) continue;
|
||||
System.out.print(scriptName);
|
||||
for (int v = 0; v < versions.length; ++v) {
|
||||
UnicodeSet age = new UnicodeSet();
|
||||
age.applyPropertyAlias("age", versions[v]);
|
||||
System.out.print("\t" + new UnicodeSet(chars).retainAll(age).size());
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
}
|
||||
}
|
@ -1,75 +0,0 @@
|
||||
|
||||
# ================================================================================
|
||||
# Conditional mappings
|
||||
# ================================================================================
|
||||
|
||||
# Special case for final form of sigma
|
||||
|
||||
03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
|
||||
|
||||
# Note: the following cases for non-final are already in the UnicodeData file.
|
||||
|
||||
# 03A3; 03C3; 03A3; 03A3; # GREEK CAPITAL LETTER SIGMA
|
||||
# 03C3; 03C3; 03A3; 03A3; # GREEK SMALL LETTER SIGMA
|
||||
# 03C2; 03C2; 03A3; 03A3; # GREEK SMALL LETTER FINAL SIGMA
|
||||
|
||||
# Note: the following cases are not included, since they would case-fold in lowercasing
|
||||
|
||||
# 03C3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK SMALL LETTER SIGMA
|
||||
# 03C2; 03C3; 03A3; 03A3; Not_Final_Sigma; # GREEK SMALL LETTER FINAL SIGMA
|
||||
|
||||
# ================================================================================
|
||||
# Locale-sensitive mappings
|
||||
# ================================================================================
|
||||
|
||||
# Lithuanian
|
||||
|
||||
# Lithuanian retains the dot in a lowercase i when followed by accents.
|
||||
|
||||
# Remove DOT ABOVE after "i" with upper or titlecase
|
||||
|
||||
0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
|
||||
|
||||
# Introduce an explicit dot above when lowercasing capital I's and J's
|
||||
# whenever there are more accents above.
|
||||
# (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
|
||||
|
||||
0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
|
||||
004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
|
||||
012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
|
||||
00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
|
||||
00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
|
||||
0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
|
||||
|
||||
# ================================================================================
|
||||
|
||||
# Turkish and Azeri
|
||||
|
||||
# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
|
||||
# The following rules handle those cases.
|
||||
|
||||
0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
0130; 0069; 0130; 0130; az; # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
|
||||
# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
|
||||
# This matches the behavior of the canonically equivalent I-dot_above
|
||||
|
||||
0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
|
||||
0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
|
||||
|
||||
# When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
|
||||
|
||||
0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
|
||||
0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
|
||||
|
||||
# When uppercasing, i turns into a dotted capital I
|
||||
|
||||
0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
|
||||
0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
|
||||
|
||||
# Note: the following case is already in the UnicodeData file.
|
||||
|
||||
# 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I
|
||||
|
||||
# EOF
|
||||
|
@ -1,46 +0,0 @@
|
||||
#
|
||||
# Special Casing Properties
|
||||
#
|
||||
# This file is a supplement to the UnicodeData file.
|
||||
# It contains additional information about the casing of Unicode characters.
|
||||
# (For compatibility, the UnicodeData.txt file only contains case mappings for
|
||||
# characters where they are 1-1, and does not have locale-specific mappings.)
|
||||
# For more information, see the discussion of Case Mappings in the Unicode Standard.
|
||||
#
|
||||
# All code points not listed in this file that do not have simple case mappings
|
||||
# in UnicodeData.txt map to themselves.
|
||||
# ================================================================================
|
||||
# Format
|
||||
# ================================================================================
|
||||
# The entries in this file are in the following machine-readable format:
|
||||
#
|
||||
# <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)? # <comment>
|
||||
#
|
||||
# <code>, <lower>, <title>, and <upper> provide character values in hex. If there is more
|
||||
# than one character, they are separated by spaces. Other than as used to separate
|
||||
# elements, spaces are to be ignored.
|
||||
#
|
||||
# The <condition_list> is optional. Where present, it consists of one or more locale IDs
|
||||
# or contexts, separated by spaces. In these conditions:
|
||||
# - A condition list overrides the normal behavior if all of the listed conditions are true.
|
||||
# - The context is always the context of the characters in the original string,
|
||||
# NOT in the resulting string.
|
||||
# - Case distinctions in the condition list are not significant.
|
||||
# - Conditions preceded by "Not_" represent the negation of the condition.
|
||||
#
|
||||
# A locale ID is defined by taking any language tag as defined by
|
||||
# RFC 3066 (or its successor), and replacing '-' by '_'.
|
||||
#
|
||||
# A context for a character C is defined by Section 3.13 Default Case
|
||||
# Operations, of The Unicode Standard, Version 5.0.
|
||||
# (This is identical to the context defined by Unicode 4.1.0,
|
||||
# as specified in http://www.unicode.org/versions/Unicode4.1.0/)
|
||||
#
|
||||
# Parsers of this file must be prepared to deal with future additions to this format:
|
||||
# * Additional contexts
|
||||
# * Additional fields
|
||||
# ================================================================================
|
||||
|
||||
# ================================================================================
|
||||
# Unconditional mappings
|
||||
# ================================================================================
|
@ -1,13 +0,0 @@
|
||||
# IMPORTANT-when capitalizing iota-subscript (0345)
|
||||
# It MUST be in normalized form--moved to the end of any sequence of combining marks.
|
||||
# This is because logically it represents a following base character!
|
||||
# E.g. <iota_subscript> (<Mn> | <Mc> | <Me>)+ => (<Mn> | <Mc> | <Me>)+ <iota_subscript>
|
||||
# It should never be the first character in a word, so in titlecasing it can be left as is.
|
||||
|
||||
# The following cases are already in the UnicodeData file, so are only commented here.
|
||||
|
||||
# 0345; 0345; 0345; 0399; # COMBINING GREEK YPOGEGRAMMENI
|
||||
|
||||
# All letters with YPOGEGRAMMENI (iota-subscript) or PROSGEGRAMMENI (iota adscript)
|
||||
# have special uppercases.
|
||||
# Note: characters with PROSGEGRAMMENI are actually titlecase, not uppercase!
|
@ -1,108 +0,0 @@
|
||||
<!doctype HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
<html>
|
||||
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
<meta http-equiv="Content-Language" content="en-us">
|
||||
<meta name="GENERATOR" content="Microsoft FrontPage 5.0">
|
||||
<meta name="ProgId" content="FrontPage.Editor.Document">
|
||||
<meta name="keywords" content="unicode, variant glyphs">
|
||||
<meta name="description" content="Describes and displays standardized variant glyphs">
|
||||
<title>Standardized Variants</title>
|
||||
<link rel="stylesheet" type="text/css" href="http://www.unicode.org/reports/reports.css">
|
||||
</head>
|
||||
|
||||
<body bgcolor="#ffffff">
|
||||
|
||||
<table class="header">
|
||||
<tr>
|
||||
<td class="icon"><a href="http://www.unicode.org">
|
||||
<img align="middle" alt="[Unicode]" border="0" src="http://www.unicode.org/webscripts/logo60s2.gif" width="34" height="33"></a> <a class="bar" href="http://www.unicode.org/ucd">Unicode
|
||||
Character Database</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="gray"> </td>
|
||||
</tr>
|
||||
</table>
|
||||
<blockquote>
|
||||
<h1>Standardized Variants</h1>
|
||||
<table class="wide">
|
||||
<tr>
|
||||
<td valign="top" width="144">Revision</td>
|
||||
<td valign="top">@revision@</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">Authors</td>
|
||||
<td valign="top">Members of the Editorial Committee</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">Date</td>
|
||||
<td valign="top">@date@</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">This Version</td>
|
||||
<td valign="top"><a href="http://www.unicode.org/Public/@updateDirectory@/@filename@.html">
|
||||
http://www.unicode.org/Public/@updateDirectory@/@filename@.html</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">Previous Version</td>
|
||||
<td valign="top"><a href="http://www.unicode.org/Public/4.1.0/ucd/StandardizedVariants.html">
|
||||
http://www.unicode.org/Public/4.1.0/ucd/StandardizedVariants.html</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">Latest Version</td>
|
||||
<td valign="top"><a href="http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html">
|
||||
http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html</a></td>
|
||||
</tr>
|
||||
</table>
|
||||
<h3><br>
|
||||
<i>Summary</i></h3>
|
||||
<blockquote>
|
||||
<p>This file provides a visual display of the standard variant sequences derived from
|
||||
StandardizedVariants.txt.</p>
|
||||
</blockquote>
|
||||
<h3><i>Status</i></h3>
|
||||
<blockquote>
|
||||
<p><i>This file and the files described herein are part of the Unicode Character Database and
|
||||
are governed by the terms of use at <a href="http://www.unicode.org/terms_of_use.html">
|
||||
http://www.unicode.org/terms_of_use.html</a>.</i></p>
|
||||
</blockquote>
|
||||
<hr width="50%">
|
||||
<h2>Introduction</h2>
|
||||
<p>The tables here <i>exhaustively</i> list the valid, registered combinations of base character
|
||||
plus variation indicator. All combinations not listed in StandardizedVariants.txt are unspecified
|
||||
and are reserved for future standardization; no conformant process may interpret them as
|
||||
standardized variants. Variation selectors and their use are described in The Unicode Standard.</p>
|
||||
<p>These mathematical variants are all produced with the addition of Variation Selector 1 (VS1 or
|
||||
U+FE00) to mathematical operator base characters. There is no variation according to context. The
|
||||
Mongolian variants use the Mongolian Variant Selectors, and may vary according to context. That
|
||||
is, if a contextual shape is not listed below, then the variation sequence has an unmodified
|
||||
appearance. At this time no Han variants exist.</p>
|
||||
<blockquote>
|
||||
<p><a name="fonts"><b>Note: </b></a>The glyphs used to show the variations are often derived
|
||||
from different physical fonts than the representative glyphs in the standard. They may therefore
|
||||
exhibit minor differences in size, proportion, or weight <i>unrelated</i> to the intentional
|
||||
difference in feature that is the defining element of the variation. Such minor differences
|
||||
should be ignored. Likewise, in some cases the existing representative fonts may not yet contain
|
||||
newly encoded characters and hence some representative glyphs shown in these tables may have a
|
||||
slightly different style than others.</p>
|
||||
</blockquote>
|
||||
<p>@table@</p>
|
||||
<hr width="50%">
|
||||
<div align="center">
|
||||
<center>
|
||||
<table cellspacing="0" cellpadding="0" border="0">
|
||||
<tr>
|
||||
<td><a href="http://www.unicode.org/unicode/copyright.html">
|
||||
<img src="http://www.unicode.org/img/hb_notice.gif" border="0" alt="Access to Copyright and terms of use" width="216" height="50"></a></td>
|
||||
</tr>
|
||||
</table>
|
||||
<script language="Javascript" type="text/javascript" src="http://www.unicode.org/webscripts/lastModified.js">
|
||||
</script>
|
||||
</center>
|
||||
</div>
|
||||
</blockquote>
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
@ -1,566 +0,0 @@
|
||||
package com.ibm.text.UCD;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.text.utility.*;
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
// Enumerated properties will be IntCodePointProperty.
|
||||
// The string values they return will be the property value names.
|
||||
// Binary properties are Enumerated properties. They return 0 or 1
|
||||
|
||||
public final class TernaryStore {
|
||||
|
||||
static final int DONE = Integer.MIN_VALUE;
|
||||
static final int NOT_FOUND = Integer.MIN_VALUE+1;
|
||||
|
||||
// for testing
|
||||
static DepthPrinter dp;
|
||||
|
||||
static void test() throws java.io.IOException {
|
||||
|
||||
|
||||
PrintWriter pw = Utility.openPrintWriter("TestTernary.txt", Utility.LATIN1_WINDOWS);
|
||||
try {
|
||||
dp = new DepthPrinter(pw);
|
||||
|
||||
String[] tests = {"the", "quick", "fish", "fisherman", "fishes",
|
||||
"brown", "brow", "bracket", "bright", "brat",
|
||||
"brough", "dogs", "upper", "zebra",
|
||||
"fisher"};
|
||||
test("Simple: ", tests, tests.length);
|
||||
|
||||
|
||||
tests = new String[300000];
|
||||
int counter = 0;
|
||||
int i;
|
||||
for (i = 0; counter < tests.length && i <= 0x10FFFF; ++i) {
|
||||
if (Default.ucd().hasComputableName(i)) continue;
|
||||
|
||||
String temp = UCharacter.getName(i);
|
||||
if (temp != null) tests[counter++] = temp.trim();
|
||||
}
|
||||
System.out.println("max-cp: " + Utility.hex(i));
|
||||
test("Unicode Names: ", tests, counter);
|
||||
|
||||
//if (true) return;
|
||||
|
||||
BufferedReader br = Utility.openReadFile(UCD_Types.BASE_DIR + "dict\\DiploFreq.txt", Utility.LATIN1);
|
||||
String line;
|
||||
counter = 0;
|
||||
while (counter < tests.length) {
|
||||
line = Utility.readDataLine(br);
|
||||
if (line == null) break;
|
||||
if (line.length() == 0) continue;
|
||||
Utility.dot(counter);
|
||||
int tabPos = line.indexOf('\t');
|
||||
if (tabPos < 0) {
|
||||
System.out.println("???" + line);
|
||||
continue;
|
||||
}
|
||||
tests[counter++] = line.substring(tabPos+1);
|
||||
}
|
||||
test("French: ", tests, counter);
|
||||
} finally {
|
||||
pw.close();
|
||||
}
|
||||
}
|
||||
|
||||
static void test(String title, String[] tests, int len) {
|
||||
System.out.println();
|
||||
System.out.println(title);
|
||||
dp.println();
|
||||
dp.print(title, 0);
|
||||
dp.println();
|
||||
TernaryStore.Builder builder = new TernaryStore.Builder();
|
||||
int charCount = 0;
|
||||
for (int i = 0; i < len; ++i) {
|
||||
builder.add(tests[i], i);
|
||||
charCount += tests[i].length();
|
||||
}
|
||||
System.out.println("charCount: " + charCount);
|
||||
TernaryStore store = builder.build();
|
||||
store.showNodes();
|
||||
store.checkNodes();
|
||||
|
||||
dp.println("Storage");
|
||||
dp.println(store.stringStore.toString());
|
||||
System.out.println("StorageSize: " + store.stringStore.toString().length());
|
||||
|
||||
Matcher matcher = store.getMatcher();
|
||||
for (int i = 0; i < len; ++i) {
|
||||
int check = test(tests[i], matcher);
|
||||
if (check != i) {
|
||||
System.out.println("\tFail, result: " + tests[i] + ", " + check);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int test(String s, Matcher matcher) {
|
||||
matcher.reset(s, 0);
|
||||
int lastResult = -1;
|
||||
for (int result = matcher.next(); result != DONE; result = matcher.next()) {
|
||||
lastResult = result;
|
||||
}
|
||||
return lastResult;
|
||||
}
|
||||
|
||||
static final class Node {
|
||||
String getString(StringStore stringStore) {
|
||||
if (stringCode < 0) return tempString;
|
||||
return stringStore.get(stringCode);
|
||||
}
|
||||
void setString(String s) {
|
||||
tempString = s;
|
||||
}
|
||||
String tempString;
|
||||
int stringCode = -1;
|
||||
Node less;
|
||||
Node greater;
|
||||
Node next;
|
||||
int result = NOT_FOUND;
|
||||
|
||||
public String toString(StringStore store) {
|
||||
return getString(store)
|
||||
+ (result != NOT_FOUND ? "(" + result + ")" : "")
|
||||
+ (next != null ? next.toString() : "");
|
||||
}
|
||||
}
|
||||
|
||||
Node base;
|
||||
StringStore stringStore = new StringStore();
|
||||
|
||||
final static class Matcher {
|
||||
TernaryStore store;
|
||||
String s;
|
||||
int position;
|
||||
Node lastNode;
|
||||
|
||||
void reset(String s, int position) {
|
||||
this.s = s;
|
||||
this.position = position;
|
||||
this.lastNode = store.base;
|
||||
}
|
||||
|
||||
// returns the next result
|
||||
// or DONE when done
|
||||
// sets position to point after end of found string
|
||||
|
||||
int next() {
|
||||
while (lastNode != null && position < s.length()) {
|
||||
char ch = s.charAt(position++);
|
||||
do {
|
||||
String nodeString = lastNode.getString(store.stringStore);
|
||||
char first = nodeString.charAt(0);
|
||||
if (ch == first) {
|
||||
// now check the rest of the string
|
||||
for (int i = 1; i < nodeString.length(); ++i) {
|
||||
char other = nodeString.charAt(i);
|
||||
if (other != s.charAt(position++)) {
|
||||
return DONE;
|
||||
}
|
||||
}
|
||||
|
||||
// if we succeed, return result if there is one
|
||||
int result = lastNode.result;
|
||||
lastNode = lastNode.next;
|
||||
if (result != NOT_FOUND) return result;
|
||||
break; // get next char
|
||||
}
|
||||
// otherwise branch sideways, keeping same char
|
||||
if (ch > first) {
|
||||
lastNode = lastNode.greater;
|
||||
} else {
|
||||
lastNode = lastNode.less;
|
||||
}
|
||||
} while (lastNode != null);
|
||||
}
|
||||
return DONE;
|
||||
}
|
||||
}
|
||||
|
||||
public Matcher getMatcher() {
|
||||
Matcher result = new Matcher();
|
||||
result.store = this;
|
||||
return result;
|
||||
}
|
||||
|
||||
public void showNodes() {
|
||||
showNodes2(base, "", 5);
|
||||
}
|
||||
|
||||
public void showNodes2(Node n, String path, int depth) {
|
||||
if (n.less != null) {
|
||||
showNodes2(n.less, path+"-", depth);
|
||||
}
|
||||
dp.print("", depth);
|
||||
if (false) dp.print(path);
|
||||
dp.print(n.getString(stringStore));
|
||||
if (n.result != NOT_FOUND) dp.print("/" + n.result);
|
||||
dp.println();
|
||||
if (n.next != null) {
|
||||
showNodes2(n.next, path+".", depth+n.getString(stringStore).length());
|
||||
}
|
||||
if (n.greater != null) {
|
||||
showNodes2(n.greater, path+"+", depth);
|
||||
}
|
||||
}
|
||||
|
||||
static class NodeInfo {
|
||||
int nodeCount;
|
||||
int resultCount;
|
||||
int nullLessCount;
|
||||
int nullGreaterCount;
|
||||
int nullSimpleCount;
|
||||
int nullNextCount;
|
||||
}
|
||||
|
||||
public void checkNodes() {
|
||||
NodeInfo nodeInfo = new NodeInfo();
|
||||
checkNodes(base, nodeInfo);
|
||||
System.out.println("Nodes: " + nodeInfo.nodeCount);
|
||||
System.out.println("nullLessCount: " + nodeInfo.nullLessCount);
|
||||
System.out.println("nullGreaterCount: " + nodeInfo.nullGreaterCount);
|
||||
System.out.println("nullNextCount: " + nodeInfo.nullNextCount);
|
||||
System.out.println("resultCount: " + nodeInfo.resultCount);
|
||||
System.out.println("nullSimpleCount: " + nodeInfo.nullSimpleCount);
|
||||
}
|
||||
|
||||
public void checkNodes(Node n, NodeInfo nodeInfo) {
|
||||
nodeInfo.nodeCount++;
|
||||
if (n.result != NOT_FOUND) nodeInfo.resultCount++;
|
||||
if (n.less != null) {
|
||||
checkNodes(n.less, nodeInfo);
|
||||
} else {
|
||||
nodeInfo.nullLessCount++;
|
||||
if (n.greater == null && n.result == NOT_FOUND) nodeInfo.nullSimpleCount++;
|
||||
}
|
||||
if (n.next != null) {
|
||||
checkNodes(n.next, nodeInfo);
|
||||
} else {
|
||||
nodeInfo.nullNextCount++;
|
||||
}
|
||||
if (n.greater != null) {
|
||||
checkNodes(n.greater, nodeInfo);
|
||||
} else {
|
||||
nodeInfo.nullGreaterCount++;
|
||||
}
|
||||
}
|
||||
|
||||
final static class DepthPrinter {
|
||||
private PrintWriter pw;
|
||||
private int currentDepth = 0;
|
||||
private String leader = ".";
|
||||
|
||||
DepthPrinter(PrintWriter pw) {
|
||||
this.pw = pw;
|
||||
}
|
||||
|
||||
void print(char ch) {
|
||||
print(ch, 0);
|
||||
}
|
||||
|
||||
void print(String s) {
|
||||
print(s, 0);
|
||||
}
|
||||
|
||||
void print(char ch, int depth) {
|
||||
print(String.valueOf(ch), depth);
|
||||
}
|
||||
|
||||
void print(String s, int depth) {
|
||||
int delta = depth - currentDepth;
|
||||
if (delta > 0) {
|
||||
pw.print(Utility.repeat(leader, delta - 1));
|
||||
currentDepth = depth;
|
||||
}
|
||||
pw.print(s);
|
||||
currentDepth += s.length();
|
||||
}
|
||||
|
||||
void println() {
|
||||
pw.println();
|
||||
currentDepth = 0;
|
||||
}
|
||||
|
||||
void println(String s) {
|
||||
pw.print(s);
|
||||
pw.println();
|
||||
currentDepth = 0;
|
||||
}
|
||||
}
|
||||
|
||||
final static class StringStore {
|
||||
// initially, there is a simple strategy
|
||||
|
||||
private String buffer = "";
|
||||
private static final char TERMINATOR = '\u007E';
|
||||
private static final int PIECE_LENGTH = 5;
|
||||
private static String[] pieces = new String[50]; // HACK
|
||||
private static Set strings = new HashSet();
|
||||
|
||||
public void add(String s) {
|
||||
strings.add(s);
|
||||
}
|
||||
|
||||
public void compact() {
|
||||
System.out.println("Adding Pieces");
|
||||
// add all the pieces
|
||||
Iterator it = strings.iterator();
|
||||
Set additions = new HashSet();
|
||||
while (it.hasNext()) {
|
||||
String s = (String)it.next();
|
||||
int len = Utility.split(s, ' ', pieces);
|
||||
for (int i = 0; i < len; ++i) {
|
||||
additions.add(pieces[i]);
|
||||
}
|
||||
}
|
||||
|
||||
store(additions);
|
||||
store(strings);
|
||||
}
|
||||
|
||||
private void store(Set stuff) {
|
||||
System.out.println("Sorting");
|
||||
// sort them by length, longest first
|
||||
Set ordered = new TreeSet();
|
||||
Iterator it = stuff.iterator();
|
||||
while (it.hasNext()) {
|
||||
String s = (String)it.next();
|
||||
ordered.add(new Pair(new Integer(-s.length()), s));
|
||||
}
|
||||
System.out.println("Storing");
|
||||
// add them
|
||||
it = ordered.iterator();
|
||||
while (it.hasNext()) {
|
||||
String s = (String)(((Pair)it.next()).second);
|
||||
get(s);
|
||||
}
|
||||
}
|
||||
|
||||
private int get(String s) {
|
||||
System.out.println("Adding: \'" + s + "\'");
|
||||
int index;
|
||||
if (s.indexOf(' ') < 0) {
|
||||
index = addNoSplit(s);
|
||||
System.out.println("\tReturning: " + index);
|
||||
return index;
|
||||
}
|
||||
int len = Utility.split(s, ' ', pieces);
|
||||
StringBuffer itemCodes = new StringBuffer();
|
||||
for (int i = 0; i < len; ++i) {
|
||||
String piece = pieces[i];
|
||||
itemCodes.append((char)addNoSplit(piece));
|
||||
/*for (int j = 0; j < piece.length(); j += PIECE_LENGTH) {
|
||||
int maxLen = j + PIECE_LENGTH;
|
||||
if (maxLen > piece.length()) maxLen = piece.length();
|
||||
itemCodes.append((char)addNoSplit(piece.substring(j, maxLen)));
|
||||
}*/
|
||||
}
|
||||
index = 0x8000 | addNoSplit(itemCodes.toString()); // mark it as composite
|
||||
System.out.println("\tReturning: " + index);
|
||||
return index;
|
||||
}
|
||||
|
||||
private int addNoSplit(String s) {
|
||||
System.out.println("\tAdding2: \'" + s + "\'");
|
||||
String sTerm = s + TERMINATOR;
|
||||
int index = buffer.indexOf(sTerm);
|
||||
if (index >= 0) return index;
|
||||
|
||||
index = buffer.length();
|
||||
buffer += sTerm;
|
||||
System.out.println("\t\tReturning2: " + index);
|
||||
return index;
|
||||
}
|
||||
|
||||
public String get(int index) {
|
||||
String result;
|
||||
System.out.println("Fetching: " + index);
|
||||
|
||||
if ((index & 0x8000) == 0) {
|
||||
int end = buffer.indexOf(TERMINATOR, index);
|
||||
result = buffer.substring(index, end);
|
||||
System.out.println("\tReturning: '" + result + "'");
|
||||
return result;
|
||||
}
|
||||
index &= ~0x8000; // remove 1 bit
|
||||
|
||||
int end = buffer.indexOf(TERMINATOR, index);
|
||||
result = "";
|
||||
for (int i = index; i < end; ++i) {
|
||||
if (result.length() != 0) result += " ";
|
||||
result += get(buffer.charAt(i));
|
||||
}
|
||||
System.out.println("\tReturning: '" + result + "'");
|
||||
return result;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return buffer;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
final static class Builder {
|
||||
Map map = new TreeMap();
|
||||
String[] names;
|
||||
TernaryStore store;
|
||||
Set set = new TreeSet();
|
||||
|
||||
public void add(String name, int result) {
|
||||
map.put(name, new Integer(result));
|
||||
}
|
||||
|
||||
public TernaryStore build() {
|
||||
// flatten strings into array
|
||||
names = new String[map.size()];
|
||||
Iterator it = map.keySet().iterator();
|
||||
int count = 0;
|
||||
while (it.hasNext()) {
|
||||
names[count++] = (String) it.next();
|
||||
if (false) {
|
||||
dp.print((count-1) + " " + names[count-1]);
|
||||
dp.println();
|
||||
}
|
||||
}
|
||||
|
||||
// build nodes
|
||||
store = new TernaryStore();
|
||||
addNode(0, names.length);
|
||||
|
||||
// free storage
|
||||
names = null;
|
||||
map.clear();
|
||||
|
||||
System.out.println("compacting");
|
||||
compactStore(store.base);
|
||||
store.stringStore.compact();
|
||||
|
||||
//compactStrings(store);
|
||||
//set.clear(); // free more storage
|
||||
|
||||
replaceStrings(store.base);
|
||||
//map.clear(); // free storage
|
||||
|
||||
// free storage
|
||||
TernaryStore result = store;
|
||||
store = null;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
void compactStrings(TernaryStore t) {
|
||||
// we have a set of Pairs, first is length, second is string
|
||||
// compact them, word by word
|
||||
Iterator it = set.iterator();
|
||||
while (it.hasNext()) {
|
||||
String string = ((String)((Pair)it.next()).second);
|
||||
int index = t.stringStore.add(string);
|
||||
if (true) {
|
||||
System.out.println("Checking: " + index);
|
||||
String reverse = t.stringStore.get(index);
|
||||
if (!reverse.equals(string)) {
|
||||
System.out.println("source: \'" + string + "\'");
|
||||
System.out.println("reverse: \'" + reverse + "\'");
|
||||
throw new IllegalArgumentException("Failed roundtrip");
|
||||
}
|
||||
}
|
||||
|
||||
map.put(string, new Integer(index));
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
public void replaceStrings(Node n) {
|
||||
n.stringCode = store.stringStore.get(n.getString(store.stringStore));
|
||||
n.setString(null);
|
||||
if (n.less != null) replaceStrings(n.less);
|
||||
if (n.next != null) replaceStrings(n.next);
|
||||
if (n.greater != null) replaceStrings(n.greater);
|
||||
}
|
||||
|
||||
public void compactStore(Node n) {
|
||||
Node nextNode = n.next;
|
||||
if (false) dp.println(n.toString());
|
||||
while (n.result == NOT_FOUND && nextNode != null && nextNode.greater == null
|
||||
&& nextNode.less == null) {
|
||||
n.setString(n.getString(store.stringStore) + nextNode.getString(store.stringStore));
|
||||
n.result = nextNode.result;
|
||||
n.next = nextNode = nextNode.next; // remove old node
|
||||
}
|
||||
// add strings sorted by length, longest first
|
||||
store.stringStore.add(n.getString(store.stringStore));
|
||||
|
||||
if (n.less != null) compactStore(n.less);
|
||||
if (n.next != null) compactStore(n.next);
|
||||
if (n.greater != null) compactStore(n.greater);
|
||||
}
|
||||
|
||||
private void addNode(int start, int limit) {
|
||||
if (start >= limit) return;
|
||||
int mid = (start + limit) / 2;
|
||||
//System.out.println("start: " + start + ", mid: " + mid + ", limit: " + limit);
|
||||
//System.out.println("adding: " + names[mid]);
|
||||
addNode(names[mid], ((Integer)map.get(names[mid])).intValue());
|
||||
addNode(start, mid);
|
||||
addNode(mid+1, limit);
|
||||
}
|
||||
|
||||
private void addNode(String s, int result) {
|
||||
if (store.base == null) {
|
||||
store.base = addRest(s, 0, result);
|
||||
return;
|
||||
}
|
||||
Node n = store.base;
|
||||
Node lastNode = n;
|
||||
|
||||
for (int i = 0; i < s.length(); ++i) {
|
||||
char ch = s.charAt(i);
|
||||
while (true) {
|
||||
char first = n.getString(store.stringStore).charAt(0);
|
||||
if (ch == first) {
|
||||
if (n.next == null) {
|
||||
n.next = addRest(s, i+1, result);
|
||||
return;
|
||||
}
|
||||
lastNode = n;
|
||||
n = n.next;
|
||||
break; // get next char
|
||||
}
|
||||
// otherwise branch sideways, keeping same char
|
||||
if (ch > first) {
|
||||
if (n.greater == null) {
|
||||
n.greater = addRest(s, i, result);
|
||||
return;
|
||||
}
|
||||
n = n.greater;
|
||||
} else {
|
||||
if (n.less == null) {
|
||||
n.less = addRest(s, i, result);
|
||||
return;
|
||||
}
|
||||
n = n.less;
|
||||
}
|
||||
}
|
||||
}
|
||||
lastNode.result = result;
|
||||
}
|
||||
|
||||
private Node addRest(String s, int position, int result) {
|
||||
Node lastNode = null;
|
||||
for (int i = s.length() - 1; i >= position; --i) {
|
||||
Node n = new Node();
|
||||
n.setString(s.substring(i, i+1)); // + "" to force a new string
|
||||
if (lastNode == null) {
|
||||
n.result = result;
|
||||
}
|
||||
n.next = lastNode;
|
||||
lastNode = n;
|
||||
}
|
||||
return lastNode;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,378 +0,0 @@
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.BitSet;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
import com.ibm.icu.dev.test.util.XEquivalenceClass;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.Normalizer;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
import com.ibm.text.utility.Utility;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
|
||||
public class TestIdentifiers {
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
String[] tests = { "SØS", "façade", "MOPE", "VOP", "scope", "ibm", "vop",
|
||||
"toys-я-us", "1iνе", "back", "boгing" };
|
||||
|
||||
TestIdentifiers ti = new TestIdentifiers("L");
|
||||
TestIdentifiers tiany = new TestIdentifiers("A");
|
||||
ti.loadIdentifiers();
|
||||
UnicodeSet idnCharSet = ti.idnChars.getSet("output", new UnicodeSet());
|
||||
System.out.println("idnCharSet: " + idnCharSet.size());
|
||||
UnicodeSet idnCharNonStarting = ti.nonstarting;
|
||||
System.out.println("idnCharNonStarting: " + idnCharSet);
|
||||
if (true) return;
|
||||
|
||||
for (int i = 0; i < tests.length; ++i) {
|
||||
System.out.print(tests[i]);
|
||||
String folded = UCharacter.foldCase(tests[i], true);
|
||||
if (folded.equals(tests[i])) {
|
||||
ti.testItem(tests[i]);
|
||||
} else {
|
||||
System.out.print("\t");
|
||||
tiany.testItem(tests[i]);
|
||||
System.out.print(folded);
|
||||
ti.testItem(folded);
|
||||
}
|
||||
for (int j = 0; j < tests[i].length(); ++j) {
|
||||
int cp = tests[i].charAt(j);
|
||||
Set s = ti.getConfusables(cp, "MA");
|
||||
System.out.println(Default.ucd().getCodeAndName(cp));
|
||||
for (Iterator it = s.iterator(); it.hasNext();) {
|
||||
System.out.println("\t= " + Default.ucd().getCodeAndName((String)it.next()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void testItem(String test) {
|
||||
test = Normalizer.normalize(test, Normalizer.DECOMP_COMPAT);
|
||||
BitSet scripts = new BitSet();
|
||||
System.out.print("\t" + caseType + "\t");
|
||||
boolean foundProblem = false;
|
||||
if (hasWholeScriptConfusable(test, scripts)) {
|
||||
System.out.print("whole-script confusables: ");
|
||||
for (int j = 0; j < scripts.length(); ++j) {
|
||||
if (scripts.get(j))
|
||||
System.out.print(UScript.getName(j) + " ");
|
||||
}
|
||||
System.out.println();
|
||||
foundProblem = true;
|
||||
}
|
||||
if (hasMixedScriptConfusable(test)) {
|
||||
System.out.println("mixed-script confusable");
|
||||
foundProblem = true;
|
||||
}
|
||||
if (!foundProblem) {
|
||||
System.out.println("no confusable");
|
||||
}
|
||||
}
|
||||
|
||||
private static final String indir = "C:\\Unicode-CVS2\\draft\\reports\\tr36\\data\\";
|
||||
|
||||
private static UnicodeSet commonAndInherited = new UnicodeSet(
|
||||
"[[:script=common:][:script=inherited:]]");
|
||||
|
||||
private static UnicodeSet XIDContinueSet = new UnicodeSet("[:xidcontinue:]")
|
||||
.add('-');
|
||||
|
||||
private static final boolean DEBUG = false;
|
||||
private String caseType;
|
||||
|
||||
TestIdentifiers(String caseType) throws IOException {
|
||||
this.caseType = caseType;
|
||||
loadWholeScriptConfusables(caseType);
|
||||
}
|
||||
|
||||
private static class UnicodeSetToScript {
|
||||
public int getScript() {
|
||||
return script;
|
||||
}
|
||||
|
||||
public UnicodeSetToScript setScript(int script) {
|
||||
this.script = script;
|
||||
return this;
|
||||
}
|
||||
|
||||
public UnicodeSet getSet() {
|
||||
return set;
|
||||
}
|
||||
|
||||
public UnicodeSetToScript setSet(UnicodeSet set) {
|
||||
this.set = set;
|
||||
return this;
|
||||
}
|
||||
|
||||
private UnicodeSet set;
|
||||
|
||||
private int script;
|
||||
}
|
||||
|
||||
UnicodeSetToScript[][] scriptToUnicodeSetToScript = new UnicodeSetToScript[UScript.CODE_LIMIT][];
|
||||
UnicodeSet[] fastReject = new UnicodeSet[UScript.CODE_LIMIT];
|
||||
|
||||
UnicodeMap idnChars = new UnicodeMap();
|
||||
UnicodeSet nonstarting = new UnicodeSet();
|
||||
|
||||
void loadIdentifiers() throws IOException {
|
||||
BufferedReader br = BagFormatter.openUTF8Reader(indir,
|
||||
"idnchars.txt");
|
||||
String line = null;
|
||||
try {
|
||||
while (true) {
|
||||
line = Utility.readDataLine(br);
|
||||
if (line == null)
|
||||
break;
|
||||
if (line.length() == 0)
|
||||
continue;
|
||||
String[] pieces = Utility.split(line, ';');
|
||||
// part 0 is range
|
||||
String range = pieces[0].trim();
|
||||
int rangeDivider = range.indexOf("..");
|
||||
int start, end;
|
||||
if (rangeDivider < 0) {
|
||||
start = end = Integer.parseInt(range, 16);
|
||||
} else {
|
||||
start = Integer.parseInt(range.substring(0, rangeDivider),
|
||||
16);
|
||||
end = Integer.parseInt(range.substring(rangeDivider + 2),
|
||||
16);
|
||||
}
|
||||
// part 1 is script1
|
||||
String type = pieces[1].trim().intern();
|
||||
if (type.equals("nonstarting")) nonstarting.add(start,end);
|
||||
else idnChars.putAll(start, end, type);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw (RuntimeException) new RuntimeException("Failure on line "
|
||||
+ line).initCause(e);
|
||||
}
|
||||
br.close();
|
||||
}
|
||||
|
||||
Map type_equivalences;
|
||||
|
||||
void loadConfusables() throws IOException {
|
||||
BufferedReader br = BagFormatter.openUTF8Reader(indir,
|
||||
"confusables.txt");
|
||||
String line = null;
|
||||
type_equivalences = new HashMap();
|
||||
try {
|
||||
while (true) {
|
||||
line = Utility.readDataLine(br);
|
||||
if (line == null)
|
||||
break;
|
||||
if (line.length() == 0)
|
||||
continue;
|
||||
String[] pieces = Utility.split(line, ';');
|
||||
// part 0 is source code point
|
||||
String s = Utility.fromHex(pieces[0].trim());
|
||||
// part 1 is script1
|
||||
String t = Utility.fromHex(pieces[1].trim());
|
||||
|
||||
String type = pieces[2].trim();
|
||||
XEquivalenceClass ec = (XEquivalenceClass) type_equivalences.get(type);
|
||||
if (ec == null) type_equivalences.put(type, ec = new XEquivalenceClass(""));
|
||||
ec.add(s, t);
|
||||
//System.out.println(type + ": " + Default.ucd().getCodeAndName(s) + " => " + Default.ucd().getCodeAndName(t));
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw (RuntimeException) new RuntimeException("Failure on line "
|
||||
+ line).initCause(e);
|
||||
}
|
||||
br.close();
|
||||
}
|
||||
|
||||
public Set getConfusables(int cp, String type) {
|
||||
try {
|
||||
if (type_equivalences == null) loadConfusables();
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
XEquivalenceClass ec = (XEquivalenceClass) type_equivalences.get(type);
|
||||
return ec.getEquivalences(UTF16.valueOf(cp));
|
||||
}
|
||||
|
||||
void loadWholeScriptConfusables(String filterType) throws IOException {
|
||||
UnicodeSet[][] script_script_set = new UnicodeSet[UScript.CODE_LIMIT][UScript.CODE_LIMIT];
|
||||
for (int i = 0; i < UScript.CODE_LIMIT; ++i) {
|
||||
script_script_set[i] = new UnicodeSet[UScript.CODE_LIMIT];
|
||||
}
|
||||
BufferedReader br = BagFormatter.openUTF8Reader(indir,
|
||||
"confusablesWholeScript.txt");
|
||||
String line = null;
|
||||
try {
|
||||
while (true) {
|
||||
line = Utility.readDataLine(br);
|
||||
if (line == null)
|
||||
break;
|
||||
if (line.length() == 0)
|
||||
continue;
|
||||
String[] pieces = Utility.split(line, ';');
|
||||
// part 0 is range
|
||||
String range = pieces[0].trim();
|
||||
int rangeDivider = range.indexOf("..");
|
||||
int start, end;
|
||||
if (rangeDivider < 0) {
|
||||
start = end = Integer.parseInt(range, 16);
|
||||
} else {
|
||||
start = Integer.parseInt(range.substring(0, rangeDivider),
|
||||
16);
|
||||
end = Integer.parseInt(range.substring(rangeDivider + 2),
|
||||
16);
|
||||
}
|
||||
// part 1 is script1
|
||||
int script1 = UScript.getCodeFromName(pieces[1].trim());
|
||||
// part 2 is script2
|
||||
int script2 = UScript.getCodeFromName(pieces[2].trim());
|
||||
String type = pieces[3].trim();
|
||||
if (!type.equals(filterType))
|
||||
continue;
|
||||
if (script_script_set[script1][script2] == null) {
|
||||
script_script_set[script1][script2] = new UnicodeSet();
|
||||
}
|
||||
script_script_set[script1][script2].add(start, end);
|
||||
}
|
||||
for (int i = 0; i < script_script_set.length; ++i) {
|
||||
UnicodeSet accept = new UnicodeSet();
|
||||
List curr = new ArrayList();
|
||||
for (int j = 0; j < script_script_set[i].length; ++j) {
|
||||
if (script_script_set[i][j] == null)
|
||||
continue;
|
||||
accept.addAll(script_script_set[i][j]);
|
||||
curr.add(new UnicodeSetToScript().setScript(j).setSet(
|
||||
script_script_set[i][j]));
|
||||
if (DEBUG && i == UScript.LATIN)
|
||||
System.out.println(UScript.getName(i) + "; "
|
||||
+ UScript.getName(j) + "; "
|
||||
+ script_script_set[i][j]);
|
||||
}
|
||||
if (curr.size() == 0)
|
||||
continue;
|
||||
scriptToUnicodeSetToScript[i] = (UnicodeSetToScript[]) curr
|
||||
.toArray(new UnicodeSetToScript[curr.size()]);
|
||||
fastReject[i] = accept.complement();
|
||||
if (DEBUG && i == UScript.LATIN)
|
||||
System.out.println(UScript.getName(i) + "; "
|
||||
+ fastReject[i]);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw (RuntimeException) new RuntimeException("Failure on line "
|
||||
+ line).initCause(e);
|
||||
}
|
||||
br.close();
|
||||
}
|
||||
|
||||
/*
|
||||
* for this routine, we don't care what the targetScripts are, just whether
|
||||
* there is at least one whole-script confusable.
|
||||
*/
|
||||
boolean hasWholeScriptConfusable(String givenString, BitSet resultingScripts) {
|
||||
int givenScript = getSingleScript(givenString);
|
||||
if (givenScript == UScript.INVALID_CODE)
|
||||
return false;
|
||||
UnicodeSet givenSet = new UnicodeSet().addAll(givenString).removeAll(
|
||||
commonAndInherited);
|
||||
return hasWholeScriptConfusable(givenScript, givenSet, resultingScripts);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private boolean hasWholeScriptConfusable(int givenScript,
|
||||
UnicodeSet givenSet, BitSet resultingScripts) {
|
||||
resultingScripts.clear();
|
||||
if (fastReject[givenScript] == null)
|
||||
return false;
|
||||
if (fastReject[givenScript].containsSome(givenSet))
|
||||
return false;
|
||||
UnicodeSetToScript[] possibles = scriptToUnicodeSetToScript[givenScript];
|
||||
for (int i = 0; i < possibles.length; ++i) {
|
||||
if (possibles[i].set.containsAll(givenSet)) {
|
||||
resultingScripts.set(possibles[i].script);
|
||||
}
|
||||
}
|
||||
return !resultingScripts.isEmpty();
|
||||
}
|
||||
|
||||
/*
|
||||
* for this routine, we don't care what the targetScripts are, just
|
||||
* whether there is at least one whole-script confusable.
|
||||
*/
|
||||
boolean hasMixedScriptConfusable(String givenString) {
|
||||
UnicodeSet givenSet = new UnicodeSet().addAll(givenString).removeAll(
|
||||
commonAndInherited);
|
||||
UnicodeSet[] byScript = getScripts(givenSet);
|
||||
BitSet wholeScripts = new BitSet();
|
||||
boolean result = false;
|
||||
main: for (int i = 0; i < byScript.length; ++i) {
|
||||
if (byScript[i] == null)
|
||||
continue;
|
||||
// see if the other characters have whole script confusables in
|
||||
// my script
|
||||
for (int j = 0; j < byScript.length; ++j) {
|
||||
if (j == i || byScript[j] == null)
|
||||
continue;
|
||||
if (!hasWholeScriptConfusable(j, byScript[j], wholeScripts))
|
||||
continue main;
|
||||
if (!wholeScripts.get(i))
|
||||
continue main; // doesn't have the
|
||||
// one we want
|
||||
result = true;
|
||||
}
|
||||
return result; // passed the guantlet
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns UScript.INVALID_CODE if mixed script, otherwise the script
|
||||
*/
|
||||
public static int getSingleScript(String source) {
|
||||
int lastScript = UScript.INVALID_CODE;
|
||||
int cp;
|
||||
for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(source, i);
|
||||
int script = UScript.getScript(cp);
|
||||
if (script == UScript.COMMON || script == UScript.INHERITED) {
|
||||
if (XIDContinueSet.contains(cp)) {
|
||||
if (lastScript == UScript.INVALID_CODE)
|
||||
lastScript = script;
|
||||
continue; // skip if not identifier
|
||||
}
|
||||
script = UScript.COMMON;
|
||||
}
|
||||
if (lastScript == UScript.INVALID_CODE)
|
||||
lastScript = script;
|
||||
else if (script != lastScript)
|
||||
return UScript.INVALID_CODE;
|
||||
}
|
||||
return lastScript;
|
||||
}
|
||||
|
||||
public static UnicodeSet[] getScripts(UnicodeSet sourceSet) {
|
||||
UnicodeSet[] byScript = new UnicodeSet[UScript.CODE_LIMIT];
|
||||
for (UnicodeSetIterator usi = new UnicodeSetIterator(sourceSet); usi
|
||||
.next();) {
|
||||
int script = UScript.getScript(usi.codepoint);
|
||||
if (byScript[script] == null)
|
||||
byScript[script] = new UnicodeSet();
|
||||
byScript[script].add(usi.codepoint);
|
||||
}
|
||||
return byScript;
|
||||
}
|
||||
|
||||
}
|
@ -1,187 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestNameUniqueness.java,v $
|
||||
* $Date: 2004/10/14 17:54:56 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
import java.text.DateFormat;
|
||||
import java.text.SimpleDateFormat;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.UnicodeProperty;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
public class TestNameUniqueness implements UCD_Types {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
checkNameList();
|
||||
// new TestNameUniqueness().checkNames();
|
||||
}
|
||||
|
||||
Map names = new HashMap();
|
||||
int[] charCount = new int[128];
|
||||
int[] samples = new int[128];
|
||||
|
||||
public static class NameIterator {
|
||||
int fileCount = -1;
|
||||
String line;
|
||||
BufferedReader br;
|
||||
String[] pieces = new String[3];
|
||||
/**
|
||||
* @return null when done
|
||||
*/
|
||||
static String[][] files = {
|
||||
{"C:\\DATA\\", "pdam1040630.lst"},
|
||||
{"C:\\DATA\\UCD\\4.1.0-Update\\", "NamedCompositeEntities-4.1.0d2.txt"}
|
||||
};
|
||||
|
||||
public String next() {
|
||||
while (true) {
|
||||
try {
|
||||
if (br != null) line = br.readLine();
|
||||
if (line == null) {
|
||||
fileCount++;
|
||||
br = BagFormatter.openReader(files[fileCount][0], files[fileCount][1], "ISO-8859-1");
|
||||
line = br.readLine();
|
||||
}
|
||||
} catch (IOException e) {}
|
||||
if (line == null) return null;
|
||||
if (line.length() == 0) continue;
|
||||
if (fileCount == 0) {
|
||||
char c = line.charAt(0);
|
||||
// skip if doesn't start with hex digit
|
||||
if (!(('0' <= c && c <= '9') || ('A' <= c && c <= 'F'))) continue;
|
||||
Utility.split(line,'\t',pieces,true);
|
||||
Utility.split(pieces[1],'(',pieces,true);
|
||||
Utility.split(pieces[0],'*',pieces,true);
|
||||
return pieces[0];
|
||||
} else {
|
||||
Utility.split(line,';',pieces,true);
|
||||
return pieces[1];
|
||||
}
|
||||
//throw new IllegalArgumentException("Illegal file type");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static void checkNameList() throws IOException {
|
||||
Map map = new HashMap();
|
||||
NameIterator nameIterator = new NameIterator();
|
||||
int lineCount = 0;
|
||||
while (true) {
|
||||
String name = nameIterator.next();
|
||||
if (name == null) break;
|
||||
String key;
|
||||
try {
|
||||
if (name.startsWith("<")) key = name;
|
||||
else key = UnicodeProperty.toNameSkeleton(name);
|
||||
} catch (RuntimeException e) {
|
||||
System.out.println("Error on " + nameIterator.line);
|
||||
throw e;
|
||||
}
|
||||
Object value = map.get(key);
|
||||
if (value != null && !key.startsWith("<")) {
|
||||
System.out.println("*!*!*!* Collision at " + key + " between: ");
|
||||
System.out.println("\t" + value);
|
||||
System.out.println("\t" + nameIterator.line);
|
||||
//throw new IllegalArgumentException();
|
||||
}
|
||||
map.put(key, nameIterator.line);
|
||||
if (nameIterator.line.startsWith("116C")
|
||||
|| nameIterator.line.startsWith("1180")
|
||||
|| name.indexOf('-') >= 0
|
||||
|| (lineCount++ % 1000) == 0) {
|
||||
System.out.println("[" + lineCount + "]\t" + nameIterator.line + "\t" + name);
|
||||
System.out.println("\t" + name);
|
||||
System.out.println("\t" + key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void checkNames() throws IOException {
|
||||
PrintWriter out = Utility.openPrintWriter("name_uniqueness.txt", Utility.LATIN1_WINDOWS);
|
||||
try {
|
||||
out.println("Collisions");
|
||||
out.println();
|
||||
for (int cp = 0; cp < 0x10FFFF; ++cp) {
|
||||
Utility.dot(cp);
|
||||
if (!Default.ucd().isAllocated(cp)) continue;
|
||||
if (Default.ucd().hasComputableName(cp)) continue;
|
||||
int cat = Default.ucd().getCategory(cp);
|
||||
if (cat == Cc) continue;
|
||||
|
||||
String name = Default.ucd().getName(cp);
|
||||
String processedName = processName(cp, name);
|
||||
Integer existing = (Integer) names.get(processedName);
|
||||
if (existing != null) {
|
||||
out.println("Collision between: "
|
||||
+ Default.ucd().getCodeAndName(existing.intValue())
|
||||
+ ", " + Default.ucd().getCodeAndName(cp));
|
||||
} else {
|
||||
names.put(processedName, new Integer(cp));
|
||||
}
|
||||
}
|
||||
out.println();
|
||||
out.println("Samples");
|
||||
out.println();
|
||||
for (int i = 0; i < charCount.length; ++i) {
|
||||
int count = charCount[i];
|
||||
if (count == 0) continue;
|
||||
String sampleName = Default.ucd().getCodeAndName(samples[i]);
|
||||
out.println(count + "\t'" + ((char)i)
|
||||
+ "'\t" + Default.ucd().getCodeAndName(samples[i])
|
||||
+ "\t=>\t" + processName(samples[i], Default.ucd().getName(samples[i])));
|
||||
}
|
||||
out.println();
|
||||
out.println("Name Samples");
|
||||
out.println();
|
||||
for (int i = 0; i < 256; ++i) {
|
||||
int cat = Default.ucd().getCategory(i);
|
||||
if (cat == Cc) continue;
|
||||
out.println(Default.ucd().getCodeAndName(i)
|
||||
+ "\t=>\t" + processName(i, Default.ucd().getName(i)));
|
||||
}
|
||||
} finally {
|
||||
out.close();
|
||||
}
|
||||
}
|
||||
|
||||
static final String[][] replacements = {
|
||||
//{"SMALL LETTER", ""},
|
||||
{"LETTER", ""},
|
||||
{"CHARACTER", ""},
|
||||
{"DIGIT", ""},
|
||||
{"SIGN", ""},
|
||||
//{"WITH", ""},
|
||||
};
|
||||
|
||||
StringBuffer processNamesBuffer = new StringBuffer();
|
||||
|
||||
String processName(int codePoint, String name) {
|
||||
name = Utility.replace(name, replacements);
|
||||
processNamesBuffer.setLength(0);
|
||||
for (int i = 0; i < name.length(); ++i) {
|
||||
char c = name.charAt(i);
|
||||
++charCount[c];
|
||||
if (samples[c] == 0) samples[c] = codePoint;
|
||||
if ('A' <= c && c <= 'Z'
|
||||
|| '0' <= c && c <= '9') processNamesBuffer.append(c);
|
||||
|
||||
}
|
||||
if (processNamesBuffer.length() == name.length()) return name;
|
||||
return processNamesBuffer.toString();
|
||||
}
|
||||
}
|
||||
|
@ -1,246 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestNormalization.java,v $
|
||||
* $Date: 2004/02/12 08:23:16 $
|
||||
* $Revision: 1.8 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
public final class TestNormalization {
|
||||
static final String DIR = "C:\\Documents and Settings\\Davis\\My Documents\\UnicodeData\\Update 3.0.1\\";
|
||||
static final boolean SKIP_FILE = true;
|
||||
|
||||
static PrintWriter out = null;
|
||||
static BufferedReader in = null;
|
||||
|
||||
static BitSet charsListed = new BitSet(0x110000);
|
||||
static int errorCount = 0;
|
||||
static int lineErrorCount = 0;
|
||||
static String originalLine = "";
|
||||
static String lastLine = "";
|
||||
|
||||
public static void main(String[] args) throws java.io.IOException {
|
||||
System.out.println("Creating Normalizers");
|
||||
|
||||
|
||||
String[] testSet = {"a\u0304\u0328", "a\u0328\u0304"};
|
||||
for (int i = 0; i < testSet.length; ++i) {
|
||||
String s = testSet[i];
|
||||
boolean test = Default.nfc().isFCD(s);
|
||||
System.out.println(test + ": " + Default.ucd().getCodeAndName(s));
|
||||
}
|
||||
|
||||
|
||||
String x = UTF32.valueOf32(0x10000);
|
||||
check("NFC", Default.nfc(), x);
|
||||
check("NFD", Default.nfd(), x);
|
||||
check("NFKC", Default.nfkc(), x);
|
||||
check("NFKD", Default.nfkd(), x);
|
||||
|
||||
|
||||
out = new PrintWriter(
|
||||
new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
new FileOutputStream("NormalizationTestLog.txt"),
|
||||
"UTF8"),
|
||||
32*1024));
|
||||
|
||||
in = new BufferedReader (
|
||||
new FileReader (DIR + "NormalizationTest.txt"),
|
||||
32*1024);
|
||||
|
||||
try {
|
||||
String[] parts = new String[10];
|
||||
|
||||
System.out.println("Checking files");
|
||||
|
||||
int count = 0;
|
||||
|
||||
while (true) {
|
||||
String line = in.readLine();
|
||||
if ((count++ & 0x3FF) == 0) System.out.println("#LINE: " + line);
|
||||
if (line == null) break;
|
||||
originalLine = line;
|
||||
int pos = line.indexOf('#');
|
||||
if (pos >= 0) {
|
||||
line = line.substring(0,pos);
|
||||
}
|
||||
line = line.trim();
|
||||
if (line.length() == 0) continue;
|
||||
|
||||
|
||||
int splitCount = Utility.split(line, ';', parts);
|
||||
// FIX check splitCount
|
||||
for (int i = 0; i < splitCount; ++i) {
|
||||
parts[i] = Utility.fromHex(parts[i]);
|
||||
}
|
||||
|
||||
if (UTF32.length32(parts[0]) == 1) {
|
||||
int code = UTF32.char32At(parts[0],0);
|
||||
charsListed.set(code);
|
||||
if ((code & 0x3FF) == 0) System.out.println("# " + Utility.hex(code));
|
||||
}
|
||||
|
||||
// c2 == NFC(c1) == NFC(c2) == NFC(c3)
|
||||
errorCount += check("NFCa", Default.nfc(), parts[1], parts[0]);
|
||||
errorCount += check("NFCb", Default.nfc(), parts[1], parts[1]);
|
||||
errorCount += check("NFCc", Default.nfc(), parts[1], parts[2]);
|
||||
|
||||
// c4 == NFC(c4) == NFC(c5)
|
||||
errorCount += check("NFCd", Default.nfc(), parts[3], parts[3]);
|
||||
errorCount += check("NFCe", Default.nfc(), parts[3], parts[4]);
|
||||
|
||||
// c3 == NFD(c1) == NFD(c2) == NFD(c3)
|
||||
errorCount += check("NFDa", Default.nfd(), parts[2], parts[0]);
|
||||
errorCount += check("NFDb", Default.nfd(), parts[2], parts[1]);
|
||||
errorCount += check("NFDc", Default.nfd(), parts[2], parts[2]);
|
||||
|
||||
// c5 == NFD(c4) == NFD(c5)
|
||||
errorCount += check("NFDd", Default.nfd(), parts[4], parts[3]);
|
||||
errorCount += check("NFDe", Default.nfd(), parts[4], parts[4]);
|
||||
|
||||
// c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
|
||||
errorCount += check("NFKCa", Default.nfkc(), parts[3], parts[0]);
|
||||
errorCount += check("NFKCb", Default.nfkc(), parts[3], parts[1]);
|
||||
errorCount += check("NFKCc", Default.nfkc(), parts[3], parts[2]);
|
||||
errorCount += check("NFKCd", Default.nfkc(), parts[3], parts[3]);
|
||||
errorCount += check("NFKCe", Default.nfkc(), parts[3], parts[4]);
|
||||
|
||||
// c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
|
||||
errorCount += check("NFKDa", Default.nfkd(), parts[4], parts[0]);
|
||||
errorCount += check("NFKDb", Default.nfkd(), parts[4], parts[1]);
|
||||
errorCount += check("NFKDc", Default.nfkd(), parts[4], parts[2]);
|
||||
errorCount += check("NFKDd", Default.nfkd(), parts[4], parts[3]);
|
||||
errorCount += check("NFKDe", Default.nfkd(), parts[4], parts[4]);
|
||||
}
|
||||
System.out.println("Total errors in file: " + errorCount
|
||||
+ ", lines: " + lineErrorCount);
|
||||
errorCount = lineErrorCount = 0;
|
||||
|
||||
System.out.println("Checking Missing");
|
||||
checkMissing();
|
||||
System.out.println("Total errors in unlisted items: " + errorCount
|
||||
+ ", lines: " + lineErrorCount);
|
||||
|
||||
} finally {
|
||||
if (in != null) in.close();
|
||||
if (out != null) out.close();
|
||||
}
|
||||
}
|
||||
|
||||
static String lastBase = "";
|
||||
|
||||
public static int check(String type, Normalizer n, String base, String other) {
|
||||
try {
|
||||
String trans = n.normalize(other);
|
||||
if (!trans.equals(base)) {
|
||||
String temp = "";
|
||||
if (!lastLine.equals(originalLine)) {
|
||||
temp = "// " + originalLine;
|
||||
lastLine = originalLine;
|
||||
}
|
||||
if (!base.equals(lastBase)) {
|
||||
lastBase = base;
|
||||
lineErrorCount++;
|
||||
}
|
||||
String otherList = "";
|
||||
if (!base.equals(other)) {
|
||||
otherList = "(" + Default.ucd().getCodeAndName(other) + ")";
|
||||
}
|
||||
out.println("DIFF " + type + ": "
|
||||
+ Default.ucd().getCodeAndName(base) + " != "
|
||||
+ type
|
||||
+ otherList
|
||||
+ " == " + Default.ucd().getCodeAndName(trans)
|
||||
+ temp
|
||||
);
|
||||
return 1;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new ChainException("DIFF " + type + ": "
|
||||
+ Default.ucd().getCodeAndName(base) + " != "
|
||||
+ type + "(" + Default.ucd().getCodeAndName(other) + ")", new Object[]{}, e);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
public static int check(String type, Normalizer n, String base) {
|
||||
return check(type, n, base, base);
|
||||
}
|
||||
|
||||
static void checkMissing() {
|
||||
for (int missing = 0; missing < 0x100000; ++missing) {
|
||||
if ((missing & 0xFFF) == 0) System.out.println("# " + Utility.hex(missing));
|
||||
if (charsListed.get(missing)) continue;
|
||||
String x = UTF32.valueOf32(missing);
|
||||
errorCount += check("NFC", Default.nfc(), x);
|
||||
errorCount += check("NFD", Default.nfd(), x);
|
||||
errorCount += check("NFKC", Default.nfkc(), x);
|
||||
errorCount += check("NFKD", Default.nfkd(), x);
|
||||
}
|
||||
}
|
||||
|
||||
public static void checkStarters () {
|
||||
System.out.println("Checking Starters");
|
||||
UnicodeSet leading = new UnicodeSet();
|
||||
UnicodeSet trailing = new UnicodeSet();
|
||||
for (int i = 0; i <= 0x10FFFF; ++i) {
|
||||
if (Default.nfc().isLeading(i)) leading.add(i);
|
||||
if (Default.ucd().getCombiningClass(i) != 0) continue;
|
||||
if (Default.nfc().isTrailing(i)) trailing.add(i);
|
||||
}
|
||||
System.out.println("Leading: " + leading.size());
|
||||
System.out.println("Trailing Starters: " + trailing.size());
|
||||
UnicodeSetIterator lead = new UnicodeSetIterator(leading);
|
||||
UnicodeSetIterator trail = new UnicodeSetIterator(trailing);
|
||||
UnicodeSet followers = new UnicodeSet();
|
||||
Map map = new TreeMap(new CompareProperties.UnicodeSetComparator());
|
||||
while (lead.next()) {
|
||||
trail.reset();
|
||||
followers.clear();
|
||||
while (trail.next()) {
|
||||
if (Default.nfc().getComposition(lead.codepoint, trail.codepoint) != 0xFFFF) {
|
||||
followers.add(trail.codepoint);
|
||||
}
|
||||
}
|
||||
if (followers.size() == 0) continue;
|
||||
System.out.println(Default.ucd().getCode(lead.codepoint)
|
||||
+ "\t" + followers.toPattern(true));
|
||||
UnicodeSet possLead = (UnicodeSet) map.get(followers);
|
||||
if (possLead == null) {
|
||||
possLead = new UnicodeSet();
|
||||
map.put(followers.clone(), possLead);
|
||||
}
|
||||
possLead.add(lead.codepoint);
|
||||
}
|
||||
Iterator it = map.keySet().iterator();
|
||||
BagFormatter bf = new BagFormatter();
|
||||
bf.setLineSeparator("<br>");
|
||||
bf.setLabelSource(null);
|
||||
bf.setAbbreviated(true);
|
||||
while (it.hasNext()) {
|
||||
UnicodeSet t = (UnicodeSet) it.next();
|
||||
UnicodeSet l = (UnicodeSet) map.get(t);
|
||||
System.out.println("<tr><td>"
|
||||
+ bf.showSetNames(l)
|
||||
+ "</td><td>"
|
||||
+ bf.showSetNames(t)
|
||||
+ "</td></tr>");
|
||||
}
|
||||
}
|
||||
}
|
@ -1,259 +0,0 @@
|
||||
package com.ibm.text.UCD;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.text.ParseException;
|
||||
import java.text.ParsePosition;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.Tabber;
|
||||
import com.ibm.icu.dev.test.util.TransliteratorUtilities;
|
||||
import com.ibm.icu.dev.tool.UOption;
|
||||
import com.ibm.icu.text.SymbolTable;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeMatcher;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.text.utility.Utility;
|
||||
|
||||
public class TestUnicodeInvariants {
|
||||
private static final int
|
||||
HELP1 = 0,
|
||||
FILE = 1,
|
||||
RANGE = 2,
|
||||
TABLE = 3
|
||||
;
|
||||
|
||||
private static final UOption[] options = {
|
||||
UOption.HELP_H(),
|
||||
UOption.create("file", 'f', UOption.REQUIRES_ARG),
|
||||
UOption.create("norange", 'n', UOption.NO_ARG),
|
||||
UOption.create("table", 't', UOption.NO_ARG),
|
||||
};
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
UOption.parseArgs(args, options);
|
||||
|
||||
String file = "UnicodeInvariants.txt";
|
||||
if (options[FILE].doesOccur) file = options[FILE].value;
|
||||
boolean doRange = !options[RANGE].doesOccur;
|
||||
System.out.println("File:\t" + file);
|
||||
System.out.println("Ranges?\t" + doRange);
|
||||
System.out.println("HTML?\t" + options[TABLE].doesOccur);
|
||||
|
||||
testInvariants(file, doRange);
|
||||
}
|
||||
|
||||
/**
|
||||
* Chain together several SymbolTables.
|
||||
* @author Davis
|
||||
*/
|
||||
static class ChainedSymbolTable implements SymbolTable {
|
||||
// TODO: add accessors?
|
||||
private List symbolTables;
|
||||
/**
|
||||
* Each SymbolTable is each accessed in order by the other methods,
|
||||
* so the first in the list is accessed first, etc.
|
||||
* @param symbolTables
|
||||
*/
|
||||
ChainedSymbolTable(SymbolTable[] symbolTables) {
|
||||
this.symbolTables = Arrays.asList(symbolTables);
|
||||
}
|
||||
public char[] lookup(String s) {
|
||||
for (Iterator it = symbolTables.iterator(); it.hasNext();) {
|
||||
SymbolTable st = (SymbolTable) it.next();
|
||||
char[] result = st.lookup(s);
|
||||
if (result != null) return result;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public UnicodeMatcher lookupMatcher(int ch) {
|
||||
for (Iterator it = symbolTables.iterator(); it.hasNext();) {
|
||||
SymbolTable st = (SymbolTable) it.next();
|
||||
UnicodeMatcher result = st.lookupMatcher(ch);
|
||||
if (result != null) return result;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// Warning: this depends on pos being left alone unless a string is returned!!
|
||||
public String parseReference(String text, ParsePosition pos, int limit) {
|
||||
for (Iterator it = symbolTables.iterator(); it.hasNext();) {
|
||||
SymbolTable st = (SymbolTable) it.next();
|
||||
String result = st.parseReference(text, pos, limit);
|
||||
if (result != null) return result;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
static final UnicodeSet INVARIANT_RELATIONS = new UnicodeSet("[\\~ \\= \\! \\? \\< \\> \u2264 \u2265 \u2282 \u2286 \u2283 \u2287]");
|
||||
|
||||
public static void testInvariants(String outputFile, boolean doRange) throws IOException {
|
||||
String[][] variables = new String[100][2];
|
||||
int variableCount = 0;
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "UnicodeInvariantResults.txt");
|
||||
out.write('\uFEFF'); // BOM
|
||||
BufferedReader in = BagFormatter.openUTF8Reader("com/ibm/text/UCD/", outputFile);
|
||||
|
||||
BagFormatter errorLister = new BagFormatter();
|
||||
errorLister.setMergeRanges(doRange);
|
||||
errorLister.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
|
||||
errorLister.setShowLiteral(TransliteratorUtilities.toXML);
|
||||
if (options[TABLE].doesOccur) errorLister.setTabber(new Tabber.HTMLTabber());
|
||||
|
||||
BagFormatter showLister = new BagFormatter();
|
||||
showLister.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
|
||||
showLister.setMergeRanges(doRange);
|
||||
showLister.setShowLiteral(TransliteratorUtilities.toXML);
|
||||
if (options[TABLE].doesOccur) showLister.setTabber(new Tabber.HTMLTabber());
|
||||
|
||||
ChainedSymbolTable st = new ChainedSymbolTable(new SymbolTable[] {
|
||||
ToolUnicodePropertySource.make(UCD.lastVersion).getSymbolTable("\u00D7"),
|
||||
ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")});
|
||||
ParsePosition pp = new ParsePosition(0);
|
||||
int parseErrorCount = 0;
|
||||
int testFailureCount = 0;
|
||||
while (true) {
|
||||
String line = in.readLine();
|
||||
if (line == null) break;
|
||||
if (line.startsWith("\uFEFF")) line = line.substring(1);
|
||||
out.println(line);
|
||||
line = line.trim();
|
||||
int pos = line.indexOf('#');
|
||||
if (pos >= 0) line = line.substring(0,pos).trim();
|
||||
if (line.length() == 0) continue;
|
||||
if (line.equalsIgnoreCase("Stop")) break;
|
||||
|
||||
// fix all the variables
|
||||
String oldLine = line;
|
||||
line = Utility.replace(line, variables, variableCount);
|
||||
|
||||
// detect variables
|
||||
if (line.startsWith("Let")) {
|
||||
int x = line.indexOf('=');
|
||||
variables[variableCount][0] = line.substring(3,x).trim();
|
||||
variables[variableCount][1] = line.substring(x+1).trim();
|
||||
variableCount++;
|
||||
if (false) System.out.println("Added variable: <" + variables[variableCount-1][0] + "><"
|
||||
+ variables[variableCount-1][1] + ">");
|
||||
continue;
|
||||
}
|
||||
|
||||
// detect variables
|
||||
if (line.startsWith("Show")) {
|
||||
String part = line.substring(4).trim();
|
||||
if (part.startsWith("Each")) {
|
||||
part = part.substring(4).trim();
|
||||
showLister.setMergeRanges(false);
|
||||
}
|
||||
pp.setIndex(0);
|
||||
UnicodeSet leftSet = new UnicodeSet(part, pp, st);
|
||||
showLister.showSetNames(out, leftSet);
|
||||
showLister.setMergeRanges(doRange);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (line.startsWith("Test")) {
|
||||
line = line.substring(4).trim();
|
||||
}
|
||||
|
||||
char relation = 0;
|
||||
String rightSide = null;
|
||||
String leftSide = null;
|
||||
UnicodeSet leftSet = null;
|
||||
UnicodeSet rightSet = null;
|
||||
try {
|
||||
pp.setIndex(0);
|
||||
leftSet = new UnicodeSet(line, pp, st);
|
||||
leftSide = line.substring(0,pp.getIndex());
|
||||
eatWhitespace(line, pp);
|
||||
relation = line.charAt(pp.getIndex());
|
||||
if (!INVARIANT_RELATIONS.contains(relation)) {
|
||||
throw new ParseException("Invalid relation, must be one of " + INVARIANT_RELATIONS.toPattern(false),
|
||||
pp.getIndex());
|
||||
}
|
||||
pp.setIndex(pp.getIndex()+1); // skip char
|
||||
eatWhitespace(line, pp);
|
||||
int start = pp.getIndex();
|
||||
rightSet = new UnicodeSet(line, pp, st);
|
||||
rightSide = line.substring(start,pp.getIndex());
|
||||
eatWhitespace(line, pp);
|
||||
if (line.length() != pp.getIndex()) {
|
||||
throw new ParseException("Extra characters at end", pp.getIndex());
|
||||
}
|
||||
} catch (ParseException e) {
|
||||
out.println("PARSE ERROR:\t" + line.substring(0,e.getErrorOffset())
|
||||
+ "<@>" + line.substring(e.getErrorOffset()));
|
||||
out.println();
|
||||
out.println("**** START Error Info ****");
|
||||
out.println(e.getMessage());
|
||||
out.println("**** END Error Info ****");
|
||||
out.println();
|
||||
parseErrorCount++;
|
||||
continue;
|
||||
} catch (IllegalArgumentException e) {
|
||||
out.println("PARSE ERROR:\t" + line);
|
||||
out.println();
|
||||
out.println("**** START Error Info ****");
|
||||
out.println(e.getMessage());
|
||||
out.println("**** END Error Info ****");
|
||||
out.println();
|
||||
parseErrorCount++;
|
||||
continue;
|
||||
}
|
||||
|
||||
boolean ok = true;
|
||||
switch(relation) {
|
||||
case '=': case '\u2261': ok = leftSet.equals(rightSet); break;
|
||||
case '<': case '\u2282': ok = rightSet.containsAll(leftSet) && !leftSet.equals(rightSet); break;
|
||||
case '>': case '\u2283': ok = leftSet.containsAll(rightSet) && !leftSet.equals(rightSet); break;
|
||||
case '\u2264': case '\u2286': ok = rightSet.containsAll(leftSet); break;
|
||||
case '\u2265': case '\u2287': ok = leftSet.containsAll(rightSet); break;
|
||||
case '!': ok = leftSet.containsNone(rightSet); break;
|
||||
case '?': ok = !leftSet.equals(rightSet)
|
||||
&& !leftSet.containsAll(rightSet)
|
||||
&& !rightSet.containsAll(leftSet)
|
||||
&& !leftSet.containsNone(rightSet);
|
||||
break;
|
||||
default: throw new IllegalArgumentException("Internal Error");
|
||||
}
|
||||
if (ok) continue;
|
||||
out.println();
|
||||
out.println(String.valueOf(ok).toUpperCase(Locale.ENGLISH));
|
||||
out.println("**** START Error Info ****");
|
||||
errorLister.showSetDifferences(out, rightSide, rightSet, leftSide, leftSet);
|
||||
out.println("**** END Error Info ****");
|
||||
out.println();
|
||||
testFailureCount++;
|
||||
}
|
||||
out.println();
|
||||
out.println("**** SUMMARY ****");
|
||||
out.println();
|
||||
out.println("ParseErrorCount=" + parseErrorCount);
|
||||
out.println("TestFailureCount=" + testFailureCount);
|
||||
out.close();
|
||||
System.out.println("ParseErrorCount=" + parseErrorCount);
|
||||
System.out.println("TestFailureCount=" + testFailureCount);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param line
|
||||
* @param pp
|
||||
*/
|
||||
private static void eatWhitespace(String line, ParsePosition pp) {
|
||||
int cp = 0;
|
||||
int i;
|
||||
for (i = pp.getIndex(); i < line.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(line, i);
|
||||
if (!com.ibm.icu.lang.UCharacter.isUWhiteSpace(cp)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
pp.setIndex(i);
|
||||
}
|
||||
}
|
@ -1,780 +0,0 @@
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.text.NumberFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
import com.ibm.icu.dev.test.util.UnicodeProperty;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.text.utility.Utility;
|
||||
|
||||
public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
|
||||
static final boolean DEBUG = false;
|
||||
|
||||
private UCD ucd;
|
||||
|
||||
private Normalizer nfc, nfd, nfkd, nfkc;
|
||||
|
||||
private static boolean needAgeCache = true;
|
||||
|
||||
private static UCD[] ucdCache = new UCD[UCD_Types.LIMIT_AGE];
|
||||
|
||||
private static HashMap factoryCache = new HashMap();
|
||||
|
||||
public static synchronized ToolUnicodePropertySource make(String version) {
|
||||
ToolUnicodePropertySource result = (ToolUnicodePropertySource) factoryCache.get(version);
|
||||
if (result != null)
|
||||
return result;
|
||||
result = new ToolUnicodePropertySource(version);
|
||||
factoryCache.put(version, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Builds the property source for one version of the Unicode Character Database.
 *
 * <p>The constructor loads the UCD via {@code UCD.make(version)}, creates the four
 * normalizers for that version, and then registers properties with {@code add(...)}
 * in this order:
 * <ol>
 * <li>hand-written "simple" properties (Name, Block, Bidi_Mirroring_Glyph, Case_Folding,
 *     Numeric_Value, FC_NFKC_Closure and the four *_Quick_Check properties);</li>
 * <li>standard derived properties from {@code DerivedProperty} that are not already present;</li>
 * <li>every remaining alias reported by {@code UnifiedProperty}, wrapped in
 *     {@code ToolUnicodeProperty};</li>
 * <li>Grapheme_Cluster_Break, Word_Break and Sentence_Break, built from the properties
 *     registered above, only when the composite version is at least {@code 0x040000}
 *     (presumably Unicode 4.0.0 - confirm against {@code UCD.getCompositeVersion()}).</li>
 * </ol>
 *
 * @param version the requested UCD version string; it is replaced by the regularized
 *                form returned by {@code ucd.getVersion()} before being attached to properties
 */
private ToolUnicodePropertySource(String version) {
    ucd = UCD.make(version);
    nfc = new Normalizer(Normalizer.NFC, ucd.getVersion());
    nfd = new Normalizer(Normalizer.NFD, ucd.getVersion());
    nfkc = new Normalizer(Normalizer.NFKC, ucd.getVersion());
    nfkd = new Normalizer(Normalizer.NFKD, ucd.getVersion());

    version = ucd.getVersion(); // regularize

    // first the special cases
    if (DEBUG) {
        System.out.println("Adding Simple Cases");
    }

    add(new UnicodeProperty.SimpleProperty() {
        public String _getValue(int codepoint) {
            // code points whose category is in ODD_BALLS (Cn, Co, Cs, Cc) report no name
            if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) {
                return null;
            }
            return ucd.getName(codepoint);
        }
    }.setValues("<string>").setMain("Name", "na", UnicodeProperty.MISC, version));

    add(new UnicodeProperty.SimpleProperty() {
        public String _getValue(int codepoint) {
            if (DEBUG && codepoint == 0x1D100) {
                System.out.println("here");
            }
            //if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null;
            return ucd.getBlock(codepoint);
        }

        protected UnicodeMap _getUnicodeMap() {
            return ucd.blockData;
        }
    }.setValues(ucd.getBlockNames(null)).setMain("Block", "blk", UnicodeProperty.CATALOG, version));

    add(new UnicodeProperty.SimpleProperty() {
        public String _getValue(int codepoint) {
            //if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null;
            return ucd.getBidiMirror(codepoint);
        }
    }.setValues("<string>").setMain("Bidi_Mirroring_Glyph", "bmg", UnicodeProperty.STRING, version));

    add(new UnicodeProperty.SimpleProperty() {
        public String _getValue(int codepoint) {
            //if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null;
            return ucd.getCase(codepoint, UCD_Types.FULL, UCD_Types.FOLD);
        }
    }.setValues("<string>").setMain("Case_Folding", "cf", UnicodeProperty.STRING, version));

    add(new UnicodeProperty.SimpleProperty() {
        // Pinned to an English format so that the generated values do not depend on the
        // default locale of the machine running the tool (decimal comma, local digits...).
        NumberFormat nf = NumberFormat.getInstance(Locale.ENGLISH);
        {
            nf.setGroupingUsed(false);
            nf.setMaximumFractionDigits(8);
            nf.setMinimumFractionDigits(1);
        }

        public String _getValue(int codepoint) {
            double num = ucd.getNumericValue(codepoint);
            // NaN marks "no numeric value"
            if (Double.isNaN(num)) {
                return null;
            }
            return nf.format(num);
        }
    }.setMain("Numeric_Value", "nv", UnicodeProperty.NUMERIC, version));

    add(new UnicodeProperty.SimpleProperty() {
        public String _getValue(int cp) {
            if (!ucd.isRepresented(cp)) {
                return null;
            }
            // fold + NFKC once, then again; only report a value when the second pass changes it
            String b = nfkc.normalize(ucd.getCase(cp, UCD_Types.FULL, UCD_Types.FOLD));
            String c = nfkc.normalize(ucd.getCase(b, UCD_Types.FULL, UCD_Types.FOLD));
            if (c.equals(b)) {
                return null;
            }
            return c;
        }

        public int getMaxWidth(boolean isShort) {
            return 14;
        }
    }.setMain("FC_NFKC_Closure", "FC_NFKC", UnicodeProperty.STRING, version)
    //.addName("FNC")
    );

    add(new UnicodeProperty.SimpleProperty() {
        public String _getValue(int codepoint) {
            if (!nfd.isNormalized(codepoint)) {
                return "No";
            } else if (nfd.isTrailing(codepoint)) {
                // a decomposed form has no "Maybe" answer, so this is treated as a data error
                throw new IllegalArgumentException("Internal Error!");
            } else {
                return "Yes";
            }
        }

        public int getMaxWidth(boolean isShort) {
            return 15;
        }
    }.setValues(LONG_YES_NO, YES_NO).swapFirst2ValueAliases().setMain("NFD_Quick_Check", "NFD_QC", UnicodeProperty.ENUMERATED, version));

    add(new UnicodeProperty.SimpleProperty() {
        public String _getValue(int codepoint) {
            if (!nfc.isNormalized(codepoint)) {
                return "No";
            } else if (nfc.isTrailing(codepoint)) {
                return "Maybe";
            } else {
                return "Yes";
            }
        }

        public int getMaxWidth(boolean isShort) {
            return 15;
        }
    }.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE).swapFirst2ValueAliases().setMain("NFC_Quick_Check", "NFC_QC", UnicodeProperty.ENUMERATED, version));

    add(new UnicodeProperty.SimpleProperty() {
        public String _getValue(int codepoint) {
            if (!nfkd.isNormalized(codepoint)) {
                return "No";
            } else if (nfkd.isTrailing(codepoint)) {
                // a decomposed form has no "Maybe" answer, so this is treated as a data error
                throw new IllegalArgumentException("Internal Error!");
            } else {
                return "Yes";
            }
        }

        public int getMaxWidth(boolean isShort) {
            return 15;
        }
    }.setValues(LONG_YES_NO, YES_NO).swapFirst2ValueAliases().setMain("NFKD_Quick_Check", "NFKD_QC", UnicodeProperty.ENUMERATED, version));

    add(new UnicodeProperty.SimpleProperty() {
        public String _getValue(int codepoint) {
            if (!nfkc.isNormalized(codepoint)) {
                return "No";
            } else if (nfkc.isTrailing(codepoint)) {
                return "Maybe";
            } else {
                return "Yes";
            }
        }

        public int getMaxWidth(boolean isShort) {
            return 15;
        }
    }.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE).swapFirst2ValueAliases().setMain("NFKC_Quick_Check", "NFKC_QC", UnicodeProperty.ENUMERATED, version));

    /*
    add(new UnicodeProperty.SimpleProperty() {
        public String _getValue(int codepoint) {
            if (!nfx.isNormalized(codepoint)) return NO;
            else if (nfx.isTrailing(codepoint)) return MAYBE;
            else return "";
        }
    }.setMain("NFD_QuickCheck", "nv", UnicodeProperty.NUMERIC, version)
    .setValues("<number>"));
    */

    // Now the derived properties
    if (DEBUG) {
        System.out.println("Derived Properties");
    }
    for (int i = 0; i < DerivedProperty.DERIVED_PROPERTY_LIMIT; ++i) {
        UCDProperty prop = DerivedProperty.make(i);
        if (prop == null) {
            continue;
        }
        if (!prop.isStandard()) {
            continue;
        }
        String name = prop.getName();
        if (getProperty(name) != null) {
            if (DEBUG) {
                System.out.println("Iterated Names: " + name + ", ALREADY PRESENT*");
            }
            continue; // skip if already there
        }
        int type = prop.getValueType();
        // two derived properties get a fixed type; the rest are mapped generically
        if (i == UCD_Types.FC_NFKC_Closure) {
            type = UnicodeProperty.STRING;
        } else if (i == UCD_Types.FullCompExclusion) {
            type = UnicodeProperty.BINARY;
        } else {
            type = remapUCDType(type);
        }

        if (DEBUG) {
            System.out.println(prop.getName());
        }
        add(new UCDPropertyWrapper(prop, type, false));
    }

    // then the general stuff

    if (DEBUG) {
        System.out.println("Other Properties");
    }
    List names = new ArrayList();
    UnifiedProperty.getAvailablePropertiesAliases(names, ucd);
    Iterator it = names.iterator();
    while (it.hasNext()) {
        String name = (String) it.next();
        if (getProperty(name) != null) {
            if (DEBUG) {
                System.out.println("Iterated Names: " + name + ", ALREADY PRESENT");
            }
            continue; // skip if already there
        }
        if (DEBUG) {
            System.out.println("Iterated Names: " + name);
        }
        add(new ToolUnicodeProperty(name));
    }

    int compositeVersion = ucd.getCompositeVersion();

    if (compositeVersion >= 0x040000) {
        add(new UnicodeProperty.UnicodeMapProperty() {
            {
                unicodeMap = new UnicodeMap();
                unicodeMap.setErrorOnReset(true);
                unicodeMap.put(0xD, "CR");
                unicodeMap.put(0xA, "LF");
                UnicodeProperty cat = getProperty("General_Category");
                UnicodeSet temp = cat.getSet("Line_Separator").addAll(cat.getSet("Paragraph_Separator")).addAll(cat.getSet("Control")).addAll(cat.getSet("Format")).remove(0xD).remove(0xA).remove(0x200C)
                        .remove(0x200D);
                unicodeMap.putAll(temp, "Control");
                UnicodeSet graphemeExtend = getProperty("Grapheme_Extend").getSet("true");
                unicodeMap.putAll(graphemeExtend, "Extend");
                UnicodeProperty hangul = getProperty("Hangul_Syllable_Type");
                unicodeMap.putAll(hangul.getSet("L"), "L");
                unicodeMap.putAll(hangul.getSet("V"), "V");
                unicodeMap.putAll(hangul.getSet("T"), "T");
                unicodeMap.putAll(hangul.getSet("LV"), "LV");
                unicodeMap.putAll(hangul.getSet("LVT"), "LVT");
                unicodeMap.setMissing("Other");
            }
        }.setMain("Grapheme_Cluster_Break", "GCB", UnicodeProperty.ENUMERATED, version).addValueAliases(new String[][] { { "Control", "CN" }, { "Extend", "EX" }, { "Other", "XX" }, }, true)
                .swapFirst2ValueAliases());
    }

    if (compositeVersion >= 0x040000) {
        add(new UnicodeProperty.UnicodeMapProperty() {
            {
                unicodeMap = new UnicodeMap();
                unicodeMap.setErrorOnReset(true);
                UnicodeProperty cat = getProperty("General_Category");
                unicodeMap.putAll(cat.getSet("Format").remove(0x200C).remove(0x200D), "Format");
                UnicodeProperty script = getProperty("Script");
                unicodeMap.putAll(script.getSet("Katakana").addAll(new UnicodeSet("[\u3031\u3032\u3033\u3034\u3035\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]")), "Katakana");
                UnicodeSet graphemeExtend = getProperty("Grapheme_Extend").getSet("true");
                UnicodeProperty lineBreak = getProperty("Line_Break");
                unicodeMap.putAll(getProperty("Alphabetic").getSet("true").add(0x05F3).removeAll(getProperty("Ideographic").getSet("true")).removeAll(unicodeMap.getSet("Katakana"))
                        //.removeAll(script.getSet("Thai"))
                        //.removeAll(script.getSet("Lao"))
                        .removeAll(lineBreak.getSet("SA")).removeAll(script.getSet("Hiragana")).removeAll(graphemeExtend), "ALetter");
                unicodeMap.putAll(new UnicodeSet("[\\u0027\\u00B7\\u05F4\\u2019\\u2027\\u003A]"), "MidLetter");
                unicodeMap.putAll(lineBreak.getSet("Infix_Numeric").remove(0x003A), "MidNum");
                unicodeMap.putAll(lineBreak.getSet("Numeric"), "Numeric");
                unicodeMap.putAll(cat.getSet("Connector_Punctuation").remove(0x30FB).remove(0xFF65), "ExtendNumLet");
                unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it.
                unicodeMap.setMissing("Other");
            }
        }.setMain("Word_Break", "WB", UnicodeProperty.ENUMERATED, version).addValueAliases(
                new String[][] { { "Format", "FO" }, { "Katakana", "KA" }, { "ALetter", "LE" }, { "MidLetter", "ML" }, { "MidNum", "MN" }, { "Numeric", "NU" }, { "ExtendNumLet", "EX" }, { "Other", "XX" }, },
                true).swapFirst2ValueAliases());
    }

    if (compositeVersion >= 0x040000) {
        add(new UnicodeProperty.UnicodeMapProperty() {
            {
                unicodeMap = new UnicodeMap();
                unicodeMap.setErrorOnReset(true);
                unicodeMap.putAll(new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]"), "Sep");
                UnicodeProperty cat = getProperty("General_Category");
                unicodeMap.putAll(cat.getSet("Format").remove(0x200C).remove(0x200D), "Format");
                unicodeMap.putAll(getProperty("Whitespace").getSet("true").removeAll(unicodeMap.getSet("Sep")).remove(0xA0), "Sp");
                UnicodeSet graphemeExtend = getProperty("Grapheme_Extend").getSet("true");
                unicodeMap.putAll(getProperty("Lowercase").getSet("true").removeAll(graphemeExtend), "Lower");
                unicodeMap.putAll(getProperty("Uppercase").getSet("true").addAll(cat.getSet("Titlecase_Letter")), "Upper");
                UnicodeSet temp = getProperty("Alphabetic").getSet("true").add(0xA0).add(0x5F3).removeAll(unicodeMap.getSet("Lower")).removeAll(unicodeMap.getSet("Upper")).removeAll(graphemeExtend);
                unicodeMap.putAll(temp, "OLetter");
                UnicodeProperty lineBreak = getProperty("Line_Break");
                unicodeMap.putAll(lineBreak.getSet("Numeric"), "Numeric");
                unicodeMap.put(0x002E, "ATerm");
                unicodeMap.putAll(getProperty("STerm").getSet("true").removeAll(unicodeMap.getSet("ATerm")), "STerm");
                unicodeMap.putAll(cat.getSet("Open_Punctuation").addAll(cat.getSet("Close_Punctuation")).addAll(lineBreak.getSet("Quotation")).remove(0x05F3).removeAll(unicodeMap.getSet("ATerm")).removeAll(
                        unicodeMap.getSet("STerm")), "Close");
                unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it.
                unicodeMap.setMissing("Other");
            }
        }.setMain("Sentence_Break", "SB", UnicodeProperty.ENUMERATED, version).addValueAliases(
                new String[][] { { "Sep", "SE" }, { "Format", "FO" }, { "Sp", "SP" }, { "Lower", "LO" }, { "Upper", "UP" }, { "OLetter", "LE" }, { "Numeric", "NU" }, { "ATerm", "AT" }, { "STerm", "ST" },
                        { "Close", "CL" }, { "Other", "XX" }, }, false).swapFirst2ValueAliases());
    }
}
|
||||
|
||||
static String[] YES_NO_MAYBE = { "N", "M", "Y" };
|
||||
|
||||
static String[] LONG_YES_NO_MAYBE = { "No", "Maybe", "Yes" };
|
||||
|
||||
static String[] YES_NO = { "N", "Y" };
|
||||
|
||||
static String[] LONG_YES_NO = { "No", "Yes" };
|
||||
|
||||
/*
|
||||
"Bidi_Mirroring_Glyph", "Block", "Case_Folding", "Case_Sensitive", "ISO_Comment",
|
||||
"Lowercase_Mapping", "Name", "Numeric_Value", "Simple_Case_Folding",
|
||||
"Simple_Lowercase_Mapping", "Simple_Titlecase_Mapping", "Simple_Uppercase_Mapping",
|
||||
"Titlecase_Mapping", "Unicode_1_Name", "Uppercase_Mapping", "isCased", "isCasefolded",
|
||||
"isLowercase", "isNFC", "isNFD", "isNFKC", "isNFKD", "isTitlecase", "isUppercase",
|
||||
"toNFC", "toNFD", "toNFKC", "toNKFD"
|
||||
});
|
||||
*/
|
||||
|
||||
/*
|
||||
private class NameProperty extends UnicodeProperty.SimpleProperty {
|
||||
{set("Name", "na", "<string>", UnicodeProperty.STRING);}
|
||||
public String getPropertyValue(int codepoint) {
|
||||
if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null;
|
||||
return ucd.getName(codepoint);
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
static class UCDPropertyWrapper extends UnicodeProperty {
|
||||
UCDProperty ucdProperty;
|
||||
|
||||
boolean yes_no_maybe;
|
||||
|
||||
UCDPropertyWrapper(UCDProperty ucdProperty, int type, boolean yes_no_maybe) {
|
||||
this.ucdProperty = ucdProperty;
|
||||
setType(type);
|
||||
String name = ucdProperty.getName(UCDProperty.LONG);
|
||||
if (name == null)
|
||||
ucdProperty.getName(UCDProperty.SHORT);
|
||||
setName(name);
|
||||
this.yes_no_maybe = yes_no_maybe;
|
||||
}
|
||||
|
||||
protected String _getVersion() {
|
||||
return ucdProperty.getUCD().getVersion();
|
||||
}
|
||||
|
||||
protected String _getValue(int codepoint) {
|
||||
String result = ucdProperty.getValue(codepoint, UCDProperty.LONG);
|
||||
if (result.length() == 0) {
|
||||
return "False";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
protected List _getNameAliases(List result) {
|
||||
addUnique(ucdProperty.getName(UCDProperty.SHORT), result);
|
||||
String name = getName();
|
||||
addUnique(name, result);
|
||||
if (name.equals("White_Space"))
|
||||
addUnique("space", result);
|
||||
return result;
|
||||
}
|
||||
|
||||
protected List _getValueAliases(String valueAlias, List result) {
|
||||
if (isType(BINARY_MASK)) {
|
||||
if (valueAlias.equals("True"))
|
||||
addUnique("T", result);
|
||||
else if (valueAlias.equals("False"))
|
||||
addUnique("F", result);
|
||||
addUnique(valueAlias, result);
|
||||
}
|
||||
if (yes_no_maybe) {
|
||||
if (valueAlias.equals("Yes"))
|
||||
addUnique("Y", result);
|
||||
else if (valueAlias.equals("No"))
|
||||
addUnique("N", result);
|
||||
else if (valueAlias.equals("Maybe"))
|
||||
addUnique("M", result);
|
||||
addUnique(valueAlias, result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
protected List _getAvailableValues(List result) {
|
||||
if (isType(BINARY_MASK)) {
|
||||
addUnique("True", result);
|
||||
addUnique("False", result);
|
||||
}
|
||||
if (yes_no_maybe) {
|
||||
addUnique("No", result);
|
||||
addUnique("Maybe", result);
|
||||
addUnique("Yes", result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
static final int ODD_BALLS = (1 << UCD_Types.Cn) | (1 << UCD_Types.Co) | (1 << UCD_Types.Cs) | (1 << UCD.Cc);
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see com.ibm.icu.dev.test.util.UnicodePropertySource#getPropertyAliases(java.util.Collection)
|
||||
*/
|
||||
private class ToolUnicodeProperty extends UnicodeProperty {
|
||||
com.ibm.text.UCD.UCDProperty up;
|
||||
|
||||
int propMask;
|
||||
|
||||
static final int EXTRA_START = 0x10000;
|
||||
|
||||
/**
 * Creates the wrapper for the property known to {@code UnifiedProperty} under
 * {@code propertyAlias}.
 *
 * @param propertyAlias the property name; it also becomes this property's name
 * @throws IllegalArgumentException if {@code UnifiedProperty.make} returns no property
 *                                  for the alias
 */
private ToolUnicodeProperty(String propertyAlias) {
    propMask = UnifiedProperty.getPropmask(propertyAlias, ucd);
    up = UnifiedProperty.make(propMask, ucd);
    if (up == null) {
        throw new IllegalArgumentException("Not found: " + propertyAlias);
    }
    setType(getPropertyTypeInternal());
    setName(propertyAlias);
    // Debugging aid. It used to run unconditionally and before setType(), so it wrote to
    // stdout on every run and reported the type before it had been assigned.
    if (DEBUG && propertyAlias.equals("Case_Fold_Turkish_I")) {
        System.out.println(propertyAlias + " " + getTypeName(getType()));
    }
}
|
||||
|
||||
public List _getAvailableValues(List result) {
|
||||
if (result == null)
|
||||
result = new ArrayList();
|
||||
int type = getType() & CORE_MASK;
|
||||
if (type == STRING || type == MISC)
|
||||
result.add("<string>");
|
||||
else if (type == NUMERIC)
|
||||
result.add("<number>");
|
||||
else if (type == BINARY) {
|
||||
result.add("True");
|
||||
result.add("False");
|
||||
} else if (type == ENUMERATED || type == CATALOG) {
|
||||
byte style = UCD_Types.LONG;
|
||||
int prop = propMask >> 8;
|
||||
String temp = null;
|
||||
boolean titlecase = false;
|
||||
for (int i = 0; i < 256; ++i) {
|
||||
boolean check = false;
|
||||
try {
|
||||
switch (prop) {
|
||||
case UCD_Types.CATEGORY >> 8:
|
||||
temp = (ucd.getCategoryID_fromIndex((byte) i, style));
|
||||
break;
|
||||
case UCD_Types.COMBINING_CLASS >> 8:
|
||||
temp = (ucd.getCombiningClassID_fromIndex((short) i, style));
|
||||
break;
|
||||
case UCD_Types.BIDI_CLASS >> 8:
|
||||
temp = (ucd.getBidiClassID_fromIndex((byte) i, style));
|
||||
break;
|
||||
case UCD_Types.DECOMPOSITION_TYPE >> 8:
|
||||
temp = (ucd.getDecompositionTypeID_fromIndex((byte) i, style));
|
||||
//check = temp != null;
|
||||
break;
|
||||
case UCD_Types.NUMERIC_TYPE >> 8:
|
||||
temp = (ucd.getNumericTypeID_fromIndex((byte) i, style));
|
||||
titlecase = true;
|
||||
break;
|
||||
case UCD_Types.EAST_ASIAN_WIDTH >> 8:
|
||||
temp = (ucd.getEastAsianWidthID_fromIndex((byte) i, style));
|
||||
break;
|
||||
case UCD_Types.LINE_BREAK >> 8:
|
||||
temp = (ucd.getLineBreakID_fromIndex((byte) i, style));
|
||||
break;
|
||||
case UCD_Types.JOINING_TYPE >> 8:
|
||||
temp = (ucd.getJoiningTypeID_fromIndex((byte) i, style));
|
||||
break;
|
||||
case UCD_Types.JOINING_GROUP >> 8:
|
||||
temp = (ucd.getJoiningGroupID_fromIndex((byte) i, style));
|
||||
break;
|
||||
case UCD_Types.SCRIPT >> 8:
|
||||
temp = (ucd.getScriptID_fromIndex((byte) i, style));
|
||||
titlecase = true;
|
||||
if (UnicodeProperty.UNUSED.equals(temp))
|
||||
continue;
|
||||
if (temp != null)
|
||||
temp = UCharacter.toTitleCase(Locale.ENGLISH, temp, null);
|
||||
break;
|
||||
case UCD_Types.AGE >> 8:
|
||||
temp = (ucd.getAgeID_fromIndex((byte) i, style));
|
||||
break;
|
||||
case UCD_Types.HANGUL_SYLLABLE_TYPE >> 8:
|
||||
temp = (ucd.getHangulSyllableTypeID_fromIndex((byte) i, style));
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("Internal Error: " + prop);
|
||||
}
|
||||
} catch (ArrayIndexOutOfBoundsException e) {
|
||||
continue;
|
||||
}
|
||||
if (check)
|
||||
System.out.println("Value: " + temp);
|
||||
if (temp != null && temp.length() != 0 && !temp.equals(UNUSED)) {
|
||||
result.add(Utility.getUnskeleton(temp, titlecase));
|
||||
}
|
||||
if (check)
|
||||
System.out.println("Value2: " + temp);
|
||||
}
|
||||
//if (prop == (UCD_Types.DECOMPOSITION_TYPE>>8)) result.add("none");
|
||||
//if (prop == (UCD_Types.JOINING_TYPE>>8)) result.add("Non_Joining");
|
||||
//if (prop == (UCD_Types.NUMERIC_TYPE>>8)) result.add("None");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public List _getNameAliases(List result) {
|
||||
if (result == null)
|
||||
result = new ArrayList();
|
||||
addUnique(Utility.getUnskeleton(up.getName(UCD_Types.SHORT), false), result);
|
||||
String longName = up.getName(UCD_Types.LONG);
|
||||
addUnique(Utility.getUnskeleton(longName, true), result);
|
||||
// hack
|
||||
if (longName.equals("White_Space"))
|
||||
addUnique("space", result);
|
||||
return result;
|
||||
}
|
||||
|
||||
public List _getValueAliases(String valueAlias, List result) {
|
||||
if (result == null)
|
||||
result = new ArrayList();
|
||||
int type = getType() & CORE_MASK;
|
||||
if (type == STRING || type == MISC || type == NUMERIC) {
|
||||
UnicodeProperty.addUnique(valueAlias, result);
|
||||
return result;
|
||||
} else if (type == BINARY) {
|
||||
UnicodeProperty.addUnique(valueAlias, result);
|
||||
return lookup(valueAlias, UCD_Names.YN_TABLE_LONG, UCD_Names.YN_TABLE, null, result);
|
||||
} else if (type == ENUMERATED || type == CATALOG) {
|
||||
byte style = UCD_Types.LONG;
|
||||
int prop = propMask >> 8;
|
||||
boolean titlecase = false;
|
||||
for (int i = 0; i < 256; ++i) {
|
||||
try {
|
||||
switch (prop) {
|
||||
case UCD_Types.CATEGORY >> 8:
|
||||
return lookup(valueAlias, UCD_Names.LONG_GENERAL_CATEGORY, UCD_Names.GENERAL_CATEGORY, UCD_Names.EXTRA_GENERAL_CATEGORY, result);
|
||||
case UCD_Types.COMBINING_CLASS >> 8:
|
||||
addUnique(String.valueOf(0xFF & Utility.lookup(valueAlias, UCD_Names.LONG_COMBINING_CLASS, true)), result);
|
||||
return lookup(valueAlias, UCD_Names.LONG_COMBINING_CLASS, UCD_Names.COMBINING_CLASS, null, result);
|
||||
case UCD_Types.BIDI_CLASS >> 8:
|
||||
return lookup(valueAlias, UCD_Names.LONG_BIDI_CLASS, UCD_Names.BIDI_CLASS, null, result);
|
||||
case UCD_Types.DECOMPOSITION_TYPE >> 8:
|
||||
return lookup(valueAlias, UCD_Names.LONG_DECOMPOSITION_TYPE, UCD_Names.DECOMPOSITION_TYPE, null, result);
|
||||
case UCD_Types.NUMERIC_TYPE >> 8:
|
||||
return lookup(valueAlias, UCD_Names.LONG_NUMERIC_TYPE, UCD_Names.NUMERIC_TYPE, null, result);
|
||||
case UCD_Types.EAST_ASIAN_WIDTH >> 8:
|
||||
return lookup(valueAlias, UCD_Names.LONG_EAST_ASIAN_WIDTH, UCD_Names.EAST_ASIAN_WIDTH, null, result);
|
||||
case UCD_Types.LINE_BREAK >> 8:
|
||||
lookup(valueAlias, UCD_Names.LONG_LINE_BREAK, UCD_Names.LINE_BREAK, null, result);
|
||||
if (valueAlias.equals("Inseparable"))
|
||||
addUnique("Inseperable", result);
|
||||
// Inseparable; Inseperable
|
||||
return result;
|
||||
case UCD_Types.JOINING_TYPE >> 8:
|
||||
return lookup(valueAlias, UCD_Names.LONG_JOINING_TYPE, UCD_Names.JOINING_TYPE, null, result);
|
||||
case UCD_Types.JOINING_GROUP >> 8:
|
||||
return lookup(valueAlias, UCD_Names.JOINING_GROUP, null, null, result);
|
||||
case UCD_Types.SCRIPT >> 8:
|
||||
return lookup(valueAlias, UCD_Names.LONG_SCRIPT, UCD_Names.SCRIPT, UCD_Names.EXTRA_SCRIPT, result);
|
||||
case UCD_Types.AGE >> 8:
|
||||
return lookup(valueAlias, UCD_Names.AGE, null, null, result);
|
||||
case UCD_Types.HANGUL_SYLLABLE_TYPE >> 8:
|
||||
return lookup(valueAlias, UCD_Names.LONG_HANGUL_SYLLABLE_TYPE, UCD_Names.HANGUL_SYLLABLE_TYPE, null, result);
|
||||
default:
|
||||
throw new IllegalArgumentException("Internal Error: " + prop);
|
||||
}
|
||||
} catch (ArrayIndexOutOfBoundsException e) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
throw new ArrayIndexOutOfBoundsException("not supported yet");
|
||||
}
|
||||
|
||||
public String _getValue(int codepoint) {
|
||||
byte style = UCD_Types.LONG;
|
||||
String temp = null;
|
||||
boolean titlecase = false;
|
||||
switch (propMask >> 8) {
|
||||
case UCD_Types.CATEGORY >> 8:
|
||||
temp = (ucd.getCategoryID_fromIndex(ucd.getCategory(codepoint), style));
|
||||
break;
|
||||
case UCD_Types.COMBINING_CLASS >> 8:
|
||||
temp = (ucd.getCombiningClassID_fromIndex(ucd.getCombiningClass(codepoint), style));
|
||||
//if (temp.startsWith("Fixed_")) temp = temp.substring(6);
|
||||
break;
|
||||
case UCD_Types.BIDI_CLASS >> 8:
|
||||
temp = (ucd.getBidiClassID_fromIndex(ucd.getBidiClass(codepoint), style));
|
||||
break;
|
||||
case UCD_Types.DECOMPOSITION_TYPE >> 8:
|
||||
temp = (ucd.getDecompositionTypeID_fromIndex(ucd.getDecompositionType(codepoint), style));
|
||||
if (temp == null || temp.length() == 0)
|
||||
temp = "none";
|
||||
break;
|
||||
case UCD_Types.NUMERIC_TYPE >> 8:
|
||||
temp = (ucd.getNumericTypeID_fromIndex(ucd.getNumericType(codepoint), style));
|
||||
titlecase = true;
|
||||
if (temp == null || temp.length() == 0)
|
||||
temp = "None";
|
||||
break;
|
||||
case UCD_Types.EAST_ASIAN_WIDTH >> 8:
|
||||
temp = (ucd.getEastAsianWidthID_fromIndex(ucd.getEastAsianWidth(codepoint), style));
|
||||
break;
|
||||
case UCD_Types.LINE_BREAK >> 8:
|
||||
temp = (ucd.getLineBreakID_fromIndex(ucd.getLineBreak(codepoint), style));
|
||||
break;
|
||||
case UCD_Types.JOINING_TYPE >> 8:
|
||||
temp = (ucd.getJoiningTypeID_fromIndex(ucd.getJoiningType(codepoint), style));
|
||||
if (temp == null || temp.length() == 0)
|
||||
temp = "Non_Joining";
|
||||
break;
|
||||
case UCD_Types.JOINING_GROUP >> 8:
|
||||
temp = (ucd.getJoiningGroupID_fromIndex(ucd.getJoiningGroup(codepoint), style));
|
||||
break;
|
||||
case UCD_Types.SCRIPT >> 8:
|
||||
temp = (ucd.getScriptID_fromIndex(ucd.getScript(codepoint), style));
|
||||
if (temp != null)
|
||||
temp = UCharacter.toTitleCase(Locale.ENGLISH, temp, null);
|
||||
titlecase = true;
|
||||
break;
|
||||
case UCD_Types.AGE >> 8:
|
||||
temp = getAge(codepoint);
|
||||
break;
|
||||
case UCD_Types.HANGUL_SYLLABLE_TYPE >> 8:
|
||||
temp = (ucd.getHangulSyllableTypeID_fromIndex(ucd.getHangulSyllableType(codepoint), style));
|
||||
break;
|
||||
}
|
||||
if (temp != null)
|
||||
return Utility.getUnskeleton(temp, titlecase);
|
||||
if (isType(BINARY_MASK)) {
|
||||
return up.hasValue(codepoint) ? "True" : "False";
|
||||
}
|
||||
throw new IllegalArgumentException("Failed to find value for " + Utility.hex(codepoint));
|
||||
}
|
||||
|
||||
public String getAge(int codePoint) {
|
||||
if (codePoint == 0xF0000) {
|
||||
System.out.println("debug point");
|
||||
}
|
||||
if (needAgeCache) {
|
||||
for (int i = UCD_Types.AGE11; i < UCD_Types.LIMIT_AGE; ++i) {
|
||||
ucdCache[i] = UCD.make(UCD_Names.AGE_VERSIONS[i]);
|
||||
}
|
||||
needAgeCache = false;
|
||||
}
|
||||
for (int i = UCD_Types.AGE11; i < UCD_Types.LIMIT_AGE; ++i) {
|
||||
if (ucdCache[i].isAllocated(codePoint))
|
||||
return UCD_Names.AGE[i];
|
||||
}
|
||||
return UCD_Names.AGE[UCD_Types.UNKNOWN];
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see com.ibm.icu.dev.test.util.UnicodePropertySource#getPropertyType()
|
||||
*/
|
||||
private int getPropertyTypeInternal() {
|
||||
|
||||
switch (propMask) {
|
||||
case UCD_Types.BINARY_PROPERTIES | UCD_Types.CaseFoldTurkishI:
|
||||
case UCD_Types.BINARY_PROPERTIES | UCD_Types.Non_break:
|
||||
return EXTENDED_BINARY;
|
||||
}
|
||||
|
||||
switch (propMask >> 8) {
|
||||
case UCD_Types.SCRIPT >> 8:
|
||||
case UCD_Types.AGE >> 8:
|
||||
return CATALOG;
|
||||
}
|
||||
int mask = 0;
|
||||
if (!up.isStandard())
|
||||
mask = EXTENDED_MASK;
|
||||
return remapUCDType(up.getValueType()) | mask;
|
||||
}
|
||||
|
||||
public String _getVersion() {
|
||||
return up.ucd.getVersion();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private int remapUCDType(int result) {
|
||||
switch (result) {
|
||||
case UCD_Types.NUMERIC_PROP:
|
||||
result = UnicodeProperty.NUMERIC;
|
||||
break;
|
||||
case UCD_Types.STRING_PROP:
|
||||
result = UnicodeProperty.STRING;
|
||||
break;
|
||||
case UCD_Types.MISC_PROP:
|
||||
result = UnicodeProperty.STRING;
|
||||
break;
|
||||
case UCD_Types.CATALOG_PROP:
|
||||
result = UnicodeProperty.ENUMERATED;
|
||||
break;
|
||||
case UCD_Types.FLATTENED_BINARY_PROP:
|
||||
case UCD_Types.ENUMERATED_PROP:
|
||||
result = UnicodeProperty.ENUMERATED;
|
||||
break;
|
||||
case UCD_Types.BINARY_PROP:
|
||||
result = UnicodeProperty.BINARY;
|
||||
break;
|
||||
case UCD_Types.UNKNOWN_PROP:
|
||||
default:
|
||||
result = UnicodeProperty.STRING;
|
||||
//throw new IllegalArgumentException("Type: UNKNOWN_PROP");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static List lookup(String valueAlias, String[] main, String[] aux, Map aux2, List result) {
|
||||
//System.out.println(valueAlias + "=>");
|
||||
//System.out.println("=>" + aux[pos]);
|
||||
if (aux != null) {
|
||||
int pos = 0xFF & Utility.lookup(valueAlias, main, true);
|
||||
UnicodeProperty.addUnique(aux[pos], result);
|
||||
}
|
||||
UnicodeProperty.addUnique(valueAlias, result);
|
||||
if (aux2 != null) {
|
||||
String xtra = (String) aux2.get(valueAlias);
|
||||
if (xtra != null)
|
||||
UnicodeProperty.addUnique(xtra, result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
static class DerivedPropertyWrapper extends UnicodeProperty {
|
||||
UCDProperty derivedProperty;
|
||||
UCD ucd;
|
||||
|
||||
DerivedPropertyWrapper(int derivedPropertyID, UCD ucd) {
|
||||
this.ucd = ucd;
|
||||
derivedProperty = DerivedProperty.make(derivedPropertyID, ucd);
|
||||
}
|
||||
protected String _getVersion() {
|
||||
return ucd.getVersion();
|
||||
}
|
||||
|
||||
protected String _getValue(int codepoint) {
|
||||
return derivedProperty.getValue(codepoint, UCD_Types.LONG);
|
||||
}
|
||||
protected List _getNameAliases(List result) {
|
||||
if (result != null) result = new ArrayList(1);
|
||||
addUnique(derivedProperty.getName(UCD_Types.SHORT), result);
|
||||
addUnique(derivedProperty.getName(UCD_Types.LONG), result);
|
||||
return null;
|
||||
}
|
||||
|
||||
protected List _getValueAliases(String valueAlias, List result) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
protected List _getAvailableValues(List result) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
||||
*/
|
||||
}
|
@ -1,226 +0,0 @@
|
||||
<html>
|
||||
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=windows-1252">
|
||||
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
|
||||
<meta name="ProgId" content="FrontPage.Editor.Document">
|
||||
<title>Unicode Character Database</title>
|
||||
<style>
|
||||
<!--
|
||||
table { padding: 4 }
|
||||
td { padding: 4 }
|
||||
-->
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<span class="cb" id style="DISPLAY: block">
|
||||
<h1 align="center">Unicode Character Database (UCD) in XML Format</h1>
|
||||
<h1 align="center"><b><font color="#FF0000">WARNING: FORMAT IS DRAFT!</font></b></h1>
|
||||
<p align="center">MD 2000.10.16</p>
|
||||
<table border="1" width="40%" align="right" cellspacing="4" cellpadding="0">
|
||||
<tr>
|
||||
<td width="100%" bgcolor="#C0C0C0"><span class="cb" id
|
||||
style="DISPLAY: block">
|
||||
<h4 align="center">Using Internet Explorer</h4>
|
||||
<p>The UCD-Main.xml file can be read in Internet Explorer (5.0 and above).
|
||||
However:</p>
|
||||
<ul>
|
||||
<li>It may take a few minutes to load completely.</li>
|
||||
<li>The XML parser in IE does not appear to be conformant: it seems to
|
||||
break on</span> the following valid code points (and others):
|
||||
<ul>
|
||||
<li>&lt;IEbugs<br>
c1='&amp;#xFFF9;'<br>
c2='&amp;#xFFFA;'<br>
c3='&amp;#xFFFB;'<br>
c4='&amp;#xFFFC;'<br>
c5='&amp;#xFFFD;'<br>
c6='&amp;#xF0000;'<br>
c7='&amp;#xFFFFD;'<br>
c8='&amp;#x100000;'<br>
c9='&amp;#x10FFFD;'/&gt;</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
<p><a href="UCD-Main.xml">UCD-Main.xml</a> provides an XML format for the main
|
||||
files in the Unicode Character Database. These include:</p>
|
||||
<ul>
|
||||
<li><code>UnicodeData.txt</code></li>
|
||||
<li><code>ArabicShaping.txt</code></li>
|
||||
<li><code>Jamo.txt</code></li>
|
||||
<li><code>SpecialCasing.txt</code></li>
|
||||
<li><code>CompositionExclusions.txt</code></li>
|
||||
<li><code>EastAsianWidth.txt</code></li>
|
||||
<li><code>LineBreak.txt</code></li>
|
||||
<li><code>BidiMirroring.txt</code></li>
|
||||
<li><code>CaseFolding.txt</code></li>
|
||||
<li><code>Blocks.txt</code></li>
|
||||
<li><code>PropList.alpha.txt</code></li>
|
||||
</ul>
|
||||
<p>Other files in the UCD have very different structure or purpose, and are best
|
||||
expressed with separate files. Some annotational data, such as that in
|
||||
NamesList.txt or the 10646 comment in UnicodeData, is also best served with
|
||||
separate files. The current UCD files not yet in XML format are:</p>
|
||||
<ul>
|
||||
<li><code>Unihan.txt</code></li>
|
||||
<li><code>NamesList.txt</code></li>
|
||||
<li><code>Index.txt</code></li>
|
||||
<li><code>NormalizationTest.txt</code></li>
|
||||
</ul>
|
||||
<h3>Format</h3>
|
||||
<p>The Unicode blocks are provided as a list of &lt;block .../&gt; elements,
|
||||
with attributes providing the start, end, and name.</p>
|
||||
<p>Each assigned code point is a &lt;e .../&gt; element, with attributes
|
||||
supplying specific properties. The meaning of the attributes is specified below.
|
||||
There is one exception: large ranges of code points for characters such as
|
||||
Hangul Syllables are abbreviated by indicating the start and end of the range.</p>
|
||||
<p>Because of the volume of data, the attribute names are abbreviated. A <a
|
||||
href="#AttributeAbbreviations">key</a> explains the abbreviations, and relates
|
||||
them to the fields and values of the original UCD semicolon-delimited files.
|
||||
With few exceptions, the values in the XML are directly copied from data in the
|
||||
original UCD semicolon-delimited files. Those exceptions are described <a
|
||||
href="http://www.unicode.org/Public/3.0-Update1/UnicodeCharacterDatabase-3.0.1.html#DataModifications">below</a>.</p>
|
||||
<p>Numeric character references (NCRs) are used to encode the Unicode code
|
||||
points. Some Unicode code points cannot be transmitted in XML, even as NCRs (see
|
||||
<a href="http://www.w3.org/TR/REC-xml#charsets">http://www.w3.org/TR/REC-xml#charsets</a>),
|
||||
or would not be visibly distinct (TAB, CR, LF) in the data. Such code points are
|
||||
represented by '#xX;', where X is a hex number.</p>
|
||||
<h3><a name="AttributeAbbreviations">Attribute Abbreviations</a></h3>
|
||||
<p>To reduce the size of the document, the following attribute abbreviations are
|
||||
used. If an attribute is missing, that means it gets a default value. The
|
||||
defaults are listed in parentheses below. If there is no specific default, then
|
||||
a missing attribute should be read as N/A (not applicable). A default with '='
|
||||
means the default is the value of another field (recursively!). Thus if
|
||||
the titlecase attribute is missing, then the value is the same as the uppercase.
|
||||
If that in turn is missing, then the value is the same as the code point itself.</p>
|
||||
<p>For a description of the source files, see <a
|
||||
href="http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html">UnicodeCharacterDatabase.html</a>.
|
||||
That file also has links to the descriptions of the fields within the files.
|
||||
Since the PropList values are so long, they will probably also be abbreviated in
|
||||
the future.</p>
|
||||
<table border="1" width="100%">
|
||||
<tr>
|
||||
<td width="50%" valign="top"><span class="cb" id style="DISPLAY: block">
|
||||
<h4>UnicodeData</h4>
|
||||
<p> c: code point<br>
|
||||
n: name<br>
|
||||
gc: general category (Lo)<br>
|
||||
cc: combining class (0)<br>
|
||||
bc: bidi category (L)<br>
|
||||
dm: decomposition mapping<br>
|
||||
dt: decomposition type (canonical)<br>
|
||||
nt: numeric type<br>
|
||||
nv: numeric value<br>
|
||||
bm: bidi mirrored (N)<br>
|
||||
uc: uppercase (=c)<br>
|
||||
lc: lowercase (=c)<br>
|
||||
tc: titlecase (=uc)</p>
|
||||
<h4>SpecialCasing:</h4>
|
||||
<p> sl: special lower (=lc)<br>
|
||||
su: special upper (=uc)<br>
|
||||
st: special title (=su)<br>
|
||||
sc: special case condition</p>
|
||||
<h4>CaseFolding:</h4>
|
||||
<p> fc: foldcase (=sl)</span></td>
|
||||
<td width="50%" valign="top"><span class="cb" id style="DISPLAY: block">
|
||||
<h4>CompositionExclusions:</h4>
|
||||
<p> ce: composition exclusion (N)</p>
|
||||
<h4>EastAsianWidth:</h4>
|
||||
<p> ea: east asian width (N)</p>
|
||||
<h4>Jamo:</h4>
|
||||
<p> jn: jamo name</p>
|
||||
<h4>LineBreak:</h4>
|
||||
<p> lb: line break class (AL)</p>
|
||||
<h4>ArabicShaping:</h4>
|
||||
<p> jt: joining type<br>
|
||||
jg: joining group</p>
|
||||
<h4>BidiMirroring:</h4>
|
||||
<p> bg: bidi mirroring glyph (=c)</p>
|
||||
<p><b>PropList:</b></p>
|
||||
<p> xs: space-delimited list of properties from the file</p>
|
||||
<p><b><i>WARNING: these values are likely to change!</i></b></span></td>
|
||||
</tr>
|
||||
</table>
|
||||
<br>
|
||||
<h3><a name="DataModifications">Data Modifications</a></h3>
|
||||
</span>
|
||||
<p>The XML format is generated from the original semicolon-delimited UCD files.
|
||||
In general, all fields and values are direct copies. However, there are some
|
||||
changes, detailed below.</p>
|
||||
<h4>1. Some redundant or annotational fields are omitted</h4>
|
||||
<table border="1" width="100%">
|
||||
<tr>
|
||||
<td width="50%" valign="top"><b>UnicodeData<br>
|
||||
</b>1.0 Name<br>
|
||||
10646 comment<br>
|
||||
<br>
|
||||
<b>CaseFolding<br>
|
||||
</b>Type (since it is computable from whether the fold equals the normal
|
||||
lowercase)
|
||||
<p><b>ArabicShaping<br>
|
||||
</b>Name<br>
|
||||
<br>
|
||||
<b>EastAsianWidth<br>
|
||||
</b>Name<br>
|
||||
<br>
|
||||
<b>LineBreak<br>
|
||||
</b>Name</p>
|
||||
</td>
|
||||
<td width="50%" valign="top"><b>PropList</b><font face="Times New Roman"
|
||||
color="#000000">
|
||||
<p>The fields are based on the proposed PropList.alpha, which changes the
|
||||
fields considerably.</p>
|
||||
</font>
|
||||
<p><span class="cb" id style="display: block"><b><i>WARNING: other values
|
||||
are also likely to change!</i></b></span></p>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
<h4>2. Some fields are broken into several fields; others may be combined into a
|
||||
single field</h4>
|
||||
<ul>
|
||||
<li><b>dt: </b>decomposition tag
|
||||
<ul>
|
||||
<li>the 'tag' field extracted from the decomposition mapping. If there is
|
||||
no tag, the value is "canonical". Only has meaning if there is
|
||||
a decomposition (<b>dm</b>).</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><b>nt: </b>numeric type
|
||||
<ul>
|
||||
<li>an enumeration [decimal, digit, numeric] for the type of number. It
|
||||
replaces having duplicate field values for numbers</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><b>rg: </b>range
|
||||
<ul>
|
||||
<li>used for ranges of values that share characteristics, instead of
|
||||
having to do a substring check.<br>
|
||||
"START" corresponds to "&lt;..., First&gt;"<br>
|
||||
"END" corresponds to "&lt;..., Last&gt;"</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><b>nc: </b>name computed
|
||||
<ul>
|
||||
<li>if "COMPUTED", indicates that the name must be computed:
|
||||
e.g. Hangul Syllables, Ideographs</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><b>na: </b>name annotation
|
||||
<ul>
|
||||
<li>used for code points that do not really have associated names, like
|
||||
control characters and private use characters. The data in that case is
|
||||
either extracted from the "&lt;...&gt;" style name in the old
|
||||
format, or gotten from the "1.0 Unicode name".</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
File diff suppressed because it is too large
Load Diff
@ -1,180 +0,0 @@
|
||||
package com.ibm.text.UCD;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
public abstract class UCDProperty implements UCD_Types {
|
||||
|
||||
// TODO: turn all of these into privates, and use setters only
|
||||
|
||||
protected UCD ucd;
|
||||
protected boolean isStandard = true;
|
||||
protected byte type = NOT_DERIVED;
|
||||
private byte valueType = BINARY_PROP;
|
||||
protected boolean hasUnassigned = false;
|
||||
protected boolean isBinary = true;
|
||||
protected byte defaultValueStyle = SHORT;
|
||||
protected byte defaultPropertyStyle = LONG;
|
||||
protected String valueName;
|
||||
protected String numberValueName;
|
||||
protected String shortValueName;
|
||||
protected String header;
|
||||
protected String subheader;
|
||||
protected String name;
|
||||
protected String shortName;
|
||||
protected String numberName;
|
||||
protected boolean skeletonize = true;
|
||||
|
||||
/**
|
||||
* Return the UCD in use
|
||||
*/
|
||||
public UCD getUCD() { return ucd; }
|
||||
|
||||
/**
|
||||
* Is it part of the standard, or just for my testing?
|
||||
*/
|
||||
public boolean isStandard() { return isStandard; }
|
||||
public void setStandard(boolean in) { isStandard = in; }
|
||||
|
||||
public boolean skipInDerivedListing() {return false;}
|
||||
public boolean isDefaultValue() {return false;}
|
||||
|
||||
/**
|
||||
* What type is it? DERIVED..
|
||||
*/
|
||||
public byte getType() { return type; }
|
||||
public void setType(byte in) { type = in; }
|
||||
|
||||
/**
|
||||
* Does getProperty vary in contents? ENUMERATED,...
|
||||
*/
|
||||
public byte getValueType() { return valueType; }
|
||||
public void setValueType(byte in) { valueType = in; }
|
||||
|
||||
/**
|
||||
* Does it apply to any unassigned characters?
|
||||
*/
|
||||
public boolean hasUnassigned() { return hasUnassigned; }
|
||||
public void setHasUnassigned(boolean in) { hasUnassigned = in; }
|
||||
|
||||
/** Header used in DerivedXXX files
|
||||
*/
|
||||
public String getHeader() { return header; }
|
||||
public void setHeader(String in) { header = in; }
|
||||
|
||||
/** Header used in DerivedXXX files
|
||||
*/
|
||||
public String getSubHeader() { return subheader; }
|
||||
public void setSubHeader(String in) { subheader = in; }
|
||||
|
||||
/**
|
||||
* Get the full name. Style is SHORT, NORMAL, LONG
|
||||
*/
|
||||
public String getFullName(byte style) {
|
||||
return getPropertyName(style) + "=" + getValue(style);
|
||||
}
|
||||
|
||||
public String getFullName() {
|
||||
return getFullName(NORMAL);
|
||||
}
|
||||
/**
|
||||
* Get the property name. Style is SHORT, NORMAL, LONG
|
||||
*/
|
||||
public String getPropertyName(byte style) {
|
||||
if (style == NORMAL) style = defaultPropertyStyle;
|
||||
switch (style) {
|
||||
case LONG: return skeletonize ? Utility.getUnskeleton(name.toString(), false) : name.toString();
|
||||
case SHORT: return shortName.toString();
|
||||
case NUMBER: return numberName.toString();
|
||||
default: throw new IllegalArgumentException("Bad property: " + style);
|
||||
}
|
||||
}
|
||||
|
||||
public String getPropertyName() { return getPropertyName(NORMAL); }
|
||||
|
||||
public void setPropertyName(byte style, String in) {
|
||||
if (style == NORMAL) style = defaultPropertyStyle;
|
||||
switch (style) {
|
||||
case LONG: name = Utility.getUnskeleton(in, false); break;
|
||||
case SHORT: shortName = in; break;
|
||||
case NUMBER: numberName = in; break;
|
||||
default: throw new IllegalArgumentException("Bad property: " + style);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the value name. Style is SHORT, NORMAL, LONG
|
||||
* "" if hasValue is false
|
||||
* MUST OVERRIDE getValue(cp...) if valueVaries
|
||||
*/
|
||||
public String getValue(int cp, byte style) {
|
||||
if (!hasValue(cp)) return "";
|
||||
return getValue(style);
|
||||
}
|
||||
|
||||
public String getValue(int cp) { return getValue(cp, NORMAL); }
|
||||
|
||||
public void setValue(byte style, String in) {
|
||||
if (getValueType() < BINARY_PROP) throw new IllegalArgumentException("Can't set varying value: " + style);
|
||||
if (style == NORMAL) style = defaultValueStyle;
|
||||
switch (style) {
|
||||
case LONG: valueName = Utility.getUnskeleton(in, false); break;
|
||||
case SHORT: shortValueName = in; break;
|
||||
case NUMBER: numberValueName = in; break;
|
||||
default: throw new IllegalArgumentException("Bad value: " + style);
|
||||
}
|
||||
}
|
||||
|
||||
public String getValue(byte style) {
|
||||
if (getValueType() < BINARY_PROP) throw new IllegalArgumentException(
|
||||
"Value varies in " + getName(LONG) + "; call getValue(cp)");
|
||||
try {
|
||||
if (style == NORMAL) style = defaultValueStyle;
|
||||
switch (style) {
|
||||
case LONG: return Utility.getUnskeleton(valueName.toString(), false);
|
||||
case SHORT: return shortValueName.toString();
|
||||
case NUMBER: return numberValueName.toString();
|
||||
default: throw new IllegalArgumentException("Bad property: " + style);
|
||||
}
|
||||
} catch (RuntimeException e) {
|
||||
throw new com.ibm.text.utility.ChainException("Unset value string in " + getName(LONG), null, e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* special hack for NFD/NFKD
|
||||
*/
|
||||
public String getListingValue(int cp) {
|
||||
if (getValueType() != BINARY_PROP) return getValue(cp, LONG);
|
||||
return getPropertyName(LONG);
|
||||
}
|
||||
|
||||
/**
|
||||
* Does it have the propertyValue?
|
||||
*/
|
||||
abstract public boolean hasValue(int cp);
|
||||
|
||||
/**
|
||||
* Get the set of characters it contains
|
||||
*/
|
||||
|
||||
private UnicodeSet cache = null;
|
||||
|
||||
public UnicodeSet getSet() {
|
||||
if (cache == null) {
|
||||
cache = new UnicodeSet();
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
if (hasValue(cp)) cache.add(cp);
|
||||
}
|
||||
}
|
||||
return (UnicodeSet) cache.clone();
|
||||
}
|
||||
|
||||
///////////////////////////////////////////
|
||||
|
||||
// Old Name for compatibility
|
||||
boolean isTest() { return isStandard(); }
|
||||
String getName(byte style) { return getPropertyName(style); }
|
||||
String getName() { return getPropertyName(); }
|
||||
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,575 +0,0 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
|
||||
* $Date: 2006/04/05 22:12:44 $
|
||||
* $Revision: 1.33 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
/**
 * Constant pool shared by the UCD tools: directory locations, code point
 * ranges, and the numeric codes used for property kinds and property values.
 *
 * <p>All members are interface fields and therefore implicitly
 * {@code public static final}; the modifiers are omitted.
 */
public interface UCD_Types {

    /** Bumped if binary format of UCD changes. Forces rebuild. */
    byte BINARY_FORMAT = 17;

    // ---- Data directories (hard-coded Windows paths) ----
    String BASE_DIR = "C:\\DATA\\";
    String UCD_DIR = BASE_DIR + "UCD\\";
    String BIN_DIR = BASE_DIR + "BIN\\";
    String GEN_DIR = BASE_DIR + "GEN\\";

    char DOTTED_CIRCLE = '\u25CC';

    // ---- CJK ranges: BASE is inclusive, LIMIT is one past the last code point ----
    int CJK_BASE = 0x4E00;
    int CJK_LIMIT = 0x9FFF + 1;
    int CJK_COMPAT_USED_BASE = 0xFA0E;
    int CJK_COMPAT_USED_LIMIT = 0xFA2F + 1;
    int CJK_A_BASE = 0x3400;
    int CJK_A_LIMIT = 0x4DBF + 1;
    int CJK_B_BASE = 0x20000;
    int CJK_B_LIMIT = 0x2A6DF + 1;

    // ---- Unicode Property Types ----
    byte NOT_DERIVED = 1;
    byte DERIVED_CORE = 2;
    byte DERIVED_NORMALIZATION = 4;
    byte DERIVED_ALL = 0x6;
    byte ALL = (byte) -1;

    // ---- Property value types ----
    byte NUMERIC_PROP = 0;
    byte STRING_PROP = 1;
    byte MISC_PROP = 2;
    byte CATALOG_PROP = 3;
    byte ENUMERATED_PROP = 4;
    byte BINARY_PROP = 5;
    byte FLATTENED_BINARY_PROP = 6;
    byte UNKNOWN_PROP = 7;

    /*
     * UnicodeData field numbers, for reference:
     *  0  Code value in 4-digit hexadecimal format.
     *  1  Unicode 2.1 Character Name.
     *  2  General Category.
     *  3  Canonical Combining Classes.
     *  4  Bidirectional Category.
     *  5  Character Decomposition.
     *  6  Decimal digit value. This is a numeric field.
     *  7  Digit value. This is a numeric field.
     *  8  Numeric value. This is a numeric field.
     *  9  Whether the character has been identified as a "mirrored" character.
     * 10  Unicode 1.0 Name. This is the old name as published in Unicode 1.0.
     * 11  10646 Comment field. This field is informative.
     * 12  Upper case equivalent mapping.
     * 13  Lower case equivalent mapping. Similar to 12. This field is informative.
     * 14  Title case equivalent mapping. Similar to 12. This field is informative.
     */

    // ---- Name styles, for IDs ----
    byte NUMBER = -2;
    byte SHORT = -1;
    byte NORMAL = 0;
    byte LONG = 1;
    byte BOTH = 2;
    byte EXTRA_ALIAS = 3;

    // ---- Binary ENUM Grouping: each group starts NEXT_ENUM after the previous one ----
    int CATEGORY = 0;
    int COMBINING_CLASS = 0x100;
    int BIDI_CLASS = 0x200;
    int DECOMPOSITION_TYPE = 0x300;
    int NUMERIC_TYPE = 0x400;
    int EAST_ASIAN_WIDTH = 0x500;
    int LINE_BREAK = 0x600;
    int JOINING_TYPE = 0x700;
    int JOINING_GROUP = 0x800;
    int BINARY_PROPERTIES = 0x900;
    int SCRIPT = 0xA00;
    int AGE = 0xB00;
    int HANGUL_SYLLABLE_TYPE = 0xC00;
    int DERIVED = 0xD00;
    int LIMIT_ENUM = DERIVED + 0x100;
    int NEXT_ENUM = 0x100;

    int LIMIT_COMBINING_CLASS = 256;

    // ---- General category codes (getCategory) ----
    byte UNASSIGNED = 0;
    byte UPPERCASE_LETTER = 1;
    byte LOWERCASE_LETTER = 2;
    byte TITLECASE_LETTER = 3;
    byte MODIFIER_LETTER = 4;
    byte OTHER_LETTER = 5;
    byte NON_SPACING_MARK = 6;
    byte ENCLOSING_MARK = 7;
    byte COMBINING_SPACING_MARK = 8;
    byte DECIMAL_DIGIT_NUMBER = 9;
    byte LETTER_NUMBER = 10;
    byte OTHER_NUMBER = 11;
    byte SPACE_SEPARATOR = 12;
    byte LINE_SEPARATOR = 13;
    byte PARAGRAPH_SEPARATOR = 14;
    byte CONTROL = 15;
    byte FORMAT = 16;
    byte UNUSED_CATEGORY = 17;
    byte PRIVATE_USE = 18;
    byte SURROGATE = 19;
    byte DASH_PUNCTUATION = 20;
    byte START_PUNCTUATION = 21;
    byte END_PUNCTUATION = 22;
    byte CONNECTOR_PUNCTUATION = 23;
    byte OTHER_PUNCTUATION = 24;
    byte MATH_SYMBOL = 25;
    byte CURRENCY_SYMBOL = 26;
    byte MODIFIER_SYMBOL = 27;
    byte OTHER_SYMBOL = 28;
    byte INITIAL_PUNCTUATION = 29;
    byte FINAL_PUNCTUATION = 30;
    byte LIMIT_CATEGORY = FINAL_PUNCTUATION + 1;

    // ---- Unicode abbreviations for the general categories ----
    byte Lu = UPPERCASE_LETTER;
    byte Ll = LOWERCASE_LETTER;
    byte Lt = TITLECASE_LETTER;
    byte Lm = MODIFIER_LETTER;
    byte Lo = OTHER_LETTER;
    byte Mn = NON_SPACING_MARK;
    byte Me = ENCLOSING_MARK;
    byte Mc = COMBINING_SPACING_MARK;
    byte Nd = DECIMAL_DIGIT_NUMBER;
    byte Nl = LETTER_NUMBER;
    byte No = OTHER_NUMBER;
    byte Zs = SPACE_SEPARATOR;
    byte Zl = LINE_SEPARATOR;
    byte Zp = PARAGRAPH_SEPARATOR;
    byte Cc = CONTROL;
    byte Cf = FORMAT;
    byte Cs = SURROGATE;
    byte Co = PRIVATE_USE;
    byte Cn = UNASSIGNED;
    byte Pc = CONNECTOR_PUNCTUATION;
    byte Pd = DASH_PUNCTUATION;
    byte Ps = START_PUNCTUATION;
    byte Pe = END_PUNCTUATION;
    byte Po = OTHER_PUNCTUATION;
    byte Pi = INITIAL_PUNCTUATION;
    byte Pf = FINAL_PUNCTUATION;
    byte Sm = MATH_SYMBOL;
    byte Sc = CURRENCY_SYMBOL;
    byte Sk = MODIFIER_SYMBOL;
    byte So = OTHER_SYMBOL;

    // ---- Bit masks over general categories: bit n is set for category code n ----
    int LETTER_MASK = (1 << Lu) | (1 << Ll) | (1 << Lt) | (1 << Lm) | (1 << Lo);
    int CASED_LETTER_MASK = (1 << Lu) | (1 << Ll) | (1 << Lt);
    int MARK_MASK = (1 << Mn) | (1 << Me) | (1 << Mc);
    int NUMBER_MASK = (1 << Nd) | (1 << Nl) | (1 << No);
    int SEPARATOR_MASK = (1 << Zs) | (1 << Zl) | (1 << Zp);
    int CONTROL_MASK = (1 << Cc) | (1 << Cf) | (1 << Cs) | (1 << Co);
    int PUNCTUATION_MASK = (1 << Pc) | (1 << Pd) | (1 << Ps) | (1 << Pe) | (1 << Po) | (1 << Pi) | (1 << Pf);
    int SYMBOL_MASK = (1 << Sm) | (1 << Sc) | (1 << Sk) | (1 << So);
    int UNASSIGNED_MASK = (1 << Cn);
    int BASE_MASK = LETTER_MASK | NUMBER_MASK | PUNCTUATION_MASK | SYMBOL_MASK | (1 << Mc);
    int NONSPACING_MARK_MASK = (1 << Mn) | (1 << Me);

    // ---- Binary Properties ----
    byte BidiMirrored = 0;
    byte CompositionExclusion = 1;
    byte White_space = 2;
    byte Non_break = 3;
    byte Bidi_Control = 4;
    byte Join_Control = 5;
    byte Dash = 6;
    byte Hyphen = 7;
    byte Quotation_Mark = 8;
    byte Terminal_Punctuation = 9;
    byte Math_Property = 10;
    byte Hex_Digit = 11;
    byte ASCII_Hex_Digit = 12;
    byte Other_Alphabetic = 13;
    byte Ideographic = 14;
    byte Diacritic = 15;
    byte Extender = 16;
    byte Other_Lowercase = 17;
    byte Other_Uppercase = 18;
    byte Noncharacter_Code_Point = 19;
    byte CaseFoldTurkishI = 20;
    byte Other_GraphemeExtend = 21;
    byte GraphemeLink = 22;
    byte IDS_BinaryOperator = 23;
    byte IDS_TrinaryOperator = 24;
    byte Radical = 25;
    byte UnifiedIdeograph = 26;
    byte Other_Default_Ignorable_Code_Point = 27;
    byte Deprecated = 28;
    byte Soft_Dotted = 29;
    byte Logical_Order_Exception = 30;
    byte Other_ID_Start = 31;
    byte Sentence_Terminal = 32;
    byte Variation_Selector = 33;
    byte Other_ID_Continue = 34;
    byte Pattern_White_Space = 35;
    byte Pattern_Syntax = 36;
    byte LIMIT_BINARY_PROPERTIES = 37;

    // ---- Line break classes ----
    byte LB_XX = 0;
    byte LB_OP = 1;
    byte LB_CL = 2;
    byte LB_QU = 3;
    byte LB_GL = 4;
    byte LB_NS = 5;
    byte LB_EX = 6;
    byte LB_SY = 7;
    byte LB_IS = 8;
    byte LB_PR = 9;
    byte LB_PO = 10;
    byte LB_NU = 11;
    byte LB_AL = 12;
    byte LB_ID = 13;
    byte LB_IN = 14;
    byte LB_HY = 15;
    byte LB_CM = 16;
    byte LB_BB = 17;
    byte LB_BA = 18;
    byte LB_SP = 19;
    byte LB_BK = 20;
    byte LB_CR = 21;
    byte LB_LF = 22;
    byte LB_CB = 23;
    byte LB_SA = 24;
    byte LB_AI = 25;
    byte LB_B2 = 26;
    byte LB_SG = 27;
    byte LB_ZW = 28;
    byte LB_NL = 29;
    byte LB_WJ = 30;
    byte LB_JL = 31;
    byte LB_JV = 32;
    byte LB_JT = 33;
    byte LB_H2 = 34;
    byte LB_H3 = 35;
    byte LIMIT_LINE_BREAK = 36;
    byte LB_LIMIT = LIMIT_LINE_BREAK;

    // ---- East Asian width ----
    byte EAN = 0;
    byte EAA = 1;
    byte EAH = 2;
    byte EAW = 3;
    byte EAF = 4;
    byte EANa = 5;
    byte LIMIT_EAST_ASIAN_WIDTH = 6;

    // ---- Bidi class ----
    byte BIDI_L = 0;    // Left-Right; Most alphabetic, syllabic, and logographic characters (e.g., CJK ideographs)
    byte BIDI_R = 1;    // Right-Left; Arabic, Hebrew, and punctuation specific to those scripts
    byte BIDI_EN = 2;   // European Number
    byte BIDI_ES = 3;   // European Number Separator
    byte BIDI_ET = 4;   // European Number Terminator
    byte BIDI_AN = 5;   // Arabic Number
    byte BIDI_CS = 6;   // Common Number Separator
    byte BIDI_B = 7;    // Block Separator
    byte BIDI_S = 8;    // Segment Separator
    byte BIDI_WS = 9;   // Whitespace
    byte BIDI_ON = 10;  // Other Neutrals ; All other characters: punctuation, symbols
    byte LIMIT_BIDI_2 = 11;
    byte BIDI_UNUSED = 11;
    byte BIDI_BN = 12;
    byte BIDI_NSM = 13;
    byte BIDI_AL = 14;
    byte BIDI_LRO = 15;
    byte BIDI_RLO = 16;
    byte BIDI_LRE = 17;
    byte BIDI_RLE = 18;
    byte BIDI_PDF = 19;
    byte LIMIT_BIDI_CLASS = 20;

    // ---- Decomposition type ----
    byte NONE = 0;
    byte CANONICAL = 1;
    byte COMPATIBILITY = 2;
    byte COMPAT_UNSPECIFIED = 2;  // Otherwise unspecified compatibility character.
    byte COMPAT_FONT = 3;         // A font variant (e.g. a blackletter form).
    byte COMPAT_NOBREAK = 4;      // A no-break version of a space or hyphen.
    byte COMPAT_INITIAL = 5;      // An initial presentation form (Arabic).
    byte COMPAT_MEDIAL = 6;       // A medial presentation form (Arabic).
    byte COMPAT_FINAL = 7;        // A final presentation form (Arabic).
    byte COMPAT_ISOLATED = 8;     // An isolated presentation form (Arabic).
    byte COMPAT_CIRCLE = 9;       // An encircled form.
    byte COMPAT_SUPER = 10;       // A superscript form.
    byte COMPAT_SUB = 11;         // A subscript form.
    byte COMPAT_VERTICAL = 12;    // A vertical layout presentation form.
    byte COMPAT_WIDE = 13;        // A wide (or zenkaku) compatibility character.
    byte COMPAT_NARROW = 14;      // A narrow (or hankaku) compatibility character.
    byte COMPAT_SMALL = 15;       // A small variant form (CNS compatibility).
    byte COMPAT_SQUARE = 16;      // A CJK squared font variant.
    byte COMPAT_FRACTION = 17;    // A vulgar fraction form.
    byte LIMIT_DECOMPOSITION_TYPE = 18;

    // ---- Mirrored type ----
    byte NO = 0;
    byte YES = 1;
    byte LIMIT_MIRRORED = 2;

    // ---- For QuickCheck ----
    byte QNO = 0;
    byte QMAYBE = 1;
    byte QYES = 2;

    // ---- Case type (UNCASED and FOLD deliberately share the value 3) ----
    byte LOWER = 0;
    byte TITLE = 1;
    byte UPPER = 2;
    byte UNCASED = 3;
    byte FOLD = 3;
    byte LIMIT_CASE = 4;
    byte SIMPLE = 0;
    byte FULL = 8;

    // ---- Normalization type ----
    byte UNNORMALIZED = 0;
    byte C = 1;
    byte KC = 2;
    byte D = 3;
    byte KD = 4;
    byte FORM_LIMIT = 5;

    // ---- Numeric type ----
    byte NUMERIC_NONE = 0;
    byte NUMERIC = 1;
    byte DIGIT = 2;
    byte DECIMAL = 3;
    // HAN_PRIMARY = 4, HAN_ACCOUNTING = 5, HAN_OTHER = 6 are not enabled.
    // WARNING, reset the limit to 7 if all properties desired!!
    byte LIMIT_NUMERIC_TYPE = 4;

    // ---- Hangul syllable type ----
    byte NA = 0;
    byte L = 1;
    byte V = 2;
    byte T = 3;
    byte LV = 4;
    byte LVT = 5;
    byte HANGUL_SYLLABLE_TYPE_LIMIT = 6;

    // ---- Script codes ----
    byte COMMON_SCRIPT = 0;
    byte LATIN_SCRIPT = 1;
    byte GREEK_SCRIPT = 2;
    byte CYRILLIC_SCRIPT = 3;
    byte ARMENIAN_SCRIPT = 4;
    byte HEBREW_SCRIPT = 5;
    byte ARABIC_SCRIPT = 6;
    byte SYRIAC_SCRIPT = 7;
    byte THAANA_SCRIPT = 8;
    byte DEVANAGARI_SCRIPT = 9;
    byte BENGALI_SCRIPT = 10;
    byte GURMUKHI_SCRIPT = 11;
    byte GUJARATI_SCRIPT = 12;
    byte ORIYA_SCRIPT = 13;
    byte TAMIL_SCRIPT = 14;
    byte TELUGU_SCRIPT = 15;
    byte KANNADA_SCRIPT = 16;
    byte MALAYALAM_SCRIPT = 17;
    byte SINHALA_SCRIPT = 18;
    byte THAI_SCRIPT = 19;
    byte LAO_SCRIPT = 20;
    byte TIBETAN_SCRIPT = 21;
    byte MYANMAR_SCRIPT = 22;
    byte GEORGIAN_SCRIPT = 23;
    byte UNUSED_SCRIPT = 24;
    byte HANGUL_SCRIPT = 25;
    byte ETHIOPIC_SCRIPT = 26;
    byte CHEROKEE_SCRIPT = 27;
    byte ABORIGINAL_SCRIPT = 28;
    byte OGHAM_SCRIPT = 29;
    byte RUNIC_SCRIPT = 30;
    byte KHMER_SCRIPT = 31;
    byte MONGOLIAN_SCRIPT = 32;
    byte HIRAGANA_SCRIPT = 33;
    byte KATAKANA_SCRIPT = 34;
    byte BOPOMOFO_SCRIPT = 35;
    byte HAN_SCRIPT = 36;
    byte YI_SCRIPT = 37;
    byte OLD_ITALIC_SCRIPT = 38;
    byte GOTHIC_SCRIPT = 39;
    byte DESERET_SCRIPT = 40;
    byte INHERITED_SCRIPT = 41;
    byte TAGALOG_SCRIPT = 42;
    byte HANUNOO_SCRIPT = 43;
    byte BUHID_SCRIPT = 44;
    byte TAGBANWA_SCRIPT = 45;
    byte LIMBU = 46;
    byte TAI_LE = 47;
    byte LINEAR_B = 48;
    byte UGARITIC = 49;
    byte SHAVIAN = 50;
    byte OSMANYA = 51;
    byte CYPRIOT = 52;
    byte BRAILLE = 53;
    byte KATAKANA_OR_HIRAGANA = 54;
    byte BUGINESE = 55;
    byte COPTIC = 56;
    byte NEW_TAI_LUE = 57;
    byte GLAGOLITIC = 58;
    byte TIFINAGH = 59;
    byte SYLOTI_NAGRI = 60;
    byte OLD_PERSIAN = 61;
    byte KHAROSHTHI = 62;
    byte Balinese = 63;
    byte Cuneiform = 64;
    byte Phoenician = 65;
    byte Phags_Pa = 66;
    byte NKo = 67;
    byte Unknown_Script = 68;
    byte LIMIT_SCRIPT = 69;

    // ---- Age codes; each code is the index of its version string in AGE_VERSIONS ----
    int UNKNOWN = 0;
    int AGE11 = 1;
    int AGE20 = 2;
    int AGE21 = 3;
    int AGE30 = 4;
    int AGE31 = 5;
    int AGE32 = 6;
    int AGE40 = 7;
    int AGE41 = 8;
    int AGE50 = 9;
    int LIMIT_AGE = 10;

    String[] AGE_VERSIONS = {
        "?",
        "1.1.0",
        "2.0.0",
        "2.1.2",
        "3.0.0",
        "3.1.0",
        "3.2.0",
        "4.0.0",
        "4.1.0",
        "5.0.0"
    };

    // ---- Joining type ----
    byte JT_C = 0;
    byte JT_D = 1;
    byte JT_R = 2;
    byte JT_U = 3;
    byte JT_L = 4;
    byte JT_T = 5;
    byte LIMIT_JOINING_TYPE = 6;

    // ---- Joining group ----
    byte NO_SHAPING = 0;
    byte AIN = 1;
    byte ALAPH = 2;
    byte ALEF = 3;
    byte BEH = 4;
    byte BETH = 5;
    byte DAL = 6;
    byte DALATH_RISH = 7;
    byte E = 8;
    byte FEH = 9;
    byte FINAL_SEMKATH = 10;
    byte GAF = 11;
    byte GAMAL = 12;
    byte HAH = 13;
    byte HAMZA_ON_HEH_GOAL = 14;
    byte HE = 15;
    byte HEH = 16;
    byte HEH_GOAL = 17;
    byte HETH = 18;
    byte KAF = 19;
    byte KAPH = 20;
    byte KNOTTED_HEH = 21;
    byte LAM = 22;
    byte LAMADH = 23;
    byte MEEM = 24;
    byte MIM = 25;
    byte NOON = 26;
    byte NUN = 27;
    byte PE = 28;
    byte QAF = 29;
    byte QAPH = 30;
    byte REH = 31;
    byte REVERSED_PE = 32;
    byte SAD = 33;
    byte SADHE = 34;
    byte SEEN = 35;
    byte SEMKATH = 36;
    byte SHIN = 37;
    byte SWASH_KAF = 38;
    byte TAH = 39;
    byte TAW = 40;
    byte TEH_MARBUTA = 41;
    byte TETH = 42;
    byte WAW = 43;
    byte SYRIAC_WAW = 44;
    byte YEH = 45;
    byte YEH_BARREE = 46;
    byte YEH_WITH_TAIL = 47;
    byte YUDH = 48;
    byte YUDH_HE = 49;
    byte ZAIN = 50;
    byte ZHAIN = 51;
    byte KHAPH = 52;
    byte FE = 53;
    byte LIMIT_JOINING_GROUP = 54;

    // ---- Normalization forms; the two masks pick out the bits of these codes ----
    byte NFD = 0;
    byte NFC = 1;
    byte NFKD = 2;
    byte NFKC = 3;
    int NF_COMPATIBILITY_MASK = 2;
    int NF_COMPOSITION_MASK = 1;

    // ---- DERIVED PROPERTY ----
    byte PropMath = 0;
    byte PropAlphabetic = 1;
    byte PropLowercase = 2;
    byte PropUppercase = 3;

    byte ID_Start = 4;
    byte ID_Continue_NO_Cf = 5;

    byte Mod_ID_Start = 6;
    byte Mod_ID_Continue_NO_Cf = 7;

    byte Missing_Uppercase = 8;
    byte Missing_Lowercase = 9;
    byte Missing_Mixedcase = 10;

    byte FC_NFKC_Closure = 11;

    byte FullCompExclusion = 12;
    byte FullCompInclusion = 13;

    byte QuickNFD = 14;
    byte QuickNFC = 15;
    byte QuickNFKD = 16;
    byte QuickNFKC = 17;

    byte ExpandsOnNFD = 18;
    byte ExpandsOnNFC = 19;
    byte ExpandsOnNFKD = 20;
    byte ExpandsOnNFKC = 21;

    byte GenNFD = 22;
    byte GenNFC = 23;
    byte GenNFKD = 24;
    byte GenNFKC = 25;

    byte DefaultIgnorable = 26;
    byte GraphemeExtend = 27;
    byte GraphemeBase = 28;

    byte FC_NFC_Closure = 29;

    byte Other_Case_Ignorable = 30;
    byte Case_Ignorable = 31;
    byte Type_i = 32;

    byte NFC_Leading = 33;
    byte NFC_TrailingNonZero = 34;
    byte NFC_TrailingZero = 35;
    byte NFC_Resulting = 36;

    byte NFD_UnsafeStart = 37;
    byte NFC_UnsafeStart = 38;
    byte NFKD_UnsafeStart = 39;
    byte NFKC_UnsafeStart = 40;

    byte NFD_Skippable = 41;
    byte NFC_Skippable = 42;
    byte NFKD_Skippable = 43;
    byte NFKC_Skippable = 44;

    byte Case_Sensitive = 45;

    byte DERIVED_PROPERTY_LIMIT = 46;

}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user