2003-04-23 19:01:21 +00:00
|
|
|
/**
|
|
|
|
*******************************************************************************
|
|
|
|
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
|
|
|
* others. All Rights Reserved. *
|
|
|
|
*******************************************************************************
|
|
|
|
*
|
|
|
|
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Compare14652.java,v $
|
2003-04-25 01:39:15 +00:00
|
|
|
* $Date: 2003/04/25 01:39:15 $
|
|
|
|
* $Revision: 1.2 $
|
2003-04-23 19:01:21 +00:00
|
|
|
*
|
|
|
|
*******************************************************************************
|
|
|
|
*/
|
|
|
|
|
|
|
|
package com.ibm.text.UCD;
|
|
|
|
|
|
|
|
import java.util.*;
|
|
|
|
import java.io.*;
|
|
|
|
|
|
|
|
import com.ibm.text.utility.*;
|
|
|
|
import com.ibm.icu.text.UTF16;
|
|
|
|
import com.ibm.icu.text.UnicodeSet;
|
|
|
|
|
|
|
|
// Quick-and-dirty tool that reads the contents of an ISO 14652 LC_CTYPE file and compares them with Unicode property sets
|
|
|
|
|
|
|
|
public class Compare14652 implements UCD_Types {
|
|
|
|
|
2003-04-25 01:39:15 +00:00
|
|
|
// Not read anywhere in this class as shown.
// NOTE(review): looks like a leftover switch for an earlier variant of the comparison - confirm before removing.
static final boolean oldVersion = false;
|
|
|
|
|
2003-04-23 19:01:21 +00:00
|
|
|
public static UnicodeSet getSet(int prop, byte propValue) {
|
|
|
|
return UnifiedBinaryProperty.make(prop | propValue).getSet();
|
|
|
|
}
|
|
|
|
|
|
|
|
static UnicodeSet
|
|
|
|
titleSet = getSet(CATEGORY, Lt),
|
|
|
|
combiningSet = getSet(CATEGORY, Mc)
|
|
|
|
.addAll(getSet(CATEGORY, Me))
|
|
|
|
.addAll(getSet(CATEGORY, Mn)),
|
2003-04-25 01:39:15 +00:00
|
|
|
zSet = getSet(CATEGORY, Zs)
|
|
|
|
.addAll(getSet(CATEGORY, Zl))
|
|
|
|
.addAll(getSet(CATEGORY, Zp)),
|
|
|
|
pSet = getSet(CATEGORY, Pd)
|
2003-04-23 19:01:21 +00:00
|
|
|
.addAll(getSet(CATEGORY, Ps))
|
|
|
|
.addAll(getSet(CATEGORY, Pe))
|
|
|
|
.addAll(getSet(CATEGORY, Pc))
|
|
|
|
.addAll(getSet(CATEGORY, Po))
|
|
|
|
.addAll(getSet(CATEGORY, Pi))
|
|
|
|
.addAll(getSet(CATEGORY, Pf)),
|
2003-04-25 01:39:15 +00:00
|
|
|
sSet = getSet(CATEGORY, Sm)
|
|
|
|
.addAll(getSet(CATEGORY, Sc))
|
|
|
|
.addAll(getSet(CATEGORY, Sk))
|
|
|
|
.addAll(getSet(CATEGORY, So)),
|
|
|
|
noSet = getSet(CATEGORY, No),
|
|
|
|
csSet = getSet(CATEGORY, Cs),
|
|
|
|
cfSet = getSet(CATEGORY, Cf),
|
|
|
|
cnSet = getSet(CATEGORY, Cn),
|
|
|
|
circled = getSet(DECOMPOSITION_TYPE, COMPAT_CIRCLE),
|
|
|
|
whitespaceSet = getSet(BINARY_PROPERTIES, White_space),
|
|
|
|
alphaSet = getSet(DERIVED, PropAlphabetic).addAll(combiningSet),
|
|
|
|
lowerSet = getSet(DERIVED, PropLowercase).addAll(titleSet).removeAll(circled),
|
|
|
|
upperSet = getSet(DERIVED, PropUppercase).addAll(titleSet).removeAll(circled),
|
|
|
|
digitSet = getSet(CATEGORY, Nd),
|
|
|
|
xdigitSet = new UnicodeSet("[a-fA-F\uFF21-\uFF26\uFF41-\uFF46]").addAll(digitSet),
|
|
|
|
spaceSet = whitespaceSet.size() == 0 ? zSet : whitespaceSet,
|
|
|
|
controlSet = getSet(CATEGORY, Cc),
|
|
|
|
punctSet = new UnicodeSet(pSet).addAll(sSet),
|
2003-04-23 19:01:21 +00:00
|
|
|
graphSet = new UnicodeSet(0,0x10ffff)
|
|
|
|
.removeAll(controlSet)
|
|
|
|
//.removeAll(getSet(CATEGORY, Cf))
|
2003-04-25 01:39:15 +00:00
|
|
|
.removeAll(csSet)
|
|
|
|
.removeAll(cnSet)
|
|
|
|
.removeAll(zSet),
|
2003-04-23 19:01:21 +00:00
|
|
|
// Cc, Cf, Cs, Cn, Z
|
|
|
|
blankSet = new UnicodeSet(spaceSet).removeAll(new UnicodeSet("[\\u000A-\\u000D\\u0085]"))
|
|
|
|
.removeAll(getSet(CATEGORY, Zl))
|
|
|
|
.removeAll(getSet(CATEGORY, Zp));
|
|
|
|
|
|
|
|
|
|
|
|
static class Prop {
|
|
|
|
String name;
|
|
|
|
UnicodeSet contents = new UnicodeSet();
|
|
|
|
String guess = "???";
|
|
|
|
UnicodeSet guessContents = new UnicodeSet();
|
|
|
|
|
2003-04-25 01:39:15 +00:00
|
|
|
String wsname = whitespaceSet.size() == 0 ? "gc=Z" : "Whitespace";
|
|
|
|
|
2003-04-23 19:01:21 +00:00
|
|
|
Prop(String name) {
|
|
|
|
this.name = name;
|
|
|
|
if (name.equals("alpha")) {
|
|
|
|
guess = "Alphabetic + gc=M";
|
|
|
|
guessContents = alphaSet;
|
|
|
|
} else if (name.equals("lower")) {
|
2003-04-25 01:39:15 +00:00
|
|
|
guess = "Lowercase + gc=Lt - dt=circle";
|
2003-04-23 19:01:21 +00:00
|
|
|
guessContents = lowerSet;
|
|
|
|
} else if (name.equals("upper")) {
|
2003-04-25 01:39:15 +00:00
|
|
|
guess = "Uppercase + gc=Lt - dt=circle";
|
2003-04-23 19:01:21 +00:00
|
|
|
guessContents = upperSet;
|
|
|
|
} else if (name.equals("digit")) {
|
|
|
|
guess = "gc=Nd";
|
|
|
|
guessContents = digitSet;
|
|
|
|
} else if (name.equals("xdigit")) {
|
|
|
|
guess = "gc=Nd+a..f (upper/lower,normal/fullwidth)";
|
|
|
|
guessContents = xdigitSet;
|
|
|
|
} else if (name.equals("space")) {
|
2003-04-25 01:39:15 +00:00
|
|
|
guess = wsname;
|
2003-04-23 19:01:21 +00:00
|
|
|
guessContents = spaceSet;
|
2003-04-25 01:39:15 +00:00
|
|
|
//Utility.showSetNames("Whitespace", spaceSet, true, Default.ucd);
|
2003-04-23 19:01:21 +00:00
|
|
|
} else if (name.equals("cntrl")) {
|
|
|
|
guess = "gc=Cc";
|
|
|
|
guessContents = controlSet;
|
|
|
|
} else if (name.equals("punct")) {
|
2003-04-25 01:39:15 +00:00
|
|
|
guess = "gc=P,S";
|
2003-04-23 19:01:21 +00:00
|
|
|
guessContents = punctSet;
|
|
|
|
} else if (name.equals("graph")) {
|
|
|
|
guess = "All - gc=Cc, Cs, Cn, or Z";
|
|
|
|
guessContents = graphSet;
|
|
|
|
} else if (name.equals("blank")) {
|
2003-04-25 01:39:15 +00:00
|
|
|
guess = wsname + " - (LF,VT,FF,CR,NEL + gc=Zl,Zp)";
|
2003-04-23 19:01:21 +00:00
|
|
|
guessContents = blankSet;
|
|
|
|
} else if (name.equals("ISO_14652_class \"combining\"")) {
|
|
|
|
guess = "gc=M";
|
|
|
|
guessContents = combiningSet;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*upper
|
|
|
|
lower
|
|
|
|
alpha
|
|
|
|
digit
|
|
|
|
outdigit
|
|
|
|
space
|
|
|
|
cntrl
|
|
|
|
punct
|
|
|
|
graph
|
|
|
|
xdigit
|
|
|
|
blank
|
|
|
|
toupper
|
|
|
|
tolower
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
|
|
|
|
void show(PrintWriter pw) {
|
|
|
|
if (name.equals("ISO_14652_LC_CTYPE")) return;
|
|
|
|
if (name.equals("ISO_14652_toupper")) return;
|
|
|
|
if (name.equals("ISO_14652_tolower")) return;
|
|
|
|
if (name.equals("ISO_14652_outdigit")) return;
|
|
|
|
if (name.equals("ISO_14652_outdigit")) return;
|
|
|
|
if (name.startsWith("ISO_14652_class")) return;
|
|
|
|
|
|
|
|
pw.println();
|
|
|
|
pw.println("**************************************************");
|
|
|
|
pw.println(name);
|
|
|
|
pw.println("**************************************************");
|
|
|
|
Utility.showSetDifferences(pw, name, contents, guess, guessContents, false, true, null, Default.ucd);
|
|
|
|
//pw.println(props[i].contents);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Properties parsed from the input file, in file order; only the first propCount slots are used.
// The capacity is fixed at 100 and main() stores into it without a bounds check.
static Prop[] props = new Prop[100];
|
|
|
|
// Number of filled entries in props; incremented by main() for each property header it reads.
static int propCount = 0;
|
|
|
|
|
|
|
|
public static void main(String[] args) throws IOException {
|
2003-04-25 01:39:15 +00:00
|
|
|
|
|
|
|
String version = Default.ucd.getVersion();
|
|
|
|
PrintWriter log = Utility.openPrintWriter("Diff14652_" + version + ".txt", Utility.UTF8_WINDOWS);
|
|
|
|
try {
|
|
|
|
log.write('\uFEFF');
|
|
|
|
log.print("Version: " + version);
|
|
|
|
|
|
|
|
if (false) {
|
|
|
|
UnicodeSet ID = getSet(DERIVED, ID_Start).addAll(getSet(DERIVED, ID_Continue_NO_Cf));
|
|
|
|
UnicodeSet XID = getSet(DERIVED, Mod_ID_Start).addAll(getSet(DERIVED, Mod_ID_Continue_NO_Cf));
|
|
|
|
UnicodeSet alphanumSet = new UnicodeSet(alphaSet).addAll(digitSet).addAll(getSet(CATEGORY, Pc));
|
|
|
|
|
|
|
|
Utility.showSetDifferences("ID", ID, "XID", XID, false, Default.ucd);
|
|
|
|
Utility.showSetDifferences("ID", ID, "Alphabetic+Digit+Pc", alphanumSet, false, Default.ucd);
|
2003-04-23 19:01:21 +00:00
|
|
|
}
|
|
|
|
|
2003-04-25 01:39:15 +00:00
|
|
|
BufferedReader br = Utility.openReadFile("C:\\DATA\\ISO14652_CTYPE.txt", Utility.LATIN1);
|
|
|
|
while (true) {
|
|
|
|
String line = br.readLine();
|
|
|
|
if (line == null) break;
|
|
|
|
line = line.trim();
|
|
|
|
if (line.length() == 0) continue;
|
|
|
|
if (line.charAt(line.length() - 1) == '/') {
|
|
|
|
line = line.substring(0, line.length() - 1);
|
|
|
|
}
|
|
|
|
line = line.trim();
|
|
|
|
if (line.length() == 0) continue;
|
|
|
|
|
|
|
|
char ch = line.charAt(0);
|
|
|
|
if (ch == '%') continue;
|
|
|
|
if (ch == '(') continue;
|
|
|
|
if (ch == '<') {
|
|
|
|
addItems(line, props[propCount-1].contents);
|
|
|
|
} else {
|
|
|
|
// new property
|
|
|
|
System.out.println(line);
|
|
|
|
if (line.equals("width")) break;
|
|
|
|
props[propCount] = new Prop(line);
|
|
|
|
props[propCount].name = "ISO_14652_" + line;
|
|
|
|
props[propCount].contents = new UnicodeSet();
|
|
|
|
propCount++;
|
|
|
|
}
|
2003-04-23 19:01:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
for (int i = 0; i < propCount; ++i) props[i].show(log);
|
2003-04-25 01:39:15 +00:00
|
|
|
|
|
|
|
log.println();
|
|
|
|
log.println("**************************************************");
|
|
|
|
log.println("Checking POSIX requirements for inclusion and disjointness.");
|
|
|
|
log.println("**************************************************");
|
|
|
|
log.println();
|
|
|
|
/*
|
|
|
|
alpha, digit, punct, cntrl are all disjoint
|
|
|
|
space, cntrl, blank are pairwise disjoint with any of alpha, digit, xdigit
|
|
|
|
alpha includes upper, lower
|
|
|
|
graph includes alpha, digit, punct
|
|
|
|
print includes graph
|
|
|
|
xdigit includes digit
|
|
|
|
*/
|
|
|
|
Prop
|
|
|
|
alpha = getProp("ISO_14652_alpha"),
|
|
|
|
upper = getProp("ISO_14652_upper"),
|
|
|
|
lower = getProp("ISO_14652_lower"),
|
|
|
|
graph = getProp("ISO_14652_graph"),
|
|
|
|
//print = getProp("ISO_14652_print"),
|
|
|
|
punct = getProp("ISO_14652_punct"),
|
|
|
|
digit = getProp("ISO_14652_digit"),
|
|
|
|
xdigit = getProp("ISO_14652_xdigit"),
|
|
|
|
space = getProp("ISO_14652_space"),
|
|
|
|
blank = getProp("ISO_14652_blank"),
|
|
|
|
cntrl = getProp("ISO_14652_cntrl");
|
|
|
|
|
|
|
|
checkDisjoint(log, new Prop[] {alpha, digit, punct, cntrl});
|
|
|
|
|
|
|
|
Prop [] l1 = new Prop[] {space, cntrl, blank};
|
|
|
|
Prop [] l2 = new Prop[] {alpha, digit, xdigit};
|
|
|
|
for (int i = 0; i < l1.length; ++i) {
|
|
|
|
for (int j = i + 1; j < l2.length; ++j) {
|
|
|
|
checkDisjoint(log, l1[i], l2[j]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
checkIncludes(log, alpha, upper);
|
|
|
|
checkIncludes(log, alpha, lower);
|
|
|
|
checkIncludes(log, graph, alpha);
|
|
|
|
checkIncludes(log, graph, digit);
|
|
|
|
checkIncludes(log, graph, punct);
|
|
|
|
//checkIncludes(log, print, graph);
|
|
|
|
checkIncludes(log, xdigit, digit);
|
|
|
|
|
|
|
|
|
|
|
|
// possibly alpha, digit, punct, cntrl, space cover the !(Cn,Cs)
|
|
|
|
|
|
|
|
UnicodeSet trRemainder = new UnicodeSet(cnSet)
|
|
|
|
.complement()
|
|
|
|
.removeAll(csSet)
|
|
|
|
.removeAll(digit.contents)
|
|
|
|
.removeAll(punct.contents)
|
|
|
|
.removeAll(alpha.contents)
|
|
|
|
.removeAll(cntrl.contents)
|
|
|
|
.removeAll(space.contents);
|
|
|
|
Utility.showSetNames(log, "TR Remainder: ", trRemainder, false, false, Default.ucd);
|
|
|
|
|
|
|
|
UnicodeSet propRemainder = new UnicodeSet(cnSet)
|
|
|
|
.complement()
|
|
|
|
.removeAll(csSet)
|
|
|
|
//.removeAll(noSet)
|
|
|
|
//.removeAll(cfSet)
|
|
|
|
.removeAll(digit.guessContents)
|
|
|
|
.removeAll(punct.guessContents)
|
|
|
|
.removeAll(alpha.guessContents)
|
|
|
|
.removeAll(cntrl.guessContents)
|
|
|
|
.removeAll(space.guessContents);
|
|
|
|
Utility.showSetNames(log, "Prop Remainder: ", propRemainder, false, false, Default.ucd);
|
|
|
|
|
|
|
|
/*
|
|
|
|
checkDisjoint(new Prop[] {alpha, digit, punct, cntrl});
|
|
|
|
UnicodeSet remainder = cnSet.complement();
|
|
|
|
UnicodeSet guessRemainder = new UnicodeSet(remainder);
|
|
|
|
for (int i = 0; i < list.length; ++i) {
|
|
|
|
for (int j = i + 1; j < list.length; ++j) {
|
|
|
|
compare(log, list[i].name, list[i].contents, list[j].name, list[j].contents);
|
|
|
|
compare(log, list[i].guess, list[i].guessContents, list[j].guess, list[j].guessContents);
|
|
|
|
}
|
|
|
|
remainder.removeAll(list[i].contents);
|
|
|
|
guessRemainder.removeAll(list[i].guessContents);
|
|
|
|
}
|
|
|
|
if (remainder.size() != 0) {
|
|
|
|
log.println();
|
|
|
|
log.println("Incomplete (TR): " + remainder);
|
|
|
|
}
|
|
|
|
if (guessRemainder.size() != 0) {
|
|
|
|
log.println();
|
|
|
|
log.println("Incomplete (Prop): " + guessRemainder);
|
|
|
|
}
|
|
|
|
*/
|
|
|
|
|
2003-04-23 19:01:21 +00:00
|
|
|
} finally {
|
|
|
|
log.close();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2003-04-25 01:39:15 +00:00
|
|
|
static void checkDisjoint(PrintWriter log, Prop[] list) {
|
|
|
|
for (int i = 0; i < list.length; ++i) {
|
|
|
|
for (int j = i + 1; j < list.length; ++j) {
|
|
|
|
checkDisjoint(log, list[i], list[j]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void checkDisjoint(PrintWriter log, Prop prop1, Prop prop2) {
|
|
|
|
checkDisjoint(log, prop1.name, prop1.contents, prop2.name, prop2.contents);
|
|
|
|
checkDisjoint(log, prop1.guess, prop1.guessContents, prop2.guess, prop2.guessContents);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void checkDisjoint(PrintWriter log, String name, UnicodeSet set, String name2, UnicodeSet set2) {
|
|
|
|
if (set.containsSome(set2)) {
|
|
|
|
log.println();
|
|
|
|
log.println("Fails test: " + name + " disjoint-with " + name2);
|
|
|
|
UnicodeSet diff = new UnicodeSet(set).retainAll(set2);
|
|
|
|
Utility.showSetNames(log, "", diff, false, false, Default.ucd);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void checkIncludes(PrintWriter log, Prop prop1, Prop prop2) {
|
|
|
|
checkIncludes(log, prop1.name, prop1.contents, prop2.name, prop2.contents);
|
|
|
|
checkIncludes(log, prop1.guess, prop1.guessContents, prop2.guess, prop2.guessContents);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void checkIncludes(PrintWriter log, String name, UnicodeSet set, String name2, UnicodeSet set2) {
|
|
|
|
if (!set.containsAll(set2)) {
|
|
|
|
log.println();
|
|
|
|
log.println("Fails test:" + name + " includes " + name2);
|
|
|
|
UnicodeSet diff = new UnicodeSet(set2).removeAll(set);
|
|
|
|
Utility.showSetNames(log, "", diff, false, false, Default.ucd);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2003-04-23 19:01:21 +00:00
|
|
|
// Shared scratch buffer that addItems() passes to Utility.split for each input line; capacity is fixed at 100.
static String[] pieces = new String[100];
|
|
|
|
|
|
|
|
// example: <U1F48>..<U1F4D>;<U1F59>;<U1F5B>;<U1F5D>;<U1F5F>;<U1F68>..<U1F6F>;/
|
|
|
|
static void addItems(String line, UnicodeSet contents) {
|
|
|
|
int len = Utility.split(line, ';', pieces);
|
|
|
|
for (int i = 0; i < len; ++i) {
|
|
|
|
String piece = pieces[i].trim();
|
|
|
|
if (piece.length() == 0) continue;
|
|
|
|
if (piece.equals("<0>")) continue;
|
|
|
|
int start, end;
|
|
|
|
int rangePoint = piece.indexOf("..");
|
|
|
|
if (rangePoint >= 0) {
|
|
|
|
start = parse(piece.substring(0,rangePoint));
|
|
|
|
end = parse(piece.substring(rangePoint+2));
|
|
|
|
} else {
|
|
|
|
start = end = parse(piece);
|
|
|
|
}
|
|
|
|
contents.add(start, end);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int parse(String piece) {
|
|
|
|
if (!piece.startsWith("<U") || !piece.endsWith(">")) {
|
|
|
|
throw new IllegalArgumentException("Bogus code point: " + piece);
|
|
|
|
}
|
|
|
|
return Integer.parseInt(piece.substring(2,piece.length()-1), 16);
|
|
|
|
}
|
|
|
|
|
2003-04-25 01:39:15 +00:00
|
|
|
static Prop getProp(String name) {
|
|
|
|
//System.out.println("Searching for: " + name);
|
|
|
|
for (int i = 0; i < propCount; ++i) {
|
|
|
|
//System.out.println("Checking: " + props[i].name);
|
|
|
|
if (props[i].name.equals(name)) {
|
|
|
|
return props[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
//System.out.println("Missed");
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
|
2003-04-23 19:01:21 +00:00
|
|
|
// oddities:
|
|
|
|
// extra space after ';' <U0300>..<U036F>; <U20D0>..<U20FF>; <UFE20>..<UFE2F>;/
|
|
|
|
// <0>?? <0>;<U0BE7>..<U0BEF>;/
|
|
|
|
// <U202C>; <U202D>;<U202E>; <UFEFF> : 0;/
|
|
|
|
// % "print" is by default "graph", and the <space> character
|
|
|
|
// print is odd, since it includes space but not other spaces.
|
|
|
|
// alnum not defined.
|
|
|
|
|
|
|
|
}
|