ICU-0 updated for 4.0, checking POSIX

X-SVN-Rev: 11640
This commit is contained in:
Mark Davis 2003-04-23 20:18:43 +00:00
parent ddfd1a8408
commit 73d71b32f6
6 changed files with 118 additions and 36 deletions

View File

@ -5,8 +5,8 @@
******************************************************************************* *******************************************************************************
* *
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $ * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $
* $Date: 2003/04/03 02:29:31 $ * $Date: 2003/04/23 20:18:43 $
* $Revision: 1.6 $ * $Revision: 1.7 $
* *
******************************************************************************* *******************************************************************************
*/ */
@ -33,6 +33,47 @@ abstract public class GenerateBreakTest implements UCD_Types {
System.out.println("Remember to add length marks (half & full) and other punctuation for sentence, with FF61"); System.out.println("Remember to add length marks (half & full) and other punctuation for sentence, with FF61");
//Default.setUCD(); //Default.setUCD();
if (false) {
PrintWriter log = Utility.openPrintWriter("Diff.txt", Utility.UTF8_WINDOWS);
UnicodeSet Term = new UnicodeSet(
"[\\u0021\\u003F\\u0589\\u061F\\u06D4\\u0700\\u0701\\u0702\\u0964\\u1362\\u1367"
+ "\\u1368\\u104A\\u104B\\u166E\\u1803\\u1809\\u203C\\u203D\\u2047\\u2048\\u2049"
+ "\\u3002\\uFE52\\uFE57\\uFF01\\uFF0E\\uFF1F\\uFF61]");
UnicodeSet terminal_punctuation = getSet(BINARY_PROPERTIES, Terminal_Punctuation);
UnicodeMap names = new UnicodeMap();
names.add("Pd", getSet(CATEGORY, Pd));
names.add("Ps", getSet(CATEGORY, Ps));
names.add("Pe", getSet(CATEGORY, Pe));
names.add("Pc", getSet(CATEGORY, Pc));
names.add("Po", getSet(CATEGORY, Po));
names.add("Pi", getSet(CATEGORY, Pi));
names.add("Pf", getSet(CATEGORY, Pf));
Utility.showSetDifferences(log, "Term", Term, "Terminal_Punctuation", terminal_punctuation, true, true, names, Default.ucd);
Utility.showSetDifferences(log, "Po", getSet(CATEGORY, Po), "Terminal_Punctuation", terminal_punctuation, true, true, names, Default.ucd);
log.close();
if (true) return;
UnicodeSet whitespace = getSet(BINARY_PROPERTIES, White_space);
UnicodeSet space = getSet(CATEGORY, Zs).addAll(getSet(CATEGORY, Zp)).addAll(getSet(CATEGORY, Zl));
Utility.showSetDifferences("White_Space", whitespace, "Z", space, true, Default.ucd);
UnicodeSet isSpace = new UnicodeSet();
UnicodeSet isSpaceChar = new UnicodeSet();
UnicodeSet isWhitespace = new UnicodeSet();
for (int i = 0; i <= 0xFFFF; ++i) {
if (Character.isSpace((char)i)) isSpace.add(i);
if (Character.isSpaceChar((char)i)) isSpaceChar.add(i);
if (Character.isWhitespace((char)i)) isWhitespace.add(i);
}
Utility.showSetDifferences("White_Space", whitespace, "isSpace", isSpace, true, Default.ucd);
Utility.showSetDifferences("White_Space", whitespace, "isSpaceChar", isSpaceChar, true, Default.ucd);
Utility.showSetDifferences("White_Space", whitespace, "isWhitespace", isWhitespace, true, Default.ucd);
return;
}
if (DEBUG) { if (DEBUG) {
checkDecomps(); checkDecomps();
@ -560,7 +601,8 @@ abstract public class GenerateBreakTest implements UCD_Types {
out.println("<p><b>Suppressed:</b> "); out.println("<p><b>Suppressed:</b> ");
for (int i = 0; i < skippedSamples.length; ++i) { for (int i = 0; i < skippedSamples.length; ++i) {
if (skippedSamples[i] > 0) { if (skippedSamples[i] > 0) {
out.println(getTypeID(UTF16.valueOf(skippedSamples[i]), true)); String tmp = UTF16.valueOf(skippedSamples[i]);
out.println("<span title='" + getInfo(tmp) + "'>" + getTypeID(tmp, true) + "</span>");
} }
} }
out.println("</p>"); out.println("</p>");
@ -790,8 +832,8 @@ abstract public class GenerateBreakTest implements UCD_Types {
static final UnicodeMap map = new UnicodeMap(); static final UnicodeMap map = new UnicodeMap();
static final int static final int
CR = map.add("CR", new UnicodeSet(0xA, 0xA)), CR = map.add("CR", new UnicodeSet(0xD, 0xD)),
LF = map.add("LF", new UnicodeSet(0xD, 0xD)), LF = map.add("LF", new UnicodeSet(0xA, 0xA)),
Control = map.add("Control", Control = map.add("Control",
getSet(CATEGORY, Cc) getSet(CATEGORY, Cc)
.addAll(getSet(CATEGORY, Cf)) .addAll(getSet(CATEGORY, Cf))
@ -1324,27 +1366,22 @@ abstract public class GenerateBreakTest implements UCD_Types {
if (before == LB_ZW) return true; if (before == LB_ZW) return true;
// LB 6 Don't break graphemes (before combining marks, around virama or on sequences of conjoining Jamos). // LB 6 Don't break graphemes (before combining marks, around virama or on sequences of conjoining Jamos).
setRule("6: GC -> FC"); setRule("6: DGC -> FC");
if (!grapheme.isBreak( source, offset, recommended)) return false; if (!grapheme.isBreak( source, offset, recommended)) return false;
setRule("6a: X CM* -> X");
if (after == LB_CM) return false;
/* /*
if (before == LB_L && (after == LB_L || after == LB_V || after == LB_LV || after == LB_LVT)) return false; if (before == LB_L && (after == LB_L || after == LB_V || after == LB_LV || after == LB_LVT)) return false;
if ((before == LB_LV || before == LB_V) && (after == LB_V || after == LB_T)) return false; if ((before == LB_LV || before == LB_V) && (after == LB_V || after == LB_T)) return false;
if ((before == LB_LVT || before == LB_T) && (after == LB_T)) return false; if ((before == LB_LVT || before == LB_T) && (after == LB_T)) return false;
*/ */
byte backBase = -1;
boolean setBase = false; boolean setBase = false;
if (before == LB_CM) { if (before == LB_CM) {
setBase = true; setBase = true;
int backOffset = findLastNon(source, offset, LB_CM, recommended); int backOffset = findLastNon(source, offset, LB_CM, recommended);
if (backOffset < 0) { if (backOffset >= 0) {
before = LB_ID; backBase = getResolvedType(UTF16.charAt(source, backOffset), recommended);
} else {
before = getResolvedType(UTF16.charAt(source, backOffset), recommended);
} }
} }
@ -1353,9 +1390,17 @@ abstract public class GenerateBreakTest implements UCD_Types {
// the space is changed to type ID. In other words, break before SP CM* in the same cases as // the space is changed to type ID. In other words, break before SP CM* in the same cases as
// one would break before an ID. // one would break before an ID.
setRule("7: SP CM* -> ID"); setRule("7: SP CM* -> ID");
if (setBase && before == LB_SP) before = LB_ID; if (setBase && backBase == LB_SP) before = LB_ID;
if (after == LB_SP && after2 == LB_CM) after = LB_ID; if (after == LB_SP && after2 == LB_CM) after = LB_ID;
setRule("7a: X CM* -> X");
if (after == LB_CM) return false;
if (setBase && backBase != -1) before = LB_ID;
setRule("7b: CM -> AL");
if (setBase && backBase == -1) before = LB_AL;
// LB 8 Don't break before ] or ! or ; or /, even after spaces. // LB 8 Don't break before ] or ! or ; or /, even after spaces.
// × CL, × EX, × IS, × SY // × CL, × EX, × IS, × SY
setRule("8: × ( CL | EX | IS | SY )"); setRule("8: × ( CL | EX | IS | SY )");

View File

@ -5,8 +5,8 @@
******************************************************************************* *******************************************************************************
* *
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $ * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
* $Date: 2003/03/20 21:47:28 $ * $Date: 2003/04/23 20:18:42 $
* $Revision: 1.30 $ * $Revision: 1.31 $
* *
******************************************************************************* *******************************************************************************
*/ */
@ -165,6 +165,9 @@ public final class Main implements UCD_Types {
else if (arg.equalsIgnoreCase("testnameuniqueness")) TestNameUniqueness.test(); else if (arg.equalsIgnoreCase("testnameuniqueness")) TestNameUniqueness.test();
else if (arg.equalsIgnoreCase("checkDifferences")) GenerateData.checkDifferences("3.2.0"); else if (arg.equalsIgnoreCase("checkDifferences")) GenerateData.checkDifferences("3.2.0");
else if (arg.equalsIgnoreCase("Compare14652")) Compare14652.main(null);
//else if (arg.equalsIgnoreCase("NormalizationCharts")) ChartGenerator.writeNormalizationCharts(); //else if (arg.equalsIgnoreCase("NormalizationCharts")) ChartGenerator.writeNormalizationCharts();

View File

@ -19,7 +19,7 @@
<table class="header"> <table class="header">
<tr> <tr>
<td class="icon"><a href="http://www.unicode.org"><img align="middle" alt="[Unicode]" border="0" src="http://www.unicode.org/webscripts/logo60s2.gif" width="34" height="33"></a>&nbsp;&nbsp;<a class="bar" href="UnicodeCharacterDatabase.html">Unicode <td class="icon"><a href="http://www.unicode.org"><img align="middle" alt="[Unicode]" border="0" src="http://www.unicode.org/webscripts/logo60s2.gif" width="34" height="33"></a>&nbsp;&nbsp;<a class="bar" href="http://www.unicode.org/ucd">Unicode
Character Database</a></td> Character Database</a></td>
</tr> </tr>
<tr> <tr>

View File

@ -5,8 +5,8 @@
******************************************************************************* *******************************************************************************
* *
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $ * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
* $Date: 2003/03/20 21:47:28 $ * $Date: 2003/04/23 20:18:42 $
* $Revision: 1.24 $ * $Revision: 1.25 $
* *
******************************************************************************* *******************************************************************************
*/ */
@ -728,6 +728,7 @@ public final class UCD implements UCD_Types {
} }
public byte getEastAsianWidth(int codePoint) { public byte getEastAsianWidth(int codePoint) {
// if (0x30000 <= codepoint && codepoint <= 0x3FFFD) return EAW;
return get(codePoint, false).eastAsianWidth; return get(codePoint, false).eastAsianWidth;
} }

View File

@ -5,8 +5,8 @@
******************************************************************************* *******************************************************************************
* *
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $ * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
* $Date: 2003/03/19 17:30:56 $ * $Date: 2003/04/23 20:18:42 $
* $Revision: 1.19 $ * $Revision: 1.20 $
* *
******************************************************************************* *******************************************************************************
*/ */
@ -60,11 +60,11 @@ final class UCD_Names implements UCD_Types {
"Line Break (listing LineBreak.txt, field 1)\r\n" "Line Break (listing LineBreak.txt, field 1)\r\n"
+ "#\tAll code points not explicitly listed in this file have the property\r\n" + "#\tAll code points not explicitly listed in this file have the property\r\n"
+ "#\tvalue: XX.", + "#\tvalue: XX.",
"Joining Type (listing ArabicShaping.txt, field 1).\r\n" "Joining Type (listing ArabicShaping.txt, field 2).\r\n"
+ "#\tType T is derived, as described in ArabicShaping.txt\r\n" + "#\tType T is derived, as described in ArabicShaping.txt\r\n"
+ "#\tAll code points not explicitly listed in this file have the property\r\n" + "#\tAll code points not explicitly listed in this file have the property\r\n"
+ "#\tvalue: U.", + "#\tvalue: U.",
"Joining Group (listing ArabicShaping.txt, field 2)\r\n" "Joining Group (listing ArabicShaping.txt, field 3)\r\n"
+ "#\tAll code points not explicitly listed in this file have the property\r\n" + "#\tAll code points not explicitly listed in this file have the property\r\n"
+ "#\tvalue: NO_JOINING_GROUP.", + "#\tvalue: NO_JOINING_GROUP.",
"BidiMirrored (listing UnicodeData.txt, field 9: see UCD.html)\r\n" "BidiMirrored (listing UnicodeData.txt, field 9: see UCD.html)\r\n"

View File

@ -5,8 +5,8 @@
******************************************************************************* *******************************************************************************
* *
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $ * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
* $Date: 2003/04/01 02:52:00 $ * $Date: 2003/04/23 20:18:41 $
* $Revision: 1.31 $ * $Revision: 1.32 $
* *
******************************************************************************* *******************************************************************************
*/ */
@ -1069,15 +1069,36 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
return "Showing Stack with fake " + sw.getBuffer().toString(); return "Showing Stack with fake " + sw.getBuffer().toString();
} }
/**
 * Builds an HTML {@code <img>} tag for a code point.
 * The code point is formatted with {@code hex(cp, 4)}; that text is used both
 * in the alt text (prefixed with "U+") and as the suffix of a
 * unicode.org {@code cgi-bin/refglyph} image URL.
 *
 * @param cp the code point to reference
 * @return the markup for the image element
 */
public static String getUnicodeImage(int cp) {
    final String hexDigits = hex(cp, 4);
    final String altText = "U+" + hexDigits;
    final String glyphUrl = "http://www.unicode.org/cgi-bin/refglyph?24-" + hexDigits;
    return "<img alt='" + altText
        + "' src='" + glyphUrl
        + "' style='vertical-align:middle'>";
}
static PrintWriter showSetNamesPw; static PrintWriter showSetNamesPw;
public static void showSetDifferences(String name1, UnicodeSet set1, String name2, UnicodeSet set2, boolean separateLines, UCD ucd) { public static void showSetDifferences(String name1, UnicodeSet set1, String name2, UnicodeSet set2, boolean separateLines, UCD ucd) {
if (showSetNamesPw == null) showSetNamesPw = new PrintWriter(System.out);
showSetDifferences(showSetNamesPw, name1, set1, name2, set2, separateLines, false, null, ucd);
}
public static void showSetDifferences(PrintWriter pw, String name1, UnicodeSet set1, String name2, UnicodeSet set2,
boolean separateLines, boolean withChar, UnicodeMap names, UCD ucd) {
UnicodeSet temp = new UnicodeSet(set1).removeAll(set2); UnicodeSet temp = new UnicodeSet(set1).removeAll(set2);
showSetNames("In " + name1 + ", but not " + name2, temp, separateLines, false, false, ucd); pw.println();
pw.println("In " + name1 + ", but not in " + name2 + ": ");
showSetNames(pw, "\t", temp, separateLines, false, withChar, names, ucd);
temp = new UnicodeSet(set2).removeAll(set1); temp = new UnicodeSet(set2).removeAll(set1);
showSetNames("In " + name2 + ", but not " + name1, temp, separateLines, false, false, ucd); pw.println();
pw.println("Not in " + name1 + ", but in " + name2 + ": ");
showSetNames(pw, "\t", temp, separateLines, false, withChar, names, ucd);
temp = new UnicodeSet(set2).retainAll(set1); temp = new UnicodeSet(set2).retainAll(set1);
showSetNames("In " + name1 + " and " + name2, temp, separateLines, false, false, ucd); pw.println();
pw.println("In both " + name1 + " and " + name2 + ": ");
pw.println(temp.size() == 0 ? "<none>" : ""+ temp);
// showSetNames(pw, "\t", temp, false, false, withChar, names, ucd);
} }
public static void showSetNames(String prefix, UnicodeSet set, boolean separateLines, UCD ucd) { public static void showSetNames(String prefix, UnicodeSet set, boolean separateLines, UCD ucd) {
@ -1089,17 +1110,24 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
} }
public static void showSetNames(PrintWriter pw, String prefix, UnicodeSet set, boolean separateLines, boolean IDN, UCD ucd) { public static void showSetNames(PrintWriter pw, String prefix, UnicodeSet set, boolean separateLines, boolean IDN, UCD ucd) {
showSetNames( pw, prefix, set, separateLines, IDN, false, ucd); showSetNames( pw, prefix, set, separateLines, IDN, false, null, ucd);
} }
public static void showSetNames(String prefix, UnicodeSet set, boolean separateLines, boolean IDN, boolean withChar, UCD ucd) { public static void showSetNames(String prefix, UnicodeSet set, boolean separateLines, boolean IDN, boolean withChar, UCD ucd) {
if (showSetNamesPw == null) showSetNamesPw = new PrintWriter(System.out); if (showSetNamesPw == null) showSetNamesPw = new PrintWriter(System.out);
showSetNames(showSetNamesPw, prefix, set, separateLines, IDN, withChar, ucd); showSetNames(showSetNamesPw, prefix, set, separateLines, IDN, withChar, null, ucd);
showSetNamesPw.flush();
} }
static java.text.NumberFormat nf = java.text.NumberFormat.getInstance();
public static void showSetNames(PrintWriter pw, String prefix, UnicodeSet set, boolean separateLines, boolean IDN, public static void showSetNames(PrintWriter pw, String prefix, UnicodeSet set, boolean separateLines, boolean IDN,
boolean withChar, UCD ucd) { boolean withChar, UnicodeMap names, UCD ucd) {
if (set.size() == 0) {
pw.println(prefix + "<none>");
pw.flush();
return;
}
boolean useHTML = false;
int count = set.getRangeCount(); int count = set.getRangeCount();
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
int start = set.getRangeStart(i); int start = set.getRangeStart(i);
@ -1108,8 +1136,11 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
for (int cp = start; cp <= end; ++cp) { for (int cp = start; cp <= end; ++cp) {
if (!IDN) pw.println(prefix + ucd.getCode(cp) if (!IDN) pw.println(prefix + ucd.getCode(cp)
+ "\t# " + "\t# "
+ (withChar ? " (" + UTF16.valueOf(cp) + ") " : "") + (useHTML ? "(" + getUnicodeImage(cp) + ") " : "")
+ ucd.getName(cp)); + (withChar && (cp >= 0x20) ? "(" + UTF16.valueOf(cp) + ") " : "")
+ (names != null ? names.getLabel(cp) + " " : "")
+ ucd.getName(cp)
+ (useHTML ? "<br>" : ""));
else { else {
pw.println(prefix + Utility.hex(cp,4) + "; " + ucd.getName(cp)); pw.println(prefix + Utility.hex(cp,4) + "; " + ucd.getName(cp));
} }
@ -1119,7 +1150,7 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
pw.println(prefix + ucd.getCode(start) pw.println(prefix + ucd.getCode(start)
+ ((start != end) ? (".." + ucd.getCode(end)) : "") + ((start != end) ? (".." + ucd.getCode(end)) : "")
+ "\t# " + "\t# "
+ (withChar ? " (" + UTF16.valueOf(start) + (withChar && (start >= 0x20) ? " (" + UTF16.valueOf(start)
+ ((start != end) ? (".." + UTF16.valueOf(end)) : "") + ") " : "") + ((start != end) ? (".." + UTF16.valueOf(end)) : "") + ") " : "")
+ ucd.getName(start) + ((start != end) ? (".." + ucd.getName(end)) : "") + ucd.getName(start) + ((start != end) ? (".." + ucd.getName(end)) : "")
); );
@ -1136,6 +1167,8 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
} }
} }
} }
pw.println("Total: " + nf.format(set.size()));
pw.flush();
} }
private static boolean isSeparateLineIDN(int cp, UCD ucd) { private static boolean isSeparateLineIDN(int cp, UCD ucd) {