diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java b/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java index e10ecc9f07..88ec39c120 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $ -* $Date: 2003/04/03 02:29:31 $ -* $Revision: 1.6 $ +* $Date: 2003/04/23 20:18:43 $ +* $Revision: 1.7 $ * ******************************************************************************* */ @@ -33,6 +33,47 @@ abstract public class GenerateBreakTest implements UCD_Types { System.out.println("Remember to add length marks (half & full) and other punctuation for sentence, with FF61"); //Default.setUCD(); + if (false) { + + PrintWriter log = Utility.openPrintWriter("Diff.txt", Utility.UTF8_WINDOWS); + UnicodeSet Term = new UnicodeSet( + "[\\u0021\\u003F\\u0589\\u061F\\u06D4\\u0700\\u0701\\u0702\\u0964\\u1362\\u1367" + + "\\u1368\\u104A\\u104B\\u166E\\u1803\\u1809\\u203C\\u203D\\u2047\\u2048\\u2049" + + "\\u3002\\uFE52\\uFE57\\uFF01\\uFF0E\\uFF1F\\uFF61]"); + UnicodeSet terminal_punctuation = getSet(BINARY_PROPERTIES, Terminal_Punctuation); + UnicodeMap names = new UnicodeMap(); + names.add("Pd", getSet(CATEGORY, Pd)); + names.add("Ps", getSet(CATEGORY, Ps)); + names.add("Pe", getSet(CATEGORY, Pe)); + names.add("Pc", getSet(CATEGORY, Pc)); + names.add("Po", getSet(CATEGORY, Po)); + names.add("Pi", getSet(CATEGORY, Pi)); + names.add("Pf", getSet(CATEGORY, Pf)); + + Utility.showSetDifferences(log, "Term", Term, "Terminal_Punctuation", terminal_punctuation, true, true, names, Default.ucd); + Utility.showSetDifferences(log, "Po", getSet(CATEGORY, Po), "Terminal_Punctuation", terminal_punctuation, true, true, names, Default.ucd); + log.close(); + + if (true) return; + + UnicodeSet whitespace = getSet(BINARY_PROPERTIES, White_space); + UnicodeSet space = getSet(CATEGORY, Zs).addAll(getSet(CATEGORY, Zp)).addAll(getSet(CATEGORY, Zl)); + Utility.showSetDifferences("White_Space", whitespace, "Z", space, true, Default.ucd); + + UnicodeSet isSpace = new UnicodeSet(); + UnicodeSet isSpaceChar = new UnicodeSet(); + UnicodeSet isWhitespace = new UnicodeSet(); + for (int i = 0; i <= 0xFFFF; ++i) { + if (Character.isSpace((char)i)) isSpace.add(i); + if (Character.isSpaceChar((char)i)) isSpaceChar.add(i); + if (Character.isWhitespace((char)i)) isWhitespace.add(i); + } + Utility.showSetDifferences("White_Space", whitespace, "isSpace", isSpace, true, Default.ucd); + Utility.showSetDifferences("White_Space", whitespace, "isSpaceChar", isSpaceChar, true, Default.ucd); + Utility.showSetDifferences("White_Space", whitespace, "isWhitespace", isWhitespace, true, Default.ucd); + return; + } + if (DEBUG) { checkDecomps(); @@ -560,7 +601,8 @@ abstract public class GenerateBreakTest implements UCD_Types { out.println("

Suppressed: "); for (int i = 0; i < skippedSamples.length; ++i) { if (skippedSamples[i] > 0) { - out.println(getTypeID(UTF16.valueOf(skippedSamples[i]), true)); + String tmp = UTF16.valueOf(skippedSamples[i]); + out.println("" + getTypeID(tmp, true) + ""); } } out.println("

"); @@ -790,8 +832,8 @@ abstract public class GenerateBreakTest implements UCD_Types { static final UnicodeMap map = new UnicodeMap(); static final int - CR = map.add("CR", new UnicodeSet(0xA, 0xA)), - LF = map.add("LF", new UnicodeSet(0xD, 0xD)), + CR = map.add("CR", new UnicodeSet(0xD, 0xD)), + LF = map.add("LF", new UnicodeSet(0xA, 0xA)), Control = map.add("Control", getSet(CATEGORY, Cc) .addAll(getSet(CATEGORY, Cf)) @@ -1324,27 +1366,22 @@ abstract public class GenerateBreakTest implements UCD_Types { if (before == LB_ZW) return true; // LB 6 Don’t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos. - setRule("6: GC -> FC"); + setRule("6: DGC -> FC"); if (!grapheme.isBreak( source, offset, recommended)) return false; - setRule("6a: X CM* -> X"); - if (after == LB_CM) return false; - - /* if (before == LB_L && (after == LB_L || after == LB_V || after == LB_LV || after == LB_LVT)) return false; if ((before == LB_LV || before == LB_V) && (after == LB_V || after == LB_T)) return false; if ((before == LB_LVT || before == LB_T) && (after == LB_T)) return false; */ + byte backBase = -1; boolean setBase = false; if (before == LB_CM) { setBase = true; int backOffset = findLastNon(source, offset, LB_CM, recommended); - if (backOffset < 0) { - before = LB_ID; - } else { - before = getResolvedType(UTF16.charAt(source, backOffset), recommended); + if (backOffset >= 0) { + backBase = getResolvedType(UTF16.charAt(source, backOffset), recommended); } } @@ -1353,9 +1390,17 @@ abstract public class GenerateBreakTest implements UCD_Types { // the space is changed to type ID. In other words, break before SP CM* in the same cases as // one would break before an ID. setRule("7: SP CM* -> ID"); - if (setBase && before == LB_SP) before = LB_ID; + if (setBase && backBase == LB_SP) before = LB_ID; if (after == LB_SP && after2 == LB_CM) after = LB_ID; + setRule("7a: X CM* -> X"); + if (after == LB_CM) return false; + if (setBase && backBase != -1) before = LB_ID; + + setRule("7b: CM -> AL"); + if (setBase && backBase == -1) before = LB_AL; + + // LB 8 Don’t break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. // ื CL, ื EX, ื IS, ื SY setRule("8: ื ( CL | EX | IS | SY )"); diff --git a/tools/unicodetools/com/ibm/text/UCD/Main.java b/tools/unicodetools/com/ibm/text/UCD/Main.java index 879abf43b7..cbefc2451a 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Main.java +++ b/tools/unicodetools/com/ibm/text/UCD/Main.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $ -* $Date: 2003/03/20 21:47:28 $ -* $Revision: 1.30 $ +* $Date: 2003/04/23 20:18:42 $ +* $Revision: 1.31 $ * ******************************************************************************* */ @@ -165,6 +165,9 @@ public final class Main implements UCD_Types { else if (arg.equalsIgnoreCase("testnameuniqueness")) TestNameUniqueness.test(); else if (arg.equalsIgnoreCase("checkDifferences")) GenerateData.checkDifferences("3.2.0"); + else if (arg.equalsIgnoreCase("Compare14652")) Compare14652.main(null); + + //else if (arg.equalsIgnoreCase("NormalizationCharts")) ChartGenerator.writeNormalizationCharts(); diff --git a/tools/unicodetools/com/ibm/text/UCD/StandardizedVariants-Template.html b/tools/unicodetools/com/ibm/text/UCD/StandardizedVariants-Template.html index 4756f92dc0..22dc10f8f8 100644 --- a/tools/unicodetools/com/ibm/text/UCD/StandardizedVariants-Template.html +++ b/tools/unicodetools/com/ibm/text/UCD/StandardizedVariants-Template.html @@ -19,7 +19,7 @@ - diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD.java b/tools/unicodetools/com/ibm/text/UCD/UCD.java index e480b2ab8f..3a0af90604 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $ -* $Date: 2003/03/20 21:47:28 $ -* $Revision: 1.24 $ +* $Date: 2003/04/23 20:18:42 $ +* $Revision: 1.25 $ * ******************************************************************************* */ @@ -728,6 +728,7 @@ public final class UCD implements UCD_Types { } public byte getEastAsianWidth(int codePoint) { +// if (0x30000 <= codepoint && codepoint <= 0x3FFFD) return EAW; return get(codePoint, false).eastAsianWidth; } diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java index c55dd460bb..fd2f5aea91 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $ -* $Date: 2003/03/19 17:30:56 $ -* $Revision: 1.19 $ +* $Date: 2003/04/23 20:18:42 $ +* $Revision: 1.20 $ * ******************************************************************************* */ @@ -60,11 +60,11 @@ final class UCD_Names implements UCD_Types { "Line Break (listing LineBreak.txt, field 1)\r\n" + "#\tAll code points not explicitly listed in this file have the property\r\n" + "#\tvalue: XX.", - "Joining Type (listing ArabicShaping.txt, field 1).\r\n" + "Joining Type (listing ArabicShaping.txt, field 2).\r\n" + "#\tType T is derived, as described in ArabicShaping.txt\r\n" + "#\tAll code points not explicitly listed in this file have the property\r\n" + "#\tvalue: U.", - "Joining Group (listing ArabicShaping.txt, field 2)\r\n" + "Joining Group (listing ArabicShaping.txt, field 3)\r\n" + "#\tAll code points not explicitly listed in this file have the property\r\n" + "#\tvalue: NO_JOINING_GROUP.", "BidiMirrored (listing UnicodeData.txt, field 9: see UCD.html)\r\n" diff --git a/tools/unicodetools/com/ibm/text/utility/Utility.java b/tools/unicodetools/com/ibm/text/utility/Utility.java index e28afe26c3..ee051a1c84 100644 --- a/tools/unicodetools/com/ibm/text/utility/Utility.java +++ b/tools/unicodetools/com/ibm/text/utility/Utility.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $ -* $Date: 2003/04/01 02:52:00 $ -* $Revision: 1.31 $ +* $Date: 2003/04/23 20:18:41 $ +* $Revision: 1.32 $ * ******************************************************************************* */ @@ -1069,15 +1069,36 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES return "Showing Stack with fake " + sw.getBuffer().toString(); } + public static String getUnicodeImage(int cp) { + String code = hex(cp, 4); + return "U+" + code + ""; + } + static PrintWriter showSetNamesPw; public static void showSetDifferences(String name1, UnicodeSet set1, String name2, UnicodeSet set2, boolean separateLines, UCD ucd) { + if (showSetNamesPw == null) showSetNamesPw = new PrintWriter(System.out); + showSetDifferences(showSetNamesPw, name1, set1, name2, set2, separateLines, false, null, ucd); + } + + public static void showSetDifferences(PrintWriter pw, String name1, UnicodeSet set1, String name2, UnicodeSet set2, + boolean separateLines, boolean withChar, UnicodeMap names, UCD ucd) { + UnicodeSet temp = new UnicodeSet(set1).removeAll(set2); - showSetNames("In " + name1 + ", but not " + name2, temp, separateLines, false, false, ucd); + pw.println(); + pw.println("In " + name1 + ", but not in " + name2 + ": "); + showSetNames(pw, "\t", temp, separateLines, false, withChar, names, ucd); + temp = new UnicodeSet(set2).removeAll(set1); - showSetNames("In " + name2 + ", but not " + name1, temp, separateLines, false, false, ucd); + pw.println(); + pw.println("Not in " + name1 + ", but in " + name2 + ": "); + showSetNames(pw, "\t", temp, separateLines, false, withChar, names, ucd); + temp = new UnicodeSet(set2).retainAll(set1); - showSetNames("In " + name1 + " and " + name2, temp, separateLines, false, false, ucd); + pw.println(); + pw.println("In both " + name1 + " and " + name2 + ": "); + pw.println(temp.size() == 0 ? "" : ""+ temp); + // showSetNames(pw, "\t", temp, false, false, withChar, names, ucd); } public static void showSetNames(String prefix, UnicodeSet set, boolean separateLines, UCD ucd) { @@ -1089,17 +1110,24 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES } public static void showSetNames(PrintWriter pw, String prefix, UnicodeSet set, boolean separateLines, boolean IDN, UCD ucd) { - showSetNames( pw, prefix, set, separateLines, IDN, false, ucd); + showSetNames( pw, prefix, set, separateLines, IDN, false, null, ucd); } public static void showSetNames(String prefix, UnicodeSet set, boolean separateLines, boolean IDN, boolean withChar, UCD ucd) { if (showSetNamesPw == null) showSetNamesPw = new PrintWriter(System.out); - showSetNames(showSetNamesPw, prefix, set, separateLines, IDN, withChar, ucd); - showSetNamesPw.flush(); + showSetNames(showSetNamesPw, prefix, set, separateLines, IDN, withChar, null, ucd); } + static java.text.NumberFormat nf = java.text.NumberFormat.getInstance(); + public static void showSetNames(PrintWriter pw, String prefix, UnicodeSet set, boolean separateLines, boolean IDN, - boolean withChar, UCD ucd) { + boolean withChar, UnicodeMap names, UCD ucd) { + if (set.size() == 0) { + pw.println(prefix + ""); + pw.flush(); + return; + } + boolean useHTML = false; int count = set.getRangeCount(); for (int i = 0; i < count; ++i) { int start = set.getRangeStart(i); @@ -1108,8 +1136,11 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES for (int cp = start; cp <= end; ++cp) { if (!IDN) pw.println(prefix + ucd.getCode(cp) + "\t# " - + (withChar ? " (" + UTF16.valueOf(cp) + ") " : "") - + ucd.getName(cp)); + + (useHTML ? "(" + getUnicodeImage(cp) + ") " : "") + + (withChar && (cp >= 0x20) ? "(" + UTF16.valueOf(cp) + ") " : "") + + (names != null ? names.getLabel(cp) + " " : "") + + ucd.getName(cp) + + (useHTML ? "
" : "")); else { pw.println(prefix + Utility.hex(cp,4) + "; " + ucd.getName(cp)); } @@ -1119,7 +1150,7 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES pw.println(prefix + ucd.getCode(start) + ((start != end) ? (".." + ucd.getCode(end)) : "") + "\t# " - + (withChar ? " (" + UTF16.valueOf(start) + + (withChar && (start >= 0x20) ? " (" + UTF16.valueOf(start) + ((start != end) ? (".." + UTF16.valueOf(end)) : "") + ") " : "") + ucd.getName(start) + ((start != end) ? (".." + ucd.getName(end)) : "") ); @@ -1136,6 +1167,8 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES } } } + pw.println("Total: " + nf.format(set.size())); + pw.flush(); } private static boolean isSeparateLineIDN(int cp, UCD ucd) {
[Unicode]  Unicode + [Unicode]  Unicode Character Database