More changes to check the boundary conditions

X-SVN-Rev: 9574
2002-08-04 21:38:45 +00:00 · 2002-08-04 21:38:45 +00:00 · c0a9dd3bda
commit c0a9dd3bda
parent 1a7dc3a128
10 changed files with 620 additions and 194 deletions
--- a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java
+++ b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
-* $Date: 2002/06/22 21:02:16 $
-* $Revision: 1.16 $
+* $Date: 2002/08/04 21:38:45 $
+* $Revision: 1.17 $
 *
 *******************************************************************************
 */
@ -110,7 +110,7 @@ public final class DerivedProperty implements UCD_Types {
                + "\r\n#   WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
                + "\r\n#            The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";
        }
-        boolean hasValue(int cp) {
+        public boolean hasValue(int cp) {
            if (ucdData.getDecompositionType(cp) == NONE) return false;
            String norm = nfx.normalize(cp);
            if (UTF16.countCodePoint(norm) != 1) return true;
@ -133,7 +133,7 @@ public final class DerivedProperty implements UCD_Types {
                + "\r\n#   Characters that are cc==0, BUT which may interact with previous characters."
                ;
        }
-        boolean hasValue(int cp) {
+        public boolean hasValue(int cp) {
            if (ucdData.getCombiningClass(cp) != 0) return false;
            String norm = nfx.normalize(cp);
            int first = UTF16.charAt(norm, 0);
@ -172,7 +172,7 @@ public final class DerivedProperty implements UCD_Types {
                + "\r\n#   WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
                + "\r\n#            The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";
        }
-        boolean hasValue(int cp) {
+        public boolean hasValue(int cp) {
            boolean result = bitset.get(cp);
            if (result && filter) {
                result = (ucdData.getCombiningClass(cp) != 0) == keepNonZero;
@ -243,7 +243,7 @@ public final class DerivedProperty implements UCD_Types {
            //if (cp >= 0xAC00 && cp <= 0xD7A3) return true;
            //System.out.println(Utility.hex(cps) + " => " + Utility.hex(nf[i-4].normalize(cps)));
        } // default
-        boolean hasValue(int cp) { return getValue(cp).length() != 0; }
+        public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
    };
    
    class CaseDProp extends UnicodeProperty {
@ -256,7 +256,7 @@ public final class DerivedProperty implements UCD_Types {
            header = "# Derived Property: " + name
            + "\r\n#  Generated from: NFKD has >0 " + CaseNames[i-Missing_Uppercase] + ", no other cases";
        }
-        boolean hasValue(int cp) {
+        public boolean hasValue(int cp) {
            byte cat = ucdData.getCategory(cp);
            if (cat == val
            || val != Lt && ucdData.getBinaryProperty(cp, Other_Uppercase)) return false;
@ -294,7 +294,7 @@ public final class DerivedProperty implements UCD_Types {
    		return getValue(cp, LONG);
    	}
        
-        boolean hasValue(int cp) { return getValue(cp).length() != 0; }
+        public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
    };

    {
@ -323,7 +323,7 @@ public final class DerivedProperty implements UCD_Types {
                    + "\r\n#  Characters that can start an identifier."
                    + "\r\n#  Generated from Lu+Ll+Lt+Lm+Lo+Nl";
            }
-            boolean hasValue(int cp) {
+            public boolean hasValue(int cp) {
                return ucdData.isIdentifierStart(cp, false);
            }
        };
@ -338,7 +338,7 @@ public final class DerivedProperty implements UCD_Types {
                    + "\r\n#  Generated from: ID_Start + Mn+Mc+Nd+Pc"
                    + "\r\n#  NOTE: Cf characters should be filtered out.";
            }
-            boolean hasValue(int cp) {
+            public boolean hasValue(int cp) {
                return ucdData.isIdentifierContinue_NO_Cf(cp, false);
            }
        };
@ -354,7 +354,7 @@ public final class DerivedProperty implements UCD_Types {
                    + "\r\n#  NOTE: Does NOT remove the non-NFKx characters."
                    + "\r\n#        Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
            }
-            boolean hasValue(int cp) {
+            public boolean hasValue(int cp) {
                return ucdData.isIdentifierStart(cp, true);
            }
        };
@ -371,7 +371,7 @@ public final class DerivedProperty implements UCD_Types {
                    + "\r\n#  NOTE: Does NOT remove the non-NFKx characters."
                    + "\r\n#        Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
            }
-            boolean hasValue(int cp) {
+            public boolean hasValue(int cp) {
                return ucdData.isIdentifierContinue_NO_Cf(cp, true);
            }
        };
@ -384,7 +384,7 @@ public final class DerivedProperty implements UCD_Types {
                header = "# Derived Property: " + name
                    + "\r\n#  Generated from: Sm + Other_Math";
            }
-            boolean hasValue(int cp) {
+            public boolean hasValue(int cp) {
                byte cat = ucdData.getCategory(cp);
                if (cat == Sm
                || ucdData.getBinaryProperty(cp,Math_Property)) return true;
@ -400,7 +400,7 @@ public final class DerivedProperty implements UCD_Types {
                header = "# Derived Property: " + name
                    + "\r\n#  Generated from: Lu+Ll+Lt+Lm+Lo+Nl + Other_Alphabetic";
            }
-            boolean hasValue(int cp) {
+            public boolean hasValue(int cp) {
                byte cat = ucdData.getCategory(cp);
                if (cat == Lu || cat == Ll || cat == Lt || cat == Lm || cat == Lo || cat == Nl
                || ucdData.getBinaryProperty(cp, Alphabetic)) return true;
@ -416,7 +416,7 @@ public final class DerivedProperty implements UCD_Types {
                header = "# Derived Property: " + name
                    + "\r\n#  Generated from: Ll + Other_Lowercase";
            }
-            boolean hasValue(int cp) {
+            public boolean hasValue(int cp) {
                byte cat = ucdData.getCategory(cp);
                if (cat == Ll
                || ucdData.getBinaryProperty(cp, Other_Lowercase)) return true;
@ -432,7 +432,7 @@ public final class DerivedProperty implements UCD_Types {
                header = "# Derived Property: " + name
                    + "\r\n#  Generated from: Lu + Other_Uppercase";
            }
-            boolean hasValue(int cp) {
+            public boolean hasValue(int cp) {
                byte cat = ucdData.getCategory(cp);
                if (cat == Lu
                || ucdData.getBinaryProperty(cp, Other_Uppercase)) return true;
@ -461,7 +461,7 @@ of characters, the first of which has a non-zero combining class.
                    + ": Full Composition Exclusion"
                    + "\r\n#  Generated from: Composition Exclusions + Singletons + Non-Starter Decompositions";
            }
-            boolean hasValue(int cp) {
+            public boolean hasValue(int cp) {
                if (!ucdData.isRepresented(cp)) return false;
                byte dtype = ucdData.getDecompositionType(cp);
                if (dtype != CANONICAL) return false;
@ -488,7 +488,7 @@ of characters, the first of which has a non-zero combining class.
                    + ": Full Composition Inclusion"
                    + "\r\n#  characters with Canonical Decompositions MINUS Full Composition Exclusion";
            }
-            boolean hasValue(int cp) {
+            public boolean hasValue(int cp) {
                if (!ucdData.isRepresented(cp)) return false;
                byte dtype = ucdData.getDecompositionType(cp);
                if (dtype != CANONICAL) return false;
@ -516,7 +516,7 @@ of characters, the first of which has a non-zero combining class.
                if (c.equals(b)) return "";
                return "FNC; " + Utility.hex(c);
            } // default
-            boolean hasValue(int cp) { return getValue(cp).length() != 0; }
+            public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
        };
        
        dprops[FC_NFC_Closure] = new UnicodeProperty() {
@ -538,7 +538,7 @@ of characters, the first of which has a non-zero combining class.
                if (c.equals(b)) return "";
                return "FN; " + Utility.hex(c);
            } // default
-            boolean hasValue(int cp) { return getValue(cp).length() != 0; }
+            public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
        };
        
        for (int i = QuickNFD; i <= QuickNFKC; ++i) {
@ -555,7 +555,7 @@ of characters, the first of which has a non-zero combining class.
                    + "\r\n#  Generated from <2060..206F, FFF0..FFFB, E0000..E0FFF>"
                    + "\r\n#    + Other_Default_Ignorable_Code_Point + (Cf + Cc + Cs - White_Space)";
            }
-            boolean hasValue(int cp) {
+            public boolean hasValue(int cp) {
            	if (0x2060 <= cp && cp <= 0x206F || 0xFFF0 <= cp && cp <= 0xFFFB || 0xE0000 <= cp && cp <= 0xE0FFF) return true;
                if (ucdData.getBinaryProperty(cp,Other_Default_Ignorable_Code_Point)) return true;
                if (ucdData.getBinaryProperty(cp, White_space)) return false;
@ -573,7 +573,7 @@ of characters, the first of which has a non-zero combining class.
                
                header = header = "# Binary Property";
            }
-            boolean hasValue(int cp) {
+            public boolean hasValue(int cp) {
                switch(cp) {
                    case 0x27: case 0x2019: case 0xAD: return true;
                    //  case 0x2d: case 0x2010: case 0x2011: 
@ -600,7 +600,7 @@ of characters, the first of which has a non-zero combining class.
                    + "\r\n# - has no combining marks with zero canonical combining class"
                ;
            }
-            boolean hasValue(int cp) {
+            public boolean hasValue(int cp) {
                if (hasSoftDot(cp)) return true;
                if (Default.nfkd.isNormalized(cp)) return false;
                String decomp = Default.nfd.normalize(cp);
@ -629,7 +629,7 @@ of characters, the first of which has a non-zero combining class.
                header = header = "# Derived Property: " + name
                    + "\r\n#  Generated from: Other_Case_Ignorable + Lm + Mn + Me + Cf";
            }
-            boolean hasValue(int cp) {
+            public boolean hasValue(int cp) {
                byte cat = ucdData.getCategory(cp);
                if (cat == Lm || cat == Cf || cat == Mn || cat == Me) return true;
                if (dprops[Other_Case_Ignorable].hasValue(cp)) return true;
@ -654,7 +654,7 @@ of characters, the first of which has a non-zero combining class.
                    + "\r\n#  (CGJ = U+034F)";
                     
            }
-            boolean hasValue(int cp) {
+            public boolean hasValue(int cp) {
            	if (cp == 0x034F) return false;
                if (ucdData.getBinaryProperty(cp, GraphemeLink)) return false;
                byte cat = ucdData.getCategory(cp);
@ -674,7 +674,7 @@ of characters, the first of which has a non-zero combining class.
                    + "\r\n#  Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp"
                    + "\r\n#    - Grapheme_Extend - Grapheme_Link - CGJ";
            }
-            boolean hasValue(int cp) {
+            public boolean hasValue(int cp) {
            	if (cp == 0x034F) return false;
                byte cat = ucdData.getCategory(cp);
                if (cat == Cc || cat == Cf || cat == Cs || cat == Co || cat == Cn || cat == Zl || cat == Zp
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
-* $Date: 2002/07/30 09:56:41 $
-* $Revision: 1.8 $
+* $Date: 2002/08/04 21:38:45 $
+* $Revision: 1.9 $
 *
 *******************************************************************************
 */
@ -275,6 +275,11 @@ public final class GenerateHanTransliterator implements UCD_Types {
            log = Utility.openPrintWriter("Transliterate_log.txt", Utility.UTF8_WINDOWS);
            log.print('\uFEFF');
            
+            log.println();
+            log.println("@*Override Data");
+            log.println();
+            readOverrides(type);
+
            log.println();
            log.println("@*DICT Data");
            log.println();
@ -426,7 +431,27 @@ public final class GenerateHanTransliterator implements UCD_Types {
            System.out.println("Defined Count: " + count);
            
            log.println();
-            log.println("@Duplicates");
+            log.println("@Duplicates (Frequency Order");
+            log.println();
+            it = rankList.iterator();
+            while (it.hasNext()) {
+                String word = (String) it.next();
+                Collection dups = (Collection) duplicates.get(word);
+                if (dups == null) continue;
+                log.print(hex.transliterate(word) + "\t" + word + "\t");
+                Iterator it2 = dups.iterator();
+                boolean gotFirst = false;
+                while (it2.hasNext()) {
+                    if (!gotFirst) gotFirst = true;
+                    else log.print(", ");
+                    log.print(it2.next());
+                }
+                if (overrideSet.contains(word)) log.print(" *override*");
+                log.println();
+            }
+            
+            log.println();
+            log.println("@Duplicates (Character Order)");
            log.println();
            it = duplicates.keySet().iterator();
            while (it.hasNext()) {
@ -440,6 +465,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
                    else log.print(", ");
                    log.print(it2.next());
                }
+                if (overrideSet.contains(word)) log.print(" *override*");
                log.println();
            }
            
@ -536,13 +562,19 @@ public final class GenerateHanTransliterator implements UCD_Types {
            int overallRank = 0;
            it = combinedRank.iterator();
            
+            boolean showFrequency = false;
+            
+            if (showFrequency) {
                log.println();
                log.println("@Frequency data: Rank of Character");
                log.println();
+            }
+            
+            // make up rankMap, rankList
            
            while(it.hasNext()) {
                Pair p = (Pair) it.next();
-                log.println(p.first + ", " + p.second);
+                if (showFrequency) log.println(p.first + ", " + p.second);
                Object rank = rankMap.get(p.second);
                if (rank == null) {
                    rankMap.put(p.second, new Integer(++overallRank));
@ -550,6 +582,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
                }
            }

+            if (showFrequency) {
                log.println();
                log.println("@Frequency data: Character to Rank");
                log.println();
@ -561,6 +594,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
                    Comparable val = (Comparable) rankMap.get(key);
                    log.println(key + ", " + val);
                }
+            }
            
        } catch (Exception e) {
            throw new ChainException("Line \"{0}\"", new String[] {line}, e);
@ -712,6 +746,38 @@ public final class GenerateHanTransliterator implements UCD_Types {
        }
    }
    
+    /**
+     * Reads the override dictionary file and registers every entry it contains.
+     * <p>
+     * Only the CHINESE type has an override file ("Chinese_override.txt" under
+     * BASE_DIR + "dict\\"); for any other type this method does nothing.
+     * Each non-empty data line is split on tabs: the text before the first tab
+     * is skipped, the text between the first and second tab is the word, and
+     * the remainder of the line is its definition. The word/definition pair is
+     * handed to addCheck and the word is remembered in overrideSet.
+     *
+     * @param type the dictionary type being generated; only CHINESE is processed
+     * @throws IOException declared for failures opening the file
+     *         (NOTE(review): presumably raised by Utility.openReadFile - confirm)
+     * @throws ChainException if reading, parsing or closing fails; the message
+     *         carries the number of lines processed and the offending line
+     */
+    static void readOverrides(int type) throws IOException {
+        if (type != CHINESE) return;
+        String fname = "Chinese_override.txt";
+        
+        System.out.println("Reading " + fname);
+        BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\" + fname, true);
+        int counter = 0;
+        String line = "";
+        boolean closed = false;
+        try {
+            while (true) {
+                line = Utility.readDataLine(br);
+                if (line == null) break;
+                if (line.length() == 0) continue;
+                Utility.dot(counter++);
+                
+                // skip code
+                int wordStart = line.indexOf('\t') + 1;
+                int wordEnd = line.indexOf('\t', wordStart);
+                if (wordEnd < 0) {
+                    // Wrapped by the catch below, so the line count and text are reported too.
+                    throw new IllegalArgumentException("Missing tab between word and definition");
+                }
+                String word = line.substring(wordStart, wordEnd);
+                String definition = line.substring(wordEnd+1);
+                addCheck(word, definition, line);
+                overrideSet.add(word);
+            }
+            // A close failure on the normal path is still reported as a ChainException.
+            br.close();
+            closed = true;
+        } catch (Exception e) {
+            throw new ChainException("{0} Failed at {1}" , new Object []{new Integer(counter), line}, e);
+        } finally {
+            if (!closed) {
+                // Release the file on the failure path without masking the original error.
+                try {
+                    br.close();
+                } catch (IOException ignored) {
+                    // the ChainException already in flight is the error worth reporting
+                }
+            }
+        }
+    }
+    
+    static Set overrideSet = new HashSet();
+    
    static void processEdict(String word, String definition, String line) {
        // We have a situation where we have words of the form CCCHHHKKKCCHHCCH > HHHHHHKKKHHHHHHHH
        // C = CJK, H = Hiragana, K = katakana
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java,v $
-* $Date: 2002/07/30 09:57:18 $
-* $Revision: 1.1 $
+* $Date: 2002/08/04 21:38:45 $
+* $Revision: 1.2 $
 *
 *******************************************************************************
 */
@ -22,60 +22,125 @@ import com.ibm.icu.text.UnicodeSet;

 public class GenerateLineBreakTest implements UCD_Types {
    
-    static String[] samples = new String[LB_LIMIT + 3];
+    // COMMON STUFF for Hangul
+    static final byte hNot = -1, hL = 0, hV = 1, hT = 2, hLV = 3, hLVT = 4, hLIMIT = 5;
+    static final String[] hNames = {"L", "V", "T", "LV", "LVT"};
    
-    static byte[] TROrder = {
+    static byte getHangulType(int cp) {
+        if (Default.ucd.isLeadingJamo(cp)) return hL;
+        if (Default.ucd.isVowelJamo(cp)) return hV;
+        if (Default.ucd.isTrailingJamo(cp)) return hT;
+        if (Default.ucd.isHangulSyllable(cp)) {
+            if (Default.ucd.isDoubleHangul(cp)) return hLV;
+            return hLVT;
+        }
+        return hNot;
+    }
+
+    //============================
+    
+    protected String rule;
+    protected String fileName = "Line";
+
+    // all the other items are supplied in UCD_Types
+    static byte LB_L = LB_LIMIT + hL, LB_V = LB_LIMIT + hV, LB_T = LB_LIMIT + hT, 
+        LB_LV = LB_LIMIT + hLV, LB_LVT = LB_LIMIT + hLVT, LB_SUP = LB_LIMIT + hLIMIT,
+        LB2_LIMIT = (byte)(LB_SUP + 1);
+    
+    String[] samples = new String[100];
+    
+    
+    byte[] TypeOrder = {
        LB_OP, LB_CL, LB_QU, LB_GL, LB_NS, LB_EX, LB_SY, LB_IS, LB_PR, LB_PO,
        LB_NU, LB_AL, LB_ID, LB_IN, LB_HY, LB_BA, LB_BB, LB_B2, LB_ZW, LB_CM,
        // missing from Pair Table
        LB_SP, LB_BK, LB_CR, LB_LF, 
        // resolved types below
        LB_CB, LB_AI, LB_SA, LB_SG, LB_XX,
-        // 3 JAMO CLASSES
-        29, 30, 31
+        // 5 Hangul classes (L, V, T, LV, LVT), plus supplementary
+        LB_L, LB_V, LB_T, LB_LV, LB_LVT, LB_SUP
    };
-    static final int TABLE_LIMIT = 25;
-     
    
    public static void main(String[] args) throws IOException {
        Default.setUCD();
+        new GenerateLineBreakTest().run();
        
+        new GenerateWordBreakTest().run();
+    }
+    
+    // stuff that subclasses need to override
+    public void run() throws IOException {     
        findSamples();
        
        // test individual cases
        //printLine(out, samples[LB_ZW], "", samples[LB_CL]);
        //printLine(out, samples[LB_ZW], " ", samples[LB_CL]);
        
-        PrintWriter out = Utility.openPrintWriter("LineBreakTest.html", Utility.UTF8_WINDOWS);
-        out.println("<html><body><h1>Current (fixed only for consistency):</h1>");
+        PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest.html", Utility.UTF8_WINDOWS);
+        out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title>"
+            + fileName + "</title></head>");
+        out.println("<body bgcolor='#FFFFFF'><h3>Current (fixed only for consistency):</h3>");
+        
+
+
        generateTable(out, false);
-        out.println("<h1>Recommended:</h1>");
+        out.println("<h3>Recommended:</h3>");
        generateTable(out, true);
        out.println("</body></html>");
        out.close();
        
+        String[] testCase = new String[50];
        // do main test
        
        for (int k = 0; k < 2; ++k) {
-            out = Utility.openPrintWriter(k == 0 ? "LineBreakTest_SHORT.txt" : "LineBreakTest.txt", Utility.UTF8_WINDOWS);
+            out = Utility.openPrintWriter(fileName + (k == 0 ? "Test_SHORT.txt" : "Test.txt"), Utility.LATIN1_WINDOWS);
            int counter = 0;
            
-            out.println("# Default Linebreak conformance test");
-            out.println("# " + Default.getDate() + ", MED");
+            out.println("# Default " + fileName + " Break Test");
+            out.println("# Generated: " + Default.getDate() + ", MED");
+            out.println("#");
+            out.println("# Format:");
+            out.println("# <string> (# <comment>)? ");
+            out.println("#  <string> contains hex Unicode code points, with ");
+            out.println("#\t" + BREAK + " wherever there is a break opportunity, and ");
+            out.println("#\t" + NOBREAK + " wherever there is not.");
+            out.println("#  <comment> the format can change, but currently it shows:");
+            out.println("#\t- the sample character name");
+            out.println("#\t- (x) the line_break property* for the sample character");
+            out.println("#\t- [x] the rule that determines whether there is a break or not");
+            out.println("#");
+            out.println("# Samples:");
+            out.println("# The test currently takes all pairs of linebreak types*,");
+            out.println("# picks a sample for each type, and generates three strings: ");
+            out.println("#\t- the pair alone");
+            out.println("#\t- the pair alone with an imbeded space");
+            out.println("#\t- the pair alone with embedded combining marks");
+            out.println("# The sample for each type is simply the first code point (above NULL)");
+            out.println("# with that property.");
+            out.println("# * Note:");
+            out.println("#\t- SG is omitted");
+            out.println("#\t- 3 different Jamo characters and a supplementary character are added");
+            out.println("#\t  The syllable types for the Jamo (L, V, T) are displayed in comments");
+            out.println("#\t  instead of the linebreak property");
+            out.println("# These samples may be extended in the future.");
            out.println("#");
            
-            for (int ii = 0; ii < samples.length; ++ii) {
-                int i = TROrder[ii];
+            for (int ii = 0; ii < getLimit(); ++ii) {
+                int i = TypeOrder[ii];
+                if (i == LB_SG) continue;
                String before = samples[i];
                
-                for (int jj = 0; jj < samples.length; ++jj) {
-                    Utility.dot(counter++);
-                    int j = TROrder[jj];
+                for (int jj = 0; jj < getLimit(); ++jj) {
+                    Utility.dot(counter);
+                    int j = TypeOrder[jj];
+                    if (j == LB_SG) continue;
                    String after = samples[j];
                    // do line straight
-                    printLine(out, before, "", after, k != 0);
-                    printLine(out, before, " ", after, k != 0);
-                    printLine(out, before, "\u0301\u0308", after, k != 0);
+                    int len = genTestItems(before, after, testCase);
+                    for (int q = 0; q < len; ++q) {
+                        printLine(out, testCase[q], k != 0 && q == 0, false);
+                        ++counter;
+                    }
                }
            }
            out.println("# Lines: " + counter);
@ -83,25 +148,80 @@ public class GenerateLineBreakTest implements UCD_Types {
        }
    }
    
-    public static void generateTable(PrintWriter out, boolean recommended) {
-        out.print("<table border='1' cellspacing='0'><tr><th></th>");
-        for (int i = 0; i < TABLE_LIMIT; ++i) {
-            String h = getLBID(samples[TROrder[i]]);
-            out.print("<th>" + h + "</th>");
+    /**
+     * Builds the test strings for one (before, after) pair of samples.
+     * Subclasses are expected to override this.
+     * Three strings are produced: the pair alone, the pair separated by a
+     * space, and the pair separated by U+0301 U+0308.
+     *
+     * @param before  sample text placed first
+     * @param after   sample text placed last
+     * @param results output array; slots 0..2 are overwritten
+     * @return the number of entries written into results (3)
+     */
+    public int genTestItems(String before, String after, String[] results) {
+        final String[] fillers = {"", " ", "\u0301\u0308"};
+        for (int slot = 0; slot < fillers.length; ++slot) {
+            results[slot] = before + fillers[slot] + after;
+        }
+        return fillers.length;
+    }
+    
+    /**
+     * Tells generateTable whether a type is left out of the table.
+     * Subclasses are expected to override this.
+     *
+     * @param type the type code to test
+     * @return true for LB_AI, LB_SA, LB_SG and LB_XX, false for anything else
+     */
+    boolean skipType(byte type) {
+        if (type == LB_AI) {
+            return true;
+        }
+        if (type == LB_SA) {
+            return true;
+        }
+        if (type == LB_SG) {
+            return true;
+        }
+        return type == LB_XX;
+    }
+    
+    /**
+     * Returns the short display name of the type that getType assigns to a code point.
+     * Subclasses are expected to override this.
+     *
+     * @param cp the code point to look up
+     * @return "SUP" for LB_SUP; the matching hNames entry for the other types
+     *         at or above LB_LIMIT; otherwise the ID that
+     *         Default.ucd.getLineBreakID_fromIndex gives for the type
+     */
+    public String getTypeID(int cp) {
+        final byte type = getType(cp);
+        final String id;
+        if (type == LB_SUP) {
+            id = "SUP";
+        } else if (type < LB_LIMIT) {
+            id = Default.ucd.getLineBreakID_fromIndex(type);
+        } else {
+            // Hangul types are stored as LB_LIMIT + hangul index
+            id = hNames[type - LB_LIMIT];
+        }
+        return id;
+    }
+    
+    /**
+     * Computes the type code used by this test for a code point.
+     * Subclasses are expected to override this.
+     *
+     * @param cp the code point to classify
+     * @return LB_SUP for any code point above 0xFFFF; LB_LIMIT plus the Hangul
+     *         type when getHangulType recognizes the code point; otherwise the
+     *         value of Default.ucd.getLineBreak
+     */
+    public byte getType(int cp) {
+        if (cp > 0xFFFF) {
+            return LB_SUP;
+        }
+        final byte hangul = getHangulType(cp);
+        return hangul == hNot
+            ? Default.ucd.getLineBreak(cp)
+            : (byte)(hangul + LB_LIMIT);
+    }
+    
+    /**
+     * Upper bound (exclusive) that run() uses when walking TypeOrder to
+     * generate the test lines.
+     *
+     * @return LB2_LIMIT
+     */
+    public int getLimit() {
+        final int limit = LB2_LIMIT;
+        return limit;
+    }
+    
+    /**
+     * Upper bound (exclusive) that generateTable uses when walking TypeOrder,
+     * so the final entry of the order is left out of the table.
+     *
+     * @return LB_SUP
+     */
+    public int getTableLimit() {
+        final int tableLimit = LB_SUP; // skip last
+        return tableLimit;
+    }
+    
+    
+    public void generateTable(PrintWriter out, boolean recommended) {
+        String width = "width='" + (100 / (getTableLimit() + 1)) + "%'";
+        out.print("<table border='1' cellspacing='0'><tr><th " + width + "></th>");
+        byte type;
+        for (int i = 0; i < getTableLimit(); ++i) {
+            type = TypeOrder[i];
+            if (skipType(type)) continue;
+            
+            String h = getTypeID(samples[TypeOrder[i]]);
+            out.print("<th " + width + ">" + h + "</th>");
        }
        out.print("</tr>");
        String[] rule = new String[1];
        String[] rule2 = new String[1];
-        for (int i = 0; i < TABLE_LIMIT; ++i) {
-            String before = samples[TROrder[i]];
-            String line = "<tr><th>" + getLBID(before) + "</th>";
-            for (int j = 0; j < TABLE_LIMIT; ++j) {
-                String after = samples[TROrder[j]];
+        for (int i = 0; i < getTableLimit(); ++i) {
+            type = TypeOrder[i];
+            if (skipType(type)) continue;
+            
+            String before = samples[type];
+            String line = "<tr><th>" + getTypeID(before) + "</th>";
+            for (int j = 0; j < getTableLimit(); ++j) {
+                type = TypeOrder[j];
+                if (skipType(type)) continue;
+                
+                String after = samples[type];
                String t = getTableEntry(before, after, recommended, rule);
                String background = "";
-                if (recommended) {
-                    String t2 = getTableEntry(before, after, false, rule2);
-                    if (!t.equals(t2)) background = " bgcolor='#FFFF00'";
+                String t2 = getTableEntry(before, after, !recommended, rule2);
+                if (!t.equals(t2)) {
+                    if (t.equals(NOBREAK)) {
+                        background = " bgcolor='#CCFFFF'";
+                    } else {
+                        background = " bgcolor='#FFFF00'";
+                    }
+                } else if (t.equals(NOBREAK)) {
+                    background = " bgcolor='#CCCCFF'";
                }
                line += "<th title='" + rule[0] + "'" + background + ">" + t + "</th>";
            }
@ -110,7 +230,7 @@ public class GenerateLineBreakTest implements UCD_Types {
        out.println("</table>");
    }
    
-    public static String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
+    public String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
        String t = "_";
        boolean spaceBreak = isBreak(before + " " + after, before.length() + 1, recommended);
        String spaceRule = rule;
@ -137,75 +257,83 @@ public class GenerateLineBreakTest implements UCD_Types {
        return t;
    }
    
+    static final String BREAK = "\u00F7";
+    static final String NOBREAK = "\u00D7";
    
-    public static void printLine(PrintWriter out, String before, String filler, String after, boolean comments) {
-        String s = before + filler + after;
-        int offset = before.length() + filler.length();
+    public void printLine(PrintWriter out, String source, boolean comments, boolean recommended) {
+        int cp;
+        StringBuffer string = new StringBuffer();
+        StringBuffer comment = new StringBuffer("\t# ");
+        String status = isBreak(source, 0, recommended) ? BREAK : NOBREAK;
+        string.append(status);
+        comment.append(' ').append(status).append(" [").append(rule).append(']');
        
-        boolean lb = isBreak(s, offset, false);
+        for (int offset = 0; offset < source.length(); offset += UTF16.getCharCount(cp)) {
            
-        String tlb = (lb ? "b" : "n");
-        String comment = "";
-        if (comments) comment = 
-            " # " + getLBID(before + filler)
-            + " " + tlb
-            + " " + getLBID(after)
-            + " # " + Default.ucd.getName(before + filler)
-            + " " + tlb
-            + " " + Default.ucd.getName(after);
-            
-        out.println(Utility.hex(before + filler)
-            + "; " + tlb
-            + "; " + Utility.hex(after)
-            + comment);
+            cp = UTF16.charAt(source, offset);
+            if (string.length() > 0) {
+                string.append(' ');
+                comment.append(' ');
            }
            
-    public static void findSamples() {
+            string.append(Utility.hex(cp));
+            comment.append(Default.ucd.getName(cp) + " (" + getTypeID(cp) + ")");
+            
+            status = isBreak(source, offset + UTF16.getCharCount(cp), recommended) ? BREAK : NOBREAK;
+            string.append(' ').append(status);
+            comment.append(' ').append(status).append(" [").append(rule).append(']');
+        }
+        
+        if (comments) string.append(comment);
+        out.println(string);
+    }
+    
+    public void findSamples() {
        for (int i = 1; i <= 0x10FFFF; ++i) {
            if (!Default.ucd.isAllocated(i)) continue;
-            if (Default.ucd.isLeadingJamo(i) 
-                || Default.ucd.isVowelJamo(i) 
-                || Default.ucd.isTrailingJamo(i)) continue;
-            byte lb = Default.ucd.getLineBreak(i);
+            // Skip the surrogate range U+D800..U+DFFF.
+            if (0xD800 <= i && i <= 0xDFFF) continue;
+            byte lb = getType(i);
            if (samples[lb] == null) {
                samples[lb] = UTF16.valueOf(i);
            }
        }
-        // fill the last with special cases
-        samples[LB_LIMIT] = "\u1100";
-        samples[LB_LIMIT+1] = "\u1162";
-        samples[LB_LIMIT+2] = "\u11A8";
+        for (int i = 0; i < TypeOrder.length; ++i) {
+            String sample = samples[i];
+            System.out.println(getTypeID(sample) + ":\t" + Default.ucd.getCodeAndName(sample));
+        }
    }
       

-    public static String getLBID(String s) {
-        if (s.length() == 1) return Default.ucd.getLineBreakID(s.charAt(0));
+    public String getTypeID(String s) {
+        if (s == null) return "<null>";
+        if (s.length() == 1) return getTypeID(s.charAt(0));
        StringBuffer result = new StringBuffer();
        int cp;
        for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
            cp = UTF32.char32At(s, i);
            if (i > 0) result.append(" ");
-            result.append(Default.ucd.getLineBreakID(cp));
+            result.append(getTypeID(cp));
        }
        return result.toString();
    }
       
-    static String rule;
-
-    public static int findLastNon(String source, int offset, byte notLBType) {
+    public int findLastNon(String source, int offset, byte notLBType, boolean recommended) {
        int cp;
-        for (int i = offset-2; i >= 0; i -= UTF16.getCharCount(cp)) {
+        for (int i = offset-1; i >= 0; i -= UTF16.getCharCount(cp)) {
            cp = UTF16.charAt(source, i);
-            byte f = getResolvedLB(cp);
-            if (f != notLBType) return cp;
+            byte f = getResolvedType(cp, recommended);
+            if (f != notLBType) return i;
        }
-        return 0;
+        return -1;
    }

-    public static byte getResolvedLB (int cp) {
+    public byte getResolvedType (int cp, boolean recommended) {
        // LB 1  Assign a line break category to each character of the input.
        // Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
-        byte result = Default.ucd.getLineBreak(cp);
+        byte result = getType(cp);
        switch (result) {
            case LB_AI: result = LB_AI; break;
            // case LB_CB: result = LB_ID; break;
@ -213,17 +341,31 @@ public class GenerateLineBreakTest implements UCD_Types {
            // case LB_SG: result = LB_XX; break; Surrogates; will never occur
            case LB_XX: result = LB_AL; break;
        }
+        if (recommended) {
+            if (getHangulType(cp) != hNot) {
+                    result = LB_ID;
+            }
+        }
+            
        return result;
    }
    
+    /**
+     * Tells whether offset lies between two code points of s rather than in
+     * the middle of a surrogate pair.
+     *
+     * @param s      UTF-16 text to examine
+     * @param offset position in s, counted in UTF-16 code units
+     * @return false when offset is outside [0, s.length()] or separates a lead
+     *         surrogate from the trail surrogate that follows it; true otherwise
+     */
+    public boolean onCodepointBoundary(String s, int offset) {
+        final int length = s.length();
+        if (offset < 0 || offset > length) {
+            return false;
+        }
+        // The two ends of the string are always boundaries.
+        if (offset == 0 || offset == length) {
+            return true;
+        }
+        final boolean splitsPair = UTF16.isLeadSurrogate(s.charAt(offset - 1))
+            && UTF16.isTrailSurrogate(s.charAt(offset));
+        return !splitsPair;
+    }
+    
    // find out whether there is a break at offset
    // WARNING: as a side effect, sets "rule"

-    public static boolean isBreak(String source, int offset, boolean recommended) {
+    public boolean isBreak(String source, int offset, boolean recommended) {

        // LB 1  Assign a line break category to each character of the input.
        // Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
-        // this is taken care of in the getResolvedLB function
+        // this is taken care of in the getResolvedType function

        // LB 2a  Never break at the start of text

@ -237,8 +379,7 @@ public class GenerateLineBreakTest implements UCD_Types {


        // UTF-16: never break in the middle of a code point
-        if (UTF16.isLeadSurrogate(source.charAt(offset-1))
-            && UTF16.isTrailSurrogate(source.charAt(offset))) return false;
+        if (!onCodepointBoundary(source, offset)) return false;


        // now get the character before and after, and their types
@ -247,8 +388,8 @@ public class GenerateLineBreakTest implements UCD_Types {
        int cpBefore = UTF16.charAt(source, offset-1);
        int cpAfter = UTF16.charAt(source, offset);

-        byte before = getResolvedLB(cpBefore);
-        byte after = getResolvedLB(cpAfter);
+        byte before = getResolvedType(cpBefore, recommended);
+        byte after = getResolvedType(cpAfter, recommended);


        rule="3a";
@ -276,22 +417,21 @@ public class GenerateLineBreakTest implements UCD_Types {
        // LB 6  Don’t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos.
        rule="6";
        if (after == LB_CM) return false;
-        if (Default.ucd.isLeadingJamo(cpBefore)) {
-            if (Default.ucd.isLeadingJamo(cpAfter) || Default.ucd.isVowelJamo(cpAfter)) return false;
-        } else if (Default.ucd.isVowelJamo(cpBefore)) {
-            if (Default.ucd.isVowelJamo(cpAfter) || Default.ucd.isTrailingJamo(cpAfter)) return false;
-        } else if (Default.ucd.isTrailingJamo(cpBefore)) {
-            if (Default.ucd.isTrailingJamo(cpAfter)) return false;
-        }
+        
+        if (before == LB_L && (after == LB_L || after == LB_V || after == LB_LV || after == LB_LVT)) return false;
+            
+        if ((before == LB_LV || before == LB_V) && (after == LB_V || after == LB_T)) return false;
+            
+        if ((before == LB_LVT || before == LB_T) && (after == LB_T)) return false;
        
        boolean setBase = false;
        if (before == LB_CM) {
            setBase = true;
-            int cp = findLastNon(source, offset, LB_CM);
-            if (cp == 0) {
+            int backOffset = findLastNon(source, offset, LB_CM, recommended);
+            if (backOffset < 0) {
                before = LB_ID;
            } else {
-                before = getResolvedLB(cp);
+                before = getResolvedType(UTF16.charAt(source, backOffset), recommended);
            }
        }

@ -310,9 +450,9 @@ public class GenerateLineBreakTest implements UCD_Types {
        // find the last non-space character; we will need it
        byte lastNonSpace = before;
        if (lastNonSpace == LB_SP) {
-            int cp = findLastNon(source, offset, LB_CM);
-            if (cp != 0) {
-                lastNonSpace = getResolvedLB(cp);
+            int backOffset = findLastNon(source, offset, LB_CM, recommended);
+            if (backOffset >= 0) {
+                lastNonSpace = getResolvedType(UTF16.charAt(source, backOffset), recommended);
            }
        }

@ -476,4 +616,162 @@ public class GenerateLineBreakTest implements UCD_Types {
        rule="20";
        return true;
    }
+    
+    /**
+     * Variant of GenerateLineBreakTest that classifies characters with its own
+     * type codes (CR, LF, Control, Extend, Link, CGJ, Base, LetterBase, Other
+     * and the five Hangul types) and decides breaks with its own numbered rules.
+     */
+    static class GenerateWordBreakTest extends GenerateLineBreakTest {
+        
+        // Type codes. The Hangul codes are the h* constants shifted up by oLIMIT
+        // so that they follow the nine named types.
+        static final byte CR = 0, LF = 1, Control = 2, Extend = 3, Link = 4, CGJ = 5, Base = 6, LetterBase = 7, Other = 8,
+            oLIMIT = 9, // RESET THIS IF LIST ABOVE CHANGES!
+            L = oLIMIT + hL, V = oLIMIT + hV, T = oLIMIT + hT, LV = oLIMIT + hLV, LVT = oLIMIT + hLVT,
+            LIMIT = LVT + 1;
+
+        // Display names for the type codes below oLIMIT, indexed by type code.
+        static final String[] Names = {"CR", "LF", "CTL", "Extend", "Link", "CGJ", "Base", "LetterBase", "Other" };
+        
+        static UnicodeProperty extendProp = UnifiedBinaryProperty.make(DERIVED | GraphemeExtend);
+        static UnicodeProperty baseProp = UnifiedBinaryProperty.make(DERIVED | GraphemeBase);
+        static UnicodeProperty linkProp = UnifiedBinaryProperty.make(BINARY_PROPERTIES | GraphemeLink);
+
+        // Instance initializer: sets fileName and fills TypeOrder with the
+        // identity ordering 0..LIMIT-1.
+        {
+            fileName = "Word";
+            TypeOrder = new byte[LIMIT];
+            for (byte i = 0; i < TypeOrder.length; ++i) {
+                TypeOrder[i] = i;
+            }
+        }
+
+        /** No type is skipped by this generator. */
+        boolean skipType(byte type) {
+            return false;
+        }
+      
+        /** @return LIMIT, one past the highest type code of this class */
+        public int getLimit() {
+            return LIMIT;
+        }
+   
+        /** @return LIMIT; every type takes part in the table */
+        public int getTableLimit() {
+            return LIMIT;
+        }
+        
+        /**
+         * Stores the single string before + after in results[0].
+         *
+         * @return 1 (presumably the number of entries written to results)
+         */
+        public int genTestItems(String before, String after, String[] results) {
+            results[0] = before + after;
+            return 1;
+        }
+        
+        /**
+         * Computes the table cell for the pair (before, after): tests for a break
+         * between the two strings and reports the deciding rule.
+         *
+         * @param ruleOut receives, in element 0, the value of rule left by isBreak
+         * @return BREAK or NOBREAK
+         */
+        public String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
+            boolean normalBreak = isBreak(before + after, before.length(), recommended);
+            ruleOut[0] = rule;
+            return normalBreak ? BREAK : NOBREAK;
+        }
+    
+        /**
+         * @return the display name of the type of cp: an entry of hNames for the
+         *         Hangul types, otherwise an entry of Names
+         */
+        public String getTypeID(int cp) {
+            byte type = getType(cp);
+            if (type >= oLIMIT) return hNames[type - oLIMIT];
+            return Names[type];
+        }
+        
+        /**
+         * Classifies cp. The checks run in a fixed order and the first match wins:
+         * individually listed code points, Hangul type, general category, then
+         * the Link / Extend / Base properties; anything left over is Other.
+         */
+        public byte getType(int cp) {
+            // single characters
+            if (cp == 0xA) return LF;
+            if (cp == 0xD) return CR;
+            if (cp == 0x034F) return CGJ;
+            if (cp == 0x2028 || cp == 0x2029) return Control;
+            
+            // Hangul
+            byte result = getHangulType(cp);
+            if (result != hNot) return (byte)(result + oLIMIT);
+            
+            // other properties
+            // category based
+            byte cat = Default.ucd.getCategory(cp);
+            if (cat == Cc) return Control;
+            if (cat == Cf) return Extend;
+            if (((1<<cat) & LETTER_MASK) != 0) return LetterBase;
+            
+            // other binary properties
+            if (linkProp.hasValue(cp)) return Link;
+            if (extendProp.hasValue(cp)) return Extend;
+            if (baseProp.hasValue(cp)) return Base;
+            
+            return Other;
+        }
+        
+        /** Returns getType(cp) unchanged; recommended is ignored here. */
+        public byte getResolvedType(int cp, boolean recommended) {
+            return getType(cp);
+        }
+        
+        /**
+         * Decides whether there is a break at offset in source.
+         * As a side effect, sets rule to the label of the rule that was being
+         * tested when the decision was made.
+         *
+         * @param source      text to examine
+         * @param offset      position in source, in UTF-16 code units
+         * @param recommended selects between the two rule sets tested after rule 9
+         * @return true for a break, false for no break
+         */
+        public boolean isBreak(String source, int offset, boolean recommended) {
+            rule="1";
+            if (offset < 0 || offset > source.length()) return false;
+            if (offset == 0) return true;
+            
+            rule = "2";
+            if (offset == source.length()) return true;
+
+            // UTF-16: never break in the middle of a code point
+            if (!onCodepointBoundary(source, offset)) return false;
+
+            // now get the character before and after, and their types
+
+
+            int cpBefore = UTF16.charAt(source, offset-1);
+            int cpAfter = UTF16.charAt(source, offset);
+
+            byte before = getResolvedType(cpBefore, recommended);
+            byte after = getResolvedType(cpAfter, recommended);
+            
+            rule = "3";
+            if (before == CR && after == LF) return false;
+
+            rule = "4";
+            if (before == CR || before == LF || before == Control 
+                || after == Control || after == LF || after == CR) return true;
+
+            rule = "6";
+            if (before == L && (after == L || after == V || after == LV || after == LVT)) return false;
+            
+            rule = "7";
+            if ((before == LV || before == V) && (after == V || after == T)) return false;
+            
+            rule = "8";
+            if ((before == LVT || before == T) && (after == T)) return false;
+            
+            rule = "9";
+            if (after == Extend) return false;
+            
+            if (recommended) {
+                // NOTE(review): rule is not reassigned on this path, so a no-break
+                // decided here is still reported as rule "9" -- confirm intended.
+                if (after == Link || after == CGJ) return false;
+            } else {
+                
+                // Do not break around a CGJ.
+                rule = "10";
+                if (before == CGJ && (after == Base 
+                    || after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT)) return false;
+                rule = "11";
+                if (after == CGJ) return false;
+
+                // Do not break between linking characters and letters, or before linking characters. This provides for Indic graphemes, where virama (halant) will link character clusters together.
+     
+                rule = "12";
+                //Link Extend* × LetterBase  (12) 
+                if (after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT) {
+                    // Look back past any run of Extend characters for a Link.
+                    int backOffset = findLastNon(source, offset, Extend, recommended);
+                    if (backOffset >= 0) {
+                        byte last = getResolvedType(UTF16.charAt(source, backOffset), recommended);
+                        if (last == Link) return false;
+                    }
+                }
+                
+                rule = "13";
+                if (after == Link) return false;
+            }
+
+            // Otherwise break after all characters.
+            rule = "14";
+            return true;
+
+        }
+            
+    }
 }
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java,v $
-* $Date: 2002/07/30 09:56:41 $
-* $Revision: 1.2 $
+* $Date: 2002/08/04 21:38:45 $
+* $Revision: 1.3 $
 *
 *******************************************************************************
 */
@ -23,20 +23,23 @@ public class GenerateThaiBreaks {
    
    BufferedReader br = new BufferedReader(
      new InputStreamReader(
-        new FileInputStream("\\icu4j\\src\\data\\thai6.ucs"), "UnicodeLittle"));
+        new FileInputStream("c:\\icu4j\\src\\com\\ibm\\icu\\dev\\data\\thai6.ucs"), "UnicodeLittle"));
    PrintWriter out = null;
    
    try {
        Default.setUCD();
-        UnicodeSet ignorables = new UnicodeSet(0xE30, 0xE3A);
+        UnicodeSet ignorables = new UnicodeSet(); 
+        /* new UnicodeSet(0xE30, 0xE3A);
        ignorables.add(0x0E40, 0x0E44); // add logical order exception
        ignorables.add(0x0E47, 0x0E4E);
+        */
        ignorables.add(0, ' '); // add controls
        ignorables.add('.');
        
-        Set initials = new TreeSet();
-        Set finals = new TreeSet();
-        Set medials = new TreeSet();
+        
+        UnicodeSet initials = new UnicodeSet();
+        UnicodeSet finals = new UnicodeSet();
+        UnicodeSet medials = new UnicodeSet();
        
        char[] buffer = new char[100];
        
@ -60,34 +63,58 @@ public class GenerateThaiBreaks {
            }
            
            initials.add(temp.substring(0,1));
-            initials.add(temp.substring(0,2));
-            finals.add(temp.substring(temp.length()-2));
+            //initials.add(temp.substring(0,2));
            finals.add(temp.substring(temp.length()-1));
+            //finals.add(temp.substring(temp.length()-1));
            
-            for (int i = 1; i < temp.length() - 3; ++i) {
-                medials.add(temp.substring(i, i+2));
+            for (int i = 1; i < temp.length() - 1; ++i) {
+                //medials.add(temp.substring(i, i+2));
                medials.add(temp.substring(i, i+1));
            }
-            medials.add(temp.substring(temp.length() - 2, temp.length() - 1));
+            //medials.add(temp.substring(temp.length() - 2, temp.length() - 1));
        }
        
        System.out.println("initials size: " + initials.size());
        System.out.println("finals size: " + finals.size());
        System.out.println("medials size: " + medials.size());
        
+        //out = Utility.openPrintWriter("ThaiData.txt", Utility.UTF8_WINDOWS);
+       // out.write('\uFEFF');
+       
+        UnicodeSet marks = new UnicodeSet("[[\u0e00-\u0e7f]&[[:mn:][:me:]]]");
+        finals.addAll(marks);
+        
+        UnicodeSet all = new UnicodeSet(initials).addAll(medials).addAll(finals);
+        
+        UnicodeSet missingThai = new UnicodeSet("[[\u0e00-\u0e7f]-[:Cn:]]").removeAll(all);
+        
+        System.out.println("Never occur: " + missingThai.toPattern(true));
+        Utility.showSetNames("", missingThai, true, Default.ucd);
+        System.out.println();
+        
+        UnicodeSet neverInitial = new UnicodeSet(all).removeAll(initials);
+        UnicodeSet neverFinal = new UnicodeSet(all).removeAll(finals);
+        
+        System.out.println("Never initial: " + neverInitial.toPattern(true));
+        Utility.showSetNames("", neverInitial, true, Default.ucd);
+        System.out.println();
+        
+        System.out.println("Never final: " + neverFinal.toPattern(true));
+        Utility.showSetNames("", neverFinal, true, Default.ucd);
+        System.out.println();
+        
        initials.removeAll(medials);
        finals.removeAll(medials);

        System.out.println("initials size: " + initials.size());
        System.out.println("finals size: " + finals.size());
        
-        out = Utility.openPrintWriter("ThaiData.txt", Utility.UTF8_WINDOWS);
-        out.write('\uFEFF');
-        out.println("Only Initials");
-        Utility.print(out, initials, ", ", new MyBreaker());
-        out.println();
-        out.println("Only Finals");
-        Utility.print(out, finals, ", ", new MyBreaker());
+        System.out.println("Only Initials" + initials.toPattern(true));
+        Utility.showSetNames("", initials, true, Default.ucd);
+        System.out.println();
+        
+        System.out.println("Only Finals" + finals.toPattern(true));
+        Utility.showSetNames("", finals, true, Default.ucd);
    } finally {
        br.close();
        if (out != null) out.close();
--- a/tools/unicodetools/com/ibm/text/UCD/Main.java
+++ b/tools/unicodetools/com/ibm/text/UCD/Main.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
-* $Date: 2002/07/30 09:56:41 $
-* $Revision: 1.19 $
+* $Date: 2002/08/04 21:38:45 $
+* $Revision: 1.20 $
 *
 *******************************************************************************
 */
@ -78,7 +78,7 @@ public final class Main implements UCD_Types {
            else if (arg.equalsIgnoreCase("TestNormalization")) TestNormalization.main(null);
            
            
-            else if (arg.equalsIgnoreCase("linebreaktest")) GenerateLineBreakTest.main(null);
+            else if (arg.equalsIgnoreCase("breaktest")) GenerateBreakTest.main(null);

            else if (arg.equalsIgnoreCase("genSplit")) GenerateData.genSplit();
            else if (arg.equalsIgnoreCase("iana")) IANANames.testSensitivity();
--- a/tools/unicodetools/com/ibm/text/UCD/UCD.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
-* $Date: 2002/07/30 09:56:40 $
-* $Revision: 1.16 $
+* $Date: 2002/08/04 21:38:45 $
+* $Revision: 1.17 $
 *
 *******************************************************************************
 */
@ -737,6 +737,10 @@ public final class UCD implements UCD_Types {
        return UCD_Names.NT[prop];
    }

+    /**
+     * Looks up the name of a numeric-type value in the requested style.
+     *
+     * @param prop  index into the name table
+     * @param style SHORT selects UCD_Names.SHORT_NT; any other value selects UCD_Names.NT
+     * @return the entry at prop in the selected table
+     */
+    public static String getNumericTypeID_fromIndex(byte prop, byte style) {
+        final String[] names = (style == SHORT) ? UCD_Names.SHORT_NT : UCD_Names.NT;
+        return names[prop];
+    }
+
    public String getEastAsianWidthID(int codePoint) {
        return getEastAsianWidthID_fromIndex(getEastAsianWidth(codePoint));
    }
@ -745,6 +749,10 @@ public final class UCD implements UCD_Types {
        return UCD_Names.EA[prop];
    }

+    /**
+     * Looks up the name of an East Asian width value in the requested style.
+     *
+     * @param prop  index into the name table
+     * @param style LONG selects UCD_Names.EA; any other value selects UCD_Names.SHORT_EA
+     * @return the entry at prop in the selected table
+     */
+    public static String getEastAsianWidthID_fromIndex(byte prop, byte style) {
+        final String[] names = (style != LONG) ? UCD_Names.SHORT_EA : UCD_Names.EA;
+        return names[prop];
+    }
+
    public String getLineBreakID(int codePoint) {
        return getLineBreakID_fromIndex(getLineBreak(codePoint));
    }
@ -753,6 +761,10 @@ public final class UCD implements UCD_Types {
        return UCD_Names.LB[prop];
    }

+    /**
+     * Looks up the name of a line-break value in the requested style.
+     *
+     * @param prop  index into the name table
+     * @param style LONG selects UCD_Names.LONG_LB; any other value selects UCD_Names.LB
+     * @return the entry at prop in the selected table
+     */
+    public static String getLineBreakID_fromIndex(byte prop, byte style) {
+        final String[] names = (style != LONG) ? UCD_Names.LB : UCD_Names.LONG_LB;
+        return names[prop];
+    }
+
    public String getJoiningTypeID(int codePoint) {
        return getJoiningTypeID_fromIndex(getJoiningType(codePoint));
    }
@ -761,6 +773,10 @@ public final class UCD implements UCD_Types {
        return UCD_Names.JOINING_TYPE[prop];
    }

+    /**
+     * Looks up the name of a joining-type value in the requested style.
+     *
+     * @param prop  index into the name table
+     * @param style LONG selects UCD_Names.LONG_JOINING_TYPE; any other value
+     *              selects UCD_Names.JOINING_TYPE
+     * @return the entry at prop in the selected table
+     */
+    public static String getJoiningTypeID_fromIndex(byte prop, byte style) {
+        final String[] names = (style != LONG) ? UCD_Names.JOINING_TYPE : UCD_Names.LONG_JOINING_TYPE;
+        return names[prop];
+    }
+
    public String getJoiningGroupID(int codePoint) {
        return getJoiningGroupID_fromIndex(getJoiningGroup(codePoint));
    }
@ -769,6 +785,11 @@ public final class UCD implements UCD_Types {
        return UCD_Names.JOINING_GROUP[prop];
    }

+    public static String getJoiningGroupID_fromIndex(byte prop, byte style) {
+        // There is no short version of these names, so style is ignored.
+        return UCD_Names.JOINING_GROUP[prop];
+    }
+
    public String getScriptID(int codePoint) {
        return getScriptID_fromIndex(getScript(codePoint));
    }
@ -790,6 +811,11 @@ public final class UCD implements UCD_Types {
        return UCD_Names.AGE[prop];
    }

+    public static String getAgeID_fromIndex(byte prop, byte style) {
+        // There is no short form of these names, so style is ignored.
+        return UCD_Names.AGE[prop];
+    }
+
    public String getBinaryPropertiesID(int codePoint, byte bit) {
        return (getBinaryProperties(codePoint) & (1<<bit)) != 0 ? "Y" : "N";
    }
--- a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
-* $Date: 2002/07/30 09:56:40 $
-* $Revision: 1.14 $
+* $Date: 2002/08/04 21:38:45 $
+* $Revision: 1.15 $
 *
 *******************************************************************************
 */
@ -22,6 +22,8 @@ public interface UCD_Types {
    public static final String BIN_DIR = BASE_DIR + "BIN\\";
    public static final String GEN_DIR = BASE_DIR + "GEN\\";
    
+    public static final char DOTTED_CIRCLE = '\u25CC';
+    
    public static final int 
    	CJK_BASE = 0x4E00,
    	CJK_LIMIT = 0x9FFF+1,
@ -166,7 +168,10 @@ public interface UCD_Types {
        CONTROL_MASK = (1<<Cc) | (1<<Cf) | (1<<Cs) | (1<<Co),
        PUNCTUATION_MASK = (1<<Pc) | (1<<Pd) | (1<<Ps) | (1<<Pe) | (1<<Po) | (1<<Pi) | (1<<Pf),
        SYMBOL_MASK = (1<<Sm) | (1<<Sc) | (1<<Sk) | (1<<So),
-        UNASSIGNED_MASK = (1<<Cn);
+        UNASSIGNED_MASK = (1<<Cn),
+        BASE_MASK = LETTER_MASK | NUMBER_MASK | PUNCTUATION_MASK | SYMBOL_MASK | (1<<Mc),
+        NONSPACING_MARK_MASK = (1<<Mn) | (1<<Me);
+        

 	// Binary Properties

--- a/tools/unicodetools/com/ibm/text/UCD/UnicodeProperty.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UnicodeProperty.java
@ -148,7 +148,7 @@ public abstract class UnicodeProperty implements UCD_Types {
      /**
       * Does it have the propertyValue?
       */
-      abstract boolean hasValue(int cp);
+      abstract public boolean hasValue(int cp);
      
      /**
       * Get the set of characters it contains
--- a/tools/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java,v $
-* $Date: 2002/07/03 02:15:47 $
-* $Revision: 1.8 $
+* $Date: 2002/08/04 21:38:44 $
+* $Revision: 1.9 $
 *
 *******************************************************************************
 */
@ -299,26 +299,14 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
            case COMBINING_CLASS>>8: return ucd.getCombiningClassID_fromIndex((byte)propValue, style);
            case BIDI_CLASS>>8: return ucd.getBidiClassID_fromIndex((byte)propValue, style);
            case DECOMPOSITION_TYPE>>8: return ucd.getDecompositionTypeID_fromIndex((byte)propValue, style);
-            case NUMERIC_TYPE>>8: if (propValue >= LIMIT_NUMERIC_TYPE) break;
-                if (style != SHORT) return ucd.getNumericTypeID_fromIndex((byte)propValue);
-                return UCD_Names.SHORT_NT[propValue];
-            case EAST_ASIAN_WIDTH>>8: if (propValue >= LIMIT_EAST_ASIAN_WIDTH) break;
-                if (style != LONG) return ucd.getEastAsianWidthID_fromIndex((byte)propValue);
-                return UCD_Names.SHORT_EA[propValue];
-            case LINE_BREAK>>8:  if (propValue >= LIMIT_LINE_BREAK) break;
-                if (style != LONG) return ucd.getLineBreakID_fromIndex((byte)propValue);
-                return UCD_Names.LONG_LB[propValue];
-            case JOINING_TYPE>>8: if (propValue >= LIMIT_JOINING_TYPE) break;
-                if (style != LONG) return ucd.getJoiningTypeID_fromIndex((byte)propValue);
-                return UCD_Names.LONG_JOINING_TYPE[propValue];
-            case JOINING_GROUP>>8: if (propValue >= LIMIT_JOINING_GROUP) break;
-                return ucd.getJoiningGroupID_fromIndex((byte)propValue);
+            case NUMERIC_TYPE>>8: return ucd.getNumericTypeID_fromIndex((byte)propValue, style); // must return, or control falls into the EAST_ASIAN_WIDTH case
+            case EAST_ASIAN_WIDTH>>8: return ucd.getEastAsianWidthID_fromIndex((byte)propValue);
+            case LINE_BREAK>>8: return ucd.getLineBreakID_fromIndex((byte)propValue, style);
+            case JOINING_TYPE>>8: return ucd.getJoiningTypeID_fromIndex((byte)propValue, style); // pass style so LONG selects the long joining-type names
+            case JOINING_GROUP>>8: return ucd.getJoiningGroupID_fromIndex((byte)propValue);
            case BINARY_PROPERTIES>>8: return ucd.getBinaryPropertiesID_fromIndex((byte)propValue, style);
-            case SCRIPT>>8: if (propValue >= LIMIT_SCRIPT) break;
-                if (style != SHORT) return ucd.getScriptID_fromIndex((byte)propValue);
-                return UCD_Names.ABB_SCRIPT[propValue];
-            case AGE>>8: if (propValue >= LIMIT_AGE) break;
-                return ucd.getAgeID_fromIndex((byte)propValue);
+            case SCRIPT>>8: return ucd.getScriptID_fromIndex((byte)propValue);
+            case AGE>>8: return ucd.getAgeID_fromIndex((byte)propValue);
                /*
            case DERIVED>>8:
                UnicodeProperty up = DerivedProperty.make(propValue, ucd);
--- a/tools/unicodetools/com/ibm/text/utility/Utility.java
+++ b/tools/unicodetools/com/ibm/text/utility/Utility.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
-* $Date: 2002/07/30 09:56:41 $
-* $Revision: 1.23 $
+* $Date: 2002/08/04 21:38:44 $
+* $Revision: 1.24 $
 *
 *******************************************************************************
 */
@ -17,9 +17,10 @@ import java.util.*;
 import java.text.*;
 import java.io.*;
 import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UTF16;
 import com.ibm.text.UCD.*;

-public final class Utility {    // COMMON UTILITIES
+public final class Utility implements UCD_Types {    // COMMON UTILITIES

    static final boolean UTF8 = true; // TODO -- make argument
    
@ -470,6 +471,21 @@ public final class Utility {    // COMMON UTILITIES
    	return quoteXML(source, false);
    }
    
+    private static UnicodeProperty defaultIgnorable = null;
+    
+    /**
+     * Builds a printable stand-in for a code point.
+     *
+     * @param cp code point to display
+     * @return DOTTED_CIRCLE followed by the character when its category is Mn or Me;
+     *         the single character U+25A1 when its category is Cf or Cc, when it is
+     *         U+034F, U+00AD or U+1806, or when the DefaultIgnorable property holds
+     *         for it; otherwise the character itself
+     */
+    public static String getDisplay(int cp) {
+        final String text = UTF16.valueOf(cp);
+        final byte category = Default.ucd.getCategory(cp);
+
+        // Marks are shown on top of a dotted circle.
+        if (category == Mn || category == Me) {
+            return String.valueOf(DOTTED_CIRCLE) + text;
+        }
+
+        final boolean listedCodePoint = cp == 0x034F || cp == 0x00AD || cp == 0x1806;
+        if (category == Cf || category == Cc || listedCodePoint) {
+            return "\u25A1";
+        }
+
+        // The property object is created on first need and kept for later calls.
+        if (defaultIgnorable == null) {
+            defaultIgnorable = DerivedProperty.make(DefaultIgnorable);
+        }
+        return defaultIgnorable.hasValue(cp) ? "\u25A1" : text;
+    }
    
    public static int compare(char[] a, int aStart, int aEnd, char[] b, int bStart, int bEnd) {
        while (aStart < aEnd && bStart < bEnd) {