updated for 4.0

X-SVN-Rev: 11161
2003-02-25 23:38:23 +00:00 · 2003-02-25 23:38:23 +00:00 · 07a8be151c
commit 07a8be151c
parent c31688a777
24 changed files with 1610 additions and 367 deletions
--- a/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java
+++ b/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $
-* $Date: 2002/10/05 01:28:58 $
-* $Revision: 1.9 $
+* $Date: 2003/02/25 23:38:23 $
+* $Revision: 1.10 $
 *
 *******************************************************************************
 */
@ -718,6 +718,8 @@ public final class ConvertUCD implements UCD_Types {

    static Set jtSet = new TreeSet();
    static Set jgSet = new TreeSet();
+    
+    static final boolean SHOW_SAMPLE = false;

    /** Adds the character data. Signals duplicates with an exception
     */
@ -725,6 +727,11 @@ public final class ConvertUCD implements UCD_Types {
        //if (cp < 10) System.out.println("A: " + Utility.hex(cp) + ", " + key + ", " + Utility.quoteJavaString(value));
        UData charEntry = getEntry(cp);
        //if (cp < 10) System.out.println("   " + charEntry);
+        
+        if (SHOW_SAMPLE && cp == 0x221) {
+            System.out.println("Sample: " + cp + ", " + key + ", " + value);
+            System.out.println(charEntry);
+        }

        if (key.equals("bm")) {
            if (value.equals("Y")) charEntry.binaryProperties |= 1;
@ -780,6 +787,11 @@ public final class ConvertUCD implements UCD_Types {
        } else {
            setField(charEntry, key, value);
        }
+        if (SHOW_SAMPLE && cp == 0x221) {
+            System.out.println("Sample Result:");
+            System.out.println(charEntry);
+        }
+        
    }

    static public void setField(UData uData, String fieldName, String fieldValue) {
--- a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java
+++ b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
-* $Date: 2002/08/04 21:38:45 $
-* $Revision: 1.17 $
+* $Date: 2003/02/25 23:38:23 $
+* $Revision: 1.18 $
 *
 *******************************************************************************
 */
@ -14,11 +14,20 @@
 package com.ibm.text.UCD;
 import com.ibm.text.utility.*;
 import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
 import java.util.*;
+import java.io.PrintWriter;

 public final class DerivedProperty implements UCD_Types {
  
    UCD ucdData;
+    Normalizer nfc;
+    Normalizer nfd;
+    Normalizer nfkc;
+    Normalizer nfkd;
+    Normalizer[] nf = new Normalizer[4];
+    UnicodeSet XID_Start_Set = new UnicodeSet();
+    UnicodeSet XID_Continue_Set = new UnicodeSet();
    
    // ADD CONSTANT to UCD_TYPES
    
@ -33,9 +42,6 @@ public final class DerivedProperty implements UCD_Types {
    }
    
    ///////////////////////////////////////////////////////////
-    private DerivedProperty(UCD ucd) {
-      ucdData = ucd;
-    }
    
    static Map cache = new HashMap();
    static UCD lastUCD = null;
@ -101,7 +107,7 @@ public final class DerivedProperty implements UCD_Types {
        Normalizer nfx;
        ExDProp(int i) {
            type = DERIVED_NORMALIZATION;
-            nfx = Default.nf[i];
+            nfx = nf[i];
            name = "Expands_On_" + nfx.getName();
            shortName = "XO_" + nfx.getName();
            header = "# Derived Property: " + name
@ -125,7 +131,7 @@ public final class DerivedProperty implements UCD_Types {
        NF_UnsafeStartProp(int i) {
            isStandard = false;
            type = DERIVED_NORMALIZATION;
-            nfx = Default.nf[i];
+            nfx = nf[i];
            name = nfx.getName() + "_UnsafeStart";
            shortName = nfx.getName() + "_SS";
            header = "# Derived Property: " + name
@ -144,6 +150,35 @@ public final class DerivedProperty implements UCD_Types {
        }
    };
    
+
+    /*
+    class HangulSyllableType extends UnicodeProperty {
+        Normalizer nfx;
+        //int prop;
+        
+        HangulSyllableType(int i) {
+            isStandard = false;
+            type = DERIVED_NORMALIZATION;
+            nfx = nf[i];
+            name = nfx.getName() + "_UnsafeStart";
+            shortName = nfx.getName() + "_SS";
+            header = "# Derived Property: " + name
+                + "\r\n#   Generated according to UAX #15."
+                + "\r\n#   Characters that are cc==0, BUT which may interact with previous characters."
+                ;
+        }
+        public boolean hasValue(int cp) {
+            if (ucdData.getCombiningClass(cp) != 0) return false;
+            String norm = nfx.normalize(cp);
+            int first = UTF16.charAt(norm, 0);
+            if (ucdData.getCombiningClass(first) != 0) return true;
+            if (nfx.isComposition()
+                && dprops[NFC_TrailingZero].hasValue(first)) return true; // 1,3 == composing
+            return false;
+        }
+    };
+    */
+    
    
    class NFC_Prop extends UnicodeProperty {
        BitSet bitset;
@ -161,7 +196,7 @@ public final class DerivedProperty implements UCD_Types {
                case NFC_TrailingNonZero: bitsets[1] = bitset = new BitSet(); break;
            }
            filter = bitsets[1] != null;
-            Default.nfc.getCompositionStatus(bitsets[0], bitsets[1], bitsets[2]);
+            nfc.getCompositionStatus(bitsets[0], bitsets[1], bitsets[2]);
            
            name = Names[i-NFC_Leading];
            shortName = SNames[i-NFC_Leading];
@ -197,17 +232,17 @@ public final class DerivedProperty implements UCD_Types {
            isStandard = false;
            setValueType(NON_ENUMERATED);
            type = DERIVED_NORMALIZATION;
-            nfx = Default.nf[i];
+            nfx = nf[i];
            name = nfx.getName();
            String compName = "the character itself";
            
            if (i == NFKC || i == NFD) {
                name += "-NFC";
-                nfComp = Default.nfc;
+                nfComp = nfc;
                compName = "NFC for the character";
            } else if (i == NFKD) {
                name += "-NFD";
-                nfComp = Default.nfd;
+                nfComp = nfd;
                compName = "NFD for the character";
            }
            header = "# Derived Property: " + name              
@ -273,7 +308,7 @@ public final class DerivedProperty implements UCD_Types {
        QuickDProp (int i) {
            setValueType((i == NFC || i == NFKC) ? ENUMERATED : BINARY);
            type = DERIVED_NORMALIZATION;
-            nfx = Default.nf[i];
+            nfx = nf[i];
            NO = nfx.getName() + "_NO";
            MAYBE = nfx.getName() + "_MAYBE";
            name = nfx.getName() + "_QuickCheck";
@ -297,7 +332,14 @@ public final class DerivedProperty implements UCD_Types {
        public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
    };

-    {
+    private DerivedProperty(UCD ucd) {
+        ucdData = ucd;
+    
+        nfd = nf[NFD] = new Normalizer(Normalizer.NFD, ucdData.getVersion());
+        nfc = nf[NFC] = new Normalizer(Normalizer.NFC, ucdData.getVersion());
+        nfkd = nf[NFKD] = new Normalizer(Normalizer.NFKD, ucdData.getVersion());
+        nfkc = nf[NFKC] = new Normalizer(Normalizer.NFKC, ucdData.getVersion());
+
        for (int i = ExpandsOnNFD; i <= ExpandsOnNFKC; ++i) {
            dprops[i] = new ExDProp(i-ExpandsOnNFD);
        }
@ -321,10 +363,10 @@ public final class DerivedProperty implements UCD_Types {
                shortName = "IDS";
                header = "# Derived Property: " + name
                    + "\r\n#  Characters that can start an identifier."
-                    + "\r\n#  Generated from Lu+Ll+Lt+Lm+Lo+Nl";
+                    + "\r\n#  Generated from Lu+Ll+Lt+Lm+Lo+Nl+ID_Start_Exceptions";
            }
            public boolean hasValue(int cp) {
-                return ucdData.isIdentifierStart(cp, false);
+                return ucdData.isIdentifierStart(cp);
            }
        };
        
@ -339,10 +381,65 @@ public final class DerivedProperty implements UCD_Types {
                    + "\r\n#  NOTE: Cf characters should be filtered out.";
            }
            public boolean hasValue(int cp) {
-                return ucdData.isIdentifierContinue_NO_Cf(cp, false);
+                return ucdData.isIdentifierContinue_NO_Cf(cp);
            }
        };
        
+        // Fill XID_Start_Set / XID_Continue_Set: take each assigned code point's plain
+        // identifier status and reconcile it with the status of its NFKD form.
+        StringBuffer tempBuf = new StringBuffer();
+        
+        // special hack for middle dot
+        XID_Continue_Set.add(0x00B7);
+        
+        // status values: 0 = neither, 1 = identifier start, 2 = identifier continue only
+        for (int cp = 0; cp <= 0x10FFFF; ++cp) {  // inclusive bound: U+10FFFF is the last code point
+            // skip cases that can't matter
+            if (!ucdData.isAssigned(cp)) continue;
+            
+            // find out normal status
+            int status = 0;
+            if (ucdData.isIdentifierStart(cp)) status = 1;
+            else if (ucdData.isIdentifierContinue_NO_Cf(cp)) status = 2;
+            
+            if (status != 0 && !nfkd.isNormalized(cp)) {
+                // now find out NFKD status
+                // if it is <start><extend>*, then it is start
+                // else if it is <extend>*, then it is extend
+                // else it is nothing
+                int status2 = 0;
+                tempBuf.setLength(0);
+                nfkd.normalize(UTF32.valueOf32(cp), tempBuf);
+                // step by the width of the code point just read (cp2), not of cp, so a
+                // decomposition mixing BMP and supplementary characters is walked correctly
+                for (int i = 0, cp2; i < tempBuf.length(); i += UTF32.count16(cp2)) {
+                    cp2 = UTF32.char32At(tempBuf, i);
+                    if (i == 0) {
+                        if (ucdData.isIdentifierStart(cp2)) status2 = 1;
+                        else if (ucdData.isIdentifierContinue_NO_Cf(cp2)) status2 = 2;
+                        else {
+                            status2 = 0;
+                            break;
+                        }
+                    } else if (!ucdData.isIdentifierContinue_NO_Cf(cp2) && cp2 != 0xB7) {
+                        status2 = 0;
+                        break;
+                    }
+                }
+                
+                // Now see if the statuses are compatible.
+                if (status != status2) {
+                    // an NFKD form that is no identifier drops cp entirely; otherwise
+                    // keep the larger status value (continue-only wins over start)
+                    if (status2 == 0) status = 0;
+                    else if (status2 > status) status = status2;
+                }
+            }
+            
+            if (status == 1) XID_Start_Set.add(cp);
+            if (status != 0) XID_Continue_Set.add(cp);
+        }
+        
        dprops[Mod_ID_Start] = new UnicodeProperty() {
            {
                type = DERIVED_CORE;
@ -355,7 +452,7 @@ public final class DerivedProperty implements UCD_Types {
                    + "\r\n#        Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
            }
            public boolean hasValue(int cp) {
-                return ucdData.isIdentifierStart(cp, true);
+                return XID_Start_Set.contains(cp);
            }
        };
        
@ -372,7 +469,7 @@ public final class DerivedProperty implements UCD_Types {
                    + "\r\n#        Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
            }
            public boolean hasValue(int cp) {
-                return ucdData.isIdentifierContinue_NO_Cf(cp, true);
+                return XID_Continue_Set.contains(cp);
            }
        };
        
@ -458,7 +555,6 @@ of characters, the first of which has a non-zero combining class.
                shortName = "Comp_Ex";
                defaultValueStyle = defaultPropertyStyle = SHORT;
                header = "# Derived Property: " + name
-                    + ": Full Composition Exclusion"
                    + "\r\n#  Generated from: Composition Exclusions + Singletons + Non-Starter Decompositions";
            }
            public boolean hasValue(int cp) {
@ -469,6 +565,9 @@ of characters, the first of which has a non-zero combining class.
                if (isCompEx(cp)) return true;
                return false;
            }
+		    /*public String getListingValue(int cp) {
+    		    return "Comp_Ex";
+    	    }*/
            /*
 			public String getListingValue(int cp) {
        		if (getValueType() != BINARY) return getValue(cp, SHORT);
@ -511,8 +610,8 @@ of characters, the first of which has a non-zero combining class.
            }
            public String getValue(int cp, byte style) { 
                if (!ucdData.isRepresented(cp)) return "";
-                String b = Default.nfkc.normalize(fold(cp));
-                String c = Default.nfkc.normalize(fold(b));
+                String b = nfkc.normalize(fold(cp));
+                String c = nfkc.normalize(fold(b));
                if (c.equals(b)) return "";
                return "FNC; " + Utility.hex(c);
            } // default
@ -533,8 +632,8 @@ of characters, the first of which has a non-zero combining class.
            }
            public String getValue(int cp, byte style) { 
                if (!ucdData.isRepresented(cp)) return "";
-                String b = Default.nfc.normalize(fold(cp));
-                String c = Default.nfc.normalize(fold(b));
+                String b = nfc.normalize(fold(cp));
+                String c = nfc.normalize(fold(b));
                if (c.equals(b)) return "";
                return "FN; " + Utility.hex(c);
            } // default
@ -565,6 +664,94 @@ of characters, the first of which has a non-zero combining class.
            }
        };

+        // Case_Sensitive: code points on either side of any simple or full lower/upper/title/fold
+        // mapping. The set is built on the first hasValue() call, which also writes Case_Sensitive_Log.txt.
+        dprops[Case_Sensitive] = new UnicodeProperty() {
+            {
+                type = DERIVED_CORE;
+                isStandard = false;
+                name = "Case_Sensitive";
+                hasUnassigned = false;
+                shortName = "CS";
+                header = "# Derived Property: " + name
+                    + "\r\n#  Generated from all characters that are either on the right or left side of a case mapping";
+            }
+            
+            UnicodeSet case_sensitive = null;       // lazily built result; null until a build succeeds
+            UnicodeSet tempSet = new UnicodeSet();  // scratch set reused by addCase
+            UnicodeSet cased = null;                // Lowercase + Uppercase + Lt, used for the log report
+            PrintWriter log;
+            
+            // Adds cps and its (c1, c2) case mapping to case_sensitive when the two differ,
+            // logging any newly added code points that are not in cased.
+            private void addCase(String cps, byte c1, byte c2) {
+                String temp = ucdData.getCase(cps, c1, c2);
+                if (temp.equals(cps)) return;
+                
+                tempSet.clear();
+                tempSet.addAll(cps);
+                tempSet.addAll(temp);
+                if (!case_sensitive.containsAll(tempSet)) {
+                    tempSet.removeAll(case_sensitive);
+                    if (!cased.containsAll(tempSet)) {
+                        log.println();
+                        log.println("Adding " + tempSet + " because of: ");
+                        log.println("\t" + ucdData.getCodeAndName(cps));
+                        log.println("=>\t" + ucdData.getCodeAndName(temp));
+                    }
+                    case_sensitive.addAll(tempSet);
+                }
+            }
+            
+            public boolean hasValue(int cp) {
+                if (case_sensitive == null) {
+                    try {
+                        log = Utility.openPrintWriter("Case_Sensitive_Log.txt", Utility.UTF8_UNIX);
+                        System.out.println("Building Case-Sensitive cache");
+                        case_sensitive = new UnicodeSet();
+                        // union into a fresh set so the sets returned by getSet() are never modified
+                        cased = new UnicodeSet()
+                            .addAll(DerivedProperty.make(PropLowercase, ucdData).getSet())
+                            .addAll(DerivedProperty.make(PropUppercase, ucdData).getSet())
+                            .addAll(UnifiedBinaryProperty.make(CATEGORY | Lt).getSet());
+                        for (int c = 0; c <= 0x10FFFF; ++c) {
+                            Utility.dot(c);
+                            // skip cases that can't matter
+                            if (!ucdData.isAssigned(c)) continue;
+                            
+                            String cps = UTF16.valueOf(c);
+                            addCase(cps, FULL, LOWER);
+                            addCase(cps, FULL, UPPER);
+                            addCase(cps, FULL, TITLE);
+                            addCase(cps, FULL, FOLD);
+                            addCase(cps, SIMPLE, LOWER);
+                            addCase(cps, SIMPLE, UPPER);
+                            addCase(cps, SIMPLE, TITLE);
+                            addCase(cps, SIMPLE, FOLD);
+                        }
+                        Utility.fixDot();
+                        UnicodeSet temp;
+                        log.println("Cased, but not Case_Sensitive");
+                        temp = new UnicodeSet().addAll(cased).removeAll(case_sensitive);
+                        Utility.showSetNames(log, "", temp, false, false, ucdData);
+                        log.println("Case_Sensitive, but not Cased");
+                        temp = new UnicodeSet().addAll(case_sensitive).removeAll(cased);
+                        Utility.showSetNames(log, "", temp, false, false, ucdData);
+                        log.println("Both Case_Sensitive, and Cased");
+                        temp = new UnicodeSet().addAll(case_sensitive).retainAll(cased);
+                        log.println(temp);
+                        System.out.println("Done Building Case-Sensitive cache");
+                    } catch (Exception e) {
+                        case_sensitive = null;  // do not leave a partial cache for later calls to use
+                        throw new ChainException("internal error", null, e);
+                    } finally {
+                        if (log != null) log.close();  // close the log on success and on failure
+                    }
+                }
+                return case_sensitive.contains(cp);  
+            }
+        };
+
        dprops[Other_Case_Ignorable] = new UnicodeProperty() {
            {
                name = "Other_Case_Ignorable";
@ -602,8 +789,8 @@ of characters, the first of which has a non-zero combining class.
            }
            public boolean hasValue(int cp) {
                if (hasSoftDot(cp)) return true;
-                if (Default.nfkd.isNormalized(cp)) return false;
-                String decomp = Default.nfd.normalize(cp);
+                if (nfkd.isNormalized(cp)) return false;
+                String decomp = nfd.normalize(cp);
                boolean ok = false;
                for (int i = decomp.length()-1; i >= 0; --i) {
                    int ch = UTF16.charAt(decomp, i);
@ -650,16 +837,19 @@ of characters, the first of which has a non-zero combining class.
                name = "Grapheme_Extend";
                shortName = "GrExt";
                header = header = "# Derived Property: " + name
-                    + "\r\n#  Generated from: Me + Mn + Mc + Other_Grapheme_Extend - Grapheme_Link - CGJ"
-                    + "\r\n#  (CGJ = U+034F)";
+                    + "\r\n#  Generated from: Me + Mn + Other_Grapheme_Extend"
+                    + "\r\n#  Note: depending on an application's interpretation of Co (private use),"
+                    + "\r\n#  they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither."                    
+                    ;
                     
            }
            public boolean hasValue(int cp) {
-            	if (cp == 0x034F) return false;
-                if (ucdData.getBinaryProperty(cp, GraphemeLink)) return false;
+            	//if (cp == 0x034F) return false;
+                //if (ucdData.getBinaryProperty(cp, GraphemeLink)) return false;
+                // || cat == Mc
                byte cat = ucdData.getCategory(cp);
-                if (cat == Me || cat == Mn || cat == Mc
-                || ucdData.getBinaryProperty(cp,Other_GraphemeExtend)) return true;
+                if (cat == Me || cat == Mn
+                        || ucdData.getBinaryProperty(cp,Other_GraphemeExtend)) return true;
                return false;
            }
        };
@ -671,14 +861,16 @@ of characters, the first of which has a non-zero combining class.
                shortName = "GrBase";
                
                header = header = "# Derived Property: " + name
-                    + "\r\n#  Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp"
-                    + "\r\n#    - Grapheme_Extend - Grapheme_Link - CGJ";
+                    + "\r\n#  Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend"
+                    + "\r\n#  Note: depending on an application's interpretation of Co (private use),"
+                    + "\r\n#  they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither."                    
+                    ;
            }
            public boolean hasValue(int cp) {
-            	if (cp == 0x034F) return false;
+            	//if (cp == 0x034F) return false;
                byte cat = ucdData.getCategory(cp);
-                if (cat == Cc || cat == Cf || cat == Cs || cat == Co || cat == Cn || cat == Zl || cat == Zp
-                || ucdData.getBinaryProperty(cp,GraphemeLink)) return false;
+                if (cat == Cc || cat == Cf || cat == Cs || cat == Co || cat == Cn || cat == Zl || cat == Zp) return false;
+                // || ucdData.getBinaryProperty(cp,GraphemeLink)
                if (dprops[GraphemeExtend].hasValue(cp)) return false;
                return true;
            }
@ -702,11 +894,11 @@ of characters, the first of which has a non-zero combining class.
            || ucdData.getBinaryProperty(cp, Other_Lowercase)) return Ll;
        if (cat == Lt || cat == Lo || cat == Lm || cat == Nl) return cat;
        
-       // if (true) throw new IllegalArgumentException("FIX Default.nf[2]");
+       // if (true) throw new IllegalArgumentException("FIX nf[2]");
        
-        if (Default.nf[NFKD].isNormalized(cp)) return Lo;
+        if (nf[NFKD].isNormalized(cp)) return Lo;

-        String norm = Default.nf[NFKD].normalize(cp);
+        String norm = nf[NFKD].normalize(cp);
        int cp2;
        boolean gotUpper = false;
        boolean gotLower = false;
--- a/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java
+++ b/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java,v $
-* $Date: 2002/06/22 01:21:09 $
-* $Revision: 1.7 $
+* $Date: 2003/02/25 23:38:23 $
+* $Revision: 1.8 $
 *
 *******************************************************************************
 */
@ -31,6 +31,7 @@ class DiffPropertyLister extends PropertyLister {
        }
        breakByCategory = property != NOPROPERTY;
        useKenName = false;
+        usePropertyComment = false;
    }
    
    public DiffPropertyLister(String oldUCDName, String newUCDName, PrintWriter output) {
@ -61,20 +62,27 @@ class DiffPropertyLister extends PropertyLister {
 	
    public String optionalComment(int cp) {
    	String normal = super.optionalComment(cp);
-        return oldUCD.getModCatID_fromIndex(
-        	oldUCD.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : 0))
-        	+ "/" + normal;
+    	if (oldUCD != null && breakByCategory) {
+    	    byte modCat = oldUCD.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : 0);
+            normal = oldUCD.getModCatID_fromIndex(modCat) + "/" + normal;
+        }
+        return normal;
    }

 	
+    // Delegates to ucdData.getModCat; the mask argument is CASED_LETTER_MASK when
+    // breakByCategory is set and -1 otherwise.
+    byte getModCat(int cp) {
+        return ucdData.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : -1);
+    }
+	

    public byte status(int cp) {
    	if (newProp == null) {
        	if (ucdData.isAllocated(cp) && (oldUCD == null || !oldUCD.isAllocated(cp))) {
    	        set.add(cp);
        	    return INCLUDE;
-        	}
-        	else {
+        	} else {
        	    return EXCLUDE;
        	}
    	}
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $
-* $Date: 2002/08/09 23:56:24 $
-* $Revision: 1.2 $
+* $Date: 2003/02/25 23:38:23 $
+* $Revision: 1.3 $
 *
 *******************************************************************************
 */
@ -83,14 +83,14 @@ abstract public class GenerateBreakTest implements UCD_Types {
        Default.setUCD();
    }
        
-    static UnicodeSet extraAlpha = new UnicodeSet("[\\u02B9-\\u02BA\\u02C2-\\u02CF\\u02D2-\\u02DF\\u02E5\\u02ED\\u05F3]");
+    static UnicodeSet extraAlpha = new UnicodeSet("[\\u02B9-\\u02BA\\u02C2-\\u02CF\\u02D2-\\u02DF\\u02E5-\\u02ED\\u05F3]");
    static UnicodeSet alphabeticSet = UnifiedBinaryProperty.make(DERIVED | PropAlphabetic).getSet()
        .addAll(extraAlpha);
        
    static UnicodeSet ideographicSet = UnifiedBinaryProperty.make(BINARY_PROPERTIES | Ideographic).getSet();
    
    static {
-        System.out.println("alphabetic: " + alphabeticSet.toPattern(true));
+        if (false) System.out.println("alphabetic: " + alphabeticSet.toPattern(true));
    }
    

@ -116,16 +116,16 @@ abstract public class GenerateBreakTest implements UCD_Types {
            PrintWriter systemPrintWriter = new PrintWriter(System.out);
            gwb.printLine(systemPrintWriter, "n\u0308't", true, true, false);
            systemPrintWriter.flush();
-        }
-        
-        if (false) {
-            GenerateSentenceBreakTest foo = new GenerateSentenceBreakTest();
-            foo.isBreak("(\"Go.\") (He did)", 5, true);
-        
            showSet("sepSet", GenerateSentenceBreakTest.sepSet);
            showSet("atermSet", GenerateSentenceBreakTest.atermSet);
            showSet("termSet", GenerateSentenceBreakTest.termSet);
        }
+        
+        if (true) {
+            GenerateSentenceBreakTest foo = new GenerateSentenceBreakTest();
+            //foo.isBreak("(\"Go.\") (He did)", 5, true);
+            foo.isBreak("3.4", 2, true);
+        }

        new GenerateSentenceBreakTest().run();
        
@ -276,7 +276,7 @@ abstract public class GenerateBreakTest implements UCD_Types {

        PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest.html", Utility.UTF8_WINDOWS);
        out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title>"
-            + fileName + "</title></head>");
+            + fileName + " Break Chart</title></head>");
        out.println("<body bgcolor='#FFFFFF'><h3>Current:</h3>");


@ -304,7 +304,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
        PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest" 
            + (recommended & recommendedDiffers() ? "_NEW" : "")
            + (shortVersion ? "_SHORT" : "")
-            + ".txt", Utility.LATIN1_WINDOWS);
+            + ".txt", Utility.UTF8_WINDOWS);
        int counter = 0;

        out.println("# Default " + fileName + " Break Test");
@ -623,6 +623,60 @@ abstract public class GenerateBreakTest implements UCD_Types {
    }


+    static public class Context {   // result holder filled in by getGraphemeBases
+        public int cpBefore2, cpBefore, cpAfter, cpAfter2;   // code points around the offset
+        public byte tBefore2, tBefore, tAfter, tAfter2;      // resolved type of each code point
+        public String toString() {
+            return new StringBuffer("[")
+                .append(Utility.hex(cpBefore2)).append("(").append(tBefore2).append("), ")
+                .append(Utility.hex(cpBefore)).append("(").append(tBefore).append("), ")
+                .append(Utility.hex(cpAfter)).append("(").append(tAfter).append("), ")
+                .append(Utility.hex(cpAfter2)).append("(").append(tAfter2).append(")]").toString();
+        }
+    }
+
+    /** Records in context up to two bases before and two after offset (as returned by
+     *  MyBreakIterator.previousBase()/nextBase()) together with their resolved types,
+     *  skipping any whose resolved type equals ignoreType. Unfilled slots stay at -1. */
+    public void getGraphemeBases(String source, int offset, boolean recommended, byte ignoreType, Context context) {
+        context.cpBefore2 = context.cpBefore = context.cpAfter = context.cpAfter2 = -1;
+        context.tBefore2 = context.tBefore = context.tAfter = context.tAfter2 = -1;
+
+        MyBreakIterator iter = new MyBreakIterator();
+        int cp;
+
+        // backward scan: first kept base goes to cpBefore, the next one to cpBefore2
+        iter.set(source, offset);
+        while ((cp = iter.previousBase()) != -1) {
+            byte type = getResolvedType(cp, recommended);
+            if (type == ignoreType) continue;
+
+            if (context.cpBefore != -1) {
+                context.cpBefore2 = cp;
+                context.tBefore2 = type;
+                break;
+            }
+            context.cpBefore = cp;
+            context.tBefore = type;
+        }
+
+        // forward scan: first kept base goes to cpAfter, the next one to cpAfter2
+        iter.set(source, offset);
+        while ((cp = iter.nextBase()) != -1) {
+            byte type = getResolvedType(cp, recommended);
+            if (type == ignoreType) continue;
+
+            if (context.cpAfter != -1) {
+                context.cpAfter2 = cp;
+                context.tAfter2 = type;
+                break;
+            }
+            context.cpAfter = cp;
+            context.tAfter = type;
+        }
+    }
+
+
    // ========================================

    static class GenerateLineBreakTest extends GenerateBreakTest {
@ -1050,7 +1104,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
            if (cp == 0xA) return LF;
            if (cp == 0xD) return CR;
            if (recommended) {
-                if (cp == 0x034F) return CGJ;
+                if (cp == 0x034F) return Extend;
            }
            if (cp == 0x2028 || cp == 0x2029) return Control;

@ -1178,7 +1232,6 @@ abstract public class GenerateBreakTest implements UCD_Types {

        static UnicodeSet extraKatakana = new UnicodeSet("[" + LENGTH + HALFWIDTH_KATAKANA + KATAKANA_ITERATION + "]");

-        //static UnicodeProperty LineBreakIdeographic = UnifiedBinaryProperty.make(LINE_BREAK | LB_ID);
        static UnicodeProperty baseProp = UnifiedBinaryProperty.make(DERIVED | GraphemeBase);
        static UnicodeProperty linkProp = UnifiedBinaryProperty.make(BINARY_PROPERTIES | GraphemeLink);

@ -1325,52 +1378,6 @@ U+02E5..U+02ED  # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
            return 3;
        }

-        static public class Context {
-            public int cpBefore2, cpBefore, cpAfter, cpAfter2;
-            public byte tBefore2, tBefore, tAfter, tAfter2;
-        }
-
-        public void getGraphemeBases(String source, int offset, boolean recommended, Context context) {
-            context.cpBefore2 = context.cpBefore = context.cpAfter = context.cpAfter2 = -1;
-            context.tBefore2 = context.tBefore = context.tAfter = context.tAfter2 = -1;
-            
-            MyBreakIterator graphemeIterator = new MyBreakIterator();
-
-            graphemeIterator.set(source, offset);
-            while (true) {
-                int cp = graphemeIterator.previousBase();
-                if (cp == -1) break;
-                byte t = getResolvedType(cp, recommended);
-                if (t == Format) continue;
-                
-                if (context.cpBefore == -1) {
-                    context.cpBefore = cp;
-                    context.tBefore = t;
-                } else {
-                    context.cpBefore2 = cp;
-                    context.tBefore2 = t;
-                    break;
-                }
-            }
-            graphemeIterator.set(source, offset);
-            while (true) {
-                int cp = graphemeIterator.nextBase();
-                if (cp == -1) break;
-                byte t = getResolvedType(cp, recommended);
-                if (t == Format) continue;
-                
-                if (context.cpAfter == -1) {
-                    context.cpAfter = cp;
-                    context.tAfter = t;
-                } else {
-                    context.cpAfter2 = cp;
-                    context.tAfter2 = t;
-                    break;
-                }
-            }
-        }
-
-
        public boolean isBreak(String source, int offset, boolean recommended) {
            recommended = true; // don't care about old stuff

@ -1391,7 +1398,7 @@ U+02E5..U+02ED  # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT

            // now get the base character before and after, and their types

-            getGraphemeBases(source, offset, recommended, context);
+            getGraphemeBases(source, offset, recommended, Format, context);

            byte before = context.tBefore;
            byte after = context.tAfter;
@ -1457,42 +1464,55 @@ U+02E5..U+02ED  # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
    static class GenerateSentenceBreakTest extends GenerateBreakTest {
        
        static final byte Format = 0, Sep = 1, Sp = 2, OLetter = 3, Lower = 4, Upper = 5,
-            Close = 6, ATerm = 7, Term = 8, Other = 9,
+            Numeric = 6, Close = 7, ATerm = 8, Term = 9, Other = 10,
            LIMIT = Other + 1;

-        static final String[] Names = {"Format", "Sep", "Sp", "OLetter", "Lower", "Upper",
+        static final String[] Names = {"Format", "Sep", "Sp", "OLetter", "Lower", "Upper", "Numeric",
            "Close", "ATerm", "Term", "Other" };

        static GenerateGraphemeBreakTest grapheme = new GenerateGraphemeBreakTest();

        static UnicodeSet sepSet = new UnicodeSet("[\\u000a\\u000d\\u0085\\u2029\\u2028]");
        static UnicodeSet atermSet = new UnicodeSet("[\\u002E]");
-        static UnicodeSet termSet = new UnicodeSet("[\\u0021\\u003F\\u0589\\u061f\\u06d4\\u0700-\\u0702\\u0934"
-            + "\\u1362\\u1367\\u1368\\u1803\\u1809\\u203c\\u203d\\u2048\\u2049\\u3002\\ufe52\\ufe57\\uff01\\uff0e\\uff1f\\uff61]");
+        static UnicodeSet termSet = new UnicodeSet(
+            "[\\u0021\\u003F\\u0589\\u061f\\u06d4\\u0700-\\u0702\\u0934"
+            + "\\u1362\\u1367\\u1368\\u104A\\u104B\\u166E"
+            + "\\u1803\\u1809\\u203c\\u203d"
+            + "\\u2048\\u2049\\u3002\\ufe52\\ufe57\\uff01\\uff0e\\uff1f\\uff61]");
        
        static UnicodeProperty lowercaseProp = UnifiedBinaryProperty.make(DERIVED | PropLowercase);
        static UnicodeProperty uppercaseProp = UnifiedBinaryProperty.make(DERIVED | PropUppercase);
        
+        UnicodeSet linebreakNS = UnifiedBinaryProperty.make(LINE_BREAK | LB_NU).getSet();
+        
        {

            fileName = "Sentence";
            extraSamples = new String[] {
-                
            };
-            String[] temp = new String[] {
+            
+            extraSingleSamples = new String[] {
                "(\"Go.\") (He did.)", 
-                "(\"Go?\") (He did.)", 
+                "(\u201CGo?\u201D) (He did.)", 
                "U.S.A\u0300. is", 
                "U.S.A\u0300? He", 
                "U.S.A\u0300.", 
-                "\u4e00.\u4300",
-                "\u4e00?\u4300",
+                "3.4", 
+                "c.d",
+                "etc.)\u2019 \u2018(the",
+                "etc.)\u2019 \u2018(The",
+                "the resp. leaders are",
+                "\u5B57.\u5B57",
+                "etc.\u5B83",
+                "etc.\u3002",
+                "\u5B57\u3002\u5B83",
            };
-            extraSingleSamples = new String [temp.length * 2];
-            System.arraycopy(temp, 0, extraSingleSamples, 0, temp.length);
-            for (int i = 0; i < temp.length; ++i) {
-                extraSingleSamples[i+temp.length] = insertEverywhere(temp[i], "\u2060", grapheme);
+            String[] temp = new String [extraSingleSamples.length * 2];
+            System.arraycopy(extraSingleSamples, 0, temp, 0, extraSingleSamples.length);
+            for (int i = 0; i < extraSingleSamples.length; ++i) {
+                temp[i+extraSingleSamples.length] = insertEverywhere(extraSingleSamples[i], "\u2060", grapheme);
            }
+            extraSingleSamples = temp;

        }
        
@ -1509,9 +1529,10 @@ U+02E5..U+02ED  # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
            if (cat == Cf) return Format;
            if (sepSet.contains(cp)) return Sep;
            if (Default.ucd.getBinaryProperty(cp, White_space)) return Sp;
-            if (alphabeticSet.contains(cp)) return OLetter;
+            if (linebreakNS.contains(cp)) return Numeric;
            if (lowercaseProp.hasValue(cp)) return Lower;
            if (uppercaseProp.hasValue(cp) || cat == Lt) return Upper;
+            if (alphabeticSet.contains(cp)) return OLetter;
            if (atermSet.contains(cp)) return ATerm;
            if (termSet.contains(cp)) return Term;
            if (cat == Po || cat == Pe
@ -1529,6 +1550,8 @@ U+02E5..U+02ED  # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
            return 1;
        }

+        static Context context = new Context();
+        
        public boolean isBreak(String source, int offset, boolean recommended) {

            rule = "1";
@ -1541,8 +1564,8 @@ U+02E5..U+02ED  # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT

            // Sep ÷  (3) 
            rule = "3";
-            byte before = getResolvedType(source.charAt(offset-1), recommended);
-            if (before == Sep) return true;
+            byte beforeChar = getResolvedType(source.charAt(offset-1), recommended);
+            if (beforeChar == Sep) return true;
            
            // Treat a grapheme cluster as if it were a single character:
            // the first base character, if there is one; otherwise the first character.
@ -1556,17 +1579,29 @@ U+02E5..U+02ED  # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
            rule="3";
            if (!grapheme.isBreak( source,  offset,  recommended)) return false;
            
-            // Do not break after ambiguous terminators like period, if the first following letter is lowercase. For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.
-            // ATerm Close* Sp*×(¬( OLetter | Upper ))* Lower(6)
-            // ATerm ×Upper (7)
-            
-            // Break after sentence terminators, but include closing punctuation, trailing spaces, and (optionally) a paragraph separator.
-            // ( Term | ATerm ) Close*×( Close | Sp | Sep )(8)
-            // ( Term | ATerm ) Close* Sp×( Sp | Sep )(9)
-            // ( Term | ATerm ) Close* Sp*÷(10)
+            getGraphemeBases(source, offset, recommended, Format, context);

+            byte before = context.tBefore;
+            byte after = context.tAfter;
+            byte before2 = context.tBefore2;
+            byte after2 = context.tAfter2;
+            
+            
+            // Do not break after an ambiguous terminator such as a period if it is immediately followed by a number or lowercase letter, if it is between uppercase letters, or if the first following letter (optionally after certain punctuation) is lowercase. For example, a period may mark an abbreviation or be a numeric period, and so not mark the end of a sentence.
+
+            // ATerm × (Lower | Numeric) (6)
+            // Upper ATerm × Upper (7)
+
+            if (before == ATerm) {
+                rule = "6";
+                if (after == Lower || after == Numeric) return false;
+                rule = "7";
+                if (DEBUG_GRAPHEMES) System.out.println(context + ", " + Upper);
+                if (before2 == Upper && after == Upper) return false;
+            }
+            
+            // The following cases are all handled together.
            
-            // These cases are all handled together.
            // First we loop backwards, checking for the different types.
            
            MyBreakIterator graphemeIterator = new MyBreakIterator();
@ -1620,19 +1655,18 @@ U+02E5..U+02ED  # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
            if (lookAfter == -1) {
                // Otherwise, do not break
                // Any × Any (11)
-                rule = "11";
+                rule = "12";
                return false;
            }
                
-            // Do not break after ambiguous terminators like period, if the first following letter is lowercase. For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.
-            // ATerm Close* Sp*×(¬( OLetter | Upper ))* Lower(6)
-            // ATerm ×Upper (7)
+            // ATerm Close* Sp*×(¬( OLetter))* Lower(8)
            
            // Break after sentence terminators, but include closing punctuation, trailing spaces, and (optionally) a paragraph separator.
-            // ( Term | ATerm ) Close*×( Close | Sp | Sep )(8)
-            // ( Term | ATerm ) Close* Sp×( Sp | Sep )(9)
-            // ( Term | ATerm ) Close* Sp*÷(10)
-            
+            // ( Term | ATerm ) Close*×( Close | Sp | Sep )(9)
+            // ( Term | ATerm ) Close* Sp×( Sp | Sep )(10)
+            // ( Term | ATerm ) Close* Sp*÷(11)
+
+                        
            // We DID find one. Loop to see if the right side is ok.

            graphemeIterator.set(source, offset);
@ -1648,16 +1682,16 @@ U+02E5..U+02ED  # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
                if (isFirst) {
                    isFirst = false;
                    if (lookAfter == ATerm && t == Upper) {
-                        rule = "7";
+                        rule = "8";
                        return false;
                    }
                    if (gotSpace) {
                        if (t == Sp || t == Sep) {
-                            rule = "9";
+                            rule = "10";
                            return false;
                        }
                    } else if (t == Close || t == Sp || t == Sep) {
-                        rule = "8";
+                        rule = "9";
                        return false;
                    }
                    if (lookAfter == Term) break;
@ -1666,16 +1700,18 @@ U+02E5..U+02ED  # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
                // at this point, we have an ATerm. All other conditions are ok, but we need to verify 6
                if (t != OLetter && t != Upper && t != Lower) continue;
                if (t == Lower) {
-                    rule = "6";
+                    rule = "8";
                    return false;
                }
                break;
            }
-            rule = "10";
+            rule = "11";
            return true;
        }
    }
    
+    static final boolean DEBUG_GRAPHEMES = false;
+    
    static class MyBreakIterator {
        int offset = 0;
        String string = "";
@ -1683,6 +1719,7 @@ U+02E5..U+02ED  # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
        boolean recommended = true;
        
        public MyBreakIterator set(String source, int offset) {
+            //if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(string) + "; " + offset);
            string = source;
            this.offset = offset;
            return this;
@ -1694,6 +1731,7 @@ U+02E5..U+02ED  # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
            for (++offset; offset < string.length(); ++offset) {
                if (breaker.isBreak(string, offset, recommended)) break;
            }
+            //if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(result));
            return result;
        }
        
@ -1702,7 +1740,9 @@ U+02E5..U+02ED  # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
            for (--offset; offset >= 0; --offset) {
                if (breaker.isBreak(string, offset, recommended)) break;
            }
-            return UTF16.charAt(string, offset);
+            int result = UTF16.charAt(string, offset);
+            //if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(result));
+            return result;
        }
    }
 }
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
-* $Date: 2002/10/05 01:28:58 $
-* $Revision: 1.12 $
+* $Date: 2003/02/25 23:38:23 $
+* $Revision: 1.13 $
 *
 *******************************************************************************
 */
@ -45,10 +45,19 @@ public class GenerateCaseFolding implements UCD_Types {
        System.out.println("Writing Log: " + "CaseFoldingLog" + GenerateData.getFileSuffix(true));
        
        System.out.println("Making Full Data");
-        Map fullData = getCaseFolding(true, NF_CLOSURE);
+        Map fullData = getCaseFolding(true, NF_CLOSURE, "");
        Utility.fixDot();
+
        System.out.println("Making Simple Data");
-        Map simpleData = getCaseFolding(false, NF_CLOSURE);
+        Map simpleData = getCaseFolding(false, NF_CLOSURE, "");
+        // next, build the same two tables again under the Turkish ("tr") condition
+
+        System.out.println("Making Turkish Full Data");
+        Map fullDataTurkish = getCaseFolding(true, NF_CLOSURE, "tr");
+        Utility.fixDot();
+
+        System.out.println("Making Turkish Simple Data"); // was a copy of the non-Turkish label, making the two simple passes indistinguishable in the log
+        Map simpleDataTurkish = getCaseFolding(false, NF_CLOSURE, "tr"); // full=false, condition "tr"
        // write the data

        Utility.fixDot();
@ -58,7 +67,8 @@ public class GenerateCaseFolding implements UCD_Types {
        String directory = "DerivedData/";
        String newFile = directory + filename + GenerateData.getFileSuffix(true);
        PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
-        String mostRecent = GenerateData.generateBat(directory, filename, GenerateData.getFileSuffix(true));
+        String[] batName = {""};
+        String mostRecent = GenerateData.generateBat(directory, filename, GenerateData.getFileSuffix(true), batName);
        
        out.println("# CaseFolding" + GenerateData.getFileSuffix(false));
        out.println(GenerateData.generateDateLine());
@ -81,7 +91,10 @@ public class GenerateCaseFolding implements UCD_Types {
            
            String rFull = (String)fullData.get(UTF32.valueOf32(ch));
            String rSimple = (String)simpleData.get(UTF32.valueOf32(ch));
-            if (rFull == null && rSimple == null) continue;
+            String rFullTurkish = (String)fullDataTurkish.get(UTF32.valueOf32(ch));
+            String rSimpleTurkish = (String)simpleDataTurkish.get(UTF32.valueOf32(ch));
+            if (rFull == null && rSimple == null && rFullTurkish == null && rSimpleTurkish == null) continue;
+            
            if (rFull != null && rFull.equals(rSimple) 
              || (PICK_SHORT && UTF16.countCodePoint(rFull) == 1)) {
                String type = "C";
@ -105,10 +118,16 @@ public class GenerateCaseFolding implements UCD_Types {
                    drawLine(out, ch, "S", rSimple);
                }
            }
+            if (rFullTurkish != null && !rFullTurkish.equals(rFull)) {
+                drawLine(out, ch, "T", rFullTurkish);
+            }
+            if (rSimpleTurkish != null && !rSimpleTurkish.equals(rSimple)) {
+                drawLine(out, ch, "t", rSimpleTurkish);
+            }
        }
        out.close();
        log.close();
-        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
+        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
    }
    
 /* Goal is following (with no entries for 0131 or 0069)
@ -146,7 +165,7 @@ public class GenerateCaseFolding implements UCD_Types {
    static int probeCh = 0x01f0;
    static String shower = UTF16.valueOf(probeCh);

-    static Map getCaseFolding(boolean full, boolean nfClose) throws java.io.IOException {
+    static Map getCaseFolding(boolean full, boolean nfClose, String condition) throws java.io.IOException {
        Map data = new TreeMap();
        Map repChar = new TreeMap();
        //String option = "";
@ -157,7 +176,7 @@ public class GenerateCaseFolding implements UCD_Types {
            Utility.dot(ch);
            //if ((ch & 0x3FF) == 0) System.out.println(Utility.hex(ch));
            if (!Default.ucd.isRepresented(ch)) continue;
-            getClosure(ch, data, full, nfClose);
+            getClosure(ch, data, full, nfClose, condition);
        }

        // get the representative characters
@ -180,7 +199,7 @@ public class GenerateCaseFolding implements UCD_Types {
            Iterator it2 = set.iterator();
            while (it2.hasNext()) {
                String s2 = (String)it2.next();
-                int s2Good = goodness(s2, full);
+                int s2Good = goodness(s2, full, condition);
                if (s2Good > repGood) {
                    rep = s2;
                    repGood = s2Good;
@ -206,12 +225,20 @@ public class GenerateCaseFolding implements UCD_Types {
                log.println(" Set:\t" + toString(set,true, true));
            }
            
+            log.println();
+            log.println();
+            log.println(rep + "\t#" + Default.ucd.getName(rep));
+ 
        // Add it for all the elements of the set
        
            it2 = set.iterator();
            while (it2.hasNext()) {
                String s2 = (String)it2.next();
-                if (UTF16.countCodePoint(s2) == 1 && !s2.equals(rep)) {
+                if (s2.equals(rep)) continue;
+                
+                log.println(s2 + "\t#" + Default.ucd.getName(s2));
+                
+                if (UTF16.countCodePoint(s2) == 1) {
                    repChar.put(UTF32.getCodePointSubstring(s2,0), rep);
                    charsUsed.set(UTF16.charAt(s2, 0));
                }
@ -225,14 +252,14 @@ public class GenerateCaseFolding implements UCD_Types {
    static final int NFC_FORMAT = 64;
    static final int ISLOWER = 128;

-    static int goodness(String s, boolean full) {
+    static int goodness(String s, boolean full, String condition) {
        if (s == null) return 0;
        int result = 32-s.length();
        if (!PICK_SHORT) {
            result = s.length();
        }
        if (!full) result <<= 8;
-        String low = lower(upper(s, full), full);
+        String low = lower(upper(s, full, condition), full, condition);
        if (s.equals(low)) result |= ISLOWER;
        else if (PICK_SHORT && Default.nfd.normalize(s).equals(Default.nfd.normalize(low))) result |= ISLOWER;
        
@ -295,11 +322,11 @@ public class GenerateCaseFolding implements UCD_Types {
            }
            */

-    static void getClosure(int ch, Map data, boolean full, boolean nfClose) {
+    static void getClosure(int ch, Map data, boolean full, boolean nfClose, String condition) {
        String charStr = UTF32.valueOf32(ch);
-        String lowerStr = lower(charStr, full);
-        String titleStr = title(charStr, full);
-        String upperStr = upper(charStr, full);
+        String lowerStr = lower(charStr, full, condition);
+        String titleStr = title(charStr, full, condition);
+        String upperStr = upper(charStr, full, condition);
        if (charStr.equals(lowerStr) && charStr.equals(upperStr) && charStr.equals(titleStr)) return;
        if (DEBUG) System.err.println("Closure for " + Utility.hex(ch));

@ -327,47 +354,47 @@ public class GenerateCaseFolding implements UCD_Types {
                    if (add(set, Default.nfkd.normalize(s), data)) continue main;
                    if (add(set, Default.nfkc.normalize(s), data)) continue main;
                }
-                if (add(set, lower(s, full), data)) continue main;
-                if (add(set, title(s, full), data)) continue main;
-                if (add(set, upper(s, full), data)) continue main;
+                if (add(set, lower(s, full, condition), data)) continue main;
+                if (add(set, title(s, full, condition), data)) continue main;
+                if (add(set, upper(s, full, condition), data)) continue main;
            }
            break;
        }
    }

-    static String lower(String s, boolean full) {
-        String result = lower2(s,full);
+    static String lower(String s, boolean full, String condition) {
+        String result = lower2(s,full, condition);
        return result.replace('\u03C2', '\u03C3'); // HACK for lower
    }

    // These functions are no longer necessary, since Default.ucd is parameterized,
    // but it's not worth changing

-    static String lower2(String s, boolean full) {
+    static String lower2(String s, boolean full, String condition) {
        /*if (!full) {
            if (s.length() != 1) return s;
            return Default.ucd.getCase(UTF32.char32At(s,0), SIMPLE, LOWER);
        }
        */
-        return Default.ucd.getCase(s, full ? FULL : SIMPLE, LOWER);
+        return Default.ucd.getCase(s, full ? FULL : SIMPLE, LOWER, condition);
    }

-    static String upper(String s, boolean full) {
+    static String upper(String s, boolean full, String condition) {
        /* if (!full) {
            if (s.length() != 1) return s;
            return Default.ucd.getCase(UTF32.char32At(s,0), FULL, UPPER);
        }
        */
-        return Default.ucd.getCase(s, full ? FULL : SIMPLE, UPPER);
+        return Default.ucd.getCase(s, full ? FULL : SIMPLE, UPPER, condition);
    }

-    static String title(String s, boolean full) {
+    static String title(String s, boolean full, String condition) {
        /*if (!full) {
            if (s.length() != 1) return s;
            return Default.ucd.getCase(UTF32.char32At(s,0), FULL, TITLE);
        }
        */
-        return Default.ucd.getCase(s, full ? FULL : SIMPLE, TITLE);
+        return Default.ucd.getCase(s, full ? FULL : SIMPLE, TITLE, condition);
    }

    static boolean add(Set set, String s, Map data) {
@ -557,7 +584,8 @@ public class GenerateCaseFolding implements UCD_Types {
        System.out.println("Writing");
        String newFile = "DerivedData/SpecialCasing" + suffix2 + GenerateData.getFileSuffix(true);
        PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
-        String mostRecent = GenerateData.generateBat("DerivedData/", "SpecialCasing", suffix2 + GenerateData.getFileSuffix(true));
+        String[] batName = {""};
+        String mostRecent = GenerateData.generateBat("DerivedData/", "SpecialCasing", suffix2 + GenerateData.getFileSuffix(true), batName);
        out.println("# SpecialCasing" + GenerateData.getFileSuffix(false));
        out.println(GenerateData.generateDateLine());
        out.println("#");
@ -594,6 +622,6 @@ public class GenerateCaseFolding implements UCD_Types {
        }
        Utility.appendFile("SpecialCasingFooter.txt", Utility.UTF8, out);
        out.close();
-        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
+        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
    }
 }
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
-* $Date: 2002/10/05 01:28:58 $
-* $Revision: 1.23 $
+* $Date: 2003/02/25 23:38:22 $
+* $Revision: 1.24 $
 *
 *******************************************************************************
 */
@ -116,16 +116,13 @@ public class GenerateData implements UCD_Types {
        output.println(generateDateLine());
        output.println("#");
        if (headerChoice == HEADER_SCRIPTS) {
-            output.println("# For documentation, see UTR #24: Script Names");
-            output.println("#   http://www.unicode.org/unicode/reports/tr24/");
        } else if (headerChoice == HEADER_EXTEND) {
            output.println("# Unicode Character Database: Extended Properties");
-            output.println("# For documentation, see PropList.html");
        } else {
            output.println("# Unicode Character Database: Derived Property Data");
            output.println("# Generated algorithmically from the Unicode Character Database");
-            output.println("# For documentation, see DerivedProperties.html");
        }
+        output.println("# For documentation, see UCD.html");
        output.println("# Note: Unassigned and Noncharacter codepoints are omitted,");
        output.println("#       except when listing Noncharacter or Cn.");
        output.println(HORIZONTAL_LINE);
@ -144,12 +141,14 @@ public class GenerateData implements UCD_Types {
        String newFile = directory + fileName + getFileSuffix(true);
        System.out.println("New File: " + newFile);
        PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
-        String mostRecent = generateBat(directory, fileName, getFileSuffix(true));
+        String[] batName = {""};
+        String mostRecent = generateBat(directory, fileName, getFileSuffix(true), batName);
        System.out.println("Most recent: " + mostRecent);
        
        doHeader(fileName + getFileSuffix(false), output, headerChoice);
        for (int i = 0; i < DERIVED_PROPERTY_LIMIT; ++i) {
            UnicodeProperty up = DerivedProperty.make(i, Default.ucd);
+            if (up == null) continue;
            boolean keepGoing = true;
            if (!up.isStandard()) keepGoing = false;
            if ((up.getType() & type) == 0) keepGoing = false;
@ -164,7 +163,7 @@ public class GenerateData implements UCD_Types {
            output.flush();
        }
        output.close();
-        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
+        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
    }

    /*
@ -192,7 +191,8 @@ public class GenerateData implements UCD_Types {
        Default.setUCD();
        String newFile = "DerivedData/CompositionExclusions" + getFileSuffix(true);
        PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
-        String mostRecent = generateBat("DerivedData/", "CompositionExclusions", getFileSuffix(true));
+        String[] batName = {""};
+        String mostRecent = generateBat("DerivedData/", "CompositionExclusions", getFileSuffix(true), batName);
        
        output.println("# CompositionExclusions" + getFileSuffix(false));
        output.println(generateDateLine());
@ -248,7 +248,7 @@ public class GenerateData implements UCD_Types {
        new CompLister(output, 4).print();
        
        output.close();
-        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
+        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
    }
    
    static String generateDateLine() {
@ -538,12 +538,14 @@ public class GenerateData implements UCD_Types {
        addLine(sorted, "qc", "M", "Maybe");
        checkDuplicate(duplicates, accumulation, "M", "qc=Maybe");
        
+        addLine(sorted, "blk", "n/a", Utility.getUnskeleton("no block", true));
        
        for (int i = 0; i < LIMIT_ENUM; ++i) {
            int type = i & 0xFF00;
            if (type == AGE) continue;
            if (i == (BINARY_PROPERTIES | CaseFoldTurkishI)) continue;
            if (i == (BINARY_PROPERTIES | Non_break)) continue;
+            if (i == (BINARY_PROPERTIES | Case_Sensitive)) continue;
            
            if (type == NUMERIC_TYPE) {
                //System.out.println("debug");
@ -658,7 +660,8 @@ public class GenerateData implements UCD_Types {
        String filename = "PropertyAliases";
        String newFile = "DerivedData/" + filename + getFileSuffix(true);
        PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
-        String mostRecent = generateBat("DerivedData/", filename, getFileSuffix(true));
+        String[] batName = {""};
+        String mostRecent = generateBat("DerivedData/", filename, getFileSuffix(true), batName);
        
        log.println("# " + filename + getFileSuffix(false));
        log.println(generateDateLine());
@ -669,12 +672,12 @@ public class GenerateData implements UCD_Types {
        Utility.print(log, sorted, "\r\n", new MyBreaker(true));
        log.println();
        log.close();
-        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
+        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
        
        filename = "PropertyValueAliases";
        newFile = "DerivedData/" + filename + getFileSuffix(true);
        log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
-        mostRecent = generateBat("DerivedData/", filename, getFileSuffix(true));
+        mostRecent = generateBat("DerivedData/", filename, getFileSuffix(true), batName);
        
        log.println("# " + filename + getFileSuffix(false));
        log.println(generateDateLine());
@ -685,12 +688,13 @@ public class GenerateData implements UCD_Types {
        Utility.print(log, sorted, "\r\n", new MyBreaker(false));
        log.println();
        log.close();
-        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
+        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
        
        filename = "PropertyAliasSummary";
        newFile = "OtherData/" + filename + getFileSuffix(true);
        log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
-        mostRecent = generateBat("OtherData/", filename, getFileSuffix(true));
+        mostRecent = generateBat("OtherData/", filename, getFileSuffix(true), batName);
+        
        log.println();
        log.println(HORIZONTAL_LINE);
        log.println();
@ -702,7 +706,7 @@ public class GenerateData implements UCD_Types {
        Utility.print(log, accumulation, "\r\n", new MyBreaker(false));
        log.println();
        log.close();
-        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
+        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
    }
    
    static void addLine(Set sorted, String f1, String f2, String f3) {
@ -821,10 +825,10 @@ public class GenerateData implements UCD_Types {
    */
    // static final byte KEEP_SPECIAL = 0, SKIP_SPECIAL = 1;
    
-    public static String generateBat(String directory, String fileRoot, String suffix) throws IOException {
+    public static String generateBat(String directory, String fileRoot, String suffix, String[] batName) throws IOException {
        String mostRecent = Utility.getMostRecentUnicodeDataFile(fixFile(fileRoot), Default.ucd.getVersion(), true, true);
        if (mostRecent != null) {
-            generateBatAux(directory + "DIFF/Diff_" + fileRoot + suffix,
+            batName[0] = generateBatAux(directory + "DIFF/Diff_" + fileRoot + suffix,
                mostRecent, directory + fileRoot + suffix);
        } else {
            System.out.println("No previous version of: " + fileRoot + ".txt");
@ -839,8 +843,10 @@ public class GenerateData implements UCD_Types {
        return mostRecent;
    }
    
-    public static void generateBatAux(String batName, String oldName, String newName) throws IOException {
+    public static String generateBatAux(String batName, String oldName, String newName) throws IOException {
+        String fullBatName = batName + ".bat";
        PrintWriter output = Utility.openPrintWriter(batName + ".bat", Utility.LATIN1_UNIX);
+        
        newName = Utility.getOutputName(newName);
        System.out.println("Writing BAT to compare " + oldName + " and " + newName);
        
@ -851,6 +857,7 @@ public class GenerateData implements UCD_Types {
            + " "
            + newFile.getCanonicalFile());
        output.close();
+        return new File(Utility.getOutputName(fullBatName)).getCanonicalFile().toString();
    }
        

@ -860,20 +867,25 @@ public class GenerateData implements UCD_Types {
        Default.setUCD();
        String newFile = directory + file + getFileSuffix(true);
        PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
-        String mostRecent = generateBat(directory, file, getFileSuffix(true));
+        String[] batName = {""};
+        String mostRecent = generateBat(directory, file, getFileSuffix(true), batName);
        
        doHeader(file + getFileSuffix(false), output, headerChoice);
        int last = -1;
        for (int i = startEnum; i < endEnum; ++i) {
            UnicodeProperty up = UnifiedBinaryProperty.make(i, Default.ucd);
            if (up == null) continue;
+            if (up.isDefaultValue()) continue;
            
+            /*
            if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE
                || i == (BINARY_PROPERTIES | Non_break)
                || i == (BINARY_PROPERTIES | CaseFoldTurkishI)
+                || i == (HANGUL_SYLLABLE_TYPE | NA)
                || i == (JOINING_TYPE | JT_U)
                || i == (JOINING_GROUP | NO_SHAPING)
                ) continue; // skip zero case
+            */
            /*if (skipSpecial == SKIP_SPECIAL
                    && i >= (BINARY_PROPERTIES | CompositionExclusion)
                    && i < (AGE + NEXT_ENUM)) continue;
@ -920,8 +932,8 @@ public class GenerateData implements UCD_Types {
            output.flush();
        }
        output.close();
-        System.out.println("HERE");
-        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
+        //System.out.println("HERE");
+        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
        System.out.println();
    }
    
@ -929,7 +941,8 @@ public class GenerateData implements UCD_Types {
        Default.setUCD();
        String newFile = directory + fileName + getFileSuffix(true);
        PrintWriter log = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX);
-        String mostRecent = generateBat(directory, fileName, getFileSuffix(true));
+        String[] batName = {""};
+        String mostRecent = generateBat(directory, fileName, getFileSuffix(true), batName);

        String[] example = new String[256];

@ -959,7 +972,7 @@ public class GenerateData implements UCD_Types {
        log.println("#    NFKD");
        log.println("#      c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)");
        log.println("#");
-        log.println("# 2. For every assigned Unicode 3.1.0 code point X that is not specifically");
+        log.println("# 2. For every code point X assigned in this version of Unicode that is not specifically");
        log.println("#    listed in Part 1, the following invariants must be true for all conformant");
        log.println("#    implementations:");
        log.println("#");
@ -1038,7 +1051,7 @@ public class GenerateData implements UCD_Types {
        log.println("#");
        log.println("# END OF FILE");
        log.close();
-        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
+        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
    }
    
    static void handleIdentical() throws IOException {
@ -1130,7 +1143,8 @@ public class GenerateData implements UCD_Types {
        Default.setUCD();
        String newFile = directory + filename + getFileSuffix(true);
        PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
-        String mostRecent = generateBat(directory, filename, getFileSuffix(true));
+        String[] batName = {""};
+        String mostRecent = generateBat(directory, filename, getFileSuffix(true), batName);
        DiffPropertyLister dpl;
        UnicodeSet cummulative = new UnicodeSet();
        
@ -1203,7 +1217,7 @@ public class GenerateData implements UCD_Types {
        } finally {
            if (log != null) {
                log.close();
-                Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
+                Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
            }
        }
    }
@ -1212,7 +1226,8 @@ public class GenerateData implements UCD_Types {
        Default.setUCD();
        String newFile = directory + filename + getFileSuffix(true);
        PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
-        String mostRecent = generateBat(directory, filename, getFileSuffix(true));
+        String[] batName = {""};
+        String mostRecent = generateBat(directory, filename, getFileSuffix(true), batName);
        try {
            log.println("# " + filename + getFileSuffix(false));
            log.println(generateDateLine());
@ -1253,6 +1268,9 @@ public class GenerateData implements UCD_Types {
            log.println(HORIZONTAL_LINE);
            log.println();
            new DiffPropertyLister("3.1.0", "3.2.0", log).print();
+            log.println(HORIZONTAL_LINE);
+            log.println();
+            new DiffPropertyLister("3.2.0", "4.0.0", log).print();
            /*
            printDiff("110", "200");
 	        UnicodeSet u11 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-1.1.txt", false);
@ -1298,7 +1316,7 @@ public class GenerateData implements UCD_Types {
        } finally {
            if (log != null) {
                log.close();
-                Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
+                Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
            }
        }

--- a/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
-* $Date: 2002/10/05 01:28:58 $
-* $Revision: 1.10 $
+* $Date: 2003/02/25 23:38:22 $
+* $Revision: 1.11 $
 *
 *******************************************************************************
 */
@ -73,7 +73,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
            String property = line.substring(tabPos+1, tabPos2).trim();
            
            String propertyValue = line.substring(tabPos2+1).trim();
-            if (propertyValue.indexOf("U+") >= 0) propertyValue = fixHex.transliterate(propertyValue);
+            if (propertyValue.indexOf("U+") >= 0) propertyValue = fromHexUnicode.transliterate(propertyValue);
            
            HanInfo values = (HanInfo) properties.get(property);
            if (values == null) {
@ -203,13 +203,15 @@ public final class GenerateHanTransliterator implements UCD_Types {
        return (radical << 8) + strokes;
    }
    
-    static Transliterator fixHex = Transliterator.getInstance("hex-any/unicode");
+    // Applied to Unihan property values that contain "U+" notation when the data file is read.
+    static Transliterator fromHexUnicode = Transliterator.getInstance("hex-any/unicode");
+    
+    // Inverse direction; used when writing characters out in hex form (raw mapping file, override diagnostics).
+    static Transliterator toHexUnicode = Transliterator.getInstance("any-hex/unicode");
    
    /*
    static String convertUPlus(String other) {
        int pos1 = other.indexOf("U+");
        if (pos1 < 0) return other;
-        return fixHex(
+        return fromHexUnicode(
        pos1 += 2;
        
        StringBuffer result = new StringBuffer();
@ -297,6 +299,47 @@ public final class GenerateHanTransliterator implements UCD_Types {
            
            readFrequencyData(type);
            
+            Iterator it = fullPinyin.iterator();
+            while (it.hasNext()) {
+                String s = (String) it.next();
+                if (!isValidPinyin2(s)) {
+                    err.println("?Valid Pinyin: " + s);
+                }
+            }
+            
+            
+            it = unihanMap.keySet().iterator();
+            Map badPinyin = new TreeMap();
+            PrintWriter out2 = Utility.openPrintWriter("Raw_mapping.txt", Utility.UTF8_WINDOWS);
+            try {
+                while (it.hasNext()) {
+                    String keyChar = (String) it.next();
+                    String def = (String) unihanMap.get(keyChar);
+                    if (!isValidPinyin(def)) {
+                        String fixedDef = fixPinyin(def);
+                        err.println(Default.ucd.getCode(keyChar) + "\t" + keyChar + "\t" + fixedDef + "\t#" + def
+                            + (fixedDef.equals(def) ? " FAIL" : ""));
+                        Utility.addToSet(badPinyin, def, keyChar);
+                    }
+                    // check both ways
+                    String digitDef = accentPinyin_digitPinyin.transliterate(def);
+                    String accentDef = digitPinyin_accentPinyin.transliterate(digitDef);
+                    if (!accentDef.equals(def)) {
+                        err.println("Failed Digit Pinyin: " 
+                            + Default.ucd.getCode(keyChar) + "\t" + keyChar + "\t" 
+                            + def + " => " + digitDef + " => " + accentDef);
+                    }
+                    
+                    out2.println(toHexUnicode.transliterate(keyChar) 
+                        + "\tkMandarin\t" + digitDef.toUpperCase() + "\t# " + keyChar + ";\t" + def);
+                }
+                err.println();
+                err.println("Summary of Bad syllables");
+                Utility.printMapOfCollection(err, badPinyin, "\r\n", ":\t", ", ");
+            } finally {
+                out2.close();
+            }
+            
            out = Utility.openPrintWriter(filename, Utility.UTF8_WINDOWS);
            out.println("# Start RAW data for converting CJK characters");
            /*
@ -315,13 +358,12 @@ public final class GenerateHanTransliterator implements UCD_Types {
            */
            
            Set gotAlready = new HashSet();
-            Iterator it = rankList.iterator();
            Set lenSet = new TreeSet();
            Set backSet = new TreeSet();
            int rank = 0;
            Map definitionCount = new HashMap();
            
-            
+            it = rankList.iterator();
            while (it.hasNext()) {
                String keyChar = (String) it.next();
                String def = (String) unihanMap.get(keyChar);
@ -478,6 +520,578 @@ public final class GenerateHanTransliterator implements UCD_Types {
        }
    }
    
+    //http://fog.ccsf.cc.ca.us/~jliou/phonetic.htm
+    // Initials (leading consonants) accepted by isValidPinyin2.
+    // Longer ones must come AFTER the shorter ones they start with ("zh" after "z", etc.):
+    // isValidPinyin2 scans this array from the end and commits to the first prefix match,
+    // so the longer initial has to be tried first. The empty initial sits at index 0 so that
+    // it is tried last, for syllables that consist of a final only.
+    static final String[] initialPinyin = {
+        "",
+        "b", "p", "m", "f", 
+        "d", "t", "n", "l", 
+        "z", "c", "s", 
+        "zh", "ch", "sh", "r",
+        "j", "q", "x", 
+        "g", "k", "h", 
+        "y", "w"}; // added to make checking simpler
+        
+    // Finals (the part after the initial) accepted by isValidPinyin2; compared with equals(),
+    // so the order of entries here does not matter.
+    // NOTE(review): "ue" is not listed (only "üe"), so the table syllables jue/que/xue/yue
+    // do not appear to decompose as initial + final here -- confirm whether that is intended.
+    static final String[] finalPinyin = {
+        "a", "ai", "ao", "an", "ang",
+        "o", "ou", "ong",
+        "e", "ei", "er", "en", "eng",
+        "i", "ia", "iao", "ie", "iu", "ian", "in", "iang", "ing", "iong",
+        "u", "ua", "uo", "uai", "ui", "uan", "un", "uang", "ueng",
+        "ü", "üe", "üan", "ün"
+    };
+    // Don't bother with the following rules; just add w,y to initials
+    // When “i” stands alone, a “y” will be added before it as “yi”. 
+    //      If “i” is the first letter of the syllable it will be changed to “y”. 
+    // When “u” stands alone, a “w” will be added before it as “wu”. 
+    //      If “u” is the first letter of the syllable it will be changed to “w”. e.g. “uang -> wang”. 
+    // When “ü” stands alone, a “y” will be added before it and “ü” will be changed to “u” as “yu”. 
+    //      If “ü” is the first letter of the syllable, then the spelling will be changed to “yu”. e.g. “üan -> yuan”. 
+    // Note: The nasal final “ueng” never occurs after an initial but always forms a syllable by itself.
+    // The “o” in “iou” is hidden, so it is written as “iu”. But don’t forget to pronounce it. 
+    // The “e” in “uei” is hidden, so it is written as “ui”. But don’t forget to pronounce it. 
+    
+    
+    /**
+     * Flat table of (pinyin, bopomofo) pairs: each even index holds a toneless pinyin
+     * syllable and the following odd index holds its spelling in Bopomofo (Zhuyin)
+     * letters, U+3105..U+312B. The even-indexed entries also seed {@link #fullPinyin}.
+     * Syllabic "zhi/chi/shi/ri/zi/ci/si" are written with the bare initial letter;
+     * every other syllable must carry its vowel letter(s).
+     */
+    public static final String[] pinyin_bopomofo = {
+	"a", "\u311a",
+	"ai", "\u311e",
+	"an", "\u3122",
+	"ang", "\u3124",
+	"ao", "\u3120",
+	"ba", "\u3105\u311a",
+	"bai", "\u3105\u311e",
+	"ban", "\u3105\u3122",
+	"bang", "\u3105\u3124",
+	"bao", "\u3105\u3120",
+	"bei", "\u3105\u311f",
+	"ben", "\u3105\u3123",
+	"beng", "\u3105\u3125",
+	"bi", "\u3105\u3127",
+	"bian", "\u3105\u3127\u3122",
+	"biao", "\u3105\u3127\u3120",
+	"bie", "\u3105\u3127\u311d",
+	"bin", "\u3105\u3127\u3123",
+	"bing", "\u3105\u3127\u3125",
+	"bo", "\u3105\u311b",
+	"bu", "\u3105\u3128",
+	"ca", "\u3118\u311a",
+	"cai", "\u3118\u311e",
+	"can", "\u3118\u3122",
+	"cang", "\u3118\u3124",
+	"cao", "\u3118\u3120",
+	"ce", "\u3118\u311c",
+	"cen", "\u3118\u3123",
+	"ceng", "\u3118\u3125",
+	"cha", "\u3114\u311a",
+	"chai", "\u3114\u311e",
+	"chan", "\u3114\u3122",
+	"chang", "\u3114\u3124",
+	"chao", "\u3114\u3120",
+	"che", "\u3114\u311c",
+	"chen", "\u3114\u3123",
+	"cheng", "\u3114\u3125",
+	"chi", "\u3114",
+	"chong", "\u3114\u3128\u3125",
+	"chou", "\u3114\u3121",
+	"chu", "\u3114\u3128",
+	//"chua", "XXX",
+	"chuai", "\u3114\u3128\u311e",
+	"chuan", "\u3114\u3128\u3122",
+	"chuang", "\u3114\u3128\u3124",
+	"chui", "\u3114\u3128\u311f",
+	"chun", "\u3114\u3128\u3123",
+	"chuo", "\u3114\u3128\u311b",
+	"ci", "\u3118",
+	"cong", "\u3118\u3128\u3125",
+	"cou", "\u3118\u3121",
+	"cu", "\u3118\u3128",
+	"cuan", "\u3118\u3128\u3122",
+	"cui", "\u3118\u3128\u311f",
+	"cun", "\u3118\u3128\u3123",
+	"cuo", "\u3118\u3128\u311b",
+	"da", "\u3109\u311a",
+	"dai", "\u3109\u311e",
+	"dan", "\u3109\u3122",
+	"dang", "\u3109\u3124",
+	"dao", "\u3109\u3120",
+	"de", "\u3109\u311c",
+	"dei", "\u3109\u311f",
+        "den", "\u3109\u3123",
+	"deng", "\u3109\u3125",
+	"di", "\u3109\u3127",
+	"dia", "\u3109\u3127\u311a",
+	"dian", "\u3109\u3127\u3122",
+	"diao", "\u3109\u3127\u3120",
+	"die", "\u3109\u3127\u311d",
+	"ding", "\u3109\u3127\u3125",
+	"diu", "\u3109\u3127\u3121",
+	"dong", "\u3109\u3128\u3125",
+	"dou", "\u3109\u3121",
+	"du", "\u3109\u3128",
+	"duan", "\u3109\u3128\u3122",
+	"dui", "\u3109\u3128\u311f",
+	"dun", "\u3109\u3128\u3123",
+	"duo", "\u3109\u3128\u311b",
+	"e", "\u311c",
+	"ei", "\u311f",
+	"en", "\u3123",
+	"eng", "\u3125",
+	"er", "\u3126",
+	"fa", "\u3108\u311a",
+	"fan", "\u3108\u3122",
+	"fang", "\u3108\u3124",
+	"fei", "\u3108\u311f",
+	"fen", "\u3108\u3123",
+	"feng", "\u3108\u3125",
+	"fo", "\u3108\u311b",
+	"fou", "\u3108\u3121",
+	"fu", "\u3108\u3128",
+	"ga", "\u310d\u311a",
+	"gai", "\u310d\u311e",
+	"gan", "\u310d\u3122",
+	"gang", "\u310d\u3124",
+	"gao", "\u310d\u3120",
+	"ge", "\u310d\u311c",
+	"gei", "\u310d\u311f",
+	"gen", "\u310d\u3123",
+	"geng", "\u310d\u3125",
+	"gong", "\u310d\u3128\u3125",
+	"gou", "\u310d\u3121",
+	"gu", "\u310d\u3128",
+	"gua", "\u310d\u3128\u311a",
+	"guai", "\u310d\u3128\u311e",
+	"guan", "\u310d\u3128\u3122",
+	"guang", "\u310d\u3128\u3124",
+	"gui", "\u310d\u3128\u311f",
+	"gun", "\u310d\u3128\u3123",
+	"guo", "\u310d\u3128\u311b",
+	"ha", "\u310f\u311a",
+	"hai", "\u310f\u311e",
+	"han", "\u310f\u3122",
+	"hang", "\u310f\u3124",
+	"hao", "\u310f\u3120",
+	"he", "\u310f\u311c",
+	"hei", "\u310f\u311f",
+	"hen", "\u310f\u3123",
+	"heng", "\u310f\u3125",
+                "hm", "\u310f\u3107",
+	"hng", "\u310f\u312b", // 'dialect of n'
+	"hong", "\u310f\u3128\u3125",
+	"hou", "\u310f\u3121",
+	"hu", "\u310f\u3128",
+	"hua", "\u310f\u3128\u311a",
+	"huai", "\u310f\u3128\u311e",
+	"huan", "\u310f\u3128\u3122",
+	"huang", "\u310f\u3128\u3124",
+	"hui", "\u310f\u3128\u311f",
+	"hun", "\u310f\u3128\u3123",
+	"huo", "\u310f\u3128\u311b",
+	"ji", "\u3110\u3127",
+	"jia", "\u3110\u3127\u311a",
+	"jian", "\u3110\u3127\u3122",
+	"jiang", "\u3110\u3127\u3124",
+	"jiao", "\u3110\u3127\u3120",
+	"jie", "\u3110\u3127\u311d",
+	"jin", "\u3110\u3127\u3123",
+	"jing", "\u3110\u3127\u3125",
+	"jiong", "\u3110\u3129\u3125",
+	"jiu", "\u3110\u3127\u3121",
+	"ju", "\u3110\u3129",
+	"juan", "\u3110\u3129\u3122",
+	"jue", "\u3110\u3129\u311d",
+	"jun", "\u3110\u3129\u3123",
+	"ka", "\u310e\u311a",
+	"kai", "\u310e\u311e",
+	"kan", "\u310e\u3122",
+	"kang", "\u310e\u3124",
+	"kao", "\u310e\u3120",
+	"ke", "\u310e\u311c",
+                "kei", "\u310e\u311f",
+	"ken", "\u310e\u3123",
+	"keng", "\u310e\u3125",
+	"kong", "\u310e\u3128\u3125",
+	"kou", "\u310e\u3121",
+	"ku", "\u310e\u3128",
+	"kua", "\u310e\u3128\u311a",
+	"kuai", "\u310e\u3128\u311e",
+	"kuan", "\u310e\u3128\u3122",
+	"kuang", "\u310e\u3128\u3124",
+	"kui", "\u310e\u3128\u311f",
+	"kun", "\u310e\u3128\u3123",
+	"kuo", "\u310e\u3128\u311b",
+	"la", "\u310c\u311a",
+	"lai", "\u310c\u311e",
+	"lan", "\u310c\u3122",
+	"lang", "\u310c\u3124",
+	"lao", "\u310c\u3120",
+	"le", "\u310c\u311c",
+	"lei", "\u310c\u311f",
+	"leng", "\u310c\u3125",
+	"li", "\u310c\u3127",
+	"lia", "\u310c\u3127\u311a",
+	"lian", "\u310c\u3127\u3122",
+	"liang", "\u310c\u3127\u3124",
+	"liao", "\u310c\u3127\u3120",
+	"lie", "\u310c\u3127\u311d",
+	"lin", "\u310c\u3127\u3123",
+	"ling", "\u310c\u3127\u3125",
+	"liu", "\u310c\u3127\u3121",
+	"lo", "\u310c\u311b",
+	"long", "\u310c\u3128\u3125",
+	"lou", "\u310c\u3121",
+	"lu", "\u310c\u3128",
+	"lü", "\u310c\u3129",
+	"luan", "\u310c\u3128\u3122",
+	"lüe", "\u310c\u3129\u311d",
+	"lun", "\u310c\u3128\u3123",
+	"luo", "\u310c\u3128\u311b",
+	"m", "\u3107",
+	"ma", "\u3107\u311a",
+	"mai", "\u3107\u311e",
+	"man", "\u3107\u3122",
+	"mang", "\u3107\u3124",
+	"mao", "\u3107\u3120",
+	"me", "\u3107\u311c",
+	"mei", "\u3107\u311f",
+	"men", "\u3107\u3123",
+	"meng", "\u3107\u3125",
+	"mi", "\u3107\u3127",
+	"mian", "\u3107\u3127\u3122",
+	"miao", "\u3107\u3127\u3120",
+	"mie", "\u3107\u3127\u311d",
+	"min", "\u3107\u3127\u3123",
+	"ming", "\u3107\u3127\u3125",
+	"miu", "\u3107\u3127\u3121",
+	"mo", "\u3107\u311b",
+	"mou", "\u3107\u3121",
+	"mu", "\u3107\u3128",
+	"n", "\u310b",
+	"na", "\u310b\u311a",
+	"nai", "\u310b\u311e",
+	"nan", "\u310b\u3122",
+	"nang", "\u310b\u3124",
+	"nao", "\u310b\u3120",
+	"ne", "\u310b\u311c",
+	"nei", "\u310b\u311f",
+	"nen", "\u310b\u3123",
+	"neng", "\u310b\u3125",
+	"ng", "\u312b",
+	"ni", "\u310b\u3127",
+	"nian", "\u310b\u3127\u3122",
+	"niang", "\u310b\u3127\u3124",
+	"niao", "\u310b\u3127\u3120",
+	"nie", "\u310b\u3127\u311d",
+	"nin", "\u310b\u3127\u3123",
+	"ning", "\u310b\u3127\u3125",
+	"niu", "\u310b\u3127\u3121",
+	"nong", "\u310b\u3128\u3125",
+	"nou", "\u310b\u3121",
+	"nu", "\u310b\u3128",
+	"nü", "\u310b\u3129",
+	"nuan", "\u310b\u3128\u3122",
+	"nüe", "\u310b\u3129\u311d",
+	"nuo", "\u310b\u3128\u311b",
+	"o", "\u311b",
+	"ou", "\u3121",
+	"pa", "\u3106\u311a",
+	"pai", "\u3106\u311e",
+	"pan", "\u3106\u3122",
+	"pang", "\u3106\u3124",
+	"pao", "\u3106\u3120",
+	"pei", "\u3106\u311f",
+	"pen", "\u3106\u3123",
+	"peng", "\u3106\u3125",
+	"pi", "\u3106\u3127",
+	"pian", "\u3106\u3127\u3122",
+	"piao", "\u3106\u3127\u3120",
+	"pie", "\u3106\u3127\u311d",
+	"pin", "\u3106\u3127\u3123",
+	"ping", "\u3106\u3127\u3125",
+	"po", "\u3106\u311b",
+	"pou", "\u3106\u3121",
+	"pu", "\u3106\u3128",
+	"qi", "\u3111\u3127",
+	"qia", "\u3111\u3127\u311a",
+	"qian", "\u3111\u3127\u3122",
+	"qiang", "\u3111\u3127\u3124",
+	"qiao", "\u3111\u3127\u3120",
+	"qie", "\u3111\u3127\u311d",
+	"qin", "\u3111\u3127\u3123",
+	"qing", "\u3111\u3127\u3125",
+	"qiong", "\u3111\u3129\u3125",
+	"qiu", "\u3111\u3127\u3121",
+	"qu", "\u3111\u3129",
+	"quan", "\u3111\u3129\u3122",
+	"que", "\u3111\u3129\u311d",
+	"qun", "\u3111\u3129\u3123",
+	"ran", "\u3116\u3122",
+	"rang", "\u3116\u3124",
+	"rao", "\u3116\u3120",
+	"re", "\u3116\u311c",
+	"ren", "\u3116\u3123",
+	"reng", "\u3116\u3125",
+	"ri", "\u3116",
+	"rong", "\u3116\u3128\u3125",
+	"rou", "\u3116\u3121",
+	"ru", "\u3116\u3128",
+	"ruan", "\u3116\u3128\u3122",
+	"rui", "\u3116\u3128\u311f",
+	"run", "\u3116\u3128\u3123",
+	"ruo", "\u3116\u3128\u311b",
+	"sa", "\u3119\u311a",
+	"sai", "\u3119\u311e",
+	"san", "\u3119\u3122",
+	"sang", "\u3119\u3124",
+	"sao", "\u3119\u3120",
+	"se", "\u3119\u311c",
+	"sen", "\u3119\u3123",
+	"seng", "\u3119\u3125",
+	"sha", "\u3115\u311a",
+	"shai", "\u3115\u311e",
+	"shan", "\u3115\u3122",
+	"shang", "\u3115\u3124",
+	"shao", "\u3115\u3120",
+	"she", "\u3115\u311c",
+	"shei", "\u3115\u311f",
+	"shen", "\u3115\u3123",
+	"sheng", "\u3115\u3125",
+	"shi", "\u3115",
+	"shou", "\u3115\u3121",
+	"shu", "\u3115\u3128",
+	"shua", "\u3115\u3128\u311a",
+	"shuai", "\u3115\u3128\u311e",
+	"shuan", "\u3115\u3128\u3122",
+	"shuang", "\u3115\u3128\u3124",
+	"shui", "\u3115\u3128\u311f",
+	"shun", "\u3115\u3128\u3123",
+	"shuo", "\u3115\u3128\u311b",
+	"si", "\u3119",
+	"song", "\u3119\u3128\u3125",
+	"sou", "\u3119\u3121",
+	"su", "\u3119\u3128",
+	"suan", "\u3119\u3128\u3122",
+	"sui", "\u3119\u3128\u311f",
+	"sun", "\u3119\u3128\u3123",
+	"suo", "\u3119\u3128\u311b",
+	"ta", "\u310a\u311a",
+	"tai", "\u310a\u311e",
+	"tan", "\u310a\u3122",
+	"tang", "\u310a\u3124",
+	"tao", "\u310a\u3120",
+	"te", "\u310a\u311c",
+	"teng", "\u310a\u3125",
+	"ti", "\u310a\u3127",
+	"tian", "\u310a\u3127\u3122",
+	"tiao", "\u310a\u3127\u3120",
+	"tie", "\u310a\u3127\u311d",
+	"ting", "\u310a\u3127\u3125",
+	"tong", "\u310a\u3128\u3125",
+	"tou", "\u310a\u3121",
+	"tu", "\u310a\u3128",
+	"tuan", "\u310a\u3128\u3122",
+	"tui", "\u310a\u3128\u311f",
+	"tun", "\u310a\u3128\u3123",
+	"tuo", "\u310a\u3128\u311b",
+	"wa", "\u3128\u311a",
+	"wai", "\u3128\u311e",
+	"wan", "\u3128\u3122",
+	"wang", "\u3128\u3124",
+	"wei", "\u3128\u311f",
+	"wen", "\u3128\u3123",
+	"weng", "\u3128\u3125",
+	"wo", "\u3128\u311b",
+	"wu", "\u3128",
+	"xi", "\u3112\u3127",
+	"xia", "\u3112\u3127\u311a",
+	"xian", "\u3112\u3127\u3122",
+	"xiang", "\u3112\u3127\u3124",
+	"xiao", "\u3112\u3127\u3120",
+	"xie", "\u3112\u3127\u311d",
+	"xin", "\u3112\u3127\u3123",
+	"xing", "\u3112\u3127\u3125",
+	"xiong", "\u3112\u3129\u3125",
+	"xiu", "\u3112\u3127\u3121",
+	"xu", "\u3112\u3129",
+	"xuan", "\u3112\u3129\u3122",
+	"xue", "\u3112\u3129\u311d",
+	"xun", "\u3112\u3129\u3123",
+	"ya", "\u3127\u311a",
+	"yai", "\u3127\u311e", // not in xinhua zidian index, but listed as alternate pronunciation
+	"yan", "\u3127\u3122",
+	"yang", "\u3127\u3124",
+	"yao", "\u3127\u3120",
+	"ye", "\u3127\u311d",
+	"yi", "\u3127",
+	"yin", "\u3127\u3123",
+	"ying", "\u3127\u3125",
+	"yo", "\u3127\u311b",
+	"yong", "\u3129\u3125",
+	"you", "\u3127\u3121",
+	"yu", "\u3129",
+	"yuan", "\u3129\u3122",
+	"yue", "\u3129\u311d",
+	"yun", "\u3129\u3123",
+	"za", "\u3117\u311a",
+	"zai", "\u3117\u311e",
+	"zan", "\u3117\u3122",
+	"zang", "\u3117\u3124",
+	"zao", "\u3117\u3120",
+	"ze", "\u3117\u311c",
+	"zei", "\u3117\u311f",
+	"zen", "\u3117\u3123",
+	"zeng", "\u3117\u3125",
+	"zha", "\u3113\u311a",
+	"zhai", "\u3113\u311e",
+	"zhan", "\u3113\u3122",
+	"zhang", "\u3113\u3124",
+	"zhao", "\u3113\u3120",
+	"zhe", "\u3113\u311c",
+	"zhei", "\u3113\u311f",
+	"zhen", "\u3113\u3123",
+	"zheng", "\u3113\u3125",
+	"zhi", "\u3113",
+	"zhong", "\u3113\u3128\u3125",
+	"zhou", "\u3113\u3121",
+	"zhu", "\u3113\u3128",
+	"zhua", "\u3113\u3128\u311a",
+	"zhuai", "\u3113\u3128\u311e",
+	"zhuan", "\u3113\u3128\u3122",
+	"zhuang", "\u3113\u3128\u3124",
+	"zhui", "\u3113\u3128\u311f",
+	"zhun", "\u3113\u3128\u3123",
+	"zhuo", "\u3113\u3128\u311b",
+	"zi", "\u3117",
+	"zong", "\u3117\u3128\u3125",
+	"zou", "\u3117\u3121",
+	"zu", "\u3117\u3128",
+	"zuan", "\u3117\u3128\u3122",
+	"zui", "\u3117\u3128\u311f",
+	"zun", "\u3117\u3128\u3123",
+	"zuo", "\u3117\u3128\u311b",
+    };
+    
+    // Every toneless pinyin syllable known to the pinyin_bopomofo table, in sorted order.
+    static final Set fullPinyin = new TreeSet();
+    static {
+        // The table alternates pinyin / bopomofo, so only the even slots are collected.
+        int slot = 0;
+        while (slot < pinyin_bopomofo.length) {
+            fullPinyin.add(pinyin_bopomofo[slot]);
+            slot += 2;
+        }
+    }
+    
+    /**
+     * Table-based check: runs s through the dropTones transliterator and reports
+     * whether the result is one of the syllables collected in fullPinyin.
+     */
+    static boolean isValidPinyin(String s) {
+        String toneless = dropTones.transliterate(s);
+        return fullPinyin.contains(toneless);
+    }
+    
+    /**
+     * Rule-based check: runs s through dropTones, then tries to split the result into
+     * an entry of initialPinyin followed by exactly one entry of finalPinyin.
+     * Initials are scanned from the end of the array, and the first one that is a
+     * prefix decides the outcome; no shorter initial is tried afterwards.
+     */
+    static boolean isValidPinyin2(String s) {
+        String toneless = dropTones.transliterate(s);
+        int i = initialPinyin.length;
+        while (--i >= 0) {
+            String initial = initialPinyin[i];
+            if (!toneless.startsWith(initial)) continue;
+            // Committed to this initial: the remainder must be a known final.
+            String remainder = toneless.substring(initial.length());
+            boolean known = false;
+            for (int j = 0; j < finalPinyin.length && !known; ++j) {
+                known = remainder.equals(finalPinyin[j]);
+            }
+            return known;
+        }
+        return false;
+    }
+    
+    /*
+    U+347C	·	liù	#lyuè  
+U+3500	·	lüè	#lvè
+U+3527	·	liù	#lyù
+U+3729	·	ào	#àu
+U+380E	·	jí	#jjí
+U+3825	·	l·	#lv·
+U+3A3C	·	lüè	#luè
+U+3B5A	·	li·	#ly· *** lü?
+U+3CB6	·	l·	#lv·
+U+3D56	·	niù	#nyù *** nü?
+U+3D88	·	li·ng	#li·ng
+U+3EF2	·	li·	#ly·*** lü?
+U+3F94	·	li·	#ly·*** lü?
+U+4071	·	ào	#àu
+U+40AE	·	liù	#lyuè *** lüe?
+U+430E	·	liù	#lyuè *** lüe?
+U+451E	·	liù	#lyù *** lü?
+U+4588	·	nüè	#nuè
+U+458B	·	nüè	#nuè
+U+45A1	·	niù	#nyù *** nü?
+U+4610	·	niù	#nyù *** nü?
+U+46BC	·	niù	#nyù *** nü?
+U+46DA	·	liù	#lyuè *** lüe?
+U+4896	·	liù	#lyù *** lü?
+U+4923	·	liù	#lyuè *** lüe?
+U+4968	·	liù	#lyù *** lü?
+U+4A0B	·	niù	#nyuè *** nüe?
+U+4AC4	·	chuò	#chuà
+U+4D08	·	·o	#·u
+U+4D8A	·	niù	#nyù *** nü?
+U+51CA	·	qíng	#qýng
+U+51D6	·	zhu·n	#zhu·n *** this is probably zh·n 
+U+5481	·	gàn	#gèm
+U+5838	·	féng	#fúng
+U+639F	·	lü·	#lu· *** this pronunciation surprises me, but I don't know...
+U+66D5	·	yàn	#yiàn
+U+6B3B	·	chu·	#chu· *** chua _is_ ok after all, my table missed an entry
+U+6B56	·	chu·	#chu· *** chua 
+U+6C7C	·	ni·	#ni·u
+U+6E6D	·	qiú	#qióu
+U+6F71	·	y·	#yi·
+U+7493	·	xiù	#xiòu
+U+7607	·	zh·ng	#zh·ng *** I suspect zh·ng
+U+7674	·	luán	#lüán
+U+7867	·	y·ng	#i·ng
+U+7878	·	nüè	#nuè
+*/
+    
+    // Rule-based repair of common misspellings in pinyin data. fixPinyin applies it to the
+    // output of accentPinyin_digitPinyin, i.e. before tones are converted back to accents.
+    // $cons is any consonant letter; $nlet is anything that is neither a letter nor a mark.
+    // The rules are order-sensitive, so keep more specific rules ahead of the general ones.
+    static Transliterator fixTypos = Transliterator.createFromRules("fix_typos", 
+        "$cons=[bcdfghjklmnpqrstvwxyz];"
+        +"$nlet=[^[:Letter:][:Mark:]];"
+        +"$cons{iou}$nlet   > iu;"
+        +"$cons{em}$nlet    > an;"
+        +"$cons{uen}$nlet   > ueng;"
+        +"$cons{ve}$nlet    > üe;"
+        +"$cons{v}$nlet     > ü;"
+        +"$cons{yue}$nlet   > iu;"
+        +"$cons{yng}$nlet   > ing;"
+        +"$cons{yu}$nlet    > iu;"
+        //+"$cons{ue}       > üe;"
+        +"jj                > j;"
+        //+"$nlet{ng}$nlet  > eng;"
+        //+"$nlet{n}$nlet   > en;"
+        //+"$nlet{m}$nlet   > en;"
+        +"$nlet{au}$nlet    > ao;"
+        
+        // new fixes        
+        +"zhueng}$nlet       > zhong;"
+        +"zhuen}$nlet       > zhuan;"
+        +"lue > lüe;"
+        +"liong > liang;"
+        +"nue > nüe;"
+        +"chua > chuo;"
+        +"yian > yan;"
+        +"yie > ye;"
+        +"lüan > luan;"
+        +"iong > yong;"
+        , Transliterator.FORWARD);
+    
+    
+    /**
+     * Attempts to repair a misspelled pinyin string: converts it with
+     * accentPinyin_digitPinyin, applies fixTypos, converts back with
+     * digitPinyin_accentPinyin, and keeps the result only if isValidPinyin accepts it.
+     * @return the repaired string, or s unchanged when the repair is still not valid
+     */
+    static String fixPinyin(String s) {
+        String digitForm = accentPinyin_digitPinyin.transliterate(s);
+        String repaired = fixTypos.transliterate(digitForm);
+        String candidate = digitPinyin_accentPinyin.transliterate(repaired);
+        return isValidPinyin(candidate) ? candidate : s;
+    }
+    
    static PrintWriter log;
    static PrintWriter out;
    static PrintWriter err;
@ -734,7 +1348,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
                    if (type == JAPANESE) {
                        processEdict(word, definition, line);
                    } else {
-                        definition = convertPinyin.transliterate(definition);
+                        definition = digitToPinyin(definition, line);
                        //definition = Utility.replace(definition, " ", "\\ ");
                        addCheck(word, definition, line);
                    }
@ -755,20 +1369,37 @@ public final class GenerateHanTransliterator implements UCD_Types {
        int counter = 0;
        String[] pieces = new String[50];
        String line = "";
+        boolean noOverrideFailure = true;
        try {
            while (true) {
                line = Utility.readDataLine(br);
                if (line == null) break;
                if (line.length() == 0) continue;
                Utility.dot(counter++);
+                //System.out.println(line);
                
                // skip code
+                line=line.toLowerCase();
+                
                int wordStart = line.indexOf('\t') + 1;
                int wordEnd = line.indexOf('\t', wordStart);
                String word = line.substring(wordStart, wordEnd);
-                String definition = line.substring(wordEnd+1);
-                addCheck(word, definition, line);
-                overrideSet.add(word);
+                String definition = fixPinyin(line.substring(wordEnd+1));
+                String old = (String) unihanMap.get(word);
+                if (old != null) {
+                    if (!old.equals(definition)) {
+                        if (noOverrideFailure) {
+                            System.out.println("Overriding Failure");
+                            noOverrideFailure = false;
+                        }
+                        err.println("Overriding Failure: " + word 
+                            + "\t" + old + " " + toHexUnicode.transliterate(old)
+                            + "\t" + definition + " " + toHexUnicode.transliterate(definition));
+                    }
+                } else {
+                    addCheck(word, definition, line);
+                    overrideSet.add(word);
+                }
            }
            br.close();
        } catch (Exception e) {
@ -776,6 +1407,81 @@ public final class GenerateHanTransliterator implements UCD_Types {
        }
    }    
    
+    
+/*
+    @Unihan Data
+
+Bad pinyin data: \u4E7F	?	LE
+\u7684	?	de, de, dí, dì
+*/
+
+    static void fixChineseOverrides() throws IOException {
+        // Converts dict/fixed_Chinese_transliterate_log.txt into new_Chinese_override.txt (hex code, word, pinyin per line).
+        log = Utility.openPrintWriter("Transliterate_log.txt", Utility.UTF8_WINDOWS);
+        out = Utility.openPrintWriter("new_Chinese_override.txt", Utility.UTF8_WINDOWS);
+        try {
+            String fname = "fixed_Chinese_transliterate_log.txt";
+
+            int counter = 0;
+            String line = "";
+            String pinyinPrefix = "Bad pinyin data: ";
+
+            System.out.println("Reading " + fname);
+            BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\" + fname, Utility.UTF8);
+            try {
+                while (true) {
+                    line = Utility.readDataLine(br);
+                    if (line == null) break;
+                    if (line.length() == 0) continue;
+                    if (line.charAt(0) == 0xFEFF) {
+                        line = line.substring(1); // remove BOM
+                        if (line.length() == 0) continue;
+                    }
+                    Utility.dot(counter++);
+
+                    // lines starting with '@' (e.g. "@Unihan Data") carry no entry
+                    if (line.charAt(0) == '@') continue;
+                    if (line.startsWith(pinyinPrefix)) {
+                        line = line.substring(pinyinPrefix.length());
+                    }
+                    line = line.toLowerCase();
+
+                    //System.out.println(Default.ucd.getCode(line));
+                    // skip code
+                    int wordStart = line.indexOf('\t') + 1;
+                    int wordEnd = line.indexOf('\t', wordStart);
+                    String word = line.substring(wordStart, wordEnd).trim();
+                    // keep only the first of several comma-separated readings
+                    int defStart = wordEnd+1;
+                    int defEnd = line.indexOf(',', defStart);
+                    if (defEnd < 0) defEnd = line.length();
+
+                    String definition = fixCircumflex.transliterate(line.substring(defStart, defEnd).trim());
+                    // no tone mark present: append "1" so the reading is converted as tone 1
+                    String notones = dropTones.transliterate(definition);
+                    if (definition.equals(notones)) {
+                        definition = digitPinyin_accentPinyin.transliterate(definition + "1");
+                        if (definition == null) {
+                            System.out.println("Huh? " + notones);
+                        }
+                        log.println("Fixing: " + notones + " => " + definition + "; " + line);
+                    }
+
+                    out.println(hex.transliterate(word) + "\t" + word + "\t" + definition);
+                }
+            } catch (Exception e) {
+                throw new ChainException("{0} Failed at {1}" , new Object []{new Integer(counter), line}, e);
+            } finally {
+                br.close();
+            }
+        } finally {
+            out.close();
+            log.close(); // log is opened by this method too; close it so buffered "Fixing:" lines are not lost
+        }
+    }
+
+
+    
    static Set overrideSet = new HashSet();
    
    static void processEdict(String word, String definition, String line) {
@ -997,7 +1703,9 @@ public final class GenerateHanTransliterator implements UCD_Types {
    
    static void readCDICT() throws IOException {
        System.out.println("Reading cdict.txt");
-        BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\cdict.txt", Utility.UTF8);
+        String fname = "cdict.txt";
+        
+        BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\" + fname, Utility.UTF8);
        int counter = 0;
        String[] pieces = new String[50];
        String line = "";
@ -1026,7 +1734,9 @@ public final class GenerateHanTransliterator implements UCD_Types {
                }
                for (int i = 0; i < len; ++i) {
                    String chr = word.substring(i, i+1);
-                    String piece = convertPinyin.transliterate(pieces[i]);
+                    
+                    String piece = digitToPinyin(pieces[i], line);
+                    
                    Map oldMap = (Map) cdict.get(chr);
                    if (oldMap == null) {
                        oldMap = new TreeMap();
@ -1069,6 +1779,11 @@ public final class GenerateHanTransliterator implements UCD_Types {
        }
    }
    
+    static String digitToPinyin(String source, String line) { // runs source through the "digit-pinyin" transliterator; line is used only for logging
+        if (source.indexOf('5') >= 0) log.println("Pinyin Tone5 at: " + line); // record inputs containing a '5'; conversion still proceeds
+        return digitPinyin_accentPinyin.transliterate(source);
+    }
+    
    static Map cdict = new TreeMap();
    static Map simplifiedToTraditional = new HashMap();
    static Map traditionalToSimplified = new HashMap();
@ -1098,7 +1813,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
            String property = line.substring(tabPos+1, tabPos2).trim();
            
            String propertyValue = line.substring(tabPos2+1).trim();
-            if (propertyValue.indexOf("U+") >= 0) propertyValue = fixHex.transliterate(propertyValue);
+            if (propertyValue.indexOf("U+") >= 0) propertyValue = fromHexUnicode.transliterate(propertyValue);
            
            // gather traditional mapping
            if (property.equals("kTraditionalVariant")) {
@ -1160,7 +1875,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
            }
            definition = definition.substring(0, end3);
            
-            definition = convertPinyin.transliterate(definition);
+            definition = digitToPinyin(definition, line);
        }
        if (type == DEFINITION) {
            definition = removeMatched(definition,'(', ')', line);
@ -1220,7 +1935,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
        return source;
    }
        
-    static Map unihanMap = new HashMap();
+    static Map unihanMap = new TreeMap(); // could be hashmap
    static Map duplicates = new TreeMap();
    
    static boolean unihanNonSingular = false;
@ -1274,14 +1989,26 @@ public final class GenerateHanTransliterator implements UCD_Types {
        }
    }
    
-    static Transliterator convertPinyin;
+    static Transliterator digitPinyin_accentPinyin;
+    
+    static Transliterator accentPinyin_digitPinyin = Transliterator.createFromRules("accentPinyin_digitPinyin", 
+        "::NFD; "
+        + " ([\u0304\u0301\u030C\u0300\u0306]) ([[:Mark:][:Letter:]]+) > $2 | $1;"
+        + "\u0304 > '1'; \u0301 > '2'; \u030C > '3'; \u0300 > '4'; \u0306 > '3';" 
+        + " ::NFC;", Transliterator.FORWARD);
+    
+    static Transliterator fixCircumflex = Transliterator.createFromRules("fix_circumflex", 
+        "::NFD; \u0306 > \u030C; ::NFC;", Transliterator.FORWARD);
+        
+    static Transliterator dropTones = Transliterator.createFromRules("drop_tones", 
+        "::NFD; \u0304 > ; \u0301 > ; \u030C > ; \u0300 > ; \u0306 > ; ::NFC;", Transliterator.FORWARD);
    
    static {
-        String dt = "1 > ;\n"
+        String dt = "1 > \u0304;\n"
                    + "2 <> \u0301;\n"
-                    + "3 <> \u0306;\n"
+                    + "3 <> \u030C;\n"
                    + "4 <> \u0300;\n"
-                    + "5 <> \u0304;";
+                    + "5 <> ;";
        
        String dp = "# syllable is ...vowel+ consonant* number\n"
                    + "# 'a', 'e' are the preferred bases\n"
@ -1301,8 +2028,8 @@ public final class GenerateHanTransliterator implements UCD_Types {
    	System.out.println(at.transliterate("a1a2a3a4a5"));
    	DummyFactory.add(at.getID(), at);
    	
-    	convertPinyin = Transliterator.createFromRules("digit-pinyin", dp, Transliterator.FORWARD);
-    	System.out.println(convertPinyin.transliterate("an2 aon2 oan2 ion2 oin2 uin2 iun2"));
+    	digitPinyin_accentPinyin = Transliterator.createFromRules("digit-pinyin", dp, Transliterator.FORWARD);
+    	System.out.println(digitPinyin_accentPinyin.transliterate("an2 aon2 oan2 ion2 oin2 uin2 iun2"));
    
    }
    /*
--- a/tools/unicodetools/com/ibm/text/UCD/Main.java
+++ b/tools/unicodetools/com/ibm/text/UCD/Main.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
-* $Date: 2002/10/05 01:28:58 $
-* $Revision: 1.25 $
+* $Date: 2003/02/25 23:38:22 $
+* $Revision: 1.26 $
 *
 *******************************************************************************
 */
@ -37,7 +37,10 @@ public final class Main implements UCD_Types {
        "PropList",
        "Scripts",
        "SpecialCasing",
+        "HangulSyllableType",
        "DerivedAge",
+        "StandardizedVariants",
+        //"HangulSyllable",
        //"OtherDerivedProperties",
    };

@ -71,6 +74,10 @@ public final class Main implements UCD_Types {
            else if (arg.equalsIgnoreCase("pinYinTransliterator")) GenerateHanTransliterator.main(2);
            else if (arg.equalsIgnoreCase("hanproperties")) GenerateHanTransliterator.readUnihan();
            
+            else if (arg.equalsIgnoreCase("fixChineseOverrides")) GenerateHanTransliterator.fixChineseOverrides();
+            
+            
+            
            else if (arg.equalsIgnoreCase("compareBlueberry")) VerifyUCD.compareBlueberry();
            
            else if (arg.equalsIgnoreCase("testenum")) SampleEnum.test();
@ -115,6 +122,7 @@ public final class Main implements UCD_Types {
            else if (arg.equalsIgnoreCase("JavascriptProperties")) WriteJavaScriptInfo.assigned();
            else if (arg.equalsIgnoreCase("TestDirectoryIterator")) DirectoryIterator.test();
            else if (arg.equalsIgnoreCase("checkIdentical")) GenerateData.handleIdentical();
+            else if (arg.equalsIgnoreCase("testnameuniqueness")) TestNameUniqueness.test();
            
            //else if (arg.equalsIgnoreCase("NormalizationCharts")) ChartGenerator.writeNormalizationCharts();
            
@ -191,10 +199,17 @@ public final class Main implements UCD_Types {
                GenerateData.generateVerticalSlice(NUMERIC_TYPE, NUMERIC_TYPE+NEXT_ENUM, GenerateData.HEADER_DERIVED,
                    "DerivedData/extracted/", "DerivedNumericType" );

+            } else if (arg.equalsIgnoreCase("HangulSyllableType")) {
+                GenerateData.generateVerticalSlice(HANGUL_SYLLABLE_TYPE,HANGUL_SYLLABLE_TYPE+NEXT_ENUM, GenerateData.HEADER_EXTEND,
+                    "DerivedData/", "HangulSyllableType" );
+            
            } else if (arg.equalsIgnoreCase("DerivedNumericValues")) {
                GenerateData.generateVerticalSlice(LIMIT_ENUM, LIMIT_ENUM, GenerateData.HEADER_DERIVED,
                    "DerivedData/extracted/", "DerivedNumericValues" );
            
+            } else if (arg.equalsIgnoreCase("StandardizedVariants")) {
+                GenerateStandardizedVariants.generate();
+            
    // OTHER STANDARD PROPERTIES
    
            } else if (arg.equalsIgnoreCase("CaseFolding")) {
@ -239,7 +254,7 @@ public final class Main implements UCD_Types {
            
            } else if (arg.equalsIgnoreCase("OtherDerivedProperties")) {
                //mask = Utility.setBits(0, NFC_Leading, NFC_Resulting);
-                GenerateData.generateDerived(ALL, false, GenerateData.HEADER_DERIVED, "OtherData/", "OtherDerivedProperties");
+                GenerateData.generateDerived((byte)(ALL & ~DERIVED_CORE & ~DERIVED_NORMALIZATION), false, GenerateData.HEADER_DERIVED, "OtherData/", "OtherDerivedProperties");

            } else if (arg.equalsIgnoreCase("AllBinary")) {
                GenerateData.generateVerticalSlice(BINARY_PROPERTIES, BINARY_PROPERTIES + NEXT_ENUM,
--- a/tools/unicodetools/com/ibm/text/UCD/Normalizer.java
+++ b/tools/unicodetools/com/ibm/text/UCD/Normalizer.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
-* $Date: 2002/07/30 09:56:41 $
-* $Revision: 1.13 $
+* $Date: 2003/02/25 23:38:22 $
+* $Revision: 1.14 $
 *
 *******************************************************************************
 */
@ -416,7 +416,11 @@ public final class Normalizer implements UCD_Types {
                        String s = ucd.getDecompositionMapping(i);
                        int len = UTF16.countCodePoint(s);
                        if (len != 2) {
-                            if (len > 2) throw new IllegalArgumentException("BAD LENGTH: " + len + ucd.toString(i));
+                            if (len > 2) {
+                                if (ucd.getVersion().compareTo("3.0.0") >= 0) {
+                                    throw new IllegalArgumentException("BAD LENGTH: " + len + ucd.toString(i));
+                                }
+                            }
                            continue;
                        }
                        int a = UTF16.charAt(s, 0);
--- a/tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt
@ -1,9 +1,7 @@
 # This file contains aliases for properties used in the UCD.
 # These names can be used for XML formats of UCD data, for regular-expression
 # property tests, and other programmatic textual descriptions of Unicode data.
-# The names are not normative, except where they correspond to normative
-# properties in the UCD. For information on which properties are normative,
-# see UnicodeCharacterDatabase.html.
+# For information on which properties are normative, see UCD.html.
 #
 # The names may be translated in appropriate environments, and additional
 # aliases may be useful.
@ -20,16 +18,14 @@
 # and '_' are ignored.
 #
 # NOTE: Currently there is at most one abbreviated name and one long name for
-# each property. However, in the future additional aliases
-# may be added. In such a case, the first line for the property
-# would have the preferred alias for output.
+# each property. However, in the future additional aliases may be added.
 #
 # NOTE: The property value names are NOT unique across properties, especially
-# with loose matches. For example,
+# with loose matches. For example:
 #
-# AL means Arabic Letter for the Bidi_Class property, and
-# AL means Alpha_Left for the Combining_Class property, and
-# AL means Alphabetic for the Line_Break property.
+#   AL means Arabic Letter for the Bidi_Class property, and
+#   AL means Alpha_Left for the Combining_Class property, and
+#   AL means Alphabetic for the Line_Break property.
 #
 # In addition, some property names may be the same as some property value names.
 #
--- a/tools/unicodetools/com/ibm/text/UCD/PropertyLister.java
+++ b/tools/unicodetools/com/ibm/text/UCD/PropertyLister.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/PropertyLister.java,v $
-* $Date: 2002/05/29 02:01:00 $
-* $Revision: 1.9 $
+* $Date: 2003/02/25 23:38:22 $
+* $Revision: 1.10 $
 *
 *******************************************************************************
 */
@ -57,7 +57,7 @@ abstract public class PropertyLister implements UCD_Types {
    }

    public String optionalComment(int cp) {
-        if (!usePropertyComment || !breakByCategory) return "";
+        if (!usePropertyComment) return "";
        return ucdData.getModCatID_fromIndex(getModCat(cp));
    }

@ -143,7 +143,8 @@ abstract public class PropertyLister implements UCD_Types {
    }
    
    byte getModCat(int cp) {
-    	return ucdData.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : 0);
+    	byte result = ucdData.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : 0);
+    	return result;
    }


--- a/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasHeader.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasHeader.txt
@ -1,9 +1,7 @@
 # This file contains aliases for property values used in the UCD.
 # These names can be used for XML formats of UCD data, for regular-expression
 # property tests, and other programmatic textual descriptions of Unicode data.
-# The names are not normative, except where they correspond to normative property
-# values in the UCD. For information on which properties are normative, see
-# UnicodeCharacterDatabase.html.
+# For information on which properties are normative, see UCD.html.
 #
 # The names may be translated in appropriate environments, and additional
 # aliases may be useful.
@ -22,29 +20,29 @@
 #
 # Third Field: The third field is a long name.
 #
-# In the case of ccc, their are 4 fields. The second field is numeric, third
+# In the case of ccc, there are 4 fields. The second field is numeric, third
 # is abbreviated, and fourth is long.
 #
 # With loose matching of property names, the case distinctions, whitespace,
 # and '_' are ignored.
 #
-# NOTE: The Block property values are in Blocks.txt, and not repeated here.
-# For more information on the use of blocks, see UTR #18: Regular Expression Guidelines
-#
 # NOTE: Currently there is at most one abbreviated name and one long name for
-# property value. However, in the future additional aliases
-# may be added. In such a case, the first line for the property value
-# would have the preferred alias for output.
+# each property value. However, in the future additional aliases may be added.
+# In such a case, the first line for the property value would have
+# the preferred alias for output.
 #
 # NOTE: The property value names are NOT unique across properties, especially
-# with loose matches. For example,
+# with loose matches. For example:
+#
 # AL means Arabic Letter for the Bidi_Class property, and
 # AL means Alpha_Left for the Combining_Class property, and
 # AL means Alphabetic for the Line_Break property.
 #
-# In addition, some property names may be the same as some property value names:
-# cc means Combining_Class property, and
-# cc means the General_Category property value Control (cc)
+# In addition, some property names may be the same as some property value names.
+# For example:
+#
+#   cc means Combining_Class property, and
+#   cc means the General_Category property value Control (cc)
 #
 # The combination of property value and property name is, however, unique.
-# For more information, see UTR #24: Regular Expression Guidelines
+# For more information, see UTR #18: Regular Expression Guidelines
--- a/tools/unicodetools/com/ibm/text/UCD/QuickTest.java
+++ b/tools/unicodetools/com/ibm/text/UCD/QuickTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/QuickTest.java,v $
-* $Date: 2002/10/05 01:28:58 $
-* $Revision: 1.1 $
+* $Date: 2003/02/25 23:38:22 $
+* $Revision: 1.2 $
 *
 *******************************************************************************
 */
@ -23,25 +23,38 @@ import com.ibm.text.utility.*;
 public class QuickTest implements UCD_Types {
    static final void test() {
        Default.setUCD();
+        UnicodeSet format = new UnicodeSet("[:Cf:]");
 /*
-         [4]    NameStartChar := ":" | [A-Z] | "_" | [a-z] |
-		 		 [#xC0 - #x2FF] | [#x370 - #x37D] | [#x37F - #x1FFF] |
-		 		 [#x200C - #x200D] | [#x2070 - #x218F] | [#x2C00 - #x2FEF] | 
-		 		 [#x3001 - #xD7FF] | [#xF900 - #xF9FF] | [#x10000 - #xDFFFF]
-
- [4a]    NameChar := NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F]
+ [4]     NameStartChar := ":" | [A-Z] | "_" | [a-z] |
+            [#xC0-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] |
+            [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] |
+            [#x3001-#xD7FF] | [#xF900-#xEFFFF]
+ [4a]    NameChar := NameStartChar | "-" | "." | [0-9] | #xB7 |
+            [#x0300-#x036F] | [#x203F-#x2040]
 */
        UnicodeSet nameStartChar = new UnicodeSet("[\\: A-Z \\_ a-z"
            + "\\u00c0-\\u02FF \\u0370-\\u037D \\u037F-\\u1FFF"
            + "\\u200C-\\u200D \\u2070-\\u218F \\u2C00-\\u2FEF"
-		 	+ "\\u3001-\\uD7FF \\uF900-\\uF9FF \\U00010000-\\U000DFFFF]");
+		 	+ "\\u3001-\\uD7FF \\uF900-\\U000EFFFF]");
 		 	
-        UnicodeSet nameChar = new UnicodeSet("[\\- \\. 0-9 \\u00B7 \\u0300-\\u036F]")
+        UnicodeSet nameChar = new UnicodeSet("[\\- \\. 0-9 \\u00B7 "
+            + "\\u0300-\\u036F \\u203F-\\u2040]")
            .addAll(nameStartChar);
            
+        UnicodeSet nameAll = new UnicodeSet(nameChar).addAll(nameStartChar);
+            
 		showSet("NameStartChar", nameStartChar);
 		showDiffs("NameChar", nameChar, "NameStartChar", nameStartChar);
 		
+		
+        UnicodeSet ID_Start = new UnicodeSet("[:ID_Start:]");
+        UnicodeSet ID_Continue = new UnicodeSet("[:ID_Continue:]").removeAll(format);	
+        
+        UnicodeSet ID_All = new UnicodeSet(ID_Start).addAll(ID_Continue);
+        
+		showDiffs("ID_All", ID_All, "nameAll", nameAll);
+		showDiffs("ID_Start", ID_Start, "nameStartChar", nameStartChar);
+		

        UnicodeSet defaultIgnorable = UnifiedBinaryProperty.make(DERIVED | DefaultIgnorable).getSet();
        UnicodeSet whitespace = UnifiedBinaryProperty.make(BINARY_PROPERTIES | White_space).getSet();
@ -49,7 +62,6 @@ public class QuickTest implements UCD_Types {
        UnicodeSet notNFKC = new UnicodeSet();
        UnicodeSet privateUse = new UnicodeSet();
        UnicodeSet noncharacter = new UnicodeSet();
-        UnicodeSet format = new UnicodeSet("[:Cf:]");
        
        for (int i = 0; i <= 0x10FFFF; ++i) {
            if (!Default.ucd.isAllocated(i)) continue;
--- a/tools/unicodetools/com/ibm/text/UCD/SpecialCasingFooter.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/SpecialCasingFooter.txt
@ -48,14 +48,14 @@
 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
 # The following rules handle those cases.

-0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
-0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
+0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
+0130; 0069; 0130; 0130; az; # LATIN CAPITAL LETTER I WITH DOT ABOVE

 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
 # This matches the behavior of the canonically equivalent I-dot_above

-0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
-0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
+0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
+0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE

 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.

--- a/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt
@ -4,8 +4,7 @@
 # It contains additional information about the casing of Unicode characters.
 # (For compatibility, the UnicodeData.txt file only contains case mappings for
 # characters where they are 1-1, and does not have locale-specific mappings.)
-# For more information, see
-# UTR #21 Case Mappings, at http://www.unicode.org/unicode/reports/tr21/
+# For more information, see the discussion of Case Mappings in the Unicode Standard.
 #
 # ================================================================================
 # Format
@ -31,10 +30,10 @@
 # <ISO_3166_code> := 2-letter ISO country code,
 # <ISO_639_code> :=  2-letter ISO language code
 #
-# A context is one of the following, as defined in UAX #21: Case Mappings:
-#   Final_Sigma, After_Soft_Dotted, More_Above, Before_Dot
+# A context is one of the following, as defined in the Unicode Standard:
+#   Final_Sigma, After_Soft_Dotted, More_Above, Before_Dot, Not_Before_Dot, After_I
 #
-# Parsers of this file must be prepared to deal future additions to this format:
+# Parsers of this file must be prepared to deal with future additions to this format:
 #  * Additional contexts
 #  * Additional fields
 # ================================================================================
--- a/tools/unicodetools/com/ibm/text/UCD/UCD.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
-* $Date: 2002/10/05 01:28:58 $
-* $Revision: 1.19 $
+* $Date: 2003/02/25 23:38:22 $
+* $Revision: 1.20 $
 *
 *******************************************************************************
 */
@ -35,7 +35,7 @@ public final class UCD implements UCD_Types {
    /**
     * Used for the default version.
     */
-    public static final String latestVersion = "3.2.1";
+    public static final String latestVersion = "4.0.0";

    /**
     * Create singleton instance for default (latest) version
@ -79,17 +79,19 @@ public final class UCD implements UCD_Types {
     */
    public boolean isAllocated(int codePoint) {
        if (getCategory(codePoint) != Cn) return true;
-        if (major >= 2 && codePoint >= 0xF0000 && codePoint <= 0x10FFFD) return true;
+        if (compositeVersion >= 0x20000 && codePoint >= 0xF0000 && codePoint <= 0x10FFFD) return true;
        if (isNoncharacter(codePoint)) return true;
        return false;
    }
    
    public boolean isNoncharacter(int codePoint) {
        if ((codePoint & 0xFFFE) == 0xFFFE) {
-            if (major < 2 && codePoint > 0xFFFF) return false;
+            if (compositeVersion < 0x20000 && codePoint > 0xFFFF) return false;
+            // major < 2
            return true;
        }
-        if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF && major >= 3 && minor >= 1) return true;
+        if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF && compositeVersion >= 0x30100) return true;
+        // major >= 3 && minor >= 1
        return false;
    }

@ -239,8 +241,9 @@ public final class UCD implements UCD_Types {
    
    public byte getModCat(int cp, int collapseBits) {
        byte cat = getCategory(cp);
-        if (cat == UNASSIGNED && isNoncharacter(cp)) cat = FAKENC;
-        if (((1<<cat) & collapseBits) != 0) {
+        if (cat == UNASSIGNED && isNoncharacter(cp)) {
+            cat = FAKENC;
+        } else if (((1<<cat) & collapseBits) != 0) {
        	switch (cat) {
 				case UNASSIGNED: cat = FAKE_OTHER; break;
 				case FAKENC: cat = FAKE_OTHER; break;
@ -281,7 +284,17 @@ public final class UCD implements UCD_Types {
 				case CURRENCY_SYMBOL: cat = FAKE_SYMBOL; break;
 				case MODIFIER_SYMBOL: cat = FAKE_SYMBOL; break;
 				case OTHER_SYMBOL: cat = FAKE_SYMBOL; break;
-
+			}
+            if (collapseBits == -1) {
+                switch (cat) {
+                    case FAKE_MARK: 
+                    case FAKE_NUMBER:
+                    case FAKE_SEPERATOR:
+                    case FAKE_PUNCTUATION:
+                    case FAKE_SYMBOL:
+                        cat = FAKE_LETTER;
+                        break;
+                }
 			}
        }
        return cat;
@ -832,7 +845,7 @@ public final class UCD implements UCD_Types {
        return style == SHORT ? UCD_Names.SHORT_BP[bit] : UCD_Names.BP[bit];
    }

-    public static int mapToRepresentative(int ch, boolean old) {
+    public static int mapToRepresentative(int ch, boolean lessThan20105) {
        if (ch <= 0xFFFD) {
            //if (ch <= 0x2800) return ch;
            //if (ch <= 0x28FF) return 0x2800;    // braille
@ -850,7 +863,7 @@ public final class UCD implements UCD_Types {
            if (ch <= 0xDFFF) return 0xDC00;
            if (ch <= 0xE000) return ch;         // Private Use
            if (ch <= 0xF8FF) return 0xE000;
-            if (old) {
+            if (lessThan20105) {
                if (ch <= 0xF900) return ch;         // CJK Compatibility Ideograp
                if (ch <= 0xFA2D) return 0xF900;
            }
@ -870,37 +883,43 @@ public final class UCD implements UCD_Types {
        return ch;
    }

-    public boolean isIdentifierStart(int cp, boolean extended) {
+    public boolean isIdentifierStart(int cp) {
+        /*
        if (extended) {
            if (cp == 0x0E33 || cp == 0x0EB3 || cp == 0xFF9E || cp == 0xFF9F) return false;
            if (cp == 0x037A || cp >= 0xFC5E && cp <= 0xFC63 || cp == 0xFDFA || cp == 0xFDFB) return false;
            if (cp >= 0xFE70 && cp <= 0xFE7E && (cp & 1) == 0) return false;
        }
+        */
        byte cat = getCategory(cp);
        if (cat == Lu || cat == Ll || cat == Lt || cat == Lm || cat == Lo || cat == Nl) return true;
+        if (getBinaryProperty(cp, ID_Start_Exceptions)) return true;
        return false;
    }

-    public boolean isIdentifierContinue_NO_Cf(int cp, boolean extended) {
-        if (isIdentifierStart(cp, extended)) return true;
+    public boolean isIdentifierContinue_NO_Cf(int cp) {
+        if (isIdentifierStart(cp)) return true;
+        /*
        if (extended) {
            if (cp == 0x00B7) return true;
            if (cp == 0x0E33 || cp == 0x0EB3 || cp == 0xFF9E || cp == 0xFF9F) return true;
        }
+        */
        byte cat = getCategory(cp);
        if (cat == Mn || cat == Mc || cat == Nd || cat == Pc) return true;
+        if (getBinaryProperty(cp, ID_Start_Exceptions)) return true;
        return false;
    }

-    public boolean isIdentifier(String s, boolean extended) {
+    public boolean isIdentifier(String s) {
        if (s.length() == 0) return false; // at least one!
        int cp;
        for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
            cp = UTF32.char32At(s, i);
            if (i == 0) {
-                if (!isIdentifierStart(cp, extended)) return false;
+                if (!isIdentifierStart(cp)) return false;
            } else {
-                if (!isIdentifierContinue_NO_Cf(cp, extended)) return false;
+                if (!isIdentifierContinue_NO_Cf(cp)) return false;
            }
        }
        return true;
@ -940,9 +959,10 @@ to guarantee identifier closure.
    private String file;
    private long date = -1;
    private byte format = -1;
-    private byte major = -1;
-    private byte minor = -1;
-    private byte update = -1;
+    //private byte major = -1;
+    //private byte minor = -1;
+    //private byte update = -1;
+    private int compositeVersion = -1;
    private int size = -1;

    // cache last UData
@ -971,7 +991,7 @@ to guarantee identifier closure.
        if (codePoint >= 0x2800 && codePoint <= 0x28FF) return true; 
        if (codePoint >= 0x2F800 && codePoint <= 0x2FA1D) return true;
        
-        int rangeStart = mapToRepresentative(codePoint, major < 2);
+        int rangeStart = mapToRepresentative(codePoint, compositeVersion < 0x020105);
        switch (rangeStart) {
          default:
            return getRaw(codePoint) == null;
@ -999,6 +1019,11 @@ to guarantee identifier closure.

    // access data for codepoint
    UData get(int codePoint, boolean fixStrings) {
+        /*if (codePoint == 0xF901) {
+            System.out.println(version + ", " + Integer.toString(compositeVersion, 16));
+            System.out.println("debug: ");
+        }
+        */
        if (codePoint < 0 || codePoint > 0x10FFFF) {
            throw new IllegalArgumentException("Illegal Code Point: " + Utility.hex(codePoint));
        }
@ -1024,11 +1049,11 @@ to guarantee identifier closure.

        // do range stuff
        String constructedName = null;
-        int rangeStart = mapToRepresentative(codePoint, major < 2);
+        int rangeStart = mapToRepresentative(codePoint, compositeVersion < 0x020105);
        boolean isHangul = false;
        switch (rangeStart) {
          case 0xF900:
-            if (major < 2) {
+            if (compositeVersion < 0x020105) {
                if (fixStrings) constructedName = "CJK COMPATIBILITY IDEOGRAPH-" + Utility.hex(codePoint, 4);
                break;
            }
@ -1198,9 +1223,11 @@ to guarantee identifier closure.
    }

    static boolean isLeadingJamoComposition(int char1) {
-        return (LBase <= char1 && char1 < LLimit
-            ||  SBase <= char1 && char1 < SLimit
-                && ((char1 - SBase) % TCount) == 0);
+        // True when char1 is a leading jamo or an LV syllable (see isLV).
+        return isLeadingJamo(char1) || isLV(char1);
+    }
+
+    /**
+     * Tests whether char1 lies in [SBase, SLimit) at an offset from SBase that
+     * is an exact multiple of TCount.
+     */
+    static boolean isLV(int char1) {
+        return (SBase <= char1 && char1 < SLimit && ((char1 - SBase) % TCount) == 0);
    }

    static boolean isVowelJamo(int cp) {
@ -1218,6 +1245,24 @@ to guarantee identifier closure.
    static boolean isNonLeadJamo(int cp) {
        return (VBase <= cp && cp < VLimit) || (TBase <= cp && cp < TLimit);
    }
+    
+    /**
+     * Returns the Hangul syllable type constant (L, V, T, LV, LVT or NA) for cp.
+     * The checks run in a fixed order: the LV test comes before the general
+     * syllable test, so LVT is returned only for syllables that isLV rejects.
+     */
+    static byte getHangulSyllableType(int cp) {
+        if (isLeadingJamo(cp)) {
+            return L;
+        }
+        if (isVowelJamo(cp)) {
+            return V;
+        }
+        if (isTrailingJamo(cp)) {
+            return T;
+        }
+        if (isLV(cp)) {
+            return LV;
+        }
+        if (isHangulSyllable(cp)) {
+            return LVT;
+        }
+        return NA;
+    }
+
+    /**
+     * Looks up the name of a Hangul syllable type by index. The LONG style
+     * selects the long-name table; any other style selects the short-name table.
+     */
+    static String getHangulSyllableTypeID_fromIndex(byte index, byte style) {
+        String[] names = (style == LONG)
+                ? UCD_Names.LONG_HANGUL_SYLLABLE_TYPE
+                : UCD_Names.HANGUL_SYLLABLE_TYPE;
+        return names[index];
+    }
+
+    /**
+     * Returns the Hangul syllable type name of char1 in the requested style.
+     */
+    static String getHangulSyllableTypeID(int char1, byte style) {
+        byte typeIndex = getHangulSyllableType(char1);
+        return getHangulSyllableTypeID_fromIndex(typeIndex, style);
+    }

    private void fillFromFile(String version) {
    	try {
@ -1243,9 +1288,11 @@ to guarantee identifier closure.
                    128*1024));
            // header
            format = dataIn.readByte();
-            major = dataIn.readByte();
-            minor = dataIn.readByte();
-            update = dataIn.readByte();
+            byte major = dataIn.readByte();
+            byte minor = dataIn.readByte();
+            byte update = dataIn.readByte();
+            compositeVersion = (major << 16) | (minor << 8) | update;
+            
            String foundVersion = major + "." + minor + "." + update;
            if (format != BINARY_FORMAT || !version.equals(foundVersion)) {
                throw new ChainException("Illegal data file format for {0}: {1}, {2}",
@ -1262,7 +1309,7 @@ to guarantee identifier closure.
                UData uData = new UData();
                uData.readBytes(dataIn);

-                if (DEBUG && uData.codePoint == 0x2801) {
+                // Spot-check output is diagnostic only: keep it behind DEBUG so
+                // that ordinary data loads do not print to stdout.
+                if (DEBUG && uData.codePoint == 0x0221) {
                    System.out.println("SPOT-CHECK: " + uData);
                }

--- a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
-* $Date: 2002/10/05 01:28:58 $
-* $Revision: 1.15 $
+* $Date: 2003/02/25 23:38:22 $
+* $Revision: 1.16 $
 *
 *******************************************************************************
 */
@ -53,6 +53,7 @@ final class UCD_Names implements UCD_Types {
        "BidiMirrored (listing UnicodeData.txt, field 9: see UnicodeData.html)",
        "Script",
        "Age (from a comparison of UCD versions 1.1 [minus Hangul], 2.0, 2.1, 3.0, 3.1)",
+        "Hangul Syllable Type\r\n# All codepoints not explicitly listed here have the value NA",
        "Derived"
    };

@ -69,6 +70,7 @@ final class UCD_Names implements UCD_Types {
        "",
        "Script",
        "Age",
+        "Hangul_Syllable_Type",
        ""
    };

@ -85,6 +87,7 @@ final class UCD_Names implements UCD_Types {
        "",
        "sc",
        "ag",
+        "hst",
        "",
    };

@ -121,6 +124,7 @@ final class UCD_Names implements UCD_Types {
        "Deprecated",
        "Soft_Dotted",
        "Logical_Order_Exception",
+        "ID_Start_Exceptions",
    };

    static final String[] SHORT_BP = {
@ -155,6 +159,7 @@ final class UCD_Names implements UCD_Types {
        "Dep",
        "SD",
        "LOE",
+        "IDSX",
    };

    /*
@ -273,6 +278,14 @@ final class UCD_Names implements UCD_Types {
    "HANUNOO",
    "BUHID",
    "TAGBANWA",
+    "LIMBU",
+    "TAI_LE",
+    "LINEAR_B",
+    "UGARITIC",
+    "SHAVIAN",
+    "OSMANYA",
+    "CYPRIOT", 
+    
  };

 	public static final String[] ABB_SCRIPT = {
@ -322,6 +335,13 @@ final class UCD_Names implements UCD_Types {
    "Hano",
    "Buhd",
    "Tagb",
+    // Four-letter ISO 15924 codes, matching the other entries of this table
+    // (the long names belong in the long script-name table).
+    "Limb",
+    "Tale",
+    "Linb",
+    "Ugar",
+    "Shaw",
+    "Osma",
+    "Cprt",
  };


@ -330,7 +350,8 @@ final class UCD_Names implements UCD_Types {
    "UNSPECIFIED",
    "1.1",
    "2.0", "2.1",
-    "3.0", "3.1"
+    "3.0", "3.1", "3.2",
+    "4.0"
  };


@ -573,6 +594,24 @@ final class UCD_Names implements UCD_Types {

    public static byte ON = Utility.lookup("ON", BC, true);

+    /**
+     * Short names of the Hangul syllable types, indexed by the UCD_Types
+     * constants NA (0) through LVT (5).
+     */
+    public static String[] HANGUL_SYLLABLE_TYPE = {
+        "NA",
+        "L",
+        "V",
+        "T",
+        "LV",
+        "LVT",
+    };
+
+    /**
+     * Long names of the Hangul syllable types, indexed by the UCD_Types
+     * constants NA (0) through LVT (5); entries parallel HANGUL_SYLLABLE_TYPE.
+     */
+    public static String[] LONG_HANGUL_SYLLABLE_TYPE = {
+        "Not_Applicable",
+        "Leading_Jamo",
+        "Vowel_Jamo",
+        "Trailing_Jamo",
+        "LV_Syllable",
+        "LVT_Syllable",
+    };
+
    public static String[] JOINING_TYPE = {
        "C",
        "D",
@ -643,6 +682,9 @@ final class UCD_Names implements UCD_Types {
        "YUDH",
        "YUDH_HE",
        "ZAIN",
+        "ZHAIN",
+        "KHAPH",
+        "FE",
    };

    public static String[] OLD_JOINING_GROUP = {
@ -697,6 +739,9 @@ final class UCD_Names implements UCD_Types {
        "YUDH",
        "YUDH_HE",
        "ZAIN",
+        "ZHAIN",
+        "KHAPH",
+        "FE",
    };


--- a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
-* $Date: 2002/10/05 01:28:58 $
-* $Revision: 1.16 $
+* $Date: 2003/02/25 23:38:22 $
+* $Revision: 1.17 $
 *
 *******************************************************************************
 */
@ -15,7 +15,7 @@ package com.ibm.text.UCD;

 public interface UCD_Types {
    
-    public static final int dVersion = 2; // change to fix the generated file D version. If less than zero, no "d"
+    public static final int dVersion = 10; // change to fix the generated file D version. If less than zero, no "d"
    
    public static final String BASE_DIR = "C:\\DATA\\";
    public static final String UCD_DIR = BASE_DIR + "UCD\\";
@ -41,7 +41,7 @@ public interface UCD_Types {
        NOT_DERIVED = 1, 
        DERIVED_CORE = 2, 
        DERIVED_NORMALIZATION = 4, 
-        DERIVED_ALL = 6, 
+        DERIVED_ALL = 0x6, 
        ALL = (byte)-1;
        
     static final byte
@ -86,9 +86,10 @@ public interface UCD_Types {
        BINARY_PROPERTIES = 0x900,
        SCRIPT = 0xA00,
        AGE = 0xB00,
-        DERIVED = 0xC00,
-        NEXT_ENUM = 0x100,
-        LIMIT_ENUM = DERIVED + 0x100;
+        HANGUL_SYLLABLE_TYPE = 0xC00,
+        DERIVED = 0xD00,
+        LIMIT_ENUM = DERIVED + 0x100,
+        NEXT_ENUM = 0x100;

    public static final int LIMIT_COMBINING_CLASS = 256;

@ -207,7 +208,8 @@ public interface UCD_Types {
        Deprecated = 28,
        Soft_Dotted = 29,
        Logical_Order_Exception = 30,
-	    LIMIT_BINARY_PROPERTIES = 31;
+        ID_Start_Exceptions = 31,
+	    LIMIT_BINARY_PROPERTIES = 32;

 	/*
    static final int
@ -309,6 +311,9 @@ public interface UCD_Types {
    // numericType
    static final byte NUMERIC_NONE = 0, NUMERIC = 1, DIGIT = 2, DECIMAL = 3,
        LIMIT_NUMERIC_TYPE = 4;
+        
+    // hangulSyllableType: values double as indices into the
+    // UCD_Names.HANGUL_SYLLABLE_TYPE / LONG_HANGUL_SYLLABLE_TYPE name tables
+    static final byte NA = 0, L = 1, V = 2, T = 3, LV = 4, LVT = 5,
+        HANGUL_SYLLABLE_TYPE_LIMIT = 6;

    public static final byte // SCRIPT CODE
        COMMON_SCRIPT = 0,
@ -357,7 +362,14 @@ public interface UCD_Types {
        HANUNOO_SCRIPT = 43,
        BUHID_SCRIPT = 44,
        TAGBANWA_SCRIPT = 45,
-        LIMIT_SCRIPT = 46;
+        LIMBU = 46,
+        TAI_LE = 47,
+        LINEAR_B = 48,
+        UGARITIC = 49,
+        SHAVIAN = 50,
+        OSMANYA = 51,
+        CYPRIOT = 52,
+        LIMIT_SCRIPT = 53;

  static final int
    UNKNOWN = 0,
@ -366,7 +378,9 @@ public interface UCD_Types {
    AGE21 = 3,
    AGE30 = 4,
    AGE31 = 5,
-    LIMIT_AGE = 6;
+    AGE32 = 6,
+    AGE40 = 7,
+    LIMIT_AGE = 8;



@ -431,7 +445,11 @@ public static byte
    YUDH = 48,
    YUDH_HE = 49,
    ZAIN = 50,
-    LIMIT_JOINING_GROUP = 51;
+    ZHAIN = 51,
+    KHAPH = 52,
+    FE = 53,
+    
+    LIMIT_JOINING_GROUP = 54;
    
    static final byte NFD = 0, NFC = 1, NFKD = 2, NFKC = 3;    
    public static final int
@ -500,7 +518,9 @@ public static byte
        NFC_Skippable = 42,
        NFKD_Skippable = 43,
        NFKC_Skippable = 44,
+        
+        Case_Sensitive = 45,

-        DERIVED_PROPERTY_LIMIT = 41;
+        DERIVED_PROPERTY_LIMIT = 46;
    
 }
--- a/tools/unicodetools/com/ibm/text/UCD/UnicodeProperty.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UnicodeProperty.java
@ -35,6 +35,8 @@ public abstract class UnicodeProperty implements UCD_Types {
      public boolean isStandard() { return isStandard; }
      public void setStandard(boolean in) { isStandard = in; }
      
+      /**
+       * Base implementation: always returns false. UnifiedBinaryProperty
+       * overrides this to return true for selected property/value pairs.
+       */
+      public boolean isDefaultValue() {return false;}
+      
      /**
       * What type is it? DERIVED..
       */
--- a/tools/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java,v $
-* $Date: 2002/10/05 01:28:57 $
-* $Revision: 1.10 $
+* $Date: 2003/02/25 23:38:22 $
+* $Revision: 1.11 $
 *
 *******************************************************************************
 */
@ -122,7 +122,11 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
        propValue = propMask & 0xFF;
        
        //System.out.println("A: " + getValueType());
-        if (majorProp <= (JOINING_GROUP>>8) || majorProp == SCRIPT>>8) setValueType(FLATTENED_BINARY);
+        if (majorProp <= (JOINING_GROUP>>8) 
+                || majorProp == (SCRIPT>>8) 
+                || majorProp==(HANGUL_SYLLABLE_TYPE>>8)) {
+            setValueType(FLATTENED_BINARY);
+        }
        //System.out.println("B: " + getValueType());
        
        header = UCD_Names.UNIFIED_PROPERTY_HEADERS[majorProp];
@ -217,6 +221,8 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
            return true;
          case AGE>>8: if (propValue >= LIMIT_AGE) break;
            return true;
+          case HANGUL_SYLLABLE_TYPE>>8: if (propValue >= HANGUL_SYLLABLE_TYPE_LIMIT) break;
+            return true;
            /*
          case DERIVED>>8:
            UnicodeProperty up = DerivedProperty.make(propValue, ucd);
@ -227,6 +233,28 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
        return false;
    }
    
+    /**
+     * Reports whether this property's (major property, value) pair is one of
+     * the pairs listed below as a default value. The pair is encoded as
+     * (majorProp &lt;&lt; 8) | propValue, the same layout as the UCD_Types
+     * property constants.
+     */
+    public boolean isDefaultValue() {
+        final int propertyAndValue = (majorProp << 8) | propValue;
+        switch (propertyAndValue) {
+            // Deliberately not listed (kept for reference):
+            //case CATEGORY | Cn:
+            //case COMBINING_CLASS | 0:
+            //case BIDI_CLASS | BIDI_L:
+            // case EAST_ASIAN_WIDTH | EAN:
+            // case LINE_BREAK | LB_XX:
+            case DECOMPOSITION_TYPE | NONE:
+            case NUMERIC_TYPE | NUMERIC_NONE:
+            case JOINING_TYPE | JT_U:
+            case JOINING_GROUP | NO_SHAPING:
+            case BINARY_PROPERTIES | Non_break:
+            case BINARY_PROPERTIES | CaseFoldTurkishI:
+            case SCRIPT | COMMON_SCRIPT:
+            case HANGUL_SYLLABLE_TYPE | NA:
+                return true;
+            default:
+                return false;
+        }
+    }
+      
+    
+    
    public boolean hasValue(int cp) {
        try {
            switch (majorProp) {
@ -242,6 +270,8 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
            case BINARY_PROPERTIES>>8: return ucd.getBinaryProperty(cp, propValue);
            case SCRIPT>>8: return ucd.getScript(cp) == propValue;
            case AGE>>8: return ucd.getAge(cp) == propValue;
+            case HANGUL_SYLLABLE_TYPE>>8: return ucd.getHangulSyllableType(cp) == propValue;
+            // return true;
                /*
            case DERIVED>>8:
                UnicodeProperty up = DerivedProperty.make(propValue, ucd);
@ -307,6 +337,7 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
            case BINARY_PROPERTIES>>8: return ucd.getBinaryPropertiesID_fromIndex((byte)propValue, style);
            case SCRIPT>>8: return ucd.getScriptID_fromIndex((byte)propValue, style);
            case AGE>>8: return ucd.getAgeID_fromIndex((byte)propValue);
+            case HANGUL_SYLLABLE_TYPE>>8: return ucd.getHangulSyllableTypeID_fromIndex((byte)propValue, style);
                /*
            case DERIVED>>8:
                UnicodeProperty up = DerivedProperty.make(propValue, ucd);
@ -337,6 +368,7 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
            case BINARY_PROPERTIES>>8: return LONG;
            case SCRIPT>>8: return LONG;
            case AGE>>8: return LONG;
+            case HANGUL_SYLLABLE_TYPE>>8: return SHORT;
            }
        } catch (RuntimeException e) {
            throw new ChainException("Illegal property Number {0}, {1}", new Object[]{
--- a/tools/unicodetools/com/ibm/text/UCD/UnifiedProperty.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UnifiedProperty.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UnifiedProperty.java,v $
-* $Date: 2002/10/05 01:28:57 $
-* $Revision: 1.2 $
+* $Date: 2003/02/25 23:38:22 $
+* $Revision: 1.3 $
 *
 *******************************************************************************
 */
@ -142,6 +142,7 @@ public final class UnifiedProperty extends UnicodeProperty {
          case JOINING_GROUP>>8:
          case SCRIPT>>8:
          case AGE>>8:
+          case HANGUL_SYLLABLE_TYPE>>8:
            return true;
            /*
          case DERIVED>>8:
@ -181,7 +182,9 @@ public final class UnifiedProperty extends UnicodeProperty {
        case JOINING_GROUP>>8: return ucd.getJoiningGroupID_fromIndex(ucd.getJoiningGroup(cp), style);
        case SCRIPT>>8: return ucd.getScriptID_fromIndex(ucd.getScript(cp), style);
        case AGE>>8: return ucd.getAgeID_fromIndex(ucd.getAge(cp), style);
+        case HANGUL_SYLLABLE_TYPE>>8: 
+            return ucd.getHangulSyllableTypeID(cp,style);
        default: throw new IllegalArgumentException("Internal Error");
        }
    }
-}
+}
--- a/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java
+++ b/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
-* $Date: 2002/08/09 23:56:24 $
-* $Revision: 1.19 $
+* $Date: 2003/02/25 23:38:22 $
+* $Revision: 1.20 $
 *
 *******************************************************************************
 */
@ -1806,8 +1806,11 @@ E0020-E007F; [TAGGING CHARACTERS]

                String x_cp = 'x' + UTF32.valueOf32(cp);
                String nfx_x_cp = normalize(x_cp, j);
-                plain = Default.ucd.isIdentifier(x_cp, true);
-                norm = Default.ucd.isIdentifier(nfx_x_cp, true);
+                if (true) {
+                    throw new RuntimeException("Fix plain & norm, 4 instances!!");
+                }
+                // plain = Default.ucd.isIdentifier(x_cp, true);
+                //norm = Default.ucd.isIdentifier(nfx_x_cp, true);
                if (plain & !norm) {
                    Utility.fixDot();
                    System.out.println("*Not Identifier: " + Default.ucd.getCodeAndName(cp));
@ -1822,8 +1825,8 @@ E0020-E007F; [TAGGING CHARACTERS]
                }

                String nfx_cp = normalize(UTF32.valueOf32(cp), j);
-                plain = Default.ucd.isIdentifierStart(cp, true);
-                norm = Default.ucd.isIdentifier(nfx_cp, true);
+                // plain = Default.ucd.isIdentifierStart(cp, true);
+                // norm = Default.ucd.isIdentifier(nfx_cp, true);
                if (plain & !norm) {
                    Utility.fixDot();
                    System.out.println(" Changes Category: " + Default.ucd.getCodeAndName(cp));
--- a/tools/unicodetools/com/ibm/text/utility/FileLineIterator.java
+++ b/tools/unicodetools/com/ibm/text/utility/FileLineIterator.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/FileLineIterator.java,v $
-* $Date: 2002/10/01 01:12:10 $
-* $Revision: 1.1 $
+* $Date: 2003/02/25 23:38:22 $
+* $Revision: 1.2 $
 *
 *******************************************************************************
 */
@ -43,18 +43,18 @@ public class FileLineIterator {
    public int counter = 0;
    
    private BufferedReader br = null;
-    private boolean isUTF8 = false;
+    private Utility.Encoding encoding = Utility.UTF8;
    
    /**
     * Open the file for reading. If useGenDir is set, use the normal generation directory
     */
-    public void open(String filename, boolean isUTF8) throws IOException {
+    public void open(String filename, Utility.Encoding encoding) throws IOException {
        if (showFilename) {
            Utility.fixDot();
            System.out.println("Reading File: " + new File(filename).getCanonicalPath());
        }
-        br = Utility.openReadFile(filename, isUTF8);
-        this.isUTF8 = isUTF8;
+        br = Utility.openReadFile(filename, encoding);
+        // Remembered so the line reader can later decide whether to drop a leading BOM.
+        this.encoding = encoding;
    }
    
    /**
@ -68,7 +68,7 @@ public class FileLineIterator {
            if (cleanedLine == null) return null;
            
            // drop BOM
-            if (isUTF8 && counter == 0 && cleanedLine.length() > 0 && cleanedLine.charAt(0) == 0xFEFF) {
+            if (encoding == Utility.UTF8 && counter == 0 && cleanedLine.length() > 0 && cleanedLine.charAt(0) == 0xFEFF) {
                cleanedLine = cleanedLine.substring(1);
            }
            
--- a/tools/unicodetools/com/ibm/text/utility/Utility.java
+++ b/tools/unicodetools/com/ibm/text/utility/Utility.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
-* $Date: 2002/10/05 01:28:56 $
-* $Revision: 1.26 $
+* $Date: 2003/02/25 23:38:22 $
+* $Revision: 1.27 $
 *
 *******************************************************************************
 */
@ -144,7 +144,10 @@ public final class Utility implements UCD_Types {    // COMMON UTILITIES
        boolean haveFirstCased = true;
        for (int i = 0; i < source.length(); ++i) {
            char c = source.charAt(i);
-            if (c == ' ' || c == '-') c = '_';
+            if (c == ' ' || c == '-' || c == '_') {
+                c = '_';
+                haveFirstCased = true;
+            }
            int cat = Character.getType(c);
            if (lastCat == Character.LOWERCASE_LETTER && cat == Character.UPPERCASE_LETTER) {
                result.append('_');
@ -616,6 +619,7 @@ public final class Utility implements UCD_Types {    // COMMON UTILITIES

    private static final String[] searchPath = {
        "EXTRAS",
+        "4.0.0",
        "3.2.0",
        "3.1.1",
        "3.1.0",
@ -654,8 +658,13 @@ public final class Utility implements UCD_Types {    // COMMON UTILITIES
        UTF8_UNIX = Encoding.add("UTF8_UNIX"),
        UTF8_WINDOWS = Encoding.add("UTF8_WINDOWS"),
        
-        UTF8 = Encoding.add("UTF8"), // for read-only
-        LATIN1 = Encoding.add("LATIN1"), // for read-only
+        //UTF8 = Encoding.add("UTF8"), // for read-only
+        //LATIN1 = Encoding.add("LATIN1"), // for read-only
+        
+        // read-only (platform doesn't matter, since it is only line-end)
+        
+        UTF8 = UTF8_WINDOWS,
+        LATIN1 = LATIN1_WINDOWS,
        
        FIRST = LATIN1_UNIX;
        
@ -700,6 +709,24 @@ public final class Utility implements UCD_Types {    // COMMON UTILITIES
        public boolean filter(Object current); // true is keep
    }
    
+    /**
+     * Prints a map whose values are collections. Entries are written in the
+     * map's iteration order as: key, itemSeparator, then the value collection
+     * printed via print(pw, value, subseparator). Consecutive entries are
+     * separated by mainSeparator.
+     *
+     * @param pw            destination writer
+     * @param c             map from key to Collection; every value must be a Collection
+     * @param mainSeparator text written between entries
+     * @param itemSeparator text written between a key and its collection
+     * @param subseparator  separator passed on for the collection's elements
+     * @throws ClassCastException if a value is not a Collection
+     */
+    public static void printMapOfCollection(PrintWriter pw, Map c, String mainSeparator, String itemSeparator, String subseparator) {
+        boolean first = true;
+        // Iterate entries directly: one traversal, no per-key lookup.
+        for (Iterator it = c.entrySet().iterator(); it.hasNext();) {
+            Map.Entry entry = (Map.Entry) it.next();
+            // Cast before any output so a bad value fails before text is written.
+            Collection value = (Collection) entry.getValue();
+            if (first) {
+                first = false;
+            } else {
+                pw.print(mainSeparator);
+            }
+            pw.print(entry.getKey());
+            pw.print(itemSeparator);
+            print(pw, value, subseparator);
+        }
+    }
+    
    public static void print(PrintWriter pw, Collection c, String separator, Breaker b) {
        Iterator it = c.iterator();
        boolean first = true;
@ -745,7 +772,12 @@ public final class Utility implements UCD_Types {    // COMMON UTILITIES
    
    public static BufferedReader openReadFile(String filename, Encoding encoding) throws FileNotFoundException, UnsupportedEncodingException {
        FileInputStream fis = new FileInputStream(filename);
-        InputStreamReader isr = (encoding == UTF8_UNIX || encoding == UTF8_WINDOWS) ? new InputStreamReader(fis, "UTF8") : new InputStreamReader(fis);
+        // Only the two UTF8 encodings get an explicit charset; every other
+        // Encoding falls through to the platform default charset.
+        InputStreamReader isr;
+        if (encoding == UTF8_UNIX || encoding == UTF8_WINDOWS) {
+            isr = new InputStreamReader(fis, "UTF8");
+        } else {
+            isr = new InputStreamReader(fis);
+        }
        BufferedReader br = new BufferedReader(isr, 32*1024);
        return br;
    }
@ -817,10 +849,10 @@ public final class Utility implements UCD_Types {    // COMMON UTILITIES
        }
    }
    
-    public static void renameIdentical(String file1, String file2) throws IOException {
+    public static boolean renameIdentical(String file1, String file2, String batFile) throws IOException {
        if (file1 == null) {
            System.out.println("Null file");
-            return;
+            return false;
        }
        
        boolean identical = false;
@ -845,25 +877,34 @@ public final class Utility implements UCD_Types {    // COMMON UTILITIES
            br2.close();
        }
        if (identical) {
-            File foo = new File(file2);
-            File newName = new File(foo.getParent(), "UNCHANGED-" + foo.getName());
-            if (newName.exists()) {
-                for (int i = 1; newName.exists(); ++i) {
-                    newName = new File(foo.getParent(), "UNCHANGED" + i + "-" + foo.getName());
-                }
-            }
-            System.out.println("IDENTICAL TO PREVIOUS, RENAMING : " + foo);
-            System.out.println("TO : " + newName);
-            boolean renameResult = foo.renameTo(newName);
-            if (!renameResult) System.out.println("Couldn't rename!");
+            renameIdentical(file2);
+            if (batFile != null) renameIdentical(batFile);
+            return true;
        } else {
+            if (line1 == null) line1 = "<end of file>";
+            if (line2 == null) line2 = "<end of file>";
            System.out.println("Found difference in : " + file1 + ", " + file2);
            int diff = compare(line1, line2);
            System.out.println(" Line1: '" + line1.substring(0,diff) + "', '" + line1.substring(diff));
            System.out.println(" Line2: '" + line2.substring(0,diff) + "', '" + line2.substring(diff));
+            return false;
        }
    }
    
+    /**
+     * Renames file2, within its own directory, to "UNCHANGED-" + its name.
+     * If that name is taken, tries "UNCHANGED1-", "UNCHANGED2-", ... until an
+     * unused name is found. Progress and a failed rename are reported on
+     * System.out; no exception is thrown for a failed rename.
+     */
+    static void renameIdentical(String file2) {
+        File original = new File(file2);
+        String directory = original.getParent();
+        String baseName = original.getName();
+
+        File target = new File(directory, "UNCHANGED-" + baseName);
+        int suffix = 1;
+        while (target.exists()) {
+            target = new File(directory, "UNCHANGED" + suffix + "-" + baseName);
+            ++suffix;
+        }
+
+        System.out.println("IDENTICAL TO PREVIOUS, RENAMING : " + original);
+        System.out.println("TO : " + target);
+        if (!original.renameTo(target)) {
+            System.out.println("Couldn't rename!");
+        }
+    }
+    
    static String getLineWithoutFluff(BufferedReader br1, boolean first) throws IOException {
        while (true) {
            String line1 = br1.readLine();