shifted the implicit range.

X-SVN-Rev: 8767
2002-06-02 05:07:08 +00:00 · 2002-06-02 05:07:08 +00:00 · 012100e9dd
commit 012100e9dd
parent 0078f2f53e
5 changed files with 526 additions and 216 deletions
--- a/tools/unicodetools/com/ibm/text/UCA/Main.java
+++ b/tools/unicodetools/com/ibm/text/UCA/Main.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $ 
-* $Date: 2002/05/31 01:41:03 $ 
-* $Revision: 1.3 $
+* $Date: 2002/06/02 05:07:08 $ 
+* $Revision: 1.4 $
 *
 *******************************************************************************
 */
@ -18,7 +18,8 @@ import com.ibm.text.utility.*;

 public class Main {
 	static final String UCDVersion = "";
-	static final String[] ICU_FILES = {"FractionalUCA", "writeconformance", "writeconformanceshifted", "WriteRules"};
+	static final String[] ICU_FILES = {"FractionalUCA", "writeconformance", "writeconformanceshifted", 
+		"WriteRules", "WriteRulesWithNames", "WriteRulesXML"};
 	
 	public static void main(String args[]) throws Exception {
 		
--- a/tools/unicodetools/com/ibm/text/UCA/UCA.java
+++ b/tools/unicodetools/com/ibm/text/UCA/UCA.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $ 
-* $Date: 2002/05/31 01:41:03 $ 
-* $Revision: 1.11 $
+* $Date: 2002/06/02 05:07:08 $ 
+* $Revision: 1.12 $
 *
 *******************************************************************************
 */
@ -639,9 +639,72 @@ final public class UCA implements Comparator {
    }
    
    
-    static boolean isImplicitCE(int ce) {
-    	int primary = getPrimary(ce);
-    	return primary >= UNSUPPORTED_BASE && primary <= UNSUPPORTED_TOP;
+    static boolean isImplicitLeadCE(int ce) {
+    	return isImplicitLeadPrimary(getPrimary(ce));
+    }
+    
+    static boolean isImplicitLeadPrimary(int primary) {
+    	return primary >= UNSUPPORTED_BASE && primary < UNSUPPORTED_LIMIT;
+    }
+    
+            
+
+/*
+The formula from the UCA:
+
+BASE:
+
+FB40 CJK Ideograph 
+FB80 CJK Ideograph Extension A/B 
+FBC0 Any other code point 
+
+AAAA = BASE + (CP >> 15);
+BBBB = (CP & 0x7FFF) | 0x8000;The mapping given to CP is then given by:
+
+CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
+*/		
+    
+    /**
+     * Returns implicit value as pair, first part in high word; second part in low word
+     * So to get first part use (x >>> 16) -- remember the >>>!
+     * and to get the second part use (x & 0xFFFF)
+     */
+    
+    static void CodepointToImplicit(int cp, int[] output) {
+		int base = UNSUPPORTED_OTHER_BASE;
+        if (isCJK(cp)) base = UNSUPPORTED_CJK_BASE;
+        else if (isCJK_AB(cp)) base = UNSUPPORTED_CJK_AB_BASE;
+        output[0] = base + (cp >>> 15);
+        output[1] = (cp & 0x7FFF) | 0x8000;
+    }
+    
+    /**
+     * Takes implicit value as pair, first part in high word; second part in low word
+     * So to get first part use (x >>> 16) -- remember the >>>!
+     * and to get the second part use (x & 0xFFFF)
+     */
+    
+    static int ImplicitToCodePoint(int leadImplicit, int trailImplicit) {
+    	// could probably optimize all this, but it is not worth it.
+    	if (leadImplicit < UNSUPPORTED_BASE || leadImplicit >= UNSUPPORTED_LIMIT) {
+    		throw new IllegalArgumentException("Lead implicit out of bounds: " + Utility.hex(leadImplicit));
+    	}
+    	if ((trailImplicit & 0x8000) == 0) {
+    		throw new IllegalArgumentException("Trail implicit out of bounds: " + Utility.hex(trailImplicit));
+    	}
+    	int base;
+    	if (leadImplicit >= UNSUPPORTED_OTHER_BASE) base = UNSUPPORTED_OTHER_BASE;
+    	else if (leadImplicit >= UNSUPPORTED_CJK_AB_BASE) base = UNSUPPORTED_CJK_AB_BASE;
+    	else base = UNSUPPORTED_CJK_BASE;
+    	
+    	int result = ((leadImplicit - base) << 15) | (trailImplicit & 0x7FFF);
+    	
+    	if (result > 0x10FFFF) {
+    		throw new IllegalArgumentException("Resulting character out of  bounds: "
+    			+ Utility.hex(leadImplicit) + ", " + Utility.hex(trailImplicit) 
+    			+ " => " + result);
+    	}
+    	return result;
    }
    
    /**
@ -784,10 +847,17 @@ final public class UCA implements Comparator {
 // Collation Element Memory Data Table Formats
 // =============================================================

+    /**
+     * Used to composed Hangul and Han characters
+     */
+     
+    static final int NEUTRAL_SECONDARY = 0x20;
+    static final int NEUTRAL_TERTIARY = 0x02;
+    
    /**
     * Temporary buffer used in getSortKey for the decomposed string
     */
-    StringBuffer decompositionBuffer = new StringBuffer();
+    private StringBuffer decompositionBuffer = new StringBuffer();
    
    /**
     * The collation element data is stored a couple of different structures.
@ -798,20 +868,14 @@ final public class UCA implements Comparator {
     * table of simple collation elements, indexed by char.<br>
     * Exceptional cases: expanding, contracting, unsupported are handled as described below.
     */
-    int[] collationElements = new int[65536];
+    private int[] collationElements = new int[65536];
    
    /**
     * A special bit combination in a CE is used to reserve exception cases. This has the effect
     * of removing a small number of the primary key values out of the 65536 possible.
     */
-    static final int EXCEPTION_CE_MASK = 0xF8000000;
+    private static final int EXCEPTION_CE_MASK = 0xF8000000;
    
-    /**
-     * Used to composed Hangul and Han characters
-     */
-     
-    static final int NEUTRAL_SECONDARY = 0x20;
-    static final int NEUTRAL_TERTIARY = 0x02;
       
    /**
     * Any unsupported characters (those not in the UCA data tables) 
@ -820,14 +884,14 @@ final public class UCA implements Comparator {
     * There are at least 34 values, so that we can use a range for surrogates
     * However, we do add to the first weight if we have surrogate pairs!
     */
-    public static final int UNSUPPORTED_CJK_BASE = 0xFF40;
-    public static final int UNSUPPORTED_CJK_AB_BASE = 0xFF80;
-    public static final int UNSUPPORTED_OTHER_BASE = 0xFFC0;
+    private static final int UNSUPPORTED_CJK_BASE = 0xFF40;
+    private static final int UNSUPPORTED_CJK_AB_BASE = 0xFF80;
+    private static final int UNSUPPORTED_OTHER_BASE = 0xFFC0;
    
-    public static final int UNSUPPORTED_BASE = UNSUPPORTED_CJK_BASE;
-    public static final int UNSUPPORTED_TOP = UNSUPPORTED_OTHER_BASE + 0x40;
+    private static final int UNSUPPORTED_BASE = UNSUPPORTED_CJK_BASE;
+    private static final int UNSUPPORTED_LIMIT = UNSUPPORTED_OTHER_BASE + 0x40;
    
-    static final int UNSUPPORTED = makeKey(UNSUPPORTED_BASE, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
+    private static final int UNSUPPORTED = makeKey(UNSUPPORTED_BASE, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
    
    // was 0xFFC20101;
    
@ -838,7 +902,7 @@ final public class UCA implements Comparator {
     * to be looked up (with following characters) in the contractingTable.<br>
     * This isn't a MASK since there is exactly one value.
     */
-    static final int CONTRACTING = 0xFA310000;
+    private static final int CONTRACTING = 0xFA310000;

    /**
     * Expanding characters are marked with a exception bit combination
@ -846,7 +910,7 @@ final public class UCA implements Comparator {
     * This means that they map to more than one CE, which is looked up in
     * the expansionTable by index. See EXCEPTION_INDEX_MASK
     */
-    static final int EXPANDING_MASK = 0xFA300000; // marks expanding range start
+    private static final int EXPANDING_MASK = 0xFA300000; // marks expanding range start
    
    /**
     * This mask is used to get the index from an EXPANDING exception.
@ -860,12 +924,12 @@ final public class UCA implements Comparator {
     * as the table is built from the UCA data, they are narrowed in.
     * The first three values are used in building; the last two in testing.
    */
-    int variableLow = '\uFFFF';
-    int nonVariableLow = '\uFFFF'; // HACK '\u089A';
-    int variableHigh = '\u0000';
+    private int variableLow = '\uFFFF';
+    private int nonVariableLow = '\uFFFF'; // HACK '\u089A';
+    private int variableHigh = '\u0000';
    
-    int variableLowCE;  // used for testing against
-    int variableHighCE; // used for testing against
+    private int variableLowCE;  // used for testing against
+    private int variableHighCE; // used for testing against
    
    /**
     * Although a single character can expand into multiple CEs, we don't want to burden
@ -875,19 +939,19 @@ final public class UCA implements Comparator {
     * will be used for the expansion. The implementation is as a stack; this just makes it
     * easy to generate.
     */
-    IntStack expandingTable = new IntStack(3600); // initial number is from compKeys
+    private IntStack expandingTable = new IntStack(3600); // initial number is from compKeys
        
    /**
     * For now, this is just a simple mapping of strings to collation elements.
     * The implementation depends on the contracting characters being "completed",
     * so that it can be efficiently determined when to stop looking.
     */
-    Hashtable contractingTable = new Hashtable();
+    private Hashtable contractingTable = new Hashtable();
    
    /**
     *  Special char value that means failed or terminated
     */
-    static final char NOT_A_CHAR = '\uFFFF';
+    private static final char NOT_A_CHAR = '\uFFFF';
    
    /**
     * Marks whether we are using the full data set, or an abbreviated version for
@ -1006,37 +1070,19 @@ final public class UCA implements Comparator {
                index++; // skip next char
                bigChar = 0x10000 + ((ch - 0xD800) << 10) + (ch2 - 0xDC00); // extract value
            }
-            

-/*
-The formula from the UCA:
-
-BASE:
-
-FB40 CJK Ideograph 
-FB80 CJK Ideograph Extension A/B 
-FBC0 Any other code point 
-
-AAAA = BASE + (CP >> 15);
-BBBB = (CP & 0x7FFF) | 0x8000;The mapping given to CP is then given by:
-
-CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
-*/		
-			// divide the three cases
+			// find the implicit values; returned in 0 and 1
+			int[] implicit = new int[2];
+			CodepointToImplicit(bigChar, implicit);
 			
-			int base = UNSUPPORTED_OTHER_BASE;
-            if (isCJK(bigChar)) base = UNSUPPORTED_CJK_BASE;
-            else if (isCJK_AB(bigChar)) base = UNSUPPORTED_CJK_AB_BASE;
-            
            // Now compose the two keys
-            // first push BBBB
+            // first push BBBB, which is #1
                        
-            // HACK: expandingStack.push(makeKey((bigChar & 0x7FFF) | 0x8000, 0, 0));
-            expandingStack.push(makeKey((bigChar & 0x7FFF) | 0x8000, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY));
+            expandingStack.push(makeKey(implicit[1], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY));
            
-            // now return AAAA
+            // now return AAAA, which is #0
            
-            return makeKey(base + (bigChar >>> 15), NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
+            return makeKey(implicit[0], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);

        }
        if (ce == CONTRACTING) {
@ -1113,17 +1159,64 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
        return expandingStack.pop(); // pop last (guaranteed to exist!)
    }
    
-    public final boolean isCJK(int bigChar) {
-        return (0x4E00 <= bigChar && bigChar <= 0x9FFF);
+    // Neither Mapped nor Composite CJK: [\u3400-\u4DB5\u4E00-\u9FA5\U00020000-\U0002A6D6]
+    
+    public static boolean isCJK(int cp) {
+        return (CJK_BASE <= cp && cp < CJK_LIMIT 
+        || cp == 0xFA0E	// compat characters that don't decompose.
+        || cp == 0xFA0F
+        || cp == 0xFA11
+        || cp == 0xFA13
+        || cp == 0xFA14
+        || cp == 0xFA1F
+        || cp == 0xFA21
+        || cp == 0xFA23
+        || cp == 0xFA24
+        || cp == 0xFA27
+        || cp == 0xFA28
+        || cp == 0xFA29
+        || cp == 0xFA2E
+        || cp == 0xFA2F
+        );
    }
-    public final boolean isCJK_AB(int bigChar) {
-        return (0x3400 <= bigChar && bigChar <= 0x4DBF
-             || 0x20000 <= bigChar && bigChar <= 0x2A6DF);
+    
+    public static final int 
+    	CJK_BASE = 0x4E00,
+    	CJK_LIMIT = 0x9FFF+1,
+    	CJK_BASE_COMPAT_USED = 0xFA0E,
+    	CJK_LIMIT_COMPAT_USED = 0xFA2F+1,
+    	CJK_A_BASE = 0x3400,
+    	CJK_A_LIMIT = 0x4DBF+1,
+    	CJK_B_BASE = 0x20000,
+    	CJK_B_LIMIT = 0x2A6DF+1;
+    
+    public static final boolean isCJK_AB(int bigChar) {
+        return (CJK_A_BASE <= bigChar && bigChar < CJK_A_LIMIT
+             || CJK_B_BASE <= bigChar && bigChar < CJK_B_LIMIT);
    }
 /*
+2E80..2EFF; CJK Radicals Supplement
+2F00..2FDF; Kangxi Radicals
+
 3400..4DBF; CJK Unified Ideographs Extension A
 4E00..9FFF; CJK Unified Ideographs
+F900..FAFF; CJK Compatibility Ideographs
+
 20000..2A6DF; CJK Unified Ideographs Extension B
+2F800..2FA1F; CJK Compatibility Ideographs Supplement
+
+Compat:
+# F900..FA0D     [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D
+# FA10                 CJK COMPATIBILITY IDEOGRAPH-FA10
+# FA12                 CJK COMPATIBILITY IDEOGRAPH-FA12
+# FA15..FA1E      [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E
+# FA20                 CJK COMPATIBILITY IDEOGRAPH-FA20
+# FA22                 CJK COMPATIBILITY IDEOGRAPH-FA22
+# FA25..FA26       [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26
+# FA2A..FA2D       [4] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPATIBILITY IDEOGRAPH-FA2D
+# FA30..FA6A      [59] CJK COMPATIBILITY IDEOGRAPH-FA30..CJK COMPATIBILITY IDEOGRAPH-FA6A
+# 2F800..2FA1D   [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
+
 */
    
    private final boolean isHangul(int bigChar) {
@ -1348,6 +1441,8 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
        IntStack tempStack = new IntStack(100); // used for reversal
        StringBuffer multiChars = new StringBuffer(); // used for contracting chars
        String inputLine = "";
+        boolean[] wasImplicitLeadPrimary = new boolean[1];
+            
        while (true) try {
            inputLine = in.readLine();
            if (inputLine == null) break;       // means file is done
@ -1413,8 +1508,10 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
            	System.out.println("debug");
            }
            
-            int ce = getCEFromLine(value, line, position, record);
-            int ce2 = getCEFromLine(value, line, position, record);
+            wasImplicitLeadPrimary[0] = false;
+            
+            int ce = getCEFromLine(value, line, position, record, wasImplicitLeadPrimary);
+            int ce2 = getCEFromLine(value, line, position, record, wasImplicitLeadPrimary);
            if (CHECK_UNIQUE && (ce2 == TERMINATOR || CHECK_UNIQUE_EXPANSIONS)) {
                if (!CHECK_UNIQUE_VARIABLES) {
                    checkUnique(value, ce, 0, inputLine); // only need to check first value
@ -1433,7 +1530,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
                // set collationElement to exception value, plus index
                ce = EXPANDING_MASK | expandingTable.getTop();
                while (true) {
-                    ce2 = getCEFromLine(value, line, position, record);
+                    ce2 = getCEFromLine(value, line, position, record, wasImplicitLeadPrimary);
                    if (ce2 == TERMINATOR) break;
                    tempStack.push(ce2);
                } 
@ -1700,7 +1797,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
    boolean haveVariableWarning = false;
    boolean haveZeroVariableWarning = false;
    
-    private int getCEFromLine(char value, String line, int[] position, boolean record) {
+    private int getCEFromLine(char value, String line, int[] position, boolean record, boolean[] lastWasImplicitLead) {
        int start = line.indexOf('[', position[0]);
        if (start == -1) return TERMINATOR;
        boolean variable = line.charAt(start+1) == '*';
@ -1711,7 +1808,13 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
        int key2 = Integer.parseInt(line.substring(start+7,start+11),16);
        int key3 = Integer.parseInt(line.substring(start+12,start+16),16);
        if (record) {
-            primarySet.set(key1);
+        	if (lastWasImplicitLead[0]) {
+        		lastWasImplicitLead[0] = false;
+            } else if (isImplicitLeadPrimary(key1)) {
+            	lastWasImplicitLead[0] = true;
+            } else {
+            	primarySet.set(key1);
+            }
            secondarySet.set(key2);
            secondaryCount[key2]++;
            tertiarySet.set(key3);
--- a/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java
+++ b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $ 
-* $Date: 2002/05/31 01:41:03 $ 
-* $Revision: 1.9 $
+* $Date: 2002/06/02 05:07:08 $ 
+* $Revision: 1.10 $
 *
 *******************************************************************************
 */
@ -113,7 +113,7 @@ public class WriteCharts implements UCD_Types {
            else if (primary == 0) script = IGNORABLE_ORDER;
            else if (primary < variable) script = VARIABLE_ORDER;
            else if (primary < high) script = COMMON_SCRIPT;
-            else if (primary >= UCA.UNSUPPORTED_BASE && primary <= UCA.UNSUPPORTED_TOP) script = UNSUPPORTED;
+            else if (UCA.isImplicitLeadPrimary(primary)) script = UNSUPPORTED;
            
            if (script == KATAKANA_SCRIPT) script = HIRAGANA_SCRIPT;
            else if ((script == INHERITED_SCRIPT || script == COMMON_SCRIPT) && oldScript >= 0) script = oldScript;
@ -147,7 +147,7 @@ public class WriteCharts implements UCD_Types {
            for (int i = 0; i < sortKey.length(); ++i) {
                char w = sortKey.charAt(i);
                if (w == 0) break;
-				if (w >= UCA.UNSUPPORTED_BASE && w <= UCA.UNSUPPORTED_TOP) {
+				if (UCA.isImplicitLeadPrimary(w)) {
 					++i; // skip next
 				}
                ++ primaryCount;
@ -571,7 +571,7 @@ public class WriteCharts implements UCD_Types {

    static int getFirstPrimary(String sortKey) {
        int result = sortKey.charAt(0);
-		if (result >= UCA.UNSUPPORTED_BASE && result <= UCA.UNSUPPORTED_TOP) {
+		if (UCA.isImplicitLeadPrimary(result)) {
 			return (result << 16) | sortKey.charAt(1);
 		}
 		return (result << 16);
--- a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
+++ b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $ 
-* $Date: 2002/05/31 01:41:03 $ 
-* $Revision: 1.13 $
+* $Date: 2002/06/02 05:07:08 $ 
+* $Revision: 1.14 $
 *
 *******************************************************************************
 */
@ -15,6 +15,7 @@ package com.ibm.text.UCA;

 import java.util.*;
 import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;

 import java.io.*;
 //import java.text.*;
@ -30,6 +31,9 @@ import com.ibm.text.utility.*;
 import com.ibm.text.UCD.Normalizer;

 public class WriteCollationData implements UCD_Types {
+	
+	static final boolean DEBUG = false;
+	
    public static final String copyright = 
      "Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
      
@ -283,7 +287,7 @@ public class WriteCollationData implements UCD_Types {
    static void writeConformance(String filename, byte option, boolean shortPrint)  throws IOException {
        UCD ucd30 = UCD.make("3.0.0");
        
-        PrintWriter log = Utility.openPrintWriter(filename + (shortPrint ? "_SHORT" : "") + ".txt");
+        PrintWriter log = Utility.openPrintWriter(filename + (shortPrint ? "_SHORT" : "") + ".txt", true, false);
        if (!shortPrint) log.write('\uFEFF');
        
        System.out.println("Sorting");
@ -1149,6 +1153,7 @@ public class WriteCollationData implements UCD_Types {
    }
    
    static Normalizer nfdNew = new Normalizer(Normalizer.NFD, "");
+    static Normalizer NFC = new Normalizer(Normalizer.NFC, "");
    static Normalizer nfkdNew = new Normalizer(Normalizer.NFKD, "");
    
    static void writeRules (byte option) throws IOException {
@ -1207,6 +1212,56 @@ public class WriteCollationData implements UCD_Types {
            }
        }
        
+        System.out.println("Checking CJK");
+        
+        // Check for characters that are ARE explicitly mapped in the CJK ranges
+        UnicodeSet CJK = new UnicodeSet(0x2E80, 0x2EFF);
+        CJK.add(0x2F00, 0x2EFF);
+        CJK.add(0x2F00, 0x2FDF);
+        CJK.add(0x3400, 0x9FFF);
+        CJK.add(0xF900, 0xFAFF);
+        CJK.add(0x20000, 0x2A6DF);
+        CJK.add(0x2F800, 0x2FA1F);
+        CJK.removeAll(new UnicodeSet("[:Cn:]")); // remove unassigned
+        
+        // make set with canonical decomposibles
+        UnicodeSet composites = new UnicodeSet();
+        for (int i = 0; i < 0x10FFFF; ++i) {
+        	if (!ucd.isAllocated(i)) continue;
+        	if (nfd.isNormalized(i)) continue;
+        	composites.add(i);
+        }
+        UnicodeSet CJKcomposites = new UnicodeSet(CJK).retainAll(composites);
+        System.out.println("CJK composites " + CJKcomposites.toPattern(true));
+        System.out.println("CJK NONcomposites " + new UnicodeSet(CJK).removeAll(composites).toPattern(true));
+        
+        UnicodeSet mapped = new UnicodeSet();
+        Iterator it = alreadyDone.iterator();
+        while (it.hasNext()) {
+        	String member = (String) it.next();
+        	mapped.add(member);
+        }
+        UnicodeSet CJKmapped = new UnicodeSet(CJK).retainAll(mapped);
+        System.out.println("Mapped CJK: " + CJKmapped.toPattern(true));
+        System.out.println("UNMapped CJK: " + new UnicodeSet(CJK).removeAll(mapped).toPattern(true));
+        System.out.println("Neither Mapped nor Composite CJK: "
+        	+ new UnicodeSet(CJK).removeAll(CJKcomposites).removeAll(CJKmapped).toPattern(true));
+        
+        
+        
+/*
+2E80..2EFF; CJK Radicals Supplement
+2F00..2FDF; Kangxi Radicals
+
+3400..4DBF; CJK Unified Ideographs Extension A
+4E00..9FFF; CJK Unified Ideographs
+F900..FAFF; CJK Compatibility Ideographs
+
+20000..2A6DF; CJK Unified Ideographs Extension B
+2F800..2FA1F; CJK Compatibility Ideographs Supplement
+*/
+        
+        
        System.out.println("Adding Kanji");
        for (int i = 0; i < 0x10FFFF; ++i) {
        	if (!ucd.isAllocated(i)) continue;
@ -1236,10 +1291,29 @@ public class WriteCollationData implements UCD_Types {
        else if (option == IN_XML) filename = "UCA_Rules.xml";
        log = Utility.openPrintWriter(filename, false, false);
        
-        if (option == IN_XML) log.println("<uca>");
-        else log.write('\uFEFF'); // BOM
+        String[] commentText = {
+        	"NOTE: Since UCA handles canonical equivalents, no composites are necessary",
+        	"(except in extensions).",
+        	"For syntax description, see: http://oss.software.ibm.com/icu/userguide/Collate_Intro.html"
+        };
        
-        Iterator it = ordered.keySet().iterator();
+        if (option == IN_XML) {
+        	log.println("<uca>");
+        	log.println("<!--");
+        	for (int i = 0; i < commentText.length; ++i) {
+        		log.println(commentText[i]);
+        	}
+        	log.println("-->");
+        	log.println("<version UCA='" + collator.getDataVersion() + "' UCD='" + collator.getUCDVersion() + "'/>");
+        } else {
+        	log.write('\uFEFF'); // BOM
+        	for (int i = 0; i < commentText.length; ++i) {
+        		log.println("#\t" + commentText[i]);
+        	}
+        	log.println("# VERSION: UCA=" + collator.getDataVersion() + ", UCD=" + collator.getUCDVersion());
+        }
+        
+        it = ordered.keySet().iterator();
        int oldFirstPrimary = UCA.getPrimary(UCA.TERMINATOR);
        boolean wasVariable = false;
        
@ -1347,46 +1421,6 @@ public class WriteCollationData implements UCD_Types {
            
            if (len == -1) continue;
            
-            // RESETs: do special case for relations to fixed items
-            
-            String reset = "";
-            int xmlReset = 0;
-
-            if (firstTime
-              || collator.getPrimary(lastCE) == 0 && collator.getPrimary(ce) != 0
-              || collator.getSecondary(lastCE) == 0 && collator.getSecondary(ce) != 0
-              || collator.getTertiary(lastCE) == 0 && collator.getTertiary(ce) != 0) {
-                firstTime = false;
-                if (collator.getPrimary(ce) != 0) {
-                    reset = "& [top]";
-                } else {
-                    reset = "& " + quoteOperand(chr);
-                }
-            } else if (variableTop != 0 && (ce & 0xFFFF0000L) > variableTop) {
-                reset = "= [variable\\u0020top]";
-                xmlReset = 1;
-                variableTop = 0;
-            } else {
-                char primary = collator.getPrimary(ce);
-                if (isFixedIdeograph(remapUCA_CompatibilityIdeographToCp(primary))) {
-                    if (primary != lastCJKPrimary) {
-                        reset = "& " + quoteOperand(String.valueOf(primary));
-                        lastCE = UCA.makeKey(primary, UCA.NEUTRAL_SECONDARY, UCA.NEUTRAL_TERTIARY);
-                        xmlReset = 2;
-                    }
-                }
-                lastCJKPrimary = primary;
-            }
-            
-            /*
-            if (primary >= 0x3400) {
-                if (primary == 0x9FA6) {
-                    primary = '\u9FA5';
-                }
-                if (primary < 0x9FA6) {
-                }
-            }
-            */
            
            // get relation
            
@ -1398,6 +1432,50 @@ public class WriteCollationData implements UCD_Types {
            
            int relation = getStrengthDifference(ces, len, lastCes, lastLen);

+            // RESETs: do special case for relations to fixed items
+            
+            String reset = "";
+            String resetComment = "";
+            int xmlReset = 0;
+
+            if (firstTime
+              || collator.getPrimary(lastCE) == 0 && collator.getPrimary(ce) != 0
+              || collator.getSecondary(lastCE) == 0 && collator.getSecondary(ce) != 0
+              || collator.getTertiary(lastCE) == 0 && collator.getTertiary(ce) != 0) {
+                firstTime = false;
+                if (collator.getPrimary(ce) != 0) {
+                    reset = "[top]";
+                } else {
+                    reset = quoteOperand(chr);
+                }
+            } else if (variableTop != 0 && (ce & 0xFFFF0000L) > variableTop) {
+                reset = "[variable\\u0020top]";
+                xmlReset = 1;
+                variableTop = 0;
+            } else {
+                int primary = collator.getPrimary(ce);
+                if (UCA.isImplicitLeadPrimary(primary)) {
+                    if (relation == PRIMARY_DIFF) {
+                    	int resetCp = UCA.ImplicitToCodePoint(primary, UCA.getPrimary(ces[1]));
+                        reset = quoteOperand(UTF16.valueOf(resetCp));
+                        resetComment = ucd.getCodeAndName(resetCp);
+                        // lastCE = UCA.makeKey(primary, UCA.NEUTRAL_SECONDARY, UCA.NEUTRAL_TERTIARY);
+                        xmlReset = 2;
+                    }
+                	// lastCJKPrimary = primary;
+                }
+            }
+            
+            /*
+            if (primary >= 0x3400) {
+                if (primary == 0x9FA6) {
+                    primary = '\u9FA5';
+                }
+                if (primary < 0x9FA6) {
+                }
+            }
+            */
+
            if (chr.equals("\u2F00")) {
                System.out.println(UCA.ceToString(ces, len));
            }
@ -1405,7 +1483,7 @@ public class WriteCollationData implements UCD_Types {
            // There are double-CEs, so we have to know what the length of the first bit is.
            
    		int expansionStart = 1;
-    		if (UCA.isImplicitCE(ces[0])) {
+    		if (UCA.isImplicitLeadCE(ces[0])) {
    			expansionStart = 2; // move up if first is double-ce
    		}
            
@ -1432,16 +1510,17 @@ public class WriteCollationData implements UCD_Types {
                */
                
                if (xmlReset == 2) {
-                    log.print("<reset anchor=\"" + Utility.quoteXML(String.valueOf(collator.getPrimary(ce))) + "\"/>");
+                    log.print("<reset>" + Utility.quoteXML(reset) + "</reset>");
                }
-                log.print("  <" + XML_RELATION_NAMES[relation]);
-                log.print(" s=\"" + Utility.quoteXML(chr) + "\"");
-                if (len > 1) {
-                    log.print(" expansion=\"" + Utility.quoteXML(expansion) + "\"");
+                log.print("  <" + XML_RELATION_NAMES[relation] + ">");
+                if (expansion.length() > 0) {
+                    log.print("<x>" + Utility.quoteXML(expansion) + "</x>");
                }
-                log.println("/>");
+                log.print(Utility.quoteXML(chr));
+                log.print("</" + XML_RELATION_NAMES[relation] + ">");
            } else {
-                if (reset.length() != 0) log.println(reset);
+                if (reset.length() != 0) log.println("& " + reset 
+                	+ (resetComment.length() != 0 ? "\t\t# " + resetComment : ""));
                log.print(RELATION_NAMES[relation] + " " + quoteOperand(chr));
                if (expansion.length() > 0) log.print(" / " + quoteOperand(expansion));
                if (option == WITH_NAMES) {
@ -1461,7 +1540,7 @@ public class WriteCollationData implements UCD_Types {
    }
    
    static long getPrimary(int[] ces) {
-    	if (UCA.isImplicitCE(ces[0])) {
+    	if (UCA.isImplicitLeadCE(ces[0])) {
    		return (UCA.getPrimary(ces[0]) << 16) + UCA.getPrimary(ces[1]);
    	} else {
    		return UCA.getPrimary(ces[0]);
@ -1469,7 +1548,7 @@ public class WriteCollationData implements UCD_Types {
    }
    
    static long getSecondary(int[] ces) {
-    	if (UCA.isImplicitCE(ces[0])) {
+    	if (UCA.isImplicitLeadCE(ces[0])) {
    		return (UCA.getSecondary(ces[0]) << 16) + UCA.getSecondary(ces[1]);
    	} else {
    		return UCA.getSecondary(ces[0]);
@ -1477,36 +1556,42 @@ public class WriteCollationData implements UCD_Types {
    }
    
    static long getTertiary(int[] ces) {
-    	if (UCA.isImplicitCE(ces[0])) {
+    	if (UCA.isImplicitLeadCE(ces[0])) {
    		return (UCA.getTertiary(ces[0]) << 16) + UCA.getTertiary(ces[1]);
    	} else {
    		return UCA.getTertiary(ces[0]);
    	}
    }
    
+    static final int 
+    	PRIMARY_DIFF = 0,
+    	SECONDARY_DIFF = 1,
+    	TERTIARY_DIFF = 2,
+    	QUARTERNARY_DIFF = 3;
+    	
 	static int getStrengthDifference(int[] ces, int len, int[] lastCes, int lastLen) {
 		
-        int relation = 3;
+        int relation = QUARTERNARY_DIFF;
        if (getPrimary(ces) != getPrimary(lastCes)) {
-            relation = 0;
+            relation = PRIMARY_DIFF;
        } else if (getSecondary(ces) != getSecondary(lastCes)) {
-            relation = 1;
+            relation = SECONDARY_DIFF;
        } else if (getTertiary(ces) != getTertiary(lastCes)) {
-            relation = 2;
+            relation = TERTIARY_DIFF;
        } else if (len > lastLen) {
-            relation = 2; // HACK
+            relation = TERTIARY_DIFF; // HACK
        } else {
            int minLen = len < lastLen ? len : lastLen;
-			int start = UCA.isImplicitCE(ces[0]) ? 2 : 1;
+			int start = UCA.isImplicitLeadCE(ces[0]) ? 2 : 1;
            for (int kk = start; kk < minLen; ++kk) {
                int lc = lastCes[kk];
                int c = ces[kk];
                if (collator.getPrimary(c) != collator.getPrimary(lc)
                    || collator.getSecondary(c) != collator.getSecondary(lc)) {
-                    relation = 3;   // reset relation on FIRST char, since differ anyway
+                    relation = QUARTERNARY_DIFF;   // reset relation on FIRST char, since differ anyway
                    break;
                    } else if (collator.getTertiary(c) > collator.getTertiary(lc)) {
-                    relation = 2;   // reset to tertiary (but later ce's might override!)
+                    relation = TERTIARY_DIFF;   // reset to tertiary (but later ce's might override!)
                }
            }
        }
@ -1760,6 +1845,7 @@ public class WriteCollationData implements UCD_Types {
    static StringBuffer quoteOperandBuffer = new StringBuffer(); // faster
    
    static final String quoteOperand(String s) {
+    	s = NFC.normalize(s);
        quoteOperandBuffer.setLength(0);
        boolean noQuotes = true;
        boolean inQuote = false;
@ -1910,14 +1996,14 @@ public class WriteCollationData implements UCD_Types {
                // special handling for Jamo 3-byte forms
                
                if (isOldJamo(primary)) {
-                    System.out.print("JAMO: " + Utility.hex(lastValue));
+                    if (DEBUG) System.out.print("JAMO: " + Utility.hex(lastValue));
                    if ((lastValue & 0xFF0000) == 0) { // lastValue was 2-byte form
                        subtotal += primaryDelta[primary];  // we convert from relative to absolute
                        lastValue = primaryDelta[primary] = (subtotal << 8) + 0x10; // make 3 byte, leave gap
                    } else { // lastValue was 3-byte form
                        lastValue = primaryDelta[primary] = lastValue + 3;
                    }
-                    System.out.println(" => " + Utility.hex(lastValue));
+                    if (DEBUG) System.out.println(" => " + Utility.hex(lastValue));
                    continue;
                }
                
@ -1945,12 +2031,17 @@ public class WriteCollationData implements UCD_Types {
                lastValue = primaryDelta[primary] = subtotal;
            }
            // fixup for Kanji
+            /*
+            
+            // WE DROP THIS: we are skipping all CJK values above, and will fix them separately
+            
            int fixedCompat = remapUCA_CompatibilityIdeographToCp(primary);
            if (isFixedIdeograph(fixedCompat)) {
                int CE = getImplicitPrimary(fixedCompat);
                
                lastValue = primaryDelta[primary] = CE >>> 8; 
            }
+            */
            //if ((primary & 0xFF) == 0) System.out.println(Utility.hex(primary) + " => " + hexBytes(primaryDelta[primary]));
        }
        
@ -1959,7 +2050,7 @@ public class WriteCollationData implements UCD_Types {
        
        System.out.println("Sorting");
        Map ordered = new TreeMap();
-        UCA.UCAContents ucac = collator.getContents(UCA.FIXED_CE, null);
+        UCA.UCAContents ucac = collator.getContents(UCA.FIXED_CE, NFD);
        int ccounter = 0;
        while (true) {
            Utility.dot(ccounter++);
@ -2043,6 +2134,11 @@ public class WriteCollationData implements UCD_Types {
        log.println("#  - Differs from previous version in that MAX value was introduced at 1F.");
        log.println("#    All tertiary values are shifted down by 1, filling the gap at 7!");
        
+        int firstImplicit = getImplicitPrimary(UCA.CJK_BASE) >>> 24;
+        int lastImplicit = getImplicitPrimary(0x10FFFF) >>> 24;
+        log.println("[FIRST_IMPLICIT= " + Utility.hex(firstImplicit) + "]");
+        log.println("[LAST_IMPLICIT= " + Utility.hex(lastImplicit) + "]");
+        
        String lastChr = "";
        int lastNp = 0;
        boolean doVariable = false;
@ -2091,27 +2187,37 @@ public class WriteCollationData implements UCD_Types {
                
                oldStr.append(UCA.ceToString(ces[q]));// + "," + Integer.toString(ces[q],16);
                
-                // special hack for unsupported!
+                // special treatment for unsupported!
                
-                if (pri >= UCA.UNSUPPORTED_BASE) {
+                if (UCA.isImplicitLeadPrimary(pri)) {
                    ++q;
                    oldStr.append(UCA.ceToString(ces[q]));// + "," + Integer.toString(ces[q],16);
                
                    int pri2 = UCA.getPrimary(ces[q]);
                    // get old code point
-                    // pri = UNSUPPORTED_BASE + (bigChar >>> 15)
-                    // pri2 = (bigChar & 0x7FFF) | 0x8000
-                    pri -= UCA.UNSUPPORTED_BASE;
-                    pri <<= 15;
-                    pri2 &= 0x7FFF;
-                    pri += pri2;
-                    System.out.println("Unsupported: "
-                        + Utility.hex(UCA.getPrimary(ces[q-1]))
-                        + ", " + Utility.hex(UCA.getPrimary(ces[q]))
-                        + ", " + Utility.hex(pri)
-                        + ", " + Utility.hex(fixPrimary(pri) & 0xFFFFFFFFL)
+                    
+                    int cp = UCA.ImplicitToCodePoint(pri, pri2);
+                    
+                    // double check results!
+                    
+                    int[] testImplicit = new int[2];
+                    UCA.CodepointToImplicit(cp, testImplicit);
+                    boolean gotError = pri != testImplicit[0] || pri2 != testImplicit[1];
+                    if (gotError) {
+                    	System.out.println("ERROR");
+                    }
+                    if (DEBUG || gotError) {
+                    	System.out.println("Computing Unsupported CP as: "
+                        	+ Utility.hex(pri)
+                        	+ ", " + Utility.hex(pri2)
+                        	+ " => " + Utility.hex(cp)
+                        	+ " => " + Utility.hex(testImplicit[0])
+                        	+ ", " + Utility.hex(testImplicit[1])
+                        	// + ", " + Utility.hex(fixPrimary(pri) & 0xFFFFFFFFL)
                        );
-                        
+                    }
+                    
+                    pri = cp | MARK_CODE_POINT;
                }
                
                if (sec != 0x20) {
@ -2173,10 +2279,14 @@ public class WriteCollationData implements UCD_Types {
        summary.println();
        summary.println("# First Implicit: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0)));
        summary.println("# Last Implicit: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0x10FFFF)));
+        summary.println("# First CJK: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0x4E00)));
+        summary.println("# Last CJK: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0xFA2F)));
+        summary.println("# First CJK_A: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0x3400)));
+        summary.println("# Last CJK: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0x4DBF)));
        
        boolean lastOne = false;
        for (int i = 0; i < 0x10FFFF; ++i) {
-            boolean thisOne = isFixedIdeograph(i);
+            boolean thisOne = UCA.isCJK(i) || UCA.isCJK_AB(i);
            if (thisOne != lastOne) {
                summary.println("# Implicit Cusp: CJK=" + lastOne + ": " + Utility.hex(i-1) + " => " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(i-1)));
                summary.println("# Implicit Cusp: CJK=" + thisOne + ": " + Utility.hex(i) + " => " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(i)));
@ -2223,21 +2333,7 @@ public class WriteCollationData implements UCD_Types {
        summary.close();
    }
    
-    // CONSTANTS
-    
-    static final int 
-        HAN_START = 0x3400,
-        HAN_LIMIT = 0xA000,
-        SUPPLEMENTARY_COUNT = 0x100000,
-        BYTES_TO_AVOID = 3,
-        OTHER_COUNT = 256 - BYTES_TO_AVOID,
-        LAST_COUNT = OTHER_COUNT / 2,
-        LAST_COUNT2 = (SUPPLEMENTARY_COUNT - 1) / (OTHER_COUNT * OTHER_COUNT) + 1, // last byte
-        HAN_SHIFT = LAST_COUNT * OTHER_COUNT - HAN_START,
-        IMPLICIT_BOUNDARY = 2 * OTHER_COUNT * LAST_COUNT + HAN_START,
-        LAST2_MULTIPLIER = OTHER_COUNT / LAST_COUNT2;
-    
-    
+    /*
    static boolean isFixedIdeograph(int cp) {
        return (0x3400 <= cp && cp <= 0x4DB5 
            || 0x4E00 <= cp && cp <= 0x9FA5 
@ -2246,6 +2342,7 @@ public class WriteCollationData implements UCD_Types {
            || 0x2F800 <= cp && cp <= 0x2FA1D // compat: most of these decompose anyway
            );
    }
+    */
 /*
 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
@ -2258,6 +2355,7 @@ public class WriteCollationData implements UCD_Types {
 2FA1D;CJK COMPATIBILITY IDEOGRAPH-2FA1D;Lo;0;L;2A600;;;;N;;;;;
 */
    
+    /*
    static int remapUCA_CompatibilityIdeographToCp(int cp) {
        switch (cp) {    
            case 0x9FA6: return 0xFA0E; // FA0E ; [.9FA6.0020.0002.FA0E] # CJK COMPATIBILITY IDEOGRAPH-FA0E
@ -2275,6 +2373,45 @@ public class WriteCollationData implements UCD_Types {
        }
        return cp;
    }
+    */
+    
+    /**
+     * Function used to collapse the two different Han blocks from UCA into one.
+     * It does this by reversing the order of the two groups A and B below.
+     * A:
+	 *	4E00..9FFF; CJK Unified Ideographs
+	 *	F900..FAFF; CJK Compatibility Ideographs
+	 * B:
+	 *	3400..4DBF; CJK Unified Ideographs Extension A
+	 * As long as
+	 *	no new B characters are allocated between 4E00 and FAFF, and
+	 *	no new A characters are outside of this range,
+	 * (very high probability) this simple code will work.
+	 */
+    static int swapCJK(int i) {
+    	if (i >= UCA.CJK_LIMIT_COMPAT_USED) return i;
+    	if (i >= UCA.CJK_BASE) return i - UCA.CJK_BASE;
+    	return i + (UCA.CJK_LIMIT_COMPAT_USED - UCA.CJK_BASE);
+    }
+    
+    // CONSTANTS
+    
+    static final int 
+        BYTES_TO_AVOID = 3,
+        OTHER_COUNT = 256 - BYTES_TO_AVOID,
+        LAST_COUNT = OTHER_COUNT / 2,
+        LAST_COUNT2 = OTHER_COUNT / 16, // room for intervening, without expanding to 5 bytes
+        IMPLICIT_3BYTE_COUNT = 1,
+        IMPLICIT_BASE_BYTE = 0xE0,
+        
+        IMPLICIT_LIMIT_BYTE = IMPLICIT_BASE_BYTE + 4, // leave room for 1 3-byte and 2 4-byte forms
+        
+        IMPLICIT_4BYTE_BOUNDARY = IMPLICIT_3BYTE_COUNT * OTHER_COUNT * LAST_COUNT,
+        LAST_MULTIPLIER = OTHER_COUNT / LAST_COUNT,
+        LAST2_MULTIPLIER = OTHER_COUNT / LAST_COUNT2,
+        IMPLICIT_BASE_3BYTE = (IMPLICIT_BASE_BYTE << 24) + 0x030300,
+        IMPLICIT_BASE_4BYTE = ((IMPLICIT_BASE_BYTE + IMPLICIT_3BYTE_COUNT) << 24) + 0x030303
+        ;
    
    // GET IMPLICIT PRIMARY WEIGHTS
    // Return value is left justified primary key
@ -2287,22 +2424,62 @@ public class WriteCollationData implements UCD_Types {
        // Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
        // Four byte forms (most supplementaries) are EF xx xx xx (with a gap of LAST2_MULTIPLIER == 14)
        
-        int last0 = cp - IMPLICIT_BOUNDARY;
-        int hanFixup = 0;
-        if (isFixedIdeograph(cp)) hanFixup = 0x04000000;
+        if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
+        
+        if (!UCA.isCJK(cp) && !UCA.isCJK_AB(cp)) cp += 0x10FFFF; // space everything else after CJK
+        
+        if (DEBUG) System.out.println("Remapped: " + Utility.hex(cp));
+        
+        cp = swapCJK(cp);
+        
+        if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
+        // we now have a range of numbers from 0 to 21FFFF.
+        
+        int last0 = cp - IMPLICIT_4BYTE_BOUNDARY;
        if (last0 < 0) {
-            cp += HAN_SHIFT; // shift so HAN shares single block
            int last1 = cp / LAST_COUNT;
            last0 = cp % LAST_COUNT;
+            
            int last2 = last1 / OTHER_COUNT;
            last1 %= OTHER_COUNT;
-            return 0xEC030300 - hanFixup + (last2 << 24) + (last1 << 16) + (last0 << 9);
+            
+            if (DEBUG || last2 > 0xFF-BYTES_TO_AVOID) System.out.println("3B: " + Utility.hex(cp) + " => "
+            	+ Utility.hex(last2) + ", "
+            	+ Utility.hex(last1) + ", "
+            	+ Utility.hex(last0) + ", "
+        	);
+            
+            return IMPLICIT_BASE_3BYTE + (last2 << 24) + (last1 << 16) + ((last0*LAST_MULTIPLIER) << 8);
        } else {
            int last1 = last0 / LAST_COUNT2;
            last0 %= LAST_COUNT2;
+            
            int last2 = last1 / OTHER_COUNT;
            last1 %= OTHER_COUNT;
-            return 0xEF030303 - hanFixup + (last2 << 16) + (last1 << 8) + (last0 * LAST2_MULTIPLIER);
+            
+            int last3 = last2 / OTHER_COUNT;
+            last2 %= OTHER_COUNT;
+            
+            if (DEBUG || last3 > 0xFF-BYTES_TO_AVOID) System.out.println("4B: " + Utility.hex(cp) + " => "
+            	+ Utility.hex(last3) + ", "
+            	+ Utility.hex(last2) + ", "
+            	+ Utility.hex(last1) + ", "
+            	+ Utility.hex(last0 * LAST2_MULTIPLIER) + ", "
+        	);
+
+           return IMPLICIT_BASE_4BYTE + (last3 << 24) + (last2 << 16) + (last1 << 8) + (last0 * LAST2_MULTIPLIER);
+        }
+    }
+    
+    
+    static void showImplicit(String title, int cp) {
+    	if (DEBUG) {
+        	System.out.println(title + "-1: " + Utility.hex(cp-1) + " => "
+        		+ Utility.hex(0xFFFFFFFFL & getImplicitPrimary(cp-1)));
+        	System.out.println(title + ": " + Utility.hex(cp) + " => "
+        		+ Utility.hex(0xFFFFFFFFL & getImplicitPrimary(cp)));
+        	System.out.println(title + "+1: " + Utility.hex(cp+1) + " => "
+        		+ Utility.hex(0xFFFFFFFFL & getImplicitPrimary(cp+1)));
        }
    }
    
@ -2311,35 +2488,65 @@ public class WriteCollationData implements UCD_Types {
    static void checkImplicit() {
        long oldPrimary = 0;
        System.out.println("Starting Implicit Check");
-        int mask = ~0x04000000;
-        for (int i = 0; i <= 0x10FFFF; ++i) {
-            long newPrimary = 0xFFFFFFFFL & getImplicitPrimary(i);
-            
-            // test correct values
-            
-            if ((newPrimary & mask) < (oldPrimary & mask)) {
-                throw new IllegalArgumentException(Utility.hex(i) + ": overlap: " + Utility.hex(oldPrimary) + " > " + Utility.hex(newPrimary));
-            }
-            
-            long b0 = (newPrimary >> 24) & 0xFF;
-            long b1 = (newPrimary >> 16) & 0xFF;
-            long b2 = (newPrimary >> 8) & 0xFF;
-            long b3 = newPrimary & 0xFF;
-            
-            if (b0 < 0xE8 || b0 > 0xEF || b1 < 3 || b2 < 3 || b3 == 1 || b3 == 2) {
-                throw new IllegalArgumentException(Utility.hex(i) + ": illegal byte value: " + Utility.hex(newPrimary)
-                    + ", " + Utility.hex(b1) + ", " + Utility.hex(b2) + ", " + Utility.hex(b3));
-            }
-            
-            // print range to look at
-            
-            if (false) {
-                int b = i & 0xFF;
-                if (b == 255 || b == 0 || b == 1) {
-                    System.out.println(Utility.hex(i) + " => " + Utility.hex(newPrimary));
-                }
-            }
-            oldPrimary = newPrimary;
+        
+        showImplicit("# First CJK", UCA.CJK_BASE);
+        showImplicit("# Last CJK", UCA.CJK_LIMIT-1);
+        showImplicit("# First CJK-compat", UCA.CJK_BASE_COMPAT_USED);
+        showImplicit("# Last CJK-compat", UCA.CJK_LIMIT_COMPAT_USED-1);
+        showImplicit("# First CJK_A", UCA.CJK_A_BASE);
+        showImplicit("# Last CJK_A", UCA.CJK_A_LIMIT-1);
+        showImplicit("# First CJK_B", UCA.CJK_B_BASE);
+        showImplicit("# Last CJK_B", UCA.CJK_B_LIMIT-1);
+        showImplicit("# First Other Implicit", 0);
+        showImplicit("# Last Other Implicit", 0x10FFFF);
+        showImplicit("# Boundary", IMPLICIT_4BYTE_BOUNDARY);
+        
+        
+        int oldChar = -1;
+        for (int batch = 0; batch < 3; ++batch) {
+        	for (int i = 0; i <= 0x10FFFF; ++i) {
+        		
+        		// separate the three groups
+        		
+        		if (UCA.isCJK(i)) {
+        			if (batch != 0) continue;
+        		} else if (UCA.isCJK_AB(i)) {
+        			if (batch != 1) continue;
+        		} else if (batch != 2) continue;
+        		
+            	long newPrimary = 0xFFFFFFFFL & getImplicitPrimary(i);
+	            
+            	// test correct values
+	            
+	            
+            	if ((newPrimary) < (oldPrimary)) {
+                	throw new IllegalArgumentException(Utility.hex(i) + ": overlap: "
+                		+ Utility.hex(oldChar) + ", " + Utility.hex(oldPrimary)
+                		+ Utility.hex(i) + ", " + " > " + Utility.hex(newPrimary));
+            	}
+	            
+	            
+            	long b0 = (newPrimary >> 24) & 0xFF;
+            	long b1 = (newPrimary >> 16) & 0xFF;
+            	long b2 = (newPrimary >> 8) & 0xFF;
+            	long b3 = newPrimary & 0xFF;
+	            
+            	if (b0 < IMPLICIT_BASE_BYTE || b0 >= IMPLICIT_LIMIT_BYTE  || b1 < 3 || b2 < 3 || b3 == 1 || b3 == 2) {
+                	throw new IllegalArgumentException(Utility.hex(i) + ": illegal byte value: " + Utility.hex(newPrimary)
+                    	+ ", " + Utility.hex(b1) + ", " + Utility.hex(b2) + ", " + Utility.hex(b3));
+            	}
+	            
+            	// print range to look at
+	            
+            	if (false) {
+                	int b = i & 0xFF;
+                	if (b == 255 || b == 0 || b == 1) {
+                    	System.out.println(Utility.hex(i) + " => " + Utility.hex(newPrimary));
+                	}
+            	}
+            	oldPrimary = newPrimary;
+            	oldChar = i;
+        	}
        }
        System.out.println("Successful Implicit Check!!");
    }
@ -2448,16 +2655,12 @@ public class WriteCollationData implements UCD_Types {
        
    
    static final int secondaryDoubleStart = 0xD0;
+    static final int MARK_CODE_POINT = 0x40000000;
    
    static int fixPrimary(int x) {
        int result = 0;
-        if (x <= 0xFFFF) result = primaryDelta[x];
-        else result = getImplicitPrimary(x);
-        
-        /*if (x > 0x3400) {
-            System.out.println(Utility.hex(x) + " => " + Utility.hex(result));
-        }
-        */
+        if ((x & MARK_CODE_POINT) != 0) result = getImplicitPrimary(x & ~MARK_CODE_POINT);
+        else result = primaryDelta[x];
        return result;
    }
    
@ -2898,7 +3101,7 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;;
            0xE0000, 0xEFFFD, 0xEFFFE, 0xEFFFF,
            0xF0000, 0xFFFFD, 0xFFFFE, 0xFFFFF,
            0x100000, 0x10FFFD, 0x10FFFE, 0x10FFFF,
-            IMPLICIT_BOUNDARY, IMPLICIT_BOUNDARY-1, IMPLICIT_BOUNDARY+1,
+            IMPLICIT_4BYTE_BOUNDARY, IMPLICIT_4BYTE_BOUNDARY-1, IMPLICIT_4BYTE_BOUNDARY+1,
        };
        
    static final int MARK = 1;
--- a/tools/unicodetools/com/ibm/text/utility/Utility.java
+++ b/tools/unicodetools/com/ibm/text/utility/Utility.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
-* $Date: 2002/05/31 01:41:04 $
-* $Revision: 1.16 $
+* $Date: 2002/06/02 05:07:08 $
+* $Revision: 1.17 $
 *
 *******************************************************************************
 */
@ -535,6 +535,9 @@ public final class Utility {    // COMMON UTILITIES
        return openPrintWriter(filename, true, true);
    }
    
+    // Normally use false, false.
+    // But for UCD files use true, true
+    // Or if they are UTF8, use true, false
    public static PrintWriter openPrintWriter(String filename, boolean removeCR, boolean latin1) throws IOException {
        File file = new File(getOutputName(filename));
        System.out.println("Creating File: " + file);