scuffed-code/tools/unicodetools/com/ibm/text/UCA/UCA.java

/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $ 
* $Date: 2001/12/03 19:29:35 $ 
* $Revision: 1.8 $
*
*******************************************************************************
*/

package com.ibm.text.UCA;

import java.util.*;
import java.io.BufferedReader;
import java.io.Reader;
import java.io.PrintWriter;
import java.io.FileReader;
import java.text.MessageFormat;
import java.io.IOException;
import com.ibm.text.UCD.Normalizer;
import com.ibm.text.UCD.UCD;
import com.ibm.text.utility.*;
import com.ibm.text.UTF16;

//import com.ibm.text.CollationData.*;

/**
* Collator is a working version of UTR#10 Unicode Collation Algorithm,
* as described on http://www.unicode.org/unicode/reports/tr10/
* @author Mark Davis

It is not optimized, although it does use some techniques that are required for
a real optimization, such as squeezing all the weights into 32 bits.<p>

Invariants relied upon by the algorithm:

UCA Data:
1. While it contains secondaries greater than 0xFF, 
these can be folded down by subtracting 0xC0--without collision--to be less than 0xFF
2. Tertiary values are less than 0x80
3. Contracting characters must be "completed": if "abcd" is a contracting character, 
then "abc" is also.
4. Variables (marked with *), have a distinct, closed range of primaries. 
That is, there are no variable CEs X, Z and non-ignorable CE Y such that X[1] <= Y[1] <= Z[1]
5. It needs to be fixed when reading: only non-zero weights (levels 1-3) are really variable!

#4 saves a bit in each CE.

Limits
1. There is a limit on the number of expanding characters. If N is the number of expanding
characters, then their total lengths must be less than 65536-N. This should never pose a
problem in practice.
2. If any of the weight limits are reached (FFFF for primary, FF for secondary, tertiary),
expanding characters can be used to achieve the right results, as discussed in UTR#10.

Remarks:
Neither the old 14651 nor the old UCA algorithms for backwards really worked.
This is because of shared
characters between scripts with different directions, like French with Arabic or Greek.
*/

final public class UCA implements Comparator {
    public static final String copyright = 
      "Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
      
    /**
     * Orders two strings by the sort keys this collator builds for them with its
     * current default alternate and decomposition settings.
     * @param a first string; cast to String
     * @param b second string; cast to String
     * @return the result of String.compareTo applied to the two sort keys
     */
    public int compare(Object a, Object b) {
        final String firstKey = getSortKey((String) a);
        final String secondKey = getSortKey((String) b);
        return firstKey.compareTo(secondKey);
    }

    /**
     * Version of the UCA tables to use
     */
    //private static final String VERSION = "-3.0.1d3"; // ""; // "-2.1.9d7"; 
    public static final String VERSION = "-3.1.1d1"; // ""; // "-2.1.9d7"; 
    public static final String ALLFILES = "allkeys"; // null if not there
    
    /**
     * Records the codeversion
     */
    private static final String codeVersion = "7";

    // base directory will change depending on the installation
    public static final String BASE_DIR = "c:\\DATA\\";
    
    /** Enum for alternate handling */
    public static final byte SHIFTED = 0, ZEROED = 1, NON_IGNORABLE = 2, SHIFTED_TRIMMED = 3, LAST = 3;
    
    /**
     * Used to terminate a list of CEs
     */
    public static final int TERMINATOR = 0xFFFFFFFF;   // CE that marks end of string
         
    
// =============================================================
// Test Settings
// =============================================================
    static final boolean DEBUG = false;
    static final boolean SHOW_STATS = true;
    
    static final boolean SHOW_CE = false;
    static final boolean CHECK_UNIQUE = false;
    static final boolean CHECK_UNIQUE_EXPANSIONS = false; // only effective if CHECK_UNIQUE
    static final boolean CHECK_UNIQUE_VARIABLES = false; // only effective if CHECK_UNIQUE
    static final boolean TEST_BACKWARDS = false;
    static final boolean RECORDING_DATA = false;
    static final boolean RECORDING_CHARS = true;
    
// =============================================================
// Main Methods
// =============================================================

    /**
     * Initializes the collation from a stream of rules in the normal format.
     * If the source is null, every file listed in KEYS (paths built from BASE_DIR)
     * is read instead.
     * @param source reader over the collation element data, or null to load the
     * files listed in KEYS. A caller-supplied reader is not closed here.
     * @param unicodeVersion version string handed to the Normalizer and to UCD.make
     * @throws java.io.IOException if a data file cannot be opened, read or closed
     */
    public UCA(BufferedReader source, String unicodeVersion) throws java.io.IOException {
        fullData = source == null;

        // every code unit starts out as UNSUPPORTED until the data assigns it a CE
        Arrays.fill(collationElements, UNSUPPORTED);

        // load the normalizer
        // NOTE(review): toD is static and only created once, so a later instance built
        // with a different unicodeVersion keeps using the first normalizer -- confirm
        // that this is intended.
        if (toD == null) {
            toD = new Normalizer(Normalizer.NFD, unicodeVersion);
        }

        ucdVersion = UCD.make(unicodeVersion).getVersion();

        // either get the full sources, or just a demo set
        if (fullData) {
            for (int i = 0; i < KEYS.length; ++i) {
                BufferedReader in = new BufferedReader(
                    new FileReader(KEYS[i]), BUFFER_SIZE);
                try {
                    addCollationElements(in);
                } finally {
                    // release the file handle even when parsing fails
                    in.close();
                }
            }
        } else {
            addCollationElements(source);
        }
        cleanup();
    }
    
    /**
     * Builds a sort key for a string, using this collator's default alternate
     * handling and default decomposition setting.
     * @param sourceString string to make a sort key for.
     * @return a String whose chars are collation weights rather than Unicode
     * characters. String is just a handy container in Java, since there are no
     * unsigned shorts.
     */
    public String getSortKey(String sourceString) {
        final byte alternate = defaultAlternate;
        final boolean decomposition = defaultDecomposition;
        return getSortKey(sourceString, alternate, decomposition);
    }
    /**
     * Builds a sort key for a string with an explicit alternate setting, using
     * this collator's default decomposition setting.
     * @param sourceString string to make a sort key for.
     * @param alternate choice of different 4th level weight construction
     * @return a String whose chars are collation weights rather than Unicode
     * characters. String is just a handy container in Java, since there are no
     * unsigned shorts.
     */
    public String getSortKey(String sourceString, byte alternate) {
        final boolean decomposition = defaultDecomposition;
        return getSortKey(sourceString, alternate, decomposition);
    }
    
    /**
     * Constructs a sort key for a string of input Unicode characters.
     * <p>
     * The key is laid out as the primary weights, then (if strength is at least 2)
     * a 0000 separator and the secondary weights, then (strength at least 3) a 0000
     * separator and the tertiary weights, then (strength at least 4) a 0000 separator
     * and the quaternary weights. Zero weights are never written, so 0000 only
     * appears as a separator.
     * <p>
     * NOTE(review): the method works in the instance-level scratch buffers
     * (primaries, secondaries, decompositionBuffer, ...) and no synchronization is
     * visible here, so it does not look safe to call concurrently on one instance.
     * @param sourceString string to make a sort key for.
     * @param alternate choice of different 4th level weight construction; compared
     * against ZEROED, SHIFTED and SHIFTED_TRIMMED, any other value leaves the CEs unchanged
     * @param decomposition true for UCA, false where the text is guaranteed to be
     * normalization form C with no combining marks of class 0.
     * @return Result is a String not really of Unicode characters, but of weights.
     * String is just a handy way of returning them in Java, since there are no
     * unsigned shorts.
     */
    public String getSortKey(String sourceString, byte alternate, boolean decomposition) {
        // fill the decomposition buffer, normalizing through toD only when requested
        decompositionBuffer.setLength(0);
        if (decomposition) {
            toD.normalize(sourceString, decompositionBuffer);
        } else {
            decompositionBuffer.append(sourceString);
        }
        storedDecomposition = decomposition;    // record the setting for other methods
        index = 0;                              // position in source string

        // Weight strings - not chars, weights.
        primaries.setLength(0);             // clear out
        secondaries.setLength(0);           // clear out
        tertiaries.setLength(0);            // clear out
        quaternaries.setLength(0);          // clear out
        if (SHOW_CE) debugList.setLength(0); // clear out
        
        rearrangeBuffer = EMPTY;            // clear the rearrange buffer (thai)
        hangulBufferPosition = 0;           // clear hangul buffer
        hangulBuffer.setLength(0);           // clear hangul buffer
        
        // weight4 is only reassigned in the SHIFTED cases below; for the other
        // settings it stays zero and no quaternary weight is ever appended
        char weight4 = '\u0000'; // DEFAULT FOR NON_IGNORABLE

        // process CEs, building weight strings
        while (true) {
            //fixQuaternatiesPosition = quaternaries.length();
            int ce = getCE();
            if (ce == TERMINATOR) break;
            if (ce == 0) continue;      // an all-zero CE contributes nothing at any level
            
            switch (alternate) {
              case ZEROED:
                // variable CEs are dropped entirely
                if (isVariable(ce)) {
                    ce = 0;
                }
                break;
              case SHIFTED_TRIMMED:
              case SHIFTED:
                // ce == 0 was already skipped by the continue above, so the first
                // branch cannot be taken at this point
                if (ce == 0) {
                    weight4 = 0;
                } else if (isVariable(ce)) { // variables
                    // the primary moves to the fourth level, levels 1-3 become zero
                    weight4 = getPrimary(ce);
                    ce = 0;
                } else { // above variables
                    weight4 = '\uFFFF';
                }
                break;
              // case NON_IGNORABLE: // doesn't ever change!
            }
            if (SHOW_CE) {
                if (debugList.length() != 0) debugList.append("/");
                debugList.append(ceToString(ce));
            }
            
            // add weights
            char w = getPrimary(ce);
            if (DEBUG) System.out.println("\tCE: " + Utility.hex(ce));
            if (w != 0) primaries.append(w);
            
            w = getSecondary(ce);
            if (w != 0) {
                if (!useBackwards) {
                    secondaries.append(w);
                } else {
                    // backwards: newest secondary goes to the front, reversing the order
                    secondaries.insert(0, w);
                }
            }
            
            w = getTertiary(ce);
            if (w != 0) tertiaries.append(w);
   
            if (weight4 != 0) quaternaries.append(weight4);
        }
        
        // Produce weight strings
        // For simplicity, we use the strength setting here.
        // To optimize, we wouldn't actually generate the weights in the first place.
        
        // result aliases the primaries buffer: the other levels are appended in place
        StringBuffer result = primaries;
        if (strength >= 2) {
            result.append('\u0000');    // separator
            result.append(secondaries);
            if (strength >= 3) {
                result.append('\u0000');    // separator
                result.append(tertiaries);
                if (strength >= 4) {
                    result.append('\u0000');    // separator
                    if (alternate == SHIFTED_TRIMMED) {
                        // drop the trailing run of FFFF quaternaries
                        int q;
                        for (q = quaternaries.length()-1; q >= 0; --q) {
                            if (quaternaries.charAt(q) != '\uFFFF') {
                                break;
                            }
                        }
                        quaternaries.setLength(q+1);
                    }
                    result.append(quaternaries);
                    //appendInCodePointOrder(decompositionBuffer, result);
                }
            }
        }
        return result.toString();
    }
    
    /**
     * Compares two sort keys and reports the level of the first difference.
     * The magnitude of the result is:
     * <pre>
     * 0  keys are identical
     * 2  quaternary difference
     * 3  tertiary difference
     * 4  secondary difference
     * 5  primary difference
     * </pre>
     * The level starts at 5 and drops by one for every 0000 separator that both
     * keys share before the difference. The sign is negative when sortKey1 orders
     * before sortKey2 and positive when it orders after. When one key is a prefix
     * of the other, the shorter key orders first at the level reached so far.
     * @param sortKey1 first sort key
     * @param sortKey2 second sort key
     * @return signed level of the first difference, or 0 if the keys are equal
     */
    public static int strengthDifference(String sortKey1, String sortKey2) {
        final int length1 = sortKey1.length();
        final int length2 = sortKey2.length();
        final int shared = Math.min(length1, length2);
        int level = 5;
        int pos = 0;
        while (pos < shared) {
            final char weight1 = sortKey1.charAt(pos);
            final char weight2 = sortKey2.charAt(pos);
            if (weight1 != weight2) {
                return weight1 < weight2 ? -level : level;
            }
            if (weight1 == '\u0000') {
                --level; // shared separator: move on to the next, weaker level
            }
            ++pos;
        }
        if (length1 == length2) {
            return 0;
        }
        return length1 < length2 ? -level : level;
    }
    
    /**
     * Selects whether secondary weights are collected in reverse order
     * (e.g. for French). The setting is global: it applies to every secondary
     * weight of every sort key built afterwards.
     * @param backwards true to reverse the secondaries
     */
    public void setBackwards(boolean backwards) {
        this.useBackwards = backwards;
    }

    /**
     * Reports the value last applied by setBackwards.
     * @return true if secondaries are collected in reverse order
     */
    public boolean isBackwards() {
        return this.useBackwards;
    }

    /**
     * Sets the decomposition flag used by the getSortKey overloads that do not
     * take an explicit decomposition argument. When true, the input is run
     * through the NFD normalizer before collation elements are fetched.
     * @param state new default decomposition setting
     */
    public void setDecompositionState(boolean state) {
        this.defaultDecomposition = state;
    }

    /**
     * Reports the value last applied by setDecompositionState.
     * @return the default decomposition setting
     */
    public boolean isDecomposed() {
        return this.defaultDecomposition;
    }

    /**
     * Sets the default handling of variables (those with *), used by
     * getSortKey(String). The value is stored as given, without validation.
     * @param status one of SHIFTED, ZEROED, NON_IGNORABLE, SHIFTED_TRIMMED
     */
    public void setAlternate(byte status) {
        this.defaultAlternate = status;
    }

    /**
     * Reports the value last applied by setAlternate.
     * @return the default alternate setting
     */
    public byte getAlternate() {
        return this.defaultAlternate;
    }

    /**
     * Sets the highest weight level that getSortKey writes into a sort key.
     * E.g. with 3, only weights of levels 1, 2 and 3 are included and level 4
     * weights are discarded.
     * @param inStrength maximum level to include
     */
    public void setStrength(int inStrength) {
        this.strength = inStrength;
    }

    /**
     * Reports the value last applied by setStrength.
     * @return maximum weight level included in sort keys
     */
    public int getStrength() {
        return this.strength;
    }
    
    /**
     * Retrieves the version of this code, as recorded in the codeVersion constant.
     * @return code version string
     */
    public String getCodeVersion() {
        final String version = codeVersion;
        return version;
    }

    /**
     * Retrieves the recorded version of the collation data.
     * @return data version string; the field starts out as "?"
     */
    public String getDataVersion() {
        return this.dataVersion;
    }
    
    /**
     * Retrieves the UCD version recorded when this collator was constructed.
     * @return UCD version string
     */
    public String getUCDVersion() {
        return this.ucdVersion;
    }
    
    /**
     * Convenience form of appendInCodePointOrder that returns the transformed
     * text as a new String.
     * @param s Normal UTF-16 (Java) string
     * @return the text produced by appendInCodePointOrder for s
     */
    public static String codePointOrder(String s) {
        final StringBuffer reordered = new StringBuffer();
        appendInCodePointOrder(s, reordered);
        return reordered.toString();
    }

    /**
     * Appends a UTF-16 string with the code unit values swapped around so that
     * the result compares in code-point order. The code units 0000 and 0001 are
     * replaced by the pairs 0001 0001 and 0001 0002 respectively, so the output
     * never contains 0000. Every other code unit is shifted by the offset that
     * utf16CodePointOrder holds for its 2K block.
     * @param source Normal UTF-16 (Java) string
     * @param target buffer the transformed code units are appended to
     * @return target, to allow chaining
     * @author Markus Scherer (cast into Java by MD)
     */
    public static StringBuffer appendInCodePointOrder(String source, StringBuffer target) {
        for (int i = 0; i < source.length(); ++i) {
            int ch = source.charAt(i);
            if (ch <= 1) { // hack to avoid nulls
                // escape pair replaces the code unit; it must not be appended again,
                // otherwise a 0000 would still end up in the output
                target.append('\u0001');
                target.append((char)(ch+1));
            } else {
                target.append((char)(ch + utf16CodePointOrder[ch>>11]));
            }
        }
        return target;
    }
    
    /**
     * Collects the non-zero collation elements for a whole string.
     * <p>
     * All per-call state read by getCE() is reset on entry, in the same way as
     * getSortKey does. This matters when a previous call was aborted by an
     * ArrayIndexOutOfBoundsException (getCEList relies on that to grow its buffer):
     * without the reset, collation elements left on the expanding stack or
     * characters left in the Hangul buffer would leak into the next result.
     * @param sourceString string to get the collation elements for.
     * @param decomposition true for UCA, false where the text is guaranteed to be
     * normalization form C with no combining marks of class 0.
     * @param output array for output. Must be large enough on entry, otherwise an
     * ArrayIndexOutOfBoundsException is thrown. No TERMINATOR is written; use the
     * returned count.
     * @return count of CEs stored in output
     */
    public int getCEs(String sourceString, boolean decomposition, int[] output) {
        decompositionBuffer.setLength(0);
        if (decomposition) {
            toD.normalize(sourceString, decompositionBuffer);
        } else {
            decompositionBuffer.append(sourceString);
        }
        // NOTE(review): getSortKey also records the setting in storedDecomposition;
        // this method never has. Confirm whether getCE() needs it.

        rearrangeBuffer = EMPTY;            // clear the rearrange buffer (thai)
        hangulBufferPosition = 0;           // clear hangul buffer
        hangulBuffer.setLength(0);          // clear hangul buffer
        while (!expandingStack.isEmpty()) { // drop CEs left over from an aborted call
            expandingStack.pop();
        }
        index = 0;
        int outpos = 0;
        output[0] = 0; // just in case!!

        // process CEs, storing every non-zero one
        while (true) {
            int ce = getCE();
            if (ce == 0) continue;
            if (ce == TERMINATOR) break;
            output[outpos++] = ce;
        }
        return outpos;
    }
    
    /**
     * Returns the collation elements of a string as a CEList.
     * The elements are gathered with getCEs into the shared ceListBuffer. Whenever
     * that buffer turns out to be too small, it is replaced by one of twice the
     * size and the string is processed again.
     * @param sourceString string to get the collation elements for.
     * @param decomposition true for UCA, false where the text is guaranteed to be
     * normalization form C with no combining marks of class 0.
     * @return CEList built from the first count entries of ceListBuffer
     */
    public CEList getCEList(String sourceString, boolean decomposition) {
        int count = -1; // getCEs never returns a negative count
        do {
            try {
                count = getCEs(sourceString, decomposition, ceListBuffer);
            } catch (ArrayIndexOutOfBoundsException overflow) {
                // buffer too small: double it and start over
                ceListBuffer = new int[2 * ceListBuffer.length];
            }
        } while (count < 0);
        return new CEList(ceListBuffer, 0, count);
    }
    
    int[] ceListBuffer = new int[30]; // temporary storage, to avoid multiple creation
    
    
    /**
     * Returns the weight usage set for a level.
     * @param strength 1 selects primarySet, 2 selects secondarySet; any other
     * value selects tertiarySet
     * @return the set for the requested level
     */
    public BitSet getWeightUsage(int strength) {
        if (strength == 1) {
            return primarySet;
        }
        if (strength == 2) {
            return secondarySet;
        }
        return tertiarySet;
    }
     
    /**
     * CE Type
     */
    static final byte NORMAL_CE = 0, CONTRACTING_CE = 1, EXPANDING_CE = 2, 
        FIXED_CE = 3, HANGUL_CE = 5, SURROGATE_CE = 6, UNSUPPORTED_CE = 7;
   
    /**
     * Returns the char associated with a FIXED value, which is the primary
     * weight of the given CE.
     * @param ce collation element
     * @return primary weight of ce
     */
    public char charFromFixed(int ce) {
        final char primary = getPrimary(ce);
        return primary;
    }
    
    /**
     * Classifies the collationElements entry of a character.
     * A code point above FFFF is looked up through its lead surrogate.
     * @param ch code point to classify
     * @return NORMAL_CE if the entry is not an exception value; CONTRACTING_CE if
     * it equals CONTRACTING; EXPANDING_CE for any other exception value except
     * UNSUPPORTED. An UNSUPPORTED entry yields HANGUL_CE, FIXED_CE, SURROGATE_CE
     * (for D800..DFFF) or UNSUPPORTED_CE, checked in that order.
     */
    public byte getCEType(int ch) {
        // first if expands
        final int unit = ch > 0xFFFF ? UTF16.getLeadSurrogate(ch) : ch;
        final int ce = collationElements[unit];

        final boolean exceptional = (ce & EXCEPTION_CE_MASK) == EXCEPTION_CE_MASK;
        if (!exceptional) {
            return NORMAL_CE;
        }
        if (ce == CONTRACTING) {
            return CONTRACTING_CE;
        }
        if (ce != UNSUPPORTED) {
            return EXPANDING_CE;
        }

        // unsupported entry: special checks for Han, Hangul
        if (isHangul(unit)) {
            return HANGUL_CE;
        }
        if (isFixed(unit)) {
            return FIXED_CE;
        }
        // special check for unsupported surrogate pair, 20 1/8 bits
        final boolean surrogate = 0xD800 <= unit && unit <= 0xDFFF;
        return surrogate ? SURROGATE_CE : UNSUPPORTED_CE;
    }

    /**
     * Extracts the primary weight from a 32-bit CE.
     * The primary is 16 bits, stored in b31..b16.
     * @param ce collation element
     * @return primary weight
     */
    public static char getPrimary(int ce) {
        final int upperHalf = (ce >> 16) & 0xFFFF;
        return (char) upperHalf;
    }

    /**
     * Extracts the secondary weight from a 32-bit CE.
     * The secondary field is 9 bits wide, stored in b15..b7 (matching the shift
     * used by makeKey).
     * @param ce collation element
     * @return secondary weight, 0..1FF
     */
    public static char getSecondary(int ce) {
        final int field = ce & 0xFF80;  // b15..b7
        return (char) (field >>> 7);
    }

    /**
     * Extracts the tertiary weight from a 32-bit CE.
     * The tertiary field is 7 bits wide, stored in b6..b0.
     * @param ce collation element
     * @return tertiary weight, 0..7F
     */
    public static char getTertiary(int ce) {
        final int lowBits = ce & 0x7F;
        return (char) lowBits;
    }

    /**
     * Determines whether a CE is variable, i.e. whether it lies in the closed
     * range variableLowCE..variableHighCE. The comparison is a plain signed int
     * comparison of the whole CE.
     * @param ce collation element
     * @return true if ce is within the variable range
     */
    public boolean isVariable(int ce) {
        if (ce < variableLowCE) {
            return false;
        }
        return ce <= variableHighCE;
    }
    
    /**
     * Returns the lower bound of the variable range, the value isVariable
     * tests against.
     * @return lowest variable CE
     */
    public int getVariableLow() {
        return this.variableLowCE;
    }
    
    /**
     * Returns the upper bound of the variable range, the value isVariable
     * tests against.
     * @return highest variable CE
     */
    public int getVariableHigh() {
        return this.variableHighCE;
    }
    
    /**
     * Packs three weights into a 32-bit CE: primary in b31..b16, secondary
     * starting at b7, tertiary in the low bits. The pieces are OR-ed together
     * without masking, so they must already be in the right range of values.
     * @param primary primary weight
     * @param secondary secondary weight
     * @param tertiary tertiary weight
     * @return the combined CE
     */
    public static int makeKey(int primary, int secondary, int tertiary) {
        final int primaryBits = primary << 16;
        final int secondaryBits = secondary << 7;
        return primaryBits | secondaryBits | tertiary;
    }
    
// =============================================================
// Utility methods
// =============================================================

    /**
     * Produces a human-readable string for a sort key.
     * The result is wrapped in square brackets; the weights are written in hex,
     * separated by single spaces, and each 0000 separator is shown as '|'.
     * @param sortKey sort key to display
     * @return readable form of the sort key
     */
    static public String toString(String sortKey) {
        final int count = sortKey.length();
        StringBuffer text = new StringBuffer();
        text.append("[");
        for (int pos = 0; pos < count; ++pos) {
            if (pos > 0) {
                text.append(" ");   // space before every item except the first
            }
            char weight = sortKey.charAt(pos);
            if (weight == 0) {
                text.append("|");
            } else {
                text.append(Utility.hex(weight));
            }
        }
        text.append("]");
        return text.toString();
    }
    
    /**
     * Produces a human-readable string for a collation element, in the form
     * [primary.secondary.tertiary] with each weight in hex.
     * @param ce collation element
     * @return readable form of ce
     */
    static public String ceToString(int ce) {
        final String primaryHex = Utility.hex(getPrimary(ce));
        final String secondaryHex = Utility.hex(getSecondary(ce));
        final String tertiaryHex = Utility.hex(getTertiary(ce));
        return "[" + primaryHex + "." + secondaryHex + "." + tertiaryHex + "]";
    }
    
    /**
     * Produces a human-readable string for the first len collation elements
     * of an array. The count is given explicitly; no terminator is looked for.
     * @param ces collation elements
     * @param len number of elements to format
     * @return the formatted elements, concatenated without separators
     */
    static public String ceToString(int[] ces, int len) {
        final StringBuffer text = new StringBuffer();
        int pos = 0;
        while (pos < len) {
            text.append(ceToString(ces[pos]));
            ++pos;
        }
        return text.toString();
    }
    
    /**
     * Produces a human-readable string for a list of collation elements that
     * is terminated by TERMINATOR (-1). The array must contain the terminator;
     * there is no other bounds check.
     * @param ces collation elements, ending with TERMINATOR
     * @return the formatted elements, concatenated without separators
     */
    static public String ceToString(int[] ces) {
        final StringBuffer text = new StringBuffer();
        for (int pos = 0; ces[pos] != TERMINATOR; ++pos) {
            text.append(ceToString(ces[pos]));
        }
        return text.toString();
    }
    
    /**
     * Supplies a zero-padded hex representation of an integer (without 0x)
     */
    /*
    static public String hex(int i) {
        String result = Long.toString(i & 0xFFFFFFFFL, 16).toUpperCase();
        return "00000000".substring(result.length(),8) + result;
    }
    */
    /**
     * Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u)
     */
    /*
    static public String hex(char i) {
        String result = Integer.toString(i, 16).toUpperCase();
        return "0000".substring(result.length(),4) + result;
    }
    */
    /**
     * Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u)
     */
     /*
    static public String hex(byte b) {
        int i = b & 0xFF;
        String result = Integer.toString(i, 16).toUpperCase();
        return "00".substring(result.length(),2) + result;
    }
    */
    /**
     * Supplies a zero-padded hex representation of a Unicode String (without 0x, \\u)
     *@param sep can be used to give a sequence, e.g. hex("ab", ",") gives "0061,0062"
     */
     /*
    static public String hex(String s, String sep) {
        StringBuffer result = new StringBuffer();
        for (int i = 0; i < s.length(); ++i) {
            if (i != 0) result.append(sep);
            result.append(hex(s.charAt(i)));
        }
        return result.toString();
    }
    */
    /**
     * Supplies a zero-padded hex representation of a Unicode String (without 0x, \\u)
     *@param sep can be used to give a sequence, e.g. hex("ab", ",") gives "0061,0062"
     */
     /*
    static public String hex(StringBuffer s, String sep) {
        StringBuffer result = new StringBuffer();
        for (int i = 0; i < s.length(); ++i) {
            if (i != 0) result.append(sep);
            result.append(hex(s.charAt(i)));
        }
        return result.toString();
    }
    */
    
// =============================================================
// Privates
// =============================================================
    
    /**
     * Array used to reorder surrogates to top of 16-bit range, and others down.
     * Adds 2000 to D800..DFFF, making them F800..FFFF
     * Subtracts 800 from E000..FFFF, making them D800..F7FF
     */
    private static final int[] utf16CodePointOrder = {
        0, 0, 0, 0,                        // 00, 08, 10, 18
        0, 0, 0, 0,                        // 20, 28, 30, 38
        0, 0, 0, 0,                        // 40, 48, 50, 58
        0, 0, 0, 0,                        // 60, 68, 70, 78
        0, 0, 0, 0,                        // 80, 88, 90, 98
        0, 0, 0, 0,                        // A0, A8, B0, B8
        0, 0, 0, 0x2000,                   // C0, C8, D0, D8
        -0x800, -0x800, -0x800, -0x800     // E0, E8, F0, F8
    };

    /**
     * NFD required
     */
    private static Normalizer toD;

    /**
     * Records the dataversion
     */
    private String dataVersion = "?";

    /**
     * Records the UCD version
     */
    private String ucdVersion = "?";

    /**
     * Turns backwards (e.g. for French) on globally for all secondaries
     */
    private boolean useBackwards = false;
    
    /**
     * Choice of how to handle variables (those with *)
     */
    private byte defaultAlternate = SHIFTED;
    
    /**
     * For testing
     */
    private boolean defaultDecomposition = true;
    
    /**
     * Sets the maximum strength level to be included in the string. 
     * E.g. with 3, only weights of 1, 2, and 3 are included: level 4 weights are discarded.
     */
    private int strength = 4;
    
    /**
     * Position in decompositionBuffer used when constructing sort key
     */
    private int index;

    /**
     * List of files to use for constructing the CE data, used by build()
     */
    private static final String[] KEYS = {
        //"D:\\UnicodeData\\testkeys.txt",
        BASE_DIR + "Collation\\allkeys" + VERSION + ".txt",
        /*
        BASE_DIR + "UnicodeData\\Collation\\basekeys" + VERSION + ".txt",
        BASE_DIR + "UnicodeData\\Collation\\compkeys" + VERSION + ".txt",
        BASE_DIR + "UnicodeData\\Collation\\ctrckeys" + VERSION + ".txt",
        */
    };
 
    /**
     * File buffer size, used to make reads faster.
     */
    private static final int BUFFER_SIZE = 64*1024;
    
// =============================================================
// Collation Element Memory Data Table Formats
// =============================================================

    /**
     * Temporary buffer used in getSortKey for the decomposed string
     */
    StringBuffer decompositionBuffer = new StringBuffer();
    
    /**
     * The collation element data is stored a couple of different structures.
     * First is collationElements, which generally contains the 32-bit CE corresponding
     * to the data. It is directly indexed by character code.<br>
     * For brevity in the implementation, we just use a flat array.
     * A real implementation would use a multi-stage table, as described in TUS Section 5.
     * table of simple collation elements, indexed by char.<br>
     * Exceptional cases: expanding, contracting, unsupported are handled as described below.
     */
    int[] collationElements = new int[65536];
    
    /**
     * A special bit combination in a CE is used to reserve exception cases. This has the effect
     * of removing 64 primary key values (0xFFC0..0xFFFF) out of the 65536 possible.
     */
    static final int EXCEPTION_CE_MASK = 0xFFC00000;
    
    /**
     * Used to compose Hangul and Han characters
     */
     
    static final int NEUTRAL_SECONDARY = 0x20;
    static final int NEUTRAL_TERTIARY = 0x02;
       
    /**
     * Any unsupported characters (those not in the UCA data tables) 
     * are marked with an exception bit combination
     * so that they can be treated specially.<br>
     * There are at least 34 values, so that we can use a range for surrogates
     * However, we do add to the first weight if we have surrogate pairs!
     */
    public static final int UNSUPPORTED_BASE = 0xFFC2;
    static final int UNSUPPORTED = makeKey(UNSUPPORTED_BASE, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
    
    // was 0xFFC20101;
    
    /**
     * Contracting characters are marked with an exception bit combination 
     * in the collationElement table.
     * This means that they are the first character of a contraction, and need
     * to be looked up (with following characters) in the contractingTable.<br>
     * This isn't a MASK since there is exactly one value.
     */
    static final int CONTRACTING = 0xFFC10000;

    /**
     * Expanding characters are marked with a exception bit combination
     * in the collationElement table.
     * This means that they map to more than one CE, which is looked up in
     * the expansionTable by index. See EXCEPTION_INDEX_MASK
     */
    static final int EXPANDING_MASK = 0xFFC00000; // marks expanding range start
    
    /**
     * This mask is used to get the index from an EXPANDING exception.
     * The contracting characters can also make use of this in a future optimization.
     */
    static final int EXCEPTION_INDEX_MASK = 0x0000FFFF;
 
    /**
     * We take advantage of the variables being in a closed range to save a bit per CE.
     * The low and high values are initially set to be at the opposite ends of the range,
     * as the table is built from the UCA data, they are narrowed in.
     * The first three values are used in building; the last two in testing.
    */
    int variableLow = '\uFFFF';
    int nonVariableLow = '\uFFFF'; // HACK '\u089A';
    int variableHigh = '\u0000';
    
    int variableLowCE;  // used for testing against
    int variableHighCE; // used for testing against
    
    /**
     * Although a single character can expand into multiple CEs, we don't want to burden
     * the normal case with the storage. So, they get a special value in the collationElements
     * array. This value has a distinct primary weight, followed by an index into a separate
     * table called expandingTable. All of the CEs in that table, up to a TERMINATOR value
     * will be used for the expansion. The implementation is as a stack; this just makes it
     * easy to generate.
     */
    IntStack expandingTable = new IntStack(3600); // initial number is from compKeys
        
    /**
     * For now, this is just a simple mapping of strings to collation elements.
     * The implementation depends on the contracting characters being "completed",
     * so that it can be efficiently determined when to stop looking.
     */
    Hashtable contractingTable = new Hashtable();
    
    /**
     *  Special char value that means failed or terminated
     */
    static final char NOT_A_CHAR = '\uFFFF';
    
    /**
     * Marks whether we are using the full data set, or an abbreviated version for
     * an applet.
     */
     
    private boolean fullData;
    
// =============================================================
// Temporaries used in getCE. 
// Made part of the object to avoid reallocating each time.
// =============================================================

    /**
     * Stack for expanding characters
     */
    private IntStack expandingStack = new IntStack(100);
    
    /**
     * Temporary buffers used in getSortKey to store weights
     * these are NOT strings of Unicode characters--they are
     * lists of weights. But this is a convenient way to store them,
     * since Java doesn't have unsigned shorts.
     */
    private StringBuffer primaries = new StringBuffer(100);
    private StringBuffer secondaries = new StringBuffer(100);
    private StringBuffer tertiaries = new StringBuffer(100);
    private StringBuffer quaternaries = new StringBuffer(100);
    
    /**
     * Temporary buffer used to collect progress data for debugging
     */
    StringBuffer debugList = new StringBuffer(100);
    
    /**
     * Temporary with requested decomposition
     */
    boolean storedDecomposition;
    int hangulHackBottom;
    int hangulHackTop;
    
    /**
     * Used for supporting Thai rearrangement
     */
    static final char EMPTY = '\uFFFF';
    char rearrangeBuffer = EMPTY;
    String rearrangeList = "";
    int hangulBufferPosition = 0;
    StringBuffer hangulBuffer = new StringBuffer();

// =============================================================
// getCE: Get the next Collation Element
// Main Routine
// =============================================================

    /**
     * Gets the next Collation Element from the decomposition buffer.
     * May take one or more characters.
     * Resets index to point at the next position to get characters from.
     * <p>Sources are consulted in this order: CEs already waiting on expandingStack,
     * a character held back by rearrangement, jamo left over from a decomposed
     * Hangul syllable, and finally the next character(s) of decompositionBuffer.
     *@return the next CE; TERMINATOR once the decomposition buffer is exhausted;
     * 0 for unmatched surrogates and for code values whose low 16 bits are FFFE or FFFF
     *@throws IllegalArgumentException if a character marked CONTRACTING has no
     * entry of its own in contractingTable
     */
    private int getCE() {
        // CEs left over from an earlier expansion (or unsupported character) come first.
        if (!expandingStack.isEmpty()) return expandingStack.pop();
        char ch;
        
        // Fetch next character. Handle rearrangement for Thai, etc.
        if (rearrangeBuffer != EMPTY) {
            // a character was held back on the previous call; deliver it now
            ch = rearrangeBuffer;
            rearrangeBuffer = EMPTY;
        } else if (hangulBufferPosition < hangulBuffer.length()) {
            // still delivering the jamo of a decomposed Hangul syllable
            ch = hangulBuffer.charAt(hangulBufferPosition++);
            if (hangulBufferPosition == hangulBuffer.length()) {
                // buffer fully consumed: reset it so this branch is skipped next time
                hangulBuffer.setLength(0);
                hangulBufferPosition = 0;
            }
        } else {
            if (index >= decompositionBuffer.length()) return TERMINATOR;
            ch = decompositionBuffer.charAt(index++); // get next
            if (rearrangeList.indexOf(ch) != -1 && index < decompositionBuffer.length()) {// if in list
                rearrangeBuffer = ch;   // store for later
                ch = decompositionBuffer.charAt(index++);   // never rearrange twice!!
            }
        }
        
        int ce = collationElements[ch];
        
        // Hangul tailoring hack
        //if (!storedDecomposition && hangulHackBottom <= ce && ce < hangulHackTop) return fixJamo(ch, ce);   // hard coded fix!!

        // if the CE is not exceptional (unsupported, contracting, expanding) we are done.
        if ((ce & EXCEPTION_CE_MASK) != EXCEPTION_CE_MASK) return ce;
        
        if (ce == UNSUPPORTED) {
            int bigChar = ch;
            
            // Special check for Hangul
            if (isHangul(bigChar)) {
                // MUST DECOMPOSE!!
                // The jamo go into a fresh hangulBuffer; the recursive call below then
                // takes the hangulBuffer branch above and returns the CE of the first jamo.
                hangulBuffer = new StringBuffer();
                decomposeHangul(bigChar, hangulBuffer);
                return getCE();
                // RECURSIVE!!!
            }
                        
            // Special check for Han, YI
            // (the code point itself is used as the primary weight)
            if (isFixed(bigChar)) {
                return makeKey(bigChar, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
            }
                        
            // special check for unsupported surrogate pair, 20 1/8 bits
            if (0xD800 <= bigChar && bigChar <= 0xDFFF) {
                // ignore unmatched surrogates (e.g. return zero)
                // A trail surrogate on its own, or a lead surrogate at the end of the buffer:
                if (bigChar >= 0xDC00 || index >= decompositionBuffer.length()) return 0; // unmatched
                int ch2 = decompositionBuffer.charAt(index);
                // A lead surrogate that is not followed by a trail surrogate:
                if (ch2 < 0xDC00 || 0xDFFF < ch2) return 0;  // unmatched
                index++; // skip next char
                bigChar = 0x10000 + ((ch - 0xD800) << 10) + (ch2 - 0xDC00); // extract value
            }

            // Matches xxFFFE and xxFFFF in every plane, since only the low 16 bits are tested.
            if ((bigChar & 0xFFFE) == 0xFFFE) { // illegal code value, ignore!!
                return 0;
            }
            
            // The result is 2 CEs. One is UNSUPPORTED + top bits, and the other
            // is a primary that is the next fifteen bits
            // This has the effect of putting all unsupported characters at the end,
            // in code order.
                    // add bottom 5 bits to UNSUPPORTED, and push rest
                    //return UNSUPPORTED + (bigChar & 0xFFFF0000);    // top bits added
            // Second CE: primary = bottom 15 bits of the code point with bit 15 (0x8000) turned on;
            // it is pushed so that the NEXT call to getCE returns it.
            expandingStack.push(makeKey((bigChar & 0x7FFF) | 0x8000, 0, 0));
            // secondary and tertiary are both zero
            // First CE: bigChar >>> 15 is at most 0x21 for code points up to 0x10FFFF, i.e. 34 values.
            return makeKey(UNSUPPORTED_BASE + (bigChar >>> 15), NEUTRAL_SECONDARY, NEUTRAL_TERTIARY); // top 34 values plus UNSUPPORTED
            /*
            expandingStack.push(((bigChar & 0x7FFF) << 16) | 0x10000000); // primary = bottom 15 bits plus turn bottom bit on.
            // secondary and tertiary are both zero
            return UNSUPPORTED + ((bigChar << 1) & 0xFFFF0000); // top 34 values plus UNSUPPORTED
            */
        }
        if (ce == CONTRACTING) {
            // Contracting is probably the most interesting (read "tricky") part
            // of the algorithm.
            // First get longest substring that is in the contracting table.
            // For simplicity, we use a hash table for contracting.
            // There are much better optimizations, 
            // but they take a more complicated build algorithm than we want to show here.
            // NOTE: We are guaranteed that the character itself is in the contracting table because
            // of the build process.
            String probe = String.valueOf(ch);
            Object value = contractingTable.get(probe);
            if (value == null) throw new IllegalArgumentException("Missing value for " + Utility.hex(ch));
            
            // We loop, trying to add successive characters to the longest substring.
            // Stopping at the first miss is only valid because the table is "completed":
            // every prefix of a contraction is itself in the table.
            while (index < decompositionBuffer.length()) {
                char ch2 = decompositionBuffer.charAt(index);
                
                // see whether the current string plus the next char are in
                // the contracting table.
                String newProbe = probe + ch2;
                Object newValue = contractingTable.get(newProbe);
                if (newValue == null) break;    // stop if not in table.
                
                // We succeeded--so update our new values, and set index
                // to indicate that we swallowed another character.
                probe = newProbe;
                value = newValue;
                index++;
            }
            
            // Now, see if we can add any combining marks
            // (this pass scans ahead with i and does NOT advance index)
            short lastCan = 0;
            for (int i = index; i < decompositionBuffer.length(); ++i) {
                // We only take certain characters. They have to be accents,
                // and they have to not be blocked.
                // Unlike above, if we don't find a match (and it was an accent!)
                // then we don't stop, we continue looping.
                char ch2 = decompositionBuffer.charAt(i);
                short can = toD.getCanonicalClass(ch2);
                if (can == 0) break;            // stop with any zero (non-accent)
                if (can == lastCan) continue;   // blocked if same class as last
                lastCan = can;                  // remember for next time
                
                // Now see if we can successfully add it onto our string
                // and find it in the contracting table.
                String newProbe = probe + ch2;
                Object newValue = contractingTable.get(newProbe);
                if (newValue == null) continue;

                // We succeeded--so update our new values and remove the char.
                // The swallowed accent is overwritten in place with U+0000 rather than
                // deleted, so the positions of the remaining characters do not shift.
                probe = newProbe;
                value = newValue;
                decompositionBuffer.setCharAt(i,'\u0000');  // zero char
            }
            
            // we are all done, and can extract the CE from the last value set.
            ce = ((Integer)value).intValue();
            // if the CE is not exceptional (unsupported expanding) we are done.
            // BTW we will never have a contracting CE at this point.
            if ((ce & EXCEPTION_CE_MASK) != EXCEPTION_CE_MASK) return ce;
            // otherwise fall through to expansion
        }
        // expanding, so copy list of items onto stack
        // NOTE: from here on this local "index" shadows the "index" used above as the
        // read position in decompositionBuffer; here it is a position in expandingTable.
        int index = ce & EXCEPTION_INDEX_MASK; // get index
        // copy onto stack from index until reach TERMINATOR
        // (the stack reverses the order, so the entry stored last in the table is returned first)
        while (true) {
            ce = expandingTable.get(index++);
            if (ce == TERMINATOR) break;
            expandingStack.push(ce);
        }
        return expandingStack.pop(); // pop last (guaranteed to exist!)
    }
    
    /**
     * Tells whether a code point lies in one of the ranges that are given a
     * computed weight instead of a table entry.
     * The ranges are U+3400..U+4DB5 and U+4E00..U+9FA5, both inclusive.
     * (The range U+A000..U+A48F is deliberately not included.)
     *@param bigChar the code point to test
     *@return true if bigChar is inside one of the two ranges
     */
    public final boolean isFixed(int bigChar) {
        if (bigChar >= 0x3400 && bigChar <= 0x4DB5) {
            return true;
        }
        if (bigChar >= 0x4E00 && bigChar <= 0x9FA5) {
            return true;
        }
        return false;
    }
    
    /**
     * Tells whether a code point is in the range U+AC00..U+D7A3 (inclusive).
     *@param bigChar the code point to test
     *@return true if bigChar is within that range
     */
    private final boolean isHangul(int bigChar) {
        if (bigChar < 0xAC00) {
            return false;
        }
        return bigChar <= 0xD7A3;
    }
    
    /**
     * Constants for Hangul: the bases and counts used to split a precomposed
     * syllable arithmetically into its jamo.
     */
    static final int
        SBase = 0xAC00,
        LBase = 0x1100,
        VBase = 0x1161,
        TBase = 0x11A7,     // a trailing value equal to TBase means "no trailing jamo"
        LCount = 19,
        VCount = 21,
        TCount = 28,
        NCount = VCount * TCount,   // 588
        SCount = LCount * NCount,   // 11172
        LastInitial = LBase + LCount-1, // last initial jamo
        LastPrimary = SBase + (LCount-1) * VCount * TCount; // last corresponding primary

    /**
     * Appends the jamo decomposition of a Hangul syllable to a buffer: the leading
     * and vowel jamo always, followed by the trailing jamo when there is one.
     *@param s the syllable, which must be in the range SBase .. SBase + SCount - 1
     *@param result the buffer to append to
     *@return result, for chaining
     *@throws IllegalArgumentException if s is outside the syllable range
     */
    public static StringBuffer decomposeHangul(int s, StringBuffer result) {
        final int offset = s - SBase;
        if (offset < 0 || offset >= SCount) {
            throw new IllegalArgumentException("Non-Hangul Syllable");
        }
        final int leadIndex = offset / NCount;
        final int vowelIndex = (offset % NCount) / TCount;
        final int trailIndex = offset % TCount;

        result.append((char)(LBase + leadIndex));
        result.append((char)(VBase + vowelIndex));
        if (trailIndex != 0) {
            result.append((char)(TBase + trailIndex));
        }
        return result;
    }
   
    /**
     * Fix for Hangul, since the tables are not set up right.
     * The fix for Hangul is to give different values to the combining initial 
     * Jamo to put them up into the AC00 range, as follows. Each one is put
     * after the first syllable it begins.
     *
    private int fixJamo(char ch, int jamoCe) {
        
        int result = jamoCe - hangulHackBottom + 0xAC000000; // put into right range
        if (DEBUG) System.out.println("\tChanging " + hex(ch) + " " + hex(jamoCe) + " => " + hex(result));
        return result;
        /*
        int newPrimary;
        int LIndex = jamo - LBase;
        if (LIndex < LCount) {
            newPrimary = SBase + (LIndex + 1) * VCount * TCount; // multiply to match syllables
        } else {
            newPrimary = LastPrimary + (jamo - LastInitial); // just shift up
        }
        return makeKey(newPrimary, 0x21, 0x2);  // make secondary difference!
        * /
    }
    */
    
// =============================================================
// Building Collation Element Tables
// =============================================================

    /**
     * Value for returning int as well as function return,
     * since Java doesn't have output parameters.
     * Element 0 is the parse position within the current data line; getChar and
     * getCEFromLine read it on entry and advance it past what they consumed.
     */
    private int[] position = new int[1]; 
    
    /**
     * For recording statistics.
     * getCEFromLine updates these counters for each CE it parses, and cleanup()
     * prints count1, max2 and max3 when SHOW_STATS is set.
     */
    private int count1 = 0, count2 = 0, count3 = 0, max2 = 0, max3 = 0;
    private int oldKey1 = -1, oldKey2 = -1, oldKey3 = -1;
    // Contraction strings seen while RECORDING_CHARS is on; the strings are the keys
    // and every value is the empty string.
    Map multiTable = new TreeMap();
    // Characters seen in the data file while RECORDING_CHARS is on.
    BitSet found = new BitSet();
    
    /**
     * Returns the recorded contractions.
     *@return a new Hashtable holding a copy of the entries of multiTable;
     * changes to the returned table do not affect this object
     */
    public Hashtable getContracting() {
        Hashtable copy = new Hashtable(multiTable);
        return copy;
    }
    
    /**
     * Creates an iterator-like object over the contents of this collator.
     *@param ceLimit passed to the UCAContents constructor as its CE type limit
     *@param skipDecomps passed to the UCAContents constructor; may be null
     *@return a new UCAContents positioned before the first item
     */
    public UCAContents getContents(byte ceLimit, Normalizer skipDecomps) {
        UCAContents contents = new UCAContents(ceLimit, skipDecomps);
        return contents;
    }
    
    public class UCAContents {
        int current = -1;
        Normalizer skipDecomps = new Normalizer(Normalizer.NFD);
        Normalizer nfd = skipDecomps;
        Iterator enum = null;
        byte ceLimit;
        int currentRange = Integer.MAX_VALUE; // set to ZERO to enable
        int startOfRange = SAMPLE_RANGES[0][0];
        int endOfRange = startOfRange;
        int itemInRange = startOfRange;
        int skip = 1;
        
        /**
         * use FIXED_CE as the limit
         */
        UCAContents(byte ceLimit, Normalizer skipDecomps) {
            this.ceLimit = ceLimit;
            this.skipDecomps = skipDecomps;
        }
        
        /**
         * use FIXED_CE as the limit
         */
        public void enableSamples() {
            currentRange = 0;
        }
        
        /**
         * returns a string
         */
        public String next() {
            String result = null; // null if done
            
            // normal case
            while (current++ < 0x10FFFF) {
                //char ch = (char)current;
                byte type = getCEType(current);
                
                if (!nfd.normalizationDiffers(current) || type == HANGUL_CE) {
                    if (type >= ceLimit) continue;
                    if (skipDecomps != null && skipDecomps.hasDecomposition(current)) continue;
                }
                result = UTF16.valueOf(current);
                return result;
            }
            
            // contractions
            if (enum == null) enum = multiTable.keySet().iterator();
            if (enum.hasNext()) {
                result = (String)enum.next();
                return result;
            }
            
            // extra samples
            if (currentRange < SAMPLE_RANGES.length) {
                try {
                    result = UTF16.valueOf(itemInRange);
                } catch (RuntimeException e) {
                    System.out.println(Utility.hex(itemInRange));
                    throw e;
                }
                ++itemInRange;
                if (itemInRange > endOfRange) {
                    ++currentRange;
                    if (currentRange < SAMPLE_RANGES.length) {
                        startOfRange = itemInRange = SAMPLE_RANGES[currentRange][0];
                        endOfRange = SAMPLE_RANGES[currentRange].length > 1
                            ? SAMPLE_RANGES[currentRange][1]
                            : startOfRange;
                        skip = ((endOfRange - startOfRange) / 513);
                    }
                } else if (itemInRange > startOfRange + 9 && itemInRange < endOfRange - 9 - skip) {
                    itemInRange += skip;
                }
            }
            
            return result;
        }
        
        /**
         * returns a string and its ces
         */
        public String next(int[] ces, int[] len) {

            String result = next(); // null if done
            if (result != null) {
                len[0] = getCEs(result, true, ces);
            }
            return result;
        }
        
        int[] lengthBuffer = new int[1];
        
        /**
         * returns a string and its ces
         */
        public boolean next(Pair result) {
            String s = next(ceListBuffer, lengthBuffer);
            if (s == null) return false;
            result.first = new CEList(ceListBuffer, 0, lengthBuffer[0]);
            result.second = s;
            return true;
        }
    }
    
    // Code points returned as extra samples by UCAContents.next() once samples are enabled.
    // A row with one element is a single code point; a row with two elements is an
    // inclusive range {start, end}.
    // Note that 0x10FFFF is listed twice.
    static final int[][] SAMPLE_RANGES = {
                {0x10000},
                {0x10FFFF},
                {0x0220},
                {0xFFF0}, 
                {0xD800},
                {0xDFFF},
                {0xFFFE},
                {0xFFFF},
                {0x10FFFE},
                {0x10FFFF},
                {0x3400, 0x4DB5},
                {0x4E00, 0x9FA5},
                {0xAC00, 0xD7A3},
                {0xA000, 0xA48C},
                {0xE000, 0xF8FF},
                {0x20000, 0x2A6D6},
                {0xE0000, 0xE00FF},
                {0xF0000, 0xF00FD},
                {0xFFF00, 0xFFFFD},
                {0x100000, 0x1000FD},
                {0x10FF00, 0x10FFFD},
    };
                
    /**
     * Adds the collation elements from a file (or other stream) in the UCA format.
     * Values will override any previous mappings.
     * <p>Each non-empty line (after comment removal) is either an "@version" line,
     * an "@rearrange" line, or a data line: one or more hex code points followed
     * by one or more bracketed CEs. More than one CE makes the entry an expansion;
     * more than one character makes it a contraction.
     *@param in the source of lines; it is read until readLine returns null and is not closed here
     *@throws java.io.IOException if reading from in fails
     *@throws RuntimeException whatever the line parsing throws; the offending line is
     * printed to System.out first
     */
    private void addCollationElements(BufferedReader in) throws java.io.IOException {
        IntStack tempStack = new IntStack(100); // used for reversal
        StringBuffer multiChars = new StringBuffer(); // used for contracting chars
        String inputLine = "";
        // The try is the body of the loop, so a failure reports the line being processed.
        while (true) try {
            inputLine = in.readLine();
            if (inputLine == null) break;       // means file is done
            String line = cleanLine(inputLine); // remove comments, extra whitespace
            if (line.length() == 0) continue;   // skip empty lines

            position[0] = 0;                    // start at front of line
            if (line.startsWith("@version")) {
                dataVersion = line.substring("@version".length()+1).trim();
                continue;
            }
            
            if (line.startsWith("@rearrange")) {
                // the rest of the line is a list of characters to add to rearrangeList
                line = line.substring("@rearrange".length()+1).trim();
                while (position[0] < line.length()) {
                    rearrangeList += getChar(line, position);
                }
                continue;
            }
            
            // collect characters
            // value is the first char of the entry; multiChars stays empty unless the
            // entry has at least two chars (a supplementary code point counts as two,
            // because getChar delivers it as a surrogate pair).
            char value = getChar(line, position);
            fixSurrogateContraction(value);
            char value2 = getChar(line, position);
            multiChars.setLength(0);            // clear buffer
            if (value2 != NOT_A_CHAR) {
                fixSurrogateContraction(value2);
                multiChars.append(value);       // append until we get terminator
                multiChars.append(value2);
                while (true) {
                    value2 = getChar(line, position);
                    if (value2 == NOT_A_CHAR) break;
                    fixSurrogateContraction(value2);
                    multiChars.append(value2);
                }
            }
            if (RECORDING_CHARS) {
                if (multiChars.length() > 1) {
                    multiTable.put(multiChars.toString(), "");
                }
                found.set(value);
                // index 0 of multiChars is value itself, which was just recorded
                for (int i = 1; i < multiChars.length(); ++i) {
                    found.set(multiChars.charAt(i));
                }
            }
            if (!fullData && RECORDING_DATA) {
                // echo the lines for a small set of characters as Java string literal pieces
                if (value == 0 || value == '\t' || value == '\n' || value == '\r'
                  || (0x20 <= value && value <= 0x7F)
                  || (0x80 <= value && value <= 0xFF)
                  || (0x300 <= value && value <= 0x3FF)
                  ) {
                    System.out.println("    + \"" + inputLine + "\\n\"");
                }
            }
            // for recording information
            boolean record = true;
            /* if (multiChars.length() > 0) record = false;
            else */
            if (toD.hasDecomposition(value)) record = false;
            
            // collect CEs
            // ce2 is TERMINATOR when the line holds only one CE
            int ce = getCEFromLine(value, line, position, record);
            int ce2 = getCEFromLine(value, line, position, record);
            if (CHECK_UNIQUE && (ce2 == TERMINATOR || CHECK_UNIQUE_EXPANSIONS)) {
                if (!CHECK_UNIQUE_VARIABLES) {
                    checkUnique(value, ce, 0, inputLine); // only need to check first value
                } else {
                    int key1 = ce >>> 16;
                    if (isVariable(ce)) {
                        checkUnique(value, 0, key1, inputLine); // only need to check first value
                    }
                }
            }
            if (ce2 != TERMINATOR) { // have expanding character!
                // put list into the expanding table
                // use a temporary stack to get them in reverse order
                tempStack.push(ce);
                tempStack.push(ce2);
                // set collationElement to exception value, plus index
                ce = EXPANDING_MASK | expandingTable.getTop();
                while (true) {
                    ce2 = getCEFromLine(value, line, position, record);
                    if (ce2 == TERMINATOR) break;
                    tempStack.push(ce2);
                } 
                // push onto expanding table, now in reverse order
                // (getCE reverses them again when it copies them onto its own stack)
                while (!tempStack.isEmpty()) expandingTable.push(tempStack.pop());
                expandingTable.push(TERMINATOR);
            }
            
            // assign CE(s) to char(s)
            if (multiChars.length() > 0) {
                // contraction: the string goes into the table and its first char is marked
                contractingTable.put(multiChars.toString(), new Integer(ce));
                if (collationElements[value] == UNSUPPORTED) {
                    collationElements[value] = CONTRACTING; // mark special
                } else if (collationElements[value] != CONTRACTING) {
                    // move old value to contracting table!
                    contractingTable.put(String.valueOf(value), new Integer(collationElements[value]));
                    collationElements[value] = CONTRACTING; // signal we must look up in table
                }
            } else if (collationElements[value] == CONTRACTING) {
                // single char that already starts a contraction:
                // its own value has to live in the contracting table
                contractingTable.put(String.valueOf(value), new Integer(ce));
            } else {
                collationElements[value] = ce; // normal
            }
        //} catch (Exception e) {
          //  throw new IllegalArgumentException("Malformed line: " + inputLine + "\n " 
            //  + e.getClass().getName() + ": " + e.getMessage());
        } catch (RuntimeException e) {
            System.out.println("Error on line: " + inputLine);
            throw e;
        }
    }
    
    /**
     * Makes sure that a lead surrogate has an entry of its own in the contracting
     * table. If ch is a lead surrogate without an entry, it is entered with the
     * value 0; in every other case nothing happens.
     *@param ch the character just read from the data file (may be NOT_A_CHAR)
     */
    private void fixSurrogateContraction(char ch) {
        //if (DEBUGCHAR) System.out.println(Utility.hex(ch) + ": " + line.substring(0, position[0]) + "|" + line.substring(position[0]));            
        if (ch != NOT_A_CHAR && UTF16.isLeadSurrogate(ch)) {
            String key = String.valueOf(ch);
            if (contractingTable.get(key) == null) {
                contractingTable.put(key, new Integer(0));
            }
        }
    }
    
    /**
     * Placeholder with an empty body: it does nothing with its arguments.
     * NOTE(review): the only reference visible nearby is inside commented-out code
     * in cleanup(), which would need a return value -- confirm before using it.
     */
    private void concat(int[] ces1, int[] ces2) {
        
    }
    
    /**
     * Enters the CE(s) for a string into the internal tables, in the same way as
     * addCollationElements does for a line of the data file.
     * <ul>
     * <li>More than one CE: the CEs are stored in expandingTable and the string is
     * given an expanding CE pointing at them. They are stored in REVERSE order and
     * followed by TERMINATOR, because getCE copies entries up to TERMINATOR onto a
     * stack and then pops them.</li>
     * <li>More than one character: the string is entered in contractingTable and its
     * first character is marked CONTRACTING (moving any previous value of that
     * character into contractingTable).</li>
     * <li>One character: the CE goes into contractingTable if the character is
     * already marked CONTRACTING, otherwise directly into collationElements.</li>
     * </ul>
     *@param source the string to enter; must not be empty
     *@param ces the CEs of source, in collation order
     *@param ceLen the number of valid entries in ces; must be at least 1
     *@throws IllegalArgumentException if ceLen is less than 1 or source is empty
     */
    private void add(String source, int[] ces, int ceLen) {
        
        if (ceLen < 1) {
            throw new IllegalArgumentException("CE too short: " + ceLen);
        }
        if (source.length() == 0) {
            throw new IllegalArgumentException("Empty source string");
        }
        
        int ce;
        if (ceLen == 1) {
            ce = ces[0];
        } else {
            ce = EXPANDING_MASK | expandingTable.getTop();
            // reverse order, then the terminator that getCE scans for
            for (int i = ceLen - 1; i >= 0; --i) {
                expandingTable.push(ces[i]);
            }
            expandingTable.push(TERMINATOR);
        }
        
        // assign CE(s) to char(s)
        
        // must be a char, not an int: String.valueOf(int) would yield decimal digits
        char value = source.charAt(0);
        if (source.length() > 1) {
            contractingTable.put(source, new Integer(ce));
            if (collationElements[value] == UNSUPPORTED) {
                collationElements[value] = CONTRACTING; // mark special
            } else if (collationElements[value] != CONTRACTING) {
                // move old value to contracting table!
                contractingTable.put(String.valueOf(value), new Integer(collationElements[value]));
                collationElements[value] = CONTRACTING; // signal we must look up in table
            }
        } else if (collationElements[value] == CONTRACTING) {
            // must add value to contracting table!
            contractingTable.put(source, new Integer(ce));
        } else {
            collationElements[value] = ce; // normal
        }
    }
    
    /**
     * Checks the internal tables corresponding to the UCA data.
     */
    private void cleanup() {
        
        // at this point, we have to guarantee that the contractingTable is CLOSED
        // e.g. if a substring of length n is in the table, then the first n-1 characters
        // are also!!
        
        
/*
0FB2 0F71 ; [.124E.0020.0002.0FB2][.125F.0020.0002.0F71] # TIBETAN SUBJOINED LETTER RA + TIBETAN VOWEL SIGN AA
0FB3 0F71 ; [.1250.0020.0002.0FB3][.125F.0020.0002.0F71] # TIBETAN SUBJOINED LETTER LA + TIBETAN VOWEL SIGN AA
        int[] temp1 = int[20];
        int[] temp2 = int[20];
        int[] temp3 = int[20];
        getCEs("\u0fb2", true, temp1);
        getCEs("\u0fb3", true, temp2);
        getCEs("\u0f71", true, temp3);
        add("\u0FB2\u0F71", concat(temp1, temp3));
*/
        
        Hashtable missingStrings = new Hashtable();
        
        int[] temp1 = new int[20];
        Enumeration enum = contractingTable.keys();
        while (enum.hasMoreElements()) {
            String sequence = (String)enum.nextElement();
            //System.out.println("Contraction: " + Utility.hex(sequence));
            for (int i = sequence.length()-1; i > 0; --i) {
                String shorter = sequence.substring(0,i);
                Object probe = contractingTable.get(shorter);
                if (probe == null) {
                    int len = getCEs(shorter, true, temp1);
                    System.out.println("WARNING: CLOSING: " + UCD.make().getCodeAndName(shorter) + " => " + ceToString(temp1, len));
                    add(shorter, temp1, len);
                    // missingStrings.put(shorter,"");
                    // collationElements[sequence.charAt(0)] = UNSUPPORTED; // nuke all bad values
                }
            }
        }
        
        enum = missingStrings.keys();
        if (missingStrings.size() != 0) {
            /**
            while (enum.hasMoreElements()) {
                String sequence = (String)enum.nextElement();
                getCE(sequence);
                FIX LATER;
            }
            */
            String errorMessage = "";
            while (enum.hasMoreElements()) {
                String missing = (String)enum.nextElement();
                if (errorMessage.length() != 0) errorMessage += ", ";
                errorMessage += "\"" + missing + "\"";
            }
            throw new IllegalArgumentException("Contracting table not closed! Missing " + errorMessage);
        }
        
        //fixlater;
        variableLowCE = variableLow << 16;
        variableHighCE = (variableHigh << 16) | 0xFFFF; // turn on bottom bits
        
        hangulHackBottom = collationElements[0x1100] & 0xFFFF0000; // remove secondaries & tertiaries
        hangulHackTop = collationElements[0x11F9] | 0xFFFF; // bump up secondaries and tertiaries
        if (SHOW_STATS) System.out.println("\tHangul Hack: " + Utility.hex(hangulHackBottom) + ", " + Utility.hex(hangulHackTop));
        
        // show some statistics
        if (SHOW_STATS) System.out.println("\tcount1: " + count1);
        if (SHOW_STATS) System.out.println("\tcount2: " + max2);
        if (SHOW_STATS) System.out.println("\tcount3: " + max3);
        
        if (SHOW_STATS) System.out.println("\tMIN1/MAX1: " + Utility.hex(MIN1) + "/" + Utility.hex(MAX1));
        if (SHOW_STATS) System.out.println("\tMIN2/MAX2: " + Utility.hex(MIN2) + "/" + Utility.hex(MAX2));
        if (SHOW_STATS) System.out.println("\tMIN3/MAX3: " + Utility.hex(MIN3) + "/" + Utility.hex(MAX3));
        
        if (SHOW_STATS) System.out.println("\tVar Min/Max: " + Utility.hex(variableLow) + "/" + Utility.hex(variableHigh));
        if (SHOW_STATS) System.out.println("\tNon-Var Min: " + Utility.hex(nonVariableLow));
        
        if (SHOW_STATS) System.out.println("\trenumberedVariable: " + renumberedVariable);
    }
    
    /**
     * Remove comments, extra whitespace.
     * Everything from the first '#' onward is dropped, then everything from the
     * first '%' onward in what remains, and the result is trimmed.
     *@param line the raw input line
     *@return the line without comments and without leading/trailing whitespace
     */
    private String cleanLine(String line) {
        String text = line;
        char[] commentMarkers = {'#', '%'};
        for (int i = 0; i < commentMarkers.length; ++i) {
            int cut = text.indexOf(commentMarkers[i]);
            if (cut >= 0) {
                text = text.substring(0, cut);
            }
        }
        return text.trim();
    }
    
    // NOTE in case of surrogates, we buffer up the second character!!
    // 0 means that nothing is buffered.
    char charBuffer = 0;
    
    /**
     * Get a char from a line, of form: (<space> | <comma>)* <hex>*
     * Only the digits 0-9 and the upper-case letters A-F count as hex digits, and
     * at least four of them are required. A value above 0xFFFF is delivered as two
     * chars: the lead surrogate now and the trail surrogate on the next call
     * (which then does not look at line or position).
     *@param line the line to read from
     *@param position on input, the place to start at. 
     * On output, updated to point to the next place to search.
     * It is left unchanged when NOT_A_CHAR is returned.
     *@return the character, or NOT_A_CHAR when done
     */
    private char getChar(String line, int[] position) {
        if (charBuffer != 0) {
            char pending = charBuffer;
            charBuffer = 0;
            return pending;
        }
        final int length = line.length();
        int start = position[0];
        while (true) { // trim whitespace
            if (start >= length) return NOT_A_CHAR;
            char ch = line.charAt(start);
            if (ch != ' ' && ch != ',') break;
            start++;
        }
        // from above, we have at least one char
        // Scan the run of hex digits, stopping at the end of the line as well: the
        // number may be the last thing on the line (e.g. the end of an @rearrange list).
        int hexLimit = start;
        while (hexLimit < length) {
            char ch = line.charAt(hexLimit);
            if (!((ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F'))) break;
            hexLimit++;
        }
        if (hexLimit >= start + 4) {
            position[0] = hexLimit;
            int cp = Integer.parseInt(line.substring(start,hexLimit),16);
            if (cp <= 0xFFFF) return (char)cp;
            //DEBUGCHAR = true;
            charBuffer = UTF16.getTrailSurrogate(cp);
            return UTF16.getLeadSurrogate(cp);
        }
        
        return NOT_A_CHAR; 
    }
    
    // Debugging switch. NOTE(review): only referenced from commented-out code nearby.
    boolean DEBUGCHAR = false;    
    
    // The weights seen at each level. getCEFromLine sets a bit for each weight it
    // parses when recording is requested; writeUsedWeights reports them.
    BitSet primarySet = new BitSet();
    BitSet secondarySet = new BitSet();
    BitSet tertiarySet = new BitSet();
    
    /**
     * Writes the weights that were recorded for one level.
     * For a strength above 1, one line is written per recorded weight, formatted
     * from {weight in hex, count for that weight}. Otherwise one line is written
     * per run of consecutive recorded weights, formatted from {first weight in hex,
     * last weight in hex (empty if the run has one element), size of the run,
     * running total}.
     *@param p where the lines are written
     *@param strength 1 selects the primary weights, 2 the secondary weights,
     * anything else the tertiary weights
     *@param mf formats the arguments of each line
     *@return the number of recorded weights that were written
     */
    public int writeUsedWeights(PrintWriter p, int strength, MessageFormat mf) {
        BitSet weights;
        if (strength == 1) {
            weights = primarySet;
        } else if (strength == 2) {
            weights = secondarySet;
        } else {
            weights = tertiarySet;
        }

        int total = 0;

        if (strength > 1) {
            // individual weights, each with its count
            for (int weight = 0; weight <= weights.length(); ++weight) {
                if (!weights.get(weight)) continue;
                total++;
                Object[] args = {Utility.hex((char)weight), new Integer(stCounts[strength][weight])};
                p.println(mf.format(args));
            }
            return total;
        }

        // runs of weights; going one past the highest set bit closes the final run
        int runStart = -1;
        for (int weight = 0; weight <= weights.length(); ++weight) {
            if (weights.get(weight)) {
                if (runStart == -1) runStart = weight;
                continue;
            }
            if (runStart == -1) continue;   // not inside a run
            int runEnd = weight - 1;
            int runSize = runEnd - runStart + 1;
            total += runSize;
            String runEndText = runEnd == runStart ? "" : Utility.hex((char)runEnd);
            Object[] args = {Utility.hex((char)runStart), runEndText, new Integer(runSize), new Integer(total)};
            p.println(mf.format(args));
            runStart = -1;
        }
        return total;
    }
    
    // How often each secondary / tertiary weight was recorded by getCEFromLine.
    // The sizes leave room for secondaries up to 0x1FF and tertiaries up to 0x7F.
    int[] secondaryCount = new int[0x200];
    int[] tertiaryCount = new int[0x80];
    // The count tables indexed by strength; only strengths 2 and 3 have one.
    int[][] stCounts = {null, null, secondaryCount, tertiaryCount};
    
    /*
     * Description of getCEFromLine, which is declared right after the two
     * warning flags below:
     *
     * Gets a CE from a UCA format line
     *@param value the first character for the line. Just used for statistics.
     *@param line a string of form "[.0000.0000.0000.0000]..."
     *@param position on input, the place to start at. 
     * On output, updated to point to the next place to search.
     */
    
    // Each flag is set when the corresponding "BAD DATA" warning is first printed by
    // getCEFromLine, so that the warning appears only once.
    boolean haveVariableWarning = false;
    boolean haveZeroVariableWarning = false;
    
    private int getCEFromLine(char value, String line, int[] position, boolean record) {
        int start = line.indexOf('[', position[0]);
        if (start == -1) return TERMINATOR;
        boolean variable = line.charAt(start+1) == '*';
        int key1 = Integer.parseInt(line.substring(start+2,start+6),16);
        if (key1 == 0x1299) {
            System.out.println("\t1299");
        }
        int key2 = Integer.parseInt(line.substring(start+7,start+11),16);
        int key3 = Integer.parseInt(line.substring(start+12,start+16),16);
        if (record) {
            primarySet.set(key1);
            secondarySet.set(key2);
            secondaryCount[key2]++;
            tertiarySet.set(key3);
            tertiaryCount[key3]++;
        }
        if (key1 == 0 && variable) {
            if (!haveZeroVariableWarning) {
                System.out.println("\tBAD DATA: Zero L1s cannot be variable!!: " + line);
                haveZeroVariableWarning = true;
            }
            variable = false; // FIX DATA FILE
        }
        if (key2 > 0x1FF) {
            throw new IllegalArgumentException("Weight2 doesn't fit: " + Utility.hex(key2) + "," + line);
        }
        if (key3 > 0x7F) {
            throw new IllegalArgumentException("Weight3 doesn't fit: " + Utility.hex(key3) + "," + line);
        }
        // adjust variable bounds, if needed
        if (variable) {
            if (key1 > nonVariableLow) {
                if (!haveVariableWarning) {
                    System.out.println("\tBAD DATA: Variable overlap, nonvariable low: "
                    + Utility.hex(nonVariableLow) + ", line: \"" + line + "\"");
                    haveVariableWarning = true;
                }
            } else {
                if (key1 < variableLow) variableLow = key1;
                if (key1 > variableHigh) variableHigh = key1;
            }
        } else if (key1 != 0) { // not variable, not zero
            if (key1 < variableHigh) {
                if (!haveVariableWarning) {
                    System.out.println("\tBAD DATA: Variable overlap, variable high: "
                    + Utility.hex(variableHigh) + ", line: \"" + line + "\"");
                    haveVariableWarning = true;
                }
            } else {
                if (key1 < nonVariableLow) nonVariableLow = key1;
            }
        }
            
        // statistics
        count1++;
        if (key1 != oldKey1) {
            oldKey1 = key1;
            if (count2 > max2) max2 = count2;
            if (count3 > max3) max3 = count3;
            count2 = count3 = 1;
        } else {
            count2++;
            if (key2 != oldKey2) {
                oldKey2 = key2;
                if (count3 > max3) max3 = count3;
                count3 = 1;
            } else {
                count3++;
            }
        }
        position[0] = start + 17;
        /*
        if (VARIABLE && variable) {
            key1 = key2 = key3 = 0;
            if (CHECK_UNIQUE) {
                if (key1 != lastUniqueVariable) renumberedVariable++;
                result = renumberedVariable;     // push primary down
                lastUniqueVariable = key1;
                key3 = key1;
                key1 = key2 = 0;
            }
        }
        */
        // gather some statistics
        if (key1 != 0 && key1 < MIN1) MIN1 = (char)key1;
        if (key2 != 0 && key2 < MIN2) MIN2 = (char)key2;
        if (key3 != 0 && key3 < MIN3) MIN3 = (char)key3;
        if (key1 > MAX1) MAX1 = (char)key1;
        if (key2 > MAX2) MAX2 = (char)key2;
        if (key3 > MAX3) MAX3 = (char)key3;
        return makeKey(key1, key2, key3);
    }
    
    /**
     * Just for statistics.
     * NOTE(review): in the part of the file visible here, lastUniqueVariable and
     * renumberedVariable appear only inside a commented-out section of
     * getCEFromLine -- check the rest of the class before relying on or removing them.
     */
    int lastUniqueVariable = 0;
    int renumberedVariable = 50;
    // MIN1..MIN3: smallest non-zero first/second/third weight seen by getCEFromLine.
    // MAX1..MAX3: largest first/second/third weight seen by getCEFromLine.
    char MIN1 = '\uFFFF'; // start large; will be reset as table is built
    char MIN2 = '\uFFFF'; // start large; will be reset as table is built
    char MIN3 = '\uFFFF'; // start large; will be reset as table is built
    char MAX1 = '\u0000'; // start small; will be reset as table is built
    char MAX2 = '\u0000'; // start small; will be reset as table is built
    char MAX3 = '\u0000'; // start small; will be reset as table is built
    
    /**
     * Used for checking data file integrity.
     * Filled by checkUnique: maps a Long key (built from a CE and its fourth
     * value) to the Character that was first registered under that key.
     */
    private Hashtable uniqueTable = new Hashtable();
    
    /**
     * Used for checking data file integrity: prints a report when a character's
     * CE (together with its fourth value) is already registered for another character.
     * The first character seen for a given CE is the one that stays registered.
     *@param value the character whose CE is being checked
     *@param result the CE for the character
     *@param fourth the fourth value, OR-ed into the low end of the lookup key
     *@param line the data file line, echoed in the report
     */
    private void checkUnique(char value, int result, int fourth, String line) {
        // don't check decomposables.
        if (!toD.hasDecomposition(value)) {
            long packed = ((long)result << 16) | fourth;
            Object key = new Long(packed);
            Object previous = uniqueTable.get(key);
            if (previous == null) {
                // first character seen with this CE: remember it
                uniqueTable.put(key, new Character(value));
            } else {
                char earlier = ((Character)previous).charValue();
                System.out.println("\tCE(" + Utility.hex(value) 
                  + ")=CE(" + Utility.hex(earlier) + "); " + line);
            }
        }
    }
}