scuffed-code/tools/unicodetools/com/ibm/text/UCA/UCA_Data.java

/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA_Data.java,v $
* $Date: 2004/03/11 19:03:19 $
* $Revision: 1.2 $
*
*******************************************************************************
*/

package com.ibm.text.UCA;

import java.util.*;
import java.io.BufferedReader;
import java.io.Reader;
import java.io.PrintWriter;
import java.io.FileReader;
import java.text.MessageFormat;
import java.io.IOException;
import com.ibm.text.UCD.Normalizer;
import com.ibm.text.UCD.UCD;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

public class UCA_Data implements UCA_Types {
    static final boolean DEBUG = false;
    static final boolean DEBUG_SHOW_ADD = false;

    private Normalizer toD;
    private UCD ucd;

    public UCA_Data(Normalizer toD, UCD ucd) {
        this.toD = toD;
        this.ucd = ucd;
    }

    /**
     * The collation element data is stored a couple of different structures.
     * First is collationElements, which generally contains the 32-bit CE corresponding
     * to the data. It is directly indexed by character code.<br>
     * For brevity in the implementation, we just use a flat array.
     * A real implementation would use a multi-stage table, as described in TUS Section 5.
     * table of simple collation elements, indexed by char.<br>
     * Exceptional cases: expanding, contracting, unsupported are handled as described below.
     */
    private int[] collationElements = new int[65536];

    /**
     * Although a single character can expand into multiple CEs, we don't want to burden
     * the normal case with the storage. So, they get a special value in the collationElements
     * array. This value has a distinct primary weight, followed by an index into a separate
     * table called expandingTable. All of the CEs in that table, up to a TERMINATOR value
     * will be used for the expansion. The implementation is as a stack; this just makes it
     * easy to generate.
     */
    private IntStack expandingTable = new IntStack(3600); // initial number is from compKeys

    /**
     * For now, this is just a simple mapping of strings to collation elements.
     * The implementation depends on the contracting characters being "completed",
     * so that it can be efficiently determined when to stop looking.
     */
    private Map contractingTable = new TreeMap();

    {
        // clear some tables
        for (int i = 0; i < collationElements.length; ++i) {
            collationElements[i] = UNSUPPORTED_FLAG;
        }
        // preload with parts
        for (char i = 0xD800; i < 0xDC00; ++i) {
            collationElements[i] = CONTRACTING;
            addToContractingTable(String.valueOf(i), UNSUPPORTED_FLAG);
        }
        checkConsistency();
    }

    /**
     * Return the type of the CE
     */
    public byte getCEType(int ch) {
        if (ch > 0xFFFF) ch = UTF16.getLeadSurrogate(ch); // first if expands

        int ce = collationElements[ch];
        if (ce == UNSUPPORTED_FLAG) {

            // Special check for Han, Hangul
            if (ucd.isHangulSyllable(ch)) return HANGUL_CE;

            if (ucd.isCJK_BASE(ch)) return CJK_CE;
            if (ucd.isCJK_AB(ch)) return CJK_AB_CE;

            // special check for unsupported surrogate pair, 20 1/8 bits
            //if (0xD800 <= ch && ch <= 0xDFFF) {
            //    return SURROGATE_CE;
            //}
            return UNSUPPORTED_CE;
        }
        if (ce == CONTRACTING) return CONTRACTING_CE;
        if ((ce & EXPANDING_MASK) == EXPANDING_MASK) return EXPANDING_CE;
        return NORMAL_CE;
    }

    public void add(String source, IntStack ces) {
        add(new StringBuffer(source), ces);
    }

    public void add(StringBuffer source, IntStack ces) {

        if (DEBUG_SHOW_ADD) {
            System.out.println("Adding: " + ucd.getCodeAndName(source.toString()) + CEList.toString(ces));
        }
        if (source.length() < 1 || ces.length() < 1) {
            throw new IllegalArgumentException("String or CEs too short");
        }

        int ce;
        if (ces.length() == 1) {
            ce = ces.get(0);
        } else {
            ce = EXPANDING_MASK | expandingTable.getTop();
            expandingTable.append(ces);
            expandingTable.append(TERMINATOR);
        }

        // assign CE(s) to char(s)
        char value = source.charAt(0);
        //if (value == 0x10000) System.out.print("DEBUG2: " + source);

        if (source.length() > 1) {
            addToContractingTable(source, ce);
            if (collationElements[value] == UNSUPPORTED_FLAG) {
                collationElements[value] = CONTRACTING; // mark special
            } else if (collationElements[value] != CONTRACTING) {
                // move old value to contracting table!
                //contractingTable.put(String.valueOf(value), new Integer(collationElements[value]));
                addToContractingTable(String.valueOf(value), collationElements[value]);
                collationElements[value] = CONTRACTING; // signal we must look up in table
            }
        } else if (collationElements[value] == CONTRACTING) {
            // must add old value to contracting table!
            addToContractingTable(source, ce);
            //contractingTable.put(source, new Integer(ce));
        } else {
            collationElements[source.charAt(0)] = ce; // normal
        }
        //if (DEBUG) checkConsistency();
    }

    boolean isCompletelyIgnoreable(int cp) {
        int ce = collationElements[cp < UTF16.SUPPLEMENTARY_MIN_VALUE ? cp : UTF16.getLeadSurrogate(cp)];
        if (ce == 0) return true;
        if (ce != CONTRACTING) return false;
        Object newValue = contractingTable.get(UTF16.valueOf(cp));
        if (newValue == null) return false;
        return ((Integer)newValue).intValue() == 0;
    }

    // returns new pos, fills in result.
    public int get(char ch, StringBuffer decompositionBuffer, int index, IntStack result) {
        int ce = collationElements[ch];

        if (ce == CONTRACTING) {
            // Contracting is probably the most interesting (read "tricky") part
            // of the algorithm.
            // First get longest substring that is in the contracting table.
            // For simplicity, we use a hash table for contracting.
            // There are much better optimizations,
            // but they take a more complicated build algorithm than we want to show here.
            // NOTE: We are guaranteed that the first code unit is in the contracting table because
            // of the build process.
            String probe = String.valueOf(ch);
            Object value = contractingTable.get(probe);
            if (value == null) throw new IllegalArgumentException("Missing value for " + Utility.hex(ch));

            // complete the first character, if part of supplementary
            if (UTF16.isLeadSurrogate(ch) && index < decompositionBuffer.length()) {
                char ch2 = decompositionBuffer.charAt(index);
                String newProbe = probe + ch2;
                Object newValue = contractingTable.get(newProbe);
                if (newValue != null) {
                    probe = newProbe;
                    value = newValue;
                    index++;
                }
            }

            // We loop, trying to add successive CODE UNITS to the longest substring.
            int cp2;
            while (index < decompositionBuffer.length()) {
                //char ch2 = decompositionBuffer.charAt(index);
                cp2 = UTF16.charAt(decompositionBuffer, index);
                int increment = UTF16.getCharCount(cp2);

                // CHECK if last char was completely ignorable
                if (isCompletelyIgnoreable(cp2)) {
                    index += increment; // just skip char don't set probe, value
                    continue;
                }

                // see whether the current string plus the next char are in
                // the contracting table.
                String newProbe = probe + UTF16.valueOf(cp2);
                Object newValue = contractingTable.get(newProbe);
                if (newValue == null) break;    // stop if not in table.

                // We succeeded--so update our new values, and set index
                // and quaternary to indicate that we swallowed another character.
                probe = newProbe;
                value = newValue;
                index += increment;
            }

            // Now, see if we can add any combining marks
            short lastCan = 0;
            int increment;
            for (int i = index; i < decompositionBuffer.length(); i += increment) {
                // We only take certain characters. They have to be accents,
                // and they have to not be blocked.
                // Unlike above, if we don't find a match (and it was an accent!)
                // then we don't stop, we continue looping.
                cp2 = UTF16.charAt(decompositionBuffer, i);
                increment = UTF16.getCharCount(cp2);
                short can = toD.getCanonicalClass(cp2);
                if (can == 0) break;            // stop with any zero (non-accent)
                if (can == lastCan) continue;   // blocked if same class as last
                lastCan = can;                  // remember for next time

                // CHECK if last char was completely ignorable. If so, skip it.
                if (isCompletelyIgnoreable(cp2)) {
                    continue;
                }

                // Now see if we can successfully add it onto our string
                // and find it in the contracting table.
                String newProbe = probe + UTF16.valueOf(cp2);
                Object newValue = contractingTable.get(newProbe);
                if (newValue == null) continue;

                // We succeeded--so update our new values, remove the char, and update
                // quaternary to indicate that we swallowed another character.
                probe = newProbe;
                value = newValue;
                decompositionBuffer.setCharAt(i,'\u0000');  // zero char
                if (increment == 2) {
                    // WARNING: we had a supplementary character. zero BOTH parts
                    decompositionBuffer.setCharAt(i+1,'\u0000');  // zero char
                }
            }

            // we are all done, and can extract the CE from the last value set.
            ce = ((Integer)value).intValue();

        }

        // if the CE is not expanding) we are done.
        if ((ce & EXPANDING_MASK) != EXPANDING_MASK) {
            result.push(ce);
        } else {
            // expanding, so copy list of items onto stack
            int ii = ce & EXCEPTION_INDEX_MASK; // get index
            // copy onto stack from index until reach TERMINATOR
            while (true) {
                ce = expandingTable.get(ii++);
                if (ce == TERMINATOR) break;
                result.push(ce);
            }
        }
        return index;
    }

    private void addToContractingTable(Object s, int ce) {
        if (s == null) {
            throw new IllegalArgumentException("String can't be null");
        }
        contractingTable.put(s.toString(), new Integer(ce));
    }

    void checkConsistency() {
                // at this point, we have to guarantee that the contractingTable is CLOSED
        // e.g. if a substring of length n is in the table, then the first n-1 characters
        // are also!!

        // First check consistency. the CE for a value is CONTRACTING if and only if there is a contraction starting
        // with that value.

        UnicodeSet ceSet = new UnicodeSet();
        for (int i = 0; i < collationElements.length; ++i) {
            if (collationElements[i] == CONTRACTING) ceSet.add(i);
        }
        UnicodeSet ceSet2 = new UnicodeSet();
        Iterator enum = contractingTable.keySet().iterator();
        while (enum.hasNext()) {
            String sequence = (String)enum.next();
            ceSet2.add(sequence.charAt(0));
        }

        if (!ceSet.equals(ceSet2)) {
            System.out.println("In both: " + new UnicodeSet(ceSet).retainAll(ceSet2).toPattern(true));
            System.out.println("CONTRACTING but not in table: " + new UnicodeSet(ceSet).removeAll(ceSet2).toPattern(true));
            System.out.println("In table but not CONTRACTING: " + new UnicodeSet(ceSet2).removeAll(ceSet).toPattern(true));
            throw new IllegalArgumentException("Inconsistent data");
        }

/*
0FB2 0F71 ; [.124E.0020.0002.0FB2][.125F.0020.0002.0F71] # TIBETAN SUBJOINED LETTER RA + TIBETAN VOWEL SIGN AA
0FB3 0F71 ; [.1250.0020.0002.0FB3][.125F.0020.0002.0F71] # TIBETAN SUBJOINED LETTER LA + TIBETAN VOWEL SIGN AA
        int[] temp1 = int[20];
        int[] temp2 = int[20];
        int[] temp3 = int[20];
        getCEs("\u0fb2", true, temp1);
        getCEs("\u0fb3", true, temp2);
        getCEs("\u0f71", true, temp3);
        add("\u0FB2\u0F71", concat(temp1, temp3));
*/

    }

    Iterator getContractions() {
        return contractingTable.keySet().iterator();
    }

    int getContractionCount() {
        return contractingTable.size();
    }

    boolean contractionTableContains(String s) {
        return contractingTable.get(s) != null;
    }

}