scuffed-code/tools/unicodetools/com/ibm/text/UCD/Normalizer.java

/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
* $Date: 2004/04/17 18:21:39 $
* $Revision: 1.16 $
*
*******************************************************************************
*/

package com.ibm.text.UCD;

import java.util.*;
import com.ibm.icu.text.UTF16;

import com.ibm.text.utility.*;


/**
 * Implements Unicode Normalization Forms C, D, KC, KD.<br>
 * See UTR#15 for details.<br>
 * Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.<br>
 * The Unicode Consortium makes no expressed or implied warranty of any
 * kind, and assumes no liability for errors or omissions.
 * No liability is assumed for incidental and consequential damages
 * in connection with or arising out of the use of the information here.
 * @author Mark Davis
 */

public final class Normalizer implements UCD_Types {
    /** Copyright notice string carried by this class. */
    public static final String copyright =
      "Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";

    /**
     * Debug switch: when true, internalCompose prints the buffer contents and
     * its current positions to System.out for every code point it examines.
     */
    public static boolean SHOW_PROGRESS = false;

    /**
     * Create a normalizer for a given form.
     */
    public Normalizer(byte form, String unicodeVersion) {
        this.form = form;
        this.composition = (form & NF_COMPOSITION_MASK) != 0;
        this.compatibility = (form & NF_COMPATIBILITY_MASK) != 0;
        this.data = getData(unicodeVersion);
    }

    /**
     * Create a normalizer for a given form.
     */
    // public Normalizer(byte form) {
    //    this(form,"");
    //}

    /**
     * Looks up the name of a normalization form.
     * @param form index into UCD_Names.NF_NAME
     * @return the entry of UCD_Names.NF_NAME at that index
     */
    public static String getName(byte form) {
        final String[] names = UCD_Names.NF_NAME;
        return names[form];
    }

    /**
     * Returns the name of the form this normalizer was created with.
     * @return same value as the static getName applied to this normalizer's form
     */
    public String getName() {
        final String name = getName(form);
        return name;
    }

    /**
     * Returns the version of the Unicode Character Database backing this
     * normalizer, as reported by its data tables.
     */
    public String getUCDVersion() {
        final String version = this.data.getUCDVersion();
        return version;
    }

    /**
     * Tells whether this normalizer composes after decomposing, i.e. whether
     * the NF_COMPOSITION_MASK bit was set in the form given at construction.
     */
    public boolean isComposition() {
        return this.composition;
    }

    /**
     * Tells whether this normalizer uses compatibility decompositions, i.e.
     * whether the NF_COMPATIBILITY_MASK bit was set in the form given at
     * construction.
     */
    public boolean isCompatibility() {
        return this.compatibility;
    }

    /**
     * Normalizes text according to the chosen form,
     * replacing contents of the target buffer.
     * <p>The target is emptied before any output is written. Without that,
     * leftover content would be appended to, could have combining marks
     * reordered into it, and would be recomposed from its first character.
     * @param   source      the original text, unnormalized
     * @param   target      the buffer that receives the normalized text
     * @return  the target buffer, for call chaining
     */
    public StringBuffer normalize(String source, StringBuffer target) {

        // Honor the "replacing contents" contract, also for empty input.
        target.setLength(0);
        if (source.length() == 0) {
            return target;
        }

        // First decompose the source into target,
        // then compose if the form requires.
        internalDecompose(source, target, true, compatibility);
        if (composition) {
            internalCompose(target);
        }
        return target;
    }

    /**
     * Checks whether canonically decomposing the text needs no reordering of
     * combining marks. The source is decomposed twice with canonical mappings
     * only, once with reordering and once without, and the two results are
     * compared.
     * @param   source      the text to test
     * @return  true for empty text, or when both decompositions are equal
     */
    public boolean isFCD(String source) {
        if (source.length() != 0) {
            StringBuffer unordered = new StringBuffer();
            internalDecompose(source, unordered, false, false);

            StringBuffer ordered = new StringBuffer();
            internalDecompose(source, ordered, true, false);

            String orderedText = ordered.toString();
            return orderedText.equals(unordered.toString());
        }
        return true;
    }

    /**
     * Normalizes text according to the chosen form, using a fresh buffer.
     * @param   source      the original text, unnormalized
     * @return  the resulting normalized text
     */
    public String normalize(String source) {
        StringBuffer result = new StringBuffer();
        normalize(source, result);
        return result.toString();
    }

    /**
     * Normalizes a single code point according to the chosen form.
     * @param   cp      the code point to normalize
     * @return  the normalized text for that code point
     */
    public String normalize(int cp) {
        final String asString = UTF16.valueOf(cp);
        return normalize(asString);
    }

    /**
    private StringBuffer hasDecompositionBuffer = new StringBuffer();

    public boolean hasDecomposition(int cp) {
        hasDecompositionBuffer.setLength(0);
        normalize(UTF16.valueOf(cp), hasDecompositionBuffer);
        if (hasDecompositionBuffer.length() != 1) return true;
        return cp != hasDecompositionBuffer.charAt(0);
    }
    */

    /**
     * Does a quick check to see if the string is in the current form. Checks canonical order and
     * isAllowed().
     * @param   source  source text
     * @return YES, NO, MAYBE
     */
     /*
    public static final int NO = 0, YES = 1, MAYBE = -1;

    public int quickCheck(String source) {
        short lastCanonicalClass = 0;
        int result = YES;
        for (int i = 0; i < source.length(); ++i) {
            char ch = source.charAt(i);
            short canonicalClass = data.getCanonicalClass(ch);
            if (lastCanonicalClass > canonicalClass && canonicalClass != 0) {
                return NO;
            }
            int check = isAllowed(ch);
            if (check == NO) return NO;
            if (check == MAYBE) result = MAYBE;
        }
        return result;
    }

    /**
     * Find whether the given character is allowed in the current form.
     * @return YES, NO, MAYBE
     */
     /*
    public int isAllowed(char ch) {
        if (composition) {
            if (compatibility) {
                if (data.isCompatibilityExcluded(ch)) {
                    return NO;
                }
            } else {
                if (data.isExcluded(ch)) {
                    return NO;
                }
            }
            if (data.isTrailing(ch)) {
                return MAYBE;
            }
        } else { // decomposition: both NFD and NFKD
            if (data.normalizationDiffers(compatibility,ch)) return NO;
        }
        return YES;
    }

    /* End of the disabled quickCheck()/isAllowed() code above. */

    /**
    * Utility: Gets the canonical combining class of a code point from the
    * Unicode Character Database.
    * @param   ch      the source code point
    * @return          the combining class, as delivered by the data tables
    */
    public short getCanonicalClass(int ch) {
        final short combiningClass = this.data.getCanonicalClass(ch);
        return combiningClass;
    }

    /**
    * Utility: Checks whether a single code point is already in this
    * normalizer's form, based on the data tables' normalizationDiffers
    * answer for the form's composition and compatibility settings.
    * @param   ch      the source code point
    * @return          true when normalizationDiffers reports no difference
    */
    public boolean isNormalized(int ch) {
        final boolean differs = data.normalizationDiffers(ch, composition, compatibility);
        return !differs;
    }

    /**
    * Utility: Checks whether a string is already in this normalizer's form.
    * A string of exactly one code point is answered from the data tables,
    * the same way as the int overload. Any other string, including the
    * empty one, is normalized and compared with the original.
    * @param   s       the source text
    * @return          true when normalizing s would leave it unchanged
    */
    public boolean isNormalized(String s) {
        // The table lookup only describes a single code point, so it must
        // not be used to judge a longer string by its first code point.
        if (UTF16.countCodePoint(s) == 1) {
            return !data.normalizationDiffers(UTF16.charAt(s,0), composition, compatibility);
        }
        return s.equals(normalize(s)); // TODO: OPTIMIZE LATER
    }

    /**
    * Utility: Gets recursive decomposition of a character from the
    * Unicode Character Database. Whether compatibility mappings are
    * followed is decided by this normalizer's form, not by the caller.
    * @param   ch      the source character (a single UTF-16 code unit)
    * @param   buffer  buffer handed to the data tables to receive the
    *                  decomposition
    */
    public void getRecursiveDecomposition(char ch, StringBuffer buffer) {
        final int codeUnit = ch;
        this.data.getRecursiveDecomposition(codeUnit, buffer, this.compatibility);
    }

    /**
    * Utility: Collects the code points that take part in composition.
    * Each composition table entry is keyed by a long holding the first
    * code point in the upper 32 bits and the second in the lower 32 bits,
    * and maps to the composite. Jamo and two-Jamo Hangul syllables are
    * added from the UCD helper predicates.
    * @param   leading     receives code points that can start a pair; may be null
    * @param   trailing    receives code points that can end a pair; may be null
    * @param   resulting   receives the composites from the table; may be null
    */

    public void getCompositionStatus(BitSet leading, BitSet trailing, BitSet resulting) {
        final boolean wantLeading = leading != null;
        final boolean wantTrailing = trailing != null;
        final boolean wantResulting = resulting != null;

        // Table-driven pairs.
        for (Iterator entries = data.compTable.entrySet().iterator(); entries.hasNext();) {
            Map.Entry entry = (Map.Entry) entries.next();
            long pair = ((Long) entry.getKey()).longValue();
            Integer composite = (Integer) entry.getValue();
            if (wantLeading) {
                leading.set((int) (pair >>> 32));
            }
            if (wantTrailing) {
                trailing.set((int) pair);
            }
            if (wantResulting) {
                resulting.set(composite.intValue());
            }
        }

        // Algorithmic Hangul: Jamo that begin or continue a syllable.
        for (int jamo = UCD.LBase; jamo < UCD.TLimit; ++jamo) {
            if (wantLeading && UCD.isLeadingJamo(jamo)) {
                leading.set(jamo);
            }
            if (wantTrailing && UCD.isNonLeadJamo(jamo)) {
                trailing.set(jamo);
            }
        }

        // Two-Jamo syllables can still take a further Jamo, so they lead too.
        if (!wantLeading) {
            return;
        }
        for (int syllable = UCD.SBase; syllable < UCD.SLimit; ++syllable) {
            if (UCD.isDoubleHangul(syllable)) {
                leading.set(syllable);
            }
        }
    }

    /**
     * Tells whether the code point is flagged as trailing by the data
     * tables. Always false for a normalizer that does not compose.
     */
    public boolean isTrailing(int cp) {
        return composition && data.isTrailing(cp);
    }

    /**
     * Tells whether the code point is flagged as leading by the data
     * tables. Always false for a normalizer that does not compose.
     */
    public boolean isLeading(int cp) {
        return composition && data.isLeading(cp);
    }

    /**
     * Returns the pairwise composition of two code points as computed by
     * the data tables, which answer 0xFFFF when the pair does not compose.
     * @param first  the leading code point of the pair
     * @param second the trailing code point of the pair
     */
    public int getComposition(int first, int second) {
        final int composite = this.data.getPairwiseComposition(first, second);
        return composite;
    }

    // ======================================
    //                  PRIVATES
    // ======================================

    /**
     * The current form, fixed at construction.
     */
    private final byte form;

    /** True when the form's NF_COMPOSITION_MASK bit is set. */
    private final boolean composition;

    /** True when the form's NF_COMPATIBILITY_MASK bit is set. */
    private final boolean compatibility;

    /**
    * Decomposes text, either canonical or compatibility, writing the
    * result at the end of the target buffer. The target is not cleared
    * here. When reordering is on, a combining mark may be inserted before
    * characters already in the target.
    * @param   source      the original text, unnormalized
    * @param   target      the buffer that receives the decomposed text
    * @param   reorder     if true, characters with a non-zero combining class
    *                      are moved in front of preceding characters whose
    *                      combining class is higher
    * @param   compat      if true selects the recursive compatibility
    *                      decomposition, otherwise the recursive canonical
    *                      decomposition
    */
    private void internalDecompose(String source, StringBuffer target, boolean reorder, boolean compat) {
        StringBuffer pieces = new StringBuffer();

        int sourcePos = 0;
        while (sourcePos < source.length()) {
            final int sourceCp = UTF16.charAt(source, sourcePos);
            sourcePos += UTF16.getCharCount(sourceCp);

            // Holds just the original code point when there is no mapping.
            pieces.setLength(0);
            data.getRecursiveDecomposition(sourceCp, pieces, compat);

            int piecePos = 0;
            while (piecePos < pieces.length()) {
                final int cp = UTF16.charAt(pieces, piecePos);
                piecePos += UTF16.getCharCount(cp);

                final int cpClass = data.getCanonicalClass(cp);
                int insertAt = target.length();
                if (reorder && cpClass != 0) {
                    // Walk backwards past every character with a higher class.
                    while (insertAt > 0) {
                        final int previous = UTF16.charAt(target, insertAt - 1);
                        if (data.getCanonicalClass(previous) <= cpClass) {
                            break;
                        }
                        insertAt -= UTF16.getCharCount(previous);
                    }
                }
                target.insert(insertAt, UTF16.valueOf(cp));
            }
        }
    }

    /**
    * Composes text in place. Target must already
    * have been decomposed, and must not be empty: the first code point is
    * read unconditionally.
    * Uses UTF16, which is a utility class for supplementary character support in Java.
    * <p>The buffer is read at decompPos and rewritten at compPos, which never
    * runs ahead of decompPos. Each code point either merges into the most
    * recent starter or is copied down to compPos. At the end the buffer is
    * truncated to compPos.
    * @param   target      input: decomposed text.
    *                      output: the resulting normalized text.
    */
    private void internalCompose(StringBuffer target) {
        int starterPos = 0;                                  // where the current starter sits in the output
        int starterCh = UTF16.charAt(target,0);              // the current starter (or its composite so far)
        int compPos = UTF16.getCharCount(starterCh);         // next write position in the output
        int lastClass = data.getCanonicalClass(starterCh);   // class of the last code point copied to the output
        // Fix for strings starting with a combining mark: 256 is above the
        // 0..255 range of combining classes, so neither half of the
        // "lastClass < chClass || lastClass == 0" test below can pass until
        // a real starter has been seen.
        if (lastClass != 0) lastClass = 256; // fix for strings staring with a combining mark
        int oldLen = target.length();                        // used to detect length changes made by setCharAt

        // Loop on the decomposed characters, combining where possible

        int ch;
        for (int decompPos = compPos; decompPos < target.length(); decompPos += UTF16.getCharCount(ch)) {
            ch = UTF16.charAt(target, decompPos);
            if (SHOW_PROGRESS) System.out.println(Utility.hex(target)
                + ", decompPos: " + decompPos
                + ", compPos: " + compPos
                + ", ch: " + Utility.hex(ch)
                );
            int chClass = data.getCanonicalClass(ch);
            int composite = data.getPairwiseComposition(starterCh, ch);
            // Combine only when a composite exists and the previously copied
            // code point does not stand in the way: it is either of a lower
            // class than ch, or it is the starter itself (class 0).
            if (composite != data.NOT_COMPOSITE
            && (lastClass < chClass || lastClass == 0)) {
                UTF16.setCharAt(target, starterPos, composite);
                // we know that we will only be replacing non-supplementaries by non-supplementaries
                // so we don't have to adjust the decompPos
                starterCh = composite;
            } else {
                // Not combined: ch is kept. A class-0 ch becomes the new starter.
                if (chClass == 0) {
                    starterPos = compPos;
                    starterCh  = ch;
                }
                lastClass = chClass;
                UTF16.setCharAt(target, compPos, ch);
                // If the write changed the buffer length, shift the read
                // position by the same amount. Note that this message is
                // printed regardless of SHOW_PROGRESS.
                if (target.length() != oldLen) { // MAY HAVE TO ADJUST!
                    System.out.println("ADJUSTING: " + Utility.hex(target));
                    decompPos += target.length() - oldLen;
                    oldLen = target.length();
                }
                compPos += UTF16.getCharCount(ch);
            }
        }
        // Drop the tail left over from code points that were merged away.
        target.setLength(compPos);
    }

    static class Stub {
        // Character database for the requested version; source of all properties used here.
        private UCD ucd;
        // Composition table: Long key ((first << 32) | second) -> Integer composite code point.
        private HashMap compTable = new HashMap();
        // Non-leading Jamo plus every second element of a pair entered in compTable.
        private BitSet isSecond = new BitSet();
        // Code points for which isLeadingJamoComposition holds, plus every first element of a pair.
        private BitSet isFirst = new BitSet();
        // Code points whose canonical pair decomposition was entered in compTable.
        private BitSet canonicalRecompose = new BitSet();
        // Subset of the above whose two components both have decomposition type <= CANONICAL.
        private BitSet compatibilityRecompose = new BitSet();
        // Value returned by getPairwiseComposition when a pair does not compose.
        static final int NOT_COMPOSITE = 0xFFFF;

        /**
         * Builds the composition tables for one UCD version by scanning the
         * code points and recording every canonical two-code-point
         * decomposition that is not composition-excluded and whose first
         * element has combining class 0.
         * @param version UCD version, passed to UCD.make
         * @throws ChainException wrapping any exception raised while one code
         *         point's mapping is processed, including the "BAD LENGTH" check
         */
        Stub(String version) {
            ucd = UCD.make(version);
            // NOTE(review): the bound excludes U+10FFFF itself. That code point is a
            // noncharacter, so skipping it should not lose any mapping.
            for (int i = 0; i < 0x10FFFF; ++i) {
                if (!ucd.isAssigned(i)) continue;
                if (ucd.isPUA(i)) continue;
                // Jamo flags are recorded before the decomposition-type filter below.
                if (ucd.isNonLeadJamo(i)) isSecond.set(i);
                if (ucd.isLeadingJamoComposition(i)) isFirst.set(i);
                byte dt = ucd.getDecompositionType(i);
                if (dt != CANONICAL) continue;
                if (!ucd.getBinaryProperty(i, CompositionExclusion)) {
                    try {
                        String s = ucd.getDecompositionMapping(i);
                        int len = UTF16.countCodePoint(s);
                        // Only pairs can recompose. Longer mappings are rejected from
                        // version 3.0.0 on and silently skipped for older versions.
                        // NOTE(review): the version test is a string comparison, so a
                        // version such as "10.0.0" would compare below "3.0.0".
                        if (len != 2) {
                            if (len > 2) {
                                if (ucd.getVersion().compareTo("3.0.0") >= 0) {
                                    throw new IllegalArgumentException("BAD LENGTH: " + len + ucd.toString(i));
                                }
                            }
                            continue;
                        }
                        int a = UTF16.charAt(s, 0);
                        // A pair whose first element is not a starter is not recorded.
                        if (ucd.getCombiningClass(a) != 0) continue;
                        isFirst.set(a);

                        int b = UTF16.charAt(s, UTF16.getCharCount(a));
                        isSecond.set(b);

                        // have a recomposition, so set the bit
                        canonicalRecompose.set(i);

                        // set the compatibility recomposition bit
                        // ONLY if the component characters
                        // don't compatibility decompose
                        if (ucd.getDecompositionType(a) <= CANONICAL
                         && ucd.getDecompositionType(b) <= CANONICAL) {
                            compatibilityRecompose.set(i);
                         }

                        // First element in the upper 32 bits, second in the lower 32 bits.
                        long key = (((long)a)<<32) | b;

                        /*if (i == '\u1E0A' || key == 0x004400000307) {
                            System.out.println(Utility.hex(s));
                            System.out.println(Utility.hex(i));
                            System.out.println(Utility.hex(key));
                        }*/
                        compTable.put(new Long(key), new Integer(i));
                    } catch (Exception e) {
                        // Re-thrown with the offending code point's description attached.
                        throw new ChainException("Error: {0}", new Object[]{ucd.toString(i)}, e);
                    }
                }
            }
            // process compatibilityRecompose
            // have to do this afterwards, since we don't know whether the pieces
            // are allowable until we have processed all the characters
            /*
            Iterator it = compTable.keySet().iterator();
            while (it.hasNext()) {
                Long key = (Long)it.next();
                int cp = compTable.get(key);
                long keyLong = key.longValue();
                int first = (int)(keyLong >>> 32);
                int second = (int)keyLong;
                if (ucd.
            */
        }

        /** Returns the version string reported by the underlying UCD. */
        String getUCDVersion() {
            final String version = this.ucd.getVersion();
            return version;
        }

        /*
Problem: differs: true, call: false U+0385 GREEK DIALYTIKA TONOS
Problem: differs: true, call: false U+03D3 GREEK UPSILON WITH ACUTE AND HOOK SYMBOL
Problem: differs: true, call: false U+03D4 GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL
Problem: differs: true, call: false U+1E9B LATIN SMALL LETTER LONG S WITH DOT ABOVE
Problem: differs: true, call: false U+1FC1 GREEK DIALYTIKA AND PERISPOMENI
Problem: differs: true, call: false U+1FCD GREEK PSILI AND VARIA
Problem: differs: true, call: false U+1FCE GREEK PSILI AND OXIA
Problem: differs: true, call: false U+1FCF GREEK PSILI AND PERISPOMENI
Problem: differs: true, call: false U+1FDD GREEK DASIA AND VARIA
Problem: differs: true, call: false U+1FDE GREEK DASIA AND OXIA
Problem: differs: true, call: false U+1FDF GREEK DASIA AND PERISPOMENI
Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
*/

        /** Returns the combining class the underlying UCD reports for the code point. */
        short getCanonicalClass(int cp) {
            final short combiningClass = this.ucd.getCombiningClass(cp);
            return combiningClass;
        }

        /** Tells whether the code point was recorded in the isSecond set. */
        boolean isTrailing(int cp) {
            final boolean recordedAsSecond = this.isSecond.get(cp);
            return recordedAsSecond;
        }

        /** Tells whether the code point was recorded in the isFirst set. */
        boolean isLeading(int cp) {
            final boolean recordedAsFirst = this.isFirst.get(cp);
            return recordedAsFirst;
        }

        /**
         * Decides from the tables whether normalizing a single code point
         * would change it.
         * @param cp          the code point to test
         * @param composition true for a composing form
         * @param compat      true when compatibility mappings apply; then every
         *                    decomposition type >= CANONICAL counts, otherwise
         *                    only CANONICAL
         * @return true when the code point decomposes under the chosen
         *         mappings and, for a composing form, is not flagged as
         *         recomposing
         */
        boolean normalizationDiffers(int cp, boolean composition, boolean compat) {
            final byte type = ucd.getDecompositionType(cp);

            final boolean decomposes = compat ? type >= CANONICAL : type == CANONICAL;
            if (!decomposes) {
                return false;
            }
            if (!composition) {
                return true;
            }
            // Composing forms: a code point that recomposes comes back unchanged.
            final BitSet recomposes = compat ? compatibilityRecompose : canonicalRecompose;
            return !recomposes.get(cp);
        }

        /**
         * Appends the full (recursive) decomposition of a code point to the
         * buffer. A code point without an applicable mapping is appended
         * unchanged.
         * <p>A mapping that consists of the code point itself is treated as
         * "no mapping": it is reported with the diagnostic line "fix" and the
         * code point is appended as is. Recursing on it would never end.
         * @param cp     the code point to decompose
         * @param buffer receives the decomposition; existing content is kept
         * @param compat if true, compatibility mappings (decomposition type
         *               above CANONICAL) are followed as well as canonical ones
         */
        public void getRecursiveDecomposition(int cp, StringBuffer buffer, boolean compat) {
            byte dt = ucd.getDecompositionType(cp);
            // we know we decompose all CANONICAL, plus > CANONICAL if compat is TRUE.
            boolean decomposes = dt == CANONICAL || (dt > CANONICAL && compat);
            if (!decomposes) {
                UTF16.append(buffer, cp);
                return;
            }

            String mapping = ucd.getDecompositionMapping(cp);
            if (mapping.equals(UTF16.valueOf(cp))) {
                // Self-mapping in the data: stop here instead of recursing forever.
                System.out.println("fix");
                UTF16.append(buffer, cp);
                return;
            }

            int piece;
            for (int i = 0; i < mapping.length(); i += UTF16.getCharCount(piece)) {
                piece = UTF16.charAt(mapping, i);
                getRecursiveDecomposition(piece, buffer, compat);
            }
        }

        /**
         * Returns the composite for a pair of code points. The Hangul
         * composition from UCD.composeHangul is tried first, then the
         * composition table.
         * @param starterCh the first code point of the pair
         * @param ch        the second code point of the pair
         * @return the composite, or 0xFFFF when the pair does not compose
         */
        int getPairwiseComposition(int starterCh, int ch) {
            final int hangul = UCD.composeHangul(starterCh, ch);
            if (hangul != 0xFFFF) {
                return hangul;
            }
            // Same key layout as used when the table was filled.
            final long pair = (((long)starterCh)<<32) | ch;
            final Integer composite = (Integer) compTable.get(new Long(pair));
            return composite == null ? 0xFFFF : composite.intValue();
        }

    }

    /**
    * Normalization data from the Unicode Character Database for the version
    * this normalizer was created with. Obtained from getData at construction
    * and never replaced.
    */
    private final Stub data;

    /** Cache of data tables, keyed by version string, filled by getData. */
    private static final HashMap versionCache = new HashMap();

    /**
     * Returns the data tables for a UCD version, building them on first use
     * and keeping them in versionCache afterwards.
     * @param version the UCD version; an empty string means UCD.latestVersion
     */
    private static Stub getData (String version) {
        final String key = version.length() == 0 ? UCD.latestVersion : version;

        final Object cached = versionCache.get(key);
        if (cached != null) {
            return (Stub) cached;
        }

        final Stub built = new Stub(key);
        versionCache.put(key, built);
        return built;
    }

    /**
    * Just accessible for testing.
    */
    /*
    boolean isExcluded (char ch) {
        return data.isExcluded(ch);
    }

    /**
    * Just accessible for testing.
    */
    /*
    String getRawDecompositionMapping (char ch) {
        return data.getRawDecompositionMapping(ch);
    }
    //*/
}