scuffed-code/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java

/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
*
*******************************************************************************
*/

package com.ibm.text.UCD;

import java.util.*;
import java.io.*;

import com.ibm.text.utility.*;

public class GenerateCaseFolding implements UCD_Types {
    /** When true, getClosure and add print diagnostic traces to System.err. */
    public static boolean DEBUG = false;
    /**
     * Shared character database instance, obtained from UCD.make("310").
     * NOTE(review): "310" presumably selects the Unicode 3.1.0 data -- confirm against UCD.make.
     */
    public static UCD ucd = UCD.make("310");

    /**
     * Command-line entry point: runs the case folding generation.
     * The call to getAge is currently commented out.
     *
     * @param args command-line arguments (not used)
     * @throws java.io.IOException propagated from makeCaseFold
     */
    public static void main(String[] args) throws java.io.IOException {
        GenerateCaseFolding.makeCaseFold();
        //getAge();
    }

    /**
     * Computes the full and the simple case folding tables and writes them to
     * the file "CaseFoldingSample.txt" (encoding name "UTF8").
     *
     * For every code point that has a folding in either table one or two lines
     * are written via drawLine:
     * <ul>
     * <li>type "C" when the full and simple foldings are equal
     *     ("I" instead for code points 0x130 and 0x131);</li>
     * <li>otherwise a type "F" line for the full folding (if any) followed by
     *     a type "S" line for the simple folding (if any).</li>
     * </ul>
     *
     * @throws java.io.IOException if the tables cannot be built or the output
     *         file cannot be opened
     */
    public static void makeCaseFold() throws java.io.IOException {
        System.out.println("Making Full Data");
        Map fullData = getCaseFolding(true);
        System.out.println("Making Simple Data");
        Map simpleData = getCaseFolding(false);
        // write the data

        System.out.println("Writing");
        PrintWriter out = new PrintWriter(
            new BufferedWriter(
            new OutputStreamWriter(
                new FileOutputStream("CaseFoldingSample.txt"),
                "UTF8"),
            4*1024));

        // try/finally so the file is flushed and closed even if a lookup or
        // name query throws part-way through the code point range
        try {
            // the bound is inclusive: 0x10FFFF is itself a valid code point
            for (int ch = 0; ch <= 0x10FFFF; ++ch) {
                // both tables are keyed by the string form of the code point
                String key = UTF32.valueOf32(ch);
                String rFull = (String)fullData.get(key);
                String rSimple = (String)simpleData.get(key);
                if (rFull == null && rSimple == null) continue;
                if (rFull != null && rFull.equals(rSimple)) {
                    String type = "C";
                    if (ch == 0x130 || ch == 0x131) type = "I";
                    drawLine(out, ch, type, rFull);
                } else {
                    if (rFull != null) {
                        drawLine(out, ch, "F", rFull);
                    }
                    if (rSimple != null) {
                        drawLine(out, ch, "S", rSimple);
                    }
                }
            }
        } finally {
            out.close();
        }
    }

    /**
     * Writes one output line of the form
     * "HEX; type; HEX HEX ...; # name" for a single folding entry.
     *
     * @param out    destination writer
     * @param ch     the source code point (printed as hex and looked up by name)
     * @param type   the folding type label written in the second field
     * @param result the folded string, printed as space-separated hex
     */
    static void drawLine(PrintWriter out, int ch, String type, String result) {
        StringBuffer line = new StringBuffer();
        line.append(Utility.hex(ch));
        line.append("; ").append(type);
        line.append("; ").append(Utility.hex(result, " "));
        line.append("; # ").append(ucd.getName(ch));
        out.println(line.toString());
    }


    /**
     * Builds a case folding table.
     *
     * First, getClosure is run for every represented code point to collect
     * case equivalence classes in a map from string to the set containing it.
     * Then, for each class, the member with the highest goodness score is
     * chosen as representative, and every other member whose length() is 1
     * is mapped to that representative.
     *
     * Diagnostics are written to System.err when a class has no
     * representative or when the best score is below 128.
     *
     * NOTE(review): the length() == 1 filter counts UTF-16 code units, so
     * members outside the BMP are presumably never given an entry -- confirm
     * whether that is intended.
     *
     * @param full true to use full case mappings, false for simple ones
     *             (passed through to getClosure and goodness)
     * @return map from a one-unit member string to its representative string
     * @throws java.io.IOException declared for callers; not thrown directly here
     */
    static Map getCaseFolding(boolean full) throws java.io.IOException {
        Map data = new TreeMap();
        Map repChar = new TreeMap();
        //String option = "";

        // get the equivalence classes
        // (the bound is inclusive: 0x10FFFF is itself a valid code point)

        for (int ch = 0; ch <= 0x10FFFF; ++ch) {
            // progress indicator, printed once per 0x400 code points
            if ((ch & 0x3FF) == 0) System.out.println(Utility.hex(ch));
            if (!ucd.isRepresented(ch)) continue;
            getClosure(ch, data, full);
        }

        // get the representative characters
        // (several keys can share one set, so a set may be visited more than once)

        Iterator it = data.keySet().iterator();
        while (it.hasNext()) {
            String s = (String) it.next();
            Set set = (Set) data.get(s);
            String rep = null;
            int repGood = 0;
            Iterator it2 = set.iterator();
            while (it2.hasNext()) {
                String s2 = (String)it2.next();
                int s2Good = goodness(s2, full);
                // strict comparison: on a tie the earlier member in set order wins
                if (s2Good > repGood) {
                    rep = s2;
                    repGood = s2Good;
                }
            }
            if (rep == null) System.err.println("No representative for: " + toString(set));
            else if (repGood < 128) {
                // bit 128 is only set by goodness when the string survives an
                // upper-then-lower round trip
                System.err.println("Non-optimal!!: "
                    + ucd.getName(rep) + ", " + toString(set,true));
            }
            it2 = set.iterator();
            while (it2.hasNext()) {
                String s2 = (String)it2.next();
                if (s2.length() == 1 && !s2.equals(rep)) repChar.put(UTF32.getCodePointSubstring(s2,0), rep);
            }
        }
        return repChar;
    }

    /**
     * Scores a string as a candidate representative of its case class.
     * The score starts as the string length and has two flag bits OR-ed in:
     * 128 if the string equals lower(upper(s)), and 64 if it equals its
     * NFC.normalize result.
     *
     * @param s    candidate string; null scores 0
     * @param full passed through to lower and upper
     * @return the combined score
     */
    static int goodness(String s, boolean full) {
        if (s == null) {
            return 0;
        }
        final boolean caseStable = s.equals(lower(upper(s, full), full));
        final boolean nfcStable = s.equals(NFC.normalize(s));

        int score = s.length();
        if (caseStable) {
            score |= 128;
        }
        if (nfcStable) {
            score |= 64;
        }
        return score;
    }


    /** Shared normalizer constructed with Normalizer.NFC; goodness uses it to test whether a string is unchanged by normalize. */
    static Normalizer NFC = new Normalizer(Normalizer.NFC);
    /*
    static HashSet temp = new HashSet();
    static void normalize(HashSet set) {
        temp.clear();
        temp.addAll(set);
        set.clear();
        Iterator it = temp.iterator();
        while (it.hasNext()) {
            String s = (String) it.next();
            String s2 = KC.normalize(s);
            set.add(s);
            data2.put(s,set);
            if (!s.equals(s2)) {
                set.add(s2);
                data2.put(s2,set);
                System.err.println("Adding " + Utility.hex(s) + " by " + Utility.hex(s2));
            }
        }
    }
    */

            /*
            String
            String lower1 = ucd.getLowercase(ch);
            String lower2 = ucd.toLowercase(ch,option);

            char ch2 = ucd.getLowercase(ucd.getUppercase(ch).charAt(0)).charAt(0);
            //String lower1 = String.valueOf(ucd.getLowercase(ch));
            //String lower = ucd.toLowercase(ch2,option);
            String upper = ucd.toUppercase(ch2,option);
            String lowerUpper = ucd.toLowercase(upper,option);
            //String title = ucd.toTitlecase(ch2,option);
            //String lowerTitle = ucd.toLowercase(upper,option);

            if (ch != ch2 || lowerUpper.length() != 1 || ch != lowerUpper.charAt(0)) { //
                output.println(Utility.hex(ch)
                    + "; " + (lowerUpper.equals(lower1) ? "L" : lowerUpper.equals(lower2) ? "S" : "E")
                    + "; " + Utility.hex(lowerUpper," ")
                    + ";\t#" + ucd.getName(ch)
                    );
                //if (!lowerUpper.equals(lower)) {
                //    output.println("Warning1: " + Utility.hex(lower) + " " + ucd.getName(lower));
                //}
                //if (!lowerUpper.equals(lowerTitle)) {
                //    output.println("Warning2: " + Utility.hex(lowerTitle) + " " + ucd.getName(lowerTitle));
                //}
            }
            */

    /**
     * Builds the case closure of one code point and records it in data.
     *
     * Nothing is recorded when the code point equals its own lower, title
     * and upper forms. Otherwise a new sorted set is created, registered in
     * data under the code point's string, seeded with the three case forms,
     * and then repeatedly extended with the lower/title/upper forms of every
     * member until no new string can be added.
     *
     * @param ch   the code point to start from
     * @param data map from string to the set that contains it; updated in place
     * @param full passed through to lower, title and upper
     */
    static void getClosure(int ch, Map data, boolean full) {
        final String self = UTF32.valueOf32(ch);
        final String lowered = lower(self, full);
        final String titled = title(self, full);
        final String uppered = upper(self, full);

        final boolean unchangedByCase =
            self.equals(lowered) && self.equals(uppered) && self.equals(titled);
        if (unchangedByCase) {
            return;
        }
        if (DEBUG) System.err.println("Closure for " + Utility.hex(ch));

        // start a fresh class containing just this code point
        Set closure = new TreeSet();
        closure.add(self);
        data.put(self, closure);

        // seed with the direct case forms
        add(closure, lowered, data);
        add(closure, uppered, data);
        add(closure, titled, data);

        // Fixed-point loop: the set cannot be modified while it is being
        // iterated, so after every successful add the scan restarts with a
        // new iterator. (An NFC-normalization step is disabled in this loop.)
        boolean grew;
        do {
            grew = false;
            for (Iterator members = closure.iterator(); members.hasNext(); ) {
                String member = (String) members.next();
                if (add(closure, lower(member, full), data)
                        || add(closure, title(member, full), data)
                        || add(closure, upper(member, full), data)) {
                    grew = true;
                    break;
                }
            }
        } while (grew);
    }

    /**
     * Lowercases s via lower2 and then replaces every U+03C2 with U+03C3
     * (marked as a HACK for lowercasing by the original author).
     *
     * @param s    the string to lowercase
     * @param full passed through to lower2
     * @return the lowercased string with U+03C2 replaced by U+03C3
     */
    static String lower(String s, boolean full) {
        // HACK for lower
        return lower2(s, full).replace('\u03C2', '\u03C3');
    }

    // These functions are no longer necessary, since UCD is parameterized,
    // but it's not worth changing

    /**
     * Lowercases s using the shared UCD instance.
     * With full set, the whole string goes through the FULL mapping.
     * Otherwise only strings whose length() is 1 are mapped, using the
     * SIMPLE mapping of their first code point; any other string is
     * returned unchanged.
     *
     * @param s    the string to lowercase
     * @param full true for FULL mapping, false for SIMPLE
     * @return the lowercased string
     */
    static String lower2(String s, boolean full) {
        if (full) {
            return ucd.getCase(s, FULL, LOWER);
        }
        return s.length() == 1
            ? ucd.getCase(UTF32.char32At(s,0), SIMPLE, LOWER)
            : s;
    }

    /**
     * Uppercases s using the shared UCD instance.
     * With full set, the whole string goes through the FULL mapping.
     * Otherwise only strings whose length() is 1 are mapped, using the
     * SIMPLE mapping of their first code point; any other string is
     * returned unchanged.
     *
     * The mapping kind now follows the full flag in the same way as lower2;
     * previously SIMPLE and FULL were swapped between the two branches.
     *
     * @param s    the string to uppercase
     * @param full true for FULL mapping, false for SIMPLE
     * @return the uppercased string
     */
    static String upper(String s, boolean full) {
        if (!full) {
            if (s.length() != 1) return s;
            return ucd.getCase(UTF32.char32At(s,0), SIMPLE, UPPER);
        }
        return ucd.getCase(s, FULL, UPPER);
    }

    /**
     * Titlecases s using the shared UCD instance.
     * With full set, the whole string goes through the FULL mapping.
     * Otherwise only strings whose length() is 1 are mapped, using the
     * SIMPLE mapping of their first code point; any other string is
     * returned unchanged.
     *
     * The mapping kind now follows the full flag in the same way as lower2;
     * previously SIMPLE and FULL were swapped between the two branches.
     *
     * @param s    the string to titlecase
     * @param full true for FULL mapping, false for SIMPLE
     * @return the titlecased string
     */
    static String title(String s, boolean full) {
        if (!full) {
            if (s.length() != 1) return s;
            return ucd.getCase(UTF32.char32At(s,0), SIMPLE, TITLE);
        }
        return ucd.getCase(s, FULL, TITLE);
    }

    /**
     * Adds s to set and, if data already maps s to a different set, merges
     * that other set into this one: every member of the other set is
     * re-pointed at set in data and then copied into set.
     *
     * @param set  the class being built
     * @param s    the string to add
     * @param data map from string to the set that contains it; updated on merge
     * @return false if set already contained s (nothing changes), true otherwise
     */
    static boolean add(Set set, String s, Map data) {
        // Set.add reports whether the element was newly inserted
        if (!set.add(s)) {
            return false;
        }
        if (DEBUG) System.err.println("adding: " + toString(set));

        Set previous = (Set) data.get(s);
        boolean needsMerge = previous != null && previous != set;
        if (needsMerge) {
            // make all the items of the previous set point to the merged set
            for (Iterator members = previous.iterator(); members.hasNext(); ) {
                data.put(members.next(), set);
            }
            set.addAll(previous);
        }
        if (DEBUG) System.err.println("done adding: " + toString(set));
        return true;
    }

    /**
     * Formats a set of strings as "{a, b, ...}" where each member is
     * rendered by Utility.hex with a space separator.
     *
     * @param set set whose elements are Strings
     * @return the formatted list
     */
    static String toString(Set set) {
        StringBuffer buf = new StringBuffer("{");
        String separator = "";
        for (Iterator members = set.iterator(); members.hasNext(); ) {
            String member = (String) members.next();
            buf.append(separator);
            separator = ", ";
            buf.append(Utility.hex(member, " "));
        }
        buf.append("}");
        return buf.toString();
    }

    /**
     * Formats a set of strings as "{a, b, ...}" where each member is
     * rendered by ucd.getName.
     *
     * @param set set whose elements are Strings
     * @param t   not read; it only distinguishes this overload from toString(Set)
     * @return the formatted list
     */
    static String toString(Set set, boolean t) {
        StringBuffer buf = new StringBuffer("{");
        String separator = "";
        for (Iterator members = set.iterator(); members.hasNext(); ) {
            String member = (String) members.next();
            buf.append(separator);
            separator = ", ";
            buf.append(ucd.getName(member));
        }
        buf.append("}");
        return buf.toString();
    }

    /**
     * Writes the file "UnicodeAge.txt": a block of "#" header lines followed
     * by the output of one DiffPropertyLister per consecutive version pair
     * (null to "110", "110" to "200", "200" to "210", "210" to "300",
     * "300" to "310"). The stream is closed even if a lister throws.
     *
     * @throws IOException if the output file cannot be opened
     */
    static final void getAge() throws IOException {
        FileOutputStream file = new FileOutputStream("UnicodeAge.txt");
        BufferedOutputStream buffered = new BufferedOutputStream(file, 4*1024);
        PrintStream log = new PrintStream(buffered);
        try {
            // header
            log.println("# Derived file showing when various code points were allocated in Unicode");
            log.println("# author: M. Davis");
            log.println("# generated: " + new Date());
            log.println("# Notes:");
            log.println("# - The old Hangul Syllables (removed from 2.0) are not included in the 110 listing.");
            log.println("# - The supplementary private use code points, although allocated earlier,");
            log.println("#   were NOT specifically listed in the UCD until 3.0.1, and are not included until then.");

            // one listing per version step
            new DiffPropertyLister(null, "110", log).print();
            new DiffPropertyLister("110", "200", log).print();
            new DiffPropertyLister("200", "210", log).print();
            new DiffPropertyLister("210", "300", log).print();
            new DiffPropertyLister("300", "310", log).print();

            // Earlier UnicodeSet-based implementation, kept for reference:
            /*
            printDiff("110", "200");
	        UnicodeSet u11 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-1.1.txt", false);
	        UnicodeSet u20 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-2.0.txt", false);
	        UnicodeSet u21 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-2.1.txt", false);
	        UnicodeSet u30 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-3.0.txt", false);
	        UnicodeSet u31 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-3.1.txt", false);

            log.println();
            log.println("# Code points assigned in Unicode 1.1 (minus Hangul Syllables): "
                + n.format(u11.count()));
            log.println();
            u11.print(log, false, false, "1.1");

            UnicodeSet u20m = new UnicodeSet(u20).remove(u11);
            log.println();
            log.println("# Code points assigned in Unicode 2.0 (minus Unicode 1.1): "
                + n.format(u20m.count()));
            log.println();
            u20m.print(log, false, false, "2.0");

            UnicodeSet u21m = new UnicodeSet(u21).remove(u20);
            log.println();
            log.println("# Code points assigned in Unicode 2.1 (minus Unicode 2.0): "
                + n.format(u21m.count()));
            log.println();
            u21m.print(log, false, false, "2.1");

            UnicodeSet u30m = new UnicodeSet(u30).remove(u21);
            log.println();
            log.println("# Code points assigned in Unicode 3.0 (minus Unicode 2.1): "
                + n.format(u30m.count()));
            log.println();
            u30m.print(log, false, false, "3.0");

            UnicodeSet u31m = new UnicodeSet(u31).remove(u30);
            log.println();
            log.println("# Code points assigned in Unicode 3.1 (minus Unicode 3.0): "
                + n.format(u31m.count()));
            log.println();
            u31m.print(log, false, false, "3.1");
            */
        } finally {
            // log was assigned from a constructor call, so it is never null here
            log.close();
        }

    }

}