ICU-1683 added draft versions of two iterators

X-SVN-Rev: 7549
2002-02-01 02:05:35 +00:00 · 2002-02-01 02:05:35 +00:00 · a1c6d85cf7
commit a1c6d85cf7
parent f56fb8ddba
6 changed files with 1256 additions and 0 deletions
--- a/icu4j/src/com/ibm/icu/dev/test/normalizer/TestCanonicalIterator.java
+++ b/icu4j/src/com/ibm/icu/dev/test/normalizer/TestCanonicalIterator.java
@ -0,0 +1,70 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 1996-2000, International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ *
+ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/normalizer/TestCanonicalIterator.java,v $ 
+ * $Date: 2002/02/01 02:05:35 $ 
+ * $Revision: 1.1 $
+ *
+ *****************************************************************************************
+ */
+package com.ibm.test.normalizer;
+
+import com.ibm.test.*;
+import com.ibm.text.*;
+import com.ibm.util.Utility;
+import java.text.CharacterIterator;
+import java.text.StringCharacterIterator;
+import java.util.*;
+
+// TODO: fit into test framework
+
+/**
+ * Manual smoke test for CanonicalIterator. It prints the safe-start set, the set of
+ * characters whose decomposition starts with 'a', the permutations of "ABC", and then
+ * every canonically equivalent form of each sample string. Nothing is asserted; the
+ * output is meant to be inspected by eye.
+ */
+public class TestCanonicalIterator {
+    /**
+     * Sample inputs. All non-ASCII characters are written as Unicode escapes so that the
+     * test data does not depend on the encoding used to compile this file.
+     */
+    static final String testArray[] = {
+        "\u00C5d\u0307\u0327", // LATIN CAPITAL LETTER A WITH RING ABOVE, d, dot above, cedilla
+        "\u010d\u017E",
+        "x\u0307\u0327",
+    };
+
+    /**
+     * Runs the smoke test and writes all results to System.out.
+     * @param args ignored
+     */
+    public static void main(String[] args) {
+        // set up for readable display
+        Transliterator name = Transliterator.getInstance("name");
+        Transliterator hex = Transliterator.getInstance("hex");
+
+        // check build
+        UnicodeSet ss = CanonicalIterator.getSafeStart();
+        System.out.println("Safe Start: " + ss.toPattern(true));
+        System.out.println();
+        ss = CanonicalIterator.getStarts('a');
+        System.out.println("Characters with 'a' at the start of their decomposition: " + ss.toPattern(true));
+
+        // check permute
+        System.out.println(collectionToString(CanonicalIterator.permute("ABC")));
+
+        // try samples
+        for (int i = 0; i < testArray.length; ++i) {
+            System.out.println();
+            System.out.println("Results for: " + name.transliterate(testArray[i]));
+            CanonicalIterator it = new CanonicalIterator(testArray[i]);
+            int counter = 0;
+            while (true) {
+                String result = it.next();
+                if (result == null) break; // null marks the end of the iteration
+                System.out.println(++counter + ": " + hex.transliterate(result));
+                System.out.println(" = " + name.transliterate(result));
+            }
+        }
+    }
+
+    /**
+     * Joins the string forms of the elements of a collection, in iteration order.
+     * @param col the collection to display
+     * @return the elements' toString() values separated by ", "
+     */
+    static String collectionToString(Collection col) {
+        StringBuffer result = new StringBuffer();
+        Iterator it = col.iterator();
+        while (it.hasNext()) {
+            if (result.length() != 0) result.append(", ");
+            result.append(it.next().toString());
+        }
+        return result.toString();
+    }
+}
--- a/icu4j/src/com/ibm/icu/text/CanonicalIterator.java
+++ b/icu4j/src/com/ibm/icu/text/CanonicalIterator.java
@ -0,0 +1,439 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 1996-2000, International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ *
+ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CanonicalIterator.java,v $ 
+ * $Date: 2002/02/01 02:05:35 $ 
+ * $Revision: 1.1 $
+ *
+ *****************************************************************************************
+ */
+package com.ibm.text;
+import com.ibm.util.Utility;
+import java.util.Enumeration;
+import java.util.Vector;
+
+import java.util.*;
+
+/**
+ * This class allows one to iterate through all the strings that are canonically equivalent to a given
+ * string. For example, here are some sample results:
+Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
+1: \u0041\u030A\u0064\u0307\u0327
+ = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
+2: \u0041\u030A\u0064\u0327\u0307
+ = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
+3: \u0041\u030A\u1E0B\u0327
+ = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
+4: \u0041\u030A\u1E11\u0307
+ = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
+5: \u00C5\u0064\u0307\u0327
+ = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
+6: \u00C5\u0064\u0327\u0307
+ = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
+7: \u00C5\u1E0B\u0327
+ = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
+8: \u00C5\u1E11\u0307
+ = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
+9: \u212B\u0064\u0307\u0327
+ = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
+10: \u212B\u0064\u0327\u0307
+ = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
+11: \u212B\u1E0B\u0327
+ = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
+12: \u212B\u1E11\u0307
+ = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
+ *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
+ * since it has not been optimized for that situation.
+ *@author M. Davis
+ *@draft
+ */
+
+public class CanonicalIterator {
+    /**
+     * Creates an iterator over the strings that are canonically equivalent to the given
+     * string. All setup is delegated to setSource.
+     *@param source string to get results for
+     */
+    public CanonicalIterator(String source) {
+        setSource(source);
+    }
+    
+    /**
+     * Gets the string this iterator is working on.
+     *@return the stored source. NOTE: this is the decomposed (NFD) form that setSource
+     * computed, not necessarily the exact string that was passed in.
+     */
+    public String getSource() {
+      return source;
+    }
+    
+    /**
+     * Rewinds the iteration so that the next call to next() returns the first result
+     * for the current source string again.
+     */
+    public void reset() {
+        done = false;
+        // every segment goes back to its first alternative
+        Arrays.fill(current, 0);
+    }
+    
+    /**
+     *@return the next string that is canonically equivalent. The value null is returned when
+     * the iteration is done.
+     */
+    public String next() {
+        if (done) return null;
+        
+        // construct return value
+        
+        buffer.setLength(0); // delete old contents
+        for (int i = 0; i < pieces.length; ++i) {
+            buffer.append(pieces[i][current[i]]);
+        }
+        String result = buffer.toString();
+        
+        // find next value for next time
+        
+        for (int i = current.length - 1; ; --i) {
+            if (i < 0) {
+                done = true;
+                break;
+            }
+            current[i]++;
+            if (current[i] < pieces[i].length) break; // got sequence
+            current[i] = 0;
+        }
+        return result;
+    }
+    
+    /**
+     * Sets the source string to iterate against and restarts the iteration. This allows
+     * the same iterator to be reused for a different string, saving object creation.
+     * <p>The string is decomposed, cut into segments at every "safe start" character,
+     * and the equivalents of each segment are computed up front.
+     *@param newSource the new source string; may be empty, in which case the only
+     * result of the iteration is the empty string
+     */
+    public void setSource(String newSource) {
+        source = Normalizer.normalize(newSource, Normalizer.DECOMP, 0);
+        done = false;
+
+        // degenerate case: nothing to segment, and the only equivalent is ""
+        if (source.length() == 0) {
+            pieces = new String[][] { { "" } };
+            current = new int[1];
+            return;
+        }
+
+        // find the segments
+        List list = new ArrayList();
+        int start = 0;
+        // The first code point always belongs to the first segment, so the scan starts
+        // just after it. It may occupy two chars, so its width must be measured.
+        int cp = UTF16.charAt(source, 0);
+        int i = UTF16.getCharCount(cp);
+        while (i < source.length()) {
+            cp = UTF16.charAt(source, i);
+            if (SAFE_START.contains(cp)) {
+                list.add(source.substring(start, i)); // add up to i
+                start = i;
+            }
+            // step by the width of the code point just read (not of the index)
+            i += UTF16.getCharCount(cp);
+        }
+        list.add(source.substring(start)); // add last one
+
+        // allocate the arrays, and find the strings that are CE to each segment
+        pieces = new String[list.size()][];
+        current = new int[list.size()];
+        for (i = 0; i < pieces.length; ++i) {
+            if (PROGRESS) System.out.println("SEGMENT");
+            pieces[i] = getEquivalents((String) list.get(i));
+        }
+    }
+    
+    /**
+     * Dumb recursive implementation of permutation. 
+     * TODO: optimize
+     * @param source the string to find permutations for
+     * @return the results in a set.
+     */
+    public static Set permute(String source) {
+        //if (PROGRESS) System.out.println("Permute: " + source);
+        Set result = new TreeSet();
+        
+        // optimization:
+        // if zero or one character, just return a set with it
+        // we check for length < 2 to keep from counting code points all the time
+        if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
+            result.add(source);
+            return result;
+        }
+        
+        // otherwise iterate through the string, and recursively permute all the other characters
+        int cp;
+        for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
+            cp = UTF16.charAt(source, i);
+            String chStr = UTF16.valueOf(source, i);
+            
+            // see what the permutations of the characters before and after this one are
+            Set subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp)));
+            
+            // prefix this character to all of them
+            Iterator it = subpermute.iterator();
+            while (it.hasNext()) {
+                String piece = chStr + (String) it.next();
+                //if (PROGRESS) System.out.println("  Piece: " + piece);
+                result.add(piece);
+            }
+        }
+        return result;
+    }
+    
+    // FOR TESTING
+    
+    /**
+     * Exposes the segment-boundary set, for testing. The result is a clone, so callers
+     * cannot modify the internal set.
+     *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
+     */
+    public static UnicodeSet getSafeStart() {
+        return (UnicodeSet) SAFE_START.clone();
+    }
+    
+    /**
+     * Looks up which characters have a decomposition beginning with the given one.
+     *@param cp the code point to look up
+     *@return a fresh copy of the set of characters whose decompositions start with cp;
+     * an empty set if there are none
+     */
+    public static UnicodeSet getStarts(int cp) {
+        UnicodeSet starts = AT_START.get(cp);
+        UnicodeSet toCopy = (starts != null) ? starts : EMPTY;
+        // always hand out a clone so the shared data cannot be modified by callers
+        return (UnicodeSet) toCopy.clone();
+    }
+    
+    
+    // ===================== PRIVATES ==============================
+    
+    // debug
+    // When PROGRESS is true the class traces its work on System.out.
+    private static boolean PROGRESS = false; // debug progress
+    // Only created when tracing is enabled; every use is guarded by PROGRESS.
+    private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;
+ 
+    // fields
+    private String source;     // decomposed form of the string given to setSource
+    private boolean done;      // true once next() has returned the last combination
+    private String[][] pieces; // pieces[i] = all equivalents of the i-th segment of source
+    private int[] current;     // current[i] = index into pieces[i] used by the next result
+    // Note: C will need two more fields, since arrays there don't have lengths
+    // int pieces_length;
+    // int[] pieces_lengths;
+    
+    // transient fields
+    // scratch buffer reused by next() to assemble each result
+    private transient StringBuffer buffer = new StringBuffer();
+    
+    
+    /**
+     * Finds all strings that are canonically equivalent to one segment.
+     * The composition variants come from getEquivalents2; each variant is then permuted,
+     * and a permutation is kept only if it decomposes back to the segment.
+     * TODO: optimize by not permuting any class zero.
+     * @param segment a segment of the source, already in NFD
+     * @return the equivalents of the segment, in sorted order
+     */
+    private String[] getEquivalents(String segment) {
+        Set equivalents = new TreeSet();
+
+        for (Iterator variants = getEquivalents2(segment).iterator(); variants.hasNext(); ) {
+            String variant = (String) variants.next();
+            for (Iterator orderings = permute(variant).iterator(); orderings.hasNext(); ) {
+                String possible = (String) orderings.next();
+                String decomposed = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
+                boolean equivalent = decomposed.equals(segment);
+                if (PROGRESS) {
+                    String label = equivalent ? "Adding Permutation: " : "-Skipping Permutation: ";
+                    System.out.println(label + NAME.transliterate(possible));
+                }
+                if (equivalent) {
+                    equivalents.add(possible);
+                }
+            }
+        }
+
+        // hand back a plain array to keep storage small
+        return (String[]) equivalents.toArray(new String[equivalents.size()]);
+    }
+    
+    /**
+     * Collects the segment itself plus every string obtained by replacing a run of the
+     * segment with a single character whose decomposition matches that run.
+     * Works together with extract, which calls back into this method for the remainder.
+     * @param segment the string to find composed variants of
+     * @return a sorted set that always contains segment itself
+     */
+    private Set getEquivalents2(String segment) {
+        Set result = new TreeSet();
+        if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
+        result.add(segment);
+        // scratch space handed to extract, which clears it at the start of each call
+        StringBuffer workingBuffer = new StringBuffer();
+        
+        // cycle through all the characters
+        int cp;
+        for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
+            // see if any character is at the start of some decomposition
+            cp = UTF16.charAt(segment, i);
+            UnicodeSet starts = AT_START.get(cp);
+            if (starts == null) continue; // nothing decomposes to something starting with cp
+            UnicodeSetIterator usi = new UnicodeSetIterator(starts);
+            // if so, see which decompositions match
+            while (true) {
+                int cp2 = usi.next();
+                if (cp2 < 0) break; // done
+                Set remainder = extract(cp2, segment, i, workingBuffer);
+                if (remainder == null) continue; // cp2 does not fit at this position
+                
+                // there were some matches, so add all the possibilities to the set.
+                // each result is: the untouched text before i, then cp2, then one
+                // variant of what is left over
+                String prefix = segment.substring(0, i) + UTF16.valueOf(cp2);
+                Iterator it = remainder.iterator();
+                while (it.hasNext()) {
+                    String item = (String) it.next();
+                    if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item));
+                    result.add(prefix + item);
+                }
+            }
+        }
+        return result;
+    }
+    
+    /**
+     * See if the decomposition of comp can be found in segment starting at segmentPos
+     * (with canonical rearrangement!), i.e. as a subsequence whose first character is
+     * the one at segmentPos. If so, take the remainder (the characters that were not
+     * consumed by the match), and return the equivalents of that remainder.
+     * @param comp the candidate composite character
+     * @param segment the string being matched against
+     * @param segmentPos index in segment at which the match must begin
+     * @param buffer scratch buffer supplied by the caller; it is cleared here and used
+     *        to collect the remainder (note that it hides the instance field of the same name)
+     * @return null if comp cannot replace text at segmentPos; otherwise the set of
+     *         variants of the remainder (a set holding just "" when nothing is left over)
+     */
+    private Set extract(int comp, String segment, int segmentPos, StringBuffer buffer) {
+        if (PROGRESS) System.out.println(" extract: " + NAME.transliterate(UTF16.valueOf(comp))
+            + ", " + NAME.transliterate(segment.substring(segmentPos)));
+        String decomp = Normalizer.normalize(UTF16.valueOf(comp), Normalizer.DECOMP, 0);
+        
+        // See if it matches the start of segment (at segmentPos)
+        boolean ok = false;
+        int cp;
+        // decompCp is the decomposition character we are currently looking for;
+        // decompPos always points just past it
+        int decompPos = 0;
+        int decompCp = UTF16.charAt(decomp,0);
+        decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char
+        //int decompClass = getClass(decompCp);
+        buffer.setLength(0); // initialize working buffer, shared among callees
+        
+        for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) {
+            cp = UTF16.charAt(segment, i);
+            if (cp == decompCp) { // if equal, eat another cp from decomp
+                if (PROGRESS) System.out.println("  matches: " + NAME.transliterate(UTF16.valueOf(cp)));
+                if (decompPos == decomp.length()) { // done, have all decomp characters!
+                    buffer.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars
+                    ok = true;
+                    break;
+                }
+                decompCp = UTF16.charAt(decomp, decompPos);
+                decompPos += UTF16.getCharCount(decompCp);
+                //decompClass = getClass(decompCp);
+            } else {
+                if (PROGRESS) System.out.println("  buffer: " + NAME.transliterate(UTF16.valueOf(cp)));
+                // brute force approach
+                // a segment character that is not part of the decomposition is kept
+                // for the remainder
+                UTF16.append(buffer, cp);
+                /* TODO: optimize
+                // since we know that the classes are monotonically increasing, after zero
+                // e.g. 0 5 7 9 0 3
+                // we can do an optimization
+                // there are only a few cases that work: zero, less, same, greater
+                // if both classes are the same, we fail
+                // if the decomp class < the segment class, we fail
+        
+                segClass = getClass(cp);
+                if (decompClass <= segClass) return null;
+                */
+            }
+        }
+        if (!ok) return null; // we failed, characters left over
+        if (PROGRESS) System.out.println("Matches");
+        if (buffer.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder
+        String remainder = buffer.toString();
+        
+        // brute force approach
+        // check to make sure result is canonically equivalent:
+        // decomposing comp + remainder must reproduce the tail of segment from segmentPos
+        String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
+        if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
+        
+        // get the remaining combinations
+        return getEquivalents2(remainder);
+    }
+    
+    // Returns the canonical combining class of cp, as reported by Normalizer.getClass.
+    // NOTE: the argument is narrowed to a char, so only the low 16 bits of cp are used;
+    // a supplementary code point would be looked up as a different character.
+    // TODO: fix once we have a codepoint interface to get the canonical combining class
+    // TODO: Need public access to canonical combining class in UCharacter!
+    private static int getClass(int cp) {
+        return Normalizer.getClass((char)cp);
+    }
+    
+   // ================= BUILDER =========================
+    // TODO: Flatten this data so it doesn't have to be reconstructed each time!
+    
+    // Returned (as a clone) by getStarts when a character has no entry in AT_START.
+    private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change
+    // Despite the name, this holds the empty string "" (not null). extract returns it
+    // when a match leaves no remainder.
+    private static final Set SET_WITH_NULL_STRING = new TreeSet(); // constant, don't change
+    static {
+        SET_WITH_NULL_STRING.add("");
+    }
+    
+    // Characters at which setSource starts a new segment; filled in by buildData.
+    private static UnicodeSet SAFE_START = new UnicodeSet();
+    // Maps a character to the set of characters whose decomposition starts with it;
+    // filled in by buildData.
+    private static CharMap AT_START = new CharMap();
+    
+        // WARNING, NORMALIZER doesn't have supplementaries yet;
+        // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
+    private static int LAST_UNICODE = 0xFFFF;
+    // Must stay below the declarations above: buildData uses SAFE_START, AT_START and
+    // LAST_UNICODE, and static initializers run in textual order.
+    static {
+        buildData();
+    }
+    
+    /**
+     * Fills SAFE_START and AT_START by scanning every code point up to LAST_UNICODE.
+     * Pass 1 puts every combining-class-zero character into SAFE_START.
+     * Pass 2 decomposes each character: the first character of the decomposition is
+     * recorded in AT_START, and any later class-zero character is removed from
+     * SAFE_START, because it can occur in a non-initial position.
+     */
+    private static void buildData() {
+
+        if (PROGRESS) System.out.println("Getting Safe Start");
+        for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
+            if (PROGRESS && (cp & 0x7FF) == 0) System.out.print('.');
+            // provisional: pass 2 removes the ones that are not really safe
+            if (getClass(cp) == 0) {
+                SAFE_START.add(cp);
+            }
+        }
+        if (PROGRESS) {
+            System.out.println();
+            System.out.println("Getting Containment");
+        }
+
+        for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
+            if (PROGRESS && (cp & 0x7FF) == 0) System.out.print('.');
+            // TODO: For efficiency, need extra function plus overloads
+            // Normalizer.normalizationDiffers(String source,...)
+            // Normalizer.normalizationDiffers(int char32,...)
+            // Normalizer.normalize(char32,...);
+            String single = UTF16.valueOf(cp);
+            String decomposed = Normalizer.normalize(single, Normalizer.DECOMP, 0);
+            if (!decomposed.equals(single)) {
+                // walk the decomposition one code point at a time
+                int pos = 0;
+                while (pos < decomposed.length()) {
+                    int component = UTF16.charAt(decomposed, pos);
+                    if (pos == 0) {
+                        AT_START.add(component, cp);
+                    } else if (getClass(component) == 0) {
+                        SAFE_START.remove(component);
+                    }
+                    pos += UTF16.getCharCount(component);
+                }
+            }
+        }
+        if (PROGRESS) System.out.println();
+    }
+    
+    // the following is just for a map from characters to a set of characters
+    
+    /**
+     * Maps a code point to a UnicodeSet of code points.
+     * Lookups reuse one MutableInt ("probe") to avoid allocating a key object per call.
+     * Because of that shared probe, instances are not safe for concurrent use.
+     */
+    private static class CharMap {
+        Map storage = new HashMap();
+        MutableInt probe = new MutableInt(); // scratch key, used for lookups only
+        boolean converted = false;
+        
+        /**
+         * Adds whatItIsIn to the set associated with cp, creating the set if needed.
+         */
+        public void add(int cp, int whatItIsIn) {
+            UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp));
+            if (result == null) {
+                result = new UnicodeSet();
+                // Store a key of its own. The probe must never be put into the map:
+                // it is overwritten on every lookup, and a key whose value changes
+                // while it is in a map leaves the map's behavior unspecified.
+                storage.put(new MutableInt().set(cp), result);
+            }
+            result.add(whatItIsIn);
+        }
+        
+        /**
+         * @return the set associated with cp, or null if there is none. The returned
+         * set is the stored instance, not a copy.
+         */
+        public UnicodeSet get(int cp) {
+            return (UnicodeSet) storage.get(probe.set(cp));
+        }
+    }
+            
+    /**
+     * A mutable int holder, usable as a hash key as long as it is not modified
+     * while it is stored in a map.
+     */
+    private static class MutableInt {
+        public int contents;
+        public int hashCode() { return contents; }
+        public boolean equals(Object other) {
+            // per the equals contract, null and foreign types compare unequal
+            // rather than throwing
+            return other instanceof MutableInt
+                && ((MutableInt) other).contents == contents;
+        }
+        // allows chaining
+        public MutableInt set(int contents) {
+            this.contents = contents;
+            return this;
+        }
+    }
+}
+    
--- a/icu4j/src/com/ibm/icu/text/UnicodeSetIterator.java
+++ b/icu4j/src/com/ibm/icu/text/UnicodeSetIterator.java
@ -0,0 +1,119 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 1996-2000, International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ *
+ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeSetIterator.java,v $ 
+ * $Date: 2002/02/01 02:05:35 $ 
+ * $Revision: 1.1 $
+ *
+ *****************************************************************************************
+ */
+package com.ibm.text;
+
+import com.ibm.util.Utility;
+//import java.text.*;
+import java.util.*;
+import java.io.*;
+
+/**
+ * Class that allows simple iteration over the code points of a UnicodeSet, in range
+ * order. Typical use:
+ * <pre>
+ * UnicodeSetIterator it = new UnicodeSetIterator(set);
+ * for (int cp = it.next(); cp &gt;= 0; cp = it.next()) { ... }
+ * </pre>
+ * @author M. Davis
+ * @draft
+ */
+public final class UnicodeSetIterator {
+
+    /**
+     * Creates an iterator positioned before the first code point of the set.
+     *@param set set to iterate over
+     */
+    public UnicodeSetIterator(UnicodeSet set) {
+        reset(set);
+    }
+
+    /**
+     * Advances to the next code point.
+     *@return next character in the set. Returns -1 when done!
+     */
+    public int next() {
+        if (abbreviated) {
+            // skip the middle of long ranges (currently never enabled)
+            if (element >= startElement + 50 && element <= endElement - 50) {
+                element = endElement - 50;
+            }
+        }
+        // still inside the current range
+        if (element < endElement) {
+            return ++element;
+        }
+        // current range is used up: move to the next one, if any
+        if (range >= endRange) return -1;
+        ++range;
+        endElement = set.getRangeEnd(range);
+        startElement = set.getRangeStart(range);
+        element = startElement;
+        return element;
+    }
+
+    /**
+     * Switches to a different set and restarts the iteration.
+     *@param set the set to iterate over. This allows reuse of the iterator.
+     */
+    public void reset(UnicodeSet set) {
+        this.set = set;
+        endRange = set.getRangeCount() - 1;
+        resetInternal();
+    }
+
+    /**
+     * Resets to the start, to allow the iteration to start over again.
+     * The range count is re-read from the set.
+     */
+    public void reset() {
+        endRange = set.getRangeCount() - 1;
+        resetInternal();
+    }
+
+    /**
+     * TODO: Move to UnicodeSet!
+     *@param set the set to test against
+     *@param s the string to test
+     *@return true if and only if no character from s are in the set.
+     */
+    public static boolean containsNone(UnicodeSet set, String s) {
+        int cp;
+        // step by the width of the code point just read
+        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
+            cp = UTF16.charAt(s, i);
+            if (set.contains(cp)) return false;
+        }
+        return true;
+    }
+
+    /**
+     * TODO: Move to UnicodeSet!
+     *@param set the set to test against
+     *@param s the string to test
+     *@return true if and only if all characters from s are in the set.
+     */
+    public static boolean containsAll(UnicodeSet set, String s) {
+        int cp;
+        // step by the width of the code point just read
+        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
+            cp = UTF16.charAt(s, i);
+            if (!set.contains(cp)) return false;
+        }
+        return true;
+    }
+
+    // ======================= PRIVATES ===========================
+
+    private UnicodeSet set;
+    private int endRange = 0;       // index of the last range, -1 for an empty set
+    private int range = 0;          // index of the range being iterated, -1 before the first
+    private int startElement = 0;   // first code point of the current range
+    private int endElement;         // last code point of the current range
+    private int element;            // code point most recently returned
+    private boolean abbreviated = false;
+
+    /**
+     * Positions the iterator before the first range. No range is loaded here: the
+     * first call to next() loads range 0 and returns its first code point. (Loading
+     * range 0 here would make next() skip that first code point, because next()
+     * increments before returning while inside a range.)
+     */
+    private void resetInternal() {
+        range = -1;
+        startElement = 0;
+        element = 0;
+        endElement = -1; // element > endElement forces next() to load a range
+    }
+}
--- a/icu4j/src/com/ibm/test/normalizer/TestCanonicalIterator.java
+++ b/icu4j/src/com/ibm/test/normalizer/TestCanonicalIterator.java
@ -0,0 +1,70 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 1996-2000, International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ *
+ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/normalizer/Attic/TestCanonicalIterator.java,v $ 
+ * $Date: 2002/02/01 02:05:35 $ 
+ * $Revision: 1.1 $
+ *
+ *****************************************************************************************
+ */
+package com.ibm.test.normalizer;
+
+import com.ibm.test.*;
+import com.ibm.text.*;
+import com.ibm.util.Utility;
+import java.text.CharacterIterator;
+import java.text.StringCharacterIterator;
+import java.util.*;
+
+// TODO: fit into test framework
+
+/**
+ * Manual smoke test for CanonicalIterator. It prints the safe-start set, the set of
+ * characters whose decomposition starts with 'a', the permutations of "ABC", and then
+ * every canonically equivalent form of each sample string. Nothing is asserted; the
+ * output is meant to be inspected by eye.
+ */
+public class TestCanonicalIterator {
+    /**
+     * Sample inputs. All non-ASCII characters are written as Unicode escapes so that the
+     * test data does not depend on the encoding used to compile this file.
+     */
+    static final String testArray[] = {
+        "\u00C5d\u0307\u0327", // LATIN CAPITAL LETTER A WITH RING ABOVE, d, dot above, cedilla
+        "\u010d\u017E",
+        "x\u0307\u0327",
+    };
+
+    /**
+     * Runs the smoke test and writes all results to System.out.
+     * @param args ignored
+     */
+    public static void main(String[] args) {
+        // set up for readable display
+        Transliterator name = Transliterator.getInstance("name");
+        Transliterator hex = Transliterator.getInstance("hex");
+
+        // check build
+        UnicodeSet ss = CanonicalIterator.getSafeStart();
+        System.out.println("Safe Start: " + ss.toPattern(true));
+        System.out.println();
+        ss = CanonicalIterator.getStarts('a');
+        System.out.println("Characters with 'a' at the start of their decomposition: " + ss.toPattern(true));
+
+        // check permute
+        System.out.println(collectionToString(CanonicalIterator.permute("ABC")));
+
+        // try samples
+        for (int i = 0; i < testArray.length; ++i) {
+            System.out.println();
+            System.out.println("Results for: " + name.transliterate(testArray[i]));
+            CanonicalIterator it = new CanonicalIterator(testArray[i]);
+            int counter = 0;
+            while (true) {
+                String result = it.next();
+                if (result == null) break; // null marks the end of the iteration
+                System.out.println(++counter + ": " + hex.transliterate(result));
+                System.out.println(" = " + name.transliterate(result));
+            }
+        }
+    }
+
+    /**
+     * Joins the string forms of the elements of a collection, in iteration order.
+     * @param col the collection to display
+     * @return the elements' toString() values separated by ", "
+     */
+    static String collectionToString(Collection col) {
+        StringBuffer result = new StringBuffer();
+        Iterator it = col.iterator();
+        while (it.hasNext()) {
+            if (result.length() != 0) result.append(", ");
+            result.append(it.next().toString());
+        }
+        return result.toString();
+    }
+}
--- a/icu4j/src/com/ibm/text/CanonicalIterator.java
+++ b/icu4j/src/com/ibm/text/CanonicalIterator.java
@ -0,0 +1,439 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 1996-2000, International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ *
+ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/CanonicalIterator.java,v $ 
+ * $Date: 2002/02/01 02:05:35 $ 
+ * $Revision: 1.1 $
+ *
+ *****************************************************************************************
+ */
+package com.ibm.text;
+import com.ibm.util.Utility;
+import java.util.Enumeration;
+import java.util.Vector;
+
+import java.util.*;
+
+/**
+ * This class allows one to iterate through all the strings that are canonically equivalent to a given
+ * string. For example, here are some sample results:
+ *<pre>
+ * Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
+ * 1: \u0041\u030A\u0064\u0307\u0327
+ *  = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
+ * 2: \u0041\u030A\u0064\u0327\u0307
+ *  = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
+ * 3: \u0041\u030A\u1E0B\u0327
+ *  = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
+ * 4: \u0041\u030A\u1E11\u0307
+ *  = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
+ * 5: \u00C5\u0064\u0307\u0327
+ *  = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
+ * 6: \u00C5\u0064\u0327\u0307
+ *  = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
+ * 7: \u00C5\u1E0B\u0327
+ *  = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
+ * 8: \u00C5\u1E11\u0307
+ *  = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
+ * 9: \u212B\u0064\u0307\u0327
+ *  = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
+ * 10: \u212B\u0064\u0327\u0307
+ *  = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
+ * 11: \u212B\u1E0B\u0327
+ *  = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
+ * 12: \u212B\u1E11\u0307
+ *  = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
+ *</pre>
+ *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
+ * since it has not been optimized for that situation.
+ *@author M. Davis
+ *@draft
+ */
+
+public class CanonicalIterator {
+    /**
+     * Creates an iterator over the strings that are canonically equivalent to source.
+     * All set-up work is delegated to setSource.
+     *@param source string to get results for
+     */
+    public CanonicalIterator(String source) {
+        setSource(source);
+    }
+    
+    /**
+     * Returns the string this iterator is working on, as stored by setSource.
+     *@return the source. NOTE: this is the NFD form of the string that was passed in
+     * (setSource stores the result of Normalizer.normalize with Normalizer.DECOMP),
+     * not necessarily the original string.
+     */
+    public String getSource() {
+      return source;
+    }
+    
+    /**
+     * Rewinds the iterator: clears the done flag and puts every per-segment
+     * index back to zero, so the following next() call starts from the first
+     * combination again.
+     */
+    public void reset() {
+        done = false;
+        Arrays.fill(current, 0);
+    }
+    
+    /**
+     * Builds the string for the current combination of per-segment choices and
+     * then advances to the following combination.
+     *@return the next string that is canonically equivalent, or null once every
+     * combination has been returned.
+     */
+    public String next() {
+        if (done) {
+            return null;
+        }
+
+        // Concatenate the currently selected alternative of every segment.
+        buffer.setLength(0);
+        for (int seg = 0; seg < pieces.length; ++seg) {
+            String[] alternatives = pieces[seg];
+            buffer.append(alternatives[current[seg]]);
+        }
+        String result = buffer.toString();
+
+        // Advance like an odometer, last segment first: bump a digit, and when it
+        // overflows reset it to zero and carry into the digit on its left.
+        int slot = current.length - 1;
+        while (slot >= 0 && ++current[slot] >= pieces[slot].length) {
+            current[slot] = 0;
+            slot--;
+        }
+        // A carry out of the leftmost digit means all combinations were produced.
+        if (slot < 0) {
+            done = true;
+        }
+        return result;
+    }
+    
+    /**
+     * Sets the source string to iterate against. This allows the same iterator to be used
+     * while changing the source string, saving object creation.
+     * The string is normalized with Normalizer.DECOMP, cut into segments at every
+     * SAFE_START code point (other than the very first code point), and the
+     * alternatives for each segment are computed with getEquivalents.
+     * An empty source is treated as one empty segment rather than failing.
+     *@param newSource the new string to iterate over
+     */
+    public void setSource(String newSource) {
+        source = Normalizer.normalize(newSource, Normalizer.DECOMP, 0);
+        done = false;
+
+        // find the segments
+        List list = new ArrayList();
+        int start = 0;
+        int cp;
+        // Step by the width of the code point just read (not by the width of the
+        // index value), so a surrogate pair is never visited or split in the middle.
+        for (int pos = 0; pos < source.length(); pos += UTF16.getCharCount(cp)) {
+            cp = UTF16.charAt(source, pos);
+            // the first code point always opens the first segment, so it is never a cut point
+            if (pos > 0 && SAFE_START.contains(cp)) {
+                list.add(source.substring(start, pos)); // add up to pos
+                start = pos;
+            }
+        }
+        // add last one; substring(start) is "" for an empty source, where the old
+        // substring(start, 1) threw StringIndexOutOfBoundsException
+        list.add(source.substring(start));
+
+        // allocate the arrays, and find the strings that are CE to each segment
+        pieces = new String[list.size()][];
+        current = new int[list.size()];
+        for (int i = 0; i < pieces.length; ++i) {
+            if (PROGRESS) System.out.println("SEGMENT");
+            pieces[i] = getEquivalents((String) list.get(i));
+        }
+    }
+    
+    /**
+     * Simple recursive permutation generator, working on code points rather than
+     * on chars.
+     * TODO: optimize
+     * @param source the string to find permutations for
+     * @return a sorted set (TreeSet) containing every distinct ordering of the
+     * code points of source; for a string of zero or one code point, a set
+     * holding just source.
+     */
+    public static Set permute(String source) {
+        Set permutations = new TreeSet();
+
+        // Base case: zero or one code point. The cheap length test comes first so
+        // that code points are only counted for very short strings.
+        if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
+            permutations.add(source);
+            return permutations;
+        }
+
+        // Pick each code point in turn as the leading one and permute the rest.
+        int pos = 0;
+        while (pos < source.length()) {
+            int lead = UTF16.charAt(source, pos);
+            int afterLead = pos + UTF16.getCharCount(lead);
+            String leadStr = UTF16.valueOf(source, pos);
+
+            // everything except the chosen code point
+            String others = source.substring(0, pos) + source.substring(afterLead);
+
+            // prefix the chosen code point to every ordering of the others
+            for (Iterator tails = permute(others).iterator(); tails.hasNext();) {
+                permutations.add(leadStr + (String) tails.next());
+            }
+            pos = afterLead;
+        }
+        return permutations;
+    }
+    
+    // FOR TESTING
+    
+    /**
+     * Returns a copy of the safe-start set, so callers cannot modify the shared data.
+     *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
+     */
+    public static UnicodeSet getSafeStart() {
+        return (UnicodeSet) SAFE_START.clone();
+    }
+    
+    /**
+     * Looks up, in the data built by buildData, which characters have a
+     * decomposition beginning with cp.
+     *@param cp the code point to look up
+     *@return a fresh copy of the set of characters whose decompositions start with the
+     * given character; an empty set when there are none
+     */
+    public static UnicodeSet getStarts(int cp) {
+        UnicodeSet starts = AT_START.get(cp);
+        // always hand out a clone, never the shared instance
+        return (UnicodeSet) (starts != null ? starts : EMPTY).clone();
+    }
+    
+    
+    // ===================== PRIVATES ==============================
+    
+    // debug
+    // When true, the class prints trace output to System.out while it works.
+    private static boolean PROGRESS = false; // debug progress
+    // Only created when PROGRESS is on; used to print character names in trace output.
+    private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;
+ 
+    // fields
+    // the string being iterated over, in the decomposed form produced by setSource
+    private String source;
+    // set by next() once the last combination has been returned
+    private boolean done;
+    // pieces[i] holds the alternatives computed by getEquivalents for segment i
+    private String[][] pieces;
+    // current[i] is the index into pieces[i] used for the next result
+    private int[] current;
+    // Note: C will need two more fields, since arrays there don't have lengths
+    // int pieces_length;
+    // int[] pieces_lengths;
+    
+    // transient fields
+    // scratch buffer reused by next() to assemble each result
+    private transient StringBuffer buffer = new StringBuffer();
+    
+    
+    /**
+     * Given one segment in NFD, finds all the strings that are canonically
+     * equivalent to it: takes every candidate from getEquivalents2, tries every
+     * permutation of each candidate, and keeps those whose decomposition
+     * (Normalizer.DECOMP) is exactly the segment.
+     * TODO: optimize by not permuting any class zero.
+     * @param segment the segment, already in NFD
+     * @return the accepted strings, in TreeSet order
+     */
+    private String[] getEquivalents(String segment) {
+        Set accepted = new TreeSet();
+
+        for (Iterator candidates = getEquivalents2(segment).iterator(); candidates.hasNext();) {
+            String candidate = (String) candidates.next();
+
+            for (Iterator orderings = permute(candidate).iterator(); orderings.hasNext();) {
+                String possible = (String) orderings.next();
+                String decomposed = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
+
+                // only orderings that decompose back to the segment are equivalent
+                if (decomposed.equals(segment)) {
+                    if (PROGRESS) {
+                        System.out.println("Adding Permutation: " + NAME.transliterate(possible));
+                    }
+                    accepted.add(possible);
+                } else if (PROGRESS) {
+                    System.out.println("-Skipping Permutation: " + NAME.transliterate(possible));
+                }
+            }
+        }
+
+        // hand back a plain array so the set can be discarded
+        return (String[]) accepted.toArray(new String[accepted.size()]);
+    }
+    
+    /**
+     * Collects candidate strings for a segment: the segment itself plus, for every
+     * position i and every character cp2 whose decomposition starts with the code
+     * point at i (per AT_START), the strings formed as
+     * segment[0..i) + cp2 + each string returned by extract(cp2, segment, i, ...).
+     * Mutually recursive with extract, which calls back into this method for the
+     * remainder it leaves over.
+     * @param segment the text to find candidates for
+     * @return a TreeSet of candidate strings; always contains segment
+     */
+    private Set getEquivalents2(String segment) {
+        Set result = new TreeSet();
+        if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
+        result.add(segment);
+        // scratch buffer handed to every extract call made from this invocation
+        StringBuffer workingBuffer = new StringBuffer();
+        
+        // cycle through all the characters
+        int cp;
+        for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
+            // see if any character is at the start of some decomposition
+            cp = UTF16.charAt(segment, i);
+            UnicodeSet starts = AT_START.get(cp);
+            if (starts == null) continue;
+            UnicodeSetIterator usi = new UnicodeSetIterator(starts);
+            // if so, see which decompositions match 
+            while (true) {
+                int cp2 = usi.next();
+                if (cp2 < 0) break; // done
+                // null means cp2's decomposition could not be matched at position i
+                Set remainder = extract(cp2, segment, i, workingBuffer);
+                if (remainder == null) continue;
+                
+                // there were some matches, so add all the possibilities to the set.
+                String prefix = segment.substring(0, i) + UTF16.valueOf(cp2);
+                Iterator it = remainder.iterator();
+                while (it.hasNext()) {
+                    String item = (String) it.next();
+                    if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item));
+                    result.add(prefix + item);
+                }
+            }
+        }
+        return result;
+    }
+    
+    /**
+     * See if the decomposition of comp can be found in segment starting at segmentPos
+     * (with canonical rearrangement!)
+     * If so, take the remainder, and return the equivalents.
+     * @param comp the character whose decomposition is being matched
+     * @param segment the text being matched against
+     * @param segmentPos offset in segment at which matching starts
+     * @param buffer working buffer; cleared on entry, then filled with the segment
+     * characters that were skipped over plus whatever follows the match
+     * @return null if the decomposition was not fully matched, or if comp followed by
+     * the remainder does not decompose back to the tail of segment;
+     * SET_WITH_NULL_STRING (a set holding only "") if nothing is left over;
+     * otherwise getEquivalents2 of the remainder
+     */
+    private Set extract(int comp, String segment, int segmentPos, StringBuffer buffer) {
+        if (PROGRESS) System.out.println(" extract: " + NAME.transliterate(UTF16.valueOf(comp))
+            + ", " + NAME.transliterate(segment.substring(segmentPos)));
+        String decomp = Normalizer.normalize(UTF16.valueOf(comp), Normalizer.DECOMP, 0);
+        
+        // See if it matches the start of segment (at segmentPos)
+        boolean ok = false;
+        int cp;
+        // decompCp is the decomposition character we are currently looking for;
+        // decompPos is the offset of the one after it
+        int decompPos = 0;
+        int decompCp = UTF16.charAt(decomp,0);
+        decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char
+        //int decompClass = getClass(decompCp);
+        buffer.setLength(0); // initialize working buffer, shared among callees
+        
+        for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) {
+            cp = UTF16.charAt(segment, i);
+            if (cp == decompCp) { // if equal, eat another cp from decomp
+                if (PROGRESS) System.out.println("  matches: " + NAME.transliterate(UTF16.valueOf(cp)));
+                if (decompPos == decomp.length()) { // done, have all decomp characters!
+                    buffer.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars
+                    ok = true;
+                    break;
+                }
+                decompCp = UTF16.charAt(decomp, decompPos);
+                decompPos += UTF16.getCharCount(decompCp);
+                //decompClass = getClass(decompCp);
+            } else {
+                if (PROGRESS) System.out.println("  buffer: " + NAME.transliterate(UTF16.valueOf(cp)));
+                // brute force approach
+                // not part of the decomposition: keep it for the remainder
+                UTF16.append(buffer, cp);
+                /* TODO: optimize
+                // since we know that the classes are monotonically increasing, after zero
+                // e.g. 0 5 7 9 0 3
+                // we can do an optimization
+                // there are only a few cases that work: zero, less, same, greater
+                // if both classes are the same, we fail
+                // if the decomp class < the segment class, we fail
+        
+                segClass = getClass(cp);
+                if (decompClass <= segClass) return null;
+                */
+            }
+        }
+        if (!ok) return null; // we failed: segment ended before all decomp characters were matched
+        if (PROGRESS) System.out.println("Matches");
+        if (buffer.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder
+        String remainder = buffer.toString();
+        
+        // brute force approach
+        // check to make sure result is canonically equivalent:
+        // decomposing comp + remainder must reproduce the tail of segment
+        String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
+        if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
+        
+        // get the remaining combinations
+        return getEquivalents2(remainder);
+    }
+    
+    // TODO: fix once we have a codepoint interface to get the canonical combining class
+    // TODO: Need public access to canonical combining class in UCharacter!
+    // Returns Normalizer.getClass for cp narrowed to a char. The (char) cast keeps only
+    // the low 16 bits, so a code point above 0xFFFF would be looked up as a different,
+    // 16-bit value.
+    private static int getClass(int cp) {
+        return Normalizer.getClass((char)cp);
+    }
+    
+   // ================= BUILDER =========================
+    // TODO: Flatten this data so it doesn't have to be reconstructed each time!
+    
+    // Shared empty set; getStarts clones it when a character has no entry in AT_START.
+    private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change
+    // Despite the name, this holds the empty string "" (not null); extract returns it
+    // when a decomposition matched and left no remainder.
+    private static final Set SET_WITH_NULL_STRING = new TreeSet(); // constant, don't change
+    static {
+        SET_WITH_NULL_STRING.add("");
+    }
+    
+    // Both of these are filled in by buildData().
+    // SAFE_START: code points at which setSource starts a new segment.
+    private static UnicodeSet SAFE_START = new UnicodeSet();
+    // AT_START: maps a code point to the set of characters whose decomposition begins with it.
+    private static CharMap AT_START = new CharMap();
+    
+        // WARNING, NORMALIZER doesn't have supplementaries yet;
+        // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
+    // Upper bound (inclusive) of the code points scanned by buildData().
+    private static int LAST_UNICODE = 0xFFFF;
+    // Build the tables once, when the class is initialized.
+    static {
+        buildData();
+    }
+    
+    /**
+     * Fills SAFE_START and AT_START by scanning every code point from 0 through
+     * LAST_UNICODE twice. The first pass puts every code point of combining class
+     * zero into SAFE_START. The second pass looks at each code point whose
+     * decomposition differs from itself: the first character of that decomposition
+     * is recorded in AT_START, and any later class-zero character is removed from
+     * SAFE_START again.
+     * TODO: For efficiency, Normalizer needs extra functions plus overloads:
+     * normalizationDiffers(String source,...), normalizationDiffers(int char32,...),
+     * normalize(char32,...).
+     */
+    private static void buildData() {
+        // Pass 1: provisional safe starts (corrected in pass 2)
+        if (PROGRESS) {
+            System.out.println("Getting Safe Start");
+        }
+        for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
+            // one progress dot per 0x800 code points
+            if (PROGRESS && (cp & 0x7FF) == 0) {
+                System.out.print('.');
+            }
+            if (getClass(cp) == 0) {
+                SAFE_START.add(cp);
+            }
+        }
+        if (PROGRESS) {
+            System.out.println();
+        }
+
+        // Pass 2: walk the decomposition of every character that has one
+        if (PROGRESS) {
+            System.out.println("Getting Containment");
+        }
+        for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
+            if (PROGRESS && (cp & 0x7FF) == 0) {
+                System.out.print('.');
+            }
+            String single = UTF16.valueOf(cp);
+            String decomp = Normalizer.normalize(single, Normalizer.DECOMP, 0);
+            if (!decomp.equals(single)) {
+                int pos = 0;
+                while (pos < decomp.length()) {
+                    int component = UTF16.charAt(decomp, pos);
+                    if (pos == 0) {
+                        // cp's decomposition begins with component
+                        AT_START.add(component, cp);
+                    } else if (getClass(component) == 0) {
+                        // class zero but non-initial in a decomposition: not really safe
+                        SAFE_START.remove(component);
+                    }
+                    pos += UTF16.getCharCount(component);
+                }
+            }
+        }
+        if (PROGRESS) {
+            System.out.println();
+        }
+    }
+    
+    /**
+     * A map from a code point to a set of code points, backed by a HashMap keyed
+     * by MutableInt.
+     * Lookups reuse the single probe object to avoid allocating a key per call;
+     * because of that shared probe, an instance must not be used from several
+     * threads at once.
+     */
+    private static class CharMap {
+        Map storage = new HashMap();
+        // scratch key for lookups only; must never be stored in the map
+        MutableInt probe = new MutableInt();
+        
+        /**
+         * Adds whatItIsIn to the set associated with cp, creating the set on first use.
+         */
+        public void add(int cp, int whatItIsIn) {
+            UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp));
+            if (result == null) {
+                result = new UnicodeSet();
+                // Store a private key. Putting the shared probe in the map would give
+                // every entry the same key object, which is then mutated by each later
+                // lookup - that only works by accident of HashMap caching entry hashes.
+                storage.put(new MutableInt().set(cp), result);
+            }
+            result.add(whatItIsIn);
+        }
+        
+        /**
+         * Returns the set associated with cp, or null if nothing was added for it.
+         * The returned set is the stored instance, not a copy.
+         */
+        public UnicodeSet get(int cp) {
+            return (UnicodeSet) storage.get(probe.set(cp));
+        }
+    }
+            
+    /**
+     * A mutable int holder usable as a hash key: two instances are equal when
+     * their contents are equal, and the hash code is the contents.
+     * Changing contents while an instance is stored as a key in a hash-based
+     * collection breaks that collection.
+     */
+    private static class MutableInt {
+        public int contents;
+        public int hashCode() { return contents; }
+        /**
+         * @return true only for another MutableInt with the same contents;
+         * false (rather than an exception) for null or any other type
+         */
+        public boolean equals(Object other) {
+            return other instanceof MutableInt
+                && ((MutableInt) other).contents == contents;
+        }
+        // allows chaining
+        public MutableInt set(int contents) {
+            this.contents = contents;
+            return this;
+        }
+    }
+}
+    
--- a/icu4j/src/com/ibm/text/UnicodeSetIterator.java
+++ b/icu4j/src/com/ibm/text/UnicodeSetIterator.java
@ -0,0 +1,119 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 1996-2000, International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ *
+ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeSetIterator.java,v $ 
+ * $Date: 2002/02/01 02:05:35 $ 
+ * $Revision: 1.1 $
+ *
+ *****************************************************************************************
+ */
+package com.ibm.text;
+
+import com.ibm.util.Utility;
+//import java.text.*;
+import java.util.*;
+import java.io.*;
+
+/**
+ * Class that allows simple iteration over a UnicodeSet.
+ * @author M. Davis
+ * @draft
+ */
+public final class UnicodeSetIterator {
+
+    /**
+     * Creates an iterator over the given set; all set-up is delegated to reset(set).
+     *@param set set to iterate over
+     */
+    public UnicodeSetIterator(UnicodeSet set) {
+        reset(set);
+    }
+        
+    /**
+     * Advances the iterator by one step. While the current element is below the
+     * end of the current range, the element is incremented and returned;
+     * otherwise the iterator moves on to the following range and returns that
+     * range's start.
+     *@return next character in the set. Returns -1 when done!
+     */
+    public int next() {
+        // In abbreviated mode, jump over the middle of a long range, keeping
+        // roughly 50 elements at each end.
+        if (abbreviated
+                && element >= startElement + 50
+                && element <= endElement - 50) {
+            element = endElement - 50;
+        }
+
+        // still inside the current range: step forward
+        if (element < endElement) {
+            element++;
+            return element;
+        }
+
+        // current range exhausted: stop if it was the last one
+        if (range >= endRange) {
+            return -1;
+        }
+
+        // load the following range and return its first element
+        range++;
+        startElement = set.getRangeStart(range);
+        endElement = set.getRangeEnd(range);
+        element = startElement;
+        return element;
+    }
+        
+    /**
+     * Switches the iterator to a new set and rewinds it.
+     *@param set the set to iterate over. This allows reuse of the iterator.
+     */
+    public void reset(UnicodeSet set) {
+        this.set = set;
+        // index of the last range; -1 when the set has no ranges
+        endRange = set.getRangeCount() - 1;
+        resetInternal();
+    }
+        
+    /**
+     * Resets to the start, to allow the iteration to start over again.
+     * The range count is read again from the current set before rewinding.
+     */
+    public void reset() {
+        // index of the last range; -1 when the set has no ranges
+        endRange = set.getRangeCount() - 1;
+        resetInternal();
+    }
+    
+    /**
+     * Tests whether a string and a set have no code points in common.
+     * The string is walked code point by code point.
+     * TODO: Move to UnicodeSet!
+     *@param set the set to test against
+     *@param s the string to test
+     *@return true if and only if no character from s is in the set
+     * (so true for an empty string).
+     */
+    public static boolean containsNone(UnicodeSet set, String s) {
+        int cp;
+        // advance by the width of the code point just read, not of the index value
+        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
+            cp = UTF16.charAt(s, i);
+            if (set.contains(cp)) return false;
+        }
+        return true;
+    }
+        
+    /**
+     * Tests whether every code point of a string is in a set.
+     * The string is walked code point by code point.
+     * TODO: Move to UnicodeSet!
+     *@param set the set to test against
+     *@param s the string to test
+     *@return true if and only if all characters from s are in the set
+     * (so true for an empty string).
+     */
+    public static boolean containsAll(UnicodeSet set, String s) {
+        int cp;
+        // advance by the width of the code point just read, not of the index value
+        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
+            cp = UTF16.charAt(s, i);
+            if (!set.contains(cp)) return false;
+        }
+        return true;
+    }
+    
+    // ======================= PRIVATES ===========================
+    
+    // the set being iterated over
+    private UnicodeSet set;
+    // index of the last range of the set (getRangeCount() - 1)
+    private int endRange = 0;
+    // index of the range currently being walked
+    private int range = 0;
+    // start and end of the current range, as reported by getRangeStart / getRangeEnd
+    private int startElement = 0;
+    private int endElement;
+    // current position within the range
+    private int element;
+    // when true, next() skips the middle of long ranges; never set to true in this class
+    private boolean abbreviated = false;
+        
+    /**
+     * Rewinds the iteration state to "before the first range".
+     * next() pre-increments within a range and only returns a range's start when
+     * it moves into that range, so the state must not be preloaded with range 0:
+     * doing so made next() skip the first element of the first range (and return
+     * nothing at all for a set made of a single one-element range).
+     * With range = -1 and an empty current range, the first next() call either
+     * moves into range 0 and returns its start, or returns -1 when the set has
+     * no ranges (endRange is then -1).
+     */
+    private void resetInternal() {
+        range = -1;
+        startElement = 0;
+        // element (0) is not below endElement (-1), so next() goes straight to the range switch
+        endElement = -1;
+        element = 0;
+    }
+}