ICU-60 implement toPattern in UnicodeSet; update UnicodeFilter.contains to take an int; update UnicodeSet to support code points to U+10FFFF

X-SVN-Rev: 5904
2001-09-24 19:57:18 +00:00 · 2001-09-24 19:57:18 +00:00 · a01b74ee85
commit a01b74ee85
parent 301464ad59
22 changed files with 980 additions and 424 deletions
--- a/icu4j/src/com/ibm/icu/dev/test/translit/CompoundTransliteratorTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/translit/CompoundTransliteratorTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/CompoundTransliteratorTest.java,v $ 
- * $Date: 2001/09/08 01:17:50 $ 
- * $Revision: 1.2 $
+ * $Date: 2001/09/24 19:56:41 $ 
+ * $Revision: 1.3 $
 *
 *****************************************************************************************
 */
@ -111,7 +111,7 @@ public class CompoundTransliteratorTest extends TestFmwk {
 
    public void TestGetTransliterator(){
        logln("Testing the getTransliterator() API of CompoundTransliterator");
-        String ID="Latin-Greek;Greek-Latin;Latin-Devanagari;Devanagari-Latin;Latin-Cyrillic;Cyrillic-Latin;Unicode-Hex;Hex-Unicode";
+        String ID="Latin-Greek;Greek-Latin;Latin-Devanagari;Devanagari-Latin;Latin-Cyrillic;Cyrillic-Latin;Any-Hex;Hex-Any";
        CompoundTransliterator ct1=null;
        try{
            ct1=new CompoundTransliterator(ID);
@ -141,9 +141,9 @@ public class CompoundTransliteratorTest extends TestFmwk {
        logln("Testing the handleTransliterate() API of CompoundTransliterator");
        CompoundTransliterator ct1=null;
        try{
-            ct1=new CompoundTransliterator("Unicode-Hex;Hex-Unicode");
+            ct1=new CompoundTransliterator("Any-Hex;Hex-Any");
        }catch(IllegalArgumentException iae){
-            errln("FAIL: construction using CompoundTransliterator(String ID) failed for " + "Unicode-Hex;Hex-Unicode");
+            errln("FAIL: construction using CompoundTransliterator(String ID) failed for " + "Any-Hex;Hex-Any");
            throw iae;
        }
    
@ -167,8 +167,8 @@ public class CompoundTransliteratorTest extends TestFmwk {
       
        String Data[]={
             //ID, input string, transliterated string
-             "Unicode-Hex;Hex-Unicode;Unicode-Hex",     "hello",  "\\u0068\\u0065\\u006C\\u006C\\u006F", 
-             "Unicode-Hex;Hex-Unicode",                 "hello! How are you?",  "hello! How are you?",
+             "Any-Hex;Hex-Any;Any-Hex",     "hello",  "\\u0068\\u0065\\u006C\\u006C\\u006F", 
+             "Any-Hex;Hex-Any",                 "hello! How are you?",  "hello! How are you?",
             "Devanagari-Latin;Latin-Devanagari",       "\u092D\u0948'\u0930'\u0935",  "\u092D\u0948\u0930\u0935", // quotes lost
             "Latin-Cyrillic;Cyrillic-Latin",           "a'b'k'd'e'f'g'h'i'j'Shch'shch'zh'h", "abkdefghijShchshchzhh",
             "Latin-Greek;Greek-Latin",                 "ABGabgAKLMN", "ABGabgAKLMN",
--- a/icu4j/src/com/ibm/icu/dev/test/translit/HexToUnicodeTransliteratorTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/translit/HexToUnicodeTransliteratorTest.java
@ -6,8 +6,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/Attic/HexToUnicodeTransliteratorTest.java,v $ 
- * $Date: 2000/10/09 16:32:07 $ 
- * $Revision: 1.2 $
+ * $Date: 2001/09/24 19:56:41 $ 
+ * $Revision: 1.3 $
 *
 *****************************************************************************************
 */
@ -33,7 +33,7 @@ public class HexToUnicodeTransliteratorTest extends TestFmwk {
    * Used by TestConstruction() and TestTransliterate.
    */
    UnicodeFilter HexFilter=new UnicodeFilter() {
-        public boolean contains(char c) {
+        public boolean contains(int c) {
            if(c == 0x0061 || c == 0x0063 )
                return false;
            else
--- a/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
@ -4,9 +4,9 @@
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 *
- * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $ 
- * $Date: 2001/09/21 21:23:34 $ 
- * $Revision: 1.45 $
+ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
+ * $Date: 2001/09/24 19:56:41 $
+ * $Revision: 1.46 $
 *
 *****************************************************************************************
 */
@ -48,14 +48,12 @@ public class TransliteratorTest extends TestFmwk {
                throw ex;
            }

-            // TODO remove check for class when we implement full
-            // toRules().
-            if (t != null && t instanceof RuleBasedTransliterator) {
+            if (t != null) {
                // Now test toRules
                String rules = null;
                try {
-                    rules = ((RuleBasedTransliterator)t).toRules(true);
-                
+                    rules = t.toRules(true);
+
                    Transliterator u = Transliterator.createFromRules("x",
                                           rules, Transliterator.FORWARD);
                } catch (IllegalArgumentException ex2) {
@ -74,7 +72,7 @@ public class TransliteratorTest extends TestFmwk {
        } catch (IllegalArgumentException ex) {
            logln("OK: Bogus ID handled properly");
        }
-        
+
        ms = System.currentTimeMillis() - ms;
        logln("Elapsed time: " + ms + " ms");
    }
@ -223,7 +221,7 @@ public class TransliteratorTest extends TestFmwk {
     * Basic test of keyboard.
     */
    public void TestKeyboard() {
-        Transliterator t = new RuleBasedTransliterator("<ID>", 
+        Transliterator t = new RuleBasedTransliterator("<ID>",
                                                       "psch>Y;"
                                                       +"ps>y;"
                                                       +"ch>x;"
@ -246,7 +244,7 @@ public class TransliteratorTest extends TestFmwk {
     * Basic test of keyboard with cursor.
     */
    public void TestKeyboard2() {
-        Transliterator t = new RuleBasedTransliterator("<ID>", 
+        Transliterator t = new RuleBasedTransliterator("<ID>",
                                                       "ych>Y;"
                                                       +"ps>|y;"
                                                       +"ch>x;"
@ -381,7 +379,7 @@ public class TransliteratorTest extends TestFmwk {
    public void TestFiltering() {
        Transliterator hex = Transliterator.getInstance("Any-Hex");
        hex.setFilter(new UnicodeFilter() {
-            public boolean contains(char c) {
+            public boolean contains(int c) {
                return c != 'c';
            }
        });
@ -398,7 +396,7 @@ public class TransliteratorTest extends TestFmwk {
    /**
     * Test anchors
     */
-    public void TestAnchors() { 
+    public void TestAnchors() {
        expect("^ab  > 01 ;" +
               " ab  > |8 ;" +
               "  b  > k ;" +
@ -406,7 +404,7 @@ public class TransliteratorTest extends TestFmwk {
               " 8x  > 77 ;",

               "ababbabxabx",
-               "018k7745");  
+               "018k7745");
        expect("$s = [z$] ;" +
               "$s{ab    > 01 ;" +
               "   ab    > |8 ;" +
@ -440,7 +438,7 @@ public class TransliteratorTest extends TestFmwk {
     */
    public void TestJ277() {
        Transliterator gl = Transliterator.getInstance("Greek-Latin");
-        
+
        char sigma = (char)0x3C3;
        char upsilon = (char)0x3C5;
        char nu = (char)0x3BD;
@ -517,7 +515,7 @@ public class TransliteratorTest extends TestFmwk {

        // Try a custom Hex-Any
        // \\uXXXX and &#xXXXX;
-        HexToUnicodeTransliterator hex2 = new HexToUnicodeTransliterator("\\\\u###0;&\\#x###0\\;"); 
+        HexToUnicodeTransliterator hex2 = new HexToUnicodeTransliterator("\\\\u###0;&\\#x###0\\;");
        expect(hex2, "\\u61\\u062\\u0063\\u00645\\u66x&#x30;&#x031;&#x0032;&#x00033;",
               "abcd5fx012&#x00033;");

@ -740,7 +738,7 @@ public class TransliteratorTest extends TestFmwk {
        if (!got.equals(exp)) {
            errln("FAIL: Inverse of " + ID + " is " + got +
                  ", expected " + exp);
-        }        
+        }
    }

    /**
@ -752,18 +750,18 @@ public class TransliteratorTest extends TestFmwk {
            "Hex[aeiou]-Any",
            "quizzical",
            "q\\u0075\\u0069zz\\u0069c\\u0061l",
-            
+
            "Any[aeiou]-Hex;Hex[^5]-Any",
            "Any[^5]-Hex;Hex[aeiou]-Any",
            "quizzical",
            "q\\u0075izzical",
-            
+
            "Null[abc]",
            "Null[abc]",
            "xyz",
            "xyz",
        };
-        
+
        for (int i=0; i<DATA.length; i+=4) {
            String ID = DATA[i];
            Transliterator t = Transliterator.getInstance(ID);
@ -796,6 +794,109 @@ public class TransliteratorTest extends TestFmwk {
               "Th qck brwn fx.");
    }

+    public void TestToRules() {
+        String RBT = "rbt";
+        String SET = "set";
+        String[] DATA = {
+            RBT,
+            "$a=\\u4E61; [$a] > A;",
+            "[\\u4E61] > A;",
+
+            RBT,
+            "$white=[[:Zs:][:Zl:]]; $white{a} > A;",
+            "[[:Zs:][:Zl:]]{a} > A;",
+
+            SET,
+            "[[:Zs:][:Zl:]]",
+            "[[:Zs:][:Zl:]]",
+
+            SET,
+            "[:Ps:]",
+            "[:Ps:]",
+
+            SET,
+            "[:L:]",
+            "[:L:]",
+
+            SET,
+            "[[:L:]-[A]]",
+            "[[:L:]-[A]]",
+
+            SET,
+            "[~[:Lu:][:Ll:]]",
+            "[~[:Lu:][:Ll:]]",
+
+            SET,
+            "[~[a-z]]",
+            "[~[a-z]]",
+
+            RBT,
+            "$white=[:Zs:]; $black=[^$white]; $black{a} > A;",
+            "[^[:Zs:]]{a} > A;",
+
+            RBT,
+            "$a=[:Zs:]; $b=[[a-z]-$a]; $b{a} > A;",
+            "[[a-z]-[:Zs:]]{a} > A;",
+
+            RBT,
+            "$a=[:Zs:]; $b=[$a&[a-z]]; $b{a} > A;",
+            "[[:Zs:]&[a-z]]{a} > A;",
+
+            RBT,
+            "$a=[:Zs:]; $b=[x$a]; $b{a} > A;",
+            "[x[:Zs:]]{a} > A;",
+        };
+
+        for (int d=0; d < DATA.length; d+=3) {
+            if (DATA[d] == RBT) {
+                // Transliterator test
+                Transliterator t = Transliterator.createFromRules("ID",
+                                       DATA[d+1], Transliterator.FORWARD);
+                if (t == null) {
+                    errln("FAIL: createFromRules failed");
+                    return;
+                }
+                String rules, escapedRules;
+                rules = t.toRules(false);
+                escapedRules = t.toRules(true);
+                String expRules = Utility.unescape(DATA[d+2]);
+                String expEscapedRules = DATA[d+2];
+                if (rules.equals(expRules)) {
+                    logln("Ok: " + DATA[d+1] +
+                          " => " + Utility.escape(rules));
+                } else {
+                    errln("FAIL: " + DATA[d+1] +
+                          " => " + Utility.escape(rules + ", exp " + expRules));
+                }
+                if (escapedRules.equals(expEscapedRules)) {
+                    logln("Ok: " + DATA[d+1] +
+                          " => " + escapedRules);
+                } else {
+                    errln("FAIL: " + DATA[d+1] +
+                          " => " + escapedRules + ", exp " + expEscapedRules);
+                }
+
+            } else {
+                // UnicodeSet test
+                String pat = DATA[d+1];
+                String expToPat = DATA[d+2];
+                UnicodeSet set = new UnicodeSet(pat);
+
+                // Adjust spacing etc. as necessary.
+                String toPat;
+                toPat = set.toPattern(true);
+                if (expToPat.equals(toPat)) {
+                    logln("Ok: " + pat +
+                          " => " + toPat);
+                } else {
+                    errln("FAIL: " + pat +
+                          " => " + Utility.escape(toPat) +
+                          ", exp " + Utility.escape(pat));
+                }
+            }
+        }
+    }
+
    /**
     * Test the case mapping transliterators.
     */
@ -806,7 +907,7 @@ public class TransliteratorTest extends TestFmwk {
            Transliterator.getInstance("Any-Lower[^xyzXYZ]");
        Transliterator toTitle =
            Transliterator.getInstance("Any-Title[^xyzXYZ]");
-        
+
        expect(toUpper, "The quick brown fox jumped over the lazy dogs.",
               "THE QUICK BROWN FOx JUMPED OVER THE LAzy DOGS.");
        expect(toLower, "The quIck brown fOX jUMPED OVER THE LAzY dogs.",
@ -857,7 +958,7 @@ public class TransliteratorTest extends TestFmwk {
                    errln("FAIL: " + DATA[i+2] +
                          " create ID \"" + DATA[i] + "\" => \"" +
                          t.getID() + "\", exp \"" + DATA[i+1] + "\"");
-                }   
+                }
            } catch (IllegalArgumentException e) {
                errln("FAIL: " + DATA[i+2] +
                      " create ID \"" + DATA[i] + "\"");
@ -875,52 +976,52 @@ public class TransliteratorTest extends TestFmwk {
            // Input               Decomposed            Composed
            {"cat",                "cat",                "cat"               },
            {"\u00e0ardvark",      "a\u0300ardvark",     "\u00e0ardvark"     },
-                                                         
+
            {"\u1e0a",             "D\u0307",            "\u1e0a"            }, // D-dot_above
            {"D\u0307",            "D\u0307",            "\u1e0a"            }, // D dot_above
-                                                         
+
            {"\u1e0c\u0307",       "D\u0323\u0307",      "\u1e0c\u0307"      }, // D-dot_below dot_above
            {"\u1e0a\u0323",       "D\u0323\u0307",      "\u1e0c\u0307"      }, // D-dot_above dot_below
            {"D\u0307\u0323",      "D\u0323\u0307",      "\u1e0c\u0307"      }, // D dot_below dot_above
-                                                         
+
            {"\u1e10\u0307\u0323", "D\u0327\u0323\u0307","\u1e10\u0323\u0307"}, // D dot_below cedilla dot_above
            {"D\u0307\u0328\u0323","D\u0328\u0323\u0307","\u1e0c\u0328\u0307"}, // D dot_above ogonek dot_below
-                                                         
+
            {"\u1E14",             "E\u0304\u0300",      "\u1E14"            }, // E-macron-grave
            {"\u0112\u0300",       "E\u0304\u0300",      "\u1E14"            }, // E-macron + grave
            {"\u00c8\u0304",       "E\u0300\u0304",      "\u00c8\u0304"      }, // E-grave + macron
-                                                         
+
            {"\u212b",             "A\u030a",            "\u00c5"            }, // angstrom_sign
            {"\u00c5",             "A\u030a",            "\u00c5"            }, // A-ring
-                                                         
+
            {"\u00fdffin",         "y\u0301ffin",        "\u00fdffin"        },	//updated with 3.0
            {"\u00fd\uFB03n",      "y\u0301\uFB03n",     "\u00fd\uFB03n"     },	//updated with 3.0
-                                                         
+
            {"Henry IV",           "Henry IV",           "Henry IV"          },
            {"Henry \u2163",       "Henry \u2163",       "Henry \u2163"      },
-                                                         
+
            {"\u30AC",             "\u30AB\u3099",       "\u30AC"            }, // ga (Katakana)
            {"\u30AB\u3099",       "\u30AB\u3099",       "\u30AC"            }, // ka + ten
            {"\uFF76\uFF9E",       "\uFF76\uFF9E",       "\uFF76\uFF9E"      }, // hw_ka + hw_ten
            {"\u30AB\uFF9E",       "\u30AB\uFF9E",       "\u30AB\uFF9E"      }, // ka + hw_ten
            {"\uFF76\u3099",       "\uFF76\u3099",       "\uFF76\u3099"      }, // hw_ka + ten
-                                                         
+
            {"A\u0300\u0316",      "A\u0316\u0300",      "\u00C0\u0316"      },
-        };                                                
-                                                          
-        String[][] COMPAT = {                        
+        };
+
+        String[][] COMPAT = {
            // Input               Decomposed            Composed
            {"\uFB4f",             "\u05D0\u05DC",       "\u05D0\u05DC"      }, // Alef-Lamed vs. Alef, Lamed
-                                                         
+
            {"\u00fdffin",         "y\u0301ffin",        "\u00fdffin"        },	//updated for 3.0
            {"\u00fd\uFB03n",      "y\u0301ffin",        "\u00fdffin"        }, // ffi ligature -> f + f + i
-                                                         
+
            {"Henry IV",           "Henry IV",           "Henry IV"          },
            {"Henry \u2163",       "Henry IV",           "Henry IV"          },
-                                                         
+
            {"\u30AC",             "\u30AB\u3099",       "\u30AC"            }, // ga (Katakana)
            {"\u30AB\u3099",       "\u30AB\u3099",       "\u30AC"            }, // ka + ten
-                                                         
+
            {"\uFF76\u3099",       "\u30AB\u3099",       "\u30AC"            }, // hw_ka + ten
        };

@ -960,7 +1061,7 @@ public class TransliteratorTest extends TestFmwk {
        expect(t, "\u010dx", "c\u030C");
        }
    }
-    
+
    /**
     * Test compound RBT rules.
     */
@ -1116,7 +1217,7 @@ public class TransliteratorTest extends TestFmwk {
                append('|').
                append(s.substring(index.start));
        }
-        
+
        // As a final step in keyboard transliteration, we must call
        // transliterate to finish off any pending partial matches that
        // were waiting for more input.
@ -1135,7 +1236,7 @@ public class TransliteratorTest extends TestFmwk {
                  result.equals(expectedResult),
                  expectedResult);
    }
-    
+
    void expectAux(String tag, String summary, boolean pass,
                   String expectedResult) {
        if (pass) {
--- a/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeFilterLogicTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeFilterLogicTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/Attic/UnicodeFilterLogicTest.java,v $ 
- * $Date: 2000/10/04 23:12:33 $ 
- * $Revision: 1.2 $
+ * $Date: 2001/09/24 19:56:41 $ 
+ * $Revision: 1.3 $
 *
 *****************************************************************************************
 */
@ -29,7 +29,7 @@ public class UnicodeFilterLogicTest extends TestFmwk {


   UnicodeFilter Filter1=new UnicodeFilter() {
-        public boolean contains(char c) {
+        public boolean contains(int c) {
            if(c == 0x0061 || c == 0x0041 || c == 0x0063 || c == 0x0043)
                return false;
            else
@ -37,7 +37,7 @@ public class UnicodeFilterLogicTest extends TestFmwk {
       }
   };
   UnicodeFilter Filter2=new UnicodeFilter() {
-        public boolean contains(char c) {
+        public boolean contains(int c) {
            if(c == 0x0079 || c == 0x0059 || c == 0x007a || c == 0x005a  || c == 0x0061 || c == 0x0063)
                return false;
            else
@ -47,7 +47,7 @@ public class UnicodeFilterLogicTest extends TestFmwk {

   public void TestAllFilters() {

-        Transliterator t1 = Transliterator.getInstance("Unicode-Hex");
+        Transliterator t1 = Transliterator.getInstance("Any-Hex");
        String source="abcdABCDyzYZ";

        //sanity testing wihtout any filter
--- a/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java,v $ 
- * $Date: 2001/09/08 01:17:50 $ 
- * $Revision: 1.11 $
+ * $Date: 2001/09/24 19:56:41 $ 
+ * $Revision: 1.12 $
 *
 *****************************************************************************************
 */
@ -181,8 +181,8 @@ public class UnicodeSetTest extends TestFmwk {
        set.clear();
        set.applyPattern("[A-Y 1-8 b-d l-y]");
        for (int i = 0; i<set.getRangeCount(); ++i) {
-            char a = set.getRangeStart(i);
-            char b = set.getRangeEnd(i);
+            int a = set.getRangeStart(i);
+            int b = set.getRangeEnd(i);
            if (!set.contains(a, b)) {
                errln("FAIL, should contain " + (char)a + '-' + (char)b +
                      " but doesn't: " + set);
@ -219,7 +219,7 @@ public class UnicodeSetTest extends TestFmwk {
        if (c.equals(exp)) {
            logln("c.complement(): " + c);
        } else {
-            errln("FAIL: c.complement() = " + c + ", expect " + exp);
+            errln(Utility.escape("FAIL: c.complement() = " + c + ", expect " + exp));
        }
        c.complement();
        exp.set((char)3, (char)15);
@ -252,13 +252,13 @@ public class UnicodeSetTest extends TestFmwk {
    public void TestIndexOf() {
        UnicodeSet set = new UnicodeSet("[a-cx-y3578]");
        for (int i=0; i<set.size(); ++i) {
-            char c = set.charAt(i);
+            int c = set.charAt(i);
            if (set.indexOf(c) != i) {
                errln("FAIL: charAt(" + i + ") = " + c +
                      " => indexOf() => " + set.indexOf(c));
            }
        }
-        char c = set.charAt(set.size());
+        int c = set.charAt(set.size());
        if (c != '\uFFFE') {
            errln("FAIL: charAt(<out of range>) = " +
                  Utility.escape(String.valueOf(c)));
--- a/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeToHexTransliteratorTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeToHexTransliteratorTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/Attic/UnicodeToHexTransliteratorTest.java,v $ 
- * $Date: 2000/10/16 16:58:29 $ 
- * $Revision: 1.3 $
+ * $Date: 2001/09/24 19:56:41 $ 
+ * $Revision: 1.4 $
 *
 *****************************************************************************************
 */
@ -32,7 +32,7 @@ public class UnicodeToHexTransliteratorTest extends TestFmwk {
    * Used by TestConstruction() and TestTransliterate.
    */
    UnicodeFilter UniFilter=new UnicodeFilter() {
-        public boolean contains(char c) {
+        public boolean contains(int c) {
            if(c==0x0063 || c==0x0061 || c==0x0043 || c==0x0041)
                return false;
            else
--- a/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java
+++ b/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java,v $ 
- * $Date: 2001/09/21 21:24:04 $ 
- * $Revision: 1.44 $
+ * $Date: 2001/09/24 19:57:17 $ 
+ * $Revision: 1.45 $
 *
 *****************************************************************************************
 */
@ -279,7 +279,7 @@ import com.ibm.text.resources.ResourceReader;
 * <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
 * 
 * @author Alan Liu
- * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.44 $ $Date: 2001/09/21 21:24:04 $
+ * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.45 $ $Date: 2001/09/24 19:57:17 $
 */
 public class RuleBasedTransliterator extends Transliterator {

@ -542,7 +542,7 @@ public class RuleBasedTransliterator extends Transliterator {
            /**
             * Implement SymbolTable API.
             */
-            public UnicodeSet lookupSet(char ch) {
+            public UnicodeSet lookupSet(int ch) {
                // Note that we cannot use data.lookupSet() because the
                // set array has not been constructed yet.
                int i = ch - data.setVariablesBase;
@ -1579,6 +1579,9 @@ public class RuleBasedTransliterator extends Transliterator {

 /**
 * $Log: RuleBasedTransliterator.java,v $
+ * Revision 1.45  2001/09/24 19:57:17  alan
+ * jitterbug 60: implement toPattern in UnicodeSet; update UnicodeFilter.contains to take an int; update UnicodeSet to support code points to U+10FFFF
+ *
 * Revision 1.44  2001/09/21 21:24:04  alan
 * jitterbug 64: allow ::ID blocks in rules
 *
--- a/icu4j/src/com/ibm/icu/text/SymbolTable.java
+++ b/icu4j/src/com/ibm/icu/text/SymbolTable.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/SymbolTable.java,v $ 
- * $Date: 2000/08/30 20:40:30 $ 
- * $Revision: 1.6 $
+ * $Date: 2001/09/24 19:57:18 $ 
+ * $Revision: 1.7 $
 *
 *****************************************************************************************
 */
@ -41,8 +41,9 @@ public interface SymbolTable {
    /**
     * Lookup the UnicodeSet associated with the given character, and
     * return it.  Return <tt>null</tt> if not found.
+     * @param ch a 32-bit code point from 0 to 0x10FFFF.
     */
-    UnicodeSet lookupSet(char ch);
+    UnicodeSet lookupSet(int ch);

    /**
     * Parse a symbol reference name from the given string, starting
--- a/icu4j/src/com/ibm/icu/text/UnicodeFilter.java
+++ b/icu4j/src/com/ibm/icu/text/UnicodeFilter.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeFilter.java,v $ 
- * $Date: 2000/03/10 04:07:25 $ 
- * $Revision: 1.4 $
+ * $Date: 2001/09/24 19:57:18 $ 
+ * $Revision: 1.5 $
 *
 *****************************************************************************************
 */
@ -30,5 +30,5 @@ public interface UnicodeFilter {
     * filtered</b>, then <tt>contains()</tt> returns
     * <b><tt>false</tt></b>.
     */
-    boolean contains(char c);
+    boolean contains(int c);
 }
--- a/icu4j/src/com/ibm/icu/text/UnicodeFilterLogic.java
+++ b/icu4j/src/com/ibm/icu/text/UnicodeFilterLogic.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Attic/UnicodeFilterLogic.java,v $ 
- * $Date: 2000/03/10 04:07:25 $ 
- * $Revision: 1.3 $
+ * $Date: 2001/09/24 19:57:18 $ 
+ * $Revision: 1.4 $
 *
 *****************************************************************************************
 */
@ -28,7 +28,7 @@ public final class UnicodeFilterLogic {
     */
    public static UnicodeFilter not(final UnicodeFilter f) {
        return new UnicodeFilter() {
-            public boolean contains(char c) {
+            public boolean contains(int c) {
                return !f.contains(c);
            }
        };
@ -51,7 +51,7 @@ public final class UnicodeFilterLogic {
            return f;
        }
        return new UnicodeFilter() {
-            public boolean contains(char c) {
+            public boolean contains(int c) {
                return f.contains(c) && g.contains(c);
            }
        };
@ -66,7 +66,7 @@ public final class UnicodeFilterLogic {
     */
    public static UnicodeFilter and(final UnicodeFilter[] f) {
        return new UnicodeFilter() {
-            public boolean contains(char c) {
+            public boolean contains(int c) {
                for (int i=0; i<f.length; ++i) {
                    if (!f[i].contains(c)) {
                        return false;
@ -94,7 +94,7 @@ public final class UnicodeFilterLogic {
            return f;
        }
        return new UnicodeFilter() {
-            public boolean contains(char c) {
+            public boolean contains(int c) {
                return f.contains(c) || g.contains(c);
            }
        };
@ -109,7 +109,7 @@ public final class UnicodeFilterLogic {
     */
    public static UnicodeFilter or(final UnicodeFilter[] f) {
        return new UnicodeFilter() {
-            public boolean contains(char c) {
+            public boolean contains(int c) {
                for (int i=0; i<f.length; ++i) {
                    if (f[i].contains(c)) {
                        return true;
--- a/icu4j/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/src/com/ibm/icu/text/UnicodeSet.java
@ -5,21 +5,22 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeSet.java,v $
- * $Date: 2001/09/20 21:20:00 $
- * $Revision: 1.33 $
+ * $Date: 2001/09/24 19:57:18 $
+ * $Revision: 1.34 $
 *
 *****************************************************************************************
 */
 package com.ibm.text;

 import java.text.*;
+import com.ibm.util.Utility;

 /**
 * A mutable set of Unicode characters.  Objects of this class
 * represent <em>character classes</em> used in regular expressions.
 * Such classes specify a subset of the set of all Unicode characters,
 * which in this implementation is the characters from U+0000 to
- * U+FFFF, ignoring surrogates.
+ * U+10FFFF.
 *
 * <p><code>UnicodeSet</code> supports two APIs. The first is the
 * <em>operand</em> API that allows the caller to modify the value of
@ -184,7 +185,7 @@ import java.text.*;
 * through 'z' and all letters in between, in Unicode order
 * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
 * all characters but 'a' through 'z',
- * that is, U+0000 through 'a'-1 and 'z'+1 through U+FFFF
+ * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
 * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
 * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
@ -254,10 +255,11 @@ import java.text.*;
 * *Unsupported by Java (and hence unsupported by UnicodeSet).
 *
 * @author Alan Liu
- * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.33 $ $Date: 2001/09/20 21:20:00 $ */
+ * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.34 $ $Date: 2001/09/24 19:57:18 $ */
 public class UnicodeSet implements UnicodeFilter {

    /* Implementation Notes.
+     * NOTE: This conversion has been completed as of 2.0.
     *
     * UnicodeSet currently represents only the characters U+0000 to
     * U+FFFF.  This allows the API to be written in terms of the Java
@ -285,24 +287,35 @@ public class UnicodeSet implements UnicodeFilter {
     */

    private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
-    private static final int HIGH = 0x10000; // HIGH > all valid values. 10000 for code units.
+    private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
                                             // 110000 for codepoints

    /**
     * Minimum value that can be stored in a UnicodeSet.
     */
-    public static final char MIN_VALUE = (char) LOW;
+    public static final int MIN_VALUE = LOW;

    /**
     * Maximum value that can be stored in a UnicodeSet.
     */
-    public static final char MAX_VALUE = (char) (HIGH - 1);
+    public static final int MAX_VALUE = HIGH - 1;

    private int len;      // length used; list may be longer to minimize reallocs
    private int[] list;   // MUST be terminated with HIGH
    private int[] rangeList; // internal buffer
    private int[] buffer; // internal buffer

+    /**
+     * The pattern representation of this set.  This may not be the
+     * most economical pattern.  It is the pattern supplied to
+     * applyPattern(), with variables substituted and whitespace
+     * removed.  For sets constructed without applyPattern(), or
+     * modified using the non-pattern API, this string will be null,
+     * indicating that toPattern() must generate a pattern
+     * representation from the inversion list.
+     */ 
+    private String pat = null;
+
    private static final int START_EXTRA = 16;         // initial storage. Must be >= 0
    private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0

@ -349,7 +362,7 @@ public class UnicodeSet implements UnicodeFilter {
     * @param start first character, inclusive, of range
     * @param end last character, inclusive, of range
     */
-    public UnicodeSet(char start, char end) {
+    public UnicodeSet(int start, int end) {
        this();
        complement(start, end);
    }
@ -418,7 +431,7 @@ public class UnicodeSet implements UnicodeFilter {
     * @param start first character in the set, inclusive
     * @rparam end last character in the set, inclusive
     */
-    public void set(char start, char end) {
+    public void set(int start, int end) {
        clear();
        complement(start, end);
    }
@ -431,6 +444,7 @@ public class UnicodeSet implements UnicodeFilter {
    public void set(UnicodeSet other) {
        list = (int[]) other.list.clone();
        len = other.len;
+        pat = other.pat;
    }

    /**
@ -475,24 +489,6 @@ public class UnicodeSet implements UnicodeFilter {
        }
    }

-    /**
-     * Append the <code>toPattern()</code> representation of a
-     * character to the given <code>StringBuffer</code>.
-     */
-    private static final void _toPat(StringBuffer buf, char c) {
-        // Okay to let ':' pass through
-        switch (c) {
-        case '[':
-        case ']':
-        case '-':
-        case '^':
-        case '&':
-        case '\\':
-            buf.append('\\');
-        }
-        buf.append(c);
-    }
-
    /**
     * Append the <code>toPattern()</code> representation of a
     * character to the given <code>StringBuffer</code>.
@ -575,6 +571,53 @@ public class UnicodeSet implements UnicodeFilter {
     */
    public String toPattern(boolean escapeUnprintable) {
        StringBuffer result = new StringBuffer();
+        return _toPattern(result, escapeUnprintable).toString();
+    }
+
+    /**
+     * Append a string representation of this set to result.  This will be
+     * a cleaned version of the string passed to applyPattern(), if there
+     * is one.  Otherwise it will be generated.
+     */
+    private StringBuffer _toPattern(StringBuffer result,
+                                    boolean escapeUnprintable) {
+        if (pat != null) {
+            int i;
+            int backslashCount = 0;
+            for (i=0; i<pat.length(); ++i) {
+                char c = pat.charAt(i);
+                if (escapeUnprintable && _isUnprintable(c)) {
+                    // If the unprintable character is preceded by an odd
+                    // number of backslashes, then it has been escaped.
+                    // Before unescaping it, we delete the final
+                    // backslash.
+                    if ((backslashCount % 2) == 1) {
+                        result.setLength(result.length() - 1);
+                    }
+                    _escapeUnprintable(result, c);
+                    backslashCount = 0;
+                } else {
+                    result.append(c);
+                    if (c == '\\') {
+                        ++backslashCount;
+                    } else {
+                        backslashCount = 0;
+                    }
+                }
+            }
+            return result;
+        }
+        
+        return _generatePattern(result, escapeUnprintable);
+    }
+
+    /**
+     * Generate and append a string representation of this set to result.
+     * This does not use this.pat, the cleaned up copy of the string
+     * passed to applyPattern(). 
+     */
+    public StringBuffer _generatePattern(StringBuffer result,
+                                         boolean escapeUnprintable) {
        result.append('[');

        // Check against the predefined categories.  We implicitly build
@ -583,7 +626,7 @@ public class UnicodeSet implements UnicodeFilter {
            if (this.equals(getCategorySet(cat))) {
                result.append(':');
                result.append(CATEGORY_NAMES.substring(cat*2, cat*2+2));
-                return result.append(":]").toString();
+                return result.append(":]");
            }
        }

@ -623,7 +666,7 @@ public class UnicodeSet implements UnicodeFilter {
            }
        }

-        return result.append(']').toString();
+        return result.append(']');
    }

    /**
@ -659,7 +702,13 @@ public class UnicodeSet implements UnicodeFilter {
     * @return <tt>true</tt> if this set contains the specified range
     * of chars.
     */
-    public boolean contains(char start, char end) {
+    public boolean contains(int start, int end) {
+        if (start < MIN_VALUE || start > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
+        }
+        if (end < MIN_VALUE || end > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
+        }
        int i = -1;
        while (true) {
            if (start < list[++i]) break;
@ -674,7 +723,10 @@ public class UnicodeSet implements UnicodeFilter {
     * <code>charAt()</code>.
     * @return an index from 0..size()-1, or -1
     */
-    public int indexOf(char c) {
+    public int indexOf(int c) {
+        if (c < MIN_VALUE || c > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
+        }
        int i = 0;
        int n = 0;
        for (;;) {
@ -697,7 +749,7 @@ public class UnicodeSet implements UnicodeFilter {
     * <code>indexOf()</code>.
     * @param index an index from 0..size()-1
     */
-    public char charAt(int index) {
+    public int charAt(int index) {
        if (index >= 0) {
            for (int i=0; i < len;) {
                int start = list[i++];
@ -716,12 +768,14 @@ public class UnicodeSet implements UnicodeFilter {
     *
     * @return <tt>true</tt> if this set contains the specified char.
     */
-    public boolean contains(char c) {
-//| Not needed unless HIGH > 0x10000
-//|     // catch degenerate cases
-//|     if (c == HIGH) {   // catch final, so we don't do it in loop!
-//|         return (len & 1) == 0;  // even length includes everything
-//|     }
+    public boolean contains(int c) {
+        if (c < MIN_VALUE || c > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
+        }
+        // catch degenerate cases (not needed unless HIGH > 0x10000
+        if (c == HIGH) {   // catch final, so we don't do it in loop!
+            return (len & 1) == 0;  // even length includes everything
+        }
        // Set i to the index of the start item greater than ch
        // We know we will terminate without length test!
        // LATER: for large sets, add binary search
@ -771,7 +825,13 @@ public class UnicodeSet implements UnicodeFilter {
     * @param end last character, inclusive, of range to be added
     * to this set.
     */
-    public void add(char start, char end) {
+    public void add(int start, int end) {
+        if (start < MIN_VALUE || start > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
+        }
+        if (end < MIN_VALUE || end > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
+        }
        if (start <= end) {
            add(range(start, end), 2, 0);
        }
@ -782,7 +842,7 @@ public class UnicodeSet implements UnicodeFilter {
     * present.  If this set already contains the specified character,
     * the call leaves this set unchanged.
     */
-    public final void add(char c) {
+    public final void add(int c) {
        add(c, c);
    }

@ -796,7 +856,13 @@ public class UnicodeSet implements UnicodeFilter {
     * @param end last character, inclusive, of range to be retained
     * to this set.
     */
-    public void retain(char start, char end) {
+    public void retain(int start, int end) {
+        if (start < MIN_VALUE || start > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
+        }
+        if (end < MIN_VALUE || end > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
+        }
        if (start <= end) {
            retain(range(start, end), 2, 0);
        } else {
@ -807,7 +873,7 @@ public class UnicodeSet implements UnicodeFilter {
    /**
     * Retain the specified character from this set if it is present.
     */
-    public final void retain(char c) {
+    public final void retain(int c) {
        retain(c, c);
    }

@ -822,7 +888,13 @@ public class UnicodeSet implements UnicodeFilter {
     * @param end last character, inclusive, of range to be removed
     * from this set.
     */
-    public void remove(char start, char end) {
+    public void remove(int start, int end) {
+        if (start < MIN_VALUE || start > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
+        }
+        if (end < MIN_VALUE || end > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
+        }
        if (start <= end) {
            retain(range(start, end), 2, 2);
        }
@ -833,7 +905,7 @@ public class UnicodeSet implements UnicodeFilter {
     * The set will not contain the specified character once the call
     * returns.
     */
-    public final void remove(char c) {
+    public final void remove(int c) {
        remove(c, c);
    }

@ -848,7 +920,13 @@ public class UnicodeSet implements UnicodeFilter {
     * @param end last character, inclusive, of range to be removed
     * from this set.
     */
-    public void complement(char start, char end) {
+    public void complement(int start, int end) {
+        if (start < MIN_VALUE || start > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
+        }
+        if (end < MIN_VALUE || end > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
+        }
        if (start <= end) {
            xor(range(start, end), 2, 0);
        }
@ -859,7 +937,7 @@ public class UnicodeSet implements UnicodeFilter {
     * will be removed if it is in this set, or will be added if it is
     * not in this set.
     */
-    public final void complement(char c) {
+    public final void complement(int c) {
        complement(c, c);
    }

@ -878,6 +956,7 @@ public class UnicodeSet implements UnicodeFilter {
            list[0] = LOW;
            ++len;
        }
+        pat = null;
    }

    /**
@ -960,6 +1039,7 @@ public class UnicodeSet implements UnicodeFilter {
    public void clear() {
        list[0] = HIGH;
        len = 1;
+        pat = null;
    }

    /**
@ -980,8 +1060,8 @@ public class UnicodeSet implements UnicodeFilter {
     * @see #getRangeCount
     * @see #getRangeEnd
     */
-    public char getRangeStart(int index) {
-        return (char) list[index*2];
+    public int getRangeStart(int index) {
+        return list[index*2];
    }

    /**
@ -992,8 +1072,8 @@ public class UnicodeSet implements UnicodeFilter {
     * @see #getRangeStart
     * @see #getRangeEnd
     */
-    public char getRangeEnd(int index) {
-        return (char) (list[index*2 + 1] - 1);
+    public int getRangeEnd(int index) {
+        return (list[index*2 + 1] - 1);
    }

    /**
@ -1052,7 +1132,7 @@ public class UnicodeSet implements UnicodeFilter {
     * Return a programmer-readable string representation of this object.
     */
    public String toString() {
-        return getClass().getName() + '{' + toPattern(false) + '}';
+        return getClass().getName() + '(' + toPattern(false) + ')';
    }

    //----------------------------------------------------------------
@ -1081,13 +1161,37 @@ public class UnicodeSet implements UnicodeFilter {
     * of <code>pattern</code>
     * @exception java.lang.IllegalArgumentException if the parse fails.
     */
-    void applyPattern(String pattern, ParsePosition pos,
-                      SymbolTable symbols, boolean ignoreWhitespace) {
+    void applyPattern(String pattern,
+                      ParsePosition pos,
+                      SymbolTable symbols,
+                      boolean ignoreWhitespace) {

+        // Need to build the pattern in a temporary string because
+        // _applyPattern calls add() etc., which set pat to empty.
+        StringBuffer rebuiltPat = new StringBuffer();
+        _applyPattern(pattern, pos, symbols, rebuiltPat, ignoreWhitespace);
+        pat = rebuiltPat.toString();
+    }
+
+    void _applyPattern(String pattern, ParsePosition pos,
+                       SymbolTable symbols, StringBuffer rebuiltPat,
+                       boolean ignoreWhitespace) {
+
+        // If the pattern contains any of the following, we save a
+        // rebuilt (variable-substituted) copy of the source pattern:
+        // - a category
+        // - an intersection or subtraction operator
+        // - an anchor (trailing '$', indicating RBT ether)
+        boolean rebuildPattern = false;
+        StringBuffer newPat = new StringBuffer("[");
+        int nestedPatStart = -1; // see below for usage
+        boolean nestedPatDone = false; // see below for usage
+        
        boolean invert = false;
        clear();

-        int lastChar = -1; // This is either a char (0..FFFF) or -1
+        final int NONE = -1;
+        int lastChar = NONE; // This is either a char (0..10FFFF) or -1
        char lastOp = 0;

        /* This loop iterates over the characters in the pattern.  We start at
@ -1109,7 +1213,7 @@ public class UnicodeSet implements UnicodeFilter {
        // mode 2: '[' '^'? seen; parse pattern and close with ']'
        // mode 3: '[:' seen; parse category and close with ':]'
        int mode = 0;
-        int openPos = 0; // offset to opening '['
+        int colonPos = 0; // Expected pos of ':' in '[:'
        int start = pos.getIndex();
        int i = start;
        int limit = pattern.length();
@ -1120,33 +1224,37 @@ public class UnicodeSet implements UnicodeFilter {
        char[] varValueBuffer = null;
        int ivarValueBuffer = 0;
        int anchor = 0;
-        for (; i<limit; i+=((varValueBuffer==null)?1:0)) {
+        int c;
+        while (i<limit) {
            /* If the next element is a single character, c will be set to it,
             * and nestedSet will be null.  In this case isLiteral indicates
             * whether the character should assume special meaning if it has
             * one.  If the next element is a nested set, either via a variable
             * reference, or via an embedded "[..]"  or "[:..:]" pattern, then
-             * nestedSet will be set to the i-list for the nested set, and
+             * nestedSet will be set to the pairs list for the nested set, and
             * c's value should be ignored.
             */
            UnicodeSet nestedSet = null;
            boolean isLiteral = false;
-            char c;
            if (varValueBuffer != null) {
                if (ivarValueBuffer < varValueBuffer.length) {
-                    c = varValueBuffer[ivarValueBuffer++];
+                    c = UTF16.charAt(varValueBuffer, 0, varValueBuffer.length, ivarValueBuffer);
+                    ivarValueBuffer += UTF16.getCharCount(c);
                    nestedSet = symbols.lookupSet(c); // may be NULL
+                    nestedPatDone = false;
                } else {
                    varValueBuffer = null;
-                    c = pattern.charAt(i);
+                    c = UTF16.charAt(pattern, i);
+                    i += UTF16.getCharCount(c);
                }
            } else {
-                c = pattern.charAt(i);
+                c = UTF16.charAt(pattern, i);
+                i += UTF16.getCharCount(c);
            }

            // Ignore whitespace.  This is not Unicode whitespace, but Java
            // whitespace, a subset of Unicode whitespace.
-            if (ignoreWhitespace && Character.isWhitespace(c)) {
+            if (ignoreWhitespace && UCharacter.isWhitespace(c)) {
                continue;
            }

@ -1160,7 +1268,7 @@ public class UnicodeSet implements UnicodeFilter {
            case 0:
                if (c == '[') {
                    mode = 1; // Next look for '^'
-                    openPos = i;
+                    colonPos = i; // Expect ':' at next offset
                    continue;
                } else {
                    throw new IllegalArgumentException("Missing opening '['");
@ -1170,14 +1278,17 @@ public class UnicodeSet implements UnicodeFilter {
                switch (c) {
                case '^':
                    invert = true;
+                    newPat.append((char) c);
                    continue; // Back to top to fetch next character
                case ':':
-                    if (i == openPos+1) {
+                    if (i-1 == colonPos) {
                        // '[:' cannot have whitespace in it
-                        --i;
+                        --i; // Backup to the '['
                        c = '[';
                        mode = 3;
-                        // Fall through and parse category normally
+                        // Fall through and parse category using the same
+                        // code used to parse a nested category.  The mode
+                        // will indicate that this is actually top level.
                    }
                    break; // Fall through
                case '-':
@ -1202,27 +1313,19 @@ public class UnicodeSet implements UnicodeFilter {
                 * interpret '\\uxxxx' Unicode escapes here (as literals).
                 */
                if (c == '\\') {
-                    ++i;
-                    if (i < limit) {
-                        c = pattern.charAt(i);
-                        isLiteral = true;
-                        if (c == 'u') {
-                            if ((i+4) >= limit) {
-                                throw new IllegalArgumentException("Invalid \\u escape");
-                            }
-                            c = '\u0000';
-                            for (int j=(++i)+4; i<j; ++i) { // [sic]
-                                int digit = Character.digit(pattern.charAt(i), 16);
-                                if (digit<0) {
-                                    throw new IllegalArgumentException("Invalid \\u escape");
-                                }
-                                c = (char) ((c << 4) | digit);
-                            }
-                            --i; // Move i back to last parsed character
-                        }
-                    } else {
-                        throw new IllegalArgumentException("Trailing '\\'");
+                    int[] offset = new int[] { i };
+                    int escaped = Utility.unescapeAt(pattern, offset);
+                    if (escaped == -1) {
+                        int sta = Math.max(i - 8, 0);
+                        int lim = Math.min(i + 16, pattern.length());
+                        throw new IllegalArgumentException("Invalid escape sequence " +
+                                                           pattern.substring(sta, i-1) +
+                                                           "|" +
+                                                           pattern.substring(i-1, lim));
                    }
+                    i = offset[0];
+                    isLiteral = true;
+                    c = escaped;
                }

                /* Parse variable references.  These are treated as literals.  If a
@ -1232,7 +1335,7 @@ public class UnicodeSet implements UnicodeFilter {
                 * Set variables are only looked up if varCharToSet is not null.
                 */
                else if (symbols != null && !isLiteral && c == SymbolTable.SYMBOL_REF) {
-                    pos.setIndex(++i);
+                    pos.setIndex(i);
                    String name = symbols.parseReference(pattern, pos, limit);
                    if (name != null) {
                        varValueBuffer = symbols.lookup(name);
@ -1246,7 +1349,6 @@ public class UnicodeSet implements UnicodeFilter {
                        // Got a null; this means we have an isolated $.
                        // Tentatively assume this is an anchor.
                        anchor = 1;
-                        --i; // Back up so loop increment works properly
                    }
                    continue; // Back to the top to get varValueBuffer[0]
                }
@ -1256,28 +1358,56 @@ public class UnicodeSet implements UnicodeFilter {
                 * recognize these here and set nestedSet accordingly.
                 */
                else if (!isLiteral && c == '[') {
+                    // Record position before nested pattern
+                    nestedPatStart = newPat.length();
+
                    // Handle "[:...:]", representing a character category
-                    char d = charAfter(pattern, i);
-                    if (d == ':') {
-                        i += 2;
+                    if (i < pattern.length() && pattern.charAt(i) == ':') {
+                        ++i;
                        int j = pattern.indexOf(":]", i);
                        if (j < 0) {
                            throw new IllegalArgumentException("Missing \":]\"");
                        }
+                        String scratch = pattern.substring(i, j);
                        nestedSet = new UnicodeSet();
-                        nestedSet.applyCategory(pattern.substring(i, j));
-                        i = j+1; // Make i point to ']' in ":]"
+                        nestedSet.applyCategory(scratch);
+                        nestedPatDone = true; // We're going to do it just below
+                        i = j+2; // Advance i past ":]"
+
+                        // Use a rebuilt pattern.  If we are top level,
+                        // then there is already a SET_OPEN in newPat, and
+                        // SET_CLOSE will be appended elsewhere.
+                        if (mode != 3) {
+                            newPat.append('[');
+                        }
+                        newPat.append(':').append(scratch).append(':');
+                        if (mode != 3) {
+                            newPat.append(']');
+                        }
+                        rebuildPattern = true;
+
                        if (mode == 3) {
-                            // Entire pattern is a category; leave parse loop
+                            // Entire pattern is a category; leave parse
+                            // loop.  This is one of 2 ways we leave this
+                            // loop if the pattern is well-formed.
                            set(nestedSet);
+                            mode = 4;
                            break;
                        }
                    } else {
-                        // Recurse to get the i-list for this nested set.
-                        pos.setIndex(i); // Add 2 to point AFTER op
+                        // Recurse to get the pairs for this nested set.
+                        // Backup i to '['.
+                        pos.setIndex(--i);
+                        switch (lastOp) {
+                        case '-':
+                        case '&':
+                            newPat.append(lastOp);
+                            break;
+                        }
                        nestedSet = new UnicodeSet();
-                        nestedSet.applyPattern(pattern, pos, symbols, ignoreWhitespace);
-                        i = pos.getIndex() - 1; // - 1 to point at ']'
+                        nestedSet._applyPattern(pattern, pos, symbols, newPat, ignoreWhitespace);
+                        nestedPatDone = true;
+                        i = pos.getIndex();
                    }
                }
            }
@ -1291,12 +1421,23 @@ public class UnicodeSet implements UnicodeFilter {
             * ']' have special meanings.
             */
            if (nestedSet != null) {
-                if (lastChar >= 0) {
+                if (lastChar != NONE) {
                    if (lastOp != 0) {
                        throw new IllegalArgumentException("Illegal rhs for " + lastChar + lastOp);
                    }
                    add((char) lastChar, (char) lastChar);
-                    lastChar = -1;
+                    if (nestedPatDone) {
+                        // If there was a character before the nested set,
+                        // then we need to insert it in newPat before the
+                        // pattern for the nested set.  This position was
+                        // recorded in nestedPatStart.
+                        StringBuffer s = new StringBuffer();
+                        _appendToPat(s, lastChar, false);
+                        newPat.insert(nestedPatStart, s.toString());
+                    } else {
+                        _appendToPat(newPat, lastChar, false);
+                    }
+                    lastChar = NONE;
                }
                switch (lastOp) {
                case '-':
@ -1309,7 +1450,19 @@ public class UnicodeSet implements UnicodeFilter {
                    addAll(nestedSet);
                    break;
                }
+
+                // Get the pattern for the nested set, if we haven't done so
+                // already.
+                if (!nestedPatDone) {
+                    if (lastOp != 0) {
+                        newPat.append(lastOp);
+                    }
+                    nestedSet._toPattern(newPat, false);
+                }
+                rebuildPattern = true;
+
                lastOp = 0;
+
            } else if (!isLiteral && c == ']') {
                // Final closing delimiter.  This is the only way we leave this
                // loop if the pattern is well-formed.
@ -1318,11 +1471,14 @@ public class UnicodeSet implements UnicodeFilter {
                    
                }
                if (anchor == 2) {
+                    rebuildPattern = true;
+                    newPat.append(SymbolTable.SYMBOL_REF);
                    add(TransliterationRule.ETHER);
                }
+                mode = 4;
                break;
            } else if (lastOp == 0 && !isLiteral && (c == '-' || c == '&')) {
-                lastOp = c;
+                lastOp = (char) c;
            } else if (lastOp == '-') {
                if (lastChar >= c) {
                    // Don't allow redundant (a-a) or empty (b-a) ranges;
@ -1330,36 +1486,45 @@ public class UnicodeSet implements UnicodeFilter {
                    throw new IllegalArgumentException("Invalid range " + lastChar +
                                                       '-' + c);
                }
-                add((char) lastChar, c);
+                add(lastChar, c);
+                _appendToPat(newPat, lastChar, false);
+                newPat.append('-');
+                _appendToPat(newPat, c, false);
                lastOp = 0;
-                lastChar = -1;
+                lastChar = NONE;
            } else if (lastOp != 0) {
                // We have <set>&<char> or <char>&<char>
                throw new IllegalArgumentException("Unquoted " + lastOp);
            } else {
-                if (lastChar >= 0) {
+                if (lastChar != NONE) {
                    // We have <char><char>
                    add((char) lastChar, (char) lastChar);
+                    _appendToPat(newPat, lastChar, false);
                }
                lastChar = c;
            }
        }

-        if (mode == 0) {
-            throw new IllegalArgumentException("Missing '[' in \"" +
-                                               pattern.substring(start) + '"');
+        if (lastChar != NONE) {
+            add(lastChar, lastChar);
+            _appendToPat(newPat, lastChar, false);
        }

+//      if (mode == 0) {
+//          throw new IllegalArgumentException("Missing '[' in \"" +
+//                                             pattern.substring(start) + '"');
+//      }
+
        // Handle unprocessed stuff preceding the closing ']'
        if (lastOp == '-') {
            // Trailing '-' is treated as literal
            add(lastOp, lastOp);
+            newPat.append('-');
        } else if (lastOp == '&') {
            throw new IllegalArgumentException("Unquoted trailing " + lastOp);
        }
-        if (lastChar >= 0) {
-            add((char) lastChar, (char) lastChar);
-        }
+
+        newPat.append(']');

        /**
         * If we saw a '^' after the initial '[' of this pattern, then perform
@ -1369,17 +1534,30 @@ public class UnicodeSet implements UnicodeFilter {
            complement();
        }

-        /**
-         * i indexes the last character we parsed or is pattern.length().  In
-         * the latter case, we have run off the end without finding a closing
-         * ']'.  Otherwise, we know i < pattern.length(), and we set the
-         * ParsePosition to the next character to be parsed.
-         */
-        if (i == limit) {
-            throw new IllegalArgumentException("Missing ']' in \"" +
-                                               pattern.substring(start) + '"');
+        if (mode != 4) {
+            throw new IllegalArgumentException("Missing ']'");
+        }
+
+//      /**
+//       * i indexes the last character we parsed or is pattern.length().  In
+//       * the latter case, we have run off the end without finding a closing
+//       * ']'.  Otherwise, we know i < pattern.length(), and we set the
+//       * ParsePosition to the next character to be parsed.
+//       */
+//      if (i == limit) {
+//          throw new IllegalArgumentException("Missing ']' in \"" +
+//                                             pattern.substring(start) + '"');
+//      }
+  
+        pos.setIndex(i);
+
+        // Use the rebuilt pattern (newPat) only if necessary.  Prefer the
+        // generated pattern.
+        if (rebuildPattern) {
+            rebuiltPat.append(newPat.toString());
+        } else {
+            _generatePattern(rebuiltPat, false);
        }
-        pos.setIndex(i+1);

        if (false) {
            // Debug parser
@ -1494,14 +1672,6 @@ public class UnicodeSet implements UnicodeFilter {
    // Implementation: Utility methods
    //----------------------------------------------------------------

-    /**
-     * Returns the character after the given position, or '\uFFFE' if
-     * there is none.
-     */
-    private static final char charAfter(String str, int i) {
-        return ((++i) < str.length()) ? str.charAt(i) : '\uFFFE';
-    }
-
    private void ensureCapacity(int newLen) {
        if (newLen <= list.length) return;
        int[] temp = new int[newLen + GROW_EXTRA];
@ -1571,6 +1741,7 @@ public class UnicodeSet implements UnicodeFilter {
        int[] temp = list;
        list = buffer;
        buffer = temp;
+        pat = null;
        return this;
    }

@ -1668,6 +1839,7 @@ public class UnicodeSet implements UnicodeFilter {
        int[] temp = list;
        list = buffer;
        buffer = temp;
+        pat = null;
        return this;
    }

@ -1738,6 +1910,7 @@ public class UnicodeSet implements UnicodeFilter {
        int[] temp = list;
        list = buffer;
        buffer = temp;
+        pat = null;
        return this;
    }

--- a/icu4j/src/com/ibm/test/translit/CompoundTransliteratorTest.java
+++ b/icu4j/src/com/ibm/test/translit/CompoundTransliteratorTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/CompoundTransliteratorTest.java,v $ 
- * $Date: 2001/09/08 01:17:50 $ 
- * $Revision: 1.2 $
+ * $Date: 2001/09/24 19:56:41 $ 
+ * $Revision: 1.3 $
 *
 *****************************************************************************************
 */
@ -111,7 +111,7 @@ public class CompoundTransliteratorTest extends TestFmwk {
 
    public void TestGetTransliterator(){
        logln("Testing the getTransliterator() API of CompoundTransliterator");
-        String ID="Latin-Greek;Greek-Latin;Latin-Devanagari;Devanagari-Latin;Latin-Cyrillic;Cyrillic-Latin;Unicode-Hex;Hex-Unicode";
+        String ID="Latin-Greek;Greek-Latin;Latin-Devanagari;Devanagari-Latin;Latin-Cyrillic;Cyrillic-Latin;Any-Hex;Hex-Any";
        CompoundTransliterator ct1=null;
        try{
            ct1=new CompoundTransliterator(ID);
@ -141,9 +141,9 @@ public class CompoundTransliteratorTest extends TestFmwk {
        logln("Testing the handleTransliterate() API of CompoundTransliterator");
        CompoundTransliterator ct1=null;
        try{
-            ct1=new CompoundTransliterator("Unicode-Hex;Hex-Unicode");
+            ct1=new CompoundTransliterator("Any-Hex;Hex-Any");
        }catch(IllegalArgumentException iae){
-            errln("FAIL: construction using CompoundTransliterator(String ID) failed for " + "Unicode-Hex;Hex-Unicode");
+            errln("FAIL: construction using CompoundTransliterator(String ID) failed for " + "Any-Hex;Hex-Any");
            throw iae;
        }
    
@ -167,8 +167,8 @@ public class CompoundTransliteratorTest extends TestFmwk {
       
        String Data[]={
             //ID, input string, transliterated string
-             "Unicode-Hex;Hex-Unicode;Unicode-Hex",     "hello",  "\\u0068\\u0065\\u006C\\u006C\\u006F", 
-             "Unicode-Hex;Hex-Unicode",                 "hello! How are you?",  "hello! How are you?",
+             "Any-Hex;Hex-Any;Any-Hex",     "hello",  "\\u0068\\u0065\\u006C\\u006C\\u006F", 
+             "Any-Hex;Hex-Any",                 "hello! How are you?",  "hello! How are you?",
             "Devanagari-Latin;Latin-Devanagari",       "\u092D\u0948'\u0930'\u0935",  "\u092D\u0948\u0930\u0935", // quotes lost
             "Latin-Cyrillic;Cyrillic-Latin",           "a'b'k'd'e'f'g'h'i'j'Shch'shch'zh'h", "abkdefghijShchshchzhh",
             "Latin-Greek;Greek-Latin",                 "ABGabgAKLMN", "ABGabgAKLMN",
--- a/icu4j/src/com/ibm/test/translit/HexToUnicodeTransliteratorTest.java
+++ b/icu4j/src/com/ibm/test/translit/HexToUnicodeTransliteratorTest.java
@ -6,8 +6,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/HexToUnicodeTransliteratorTest.java,v $ 
- * $Date: 2000/10/09 16:32:07 $ 
- * $Revision: 1.2 $
+ * $Date: 2001/09/24 19:56:41 $ 
+ * $Revision: 1.3 $
 *
 *****************************************************************************************
 */
@ -33,7 +33,7 @@ public class HexToUnicodeTransliteratorTest extends TestFmwk {
    * Used by TestConstruction() and TestTransliterate.
    */
    UnicodeFilter HexFilter=new UnicodeFilter() {
-        public boolean contains(char c) {
+        public boolean contains(int c) {
            if(c == 0x0061 || c == 0x0063 )
                return false;
            else
--- a/icu4j/src/com/ibm/test/translit/TransliteratorTest.java
+++ b/icu4j/src/com/ibm/test/translit/TransliteratorTest.java
@ -4,9 +4,9 @@
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 *
- * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $ 
- * $Date: 2001/09/21 21:23:34 $ 
- * $Revision: 1.45 $
+ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $
+ * $Date: 2001/09/24 19:56:41 $
+ * $Revision: 1.46 $
 *
 *****************************************************************************************
 */
@ -48,14 +48,12 @@ public class TransliteratorTest extends TestFmwk {
                throw ex;
            }

-            // TODO remove check for class when we implement full
-            // toRules().
-            if (t != null && t instanceof RuleBasedTransliterator) {
+            if (t != null) {
                // Now test toRules
                String rules = null;
                try {
-                    rules = ((RuleBasedTransliterator)t).toRules(true);
-                
+                    rules = t.toRules(true);
+
                    Transliterator u = Transliterator.createFromRules("x",
                                           rules, Transliterator.FORWARD);
                } catch (IllegalArgumentException ex2) {
@ -74,7 +72,7 @@ public class TransliteratorTest extends TestFmwk {
        } catch (IllegalArgumentException ex) {
            logln("OK: Bogus ID handled properly");
        }
-        
+
        ms = System.currentTimeMillis() - ms;
        logln("Elapsed time: " + ms + " ms");
    }
@ -223,7 +221,7 @@ public class TransliteratorTest extends TestFmwk {
     * Basic test of keyboard.
     */
    public void TestKeyboard() {
-        Transliterator t = new RuleBasedTransliterator("<ID>", 
+        Transliterator t = new RuleBasedTransliterator("<ID>",
                                                       "psch>Y;"
                                                       +"ps>y;"
                                                       +"ch>x;"
@ -246,7 +244,7 @@ public class TransliteratorTest extends TestFmwk {
     * Basic test of keyboard with cursor.
     */
    public void TestKeyboard2() {
-        Transliterator t = new RuleBasedTransliterator("<ID>", 
+        Transliterator t = new RuleBasedTransliterator("<ID>",
                                                       "ych>Y;"
                                                       +"ps>|y;"
                                                       +"ch>x;"
@ -381,7 +379,7 @@ public class TransliteratorTest extends TestFmwk {
    public void TestFiltering() {
        Transliterator hex = Transliterator.getInstance("Any-Hex");
        hex.setFilter(new UnicodeFilter() {
-            public boolean contains(char c) {
+            public boolean contains(int c) {
                return c != 'c';
            }
        });
@ -398,7 +396,7 @@ public class TransliteratorTest extends TestFmwk {
    /**
     * Test anchors
     */
-    public void TestAnchors() { 
+    public void TestAnchors() {
        expect("^ab  > 01 ;" +
               " ab  > |8 ;" +
               "  b  > k ;" +
@ -406,7 +404,7 @@ public class TransliteratorTest extends TestFmwk {
               " 8x  > 77 ;",

               "ababbabxabx",
-               "018k7745");  
+               "018k7745");
        expect("$s = [z$] ;" +
               "$s{ab    > 01 ;" +
               "   ab    > |8 ;" +
@ -440,7 +438,7 @@ public class TransliteratorTest extends TestFmwk {
     */
    public void TestJ277() {
        Transliterator gl = Transliterator.getInstance("Greek-Latin");
-        
+
        char sigma = (char)0x3C3;
        char upsilon = (char)0x3C5;
        char nu = (char)0x3BD;
@ -517,7 +515,7 @@ public class TransliteratorTest extends TestFmwk {

        // Try a custom Hex-Any
        // \\uXXXX and &#xXXXX;
-        HexToUnicodeTransliterator hex2 = new HexToUnicodeTransliterator("\\\\u###0;&\\#x###0\\;"); 
+        HexToUnicodeTransliterator hex2 = new HexToUnicodeTransliterator("\\\\u###0;&\\#x###0\\;");
        expect(hex2, "\\u61\\u062\\u0063\\u00645\\u66x&#x30;&#x031;&#x0032;&#x00033;",
               "abcd5fx012&#x00033;");

@ -740,7 +738,7 @@ public class TransliteratorTest extends TestFmwk {
        if (!got.equals(exp)) {
            errln("FAIL: Inverse of " + ID + " is " + got +
                  ", expected " + exp);
-        }        
+        }
    }

    /**
@ -752,18 +750,18 @@ public class TransliteratorTest extends TestFmwk {
            "Hex[aeiou]-Any",
            "quizzical",
            "q\\u0075\\u0069zz\\u0069c\\u0061l",
-            
+
            "Any[aeiou]-Hex;Hex[^5]-Any",
            "Any[^5]-Hex;Hex[aeiou]-Any",
            "quizzical",
            "q\\u0075izzical",
-            
+
            "Null[abc]",
            "Null[abc]",
            "xyz",
            "xyz",
        };
-        
+
        for (int i=0; i<DATA.length; i+=4) {
            String ID = DATA[i];
            Transliterator t = Transliterator.getInstance(ID);
@ -796,6 +794,109 @@ public class TransliteratorTest extends TestFmwk {
               "Th qck brwn fx.");
    }

+    public void TestToRules() {
+        String RBT = "rbt";
+        String SET = "set";
+        String[] DATA = {
+            RBT,
+            "$a=\\u4E61; [$a] > A;",
+            "[\\u4E61] > A;",
+
+            RBT,
+            "$white=[[:Zs:][:Zl:]]; $white{a} > A;",
+            "[[:Zs:][:Zl:]]{a} > A;",
+
+            SET,
+            "[[:Zs:][:Zl:]]",
+            "[[:Zs:][:Zl:]]",
+
+            SET,
+            "[:Ps:]",
+            "[:Ps:]",
+
+            SET,
+            "[:L:]",
+            "[:L:]",
+
+            SET,
+            "[[:L:]-[A]]",
+            "[[:L:]-[A]]",
+
+            SET,
+            "[~[:Lu:][:Ll:]]",
+            "[~[:Lu:][:Ll:]]",
+
+            SET,
+            "[~[a-z]]",
+            "[~[a-z]]",
+
+            RBT,
+            "$white=[:Zs:]; $black=[^$white]; $black{a} > A;",
+            "[^[:Zs:]]{a} > A;",
+
+            RBT,
+            "$a=[:Zs:]; $b=[[a-z]-$a]; $b{a} > A;",
+            "[[a-z]-[:Zs:]]{a} > A;",
+
+            RBT,
+            "$a=[:Zs:]; $b=[$a&[a-z]]; $b{a} > A;",
+            "[[:Zs:]&[a-z]]{a} > A;",
+
+            RBT,
+            "$a=[:Zs:]; $b=[x$a]; $b{a} > A;",
+            "[x[:Zs:]]{a} > A;",
+        };
+
+        for (int d=0; d < DATA.length; d+=3) {
+            if (DATA[d] == RBT) {
+                // Transliterator test
+                Transliterator t = Transliterator.createFromRules("ID",
+                                       DATA[d+1], Transliterator.FORWARD);
+                if (t == null) {
+                    errln("FAIL: createFromRules failed");
+                    return;
+                }
+                String rules, escapedRules;
+                rules = t.toRules(false);
+                escapedRules = t.toRules(true);
+                String expRules = Utility.unescape(DATA[d+2]);
+                String expEscapedRules = DATA[d+2];
+                if (rules.equals(expRules)) {
+                    logln("Ok: " + DATA[d+1] +
+                          " => " + Utility.escape(rules));
+                } else {
+                    errln("FAIL: " + DATA[d+1] +
+                          " => " + Utility.escape(rules + ", exp " + expRules));
+                }
+                if (escapedRules.equals(expEscapedRules)) {
+                    logln("Ok: " + DATA[d+1] +
+                          " => " + escapedRules);
+                } else {
+                    errln("FAIL: " + DATA[d+1] +
+                          " => " + escapedRules + ", exp " + expEscapedRules);
+                }
+
+            } else {
+                // UnicodeSet test
+                String pat = DATA[d+1];
+                String expToPat = DATA[d+2];
+                UnicodeSet set = new UnicodeSet(pat);
+
+                // Adjust spacing etc. as necessary.
+                String toPat;
+                toPat = set.toPattern(true);
+                if (expToPat.equals(toPat)) {
+                    logln("Ok: " + pat +
+                          " => " + toPat);
+                } else {
+                    errln("FAIL: " + pat +
+                          " => " + Utility.escape(toPat) +
+                          ", exp " + Utility.escape(pat));
+                }
+            }
+        }
+    }
+
    /**
     * Test the case mapping transliterators.
     */
@ -806,7 +907,7 @@ public class TransliteratorTest extends TestFmwk {
            Transliterator.getInstance("Any-Lower[^xyzXYZ]");
        Transliterator toTitle =
            Transliterator.getInstance("Any-Title[^xyzXYZ]");
-        
+
        expect(toUpper, "The quick brown fox jumped over the lazy dogs.",
               "THE QUICK BROWN FOx JUMPED OVER THE LAzy DOGS.");
        expect(toLower, "The quIck brown fOX jUMPED OVER THE LAzY dogs.",
@ -857,7 +958,7 @@ public class TransliteratorTest extends TestFmwk {
                    errln("FAIL: " + DATA[i+2] +
                          " create ID \"" + DATA[i] + "\" => \"" +
                          t.getID() + "\", exp \"" + DATA[i+1] + "\"");
-                }   
+                }
            } catch (IllegalArgumentException e) {
                errln("FAIL: " + DATA[i+2] +
                      " create ID \"" + DATA[i] + "\"");
@ -875,52 +976,52 @@ public class TransliteratorTest extends TestFmwk {
            // Input               Decomposed            Composed
            {"cat",                "cat",                "cat"               },
            {"\u00e0ardvark",      "a\u0300ardvark",     "\u00e0ardvark"     },
-                                                         
+
            {"\u1e0a",             "D\u0307",            "\u1e0a"            }, // D-dot_above
            {"D\u0307",            "D\u0307",            "\u1e0a"            }, // D dot_above
-                                                         
+
            {"\u1e0c\u0307",       "D\u0323\u0307",      "\u1e0c\u0307"      }, // D-dot_below dot_above
            {"\u1e0a\u0323",       "D\u0323\u0307",      "\u1e0c\u0307"      }, // D-dot_above dot_below
            {"D\u0307\u0323",      "D\u0323\u0307",      "\u1e0c\u0307"      }, // D dot_below dot_above
-                                                         
+
            {"\u1e10\u0307\u0323", "D\u0327\u0323\u0307","\u1e10\u0323\u0307"}, // D dot_below cedilla dot_above
            {"D\u0307\u0328\u0323","D\u0328\u0323\u0307","\u1e0c\u0328\u0307"}, // D dot_above ogonek dot_below
-                                                         
+
            {"\u1E14",             "E\u0304\u0300",      "\u1E14"            }, // E-macron-grave
            {"\u0112\u0300",       "E\u0304\u0300",      "\u1E14"            }, // E-macron + grave
            {"\u00c8\u0304",       "E\u0300\u0304",      "\u00c8\u0304"      }, // E-grave + macron
-                                                         
+
            {"\u212b",             "A\u030a",            "\u00c5"            }, // angstrom_sign
            {"\u00c5",             "A\u030a",            "\u00c5"            }, // A-ring
-                                                         
+
            {"\u00fdffin",         "y\u0301ffin",        "\u00fdffin"        },	//updated with 3.0
            {"\u00fd\uFB03n",      "y\u0301\uFB03n",     "\u00fd\uFB03n"     },	//updated with 3.0
-                                                         
+
            {"Henry IV",           "Henry IV",           "Henry IV"          },
            {"Henry \u2163",       "Henry \u2163",       "Henry \u2163"      },
-                                                         
+
            {"\u30AC",             "\u30AB\u3099",       "\u30AC"            }, // ga (Katakana)
            {"\u30AB\u3099",       "\u30AB\u3099",       "\u30AC"            }, // ka + ten
            {"\uFF76\uFF9E",       "\uFF76\uFF9E",       "\uFF76\uFF9E"      }, // hw_ka + hw_ten
            {"\u30AB\uFF9E",       "\u30AB\uFF9E",       "\u30AB\uFF9E"      }, // ka + hw_ten
            {"\uFF76\u3099",       "\uFF76\u3099",       "\uFF76\u3099"      }, // hw_ka + ten
-                                                         
+
            {"A\u0300\u0316",      "A\u0316\u0300",      "\u00C0\u0316"      },
-        };                                                
-                                                          
-        String[][] COMPAT = {                        
+        };
+
+        String[][] COMPAT = {
            // Input               Decomposed            Composed
            {"\uFB4f",             "\u05D0\u05DC",       "\u05D0\u05DC"      }, // Alef-Lamed vs. Alef, Lamed
-                                                         
+
            {"\u00fdffin",         "y\u0301ffin",        "\u00fdffin"        },	//updated for 3.0
            {"\u00fd\uFB03n",      "y\u0301ffin",        "\u00fdffin"        }, // ffi ligature -> f + f + i
-                                                         
+
            {"Henry IV",           "Henry IV",           "Henry IV"          },
            {"Henry \u2163",       "Henry IV",           "Henry IV"          },
-                                                         
+
            {"\u30AC",             "\u30AB\u3099",       "\u30AC"            }, // ga (Katakana)
            {"\u30AB\u3099",       "\u30AB\u3099",       "\u30AC"            }, // ka + ten
-                                                         
+
            {"\uFF76\u3099",       "\u30AB\u3099",       "\u30AC"            }, // hw_ka + ten
        };

@ -960,7 +1061,7 @@ public class TransliteratorTest extends TestFmwk {
        expect(t, "\u010dx", "c\u030C");
        }
    }
-    
+
    /**
     * Test compound RBT rules.
     */
@ -1116,7 +1217,7 @@ public class TransliteratorTest extends TestFmwk {
                append('|').
                append(s.substring(index.start));
        }
-        
+
        // As a final step in keyboard transliteration, we must call
        // transliterate to finish off any pending partial matches that
        // were waiting for more input.
@ -1135,7 +1236,7 @@ public class TransliteratorTest extends TestFmwk {
                  result.equals(expectedResult),
                  expectedResult);
    }
-    
+
    void expectAux(String tag, String summary, boolean pass,
                   String expectedResult) {
        if (pass) {
--- a/icu4j/src/com/ibm/test/translit/UnicodeFilterLogicTest.java
+++ b/icu4j/src/com/ibm/test/translit/UnicodeFilterLogicTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/UnicodeFilterLogicTest.java,v $ 
- * $Date: 2000/10/04 23:12:33 $ 
- * $Revision: 1.2 $
+ * $Date: 2001/09/24 19:56:41 $ 
+ * $Revision: 1.3 $
 *
 *****************************************************************************************
 */
@ -29,7 +29,7 @@ public class UnicodeFilterLogicTest extends TestFmwk {


   UnicodeFilter Filter1=new UnicodeFilter() {
-        public boolean contains(char c) {
+        public boolean contains(int c) {
            if(c == 0x0061 || c == 0x0041 || c == 0x0063 || c == 0x0043)
                return false;
            else
@ -37,7 +37,7 @@ public class UnicodeFilterLogicTest extends TestFmwk {
       }
   };
   UnicodeFilter Filter2=new UnicodeFilter() {
-        public boolean contains(char c) {
+        public boolean contains(int c) {
            if(c == 0x0079 || c == 0x0059 || c == 0x007a || c == 0x005a  || c == 0x0061 || c == 0x0063)
                return false;
            else
@ -47,7 +47,7 @@ public class UnicodeFilterLogicTest extends TestFmwk {

   public void TestAllFilters() {

-        Transliterator t1 = Transliterator.getInstance("Unicode-Hex");
+        Transliterator t1 = Transliterator.getInstance("Any-Hex");
        String source="abcdABCDyzYZ";

        //sanity testing wihtout any filter
--- a/icu4j/src/com/ibm/test/translit/UnicodeSetTest.java
+++ b/icu4j/src/com/ibm/test/translit/UnicodeSetTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/UnicodeSetTest.java,v $ 
- * $Date: 2001/09/08 01:17:50 $ 
- * $Revision: 1.11 $
+ * $Date: 2001/09/24 19:56:41 $ 
+ * $Revision: 1.12 $
 *
 *****************************************************************************************
 */
@ -181,8 +181,8 @@ public class UnicodeSetTest extends TestFmwk {
        set.clear();
        set.applyPattern("[A-Y 1-8 b-d l-y]");
        for (int i = 0; i<set.getRangeCount(); ++i) {
-            char a = set.getRangeStart(i);
-            char b = set.getRangeEnd(i);
+            int a = set.getRangeStart(i);
+            int b = set.getRangeEnd(i);
            if (!set.contains(a, b)) {
                errln("FAIL, should contain " + (char)a + '-' + (char)b +
                      " but doesn't: " + set);
@ -219,7 +219,7 @@ public class UnicodeSetTest extends TestFmwk {
        if (c.equals(exp)) {
            logln("c.complement(): " + c);
        } else {
-            errln("FAIL: c.complement() = " + c + ", expect " + exp);
+            errln(Utility.escape("FAIL: c.complement() = " + c + ", expect " + exp));
        }
        c.complement();
        exp.set((char)3, (char)15);
@ -252,13 +252,13 @@ public class UnicodeSetTest extends TestFmwk {
    public void TestIndexOf() {
        UnicodeSet set = new UnicodeSet("[a-cx-y3578]");
        for (int i=0; i<set.size(); ++i) {
-            char c = set.charAt(i);
+            int c = set.charAt(i);
            if (set.indexOf(c) != i) {
                errln("FAIL: charAt(" + i + ") = " + c +
                      " => indexOf() => " + set.indexOf(c));
            }
        }
-        char c = set.charAt(set.size());
+        int c = set.charAt(set.size());
        if (c != '\uFFFE') {
            errln("FAIL: charAt(<out of range>) = " +
                  Utility.escape(String.valueOf(c)));
--- a/icu4j/src/com/ibm/test/translit/UnicodeToHexTransliteratorTest.java
+++ b/icu4j/src/com/ibm/test/translit/UnicodeToHexTransliteratorTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/UnicodeToHexTransliteratorTest.java,v $ 
- * $Date: 2000/10/16 16:58:29 $ 
- * $Revision: 1.3 $
+ * $Date: 2001/09/24 19:56:41 $ 
+ * $Revision: 1.4 $
 *
 *****************************************************************************************
 */
@ -32,7 +32,7 @@ public class UnicodeToHexTransliteratorTest extends TestFmwk {
    * Used by TestConstruction() and TestTransliterate.
    */
    UnicodeFilter UniFilter=new UnicodeFilter() {
-        public boolean contains(char c) {
+        public boolean contains(int c) {
            if(c==0x0063 || c==0x0061 || c==0x0043 || c==0x0041)
                return false;
            else
--- a/icu4j/src/com/ibm/text/RuleBasedTransliterator.java
+++ b/icu4j/src/com/ibm/text/RuleBasedTransliterator.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/RuleBasedTransliterator.java,v $ 
- * $Date: 2001/09/21 21:24:04 $ 
- * $Revision: 1.44 $
+ * $Date: 2001/09/24 19:57:17 $ 
+ * $Revision: 1.45 $
 *
 *****************************************************************************************
 */
@ -279,7 +279,7 @@ import com.ibm.text.resources.ResourceReader;
 * <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
 * 
 * @author Alan Liu
- * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.44 $ $Date: 2001/09/21 21:24:04 $
+ * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.45 $ $Date: 2001/09/24 19:57:17 $
 */
 public class RuleBasedTransliterator extends Transliterator {

@ -542,7 +542,7 @@ public class RuleBasedTransliterator extends Transliterator {
            /**
             * Implement SymbolTable API.
             */
-            public UnicodeSet lookupSet(char ch) {
+            public UnicodeSet lookupSet(int ch) {
                // Note that we cannot use data.lookupSet() because the
                // set array has not been constructed yet.
                int i = ch - data.setVariablesBase;
@ -1579,6 +1579,9 @@ public class RuleBasedTransliterator extends Transliterator {

 /**
 * $Log: RuleBasedTransliterator.java,v $
+ * Revision 1.45  2001/09/24 19:57:17  alan
+ * jitterbug 60: implement toPattern in UnicodeSet; update UnicodeFilter.contains to take an int; update UnicodeSet to support code points to U+10FFFF
+ *
 * Revision 1.44  2001/09/21 21:24:04  alan
 * jitterbug 64: allow ::ID blocks in rules
 *
--- a/icu4j/src/com/ibm/text/SymbolTable.java
+++ b/icu4j/src/com/ibm/text/SymbolTable.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/SymbolTable.java,v $ 
- * $Date: 2000/08/30 20:40:30 $ 
- * $Revision: 1.6 $
+ * $Date: 2001/09/24 19:57:18 $ 
+ * $Revision: 1.7 $
 *
 *****************************************************************************************
 */
@ -41,8 +41,9 @@ public interface SymbolTable {
    /**
     * Lookup the UnicodeSet associated with the given character, and
     * return it.  Return <tt>null</tt> if not found.
+     * @param ch a 32-bit code point from 0 to 0x10FFFF.
     */
-    UnicodeSet lookupSet(char ch);
+    UnicodeSet lookupSet(int ch);

    /**
     * Parse a symbol reference name from the given string, starting
--- a/icu4j/src/com/ibm/text/UnicodeFilter.java
+++ b/icu4j/src/com/ibm/text/UnicodeFilter.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeFilter.java,v $ 
- * $Date: 2000/03/10 04:07:25 $ 
- * $Revision: 1.4 $
+ * $Date: 2001/09/24 19:57:18 $ 
+ * $Revision: 1.5 $
 *
 *****************************************************************************************
 */
@ -30,5 +30,5 @@ public interface UnicodeFilter {
     * filtered</b>, then <tt>contains()</tt> returns
     * <b><tt>false</tt></b>.
     */
-    boolean contains(char c);
+    boolean contains(int c);
 }
--- a/icu4j/src/com/ibm/text/UnicodeFilterLogic.java
+++ b/icu4j/src/com/ibm/text/UnicodeFilterLogic.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeFilterLogic.java,v $ 
- * $Date: 2000/03/10 04:07:25 $ 
- * $Revision: 1.3 $
+ * $Date: 2001/09/24 19:57:18 $ 
+ * $Revision: 1.4 $
 *
 *****************************************************************************************
 */
@ -28,7 +28,7 @@ public final class UnicodeFilterLogic {
     */
    public static UnicodeFilter not(final UnicodeFilter f) {
        return new UnicodeFilter() {
-            public boolean contains(char c) {
+            public boolean contains(int c) {
                return !f.contains(c);
            }
        };
@ -51,7 +51,7 @@ public final class UnicodeFilterLogic {
            return f;
        }
        return new UnicodeFilter() {
-            public boolean contains(char c) {
+            public boolean contains(int c) {
                return f.contains(c) && g.contains(c);
            }
        };
@ -66,7 +66,7 @@ public final class UnicodeFilterLogic {
     */
    public static UnicodeFilter and(final UnicodeFilter[] f) {
        return new UnicodeFilter() {
-            public boolean contains(char c) {
+            public boolean contains(int c) {
                for (int i=0; i<f.length; ++i) {
                    if (!f[i].contains(c)) {
                        return false;
@ -94,7 +94,7 @@ public final class UnicodeFilterLogic {
            return f;
        }
        return new UnicodeFilter() {
-            public boolean contains(char c) {
+            public boolean contains(int c) {
                return f.contains(c) || g.contains(c);
            }
        };
@ -109,7 +109,7 @@ public final class UnicodeFilterLogic {
     */
    public static UnicodeFilter or(final UnicodeFilter[] f) {
        return new UnicodeFilter() {
-            public boolean contains(char c) {
+            public boolean contains(int c) {
                for (int i=0; i<f.length; ++i) {
                    if (f[i].contains(c)) {
                        return true;
--- a/icu4j/src/com/ibm/text/UnicodeSet.java
+++ b/icu4j/src/com/ibm/text/UnicodeSet.java
@ -5,21 +5,22 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeSet.java,v $
- * $Date: 2001/09/20 21:20:00 $
- * $Revision: 1.33 $
+ * $Date: 2001/09/24 19:57:18 $
+ * $Revision: 1.34 $
 *
 *****************************************************************************************
 */
 package com.ibm.text;

 import java.text.*;
+import com.ibm.util.Utility;

 /**
 * A mutable set of Unicode characters.  Objects of this class
 * represent <em>character classes</em> used in regular expressions.
 * Such classes specify a subset of the set of all Unicode characters,
 * which in this implementation is the characters from U+0000 to
- * U+FFFF, ignoring surrogates.
+ * U+10FFFF.
 *
 * <p><code>UnicodeSet</code> supports two APIs. The first is the
 * <em>operand</em> API that allows the caller to modify the value of
@ -184,7 +185,7 @@ import java.text.*;
 * through 'z' and all letters in between, in Unicode order
 * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
 * all characters but 'a' through 'z',
- * that is, U+0000 through 'a'-1 and 'z'+1 through U+FFFF
+ * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
 * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
 * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
@ -254,10 +255,11 @@ import java.text.*;
 * *Unsupported by Java (and hence unsupported by UnicodeSet).
 *
 * @author Alan Liu
- * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.33 $ $Date: 2001/09/20 21:20:00 $ */
+ * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.34 $ $Date: 2001/09/24 19:57:18 $ */
 public class UnicodeSet implements UnicodeFilter {

    /* Implementation Notes.
+     * NOTE: This conversion has been completed as of 2.0.
     *
     * UnicodeSet currently represents only the characters U+0000 to
     * U+FFFF.  This allows the API to be written in terms of the Java
@ -285,24 +287,35 @@ public class UnicodeSet implements UnicodeFilter {
     */

    private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
-    private static final int HIGH = 0x10000; // HIGH > all valid values. 10000 for code units.
+    private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
                                             // 110000 for codepoints

    /**
     * Minimum value that can be stored in a UnicodeSet.
     */
-    public static final char MIN_VALUE = (char) LOW;
+    public static final int MIN_VALUE = LOW;

    /**
     * Maximum value that can be stored in a UnicodeSet.
     */
-    public static final char MAX_VALUE = (char) (HIGH - 1);
+    public static final int MAX_VALUE = HIGH - 1;

    private int len;      // length used; list may be longer to minimize reallocs
    private int[] list;   // MUST be terminated with HIGH
    private int[] rangeList; // internal buffer
    private int[] buffer; // internal buffer

+    /**
+     * The pattern representation of this set.  This may not be the
+     * most economical pattern.  It is the pattern supplied to
+     * applyPattern(), with variables substituted and whitespace
+     * removed.  For sets constructed without applyPattern(), or
+     * modified using the non-pattern API, this string will be null,
+     * indicating that toPattern() must generate a pattern
+     * representation from the inversion list.
+     */ 
+    private String pat = null;
+
    private static final int START_EXTRA = 16;         // initial storage. Must be >= 0
    private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0

@ -349,7 +362,7 @@ public class UnicodeSet implements UnicodeFilter {
     * @param start first character, inclusive, of range
     * @param end last character, inclusive, of range
     */
-    public UnicodeSet(char start, char end) {
+    public UnicodeSet(int start, int end) {
        this();
        complement(start, end);
    }
@ -418,7 +431,7 @@ public class UnicodeSet implements UnicodeFilter {
     * @param start first character in the set, inclusive
     * @rparam end last character in the set, inclusive
     */
-    public void set(char start, char end) {
+    public void set(int start, int end) {
        clear();
        complement(start, end);
    }
@ -431,6 +444,7 @@ public class UnicodeSet implements UnicodeFilter {
    public void set(UnicodeSet other) {
        list = (int[]) other.list.clone();
        len = other.len;
+        pat = other.pat;
    }

    /**
@ -475,24 +489,6 @@ public class UnicodeSet implements UnicodeFilter {
        }
    }

-    /**
-     * Append the <code>toPattern()</code> representation of a
-     * character to the given <code>StringBuffer</code>.
-     */
-    private static final void _toPat(StringBuffer buf, char c) {
-        // Okay to let ':' pass through
-        switch (c) {
-        case '[':
-        case ']':
-        case '-':
-        case '^':
-        case '&':
-        case '\\':
-            buf.append('\\');
-        }
-        buf.append(c);
-    }
-
    /**
     * Append the <code>toPattern()</code> representation of a
     * character to the given <code>StringBuffer</code>.
@ -575,6 +571,53 @@ public class UnicodeSet implements UnicodeFilter {
     */
    public String toPattern(boolean escapeUnprintable) {
        StringBuffer result = new StringBuffer();
+        return _toPattern(result, escapeUnprintable).toString();
+    }
+
+    /**
+     * Append a string representation of this set to result.  This will be
+     * a cleaned version of the string passed to applyPattern(), if there
+     * is one.  Otherwise it will be generated.
+     */
+    private StringBuffer _toPattern(StringBuffer result,
+                                    boolean escapeUnprintable) {
+        if (pat != null) {
+            int i;
+            int backslashCount = 0;
+            for (i=0; i<pat.length(); ++i) {
+                char c = pat.charAt(i);
+                if (escapeUnprintable && _isUnprintable(c)) {
+                    // If the unprintable character is preceded by an odd
+                    // number of backslashes, then it has been escaped.
+                    // Before unescaping it, we delete the final
+                    // backslash.
+                    if ((backslashCount % 2) == 1) {
+                        result.setLength(result.length() - 1);
+                    }
+                    _escapeUnprintable(result, c);
+                    backslashCount = 0;
+                } else {
+                    result.append(c);
+                    if (c == '\\') {
+                        ++backslashCount;
+                    } else {
+                        backslashCount = 0;
+                    }
+                }
+            }
+            return result;
+        }
+        
+        return _generatePattern(result, escapeUnprintable);
+    }
+
+    /**
+     * Generate and append a string representation of this set to result.
+     * This does not use this.pat, the cleaned up copy of the string
+     * passed to applyPattern(). 
+     */
+    public StringBuffer _generatePattern(StringBuffer result,
+                                         boolean escapeUnprintable) {
        result.append('[');

        // Check against the predefined categories.  We implicitly build
@ -583,7 +626,7 @@ public class UnicodeSet implements UnicodeFilter {
            if (this.equals(getCategorySet(cat))) {
                result.append(':');
                result.append(CATEGORY_NAMES.substring(cat*2, cat*2+2));
-                return result.append(":]").toString();
+                return result.append(":]");
            }
        }

@ -623,7 +666,7 @@ public class UnicodeSet implements UnicodeFilter {
            }
        }

-        return result.append(']').toString();
+        return result.append(']');
    }

    /**
@ -659,7 +702,13 @@ public class UnicodeSet implements UnicodeFilter {
     * @return <tt>true</tt> if this set contains the specified range
     * of chars.
     */
-    public boolean contains(char start, char end) {
+    public boolean contains(int start, int end) {
+        if (start < MIN_VALUE || start > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
+        }
+        if (end < MIN_VALUE || end > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
+        }
        int i = -1;
        while (true) {
            if (start < list[++i]) break;
@ -674,7 +723,10 @@ public class UnicodeSet implements UnicodeFilter {
     * <code>charAt()</code>.
     * @return an index from 0..size()-1, or -1
     */
-    public int indexOf(char c) {
+    public int indexOf(int c) {
+        if (c < MIN_VALUE || c > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
+        }
        int i = 0;
        int n = 0;
        for (;;) {
@ -697,7 +749,7 @@ public class UnicodeSet implements UnicodeFilter {
     * <code>indexOf()</code>.
     * @param index an index from 0..size()-1
     */
-    public char charAt(int index) {
+    public int charAt(int index) {
        if (index >= 0) {
            for (int i=0; i < len;) {
                int start = list[i++];
@ -716,12 +768,14 @@ public class UnicodeSet implements UnicodeFilter {
     *
     * @return <tt>true</tt> if this set contains the specified char.
     */
-    public boolean contains(char c) {
-//| Not needed unless HIGH > 0x10000
-//|     // catch degenerate cases
-//|     if (c == HIGH) {   // catch final, so we don't do it in loop!
-//|         return (len & 1) == 0;  // even length includes everything
-//|     }
+    public boolean contains(int c) {
+        if (c < MIN_VALUE || c > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
+        }
+        // catch degenerate cases (not needed unless HIGH > 0x10000
+        if (c == HIGH) {   // catch final, so we don't do it in loop!
+            return (len & 1) == 0;  // even length includes everything
+        }
        // Set i to the index of the start item greater than ch
        // We know we will terminate without length test!
        // LATER: for large sets, add binary search
@ -771,7 +825,13 @@ public class UnicodeSet implements UnicodeFilter {
     * @param end last character, inclusive, of range to be added
     * to this set.
     */
-    public void add(char start, char end) {
+    public void add(int start, int end) {
+        if (start < MIN_VALUE || start > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
+        }
+        if (end < MIN_VALUE || end > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
+        }
        if (start <= end) {
            add(range(start, end), 2, 0);
        }
@ -782,7 +842,7 @@ public class UnicodeSet implements UnicodeFilter {
     * present.  If this set already contains the specified character,
     * the call leaves this set unchanged.
     */
-    public final void add(char c) {
+    public final void add(int c) {
        add(c, c);
    }

@ -796,7 +856,13 @@ public class UnicodeSet implements UnicodeFilter {
     * @param end last character, inclusive, of range to be retained
     * to this set.
     */
-    public void retain(char start, char end) {
+    public void retain(int start, int end) {
+        if (start < MIN_VALUE || start > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
+        }
+        if (end < MIN_VALUE || end > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
+        }
        if (start <= end) {
            retain(range(start, end), 2, 0);
        } else {
@ -807,7 +873,7 @@ public class UnicodeSet implements UnicodeFilter {
    /**
     * Retain the specified character from this set if it is present.
     */
-    public final void retain(char c) {
+    public final void retain(int c) {
        retain(c, c);
    }

@ -822,7 +888,13 @@ public class UnicodeSet implements UnicodeFilter {
     * @param end last character, inclusive, of range to be removed
     * from this set.
     */
-    public void remove(char start, char end) {
+    public void remove(int start, int end) {
+        if (start < MIN_VALUE || start > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
+        }
+        if (end < MIN_VALUE || end > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
+        }
        if (start <= end) {
            retain(range(start, end), 2, 2);
        }
@ -833,7 +905,7 @@ public class UnicodeSet implements UnicodeFilter {
     * The set will not contain the specified character once the call
     * returns.
     */
-    public final void remove(char c) {
+    public final void remove(int c) {
        remove(c, c);
    }

@ -848,7 +920,13 @@ public class UnicodeSet implements UnicodeFilter {
     * @param end last character, inclusive, of range to be removed
     * from this set.
     */
-    public void complement(char start, char end) {
+    public void complement(int start, int end) {
+        if (start < MIN_VALUE || start > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
+        }
+        if (end < MIN_VALUE || end > MAX_VALUE) {
+            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
+        }
        if (start <= end) {
            xor(range(start, end), 2, 0);
        }
@ -859,7 +937,7 @@ public class UnicodeSet implements UnicodeFilter {
     * will be removed if it is in this set, or will be added if it is
     * not in this set.
     */
-    public final void complement(char c) {
+    public final void complement(int c) {
        complement(c, c);
    }

@ -878,6 +956,7 @@ public class UnicodeSet implements UnicodeFilter {
            list[0] = LOW;
            ++len;
        }
+        pat = null;
    }

    /**
@ -960,6 +1039,7 @@ public class UnicodeSet implements UnicodeFilter {
    public void clear() {
        list[0] = HIGH;
        len = 1;
+        pat = null;
    }

    /**
@ -980,8 +1060,8 @@ public class UnicodeSet implements UnicodeFilter {
     * @see #getRangeCount
     * @see #getRangeEnd
     */
-    public char getRangeStart(int index) {
-        return (char) list[index*2];
+    public int getRangeStart(int index) {
+        return list[index*2];
    }

    /**
@ -992,8 +1072,8 @@ public class UnicodeSet implements UnicodeFilter {
     * @see #getRangeStart
     * @see #getRangeEnd
     */
-    public char getRangeEnd(int index) {
-        return (char) (list[index*2 + 1] - 1);
+    public int getRangeEnd(int index) {
+        return (list[index*2 + 1] - 1);
    }

    /**
@ -1052,7 +1132,7 @@ public class UnicodeSet implements UnicodeFilter {
     * Return a programmer-readable string representation of this object.
     */
    public String toString() {
-        return getClass().getName() + '{' + toPattern(false) + '}';
+        return getClass().getName() + '(' + toPattern(false) + ')';
    }

    //----------------------------------------------------------------
@ -1081,13 +1161,37 @@ public class UnicodeSet implements UnicodeFilter {
     * of <code>pattern</code>
     * @exception java.lang.IllegalArgumentException if the parse fails.
     */
-    void applyPattern(String pattern, ParsePosition pos,
-                      SymbolTable symbols, boolean ignoreWhitespace) {
+    void applyPattern(String pattern,
+                      ParsePosition pos,
+                      SymbolTable symbols,
+                      boolean ignoreWhitespace) {

+        // Need to build the pattern in a temporary string because
+        // _applyPattern calls add() etc., which set pat to empty.
+        StringBuffer rebuiltPat = new StringBuffer();
+        _applyPattern(pattern, pos, symbols, rebuiltPat, ignoreWhitespace);
+        pat = rebuiltPat.toString();
+    }
+
+    void _applyPattern(String pattern, ParsePosition pos,
+                       SymbolTable symbols, StringBuffer rebuiltPat,
+                       boolean ignoreWhitespace) {
+
+        // If the pattern contains any of the following, we save a
+        // rebuilt (variable-substituted) copy of the source pattern:
+        // - a category
+        // - an intersection or subtraction operator
+        // - an anchor (trailing '$', indicating RBT ether)
+        boolean rebuildPattern = false;
+        StringBuffer newPat = new StringBuffer("[");
+        int nestedPatStart = -1; // see below for usage
+        boolean nestedPatDone = false; // see below for usage
+        
        boolean invert = false;
        clear();

-        int lastChar = -1; // This is either a char (0..FFFF) or -1
+        final int NONE = -1;
+        int lastChar = NONE; // This is either a char (0..10FFFF) or -1
        char lastOp = 0;

        /* This loop iterates over the characters in the pattern.  We start at
@ -1109,7 +1213,7 @@ public class UnicodeSet implements UnicodeFilter {
        // mode 2: '[' '^'? seen; parse pattern and close with ']'
        // mode 3: '[:' seen; parse category and close with ':]'
        int mode = 0;
-        int openPos = 0; // offset to opening '['
+        int colonPos = 0; // Expected pos of ':' in '[:'
        int start = pos.getIndex();
        int i = start;
        int limit = pattern.length();
@ -1120,33 +1224,37 @@ public class UnicodeSet implements UnicodeFilter {
        char[] varValueBuffer = null;
        int ivarValueBuffer = 0;
        int anchor = 0;
-        for (; i<limit; i+=((varValueBuffer==null)?1:0)) {
+        int c;
+        while (i<limit) {
            /* If the next element is a single character, c will be set to it,
             * and nestedSet will be null.  In this case isLiteral indicates
             * whether the character should assume special meaning if it has
             * one.  If the next element is a nested set, either via a variable
             * reference, or via an embedded "[..]"  or "[:..:]" pattern, then
-             * nestedSet will be set to the i-list for the nested set, and
+             * nestedSet will be set to the pairs list for the nested set, and
             * c's value should be ignored.
             */
            UnicodeSet nestedSet = null;
            boolean isLiteral = false;
-            char c;
            if (varValueBuffer != null) {
                if (ivarValueBuffer < varValueBuffer.length) {
-                    c = varValueBuffer[ivarValueBuffer++];
+                    c = UTF16.charAt(varValueBuffer, 0, varValueBuffer.length, ivarValueBuffer);
+                    ivarValueBuffer += UTF16.getCharCount(c);
                    nestedSet = symbols.lookupSet(c); // may be NULL
+                    nestedPatDone = false;
                } else {
                    varValueBuffer = null;
-                    c = pattern.charAt(i);
+                    c = UTF16.charAt(pattern, i);
+                    i += UTF16.getCharCount(c);
                }
            } else {
-                c = pattern.charAt(i);
+                c = UTF16.charAt(pattern, i);
+                i += UTF16.getCharCount(c);
            }

            // Ignore whitespace.  This is not Unicode whitespace, but Java
            // whitespace, a subset of Unicode whitespace.
-            if (ignoreWhitespace && Character.isWhitespace(c)) {
+            if (ignoreWhitespace && UCharacter.isWhitespace(c)) {
                continue;
            }

@ -1160,7 +1268,7 @@ public class UnicodeSet implements UnicodeFilter {
            case 0:
                if (c == '[') {
                    mode = 1; // Next look for '^'
-                    openPos = i;
+                    colonPos = i; // Expect ':' at next offset
                    continue;
                } else {
                    throw new IllegalArgumentException("Missing opening '['");
@ -1170,14 +1278,17 @@ public class UnicodeSet implements UnicodeFilter {
                switch (c) {
                case '^':
                    invert = true;
+                    newPat.append((char) c);
                    continue; // Back to top to fetch next character
                case ':':
-                    if (i == openPos+1) {
+                    if (i-1 == colonPos) {
                        // '[:' cannot have whitespace in it
-                        --i;
+                        --i; // Backup to the '['
                        c = '[';
                        mode = 3;
-                        // Fall through and parse category normally
+                        // Fall through and parse category using the same
+                        // code used to parse a nested category.  The mode
+                        // will indicate that this is actually top level.
                    }
                    break; // Fall through
                case '-':
@ -1202,27 +1313,19 @@ public class UnicodeSet implements UnicodeFilter {
                 * interpret '\\uxxxx' Unicode escapes here (as literals).
                 */
                if (c == '\\') {
-                    ++i;
-                    if (i < limit) {
-                        c = pattern.charAt(i);
-                        isLiteral = true;
-                        if (c == 'u') {
-                            if ((i+4) >= limit) {
-                                throw new IllegalArgumentException("Invalid \\u escape");
-                            }
-                            c = '\u0000';
-                            for (int j=(++i)+4; i<j; ++i) { // [sic]
-                                int digit = Character.digit(pattern.charAt(i), 16);
-                                if (digit<0) {
-                                    throw new IllegalArgumentException("Invalid \\u escape");
-                                }
-                                c = (char) ((c << 4) | digit);
-                            }
-                            --i; // Move i back to last parsed character
-                        }
-                    } else {
-                        throw new IllegalArgumentException("Trailing '\\'");
+                    int[] offset = new int[] { i };
+                    int escaped = Utility.unescapeAt(pattern, offset);
+                    if (escaped == -1) {
+                        int sta = Math.max(i - 8, 0);
+                        int lim = Math.min(i + 16, pattern.length());
+                        throw new IllegalArgumentException("Invalid escape sequence " +
+                                                           pattern.substring(sta, i-1) +
+                                                           "|" +
+                                                           pattern.substring(i-1, lim));
                    }
+                    i = offset[0];
+                    isLiteral = true;
+                    c = escaped;
                }

                /* Parse variable references.  These are treated as literals.  If a
@ -1232,7 +1335,7 @@ public class UnicodeSet implements UnicodeFilter {
                 * Set variables are only looked up if varCharToSet is not null.
                 */
                else if (symbols != null && !isLiteral && c == SymbolTable.SYMBOL_REF) {
-                    pos.setIndex(++i);
+                    pos.setIndex(i);
                    String name = symbols.parseReference(pattern, pos, limit);
                    if (name != null) {
                        varValueBuffer = symbols.lookup(name);
@ -1246,7 +1349,6 @@ public class UnicodeSet implements UnicodeFilter {
                        // Got a null; this means we have an isolated $.
                        // Tentatively assume this is an anchor.
                        anchor = 1;
-                        --i; // Back up so loop increment works properly
                    }
                    continue; // Back to the top to get varValueBuffer[0]
                }
@ -1256,28 +1358,56 @@ public class UnicodeSet implements UnicodeFilter {
                 * recognize these here and set nestedSet accordingly.
                 */
                else if (!isLiteral && c == '[') {
+                    // Record position before nested pattern
+                    nestedPatStart = newPat.length();
+
                    // Handle "[:...:]", representing a character category
-                    char d = charAfter(pattern, i);
-                    if (d == ':') {
-                        i += 2;
+                    if (i < pattern.length() && pattern.charAt(i) == ':') {
+                        ++i;
                        int j = pattern.indexOf(":]", i);
                        if (j < 0) {
                            throw new IllegalArgumentException("Missing \":]\"");
                        }
+                        String scratch = pattern.substring(i, j);
                        nestedSet = new UnicodeSet();
-                        nestedSet.applyCategory(pattern.substring(i, j));
-                        i = j+1; // Make i point to ']' in ":]"
+                        nestedSet.applyCategory(scratch);
+                        nestedPatDone = true; // We're going to do it just below
+                        i = j+2; // Advance i past ":]"
+
+                        // Use a rebuilt pattern.  If we are top level,
+                        // then there is already a SET_OPEN in newPat, and
+                        // SET_CLOSE will be appended elsewhere.
+                        if (mode != 3) {
+                            newPat.append('[');
+                        }
+                        newPat.append(':').append(scratch).append(':');
+                        if (mode != 3) {
+                            newPat.append(']');
+                        }
+                        rebuildPattern = true;
+
                        if (mode == 3) {
-                            // Entire pattern is a category; leave parse loop
+                            // Entire pattern is a category; leave parse
+                            // loop.  This is one of 2 ways we leave this
+                            // loop if the pattern is well-formed.
                            set(nestedSet);
+                            mode = 4;
                            break;
                        }
                    } else {
-                        // Recurse to get the i-list for this nested set.
-                        pos.setIndex(i); // Add 2 to point AFTER op
+                        // Recurse to get the pairs for this nested set.
+                        // Backup i to '['.
+                        pos.setIndex(--i);
+                        switch (lastOp) {
+                        case '-':
+                        case '&':
+                            newPat.append(lastOp);
+                            break;
+                        }
                        nestedSet = new UnicodeSet();
-                        nestedSet.applyPattern(pattern, pos, symbols, ignoreWhitespace);
-                        i = pos.getIndex() - 1; // - 1 to point at ']'
+                        nestedSet._applyPattern(pattern, pos, symbols, newPat, ignoreWhitespace);
+                        nestedPatDone = true;
+                        i = pos.getIndex();
                    }
                }
            }
@ -1291,12 +1421,23 @@ public class UnicodeSet implements UnicodeFilter {
             * ']' have special meanings.
             */
            if (nestedSet != null) {
-                if (lastChar >= 0) {
+                if (lastChar != NONE) {
                    if (lastOp != 0) {
                        throw new IllegalArgumentException("Illegal rhs for " + lastChar + lastOp);
                    }
                    add((char) lastChar, (char) lastChar);
-                    lastChar = -1;
+                    if (nestedPatDone) {
+                        // If there was a character before the nested set,
+                        // then we need to insert it in newPat before the
+                        // pattern for the nested set.  This position was
+                        // recorded in nestedPatStart.
+                        StringBuffer s = new StringBuffer();
+                        _appendToPat(s, lastChar, false);
+                        newPat.insert(nestedPatStart, s.toString());
+                    } else {
+                        _appendToPat(newPat, lastChar, false);
+                    }
+                    lastChar = NONE;
                }
                switch (lastOp) {
                case '-':
@ -1309,7 +1450,19 @@ public class UnicodeSet implements UnicodeFilter {
                    addAll(nestedSet);
                    break;
                }
+
+                // Get the pattern for the nested set, if we haven't done so
+                // already.
+                if (!nestedPatDone) {
+                    if (lastOp != 0) {
+                        newPat.append(lastOp);
+                    }
+                    nestedSet._toPattern(newPat, false);
+                }
+                rebuildPattern = true;
+
                lastOp = 0;
+
            } else if (!isLiteral && c == ']') {
                // Final closing delimiter.  This is the only way we leave this
                // loop if the pattern is well-formed.
@ -1318,11 +1471,14 @@ public class UnicodeSet implements UnicodeFilter {
                    
                }
                if (anchor == 2) {
+                    rebuildPattern = true;
+                    newPat.append(SymbolTable.SYMBOL_REF);
                    add(TransliterationRule.ETHER);
                }
+                mode = 4;
                break;
            } else if (lastOp == 0 && !isLiteral && (c == '-' || c == '&')) {
-                lastOp = c;
+                lastOp = (char) c;
            } else if (lastOp == '-') {
                if (lastChar >= c) {
                    // Don't allow redundant (a-a) or empty (b-a) ranges;
@ -1330,36 +1486,45 @@ public class UnicodeSet implements UnicodeFilter {
                    throw new IllegalArgumentException("Invalid range " + lastChar +
                                                       '-' + c);
                }
-                add((char) lastChar, c);
+                add(lastChar, c);
+                _appendToPat(newPat, lastChar, false);
+                newPat.append('-');
+                _appendToPat(newPat, c, false);
                lastOp = 0;
-                lastChar = -1;
+                lastChar = NONE;
            } else if (lastOp != 0) {
                // We have <set>&<char> or <char>&<char>
                throw new IllegalArgumentException("Unquoted " + lastOp);
            } else {
-                if (lastChar >= 0) {
+                if (lastChar != NONE) {
                    // We have <char><char>
                    add((char) lastChar, (char) lastChar);
+                    _appendToPat(newPat, lastChar, false);
                }
                lastChar = c;
            }
        }

-        if (mode == 0) {
-            throw new IllegalArgumentException("Missing '[' in \"" +
-                                               pattern.substring(start) + '"');
+        if (lastChar != NONE) {
+            add(lastChar, lastChar);
+            _appendToPat(newPat, lastChar, false);
        }

+//      if (mode == 0) {
+//          throw new IllegalArgumentException("Missing '[' in \"" +
+//                                             pattern.substring(start) + '"');
+//      }
+
        // Handle unprocessed stuff preceding the closing ']'
        if (lastOp == '-') {
            // Trailing '-' is treated as literal
            add(lastOp, lastOp);
+            newPat.append('-');
        } else if (lastOp == '&') {
            throw new IllegalArgumentException("Unquoted trailing " + lastOp);
        }
-        if (lastChar >= 0) {
-            add((char) lastChar, (char) lastChar);
-        }
+
+        newPat.append(']');

        /**
         * If we saw a '^' after the initial '[' of this pattern, then perform
@ -1369,17 +1534,30 @@ public class UnicodeSet implements UnicodeFilter {
            complement();
        }

-        /**
-         * i indexes the last character we parsed or is pattern.length().  In
-         * the latter case, we have run off the end without finding a closing
-         * ']'.  Otherwise, we know i < pattern.length(), and we set the
-         * ParsePosition to the next character to be parsed.
-         */
-        if (i == limit) {
-            throw new IllegalArgumentException("Missing ']' in \"" +
-                                               pattern.substring(start) + '"');
+        if (mode != 4) {
+            throw new IllegalArgumentException("Missing ']'");
+        }
+
+//      /**
+//       * i indexes the last character we parsed or is pattern.length().  In
+//       * the latter case, we have run off the end without finding a closing
+//       * ']'.  Otherwise, we know i < pattern.length(), and we set the
+//       * ParsePosition to the next character to be parsed.
+//       */
+//      if (i == limit) {
+//          throw new IllegalArgumentException("Missing ']' in \"" +
+//                                             pattern.substring(start) + '"');
+//      }
+  
+        pos.setIndex(i);
+
+        // Use the rebuilt pattern (newPat) only if necessary.  Prefer the
+        // generated pattern.
+        if (rebuildPattern) {
+            rebuiltPat.append(newPat.toString());
+        } else {
+            _generatePattern(rebuiltPat, false);
        }
-        pos.setIndex(i+1);

        if (false) {
            // Debug parser
@ -1494,14 +1672,6 @@ public class UnicodeSet implements UnicodeFilter {
    // Implementation: Utility methods
    //----------------------------------------------------------------

-    /**
-     * Returns the character after the given position, or '\uFFFE' if
-     * there is none.
-     */
-    private static final char charAfter(String str, int i) {
-        return ((++i) < str.length()) ? str.charAt(i) : '\uFFFE';
-    }
-
    private void ensureCapacity(int newLen) {
        if (newLen <= list.length) return;
        int[] temp = new int[newLen + GROW_EXTRA];
@ -1571,6 +1741,7 @@ public class UnicodeSet implements UnicodeFilter {
        int[] temp = list;
        list = buffer;
        buffer = temp;
+        pat = null;
        return this;
    }

@ -1668,6 +1839,7 @@ public class UnicodeSet implements UnicodeFilter {
        int[] temp = list;
        list = buffer;
        buffer = temp;
+        pat = null;
        return this;
    }

@ -1738,6 +1910,7 @@ public class UnicodeSet implements UnicodeFilter {
        int[] temp = list;
        list = buffer;
        buffer = temp;
+        pat = null;
        return this;
    }