From 9ffcb85ba127a118317c1a4ba7e331cf4cf00c34 Mon Sep 17 00:00:00 2001
From: Mark Davis <mark@macchiato.com>
Date: Tue, 14 Dec 2010 07:51:00 +0000
Subject: [PATCH] ICU-8227 Whew, there were lots of problems in the way the old
 code was done. Now working much better.

X-SVN-Rev: 29207
---
 .gitattributes                                |   1 +
 .../com/ibm/icu/text/AnyTransliterator.java   |  13 +
 .../com/ibm/icu/text/BreakTransliterator.java |  12 +
 .../ibm/icu/text/CaseFoldTransliterator.java  |  20 +
 .../ibm/icu/text/CompoundTransliterator.java  |  28 +-
 .../ibm/icu/text/EscapeTransliterator.java    |  19 +
 .../ibm/icu/text/LowercaseTransliterator.java |  21 +
 .../icu/text/NameUnicodeTransliterator.java   |  27 +
 .../icu/text/NormalizationTransliterator.java |  51 +-
 .../com/ibm/icu/text/NullTransliterator.java  |   8 +
 .../ibm/icu/text/RemoveTransliterator.java    |  13 +-
 .../ibm/icu/text/RuleBasedTransliterator.java |  38 +-
 .../com/ibm/icu/text/SourceTargetUtility.java | 133 +++++
 .../ibm/icu/text/TitlecaseTransliterator.java |  21 +
 .../com/ibm/icu/text/TransliterationRule.java |  53 +-
 .../ibm/icu/text/TransliterationRuleSet.java  |  18 +-
 .../src/com/ibm/icu/text/Transliterator.java  | 476 ++++++++++--------
 .../ibm/icu/text/UnescapeTransliterator.java  |  35 ++
 .../icu/text/UnicodeNameTransliterator.java   |  21 +
 .../ibm/icu/text/UppercaseTransliterator.java |  21 +
 .../dev/test/translit/TransliteratorTest.java | 370 +++++++++++++-
 21 files changed, 1121 insertions(+), 278 deletions(-)
 create mode 100644 icu4j/main/classes/translit/src/com/ibm/icu/text/SourceTargetUtility.java

diff --git a/.gitattributes b/.gitattributes
index a161e067b3..1f16107fba 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -262,6 +262,7 @@ icu4j/main/classes/translit/.externalToolBuilders/copy-data-translit.launch -tex
 icu4j/main/classes/translit/.settings/org.eclipse.core.resources.prefs -text
 icu4j/main/classes/translit/.settings/org.eclipse.jdt.core.prefs -text
 icu4j/main/classes/translit/.settings/org.eclipse.jdt.ui.prefs -text
+icu4j/main/classes/translit/src/com/ibm/icu/text/SourceTargetUtility.java -text
 icu4j/main/classes/translit/translit-build.launch -text
 icu4j/main/shared/.project -text
 icu4j/main/shared/.settings/org.eclipse.core.resources.prefs -text
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/AnyTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/AnyTransliterator.java
index 295f599712..7f8809c397 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/AnyTransliterator.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/AnyTransliterator.java
@@ -404,5 +404,18 @@ class AnyTransliterator extends Transliterator {
         }
         return new AnyTransliterator(getID(), filter, target, targetScript, widthFix, cache);
     }
+
+    /* (non-Javadoc)
+     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
+     */
+    @Override
+    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
+        UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
+        // Assume that it can modify any character to any other character
+        sourceSet.addAll(myFilter);
+        if (myFilter.size() != 0) {
+            targetSet.addAll(0, 0x10FFFF);
+        }
+    }
 }
 
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/BreakTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/BreakTransliterator.java
index c27156e77f..49db4aa5e9 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/BreakTransliterator.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/BreakTransliterator.java
@@ -387,5 +387,17 @@ final class BreakTransliterator extends Transliterator {
         }
 
     }
+    /* (non-Javadoc)
+     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
+     */
+    @Override
+    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
+        UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
+        // Doesn't actually modify the source characters, so leave them alone.
+        // add the characters inserted
+        if (myFilter.size() != 0) {
+            targetSet.addAll(insertion);
+        }
+    }
 
 }
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/CaseFoldTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/CaseFoldTransliterator.java
index 9d4f443b31..6955030b56 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/CaseFoldTransliterator.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/CaseFoldTransliterator.java
@@ -7,6 +7,7 @@
 package com.ibm.icu.text;
 
 import com.ibm.icu.impl.UCaseProps;
+import com.ibm.icu.lang.UCharacter;
 
 /**
  * A transliterator that performs locale-sensitive toLower()
@@ -102,4 +103,23 @@ class CaseFoldTransliterator extends Transliterator{
         }
         offsets.start = offsets.limit;
     }
+    
+    static SourceTargetUtility sourceTargetUtility = null;
+    
+    /* (non-Javadoc) Lazily builds the shared case-folding SourceTargetUtility, then delegates to it.
+     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
+     */
+    @Override
+    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
+        synchronized (CaseFoldTransliterator.class) { // lock on the class that owns the static cache
+            if (sourceTargetUtility == null) {
+                sourceTargetUtility = new SourceTargetUtility(new Transform<String,String>() {
+                    public String transform(String source) {
+                        return UCharacter.foldCase(source, true);
+                    }
+                });
+            }
+        }
+        sourceTargetUtility.addSourceTargetSet(this, inputFilter, sourceSet, targetSet);
+    }
 }
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/CompoundTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/CompoundTransliterator.java
index 0192787549..a83f48dd9f 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/CompoundTransliterator.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/CompoundTransliterator.java
@@ -305,26 +305,20 @@ class CompoundTransliterator extends Transliterator {
     }
 
     /**
-     * Return the set of all characters that may be modified by this
-     * Transliterator, ignoring the effect of our filter.
+     * @internal
      */
-    protected UnicodeSet handleGetSourceSet() {
-        UnicodeSet set = new UnicodeSet();
+    @Override
+    public void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) {
+        UnicodeSet myFilter = new UnicodeSet(getFilterAsUnicodeSet(filter));
+        UnicodeSet tempTargetSet = new UnicodeSet();
         for (int i=0; i<trans.length; ++i) {
-            set.addAll(trans[i].getSourceSet());
-            // Take the example of Hiragana-Latin.  This is really
-            // Hiragana-Katakana; Katakana-Latin.  The source set of
-            // these two is roughly [:Hiragana:] and [:Katakana:].
-            // But the source set for the entire transliterator is
-            // actually [:Hiragana:] ONLY -- that is, the first
-            // non-empty source set.
-
-            // This is a heuristic, and not 100% reliable.
-            if (!set.isEmpty()) {
-                break;
-            }
+            // each time we produce targets, those can be used by subsequent items, despite the filter.
+            // so we get just those items, and add them to the filter each time.
+            tempTargetSet.clear();
+            trans[i].addSourceTargetSet(myFilter, sourceSet, tempTargetSet);
+            targetSet.addAll(tempTargetSet);
+            myFilter.addAll(tempTargetSet);
         }
-        return set;
     }
 
     /**
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/EscapeTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/EscapeTransliterator.java
index 86d54c2276..f1203dcb63 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/EscapeTransliterator.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/EscapeTransliterator.java
@@ -197,4 +197,23 @@ class EscapeTransliterator extends Transliterator {
         pos.limit = limit;
         pos.start = start;
     }
+
+    /* (non-Javadoc) Sources are the filtered characters; targets are what the escape forms can emit.
+     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
+     */
+    @Override
+    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
+        UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
+        sourceSet.addAll(myFilter);
+        for (EscapeTransliterator it = this; it != null ; it = it.supplementalHandler) {
+            if (myFilter.size() != 0) { // test the combined filter, not just the caller's filter
+                targetSet.addAll(it.prefix).addAll(it.suffix);
+                StringBuilder buffer = new StringBuilder();
+                for (int i = 0; i < it.radix; ++i) {
+                    Utility.appendNumber(buffer, i, it.radix, it.minDigits);
+                }
+                targetSet.addAll(buffer.toString()); // TODO drop once String is changed to CharSequence in UnicodeSet
+            }
+        }
+    }
 }
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/LowercaseTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/LowercaseTransliterator.java
index e405665132..a5953a8ea3 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/LowercaseTransliterator.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/LowercaseTransliterator.java
@@ -7,6 +7,7 @@
 package com.ibm.icu.text;
 
 import com.ibm.icu.impl.UCaseProps;
+import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.util.ULocale;
 
 /**
@@ -109,4 +110,24 @@ class LowercaseTransliterator extends Transliterator{
         }
         offsets.start = offsets.limit;
     }
+    
+    // NOTE: normally this would be static, but because the results vary by locale....
+    SourceTargetUtility sourceTargetUtility = null;
+    
+    /* (non-Javadoc)
+     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
+     */
+    @Override
+    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
+        synchronized (this) {
+            if (sourceTargetUtility == null) {
+                sourceTargetUtility = new SourceTargetUtility(new Transform<String,String>() {
+                    public String transform(String source) {
+                        return UCharacter.toLowerCase(locale, source);                    
+                    }
+                });
+            }
+        }
+        sourceTargetUtility.addSourceTargetSet(this, inputFilter, sourceSet, targetSet);
+    }
 }
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/NameUnicodeTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/NameUnicodeTransliterator.java
index 7e6a44aa92..f8c28506f5 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/NameUnicodeTransliterator.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/NameUnicodeTransliterator.java
@@ -165,4 +165,31 @@ class NameUnicodeTransliterator extends Transliterator {
         // open delimiter candidate.
         offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor;
     }
+
+    /* (non-Javadoc)
+     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
+     */
+    @Override
+    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
+        UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
+        if (!myFilter.containsAll(UnicodeNameTransliterator.OPEN_DELIM) || !myFilter.contains(CLOSE_DELIM)) {
+            return; // we have to contain both prefix and suffix 
+        }
+        UnicodeSet items = new UnicodeSet()
+        .addAll('0', '9')
+        .addAll('A', 'F')
+        .addAll('a', 'z') // for controls
+        .add('<').add('>') // for controls
+        .add('(').add(')') // for controls
+        .add('-')
+        .add(' ')
+        .addAll(UnicodeNameTransliterator.OPEN_DELIM)
+        .add(CLOSE_DELIM);
+        items.retainAll(myFilter);
+        if (items.size() > 0) {
+            sourceSet.addAll(items);
+            // could produce any character
+            targetSet.addAll(0, 0x10FFFF);
+        }
+    }
 }
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/NormalizationTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/NormalizationTransliterator.java
index e5d7613b47..02ae4c2dd3 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/NormalizationTransliterator.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/NormalizationTransliterator.java
@@ -1,14 +1,17 @@
 /*
-**********************************************************************
-*   Copyright (C) 2001-2010, International Business Machines
-*   Corporation and others.  All Rights Reserved.
-**********************************************************************
-*   Date        Name        Description
-*   06/08/01    aliu        Creation.
-**********************************************************************
-*/
+ **********************************************************************
+ *   Copyright (C) 2001-2010, International Business Machines
+ *   Corporation and others.  All Rights Reserved.
+ **********************************************************************
+ *   Date        Name        Description
+ *   06/08/01    aliu        Creation.
+ **********************************************************************
+ */
 
 package com.ibm.icu.text;
+import java.util.HashMap;
+import java.util.Map;
+
 import com.ibm.icu.impl.Norm2AllModes;
 import com.ibm.icu.impl.Normalizer2Impl;
 
@@ -76,7 +79,7 @@ final class NormalizationTransliterator extends Transliterator {
      * Implements {@link Transliterator#handleTransliterate}.
      */
     protected void handleTransliterate(Replaceable text,
-                                       Position offsets, boolean isIncremental) {
+            Position offsets, boolean isIncremental) {
         // start and limit of the input range
         int start = offsets.start;
         int limit = offsets.limit;
@@ -129,4 +132,34 @@ final class NormalizationTransliterator extends Transliterator {
         offsets.contextLimit += limit - offsets.limit;
         offsets.limit = limit;
     }
+
+    static final Map<Normalizer2, SourceTargetUtility> SOURCE_CACHE = new HashMap<Normalizer2, SourceTargetUtility>();
+    
+    // TODO Get rid of this if Normalizer2 becomes a Transform
+    static class NormalizingTransform implements Transform<String,String> {
+        final Normalizer2 norm2;
+        public NormalizingTransform(Normalizer2 norm2) {
+            this.norm2 = norm2;
+        }
+        public String transform(String source) {
+            return norm2.normalize(source);
+        }   
+    }
+
+    /* (non-Javadoc)
+     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
+     */
+    @Override
+    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
+        SourceTargetUtility cache;
+        synchronized (SOURCE_CACHE) {
+            //String id = getID();
+            cache = SOURCE_CACHE.get(norm2);
+            if (cache == null) {
+                cache = new SourceTargetUtility(new NormalizingTransform(norm2), norm2);
+                SOURCE_CACHE.put(norm2, cache);
+            }
+        }
+        cache.addSourceTargetSet(this, inputFilter, sourceSet, targetSet);
+    }
 }
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/NullTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/NullTransliterator.java
index 906017146c..8ae4bfc826 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/NullTransliterator.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/NullTransliterator.java
@@ -30,4 +30,12 @@ class NullTransliterator extends Transliterator {
                                        Position offsets, boolean incremental) {
         offsets.start = offsets.limit;
     }
+
+    /* (non-Javadoc)
+     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
+     */
+    @Override
+    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
+        // do nothing
+    }
 }
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/RemoveTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/RemoveTransliterator.java
index 3d020021f0..2e4afa21c5 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/RemoveTransliterator.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/RemoveTransliterator.java
@@ -1,6 +1,6 @@
 /*
  *******************************************************************************
- * Copyright (C) 1996-2004, International Business Machines Corporation and    *
+ * Copyright (C) 1996-2010, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  *******************************************************************************
  */
@@ -49,4 +49,15 @@ class RemoveTransliterator extends Transliterator {
         index.contextLimit -= len;
         index.limit -= len;
     }
+
+    /* (non-Javadoc)
+     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
+     */
+    @Override
+    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
+        // intersect myFilter with the input filter
+        UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
+        sourceSet.addAll(myFilter);
+        // do nothing with the target
+    }
 }
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/RuleBasedTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/RuleBasedTransliterator.java
index 08c62b152a..3edb796259 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/RuleBasedTransliterator.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/RuleBasedTransliterator.java
@@ -448,24 +448,32 @@ public class RuleBasedTransliterator extends Transliterator {
         return data.ruleSet.toRules(escapeUnprintable);
     }
 
+//    /**
+//     * Return the set of all characters that may be modified by this
+//     * Transliterator, ignoring the effect of our filter.
+//     * @internal
+//     * @deprecated This API is ICU internal only.
+//     */
+//    protected UnicodeSet handleGetSourceSet() {
+//        return data.ruleSet.getSourceTargetSet(false, unicodeFilter);
+//    }
+//
+//    /**
+//     * Returns the set of all characters that may be generated as
+//     * replacement text by this transliterator.
+//     * @internal
+//     * @deprecated This API is ICU internal only.
+//     */
+//    public UnicodeSet getTargetSet() {
+//        return data.ruleSet.getSourceTargetSet(true, unicodeFilter);
+//    }
+    
     /**
-     * Return the set of all characters that may be modified by this
-     * Transliterator, ignoring the effect of our filter.
      * @internal
-     * @deprecated This API is ICU internal only.
      */
-    protected UnicodeSet handleGetSourceSet() {
-        return data.ruleSet.getSourceTargetSet(false);
-    }
-
-    /**
-     * Returns the set of all characters that may be generated as
-     * replacement text by this transliterator.
-     * @internal
-     * @deprecated This API is ICU internal only.
-     */
-    public UnicodeSet getTargetSet() {
-        return data.ruleSet.getSourceTargetSet(true);
+    @Override
+    public void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) {
+        data.ruleSet.addSourceTargetSet(filter, sourceSet, targetSet);
     }
 
     /**
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/SourceTargetUtility.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/SourceTargetUtility.java
new file mode 100644
index 0000000000..57890353c1
--- /dev/null
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/SourceTargetUtility.java
@@ -0,0 +1,133 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 2010, Google, International Business Machines Corporation and         *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ */
+package com.ibm.icu.text;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import com.ibm.icu.lang.CharSequences;
+import com.ibm.icu.text.Normalizer2.Mode;
+
+/**
+ * Simple internal utility class for helping with getSource/TargetSet
+ */
+class SourceTargetUtility {
+    final Transform<String, String> transform;
+    final UnicodeSet sourceCache;
+    final Set<String> sourceStrings;
+    static final UnicodeSet NON_STARTERS = new UnicodeSet("[:^ccc=0:]").freeze();
+    static Normalizer2 NFC = Normalizer2.getInstance(null, "NFC", Mode.COMPOSE);
+    //static final UnicodeSet TRAILING_COMBINING = new UnicodeSet();
+
+    public SourceTargetUtility(Transform<String, String> transform) {
+        this(transform, null);
+    }
+
+    public SourceTargetUtility(Transform<String, String> transform, Normalizer2 normalizer) {
+        this.transform = transform;
+        if (normalizer != null) {
+//            synchronized (SourceTargetUtility.class) {
+//                if (NFC == null) {
+//                    NFC = Normalizer2.getInstance(null, "NFC", Mode.COMPOSE);
+//                    for (int i = 0; i <= 0x10FFFF; ++i) {
+//                        String d = NFC.getDecomposition(i);
+//                        if (d == null) {
+//                            continue;
+//                        }
+//                        String s = NFC.normalize(d);
+//                        if (!CharSequences.equals(i, s)) {
+//                            continue;
+//                        }
+//                        // composes
+//                        boolean first = false;
+//                        for (int trailing : CharSequences.codePoints(d)) {
+//                            if (first) {
+//                                first = false;
+//                            } else {
+//                                TRAILING_COMBINING.add(trailing);
+//                            }
+//                        }
+//                    }
+//                }
+//            }
+            sourceCache = new UnicodeSet("[:^ccc=0:]");
+        } else {
+            sourceCache = new UnicodeSet();
+        }
+        sourceStrings = new HashSet<String>();
+        for (int i = 0; i <= 0x10FFFF; ++i) {
+            String s = transform.transform(UTF16.valueOf(i));
+            boolean added = false;
+            if (!CharSequences.equals(i, s)) {
+                sourceCache.add(i);
+                added = true;
+            }
+            if (normalizer == null) {
+                continue;
+            }
+            String d = NFC.getDecomposition(i);
+            if (d == null) {
+                continue;
+            }
+            s = transform.transform(d);
+            if (!d.equals(s)) {
+                sourceStrings.add(d);
+            }
+            if (added) {
+                continue;
+            }
+            if (!normalizer.isInert(i)) {
+                sourceCache.add(i);
+                continue;
+            }
+            // see if any of the non-starters change s; if so, add i
+//            for (String ns : TRAILING_COMBINING) {
+//                String s2 = transform.transform(s + ns);
+//                if (!s2.startsWith(s)) {
+//                    sourceCache.add(i);
+//                    break;
+//                }
+//            }
+
+            // int endOfFirst = CharSequences.onCharacterBoundary(d, 1) ? 1 : 2;
+            // if (endOfFirst >= d.length()) {
+            // continue;
+            // }
+            // // now add all initial substrings
+            // for (int j = 1; j < d.length(); ++j) {
+            // if (!CharSequences.onCharacterBoundary(d, j)) {
+            // continue;
+            // }
+            // String dd = d.substring(0,j);
+            // s = transform.transform(dd);
+            // if (!dd.equals(s)) {
+            // sourceStrings.add(dd);
+            // }
+            // }
+        }
+        sourceCache.freeze();
+    }
+
+    public void addSourceTargetSet(Transliterator transliterator, UnicodeSet inputFilter, UnicodeSet sourceSet,
+            UnicodeSet targetSet) {
+        UnicodeSet myFilter = transliterator.getFilterAsUnicodeSet(inputFilter);
+        UnicodeSet affectedCharacters = new UnicodeSet(sourceCache).retainAll(myFilter);
+        sourceSet.addAll(affectedCharacters);
+        for (String s : affectedCharacters) {
+            targetSet.addAll(transform.transform(s));
+        }
+        for (String s : sourceStrings) {
+            if (myFilter.containsAll(s)) {
+                String t = transform.transform(s);
+                if (!s.equals(t)) {
+                    targetSet.addAll(t);
+                    sourceSet.addAll(s);
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java
index 577592702e..fe2fc98af9 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java
@@ -6,6 +6,7 @@
 package com.ibm.icu.text;
 
 import com.ibm.icu.impl.UCaseProps;
+import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.util.ULocale;
 
 /**
@@ -147,4 +148,24 @@ class TitlecaseTransliterator extends Transliterator {
         }
         offsets.start = offsets.limit;
     }
+    
+    // NOTE: normally this would be static, but because the results vary by locale....
+    SourceTargetUtility sourceTargetUtility = null;
+    
+    /* (non-Javadoc)
+     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
+     */
+    @Override
+    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
+        synchronized (this) {
+            if (sourceTargetUtility == null) {
+                sourceTargetUtility = new SourceTargetUtility(new Transform<String,String>() {
+                    public String transform(String source) {
+                        return UCharacter.toTitleCase(locale, source, null);                    
+                    }
+                });
+            }
+        }
+        sourceTargetUtility.addSourceTargetSet(this, inputFilter, sourceSet, targetSet);
+    }
 }
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliterationRule.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliterationRule.java
index 907e1ac6d9..494827f6ff 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliterationRule.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliterationRule.java
@@ -1,6 +1,6 @@
 /*
  *******************************************************************************
- * Copyright (C) 1996-2007, International Business Machines Corporation and    *
+ * Copyright (C) 1996-2010, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  *******************************************************************************
  */
@@ -548,8 +548,11 @@ class TransliterationRule {
      * Union the set of all characters that may be modified by this rule
      * into the given set.
      */
-    void addSourceSetTo(UnicodeSet toUnionTo) {
+    void addSourceSetTo(UnicodeSet toUnionTo, UnicodeSet filter) {
         int limit = anteContextLength + keyLength;
+        if (filter != null && !matches(filter)) {
+            return;
+        }
         for (int i=anteContextLength; i<limit; ) {
             int ch = UTF16.charAt(pattern, i);
             i += UTF16.getCharCount(ch);
@@ -562,11 +565,55 @@ class TransliterationRule {
         }
     }
 
+    /**
+     * Sees if the source (key) of the rule can match the filter: at every key position the filter
+     * must contain the character found there, or some character of the matcher registered for it.
+     * There is a known issue with filters containing multi-character strings (see below).
+     * @param filter must not be null (check in caller)
+     * @return false as soon as some key position cannot be matched by the filter, otherwise true
+     */
+    // Problem: the rule is [{ab}]c > x
+    // The filter is [a{bc}].
+    // If the input is abc, then the rule will work.
+    // However, following code applying the filter won't catch that case.
+    private boolean matches(UnicodeSet filter) {
+        int limit = anteContextLength + keyLength;
+        // We need to walk through the key portion of the pattern. The rule is only usable
+        // iff some character at EVERY one of the positions is matched by the filter.
+        for (int i=anteContextLength; i<limit; ) {
+            int ch = UTF16.charAt(pattern, i);
+            i += UTF16.getCharCount(ch);
+            UnicodeMatcher matcher = data.lookupMatcher(ch);
+            if (matcher == null) {
+                // no matcher registered for ch: test the character itself
+                if (!filter.contains(ch)) {
+                    return false;
+                }
+            } else {
+                // Use the matcher directly when it is a UnicodeSet; otherwise collect its match set.
+                UnicodeSet matchSet;
+                if (matcher instanceof UnicodeSet) {
+                    matchSet = (UnicodeSet) matcher;
+                } else {
+                    matchSet = new UnicodeSet();
+                    matcher.addMatchSetTo(matchSet);
+                }
+                if (!filter.containsSome(matchSet)) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
     /**
      * Union the set of all characters that may be emitted by this rule
      * into the given set.
      */
-    void addTargetSetTo(UnicodeSet toUnionTo) {
+    void addTargetSetTo(UnicodeSet toUnionTo, UnicodeSet filter) {
+        if (filter != null && !matches(filter)) {
+            return;
+        }
         output.addReplacementSetTo(toUnionTo);
     }
 }
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliterationRuleSet.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliterationRuleSet.java
index 6feb251097..c1a184f02b 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliterationRuleSet.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliterationRuleSet.java
@@ -238,21 +238,15 @@ class TransliterationRuleSet {
         return ruleSource.toString();
     }
 
-    /**
-     * Return the set of all characters that may be modified (getTarget=false)
-     * or emitted (getTarget=true) by this set.
-     */
-    UnicodeSet getSourceTargetSet(boolean getTarget) {
-        UnicodeSet set = new UnicodeSet();
+    // TODO Handle the case where we have :: [a] ; a > |b ; b > c ;
+    // TODO Merge into r.addSourceTargetSet, to avoid duplicate testing
+    void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) {
         int count = ruleVector.size();
         for (int i=0; i<count; ++i) {
             TransliterationRule r = ruleVector.get(i);
-            if (getTarget) {
-                r.addTargetSetTo(set);
-            } else {
-                r.addSourceSetTo(set);
-            }
+            r.addTargetSetTo(targetSet, filter);
+            r.addSourceSetTo(sourceSet, filter);
         }
-        return set;
     }
+
 }
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/Transliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/Transliterator.java
index 42985e68a2..99ee6b6462 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/Transliterator.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/Transliterator.java
@@ -26,226 +26,200 @@ import com.ibm.icu.util.ULocale;
 import com.ibm.icu.util.UResourceBundle;
 
 /**
- * <code>Transliterator</code> is an abstract class that
- * transliterates text from one format to another.  The most common
- * kind of transliterator is a script, or alphabet, transliterator.
- * For example, a Russian to Latin transliterator changes Russian text
- * written in Cyrillic characters to phonetically equivalent Latin
- * characters.  It does not <em>translate</em> Russian to English!
- * Transliteration, unlike translation, operates on characters, without
- * reference to the meanings of words and sentences.
- *
- * <p>Although script conversion is its most common use, a
- * transliterator can actually perform a more general class of tasks.
- * In fact, <code>Transliterator</code> defines a very general API
- * which specifies only that a segment of the input text is replaced
- * by new text.  The particulars of this conversion are determined
- * entirely by subclasses of <code>Transliterator</code>.
- *
- * <p><b>Transliterators are stateless</b>
- *
- * <p><code>Transliterator</code> objects are <em>stateless</em>; they
- * retain no information between calls to
- * <code>transliterate()</code>.  As a result, threads may share
- * transliterators without synchronizing them.  This might seem to
- * limit the complexity of the transliteration operation.  In
- * practice, subclasses perform complex transliterations by delaying
- * the replacement of text until it is known that no other
- * replacements are possible.  In other words, although the
- * <code>Transliterator</code> objects are stateless, the source text
- * itself embodies all the needed information, and delayed operation
- * allows arbitrary complexity.
- *
- * <p><b>Batch transliteration</b>
- *
- * <p>The simplest way to perform transliteration is all at once, on a
- * string of existing text.  This is referred to as <em>batch</em>
- * transliteration.  For example, given a string <code>input</code>
- * and a transliterator <code>t</code>, the call
- *
+ * <code>Transliterator</code> is an abstract class that transliterates text from one format to another. The most common
+ * kind of transliterator is a script, or alphabet, transliterator. For example, a Russian to Latin transliterator
+ * changes Russian text written in Cyrillic characters to phonetically equivalent Latin characters. It does not
+ * <em>translate</em> Russian to English! Transliteration, unlike translation, operates on characters, without reference
+ * to the meanings of words and sentences.
+ * 
+ * <p>
+ * Although script conversion is its most common use, a transliterator can actually perform a more general class of
+ * tasks. In fact, <code>Transliterator</code> defines a very general API which specifies only that a segment of the
+ * input text is replaced by new text. The particulars of this conversion are determined entirely by subclasses of
+ * <code>Transliterator</code>.
+ * 
+ * <p>
+ * <b>Transliterators are stateless</b>
+ * 
+ * <p>
+ * <code>Transliterator</code> objects are <em>stateless</em>; they retain no information between calls to
+ * <code>transliterate()</code>. As a result, threads may share transliterators without synchronizing them. This might
+ * seem to limit the complexity of the transliteration operation. In practice, subclasses perform complex
+ * transliterations by delaying the replacement of text until it is known that no other replacements are possible. In
+ * other words, although the <code>Transliterator</code> objects are stateless, the source text itself embodies all the
+ * needed information, and delayed operation allows arbitrary complexity.
+ * 
+ * <p>
+ * <b>Batch transliteration</b>
+ * 
+ * <p>
+ * The simplest way to perform transliteration is all at once, on a string of existing text. This is referred to as
+ * <em>batch</em> transliteration. For example, given a string <code>input</code> and a transliterator <code>t</code>,
+ * the call
+ * 
  * <blockquote><code>String result = t.transliterate(input);
  * </code></blockquote>
- *
- * will transliterate it and return the result.  Other methods allow
- * the client to specify a substring to be transliterated and to use
- * {@link Replaceable} objects instead of strings, in order to
- * preserve out-of-band information (such as text styles).
- *
- * <p><b>Keyboard transliteration</b>
- *
- * <p>Somewhat more involved is <em>keyboard</em>, or incremental
- * transliteration.  This is the transliteration of text that is
- * arriving from some source (typically the user's keyboard) one
- * character at a time, or in some other piecemeal fashion.
- *
- * <p>In keyboard transliteration, a <code>Replaceable</code> buffer
- * stores the text.  As text is inserted, as much as possible is
- * transliterated on the fly.  This means a GUI that displays the
- * contents of the buffer may show text being modified as each new
- * character arrives.
- *
- * <p>Consider the simple <code>RuleBasedTransliterator</code>:
- *
+ * 
+ * will transliterate it and return the result. Other methods allow the client to specify a substring to be
+ * transliterated and to use {@link Replaceable} objects instead of strings, in order to preserve out-of-band
+ * information (such as text styles).
+ * 
+ * <p>
+ * <b>Keyboard transliteration</b>
+ * 
+ * <p>
+ * Somewhat more involved is <em>keyboard</em>, or incremental transliteration. This is the transliteration of text that
+ * is arriving from some source (typically the user's keyboard) one character at a time, or in some other piecemeal
+ * fashion.
+ * 
+ * <p>
+ * In keyboard transliteration, a <code>Replaceable</code> buffer stores the text. As text is inserted, as much as
+ * possible is transliterated on the fly. This means a GUI that displays the contents of the buffer may show text being
+ * modified as each new character arrives.
+ * 
+ * <p>
+ * Consider the simple <code>RuleBasedTransliterator</code>:
+ * 
  * <blockquote><code>
  * th&gt;{theta}<br>
  * t&gt;{tau}
  * </code></blockquote>
- *
- * When the user types 't', nothing will happen, since the
- * transliterator is waiting to see if the next character is 'h'.  To
- * remedy this, we introduce the notion of a cursor, marked by a '|'
- * in the output string:
- *
+ * 
+ * When the user types 't', nothing will happen, since the transliterator is waiting to see if the next character is
+ * 'h'. To remedy this, we introduce the notion of a cursor, marked by a '|' in the output string:
+ * 
  * <blockquote><code>
  * t&gt;|{tau}<br>
  * {tau}h&gt;{theta}
  * </code></blockquote>
- *
- * Now when the user types 't', tau appears, and if the next character
- * is 'h', the tau changes to a theta.  This is accomplished by
- * maintaining a cursor position (independent of the insertion point,
- * and invisible in the GUI) across calls to
- * <code>transliterate()</code>.  Typically, the cursor will
- * be coincident with the insertion point, but in a case like the one
- * above, it will precede the insertion point.
- *
- * <p>Keyboard transliteration methods maintain a set of three indices
- * that are updated with each call to
- * <code>transliterate()</code>, including the cursor, start,
- * and limit.  These indices are changed by the method, and they are
- * passed in and out via a Position object. The <code>start</code> index
- * marks the beginning of the substring that the transliterator will
- * look at.  It is advanced as text becomes committed (but it is not
- * the committed index; that's the <code>cursor</code>).  The
- * <code>cursor</code> index, described above, marks the point at
- * which the transliterator last stopped, either because it reached
- * the end, or because it required more characters to disambiguate
- * between possible inputs.  The <code>cursor</code> can also be
- * explicitly set by rules in a <code>RuleBasedTransliterator</code>.
- * Any characters before the <code>cursor</code> index are frozen;
- * future keyboard transliteration calls within this input sequence
- * will not change them.  New text is inserted at the
- * <code>limit</code> index, which marks the end of the substring that
- * the transliterator looks at.
- *
- * <p>Because keyboard transliteration assumes that more characters
- * are to arrive, it is conservative in its operation.  It only
- * transliterates when it can do so unambiguously.  Otherwise it waits
- * for more characters to arrive.  When the client code knows that no
- * more characters are forthcoming, perhaps because the user has
- * performed some input termination operation, then it should call
- * <code>finishTransliteration()</code> to complete any
- * pending transliterations.
- *
- * <p><b>Inverses</b>
- *
- * <p>Pairs of transliterators may be inverses of one another.  For
- * example, if transliterator <b>A</b> transliterates characters by
- * incrementing their Unicode value (so "abc" -> "def"), and
- * transliterator <b>B</b> decrements character values, then <b>A</b>
- * is an inverse of <b>B</b> and vice versa.  If we compose <b>A</b>
- * with <b>B</b> in a compound transliterator, the result is the
- * indentity transliterator, that is, a transliterator that does not
- * change its input text.
- *
- * The <code>Transliterator</code> method <code>getInverse()</code>
- * returns a transliterator's inverse, if one exists, or
- * <code>null</code> otherwise.  However, the result of
- * <code>getInverse()</code> usually will <em>not</em> be a true
- * mathematical inverse.  This is because true inverse transliterators
- * are difficult to formulate.  For example, consider two
- * transliterators: <b>AB</b>, which transliterates the character 'A'
- * to 'B', and <b>BA</b>, which transliterates 'B' to 'A'.  It might
- * seem that these are exact inverses, since
- *
+ * 
+ * Now when the user types 't', tau appears, and if the next character is 'h', the tau changes to a theta. This is
+ * accomplished by maintaining a cursor position (independent of the insertion point, and invisible in the GUI) across
+ * calls to <code>transliterate()</code>. Typically, the cursor will be coincident with the insertion point, but in a
+ * case like the one above, it will precede the insertion point.
+ * 
+ * <p>
+ * Keyboard transliteration methods maintain a set of three indices that are updated with each call to
+ * <code>transliterate()</code>, including the cursor, start, and limit. These indices are changed by the method, and
+ * they are passed in and out via a Position object. The <code>start</code> index marks the beginning of the substring
+ * that the transliterator will look at. It is advanced as text becomes committed (but it is not the committed index;
+ * that's the <code>cursor</code>). The <code>cursor</code> index, described above, marks the point at which the
+ * transliterator last stopped, either because it reached the end, or because it required more characters to
+ * disambiguate between possible inputs. The <code>cursor</code> can also be explicitly set by rules in a
+ * <code>RuleBasedTransliterator</code>. Any characters before the <code>cursor</code> index are frozen; future keyboard
+ * transliteration calls within this input sequence will not change them. New text is inserted at the <code>limit</code>
+ * index, which marks the end of the substring that the transliterator looks at.
+ * 
+ * <p>
+ * Because keyboard transliteration assumes that more characters are to arrive, it is conservative in its operation. It
+ * only transliterates when it can do so unambiguously. Otherwise it waits for more characters to arrive. When the
+ * client code knows that no more characters are forthcoming, perhaps because the user has performed some input
+ * termination operation, then it should call <code>finishTransliteration()</code> to complete any pending
+ * transliterations.
+ * 
+ * <p>
+ * <b>Inverses</b>
+ * 
+ * <p>
+ * Pairs of transliterators may be inverses of one another. For example, if transliterator <b>A</b> transliterates
+ * characters by incrementing their Unicode value (so "abc" -> "def"), and transliterator <b>B</b> decrements character
+ * values, then <b>A</b> is an inverse of <b>B</b> and vice versa. If we compose <b>A</b> with <b>B</b> in a compound
+ * transliterator, the result is the identity transliterator, that is, a transliterator that does not change its input
+ * text.
+ * 
+ * The <code>Transliterator</code> method <code>getInverse()</code> returns a transliterator's inverse, if one exists,
+ * or <code>null</code> otherwise. However, the result of <code>getInverse()</code> usually will <em>not</em> be a true
+ * mathematical inverse. This is because true inverse transliterators are difficult to formulate. For example, consider
+ * two transliterators: <b>AB</b>, which transliterates the character 'A' to 'B', and <b>BA</b>, which transliterates
+ * 'B' to 'A'. It might seem that these are exact inverses, since
+ * 
  * <blockquote>"A" x <b>AB</b> -> "B"<br>
  * "B" x <b>BA</b> -> "A"</blockquote>
- *
- * where 'x' represents transliteration.  However,
- *
+ * 
+ * where 'x' represents transliteration. However,
+ * 
  * <blockquote>"ABCD" x <b>AB</b> -> "BBCD"<br>
  * "BBCD" x <b>BA</b> -> "AACD"</blockquote>
- *
- * so <b>AB</b> composed with <b>BA</b> is not the
- * identity. Nonetheless, <b>BA</b> may be usefully considered to be
- * <b>AB</b>'s inverse, and it is on this basis that
- * <b>AB</b><code>.getInverse()</code> could legitimately return
+ * 
+ * so <b>AB</b> composed with <b>BA</b> is not the identity. Nonetheless, <b>BA</b> may be usefully considered to be
+ * <b>AB</b>'s inverse, and it is on this basis that <b>AB</b><code>.getInverse()</code> could legitimately return
  * <b>BA</b>.
- *
- * <p><b>IDs and display names</b>
- *
- * <p>A transliterator is designated by a short identifier string or
- * <em>ID</em>.  IDs follow the format <em>source-destination</em>,
- * where <em>source</em> describes the entity being replaced, and
- * <em>destination</em> describes the entity replacing
- * <em>source</em>.  The entities may be the names of scripts,
- * particular sequences of characters, or whatever else it is that the
- * transliterator converts to or from.  For example, a transliterator
- * from Russian to Latin might be named "Russian-Latin".  A
- * transliterator from keyboard escape sequences to Latin-1 characters
- * might be named "KeyboardEscape-Latin1".  By convention, system
- * entity names are in English, with the initial letters of words
- * capitalized; user entity names may follow any format so long as
- * they do not contain dashes.
- *
- * <p>In addition to programmatic IDs, transliterator objects have
- * display names for presentation in user interfaces, returned by
- * {@link #getDisplayName}.
- *
- * <p><b>Factory methods and registration</b>
- *
- * <p>In general, client code should use the factory method
- * <code>getInstance()</code> to obtain an instance of a
- * transliterator given its ID.  Valid IDs may be enumerated using
- * <code>getAvailableIDs()</code>.  Since transliterators are
- * stateless, multiple calls to <code>getInstance()</code> with the
- * same ID will return the same object.
- *
- * <p>In addition to the system transliterators registered at startup,
- * user transliterators may be registered by calling
- * <code>registerInstance()</code> at run time.  To register a
- * transliterator subclass without instantiating it (until it is
- * needed), users may call <code>registerClass()</code>.
- *
- * <p><b>Composed transliterators</b>
- *
- * <p>In addition to built-in system transliterators like
- * "Latin-Greek", there are also built-in <em>composed</em>
- * transliterators.  These are implemented by composing two or more
- * component transliterators.  For example, if we have scripts "A",
- * "B", "C", and "D", and we want to transliterate between all pairs
- * of them, then we need to write 12 transliterators: "A-B", "A-C",
- * "A-D", "B-A",..., "D-A", "D-B", "D-C".  If it is possible to
- * convert all scripts to an intermediate script "M", then instead of
- * writing 12 rule sets, we only need to write 8: "A~M", "B~M", "C~M",
- * "D~M", "M~A", "M~B", "M~C", "M~D".  (This might not seem like a big
- * win, but it's really 2<em>n</em> vs. <em>n</em><sup>2</sup> -
- * <em>n</em>, so as <em>n</em> gets larger the gain becomes
- * significant.  With 9 scripts, it's 18 vs. 72 rule sets, a big
- * difference.)  Note the use of "~" rather than "-" for the script
- * separator here; this indicates that the given transliterator is
- * intended to be composed with others, rather than be used as is.
- *
- * <p>Composed transliterators can be instantiated as usual.  For
- * example, the system transliterator "Devanagari-Gujarati" is a
- * composed transliterator built internally as
- * "Devanagari~InterIndic;InterIndic~Gujarati".  When this
- * transliterator is instantiated, it appears externally to be a
- * standard transliterator (e.g., getID() returns
+ * 
+ * <p>
+ * <b>Filtering</b>
+ * <p>Each transliterator has a filter, which restricts changes to those characters selected by the filter. The
+ * filter affects just the characters that are changed -- the characters outside of the filter are still part of the
+ * context for the filter. For example, in the following, even though 'x' is filtered out and doesn't convert to 'y', it does affect the conversion of 'a'.
+ * 
+ * <pre>
+ * String rules = &quot;x &gt; y; x{a} &gt; b; &quot;;
+ * Transliterator tempTrans = Transliterator.createFromRules(&quot;temp&quot;, rules, Transliterator.FORWARD);
+ * tempTrans.setFilter(new UnicodeSet(&quot;[a]&quot;));
+ * String tempResult = tempTrans.transform(&quot;xa&quot;);
+ * // results in &quot;xb&quot;
+ *</pre>
+ * <p>
+ * <b>IDs and display names</b>
+ * 
+ * <p>
+ * A transliterator is designated by a short identifier string or <em>ID</em>. IDs follow the format
+ * <em>source-destination</em>, where <em>source</em> describes the entity being replaced, and <em>destination</em>
+ * describes the entity replacing <em>source</em>. The entities may be the names of scripts, particular sequences of
+ * characters, or whatever else it is that the transliterator converts to or from. For example, a transliterator from
+ * Russian to Latin might be named "Russian-Latin". A transliterator from keyboard escape sequences to Latin-1
+ * characters might be named "KeyboardEscape-Latin1". By convention, system entity names are in English, with the
+ * initial letters of words capitalized; user entity names may follow any format so long as they do not contain dashes.
+ * 
+ * <p>
+ * In addition to programmatic IDs, transliterator objects have display names for presentation in user interfaces,
+ * returned by {@link #getDisplayName}.
+ * 
+ * <p>
+ * <b>Factory methods and registration</b>
+ * 
+ * <p>
+ * In general, client code should use the factory method <code>getInstance()</code> to obtain an instance of a
+ * transliterator given its ID. Valid IDs may be enumerated using <code>getAvailableIDs()</code>. Since transliterators
+ * are stateless, multiple calls to <code>getInstance()</code> with the same ID will return the same object.
+ * 
+ * <p>
+ * In addition to the system transliterators registered at startup, user transliterators may be registered by calling
+ * <code>registerInstance()</code> at run time. To register a transliterator subclass without instantiating it (until it
+ * is needed), users may call <code>registerClass()</code>.
+ * 
+ * <p>
+ * <b>Composed transliterators</b>
+ * 
+ * <p>
+ * In addition to built-in system transliterators like "Latin-Greek", there are also built-in <em>composed</em>
+ * transliterators. These are implemented by composing two or more component transliterators. For example, if we have
+ * scripts "A", "B", "C", and "D", and we want to transliterate between all pairs of them, then we need to write 12
+ * transliterators: "A-B", "A-C", "A-D", "B-A",..., "D-A", "D-B", "D-C". If it is possible to convert all scripts to an
+ * intermediate script "M", then instead of writing 12 rule sets, we only need to write 8: "A~M", "B~M", "C~M", "D~M",
+ * "M~A", "M~B", "M~C", "M~D". (This might not seem like a big win, but it's really 2<em>n</em> vs. <em>n</em>
+ * <sup>2</sup> - <em>n</em>, so as <em>n</em> gets larger the gain becomes significant. With 9 scripts, it's 18 vs. 72
+ * rule sets, a big difference.) Note the use of "~" rather than "-" for the script separator here; this indicates that
+ * the given transliterator is intended to be composed with others, rather than be used as is.
+ * 
+ * <p>
+ * Composed transliterators can be instantiated as usual. For example, the system transliterator "Devanagari-Gujarati"
+ * is a composed transliterator built internally as "Devanagari~InterIndic;InterIndic~Gujarati". When this
+ * transliterator is instantiated, it appears externally to be a standard transliterator (e.g., getID() returns
  * "Devanagari-Gujarati").
- *
- * <p><b>Subclassing</b>
- *
- * <p>Subclasses must implement the abstract method
- * <code>handleTransliterate()</code>.  <p>Subclasses should override
- * the <code>transliterate()</code> method taking a
- * <code>Replaceable</code> and the <code>transliterate()</code>
- * method taking a <code>String</code> and <code>StringBuffer</code>
- * if the performance of these methods can be improved over the
- * performance obtained by the default implementations in this class.
- *
- * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
- *
+ * 
+ * <p>
+ * <b>Subclassing</b>
+ * 
+ * <p>
+ * Subclasses must implement the abstract method <code>handleTransliterate()</code>.
+ * <p>
+ * Subclasses should override the <code>transliterate()</code> method taking a <code>Replaceable</code> and the
+ * <code>transliterate()</code> method taking a <code>String</code> and <code>StringBuffer</code> if the performance of
+ * these methods can be improved over the performance obtained by the default implementations in this class.
+ * 
+ * <p>
+ * Copyright &copy; IBM Corporation 1999. All rights reserved.
+ * 
  * @author Alan Liu
  * @stable ICU 2.0
  */
@@ -1418,7 +1392,7 @@ public abstract class Transliterator implements StringTransform  {
             t = new NullTransliterator();
         }
         else if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 1) {
-            t = new RuleBasedTransliterator(ID, parser.dataVector.get(0), null);
+            t = new RuleBasedTransliterator(ID, parser.dataVector.get(0), parser.compoundFilter);
         }
         else if (parser.idBlockVector.size() == 1 && parser.dataVector.size() == 0) {
             // idBlock, no data -- this is an alias.  The ID has
@@ -1536,6 +1510,8 @@ public abstract class Transliterator implements StringTransform  {
         return result;
     }
 
+    static final UnicodeSet ALL_CODEPOINTS = new UnicodeSet(0,0x10FFFF).freeze();
+    
     /**
      * Returns the set of all characters that may be modified in the
      * input text by this Transliterator.  This incorporates this
@@ -1550,20 +1526,9 @@ public abstract class Transliterator implements StringTransform  {
      * @stable ICU 2.2
      */
     public final UnicodeSet getSourceSet() {
-        UnicodeSet set = handleGetSourceSet();
-        if (filter != null) {
-            UnicodeSet filterSet;
-            // Most, but not all filters will be UnicodeSets.  Optimize for
-            // the high-runner case.
-            try {
-                filterSet = (UnicodeSet) filter;
-            } catch (ClassCastException e) {
-                filterSet = new UnicodeSet();
-                filter.addMatchSetTo(filterSet);
-            }
-            set.retainAll(filterSet);
-        }
-        return set;
+        UnicodeSet result = new UnicodeSet();
+        addSourceTargetSet(getFilterAsUnicodeSet(ALL_CODEPOINTS), result, new UnicodeSet());
+        return result;
     }
 
     /**
@@ -1595,7 +1560,78 @@ public abstract class Transliterator implements StringTransform  {
      * @stable ICU 2.2
      */
     public UnicodeSet getTargetSet() {
-        return new UnicodeSet();
+        UnicodeSet result = new UnicodeSet();
+        addSourceTargetSet(getFilterAsUnicodeSet(ALL_CODEPOINTS), new UnicodeSet(), result);
+        return result;
+    }
+
+    /**
+     * Returns the set of all characters that may be generated as
+     * replacement text by this transliterator, filtered by BOTH the input filter, and the current getFilter().
+     * <p>SHOULD BE OVERRIDDEN BY SUBCLASSES.
+     * It is probably an error for any transliterator to NOT override this, but we can't force them to
+     * for backwards compatibility.
+     * <p>Other methods vector through this.
+     * <p>When gathering the information on source and target, the compound transliterator makes things complicated.
+     * For example, suppose we have:
+     * <pre>
+     * Global FILTER = [ax]
+     * a > b;
+     * :: NULL;
+     * b > c;
+     * x > d;
+     * </pre>
+     * While the filter just allows a and x, b is an intermediate result, which could produce c. So the source and target sets
+     * cannot be gathered independently. What we have to do is filter the sources for the first transliterator according to
+     * the global filter, intersected with that transliterator's filter. Based on that we get the target.
+     * The next transliterator gets as a global filter (global + last target). And so on.
+     * <p>There is another complication:
+     * <pre>
+     * Global FILTER = [ax]
+     * a > |b;
+     * b > c;
+     * </pre>
+     * Even though b would be filtered from the input, whenever we have a backup, it could be part of the input. So ideally we will
+     * change the global filter as we go.
+     * @param targetSet set to which the characters that may be generated as replacement text are added
+     * @see #getTargetSet
+     * @internal
+     */
+    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
+        UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
+        UnicodeSet temp = new UnicodeSet(handleGetSourceSet()).retainAll(myFilter);
+        // use old method, if we don't have anything better
+        sourceSet.addAll(temp);
+        // clumsy guess with target
+        for (String s : temp) {
+            String t = transliterate(s);
+            if (!s.equals(t)) {
+                targetSet.addAll(t);
+            }
+        }
+    }
+
+    /**
+     * Returns the intersection of this instance's filter with an external filter.
+     * The externalFilter should be frozen: it is returned as-is (not copied or frozen here) when this instance has no filter.
+     * The result may be frozen, so don't attempt to modify.
+     * @internal
+     */
+   // TODO change to getMergedFilter
+    public UnicodeSet getFilterAsUnicodeSet(UnicodeSet externalFilter) {
+        if (filter == null) {
+            return externalFilter;
+        }
+        UnicodeSet filterSet = new UnicodeSet(externalFilter);
+        // Most, but not all filters will be UnicodeSets.  Optimize for
+        // the high-runner case.
+        UnicodeSet temp;
+        try {
+            temp = (UnicodeSet) filter;
+        } catch (ClassCastException e) {
+            filter.addMatchSetTo(temp = new UnicodeSet());
+        }
+        return filterSet.retainAll(temp).freeze();
     }
 
     /**
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/UnescapeTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/UnescapeTransliterator.java
index a9f3840b9b..dc573cb612 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/UnescapeTransliterator.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/UnescapeTransliterator.java
@@ -8,6 +8,7 @@
 **********************************************************************
 */
 package com.ibm.icu.text;
+import com.ibm.icu.impl.Utility;
 import com.ibm.icu.lang.UCharacter;
 
 /**
@@ -248,4 +249,38 @@ class UnescapeTransliterator extends Transliterator {
         pos.limit = limit;
         pos.start = start;
     }
+
+    /* (non-Javadoc)
+     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
+     */
+    @Override
+    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
+        // Each form consists of a prefix, suffix,
+        // radix, minimum digit count, and maximum digit count.  These
+        // values are stored as a five character header. ...
+        UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
+        UnicodeSet items = new UnicodeSet();
+        StringBuilder buffer = new StringBuilder();
+        for (int i = 0; spec[i] != END;) {
+            // first 5 items are header
+            int end = i + spec[i] + spec[i+1] + 5;
+            int radix = spec[i+2];
+            for (int j = 0; j < radix; ++j) {
+                Utility.appendNumber(buffer, j, radix, 0);
+            }
+            // then add the characters
+            for (int j = i + 5; j < end; ++j) {
+                items.add(spec[j]);
+            }
+            // and go to next block
+            i = end;
+        }
+        items.addAll(buffer.toString());
+        items.retainAll(myFilter);
+
+        if (items.size() > 0) {
+            sourceSet.addAll(items);
+            targetSet.addAll(0,0x10FFFF); // assume we can produce any character
+        }
+    }
 }
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/UnicodeNameTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/UnicodeNameTransliterator.java
index 12b832fd2f..f5bb5ad89d 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/UnicodeNameTransliterator.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/UnicodeNameTransliterator.java
@@ -70,4 +70,25 @@ class UnicodeNameTransliterator extends Transliterator {
         offsets.limit = limit;
         offsets.start = cursor;
     }
+
+    /* (non-Javadoc)
+     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
+     */
+    @Override
+    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
+        UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
+        if (myFilter.size() > 0) {
+            sourceSet.addAll(myFilter);
+            targetSet.addAll('0', '9')
+            .addAll('A', 'Z')
+            .add('-')
+            .add(' ')
+            .addAll(OPEN_DELIM)
+            .add(CLOSE_DELIM)
+            .addAll('a', 'z') // for controls
+            .add('<').add('>') // for controls
+            .add('(').add(')') // for controls
+            ;
+        }
+    }
 }
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java
index 6767d703ce..d8b54d5672 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java
@@ -7,6 +7,7 @@
 package com.ibm.icu.text;
 
 import com.ibm.icu.impl.UCaseProps;
+import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.util.ULocale;
 
 /**
@@ -105,4 +106,24 @@ class UppercaseTransliterator extends Transliterator {
         }
         offsets.start = offsets.limit;
     }
+
+    // NOTE: a cache like this would normally be static, but the transform built below uses this instance's locale.
+    SourceTargetUtility sourceTargetUtility = null;
+    
+    /* Creates sourceTargetUtility on first use, inside synchronized (this), then delegates the work to it.
+     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
+     */
+    @Override
+    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
+        synchronized (this) {
+            if (sourceTargetUtility == null) { // first call on this instance: build the helper
+                sourceTargetUtility = new SourceTargetUtility(new Transform<String,String>() {
+                    public String transform(String source) {
+                        return UCharacter.toUpperCase(locale, source); // uppercases using this instance's locale
+                    }
+                });
+            }
+        }
+        sourceTargetUtility.addSourceTargetSet(this, inputFilter, sourceSet, targetSet);
+    }
 }
diff --git a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
index 87ea37c616..8095696f0a 100644
--- a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
+++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
@@ -13,13 +13,18 @@ import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Locale;
+import java.util.Map.Entry;
 
 import com.ibm.icu.dev.test.TestFmwk;
 import com.ibm.icu.dev.test.TestUtil;
+import com.ibm.icu.dev.test.util.UnicodeMap;
 import com.ibm.icu.impl.Utility;
 import com.ibm.icu.impl.UtilityExtensions;
+import com.ibm.icu.lang.CharSequences;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.CanonicalIterator;
+import com.ibm.icu.text.Normalizer2;
 import com.ibm.icu.text.Replaceable;
 import com.ibm.icu.text.ReplaceableString;
 import com.ibm.icu.text.StringTransform;
@@ -28,6 +33,7 @@ import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeFilter;
 import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.text.UnicodeSetIterator;
+import com.ibm.icu.text.Normalizer2.Mode;
 import com.ibm.icu.util.CaseInsensitiveString;
 import com.ibm.icu.util.ULocale;
 
@@ -480,6 +486,16 @@ public class TransliteratorTest extends TestFmwk {
      * Do some basic tests of filtering.
      */
     public void TestFiltering() {
+        
+        Transliterator tempTrans = Transliterator.createFromRules("temp", "x > y; x{a} > b; ", Transliterator.FORWARD);
+        tempTrans.setFilter(new UnicodeSet("[a]"));
+        String tempResult = tempTrans.transform("xa");
+        assertEquals("context should not be filtered ", "xb", tempResult);
+        
+        tempTrans = Transliterator.createFromRules("temp", "::[a]; x > y; x{a} > b; ", Transliterator.FORWARD);
+        tempResult = tempTrans.transform("xa");
+        assertEquals("context should not be filtered ", "xb", tempResult);
+        
         Transliterator hex = Transliterator.getInstance("Any-Hex");
         hex.setFilter(new UnicodeFilter() {
             public boolean contains(int c) {
@@ -2997,6 +3013,358 @@ public class TransliteratorTest extends TestFmwk {
         }
     }
 
+    public void TestSourceTargetSet2() {
+        // Compares getSourceSet()/getTargetSet() of several rule sets against the characters that
+        // actually change, or are produced, when a large collection of sample strings is transformed.
+        Normalizer2 nfkd = Normalizer2.getInstance(null, "NFKC", Mode.DECOMPOSE);
+        Normalizer2 nfc = Normalizer2.getInstance(null, "NFC", Mode.COMPOSE);
+        Normalizer2 nfd = Normalizer2.getInstance(null, "NFC", Mode.DECOMPOSE);
+        //        UnicodeSet nfkdSource = new UnicodeSet();
+        //        UnicodeSet nfkdTarget = new UnicodeSet();
+        //        for (int i = 0; i <= 0x10FFFF; ++i) {
+        //            if (nfkd.isInert(i)) {
+        //                continue;
+        //            }
+        //            nfkdSource.add(i);
+        //            String t = nfkd.getDecomposition(i);
+        //            if (t != null) {
+        //                nfkdTarget.addAll(t);
+        //            } else {
+        //                nfkdTarget.add(i);
+        //            }
+        //        }
+        //        nfkdSource.freeze();
+        //        nfkdTarget.freeze();
+        //        logln("NFKD Source: " + nfkdSource.toPattern(false));
+        //        logln("NFKD Target: " + nfkdTarget.toPattern(false));
+
+        UnicodeMap<UnicodeSet> leadToTrail = new UnicodeMap<UnicodeSet>();
+        UnicodeMap<UnicodeSet> leadToSources = new UnicodeMap<UnicodeSet>();
+        UnicodeSet nonStarters = new UnicodeSet("[:^ccc=0:]").freeze();
+        CanonicalIterator can = new CanonicalIterator("");
+
+        UnicodeSet disorderedMarks = new UnicodeSet();
+
+        for (int i = 0; i <= 0x10FFFF; ++i) {
+            String s = nfd.getDecomposition(i);
+            if (s == null) {
+                continue;
+            }
+            
+            can.setSource(s);
+            for (String t = can.next(); t != null; t = can.next()) {
+                disorderedMarks.add(t);
+            }
+            
+            // if s has two code points, (or more), add the lead/trail information
+            int first = s.codePointAt(0);
+            int firstCount = Character.charCount(first);
+            if (s.length() == firstCount) continue;
+            String trailString = s.substring(firstCount);
+
+            // add all the trail characters
+            if (!nonStarters.containsSome(trailString)) {
+               continue; 
+            }
+            UnicodeSet trailSet = leadToTrail.get(first);
+            if (trailSet == null) {
+                leadToTrail.put(first, trailSet = new UnicodeSet());
+            }
+            trailSet.addAll(trailString); // add remaining trails
+
+            // add the sources
+            UnicodeSet sourcesSet = leadToSources.get(first);
+            if (sourcesSet == null) {
+                leadToSources.put(first, sourcesSet = new UnicodeSet());
+            }
+            sourcesSet.add(i);
+        }
+
+
+        for (Entry<String, UnicodeSet> x : leadToSources.entrySet()) {
+            String lead = x.getKey();
+            UnicodeSet sources = x.getValue();
+            UnicodeSet trailSet = leadToTrail.get(lead);
+            for (String source : sources) {
+                for (String trail : trailSet) {
+                    can.setSource(source + trail);
+                    for (String t = can.next(); t != null; t = can.next()) {
+                        if (t.endsWith(trail)) continue;
+                        disorderedMarks.add(t);
+                    }
+                }
+            }
+        }
+
+
+        for (String s : nonStarters) {
+            disorderedMarks.add("\u0345" + s);
+            disorderedMarks.add(s+"\u0323");
+            String xx = nfc.normalize("\u01EC" + s); // U+01EC, escaped so the literal does not depend on the source encoding
+            if (!xx.startsWith("\u01EC")) {
+                logln("??");
+            }
+        }
+
+//        for (int i = 0; i <= 0x10FFFF; ++i) {
+//            String s = nfkd.getDecomposition(i);
+//            if (s != null) {
+//                disorderedMarks.add(s);
+//                disorderedMarks.add(nfc.normalize(s));
+//                addDerivedStrings(nfc, disorderedMarks, s);
+//            }            
+//            s = nfd.getDecomposition(i);
+//            if (s != null) {
+//                disorderedMarks.add(s);
+//            }
+//            if (!nfc.isInert(i)) {
+//                if (i == 0x00C0) {
+//                    logln("À");
+//                }
+//                can.setSource(s+"\u0334");
+//                for (String t = can.next(); t != null; t = can.next()) {
+//                    addDerivedStrings(nfc, disorderedMarks, t);
+//                }
+//                can.setSource(s+"\u0345");
+//                for (String t = can.next(); t != null; t = can.next()) {
+//                    addDerivedStrings(nfc, disorderedMarks, t);
+//                }
+//                can.setSource(s+"\u0323");
+//                for (String t = can.next(); t != null; t = can.next()) {
+//                    addDerivedStrings(nfc, disorderedMarks, t);
+//                }
+//            }
+//        }
+        logln("Test cases: " + disorderedMarks.size());
+        disorderedMarks.addAll(0,0x10FFFF).freeze(); // also test every single code point
+        logln("isInert \u0104 " + nfc.isInert('\u0104'));
+
+        Object[][] rules = {
+                {":: [:sc=COMMON:] any-name;", null},
+
+                {":: [:Greek:] hex-any/C;", null},
+                {":: [:Greek:] any-hex/C;", null},
+
+                {":: [[:Mn:][:Me:]] remove;", null},
+                {":: [[:Mn:][:Me:]] null;", null},
+
+
+                {":: lower;", null},
+                {":: upper;", null},
+                {":: title;", null},
+                {":: CaseFold;", null},
+                
+                {":: NFD;", null},
+                {":: NFC;", null},
+                {":: NFKD;", null},
+                {":: NFKC;", null},
+                
+                {":: [[:Mn:][:Me:]] NFKD;", null},
+                {":: Latin-Greek;", null},
+                {":: [:Latin:] NFKD;", null},
+                {":: NFKD;", null},
+                {":: NFKD;\n" +
+                    ":: [[:Mn:][:Me:]] remove;\n" +
+                    ":: NFC;", null},
+        };
+        for (Object[] rulex : rules) {
+            String rule = (String) rulex[0];
+            Transliterator trans = Transliterator.createFromRules("temp", rule, Transliterator.FORWARD);
+            UnicodeSet actualSource = trans.getSourceSet();
+            UnicodeSet actualTarget = trans.getTargetSet();
+            UnicodeSet empiricalSource = new UnicodeSet();
+            UnicodeSet empiricalTarget = new UnicodeSet();
+            String ruleDisplay = rule.replace("\n", "\t\t");
+            UnicodeSet toTest = disorderedMarks;
+//            if (rulex[1] != null) {
+//                toTest = new UnicodeSet(disorderedMarks);
+//                toTest.addAll((UnicodeSet) rulex[1]);
+//            }
+
+            String test = nfd.normalize("\u0104"); // U+0104, escaped so the literal does not depend on the source encoding
+            boolean DEBUG = true;
+            int count = 0; // for debugging
+            for (String s : toTest) {
+                if (s.equals(test)) {
+                    logln(test);
+                }
+                String t = trans.transform(s);
+                if (!s.equals(t)) {
+                    if (!isAtomic(s, t, trans)) {
+                        // the change is accounted for by separately transformed pieces of s, so skip it
+                        continue;
+                    }
+
+                    // only keep the part that changed; so skip the front and end.
+                    //                    int start = findSharedStartLength(s,t);
+                    //                    int end = findSharedEndLength(s,t);
+                    //                    if (start != 0 || end != 0) {
+                    //                        s = s.substring(start, s.length() - end);
+                    //                        t = t.substring(start, t.length() - end);
+                    //                    }
+                    if (DEBUG) {
+                        if (!actualSource.containsAll(s)) {
+                            count++;
+                        }
+                        if (!actualTarget.containsAll(t)) {
+                            count++;
+                        }
+                    }
+                    addSourceTarget(s, empiricalSource, t, empiricalTarget);
+                }
+            }
+            assertEquals("getSource(" + ruleDisplay + ")", empiricalSource, actualSource, SetAssert.MISSING_OK);
+            assertEquals("getTarget(" + ruleDisplay + ")", empiricalTarget, actualTarget, SetAssert.MISSING_OK);
+        }
+    }
+
+    /**
+     * Reports whether the change from s to t has to be treated as one unit.
+     * <p>
+     * s is split at every index from 1 to s.length() - 1 that CharSequences.onCharacterBoundary accepts.
+     * If, for some split, t is exactly the transformed first part followed by the transformed
+     * second part, the two parts account for t on their own and the pair is not atomic.
+     *
+     * @param s the source string
+     * @param t the string s is being compared with (the caller passes trans.transform(s))
+     * @param trans the transliterator applied to the two parts of s
+     * @return false if some split of s accounts for t, true otherwise
+     */
+    private boolean isAtomic(String s, String t, Transliterator trans) {
+        for (int split = 1; split < s.length(); ++split) {
+            if (!CharSequences.onCharacterBoundary(s, split)) {
+                continue;
+            }
+            String head = trans.transform(s.substring(0, split));
+            if (!t.startsWith(head)) {
+                continue;
+            }
+            String tail = trans.transform(s.substring(split));
+            if (t.length() == head.length() + tail.length() && t.endsWith(tail)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    private void addSourceTarget(String s, UnicodeSet expectedSource, String t, UnicodeSet expectedTarget) {
+        if (t.length() != 0) { // an empty result adds nothing to the target set
+            expectedTarget.addAll(t);
+        }
+        expectedSource.addAll(s); // the characters of the input are always recorded as source
+    }
+
+    private void addDerivedStrings(Normalizer2 nfc, UnicodeSet disorderedMarks, String s) {
+        disorderedMarks.add(s);
+        for (int j = 1; j < s.length(); ++j) {
+            if (CharSequences.onCharacterBoundary(s, j)) {
+                String shorter = s.substring(0,j);
+                disorderedMarks.add(shorter);
+                disorderedMarks.add(nfc.normalize(shorter) + s.substring(j));
+            }
+        }
+    }
+
+    public void TestCharUtils() {
+        // each row is {expected shared length, first string, second string}
+        final String[][] prefixCases = {
+                {"1", "a", "ab"},
+                {"0", "a", "xb"},
+                {"0", "\uD800", "\uD800\uDC01"},
+                {"1", "\uD800a", "\uD800b"},
+                {"0", "\uD800\uDC00", "\uD800\uDC01"},
+        };
+        for (String[] testCase : prefixCases) {
+            final int shared = findSharedStartLength(testCase[1], testCase[2]);
+            final String label = "findSharedStartLength(" + testCase[1] + "," + testCase[2] + ")";
+            assertEquals(label, Integer.parseInt(testCase[0]), shared);
+        }
+        // same row layout, this time for the shared suffix
+        final String[][] suffixCases = {
+                {"0", "\uDC00", "\uD801\uDC00"},
+                {"1", "a", "ba"},
+                {"0", "a", "bx"},
+                {"1", "a\uDC00", "b\uDC00"},
+                {"0", "\uD800\uDC00", "\uD801\uDC00"},
+        };
+        for (String[] testCase : suffixCases) {
+            final int shared = findSharedEndLength(testCase[1], testCase[2]);
+            final String label = "findSharedEndLength(" + testCase[1] + "," + testCase[2] + ")";
+            assertEquals(label, Integer.parseInt(testCase[0]), shared);
+        }
+    }
+
+    /**
+     * @param s
+     * @param t
+     * @return
+     */
+    // TODO make generally available
+    private static int findSharedStartLength(CharSequence s, CharSequence t) {
+        int min = Math.min(s.length(), t.length());
+        int i;
+        char sch, tch;
+        for (i = 0; i < min; ++i) {
+            sch = s.charAt(i);
+            tch = t.charAt(i);
+            if (sch != tch) {
+                break;
+            }
+        }
+        return CharSequences.onCharacterBoundary(s,i) && CharSequences.onCharacterBoundary(t,i) ? i : i - 1;
+    }
+
+    /**
+     * @param s
+     * @param t
+     * @return
+     */
+    // TODO make generally available
+    private static int findSharedEndLength(CharSequence s, CharSequence t) {
+        int slength = s.length();
+        int tlength = t.length();
+        int min = Math.min(slength, tlength);
+        int i;
+        char sch, tch;
+        // TODO can make the calculations slightly faster... Not sure if it is worth the complication, tho'
+        for (i = 0; i < min; ++i) {
+            sch = s.charAt(slength - i - 1);
+            tch = t.charAt(tlength - i - 1);
+            if (sch != tch) {
+                break;
+            }
+        }
+        return CharSequences.onCharacterBoundary(s,slength - i) && CharSequences.onCharacterBoundary(t,tlength - i) ? i : i - 1;
+    }
+
+    enum SetAssert {EQUALS, MISSING_OK, EXTRA_OK} // type of the last argument of assertEquals(String, UnicodeSet, UnicodeSet, SetAssert)
+
+    void assertEquals(String message, UnicodeSet empirical, UnicodeSet actual, SetAssert setAssert) {
+        boolean haveError = false;
+        if (!actual.containsAll(empirical)) {
+            UnicodeSet missing = new UnicodeSet(empirical).removeAll(actual);
+            errln(message + " \tgetXSet < empirical (" + missing.size() + "): " + toPattern(missing));
+            haveError = true;
+        }
+        if (!empirical.containsAll(actual)) {
+            UnicodeSet extra = new UnicodeSet(actual).removeAll(empirical);
+            logln("WARNING: " + message + " \tgetXSet > empirical (" + extra.size() + "): " + toPattern(extra));
+            haveError = true;
+        }
+        if (!haveError) {
+            logln("OK " + message + ' ' + toPattern(empirical));
+        }
+    }
+
+    private String toPattern(UnicodeSet missing) {
+        String result = missing.toPattern(false);
+        if (result.length() < 200) {
+            return result;
+        }
+        return result.substring(0, CharSequences.onCharacterBoundary(result, 200) ? 200 : 199) + "…";
+    }
+
+
     /**
      * Test handling of rule whitespace, for both RBT and UnicodeSet.
      */
@@ -3741,7 +4109,7 @@ the ::BEGIN/::END stuff)
             Transliterator.createFromRules("gif", "\\", Transliterator.FORWARD);
         } catch(Exception e){
             errln("TransliteratorParser.nextLine() was not suppose to return an " +
-                    "exception for a rule of '\\'");
+            "exception for a rule of '\\'");
         }
     }
 }