ICU-9353 merge dbbi-tries work into the trunk

X-SVN-Rev: 32185
2012-08-16 23:16:04 +00:00 · 2012-08-16 23:16:04 +00:00 · ed2c14b425
commit ed2c14b425
parent c64c0299d7
26 changed files with 1372 additions and 1271 deletions
--- a/icu4j/build.xml
+++ b/icu4j/build.xml
@ -1583,7 +1583,7 @@
                <include name="**/pnames.icu"/>
                <include name="**/*.res"/>
                <include name="**/*.brk"/>
-                <include name="**/*.ctd"/>
+                <include name="**/*.dict"/>
                <include name="**/*.nrm"/>
                <exclude name="**/coll/*.res"/>
                <exclude name="**/translit/*.res"/>
@ -1676,7 +1676,7 @@
                <include name="**/unames.icu"/>
                <include name="**/pnames.icu"/>
                <include name="**/*.brk"/>
-                <include name="**/*.ctd"/>
+                <include name="**/*.dict"/>
                <include name="**/*.nrm"/>
                <include name="**/brkitr/*.res"/>
                <include name="**/translit/*.res"/>
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/CharacterIteration.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/CharacterIteration.java
@ -0,0 +1,126 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 2012, International Business Machines Corporation and         *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ */
+package com.ibm.icu.impl;
+
+import java.text.CharacterIterator;
+
+import com.ibm.icu.text.UTF16;
+
+public final class CharacterIteration {
+    // disallow instantiation
+    private CharacterIteration() { }
+
+    // 32-bit sentinel value returned when an iterator has run out of range.
+    //     Positive value so the fast case (not end, not surrogate) can be checked
+    //     with a single test.  Declared final: it is a constant, never reassigned.
+    public static final int DONE32 = 0x7fffffff;
+
+    /**
+     * Move the iterator forward to the next code point, and return that code point,
+     *   leaving the iterator positioned at char returned.
+     *   For Supplementary chars, the iterator is left positioned at the lead surrogate.
+     * @param ci  The character iterator
+     * @return    The next code point, or DONE32 once the iterator has run off the end.
+     */
+    public static int next32(CharacterIterator ci) {
+        // If the current position is at a surrogate pair, move to the trail surrogate
+        //   which leaves it in position for the underlying iterator's next() to work.
+        int c= ci.current();
+        if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE && c<=UTF16.LEAD_SURROGATE_MAX_VALUE) {
+            c = ci.next();   
+            if (c<UTF16.TRAIL_SURROGATE_MIN_VALUE || c>UTF16.TRAIL_SURROGATE_MAX_VALUE) {
+               c = ci.previous();   
+            }
+        }
+
+        // For BMP chars, this next() is the real deal.
+        c = ci.next();
+        
+        // If we might have a lead surrogate, we need to peek ahead to get the trail
+        //  even though we don't want to really be positioned there.
+        if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
+            c = nextTrail32(ci, c);   
+        }
+        
+        if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != DONE32) {
+            // We got a supplementary char.  Back the iterator up to the position
+            // of the lead surrogate.
+            ci.previous();   
+        }
+        return c;
+   }
+
+    
+    // Out-of-line portion of the in-line next32 code.
+    // The call site does an initial ci.next() and calls this function
+    //    if the 16 bit value it gets is >= LEAD_SURROGATE_MIN_VALUE.
+    // NOTE:  we leave the underlying char iterator positioned in the
+    //        middle of a surrogate pair.  ci.next() will work correctly
+    //        from there, but the ci.getIndex() will be wrong, and needs
+    //        adjustment.
+    public static int nextTrail32(CharacterIterator ci, int lead) {
+        int retVal = lead;
+        if (lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
+            char  cTrail = ci.next();
+            if (UTF16.isTrailSurrogate(cTrail)) {
+                retVal = ((lead  - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
+                            (cTrail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
+                            UTF16.SUPPLEMENTARY_MIN_VALUE;
+            } else {
+                ci.previous();   // unpaired lead: undo the peek and return the lead as-is
+            }
+        } else {
+            if (lead == CharacterIterator.DONE && ci.getIndex() >= ci.getEndIndex()) {
+                retVal = DONE32;   // a DONE value at or past the end index means end of text
+            }
+        }
+        return retVal;
+    }
+       
+    public static int previous32(CharacterIterator ci) {   // step back one code point and return it
+        if (ci.getIndex() <= ci.getBeginIndex()) {
+            return DONE32;   // already at the start: nothing before this position
+        }
+        char trail = ci.previous();
+        int retVal = trail;
+        if (UTF16.isTrailSurrogate(trail) && ci.getIndex()>ci.getBeginIndex()) {
+            char lead = ci.previous();   // look one more unit back for a matching lead surrogate
+            if (UTF16.isLeadSurrogate(lead)) {
+                retVal = (((int)lead  - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
+                          ((int)trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
+                          UTF16.SUPPLEMENTARY_MIN_VALUE;
+            } else {
+                ci.next();   // unpaired trail: undo the extra step back
+            }           
+        }
+        return retVal;
+    }
+   
+    public static int current32(CharacterIterator ci) {   // code point at the current position; index is restored
+        char  lead   = ci.current();
+        int   retVal = lead;
+        if (retVal < UTF16.LEAD_SURROGATE_MIN_VALUE) {
+            return retVal;   // fast path: below the surrogate range, so neither a surrogate nor DONE
+        }
+        if (UTF16.isLeadSurrogate(lead)) {
+            int  trail = (int)ci.next();   // peek at the following unit...
+            ci.previous();                 // ...then step back to where we started
+            if (UTF16.isTrailSurrogate((char)trail)) {
+                retVal = ((lead  - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
+                         (trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
+                         UTF16.SUPPLEMENTARY_MIN_VALUE;
+            }
+         } else {
+            if (lead == CharacterIterator.DONE) {
+                if (ci.getIndex() >= ci.getEndIndex())   {
+                    retVal = DONE32;   // DONE at or past the end index means end of text
+                }
+            }
+         }
+        return retVal;
+    }
+}
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIterator.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIterator.java
@ -732,6 +732,11 @@ s     */

        BreakIteratorCache cache = new BreakIteratorCache(where, result);
        iterCache[kind] = new SoftReference<BreakIteratorCache>(cache);
+        if (result instanceof RuleBasedBreakIterator) {
+            RuleBasedBreakIterator rbbi = (RuleBasedBreakIterator)result;
+            rbbi.setBreakType(kind);
+        }
+
        return result;
    }

--- a/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java
@ -1,6 +1,6 @@
 /*
 *******************************************************************************
- * Copyright (C) 2002-2010, International Business Machines Corporation and    *
+ * Copyright (C) 2002-2012, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
@ -90,28 +90,20 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
     *             pre-compiled break rules.  The resource bundle name is "boundaries".
     *             The value for each key will be the rules to be used for the
     *             specified locale - "word" -> "word_th" for Thai, for example.
-     *  DICTIONARY_POSSIBLE indexes in the same way, and indicates whether a
-     *             dictionary is a possibility for that type of break.  This is just
-     *             an optimization to avoid a resource lookup where no dictionary is
-     *             ever possible.
     */
    private static final String[] KIND_NAMES = {
            "grapheme", "word", "line", "sentence", "title"
-        };
-    private static final boolean[] DICTIONARY_POSSIBLE = {
-            false,      true,  true,   false,     false
    };


    private static BreakIterator createBreakInstance(ULocale locale, int kind) {

-        BreakIterator    iter       = null;
-        ICUResourceBundle rb        = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_BRKITR_BASE_NAME, locale);
+        RuleBasedBreakIterator    iter = null;
+        ICUResourceBundle rb           = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_BRKITR_BASE_NAME, locale);
        
        //
-        //  Get the binary rules.  These are needed for both normal RulesBasedBreakIterators
-        //                         and for Dictionary iterators.
-        //
+        //  Get the binary rules.
+        // 
        InputStream      ruleStream = null;
        try {
            String         typeKey       = KIND_NAMES[kind];
@ -122,51 +114,22 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
        catch (Exception e) {
            throw new MissingResourceException(e.toString(),"","");
        }
- 
-        //
-        //  Check whether a dictionary exists, and create a DBBI iterator is
-        //   one does.
-        //
-        if (DICTIONARY_POSSIBLE[kind]) {
-            // This type of break iterator could potentially use a dictionary.
-            //
-            try {
-                if (locale.getLanguage().equals("th")){
-                    // If the language is Thai, load the thai compact trie dictionary.
-                    String dictType = "Thai";
-                    String dictFileName = rb.getStringWithFallback("dictionaries/" + dictType);
-                    dictFileName = ICUResourceBundle.ICU_BUNDLE +ICUResourceBundle.ICU_BRKITR_NAME+ "/" + dictFileName;
-                    InputStream is = ICUData.getStream(dictFileName);
-                    iter = new ThaiBreakIterator(ruleStream, is);
-                }
-            } catch (MissingResourceException e) {
-                //  Couldn't find a dictionary.
-                //  This is normal, and will occur whenever creating a word or line
-                //  break iterator for a locale that does not have a BreakDictionaryData
-                //  resource - meaning for all but Thai.
-                //  Fall through to creating a normal RulebasedBreakIterator.
-            } catch (IOException e) {
-                Assert.fail(e);
-            }
-         }

-        if (iter == null) {
-            //
-            // Create a normal RuleBasedBreakIterator.
-            //    We have determined that this is not supposed to be a dictionary iterator.
-            //
-            try {
-                iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(ruleStream);
-            }
-            catch (IOException e) {
-                // Shouldn't be possible to get here.
-                // If it happens, the compiled rules are probably corrupted in some way.
-                Assert.fail(e);
-           }
+        //
+        // Create a normal RuleBasedBreakIterator.
+        //
+        try {
+            iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(ruleStream);
+        }
+        catch (IOException e) {
+            // Shouldn't be possible to get here.
+            // If it happens, the compiled rules are probably corrupted in some way.
+            Assert.fail(e);
        }
        // TODO: Determine valid and actual locale correctly.
        ULocale uloc = ULocale.forLocale(rb.getLocale());
        iter.setLocale(uloc, uloc);
+        iter.setBreakType(kind);
        
        return iter;

--- a/icu4j/main/classes/core/src/com/ibm/icu/text/BytesDictionaryMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/BytesDictionaryMatcher.java
@ -0,0 +1,83 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 2012, International Business Machines Corporation and         *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ */
+package com.ibm.icu.text;
+
+import java.text.CharacterIterator;
+
+import com.ibm.icu.impl.Assert;
+import com.ibm.icu.util.BytesTrie;
+import com.ibm.icu.util.BytesTrie.Result;
+
+class BytesDictionaryMatcher extends DictionaryMatcher {
+    private final byte[] characters;
+    private final int transform;
+    
+    public BytesDictionaryMatcher(byte[] chars, int transform) {
+        characters = chars;
+        Assert.assrt((transform & DictionaryData.TRANSFORM_TYPE_MASK) == DictionaryData.TRANSFORM_TYPE_OFFSET);
+        // while there is only one transform type so far, save the entire transform constant so that
+        // if we add any others, we need only change code in transform() and the assert above rather
+        // than adding a "transform type" variable
+        this.transform = transform;
+    }
+    
+    private int transform(int c) {
+        if (c == 0x200D) { 
+            return 0xFF;
+        } else if (c == 0x200C) {
+            return 0xFE;
+        }
+
+        int delta = c - (transform & DictionaryData.TRANSFORM_OFFSET_MASK);
+        if (delta < 0 || 0xFD < delta) {
+            return -1;
+        }
+        return delta;
+    }
+
+    public int matches(CharacterIterator text_, int maxLength, int[] lengths, int[] count_, int limit, int[] values) {
+        UCharacterIterator text = UCharacterIterator.getInstance(text_);
+        BytesTrie bt = new BytesTrie(characters, 0);
+        int c = text.nextCodePoint();
+        Result result = bt.first(transform(c));
+        // TODO: should numChars count Character.charCount() ?
+        int numChars = 1;
+        int count = 0;
+        for (;;) {
+            if (result.hasValue()) {
+                if (count < limit) {
+                    if (values != null) {
+                        values[count] = bt.getValue();
+                    }
+                    lengths[count] = numChars;
+                    count++;
+                }
+                if (result == Result.FINAL_VALUE) {
+                    break;
+                }
+            } else if (result == Result.NO_MATCH) {
+                break;
+            }
+
+            if (numChars >= maxLength) {
+                break;
+            }
+
+            c = text.nextCodePoint();
+            ++numChars;
+            result = bt.next(transform(c));
+        }
+        count_[0] = count;
+        return numChars;
+    }
+
+    public int getType() {
+        return DictionaryData.TRIE_TYPE_BYTES;
+    }
+}
+
+
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsDictionaryMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsDictionaryMatcher.java
@ -0,0 +1,61 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 2012, International Business Machines Corporation and         *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ */
+package com.ibm.icu.text;
+
+import java.text.CharacterIterator;
+
+import com.ibm.icu.util.BytesTrie.Result;
+import com.ibm.icu.util.CharsTrie;
+
+class CharsDictionaryMatcher extends DictionaryMatcher {
+    private CharSequence characters;
+    
+    public CharsDictionaryMatcher(CharSequence chars) {
+        characters = chars;
+    }
+
+    public int matches(CharacterIterator text_, int maxLength, int[] lengths, int[] count_, int limit, int[] values) {
+        UCharacterIterator text = UCharacterIterator.getInstance(text_);
+        CharsTrie uct = new CharsTrie(characters, 0);
+        int c = text.nextCodePoint();
+        Result result = uct.firstForCodePoint(c);
+        // TODO: should numChars count Character.charCount?
+        int numChars = 1;
+        int count = 0;
+        for (;;) {
+            if (result.hasValue()) {
+                if (count < limit) {
+                    if (values != null) {
+                        values[count] = uct.getValue();
+                    }
+                    lengths[count] = numChars;
+                    count++;
+                }
+
+                if (result == Result.FINAL_VALUE) {
+                    break;
+                }
+            } else if (result == Result.NO_MATCH) {
+                break;
+            }
+
+            if (numChars >= maxLength) {
+                break;
+            }
+            c = text.nextCodePoint();
+            ++numChars;
+            result = uct.nextForCodePoint(c);
+        }
+        count_[0] = count;
+        return numChars;
+    }
+
+    public int getType() {
+        return DictionaryData.TRIE_TYPE_UCHARS;
+    }
+}
+
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/CjkBreakEngine.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CjkBreakEngine.java
@ -0,0 +1,218 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 2012, International Business Machines Corporation and         *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ */
+package com.ibm.icu.text;
+
+import java.io.IOException;
+import java.text.CharacterIterator;
+import java.util.Stack;
+
+import com.ibm.icu.impl.Assert;
+
+import static com.ibm.icu.impl.CharacterIteration.*;
+
+public class CjkBreakEngine implements LanguageBreakEngine {
+    private static final UnicodeSet fHangulWordSet = new UnicodeSet();
+    private static final UnicodeSet fHanWordSet = new UnicodeSet();
+    private static final UnicodeSet fKatakanaWordSet = new UnicodeSet();
+    private static final UnicodeSet fHiraganaWordSet = new UnicodeSet();
+    static {
+        fHangulWordSet.applyPattern("[\\uac00-\\ud7a3]");
+        fHanWordSet.applyPattern("[:Han:]");
+        fKatakanaWordSet.applyPattern("[[:Katakana:]\\uff9e\\uff9f]");
+        fHiraganaWordSet.applyPattern("[:Hiragana:]");
+        
+        // freeze them all
+        fHangulWordSet.freeze();
+        fHanWordSet.freeze();
+        fKatakanaWordSet.freeze();
+        fHiraganaWordSet.freeze();
+    }
+
+    private final UnicodeSet fWordSet;
+    private DictionaryMatcher fDictionary = null;
+    
+    public CjkBreakEngine(boolean korean) throws IOException {   // korean: handle Hangul only; otherwise Han + kana
+        fDictionary = DictionaryData.loadDictionaryFor("Hira");
+        if (korean) {
+            fWordSet = fHangulWordSet;
+        } else {
+            fWordSet = new UnicodeSet();
+            fWordSet.addAll(fHanWordSet);
+            fWordSet.addAll(fKatakanaWordSet);
+            fWordSet.addAll(fHiraganaWordSet);
+            fWordSet.add(0xff70).add(0x30fc);   // add U+FF70 and U+30FC as code points, not as one literal "\\uff70\\u30fc" string
+        }
+    }
+
+    public boolean handles(int c, int breakType) {
+        return (breakType == BreakIterator.KIND_WORD) &&
+                (fWordSet.contains(c));
+    }
+
+    private static final int kMaxKatakanaLength = 8;
+    private static final int kMaxKatakanaGroupLength = 20;
+    private static final int maxSnlp = 255;
+    private static final int kint32max = Integer.MAX_VALUE;
+    private static int getKatakanaCost(int wordlength) {
+        int katakanaCost[] =  new int[] { 8192, 984, 408, 240, 204, 252, 300, 372, 480 };
+        return (wordlength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordlength];
+    }
+    
+    private static boolean isKatakana(int value) {
+        return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
+                (value >= 0xFF66 && value <= 0xFF9F);
+    }
+    
+    public int findBreaks(CharacterIterator inText, int startPos, int endPos,
+            boolean reverse, int breakType, Stack<Integer> foundBreaks) {
+        if (startPos >= endPos) {
+            return 0;
+        }
+
+        inText.setIndex(startPos);
+
+        int inputLength = endPos - startPos;
+        int[] charPositions = new int[inputLength + 1];
+        StringBuffer s = new StringBuffer("");
+        inText.setIndex(startPos);
+        while (inText.getIndex() < endPos) {
+            s.append(inText.current());
+            inText.next();
+        }
+        String prenormstr = s.toString();
+        boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES ||
+                               Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0);
+        CharacterIterator text = inText;
+        int numChars = 0;
+        if (isNormalized) {
+            int index = 0;
+            charPositions[0] = 0;
+            while (index < prenormstr.length()) {
+                int codepoint = prenormstr.codePointAt(index);
+                index += Character.charCount(codepoint);
+                numChars++;
+                charPositions[numChars] = index;
+            }
+        } else {
+            String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC);
+            text = new java.text.StringCharacterIterator(normStr);
+            charPositions = new int[normStr.length() + 1];
+            Normalizer normalizer = new Normalizer(prenormstr, Normalizer.NFKC, 0);
+            int index = 0;
+            charPositions[0] = 0;
+            while (index < normalizer.endIndex()) {
+                normalizer.next();
+                numChars++;
+                index = normalizer.getIndex();
+                charPositions[numChars] = index;
+            }
+        }
+        
+        // From here on out, do the algorithm. Note that our indices
+        // refer to indices within the normalized string.
+        int[] bestSnlp = new int[numChars + 1];
+        bestSnlp[0] = 0;
+        for (int i = 1; i <= numChars; i++) {
+            bestSnlp[i] = kint32max;
+        }
+
+        int[] prev = new int[numChars + 1];
+        for (int i = 0; i <= numChars; i++) {
+            prev[i] = -1;
+        }
+        
+        final int maxWordSize = 20;
+        int values[] = new int[numChars];
+        int lengths[] = new int[numChars];
+        // dynamic programming to find the best segmentation
+        boolean is_prev_katakana = false;
+        for (int i = 0; i < numChars; i++) {
+            text.setIndex(i);
+            if (bestSnlp[i] == kint32max) {
+                continue;
+            }
+            
+            int maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i);
+            int[] count_ = new int[1];
+            fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
+            int count = count_[0];
+            
+            // if there are no single character matches found in the dictionary 
+            // starting with this character, treat character as a 1-character word
+            // with the highest value possible (i.e. the least likely to occur).
+            // Exclude Korean characters from this treatment, as they should be 
+            // left together by default.
+            if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) {
+                values[count] = maxSnlp;
+                lengths[count] = 1;
+                count++;
+            }
+
+            for (int j = 0; j < count; j++) {
+                int newSnlp = bestSnlp[i] + values[j];
+                if (newSnlp < bestSnlp[lengths[j] + i]) {
+                    bestSnlp[lengths[j] + i] = newSnlp;
+                    prev[lengths[j] + i] = i;
+                }
+            }
+            
+            // In Japanese, single-character Katakana words are pretty rare.
+            // So we apply the following heuristic to Katakana: any continuous
+            // run of Katakana characters is considered a candidate word with
+            // a default cost specified in the katakanaCost table according 
+            // to its length.
+            text.setIndex(i);
+            boolean is_katakana = isKatakana(current32(text));
+            if (!is_prev_katakana && is_katakana) {
+                int j = i + 1;
+                next32(text);
+                while (j < numChars && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) {
+                    next32(text);
+                    ++j;
+                }
+                
+                if ((j - i) < kMaxKatakanaGroupLength) {
+                    int newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
+                    if (newSnlp < bestSnlp[j]) {
+                        bestSnlp[j] = newSnlp;
+                        prev[j] = i;
+                    }
+                }
+            }
+            is_prev_katakana = is_katakana;
+        }
+
+        int t_boundary[] = new int[numChars + 1];
+        int numBreaks = 0;
+        if (bestSnlp[numChars] == kint32max) {
+            t_boundary[numBreaks] = numChars;
+            numBreaks++;
+        } else {
+            for (int i = numChars; i > 0; i = prev[i]) {
+                t_boundary[numBreaks] = i;
+                numBreaks++;
+            }
+            Assert.assrt(prev[t_boundary[numBreaks - 1]] == 0);
+        }
+
+        if (foundBreaks.size() == 0 || foundBreaks.peek() < startPos) {
+            t_boundary[numBreaks++] = 0;
+        }
+
+        for (int i = numBreaks - 1; i >= 0; i--) {
+            int pos = charPositions[t_boundary[i]] + startPos;
+            if (!(foundBreaks.contains(pos) || pos == startPos))
+                foundBreaks.push(charPositions[t_boundary[i]] + startPos);
+        }
+
+        if (!foundBreaks.empty() && foundBreaks.peek() == endPos)
+            foundBreaks.pop();
+        if (!foundBreaks.empty()) 
+            inText.setIndex(foundBreaks.peek());
+        return 0;
+    }
+}
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryBasedBreakIterator.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryBasedBreakIterator.java
@ -1,565 +0,0 @@
-/*
- *******************************************************************************
- * Copyright (C) 1996-2010, International Business Machines Corporation and    *
- * others. All Rights Reserved.                                                *
- *******************************************************************************
- */
-
-package com.ibm.icu.text;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.text.CharacterIterator;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Stack;
-
-import com.ibm.icu.impl.Assert;
-
-
-/**
- * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
- * to further subdivide ranges of text beyond what is possible using just the
- * state-table-based algorithm.  This is necessary, for example, to handle
- * word and line breaking in Thai, which doesn't use spaces between words.  The
- * state-table-based algorithm used by RuleBasedBreakIterator_Old is used to divide
- * up text as far as possible, and then contiguous ranges of letters are
- * repeatedly compared against a list of known words (i.e., the dictionary)
- * to divide them up into words.
- *
- * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator_Old,
- * but adds one more special substitution name: _dictionary_.  This substitution
- * name is used to identify characters in words in the dictionary.  The idea is that
- * if the iterator passes over a chunk of text that includes two or more characters
- * in a row that are included in _dictionary_, it goes back through that range and
- * derives additional break positions (if possible) using the dictionary.
- *
- * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
- * file.  It uses Class.getResource() to locate the dictionary file.  The
- * dictionary file is in a serialized binary format.  We have a very primitive (and
- * slow) BuildDictionaryFile utility for creating dictionary files, but aren't
- * currently making it public.  Contact us for help.
- *
- * @stable ICU 2.0
- */
-public class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {
-    
-    /**
-     * Keeps track of if we are using the compact trie dictionary.
-     */
-    private boolean usingCTDictionary = false;
-    /**
-     * a list of known words that is used to divide up contiguous ranges of letters,
-     * stored in a compressed, indexed, format that offers fast access
-     */
-    private BreakDictionary dictionary;
-
-    /*
-     * a list of flags indicating which character categories are contained in
-     * the dictionary file (this is used to determine which ranges of characters
-     * to apply the dictionary to)
-     */
-    //private boolean[] categoryFlags;
-
-
-    /**
-     * when a range of characters is divided up using the dictionary, the break
-     * positions that are discovered are stored here, preventing us from having
-     * to use either the dictionary or the state table again until the iterator
-     * leaves this range of text
-     */
-    int[] cachedBreakPositions;
-
-    /**
-     * if cachedBreakPositions is not null, this indicates which item in the
-     * cache the current iteration position refers to
-     */
-    int positionInCache;
-
-    /**
-     * Special variable name for characters in words in dictionary
-     */
-    
-    /**
-     * Construct a DictionarBasedBreakIterator from precompiled rules. Use by ThaiBreakEngine
-     * uses the BreakCTDictionary.
-     * @param compiledRules an input stream containing the binary (flattened) compiled rules.
-     * @internal
-     * @deprecated This API is ICU internal only.
-     */
-    protected DictionaryBasedBreakIterator(InputStream compiledRules) throws IOException {
-        fRData = RBBIDataWrapper.get(compiledRules);   // Init the RBBI part of this iterator.
-        dictionary = null;
-        usingCTDictionary = true;
-    }
-    /**
-     * Constructs a DictionaryBasedBreakIterator.
-     * @param rules Same as the rules parameter on RuleBasedBreakIterator,
-     * except for the special meaning of "_dictionary_".  This parameter is just
-     * passed through to RuleBasedBreakIterator constructor.
-     * @param dictionaryStream the stream containing the dictionary data
-     * @stable ICU 2.0
-     */
-    public DictionaryBasedBreakIterator(String rules,
-                                        InputStream dictionaryStream) throws IOException {
-        super(rules);
-        dictionary = new BreakDictionary(dictionaryStream);
-    }
-
-    
-    /**
-     * Construct a DictionarBasedBreakIterator from precompiled rules.
-     * @param compiledRules an input stream containing the binary (flattened) compiled rules.
-     * @param dictionaryStream an input stream containing the dictionary data
-     * @internal
-     * @deprecated This API is ICU internal only.
-     */
-    public DictionaryBasedBreakIterator(InputStream compiledRules,
-                                         InputStream dictionaryStream) throws IOException {
-       fRData = RBBIDataWrapper.get(compiledRules);   // Init the RBBI part of this iterator.
-       dictionary = new BreakDictionary(dictionaryStream);
-    }
-                    
-
-    /** @stable ICU 2.0 */
-    public void setText(CharacterIterator newText) {
-        super.setText(newText);
-        cachedBreakPositions = null;
-        fDictionaryCharCount = 0;
-        positionInCache = 0;
-    }
-
-    /**
-     * Sets the current iteration position to the beginning of the text.
-     * (i.e., the CharacterIterator's starting offset).
-     * @return The offset of the beginning of the text.
-     * @stable ICU 2.0
-     */
-    public int first() {
-        cachedBreakPositions = null;
-        fDictionaryCharCount = 0;
-        positionInCache = 0;
-        return super.first();
-    }
-
-    /**
-     * Sets the current iteration position to the end of the text.
-     * (i.e., the CharacterIterator's ending offset).
-     * @return The text's past-the-end offset.
-     * @stable ICU 2.0
-     */
-    public int last() {
-        cachedBreakPositions = null;
-        fDictionaryCharCount = 0;
-        positionInCache = 0;
-        return super.last();
-    }
-
-    /**
-     * Advances the iterator one step backwards.
-     * @return The position of the last boundary position before the
-     * current iteration position
-     * @stable ICU 2.0
-     */
-    public int previous() {
-        CharacterIterator text = getText();
-
-        // if we have cached break positions and we're still in the range
-        // covered by them, just move one step backward in the cache
-        if (cachedBreakPositions != null && positionInCache > 0) {
-            --positionInCache;
-            text.setIndex(cachedBreakPositions[positionInCache]);
-            return cachedBreakPositions[positionInCache];
-        }
-
-        // otherwise, dump the cache and use the inherited previous() method to move
-        // backward.  This may fill up the cache with new break positions, in which
-        // case we have to mark our position in the cache. If it doesn't, use next()
-        // to move forward until we hit or pass the current position. This *will* fill
-        // the cache.
-        else {
-            cachedBreakPositions = null;
-            int offset = current();
-            int result = super.previous();
-            
-            if (cachedBreakPositions != null) {
-                positionInCache = cachedBreakPositions.length - 2;
-                return result;
-            }
-            
-            while (result < offset) {
-                int nextResult = next();
-                
-                if (nextResult >= offset) {
-                    break;
-                }
-                
-                result = nextResult;
-            }
-            
-            if (cachedBreakPositions != null) {
-                positionInCache = cachedBreakPositions.length - 2;
-            }
-            
-            if (result != BreakIterator.DONE) {
-                text.setIndex(result);
-            }
-            
-            return result;
-        }
-    }
-
-    /**
-     * Sets the current iteration position to the last boundary position
-     * before the specified position.
-     * @param offset The position to begin searching from
-     * @return The position of the last boundary before "offset"
-     * @stable ICU 2.0
-     */
-    public int preceding(int offset) {
-        CharacterIterator text = getText();
-        checkOffset(offset, text);
-
-        // if we have no cached break positions, or "offset" is outside the
-        // range covered by the cache, we can just call the inherited routine
-        // (which will eventually call other routines in this class that may
-        // refresh the cache)
-        if (cachedBreakPositions == null || offset <= cachedBreakPositions[0] ||
-                offset > cachedBreakPositions[cachedBreakPositions.length - 1]) {
-            cachedBreakPositions = null;
-            return super.preceding(offset);
-        }
-
-        // on the other hand, if "offset" is within the range covered by the cache,
-        // then all we have to do is search the cache for the last break position
-        // before "offset"
-        else {
-            positionInCache = 0;
-            while (positionInCache < cachedBreakPositions.length
-                   && offset > cachedBreakPositions[positionInCache])
-                ++positionInCache;
-            --positionInCache;
-            text.setIndex(cachedBreakPositions[positionInCache]);
-            return text.getIndex();
-        }
-    }
-
-    /**
-     * Sets the current iteration position to the first boundary position after
-     * the specified position.
-     * @param offset The position to begin searching forward from
-     * @return The position of the first boundary after "offset"
-     * @stable ICU 2.0
-     */
-    public int following(int offset) {
-        CharacterIterator text = getText();
-        checkOffset(offset, text);
-
-        // if we have no cached break positions, or if "offset" is outside the
-        // range covered by the cache, then dump the cache and call our
-        // inherited following() method.  This will call other methods in this
-        // class that may refresh the cache.
-        if (cachedBreakPositions == null || offset < cachedBreakPositions[0] ||
-                offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) {
-            cachedBreakPositions = null;
-            return super.following(offset);
-        }
-
-        // on the other hand, if "offset" is within the range covered by the
-        // cache, then just search the cache for the first break position
-        // after "offset"
-        else {
-            positionInCache = 0;
-            while (positionInCache < cachedBreakPositions.length
-                   && offset >= cachedBreakPositions[positionInCache])
-                ++positionInCache;
-            text.setIndex(cachedBreakPositions[positionInCache]);
-            return text.getIndex();
-        }
-    }
-    
-    
-    /**
-     * Return the status tag from the break rule that determined the most recently
-     * returned break position. 
-     * 
-     * TODO:  not supported with dictionary based break iterators.
-     *
-     * @return the status from the break rule that determined the most recently
-     * returned break position.
-     * @draft ICU 3.0
-     * @provisional This API might change or be removed in a future release.
-     */
-     public int getRuleStatus() {
-        return 0;
-     }
-
-
-    /**
-     * Get the status (tag) values from the break rule(s) that determined the most 
-     * recently returned break position.  The values appear in the rule source
-     * within brackets, {123}, for example.  The default status value for rules
-     * that do not explicitly provide one is zero.
-     * <p>
-     * TODO: not supported for dictionary based break iterator. 
-     *
-     * @param fillInArray an array to be filled in with the status values.  
-     * @return          The number of rule status values from rules that determined 
-     *                  the most recent boundary returned by the break iterator.
-     *                  In the event that the array is too small, the return value
-     *                  is the total number of status values that were available,
-     *                  not the reduced number that were actually returned.
-     * @draft ICU 3.0
-     * @provisional This API might change or be removed in a future release.
-     */
-    public int getRuleStatusVec(int[] fillInArray) {
-        if (fillInArray != null && fillInArray.length>=1) {  
-            fillInArray[0] = 0;
-        }
-        return 1;
-    }
-    /**
-     * This is the implementation function for next().
-     * @internal
-     * @deprecated This API is ICU internal only.
-     */
-    protected int handleNext() {
-        CharacterIterator text = getText();
-
-        // if there are no cached break positions, or if we've just moved
-        // off the end of the range covered by the cache, we have to dump
-        // and possibly regenerate the cache
-        if (cachedBreakPositions == null || positionInCache == cachedBreakPositions.length - 1) {
-
-            // start by using the inherited handleNext() to find a tentative return
-            // value.   dictionaryCharCount tells us how many dictionary characters
-            // we passed over on our way to the tentative return value
-            int startPos = text.getIndex();
-            fDictionaryCharCount = 0;
-            int result = super.handleNext();
-
-            // if we passed over more than one dictionary character, then we use
-            // divideUpDictionaryRange() to regenerate the cached break positions
-            // for the new range.
-            if (!usingCTDictionary && fDictionaryCharCount > 1 && result - startPos > 1) {
-                divideUpDictionaryRange(startPos, result);
-            }
-
-            // otherwise, the value we got back from the inherited fuction
-            // is our return value, and we can dump the cache
-            else {
-                cachedBreakPositions = null;
-                return result;
-            }
-        }
-
-        // if the cache of break positions has been regenerated (or existed all
-        // along), then just advance to the next break position in the cache
-        // and return it
-        if (cachedBreakPositions != null) {
-            ++positionInCache;
-            text.setIndex(cachedBreakPositions[positionInCache]);
-            return cachedBreakPositions[positionInCache];
-        }
-        ///CLOVER:OFF
-        Assert.assrt(false);
-        return -9999;   // SHOULD NEVER GET HERE!
-        ///CLOVER:ON
-    }
-
-    /**
-     * This is the function that actually implements the dictionary-based
-     * algorithm.  Given the endpoints of a range of text, it uses the
-     * dictionary to determine the positions of any boundaries in this
-     * range.  It stores all the boundary positions it discovers in
-     * cachedBreakPositions so that we only have to do this work once
-     * for each time we enter the range.
-     */
-    @SuppressWarnings("unchecked")
-    private void divideUpDictionaryRange(int startPos, int endPos) {
-        CharacterIterator text = getText();
-
-        // the range we're dividing may begin or end with non-dictionary characters
-        // (i.e., for line breaking, we may have leading or trailing punctuation
-        // that needs to be kept with the word).  Seek from the beginning of the
-        // range to the first dictionary character
-        text.setIndex(startPos);
-        int c = CICurrent32(text);
-        while (isDictionaryChar(c) == false) {  
-            c = CINext32(text);
-        }
-        
-        //System.out.println("\nDividing up range from " + (text.getIndex() + 1) + " to " + endPos);
-
-        // initialize.  We maintain two stacks: currentBreakPositions contains
-        // the list of break positions that will be returned if we successfully
-        // finish traversing the whole range now.  possibleBreakPositions lists
-        // all other possible word ends we've passed along the way.  (Whenever
-        // we reach an error [a sequence of characters that can't begin any word
-        // in the dictionary], we back up, possibly delete some breaks from
-        // currentBreakPositions, move a break from possibleBreakPositions
-        // to currentBreakPositions, and start over from there.  This process
-        // continues in this way until we either successfully make it all the way
-        // across the range, or exhaust all of our combinations of break
-        // positions.)
-        Stack<Integer> currentBreakPositions = new Stack<Integer>();
-        Stack<Integer> possibleBreakPositions = new Stack<Integer>();
-        List<Integer> wrongBreakPositions = new ArrayList<Integer>();
-
-        // the dictionary is implemented as a trie, which is treated as a state
-        // machine.  -1 represents the end of a legal word.  Every word in the
-        // dictionary is represented by a path from the root node to -1.  A path
-        // that ends in state 0 is an illegal combination of characters.
-        int state = 0;
-
-        // these two variables are used for error handling.  We keep track of the
-        // farthest we've gotten through the range being divided, and the combination
-        // of breaks that got us that far.  If we use up all possible break
-        // combinations, the text contains an error or a word that's not in the
-        // dictionary.  In this case, we "bless" the break positions that got us the
-        // farthest as real break positions, and then start over from scratch with
-        // the character where the error occurred.
-        int farthestEndPoint = text.getIndex();
-        Stack<Integer> bestBreakPositions = null;
-
-        // initialize (we always exit the loop with a break statement)
-        c = CICurrent32(text);
-        while (true) {
-//System.out.print("c = " + Integer.toString(c, 16) + ", pos = " + text.getIndex());
-
-            // if we can transition to state "-1" from our current state, we're
-            // on the last character of a legal word.  Push that position onto
-            // the possible-break-positions stack
-            if (dictionary.at(state, 0) == -1) {
-                possibleBreakPositions.push(Integer.valueOf(text.getIndex()));
-            }
-
-            // look up the new state to transition to in the dictionary
-            //    There will be no supplementaries here because the Thai dictionary
-            //     does not include any.  This code is going away soon, not worth
-            //     fixing.
-            state = (dictionary.at(state, (char)c)) & 0xFFFF;  // TODO: fix supplementaries
-//System.out.print(", state = " + state);
-
-            // if the character we're sitting on causes us to transition to
-            // the "end of word" state, then it was a non-dictionary character
-            // and we've successfully traversed the whole range.  Drop out
-            // of the loop.
-            if (state == /*-1*/ 0xFFFF) {
-                currentBreakPositions.push(Integer.valueOf(text.getIndex()));
-                break;
-            }
-
-            // if the character we're sitting on causes us to transition to
-            // the error state, or if we've gone off the end of the range
-            // without transitioning to the "end of word" state, we've hit
-            // an error...
-            else if (state == 0 || text.getIndex() >= endPos) {
-
-                // if this is the farthest we've gotten, take note of it in
-                // case there's an error in the text
-                if (text.getIndex() > farthestEndPoint) {
-                    farthestEndPoint = text.getIndex();
-                    bestBreakPositions = (Stack<Integer>)(currentBreakPositions.clone());
-                }
-
-                // wrongBreakPositions is a list of all break positions we've tried starting
-                // that didn't allow us to traverse all the way through the text.  Every time
-                // we pop a break position off of currentBreakPositions, we put it into
-                // wrongBreakPositions to avoid trying it again later.  If we make it to this
-                // spot, we're either going to back up to a break in possibleBreakPositions
-                // and try starting over from there, or we've exhausted all possible break
-                // positions and are going to do the fallback procedure.  This loop prevents
-                // us from messing with anything in possibleBreakPositions that didn't work as
-                // a starting point the last time we tried it (this is to prevent a bunch of
-                // repetitive checks from slowing down some extreme cases)
-                // variable not used Integer newStartingSpot = null;
-                while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
-                            possibleBreakPositions.peek())) {
-                    possibleBreakPositions.pop();
-                }
-
-                // if we've used up all possible break-position combinations, there's
-                // an error or an unknown word in the text.  In this case, we start
-                // over, treating the farthest character we've reached as the beginning
-                // of the range, and "blessing" the break positions that got us that
-                // far as real break positions
-                if (possibleBreakPositions.isEmpty()) {
-                    if (bestBreakPositions != null) {
-                        currentBreakPositions = bestBreakPositions;
-                        if (farthestEndPoint < endPos) {
-                            text.setIndex(farthestEndPoint + 1);
-                        }
-                        else {
-                            break;
-                        }
-                    }
-                    else {
-                        if ((currentBreakPositions.size() == 0
-                                || currentBreakPositions.peek().intValue() != text.getIndex())
-                                && text.getIndex() != startPos) {
-                            currentBreakPositions.push(Integer.valueOf(text.getIndex()));
-                        }
-                        CINext32(text);
-                        currentBreakPositions.push(Integer.valueOf(text.getIndex()));
-                    }
-                }
-
-                // if we still have more break positions we can try, then promote the
-                // last break in possibleBreakPositions into currentBreakPositions,
-                // and get rid of all entries in currentBreakPositions that come after
-                // it.  Then back up to that position and start over from there (i.e.,
-                // treat that position as the beginning of a new word)
-                else {
-                    Integer temp = possibleBreakPositions.pop();
-                    Integer temp2 = null;
-                    while (!currentBreakPositions.isEmpty() && temp.intValue() <
-                           currentBreakPositions.peek().intValue()) {
-                        temp2 = currentBreakPositions.pop();
-                        wrongBreakPositions.add(temp2);
-                    }
-                    currentBreakPositions.push(temp);
-                    text.setIndex(currentBreakPositions.peek().intValue());
-                }
-
-                // re-sync "c" for the next go-round, and drop out of the loop if
-                // we've made it off the end of the range
-                c = CICurrent32(text);
-                state = 0;
-                if (text.getIndex() >= endPos) {
-                    break;
-                }
-            }
-
-            // if we didn't hit any exceptional conditions on this last iteration,
-            // just advance to the next character and loop
-            else {
-                c = CINext32(text);
-            }
-//System.out.print(", possibleBreakPositions = { "); for (int i = 0; i < possibleBreakPositions.size(); i++) System.out.print(possibleBreakPositions.elementAt(i) + " "); System.out.print("}");
-//System.out.print(", currentBreakPositions = { "); for (int i = 0; i < currentBreakPositions.size(); i++) System.out.print(currentBreakPositions.elementAt(i) + " "); System.out.println("}");
-        }
-
-        // dump the last break position in the list, and replace it with the actual
-        // end of the range (which may be the same character, or may be further on
-        // because the range actually ended with non-dictionary characters we want to
-        // keep with the word)
-        if (!currentBreakPositions.isEmpty()) {
-            currentBreakPositions.pop();
-        }
-        currentBreakPositions.push(Integer.valueOf(endPos));
-
-        // create a regular array to hold the break positions and copy
-        // the break positions from the stack to the array (in addition,
-        // our starting position goes into this array as a break position).
-        // This array becomes the cache of break positions used by next()
-        // and previous(), so this is where we actually refresh the cache.
-        cachedBreakPositions = new int[currentBreakPositions.size() + 1];
-        cachedBreakPositions[0] = startPos;
-
-        for (int i = 0; i < currentBreakPositions.size(); i++) {
-            cachedBreakPositions[i + 1] = currentBreakPositions.elementAt(i).intValue();
-        }
-        positionInCache = 0;
-    }
-}
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryBreakEngine.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryBreakEngine.java
@ -0,0 +1,69 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 2012, International Business Machines Corporation and         *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ */
+package com.ibm.icu.text;
+
+import java.text.CharacterIterator;
+import java.util.Stack;
+
+/**
+ * Base class for break engines that divide up a run of text with the help of a
+ * dictionary. This class locates the run of characters contained in
+ * {@link #fSet} and delegates the actual subdivision of that run to
+ * {@link #divideUpDictionaryRange}.
+ */
+abstract class DictionaryBreakEngine implements LanguageBreakEngine {
+    /** Set of characters this engine recognizes; created empty here. */
+    protected UnicodeSet fSet = new UnicodeSet();
+    /** Bit mask of supported break types: bit n is set when break type n can use this engine. */
+    private final int fTypes;
+
+    /**
+     * @param breakTypes A mask of the break iterators that can use this engine.
+     *  For example, (1 << KIND_WORD) | (1 << KIND_LINE) could be used by 
+     *  word iterators and line iterators, but not any other kind.
+     */
+    public DictionaryBreakEngine(int breakTypes) {
+        // TODO: consider using a java.util.BitSet with nbits <= 32
+        fTypes = breakTypes;
+    }
+
+    /**
+     * Tells whether the given break type is in the range 0..31 and has its
+     * bit set in the mask passed to the constructor.
+     */
+    private boolean handlesType(int breakType) {
+        return breakType >= 0 && breakType < 32 && ((1 << breakType) & fTypes) != 0;
+    }
+
+    /**
+     * Tells whether this engine can be used for the given character and break type.
+     *
+     * @param c the character to test against {@link #fSet}
+     * @param breakType the kind of break iterator asking
+     * @return true if the break type is supported and the character is in the set
+     */
+    public boolean handles(int c, int breakType) {
+        return handlesType(breakType) && // breakType is in range and can use us
+                fSet.contains(c); // we recognize the character
+    }
+
+    /**
+     * Finds the run of characters from {@link #fSet} that starts at the
+     * iterator's current index and passes it to
+     * {@link #divideUpDictionaryRange}. The run extends backwards (bounded by
+     * startPos) when reverse is true, otherwise forwards (bounded by endPos).
+     * On return the iterator is positioned at the index where the scan stopped.
+     *
+     * @param text_ the text to scan, positioned where the scan should begin
+     * @param startPos lower bound of the scan
+     * @param endPos upper bound of the scan
+     * @param reverse true to scan backwards from the current index
+     * @param breakType the kind of break iterator asking
+     * @param foundBreaks passed through to {@link #divideUpDictionaryRange}
+     * @return 0 if the break type is not supported, otherwise whatever
+     *         {@link #divideUpDictionaryRange} returns
+     */
+    public int findBreaks(CharacterIterator text_, int startPos, int endPos, 
+            boolean reverse, int breakType, Stack<Integer> foundBreaks) {
+        if (!handlesType(breakType)) {
+            return 0;
+        }
+
+        UCharacterIterator text = UCharacterIterator.getInstance(text_);
+        int start = text.getIndex();
+        int current, rangeStart, rangeEnd;
+        int c = text.current();
+        if (reverse) {
+            boolean isDict = fSet.contains(c);
+            while ((current = text.getIndex()) > startPos && isDict) {
+                // previous() moves back first and returns the unit at the new
+                // index, so c stays in step with the iterator position
+                c = text.previous();
+                isDict = fSet.contains(c);
+            }
+            rangeStart = (current < startPos) ? startPos :
+                current + (isDict ? 0 : 1);
+            rangeEnd = start + 1;
+        } else {
+            while ((current = text.getIndex()) < endPos && fSet.contains(c)) {
+                // UCharacterIterator.next() is post-increment: it returns the
+                // unit at the old index. Re-read the unit at the new index so
+                // the character tested always belongs to "current"; otherwise
+                // the scan runs one position past the first non-set character.
+                text.next();
+                c = text.current();
+            }
+            rangeStart = start;
+            rangeEnd = current;
+        }
+
+        int result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
+        text.setIndex(current);
+
+        return result;
+    }
+
+    /**
+     * Subdivides the range found by {@link #findBreaks}.
+     *
+     * @param text the text being broken
+     * @param rangeStart start of the range computed by findBreaks
+     * @param rangeEnd end of the range computed by findBreaks
+     * @param foundBreaks the stack handed to findBreaks; presumably receives the
+     *            break positions found (confirm against the implementations)
+     * @return the value findBreaks hands back to its caller
+     */
+    protected abstract int divideUpDictionaryRange(UCharacterIterator text, 
+            int rangeStart, int rangeEnd, Stack<Integer> foundBreaks);
+}
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryData.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryData.java
@ -0,0 +1,90 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 2012, International Business Machines Corporation and         *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ */
+package com.ibm.icu.text;
+
+import java.io.DataInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import com.ibm.icu.impl.Assert;
+import com.ibm.icu.impl.ICUBinary;
+import com.ibm.icu.impl.ICUData;
+import com.ibm.icu.impl.ICUResourceBundle;
+import com.ibm.icu.util.UResourceBundle;
+
+/**
+ * Constants describing the binary dictionary data layout, plus the loader that
+ * turns a dictionary data file into a {@link DictionaryMatcher}.
+ */
+final class DictionaryData {
+    // disallow instantiation
+    private DictionaryData() { }
+
+    // Values of indexes[IX_TRIE_TYPE] after masking with TRIE_TYPE_MASK.
+    public static final int TRIE_TYPE_BYTES = 0;
+    public static final int TRIE_TYPE_UCHARS = 1;
+    public static final int TRIE_TYPE_MASK = 7;
+    public static final int TRIE_HAS_VALUES = 8;
+    public static final int TRANSFORM_NONE = 0;
+    public static final int TRANSFORM_TYPE_OFFSET = 0x1000000;
+    public static final int TRANSFORM_TYPE_MASK = 0x7f000000;
+    public static final int TRANSFORM_OFFSET_MASK = 0x1fffff;
+
+    // Positions of the fields in the int indexes[] array that follows the header.
+    public static final int IX_STRING_TRIE_OFFSET = 0;
+    public static final int IX_RESERVED1_OFFSET = 1;
+    public static final int IX_RESERVED2_OFFSET = 2;
+    public static final int IX_TOTAL_SIZE = 3;
+    public static final int IX_TRIE_TYPE = 4;
+    public static final int IX_TRANSFORM = 5;
+    public static final int IX_RESERVED6 = 6;
+    public static final int IX_RESERVED7 = 7;
+    public static final int IX_COUNT = 8;
+
+    // The ASCII bytes of "Dict"; passed to ICUBinary.readHeader() as the format id.
+    private static final byte DATA_FORMAT_ID[] = { (byte) 0x44, (byte) 0x69,
+        (byte) 0x63, (byte) 0x74 };
+    
+    /**
+     * Loads the dictionary registered under "dictionaries/&lt;dictType&gt;" in the
+     * break iterator resource bundle and wraps its trie in a matcher.
+     *
+     * @param dictType key of the dictionary in the bundle's "dictionaries" table
+     * @return a BytesDictionaryMatcher or CharsDictionaryMatcher depending on
+     *         the trie type stored in the data, or null for any other trie type
+     * @throws IOException if the data cannot be read completely
+     */
+    public static DictionaryMatcher loadDictionaryFor(String dictType) throws IOException {
+        ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_BRKITR_BASE_NAME);
+        String dictFileName = rb.getStringWithFallback("dictionaries/" + dictType);
+        dictFileName = ICUResourceBundle.ICU_BUNDLE + ICUResourceBundle.ICU_BRKITR_NAME + "/" + dictFileName;
+        InputStream is = ICUData.getStream(dictFileName);
+        try {
+            ICUBinary.readHeader(is, DATA_FORMAT_ID, null);
+            DataInputStream s = new DataInputStream(is);
+            int[] indexes = new int[IX_COUNT];
+            // TODO: read indexes[IX_STRING_TRIE_OFFSET] first, then read a variable-length indexes[]
+            for (int i = 0; i < IX_COUNT; i++) {
+                indexes[i] = s.readInt();
+            }
+            int offset = indexes[IX_STRING_TRIE_OFFSET];
+            Assert.assrt(offset >= (4 * IX_COUNT));
+
+            // Skip whatever lies between the indexes and the trie. skipBytes()
+            // may skip fewer bytes than asked for, so keep going until done;
+            // when it makes no progress, read one byte instead, which throws
+            // an EOFException if the data really ends here.
+            int toSkip = offset - (4 * IX_COUNT);
+            while (toSkip > 0) {
+                int skipped = s.skipBytes(toSkip);
+                if (skipped <= 0) {
+                    s.readByte();
+                    skipped = 1;
+                }
+                toSkip -= skipped;
+            }
+
+            int trieType = indexes[IX_TRIE_TYPE] & TRIE_TYPE_MASK;
+            int totalSize = indexes[IX_TOTAL_SIZE] - offset;
+            DictionaryMatcher m = null;
+            if (trieType == TRIE_TYPE_BYTES) {
+                int transform = indexes[IX_TRANSFORM];
+                byte[] data = new byte[totalSize];
+                // fills the whole array or throws EOFException
+                s.readFully(data);
+                m = new BytesDictionaryMatcher(data, transform);
+            } else if (trieType == TRIE_TYPE_UCHARS) {
+                Assert.assrt(totalSize % 2 == 0);
+                int num = totalSize / 2;
+                char[] data = new char[num];
+                for (int i = 0; i < num; i++) {
+                    data[i] = s.readChar();
+                }
+                m = new CharsDictionaryMatcher(new String(data));
+            }
+            return m;
+        } finally {
+            // Release the stream on every path, including failed reads.
+            // Closing the underlying stream is enough; the DataInputStream
+            // wrapper holds nothing else.
+            if (is != null) {
+                is.close();
+            }
+        }
+    }
+}
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryMatcher.java
@ -0,0 +1,40 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 2012, International Business Machines Corporation and         *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ */
+package com.ibm.icu.text;
+
+import java.text.CharacterIterator;
+
+/**
+ * DictionaryMatcher is the abstract base class that allows arbitrary "types" of
+ * back-end data structures to be used with the break iteration code.
+ * Subclasses implement the six-argument {@code matches} and {@code getType};
+ * the five-argument {@code matches} is a convenience overload provided here.
+ */
+abstract class DictionaryMatcher {
+    /**
+     * Find dictionary words that match the text.
+     *
+     * @param text A CharacterIterator representing the text. The iterator is
+     *            left after the longest prefix match in the dictionary.
+     * @param maxLength The maximum number of code units to match.
+     * @param lengths An array that is filled with the lengths of words that matched.
+     * @param count An output array filled with the number of elements output in lengths
+     *            (presumably stored in count[0] -- TODO confirm against the implementations).
+     * @param limit The maximum number of words to output. Must be less than or equal to lengths.length.
+     * @param values Filled with the weight values associated with the various words.
+     *            The five-argument overload passes null for this argument.
+     * @return The number of characters in text that were matched.
+     */
+    public abstract int matches(CharacterIterator text, int maxLength, int[] lengths,
+            int[] count, int limit, int[] values);
+    /**
+     * Convenience overload for callers that do not need the word weights: forwards
+     * to the six-argument {@code matches} with {@code null} as the values array.
+     *
+     * @param text A CharacterIterator representing the text.
+     * @param maxLength The maximum number of code units to match.
+     * @param lengths An array that is filled with the lengths of words that matched.
+     * @param count An output array filled with the number of elements output in lengths.
+     * @param limit The maximum number of words to output. Must be less than or equal to lengths.length.
+     * @return The number of characters in text that were matched.
+     */
+    public int matches(CharacterIterator text, int maxLength, int[] lengths, 
+            int[] count, int limit) {
+        return matches(text, maxLength, lengths, count, limit, null);
+    }
+
+    /**
+     * @return the kind of dictionary that this matcher is using
+     */
+    public abstract int getType();
+}
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/LanguageBreakEngine.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/LanguageBreakEngine.java
@ -0,0 +1,40 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 2012, International Business Machines Corporation and         *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ */
+package com.ibm.icu.text;
+
+import java.text.CharacterIterator;
+import java.util.Stack;
+
+/**
+ * The LanguageBreakEngine interface is to be used to implement any
+ * language-specific logic for break iteration. An engine first declares,
+ * through {@code handles}, which characters it is responsible for, and then
+ * locates the breaks inside a range of text through {@code findBreaks}.
+ */
+interface LanguageBreakEngine {
+    /**
+     * Reports whether this engine is able to process the given character for
+     * the given kind of break iteration.
+     *
+     * @param c A Unicode codepoint value
+     * @param breakType The kind of break iterator that is wanting to make use
+     *  of this engine - character, word, line, sentence
+     *  (presumably one of the BreakIterator.KIND_* constants -- confirm against callers)
+     * @return true if the engine can handle this character, false otherwise
+     */
+    public boolean handles(int c, int breakType);
+
+    /**
+     * Implements the actual breaking logic.
+     * @param text The text to break over
+     * @param startPos The index of the beginning of our range
+     * @param endPos The index of the possible end of our range. It is possible,
+     *  however, that our range ends earlier
+     * @param reverse true iff we are iterating backwards (in a call to
+     *  previous(), for example)
+     * @param breakType The kind of break iterator that is wanting to make use
+     *  of this engine - character, word, line, sentence
+     * @param foundBreaks A Stack that the breaks found will be added to;
+     *  it is supplied by the caller and used as an output parameter
+     * @return the number of words found
+     */
+    public int findBreaks(CharacterIterator text, int startPos, int endPos,
+            boolean reverse, int breakType, Stack<Integer> foundBreaks);
+}
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/ThaiBreakIterator.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/ThaiBreakIterator.java
@ -1,20 +1,20 @@
 /*
 *******************************************************************************
- * Copyright (C) 1996-2011, International Business Machines Corporation and    *
+ * Copyright (C) 2012, International Business Machines Corporation and         *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
 package com.ibm.icu.text;

 import java.io.IOException;
-import java.io.InputStream;
 import java.text.CharacterIterator;
 import java.util.Stack;

-import com.ibm.icu.impl.Assert;
-
-class ThaiBreakIterator extends DictionaryBasedBreakIterator {
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
+import com.ibm.icu.lang.UScript;

+public class ThaiBreakEngine implements LanguageBreakEngine {
    /* Helper class for improving readability of the Thai word break
     * algorithm.
     */
@ -25,7 +25,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
        //list of word candidate lengths, in increasing length order
        private int lengths[];
        private int count[];    // Count of candidates
-        private int prefix;     // The longeset match with a dictionary word
+        private int prefix;     // The longest match with a dictionary word
        private int offset;     // Offset in the text of these candidates
        private int mark;       // The preferred candidate's offset
        private int current;    // The candidate we're currently looking at
@ -38,7 +38,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
        }

        // Fill the list of candidates if needed, select the longest, and return the number found
-        public int candidates(CharacterIterator fIter, BreakCTDictionary dict, int rangeEnd) {
+        public int candidates(CharacterIterator fIter, DictionaryMatcher dict, int rangeEnd) {
            int start = fIter.getIndex();
            if (start != offset) {
                offset = start;
@ -62,7 +62,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
            return lengths[mark];
        }

-        // Backup from the current candidate to the next shorter one; rreturn true if that exists
+        // Backup from the current candidate to the next shorter one; return true if that exists
        // and point the text after it
        public boolean backUp(CharacterIterator fIter) {
            if (current > 0) {
@ -82,14 +82,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
            mark = current;
        }
    }
-
-    private static UnicodeSet fThaiWordSet;
-    private static UnicodeSet fEndWordSet;
-    private static UnicodeSet fBeginWordSet;
-    private static UnicodeSet fSuffixSet;
-    private static UnicodeSet fMarkSet;
-    private BreakCTDictionary fDictionary;
-
+    
    // Constants for ThaiBreakIterator
    // How many words in a row are "good enough"?
    private static final byte THAI_LOOKAHEAD = 3;
@ -104,9 +97,14 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
    private static final char THAI_MAIYAMOK = 0x0E46;
    // Minimum word size
    private static final byte THAI_MIN_WORD = 2;
-    // Minimum number of characters for two words
-    //private final int THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
-
+    
+    private DictionaryMatcher fDictionary;
+    private static UnicodeSet fThaiWordSet;
+    private static UnicodeSet fEndWordSet;
+    private static UnicodeSet fBeginWordSet;
+    private static UnicodeSet fSuffixSet;
+    private static UnicodeSet fMarkSet;
+    
    static {
        // Initialize UnicodeSets
        fThaiWordSet = new UnicodeSet();
@ -141,73 +139,28 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
        fBeginWordSet.freeze();
        fSuffixSet.freeze();
    }
-
-    public ThaiBreakIterator(InputStream ruleStream, InputStream dictionaryStream) throws IOException {
-        super(ruleStream);
-        // Initialize diciontary
-        fDictionary = new BreakCTDictionary(dictionaryStream);
+    
+    public ThaiBreakEngine() throws IOException {
+        // Initialize dictionary
+        fDictionary = DictionaryData.loadDictionaryFor("Thai");
    }

-    /**
-     * This is the implementation function for next().
-     */
-    protected int handleNext() {
-        CharacterIterator text = getText();
-
-        // if there are no cached break positions, or if we've just moved
-        // off the end of the range covered by the cache, we have to dump
-        // and possibly regenerate the cache
-        if (cachedBreakPositions == null || positionInCache == cachedBreakPositions.length - 1) {
-
-            // start by using the inherited handleNext() to find a tentative return
-            // value.   dictionaryCharCount tells us how many dictionary characters
-            // we passed over on our way to the tentative return value
-            int startPos = text.getIndex();
-            fDictionaryCharCount = 0;
-            int result = super.handleNext();
-
-            // if we passed over more than one dictionary character, then we use
-            // divideUpDictionaryRange() to regenerate the cached break positions
-            // for the new range
-            if (fDictionaryCharCount > 1 && result - startPos > 1) {
-                divideUpDictionaryRange(startPos, result);
-            }
-
-            // otherwise, the value we got back from the inherited fuction
-            // is our return value, and we can dump the cache
-            else {
-                cachedBreakPositions = null;
-                return result;
-            }
+    public boolean handles(int c, int breakType) {
+        if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
+            int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
+            return (script == UScript.THAI);
        }
-        // if the cache of break positions has been regenerated (or existed all
-        // along), then just advance to the next break position in the cache
-        // and return it
-        if (cachedBreakPositions != null) {
-            ++positionInCache;
-            text.setIndex(cachedBreakPositions[positionInCache]);
-            return cachedBreakPositions[positionInCache];
-        }
-        Assert.assrt(false);
-        return -9999;   // SHOULD NEVER GET HERE!
+        return false;
    }

-    /**
-     * Divide up a range of known dictionary characters.
-     *
-     * @param rangeStart The start of the range of dictionary characters
-     * @param rangeEnd The end of the range of dictionary characters
-     * @return The number of breaks found
-     */
-    private int divideUpDictionaryRange(int rangeStart, int rangeEnd) {
+    public int findBreaks(CharacterIterator fIter, int rangeStart, int rangeEnd, boolean reverse, int breakType,
+            Stack<Integer> foundBreaks) {
        if ((rangeEnd - rangeStart) < THAI_MIN_WORD) {
-            return 0;  // Not enough chacters for word
+            return 0;  // Not enough characters for word
        }
-        CharacterIterator fIter = getText();
        int wordsFound = 0;
        int wordLength;
        int current;
-        Stack<Integer> foundBreaks = new Stack<Integer>();
        PossibleWord words[] = new PossibleWord[THAI_LOOKAHEAD];
        for (int i = 0; i < THAI_LOOKAHEAD; i++) {
            words[i] = new PossibleWord();
@ -228,7 +181,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
                wordsFound += 1;
            }

-            // If there was more than one, see which one can take use forward the most words
+            // If there was more than one, see which one can take us forward the most words
            else if (candidates > 1) {
                boolean foundBest = false;
                // If we're already at the end of the range, we're done
@ -259,9 +212,10 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
                        }
                    } while (words[wordsFound%THAI_LOOKAHEAD].backUp(fIter) && !foundBest);
                }
-                /* foundBest: */wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter);
+                wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter);
                wordsFound += 1;
            }
+
            // We come here after having either found a word or not. We look ahead to the
            // next word. If it's not a dictionary word, we will combine it with the word we
            // just found (if there is one), but only if the preceding word does not exceed
@ -291,8 +245,8 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
                            // two characters after uc were not 0x0E4C THANTHAKHAT before
                            // checking the dictionary. That is just a performance filter,
                            // but it's not clear it's faster than checking the trie
-                            int candidate = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
-                            fIter.setIndex(current+wordLength+chars);
+                            int candidate = words[(wordsFound + 1) %THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
+                            fIter.setIndex(current + wordLength + chars);
                            if (candidate > 0) {
                                break;
                            }
@ -300,7 +254,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
                        pc = uc;
                    }

-                    // Bump the word cound if there wasn't already one
+                    // Bump the word count if there wasn't already one
                    if (wordLength <= 0) {
                        wordsFound += 1;
                    }
@ -351,13 +305,13 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
                        }
                    }
                } else {
-                    fIter.setIndex(current+wordLength);
+                    fIter.setIndex(current + wordLength);
                }
            }

            // Did we find a word on this iteration? If so, push it on the break stack
            if (wordLength > 0) {
-                foundBreaks.push(Integer.valueOf(current+wordLength));
+                foundBreaks.push(Integer.valueOf(current + wordLength));
            }
        }

@ -367,16 +321,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
            wordsFound -= 1;
        }

-        // Store the break points in cachedBreakPositions.
-        cachedBreakPositions = new int[foundBreaks.size() + 2];
-        cachedBreakPositions[0] = rangeStart;
-        int i;
-        for (i = 0; i < foundBreaks.size(); i++) {
-            cachedBreakPositions[i + 1] = foundBreaks.elementAt(i).intValue();
-        }
-        cachedBreakPositions[i + 1] = rangeEnd;
-        positionInCache = 0;
-
        return wordsFound;
    }
+
 }
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java
@ -0,0 +1,46 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 2012, International Business Machines Corporation and         *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ */
+package com.ibm.icu.text;
+
+import java.text.CharacterIterator;
+import java.util.Stack;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
+
+import static com.ibm.icu.impl.CharacterIteration.*;
+
+/**
+ * Break engine that remembers, per break type, the characters it has been told about
+ * through {@link #handleChar(int, int)} (together with every other character of the same
+ * script) and reports them as handled. Its {@link #findBreaks} adds no breaks; it only
+ * moves the iterator to the end of the range.
+ *
+ * <p>The per-break-type sets are frozen and published copy-on-write through a volatile
+ * array reference, so {@link #handles(int, int)} never observes a set while it is being
+ * modified, even if {@link #handleChar(int, int)} runs concurrently on another thread.
+ */
+public final class UnhandledBreakEngine implements LanguageBreakEngine {
+    // Indexed by break type (0 .. BreakIterator.KIND_TITLE). Neither the array nor the frozen
+    // sets stored in it are modified after publication; handleChar() installs a new array.
+    private volatile UnicodeSet[] fHandled;
+
+    /** Creates an engine that initially handles no character for any break type. */
+    public UnhandledBreakEngine() {
+        UnicodeSet[] sets = new UnicodeSet[BreakIterator.KIND_TITLE + 1];
+        for (int i = 0; i < sets.length; i++) {
+            sets[i] = new UnicodeSet().freeze();
+        }
+        fHandled = sets;
+    }
+
+    /**
+     * @param c A Unicode codepoint value
+     * @param breakType The kind of break iterator asking; values outside the range
+     *  0 .. BreakIterator.KIND_TITLE are never handled
+     * @return true if c belongs to a script previously recorded for breakType
+     */
+    public boolean handles(int c, int breakType) {
+        // Read the volatile field once so the bounds check and the lookup use the same snapshot.
+        UnicodeSet[] sets = fHandled;
+        return (breakType >= 0 && breakType < sets.length) &&
+                (sets[breakType].contains(c));
+    }
+
+    /**
+     * Finds no breaks: positions the iterator at endPos and reports zero words.
+     *
+     * @param text The text to break over; its index is set to endPos
+     * @param startPos Unused
+     * @param endPos The index the iterator is moved to
+     * @param reverse Unused
+     * @param breakType Unused
+     * @param foundBreaks Unused; nothing is pushed
+     * @return always 0
+     */
+    public int findBreaks(CharacterIterator text, int startPos, int endPos,
+            boolean reverse, int breakType, Stack<Integer> foundBreaks) {
+        text.setIndex(endPos);
+        return 0;
+    }
+
+    /**
+     * Records the whole script of c as handled for the given break type. Does nothing when
+     * breakType is out of range, when c is DONE32, or when c is already handled.
+     *
+     * @param c A Unicode codepoint value
+     * @param breakType The kind of break iterator the character was encountered by
+     */
+    public synchronized void handleChar(int c, int breakType) {
+        UnicodeSet[] current = fHandled;
+        if (breakType < 0 || breakType >= current.length || c == DONE32) {
+            return;
+        }
+        UnicodeSet known = current[breakType];
+        if (known.contains(c)) {
+            return;
+        }
+        int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
+        // applyIntPropertyValue() REPLACES the contents of the set it is called on, so it
+        // is applied to a fresh set and the previously recorded scripts are merged back in.
+        UnicodeSet updated = new UnicodeSet();
+        updated.applyIntPropertyValue(UProperty.SCRIPT, script);
+        updated.addAll(known);
+        updated.freeze();
+        // Copy-on-write: readers keep using the old array until the new one is published.
+        UnicodeSet[] next = current.clone();
+        next[breakType] = updated;
+        fHandled = next;
+    }
+}
--- a/icu4j/main/shared/data/icudata.jar
+++ b/icu4j/main/shared/data/icudata.jar
@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a243a8584459d751b33c922f2fbfaea27200721a1a27661b5fa2ec96bb5fc6e2
-size 7929565
+oid sha256:23641fd85dfa40f916a7a5b47a6dc8ebd591862a9fe2d62ddcd46b7f1a862d36
+size 9286396
--- a/icu4j/main/shared/data/icutzdata.jar
+++ b/icu4j/main/shared/data/icutzdata.jar
@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fc6ebf5e136b448a03a7e74463c67d96217cc9f9d3feed4d2aa7f74dc5e25e63
+oid sha256:e951e7a3cc20e7126326db97e92ce533db611fde39c201795680246fde86c8e0
 size 97666
--- a/icu4j/main/shared/data/testdata.jar
+++ b/icu4j/main/shared/data/testdata.jar
@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2029b2752b52d544749fffea9b2574ddfd19ea278cf5f26243efd98bd3f15313
-size 719725
+oid sha256:54eeee6d7834231edb7d2d9bd3174d3c4347c737f556bc6b25915bb6860b6fe2
+size 719912
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java
@ -1,16 +1,11 @@
 /*
 *******************************************************************************
- * Copyright (C) 1996-2010, International Business Machines Corporation and    *
+ * Copyright (C) 1996-2012, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
 package com.ibm.icu.dev.test.rbbi;

-import java.io.DataInputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
 import java.text.StringCharacterIterator;
 import java.util.ArrayList;
 import java.util.List;
@ -18,7 +13,6 @@ import java.util.Locale;

 import com.ibm.icu.dev.test.TestFmwk;
 import com.ibm.icu.text.BreakIterator;
-import com.ibm.icu.text.DictionaryBasedBreakIterator;

 public class BreakIteratorTest extends TestFmwk
 {
@ -849,52 +843,4 @@ public class BreakIteratorTest extends TestFmwk
            errln("ERR: Failed to create an instance type: " + type + " / locale: " + loc + " / exception: " + e.getMessage());
        }
    }
-    
-    /*
-     * Tests the constructors public DictionaryBasedBreakIterator(String rules, ... public
-     * DictionaryBasedBreakIterator(InputStream compiledRules, ...
-     */
-    public void TestDictionaryBasedBreakIterator() throws IOException {
-        // The following class allows the testing of the constructor
-        // public DictionaryBasedBreakIterator(String rules, ...
-        class TestDictionaryBasedBreakIterator extends DictionaryBasedBreakIterator {
-            public TestDictionaryBasedBreakIterator(InputStream is) throws IOException {
-                super("", is);
-            }
-        }
-        try {
-            @SuppressWarnings("unused")
-            TestDictionaryBasedBreakIterator td = new TestDictionaryBasedBreakIterator(null);
-            errln("DictionaryBasedBreakIterator constructor is suppose to return an "
-                    + "exception for an empty string.");
-        } catch (Exception e) {
-        }
-        
-        try {
-            File file = File.createTempFile("dummy", "");
-            FileInputStream fis = new FileInputStream(file);
-            DataInputStream dis = new DataInputStream(fis);
-            @SuppressWarnings("unused")
-            TestDictionaryBasedBreakIterator td = new TestDictionaryBasedBreakIterator(dis);
-            errln("DictionaryBasedBreakIterator constructor is suppose to return an "
-                    + "exception for a temporary file with EOF.");
-        } catch (Exception e) {
-        }
-        
-        // The following class allows the testing of the constructor
-        // public DictionaryBasedBreakIterator(InputStream compiledRules, ...
-        class TestDictionaryBasedBreakIterator1 extends DictionaryBasedBreakIterator {
-            public TestDictionaryBasedBreakIterator1() throws IOException {
-                super((InputStream) null, (InputStream) null);
-            }
-
-        }
-        try {
-            @SuppressWarnings("unused")
-            TestDictionaryBasedBreakIterator1 td1 = new TestDictionaryBasedBreakIterator1();
-            errln("DictionaryBasedBreakIterator constructor is suppose to return an "
-                    + "exception for an null input stream.");
-        } catch (Exception e) {
-        }
-    }   
-}
+}
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
@ -1,6 +1,6 @@
 /*
 *******************************************************************************
- * Copyright (C) 1996-2011, International Business Machines Corporation and
+ * Copyright (C) 1996-2012, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */
@ -20,7 +20,6 @@ import java.util.List;

 import com.ibm.icu.dev.test.TestFmwk;
 import com.ibm.icu.text.BreakIterator;
-import com.ibm.icu.text.DictionaryBasedBreakIterator;
 import com.ibm.icu.text.RuleBasedBreakIterator;
 import com.ibm.icu.util.ULocale;

@ -584,7 +583,7 @@ public class RBBITest extends TestFmwk {
           errln("Incorrect following position.");
       }
       int []fillInArray = new int[2];
-       if (((DictionaryBasedBreakIterator)brk).getRuleStatusVec(fillInArray) != 1 || fillInArray[0] != 0) {
+       if (((RuleBasedBreakIterator)brk).getRuleStatusVec(fillInArray) != 1 || fillInArray[0] != 0) {
           errln("Error: Since getRuleStatusVec is not supported in DictionaryBasedBreakIterator, it should return 1 and fillInArray[0] == 0.");
       }
   }
@ -663,11 +662,6 @@ public class RBBITest extends TestFmwk {
        final String posxWordText     = "Can't have breaks in xx:yy or struct.field for CS-types.";
        final int[]  posxWordTOffsets = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 };
        final int[]  posxWordROffsets = { 5, 6, 10, 11, 17, 18, 20, 21,         26, 27, 29, 30,         42, 43, 46, 47, 49, 50, 55, 56 };
-        // KIND_WORD "ja"
-        final String jaWordText     = "\u79C1\u9054\u306B\u4E00\u3007\u3007\u3007\u306E\u30B3\u30F3\u30D4\u30E5\u30FC\u30BF" +
-                                      "\u304C\u3042\u308B\u3002\u5948\u3005\u306F\u30EF\u30FC\u30C9\u3067\u3042\u308B\u3002";
-        final int[]  jaWordTOffsets = {    2, 3,          7, 8, 14,         17, 18,     20, 21, 24,         27, 28 };
-        final int[]  jaWordROffsets = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
        // KIND_SENTENCE "el"
        final String elSentText     = "\u0391\u03B2, \u03B3\u03B4; \u0395 \u03B6\u03B7\u037E \u0398 \u03B9\u03BA. " +
                                      "\u039B\u03BC \u03BD\u03BE! \u039F\u03C0, \u03A1\u03C2? \u03A3";
@ -688,8 +682,6 @@ public class RBBITest extends TestFmwk {
        final TBItem[] tests = {
            new TBItem( BreakIterator.KIND_WORD,      new ULocale("en_US_POSIX"), posxWordText, posxWordTOffsets ),
            new TBItem( BreakIterator.KIND_WORD,      ULocale.ROOT,               posxWordText, posxWordROffsets ),
-            new TBItem( BreakIterator.KIND_WORD,      new ULocale("ja"),          jaWordText,   jaWordTOffsets   ),
-            new TBItem( BreakIterator.KIND_WORD,      ULocale.ROOT,               jaWordText,   jaWordROffsets   ),
            new TBItem( BreakIterator.KIND_SENTENCE,  new ULocale("el"),          elSentText,   elSentTOffsets   ),
            new TBItem( BreakIterator.KIND_SENTENCE,  ULocale.ROOT,               elSentText,   elSentROffsets   ),
            new TBItem( BreakIterator.KIND_CHARACTER, new ULocale("th"),          thCharText,   thCharTOffsets   ),
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java
@ -51,7 +51,6 @@ static class TestParams {


 public void TestExtended() {
-
    TestParams     tp = new TestParams();


@ -434,6 +433,7 @@ void executeTest(TestParams t) {
       }
    }

+    
    //
    //  Run the iterator backwards, verify that the same breaks are found.
    //
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
@ -1,6 +1,6 @@
 /*
 *******************************************************************************
- * Copyright (C) 2003-2011 International Business Machines Corporation and
+ * Copyright (C) 2003-2012 International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */
@ -264,15 +264,19 @@ public class RBBITestMonkey extends TestFmwk {
        UnicodeSet                fExtendSet;
        UnicodeSet                fExtendNumLetSet;
        UnicodeSet                fOtherSet;
+        
+        UnicodeSet                fDictionaryCjkSet;

        
        RBBIWordMonkey() {
            fCharProperty    = UProperty.WORD_BREAK;

+            fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]");
            fCRSet           = new UnicodeSet("[\\p{Word_Break = CR}]");
            fLFSet           = new UnicodeSet("[\\p{Word_Break = LF}]");
            fNewlineSet      = new UnicodeSet("[\\p{Word_Break = Newline}]");
            fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}]");
+            fALetterSet.removeAll(fDictionaryCjkSet);
            fKatakanaSet     = new UnicodeSet("[\\p{Word_Break = Katakana}]");
            fMidNumLetSet    = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
            fMidLetterSet    = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
@ -297,13 +301,14 @@ public class RBBITestMonkey extends TestFmwk {
            fOtherSet.removeAll(fExtendNumLetSet);
            // Inhibit dictionary characters from being tested at all.
            fOtherSet.removeAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]"));
+            fOtherSet.removeAll(fDictionaryCjkSet);

            fSets            = new ArrayList();
            fSets.add(fCRSet);
            fSets.add(fLFSet);
            fSets.add(fNewlineSet);
            fSets.add(fALetterSet);
-            fSets.add(fKatakanaSet);
+            //fSets.add(fKatakanaSet); // TODO: work out how to test katakana
            fSets.add(fMidLetterSet);
            fSets.add(fMidNumLetSet);
            fSets.add(fMidNumSet);
@ -1484,7 +1489,6 @@ public class RBBITestMonkey extends TestFmwk {
    /**
     * return the index of the next code point in the input text.
     * @param i the preceding index
-     * @return
     */
    static int  nextCP(StringBuffer s, int i) {
        if (i == -1) {
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/SimpleBITest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/SimpleBITest.java
@ -1,19 +1,15 @@
 /*
 *******************************************************************************
- * Copyright (C) 1996-2006, International Business Machines Corporation and    *
+ * Copyright (C) 1996-2012, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
 package com.ibm.icu.dev.test.rbbi;

-import java.io.IOException;
-import java.io.InputStream;
 import java.util.ListResourceBundle;
-import java.util.MissingResourceException;

 import com.ibm.icu.dev.test.TestFmwk;
 import com.ibm.icu.text.BreakIterator;
-import com.ibm.icu.text.DictionaryBasedBreakIterator;
 import com.ibm.icu.text.RuleBasedBreakIterator;

 // TODO: {dlf} this test currently doesn't test anything!
@ -160,30 +156,12 @@ public class SimpleBITest extends TestFmwk{
            "Character", "Word", "Line", "Sentence"
        };
        String rulesName = kindNames[kind] + "BreakRules";
-        String dictionaryName = kindNames[kind] + "BreakDictionary";
        
        String[] classNames = bundle.getStringArray("BreakIteratorClasses");
        String rules = bundle.getString(rulesName);
        if (classNames[kind].equals("RuleBasedBreakIterator")) {
            iter = new RuleBasedBreakIterator(rules);
        }
-        else if (classNames[kind].equals("DictionaryBasedBreakIterator")) {
-            try {
-                String dictionaryPath = bundle.getString(dictionaryName);
-                InputStream dictionary = bundle.getClass().getResourceAsStream(dictionaryPath);
-                System.out.println("looking for " + dictionaryPath + " from " + bundle.getClass() + " returned " + dictionary);
-                iter = new DictionaryBasedBreakIterator(rules, dictionary);
-            }
-            catch(IOException e) {
-                e.printStackTrace();
-                errln(e.getMessage());
-                System.out.println(e); // debug
-            }
-            catch(MissingResourceException e) {
-                errln(e.getMessage());
-                System.out.println(e); // debug
-            }
-        }
        if (iter == null) {
            errln("could not create iterator");
        }
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
@ -33,9 +33,8 @@


 #   Temp debugging tests 
-<locale en>
 <line>
-<data>•Hello, •World.•</data>
+<data>•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udcfb•</data>

 ########################################################################################
 #
@ -171,7 +170,14 @@
 <data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data>

 # Hiragana & Katakana stay together, but separates from each other and Latin.
-<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data>
+# *** what to do about theoretical combos of chars? i.e. hiragana + accent
+#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<400>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<400>\N{HIRAGANA ITERATION MARK}<400>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<400>def<200>#•</data>
+
+# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth
+<data>•芽キャベツ<400>芽キャﾍﾞツ<400></data>
+
+# Testing of word boundary for dictionary word containing both kanji and kana
+<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data>

 # Words with interior formatting characters
 <data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data>
@ -179,7 +185,6 @@
 # to test for bug #4097779
 <data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>

-
 #      to test for bug #4098467
 #      What follows is a string of Korean characters (I found it in the Yellow Pages
 #      ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
@ -188,9 +193,14 @@
 #      precomposed syllables...
 <data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data>

-<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data>
+# more Korean tests (Jamo not tested here, not counted as dictionary characters)
+# They are disabled for now because we don't include a Korean dictionary.
+#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<200>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data>
+#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2dd<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200> •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data>
+#<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</data>
+
+<data>•\u06c9<200>\uc799<200>\ufffa•</data>

-<data>•\u06c9\uc799\ufffa<200></data>

 #      
 #      Try some words from other scripts.
@ -507,8 +517,7 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
 <data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c•</data>

 #      conjoining jamo...
-#      TODO:  rules update needed
-#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data>
+<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data>

 #      to test for bug #4117554: Fullwidth .!? should be treated as postJwrd
 <data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data>
@ -572,17 +581,17 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
 # Test data originally from the test code source file
 #      // @suwit -- Thai sample data from GVT Guideline
 #
-#<data>•\u0E2B\u0E19\u0E36\u0E48\u0E07<200>\u0E04\u0E33<200>\u0E44\u0E17\u0E22<200>\
-#\u0E2A\u0E32\u0E21\u0E32\u0E23\u0E16<200>\u0E1B\u0E23\u0E30\u0E01\u0E2D\u0E1A<200>\
-#\u0E14\u0E49\u0E27\u0E22<200>\u0e2b\u0e25\u0e32\u0e22<200>\
-#\u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c<200></data>
-#
-## Test data originally from http://bugs.icu-project.org/trac/search?q=r30327
-#<data>•กู<200> •กิน<200>กุ้ง<200> •ปิ้่<200>งอ<200>ยู่<200>ใน<200>ถ้ำ<200></data>
-#
-#<data>•\u0E01\u0E39<200>\u0020•\u0E01\u0E34\u0E19<200>\u0E01\u0E38\u0E49\u0E07<200>\
-#\u0020•\u0E1B\u0E34\u0E49\u0E48<200>\u0E07\u0E2D<200>\u0E22\u0E39\u0E48<200>\
-#\u0E43\u0E19<200>\u0E16\u0E49\u0E33<200></data>
+<data>•\u0E2B\u0E19\u0E36\u0E48\u0E07<200>\u0E04\u0E33<200>\u0E44\u0E17\u0E22<200>\
+\u0E2A\u0E32\u0E21\u0E32\u0E23\u0E16<200>\u0E1B\u0E23\u0E30\u0E01\u0E2D\u0E1A<200>\
+\u0E14\u0E49\u0E27\u0E22<200>\u0e2b\u0e25\u0e32\u0e22<200>\
+\u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c<200></data>
+
+# Test data originally from http://bugs.icu-project.org/trac/search?q=r30327
+<data>•กู<200> •กิน<200>กุ้ง<200> •ปิ้่<200>งอ<200>ยู่<200>ใน<200>ถ้ำ<200></data>
+
+<data>•\u0E01\u0E39<200>\u0020•\u0E01\u0E34\u0E19<200>\u0E01\u0E38\u0E49\u0E07<200>\
+\u0020•\u0E1B\u0E34\u0E49\u0E48<200>\u0E07\u0E2D<200>\u0E22\u0E39\u0E48<200>\
+\u0E43\u0E19<200>\u0E16\u0E49\u0E33<200></data>

 <line>
 <data>•0E01\u0E39\u0020•\u0E01\u0E34\u0E19•\u0E01\u0E38\u0E49\u0E07\
@ -619,22 +628,22 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
 #   @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters start
 #
 <line>
-#<data>•\u0E1B\u0E35•\
-#\u0E1E\u0E38\u0E17\u0E18\u0E28\u0E31\u0E01\u0E23\u0E32\u0E0A •\
-#2545 •\
-#\u0E40\u0E1B\u0E47\u0E19•\
-#\u0E1B\u0E35•\
-#\u0E09\u0E25\u0E2D\u0E07•\
-#\u0E04\u0E23\u0E1A•\
-#\u0E23\u0E2D\u0E1A •\
-#\"\u0E52\u0E52\u0E50 •\
-#\u0E1b\u0E35\" •\
-#\u0E02\u0E2d\u0E07•\
-#\u0E01\u0E23\u0E38\u0E07•\
-#\u0E23\u0E31\u0E15\u0E19\u0E42\u0E01\u0E2A\u0E34\u0E19\u0E17\u0E23\u0E4C •\
-#(\u0E01\u0E23\u0E38\u0E07\u0E40\u0E17\u0E1e\u0E2F•\
-#\u0E2B\u0E23\u0E37\u0E2D •\
-#Bangkok)•</data>
+<data>•\u0E1B\u0E35•\
+\u0E1E\u0E38\u0E17\u0E18•\u0E28\u0E31\u0E01\u0E23\u0E32\u0E0A •\
+2545 •\
+\u0E40\u0E1B\u0E47\u0E19•\
+\u0E1B\u0E35•\
+\u0E09\u0E25\u0E2D\u0E07•\
+\u0E04\u0E23\u0E1A•\
+\u0E23\u0E2D\u0E1A •\
+\"\u0E52\u0E52\u0E50 •\
+\u0E1b\u0E35\" •\
+\u0E02\u0E2d\u0E07•\
+\u0E01\u0E23\u0E38\u0E07•\
+\u0E23\u0E31\u0E15\u0E19•\u0E42\u0E01•\u0E2A\u0E34•\u0E19\u0E17\u0E23\u0E4C •\
+(\u0E01\u0E23\u0E38\u0E07\u0E40\u0E17\u0E1e\u0E2F\
+\u0E2B\u0E23\u0E37\u0E2D •\
+Bangkok)•</data>

 # Data originally from RBBITest::TestMaiyamok()
 #   The Thai maiyamok character is a shorthand symbol that means "repeat the previous
@ -652,58 +661,6 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
 \u0e22\u0e07•\
 \u0e43\u0e2b\u0e21\u0e48•</data>

-
-
-##########################################################################################
-#
-#   Khmer Tests
-#
-##########################################################################################
-
-# Test data originally from http://bugs.icu-project.org/trac/search?q=r30327
-#  from the file testdata/wordsegments.txt
-<locale th>
-<word>
-
-#<data>•តើ<200>លោក<200>មក<200>ពី<200>ប្រទេស<200>ណា<200></data>
-#<data>•សណ្ដូក<200>ក<200>បណ្ដែត<200>ខ្លួន<200></data>
-#<data>•ពណ៌ស<200>ម្ដេច<200>ថា<200>ខ្មៅ<200></data>
-##ប្រយោគ|ពី|របៀប|រួបរួម|និង|ភាព|ផ្សេងគ្នា|ដែល|អាច|ចូល<200></data>
-#<data>•ប្រយោគ<200>ពី<200>របៀប<200>ដែល<200>និង<200>ភាព<200>ផ្សេងគ្នា<200>ដែល<200>អាច<200>ចូល<200></data>
-##ប្រយោគ|ពី|របៀប|ជា|មួយ|និង|ភាព|ផ្សេងគ្នា|ដែល|អាច|ចូល<200></data>
-#<data>•សូម<200>ចំណាយពេល<200>បន្តិច<200>ដើម្បី<200>អធិស្ឋាន<200>អរព្រះគុណ<200>ដល់<200>ព្រះអង្គ<200></data>
-#<data>•ការ<200>ថោកទាប<200>បរិប្បូណ៌<200>ដោយ<200></data>
-#<data>•ប្រើប្រាស់<200>ស្អាត<200>ទាំង<200>ចិត្ត<200>សិស្ស<200>នោះ<200></data>
-#<data>•បើ<200>អ្នក<200>ប្រព្រឺត្ត<200>អំពើអាក្រក់<200>មុខ<200>ជា<200>មាន<200></data>
-#<data>•ប្រដាប់<200>ប្រដា<200>រ<200>រៀនសូត្រ<200>បន្ទប់<200>រៀន<200></data>
-#<data>•ដើរតួ<200>មនុស្សគ<200>ឥត<200>បញ្ចេញ<200>យោបល់<200>សោះ<200>ឡើយ<200></data>
-#<data>•មិន<200>អាច<200>ឲ្យ<200>យើង<200>ធ្វើ<200>កសិកម្ម<200>បាន<200>ឡើយ<200></data>
-#<data>•បន្ត<200>សេចក្ត<200>ទៅទៀត<200></data>
-#<data>•ក្រុម<200>ប៉ូលិស<200>បណ្តាក់<200>គ្នា<200></data>
-#<data>•គ្មាន<200>សុខ<200>សំរាន្ត<200>ដង<200>ណា<200></data>
-#<data>•បាន<200>សុខភាព<200>បរិប្បូណ៌<200></data>
-#<data>•ជា<200>មេចោរ<200>ខ្ញុំ<200>នឹង<200>ស្លាប់<200>ទៅវិញ<200>ជា<200>មេចោរ<200></data>
-#<data>•ឯ<200>ការ<200>វាយ<200>ផ្ចាល<200>ដែល<200>នាំ<200></data>
-#<data>•គេ<200>ដឹក<200>ទៅ<200>សំឡាប់<200></data>
-##អ្នក|ដែល|ជា|មន្ត្រី|ធំ|លើ|គាត់|ទេ<200></data>
-#<data>•យក<200>ទៅ<200>សម្លាប់ចោល<200>ស្ងាត់<200></data>
-#<data>•ត្រូវ<200>បាន<200>គេ<200>សម្លាប់<200></data>
-#<data>•នៅក្នុង<200>ស្រុក<200>ខ្ល<200>ងហ្ស៊ុន<200></data>
-
-
-#
-#  Jitterbug 3671 Test Case
-#
-#<data>•สวัสดี<200>ครับ<200>สบาย<200>ดี<200>ไหม<200> •ครับ<200></data>
-
-#
-#  Trac ticket 5595 Test Case
-#<data>•บท<200>ที่๑พายุ<200>ไซโคลน<200>โด<200>โรธี<200>อาศัย<200>อยู่<200>ท่ามกลาง<200>\
-#ทุ่งใหญ่<200>ใน<200>แคนซัส<200>กับ<200>ลุง<200>เฮ<200>นรี<200>ชาวไร่<200>และ<200>ป้า<200>เอ็ม<200>\
-#ภรรยา<200>ชาวไร่<200>บ้าน<200>ของ<200>พวก<200>เขา<200>หลัง<200>เล็ก<200>เพราะ<200>ไม้<200>\
-#สร้าง<200>บ้าน<200>ต้อง<200>ขน<200>มา<200>ด้วย<200>เกวียน<200>เป็น<200>ระยะ<200>ทาง<200>หลาย<200>\
-#ไมล์<200></data>
-
 ####################################################################################
 #
 #  Tailored (locale specific) breaking.
@ -714,7 +671,7 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal

 <locale ja>
 <line>
-<data>•\u3041•\u3043•\u3045•\u31f1•</data>
+<data>•\u3041\u3043\u3045\u31f1•</data>
 <locale en>
 <line>
 <data>•\u3041\u3043\u3045\u31f1•</data>
@ -722,19 +679,20 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
 # The following data was originally in RBBITest::TestJapaneseWordBreak()
 <locale ja>
 <word>
-<data>•\u4ECA\u65E5<400>\u306F\u3044\u3044<300>\u5929\u6C17<400>\u3067\u3059\u306D<300>\u3002•\u000D\u000A•</data>
+<data>•\u4ECA\u65E5<400>\u306F<400>\u3044\u3044<400>\u5929\u6C17<400>\u3067\u3059<400>\u306D<400>\u3002•\u000D\u000A•</data>

 # UBreakIteratorType UBRK_WORD, Locale "ja"
 # Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
 # \u79C1\u9054\u306B\u4E00\u3007\u3007\u3007\u306E\u30B3\u30F3\u30D4\u30E5\u30FC\u30BF\u304C\u3042\u308B\u3002\u5948\u3005\u306F\u30EF\u30FC\u30C9\u3067\u3042\u308B\u3002
+# Modified to work with the dbbi code; the expected results below still need to be verified.

 <locale ja>
 <word>
-<data>•私達<400>に<300>一〇〇〇<400>の<300>コンピュータ<300>がある<300>。<0>奈々<400>は<300>ワード<300>である<300>。•</data>
+<data>•私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈<400>々<400>は<400>ワ<400>ー<400>ドで<400>あ<400>る<400>。•</data>

 <locale root>
 <word>
-<data>•私<400>達<400>に<300>一<400>〇<400>〇<400>〇<400>の<300>コンピュータ<300>が<300>あ<300>る<300>。<0>奈<400>々<200>は<300>ワード<300>で<300>あ<300>る<300>。•</data>
+<data>•私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈<400>々<400>は<400>ワ<400>ー<400>ドで<400>あ<400>る<400>。•</data>

 # UBreakIteratorType UBRK_SENTENCE, Locale "el"
 # Add break after Greek question mark (cldrbug #2069).
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ICUResourceBundleTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ICUResourceBundleTest.java
@ -474,18 +474,6 @@ public final class ICUResourceBundleTest extends TestFmwk {
                errln("Did not get the expected output for referencingalias");
            }
        }
-        {
-            rb = (UResourceBundle)UResourceBundle.getBundleInstance("com/ibm/icu/dev/data/testdata","testaliases",testLoader);
-            sub = rb.get("boundaries");
-            String word = sub.getString("word");
-
-            if(word.equals("word_ja.brk")){
-                logln("Got the expected output for boundaries/word");
-            }else{
-                errln("Did not get the expected type for boundaries/word");
-            }
-
-        }
        {
            UResourceBundle rb1 = (UResourceBundle)UResourceBundle.getBundleInstance("com/ibm/icu/dev/data/testdata","testaliases",testLoader);
            if(rb1!=rb){
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java
@ -104,23 +104,6 @@ public class ULocaleTest extends TestFmwk {
    }
    */

-    public void TestBreakIterator() {
-        checkService("ja_JP_OSAKA", new ServiceFacade() {
-                public Object create(ULocale req) {
-                    return BreakIterator.getWordInstance(req);
-                }
-            }, null, new Registrar() {
-                    public Object register(ULocale loc, Object prototype) {
-                        return BreakIterator.registerInstance(
-                                                              (BreakIterator) prototype,
-                                                              loc, BreakIterator.KIND_WORD);
-                    }
-                    public boolean unregister(Object key) {
-                        return BreakIterator.unregister(key);
-                    }
-                });
-    }
-
    public void TestDateFormat() {
        checkService("de_CH_ZURICH", new ServiceFacade() {
                public Object create(ULocale req) {