ICU-45 Initial check-in of RuleBasedBreakIterator and DictionaryBasedBreakIterator.

X-SVN-Rev: 502
2000-01-08 02:05:05 +00:00 · 2000-01-08 02:05:05 +00:00 · 016aa963f6
commit 016aa963f6
parent bbccafffa4
27 changed files with 3823 additions and 1795 deletions
--- a/icu4c/source/common/schriter.cpp
+++ b/icu4c/source/common/schriter.cpp
@ -190,6 +190,15 @@ StringCharacterIterator::getIndex() const
  return pos;
 }

+/**
+ * Points the iterator at a new string: assigns newText to the iterator's
+ * text, makes the iteration range run from 0 to the length of newText,
+ * and moves the current position back to the start of that range.
+ */
+void
+StringCharacterIterator::setText(const UnicodeString& newText)
+{
+    text  = newText;
+    end   = newText.length();
+    begin = 0;
+    pos   = 0;    // i.e. the new value of "begin"
+}
+
 void
 StringCharacterIterator::getText(UnicodeString& result)
 {
--- a/icu4c/source/common/uchriter.cpp
+++ b/icu4c/source/common/uchriter.cpp
@ -142,6 +142,15 @@ UCharCharacterIterator::getIndex() const
    return pos;
 }

+/**
+ * Sets the iterator to iterate over a new range of text.  The iteration
+ * range becomes [0, newTextLength) and the current position is moved to
+ * the beginning of that range.  A null newText or a negative
+ * newTextLength is treated as empty text.
+ * @param newText       The text to iterate over.
+ * @param newTextLength The length of newText; becomes the end of the
+ *                      iteration range.
+ */
+void UCharCharacterIterator::setText(const UChar* newText,
+                                     int32_t      newTextLength)
+{
+    // a null buffer or a negative length would leave the iterator with a
+    // range it can't safely walk, so collapse both cases to an empty range
+    if (newText == 0 || newTextLength < 0) {
+        newTextLength = 0;
+    }
+
+    text = newText;
+    begin = 0;
+    end = newTextLength;
+    pos = begin;
+}
+
 void
 UCharCharacterIterator::getText(UnicodeString& result)
 {
--- a/icu4c/source/common/unicode/schriter.h
+++ b/icu4c/source/common/unicode/schriter.h
@ -139,6 +139,11 @@ public:
   * returned by current()).  */
  virtual UTextOffset      getIndex(void) const;

+  /**
+   * Sets the iterator to iterate over the provided string.  The iteration
+   * range is reset to cover the entire string, and the current position
+   * is moved to the beginning of that range.
+   * @param newText The string to iterate over.
+   */
+  virtual void             setText(const UnicodeString& newText);
+  
  /**
   * Copies the UnicodeString under iteration into the UnicodeString
   * referred to by "result".  Even if this iterator iterates across
--- a/icu4c/source/common/unicode/uchriter.h
+++ b/icu4c/source/common/unicode/uchriter.h
@ -108,6 +108,12 @@ public:
   * returned by current()).  */
  virtual UTextOffset      getIndex(void) const;

+  /**
+   * Sets the iterator to iterate over a new range of text.  The iteration
+   * range is reset to run from 0 to newTextLength, and the current
+   * position is moved to the beginning of that range.
+   * @param newText       The text to iterate over.
+   * @param newTextLength The length of newText; becomes the end of the
+   *                      iteration range.
+   */
+  virtual void             setText(const UChar* newText,
+                                   int32_t newTextLength);
+  
  /**
   * Copies the UnicodeString under iteration into the UnicodeString
   * referred to by "result".  Even if this iterator iterates across
--- a/icu4c/source/common/uvector.cpp
+++ b/icu4c/source/common/uvector.cpp
@ -49,7 +49,7 @@ UVector::~UVector() {
 }

 void UVector::addElement(void* obj) {
-    if (ensureCapacity(count+1)) {
+    // append obj only when ensureCapacity() reports room for one more
+    // element; otherwise the vector is left untouched
+    if (ensureCapacity(count + 1)) {
        elements[count++] = obj;
    }
 }
@ -66,7 +66,7 @@ void UVector::setElementAt(void* obj, int32_t index) {

 void UVector::insertElementAt(void* obj, int32_t index) {
    // must have 0 <= index <= count
-    if (0 <= index && index <= count && ensureCapacity(count)) {
+    if (0 <= index && index <= count && ensureCapacity(count + 1)) {
        for (int32_t i=count; i>index; --i) {
            elements[i] = elements[i-1];
        }
--- a/icu4c/source/common/uvector.h
+++ b/icu4c/source/common/uvector.h
@ -227,15 +227,6 @@ inline void* UVector::operator[](int32_t index) const {
    return elementAt(index);
 }

-// Dummy implementation - disallowed method
-inline UVector::UVector(const UVector&) {}
-
-// Dummy implementation - disallowed method
-inline UVector& UVector::operator=(const UVector&) {
-    return *this;
-}
-
-
 // UStack inlines

 inline bool_t UStack::empty(void) const {
@ -251,12 +242,4 @@ inline void* UStack::push(void* obj) {
    return obj;
 }

-// Dummy implementation - disallowed method
-inline UStack::UStack(const UStack&) {}
-
-// Dummy implementation - disallowed method
-inline UStack& UStack::operator=(const UStack&) {
-    return *this;
-}
-
 #endif
--- a/icu4c/source/i18n/brkiter.cpp
+++ b/icu4c/source/i18n/brkiter.cpp
@ -17,9 +17,10 @@
 // This file was generated from the java source file BreakIterator.java
 // *****************************************************************************

-#include "unicode/utypes.h"
+#include "dbbi.h"
 #include "unicode/brkiter.h"
-#include "simtxbd.h"
+#include "unicode/udata.h"
+#include "resbund.h"

 #include <string.h>

@ -38,7 +39,41 @@ const UTextOffset BreakIterator::DONE = (int32_t)-1;
 BreakIterator*
 BreakIterator::createWordInstance(const Locale& key)
 {
-    return new SimpleTextBoundary(&TextBoundaryData::kWordBreakData);
+    // Builds a word-break iterator for "key" from the "word" (or, for Thai,
+    // "word_th") break-rules data.  Returns NULL if the data can't be opened
+    // or has no memory image.
+    //
+    // WARNING: This routine is currently written specifically to handle only the
+    // default rules files and the alternate rules files for Thai.  This function
+    // will have to be made fully general at some time in the future!
+    BreakIterator* result = NULL;
+    const char* filename = "word";
+
+    // Thai is the only language that gets its own rules file and a dictionary;
+    // work that out once and reuse the answer below
+    UnicodeString temp;
+    const bool_t isThai = (key.getLanguage(temp) == UnicodeString("th", (char*)0));
+    if (isThai) {
+        filename = "word_th";
+    }
+
+    // NOTE(review): "file" is never closed in this function; presumably the
+    // returned iterator keeps reading from the image -- confirm who owns it
+    UErrorCode err = U_ZERO_ERROR;
+    UDataMemory* file = udata_open(NULL, "brk", filename, &err);
+
+    if (!U_FAILURE(err)) {
+        const void* image = udata_getMemory(file);
+
+        if (image != NULL) {
+            if (isThai) {
+                // the dictionary is addressed by full path: the data
+                // directory followed by the dictionary's file name
+                const char* dataDir = u_getDataDirectory();
+                filename = "thaidict.brk";
+                char* fullPath = new char[strlen(dataDir) + strlen(filename) + 1];
+                strcpy(fullPath, dataDir);
+                // append rather than overwrite, so the directory prefix
+                // survives (the buffer was sized for both pieces)
+                strcat(fullPath, filename);
+
+                result = new DictionaryBasedBreakIterator(image, fullPath);
+                delete [] fullPath;
+            }
+            else {
+                result = new RuleBasedBreakIterator(image);
+            }
+        }
+    }
+
+    return result;
 }

 // -------------------------------------
@ -47,7 +82,41 @@ BreakIterator::createWordInstance(const Locale& key)
 BreakIterator*
 BreakIterator::createLineInstance(const Locale& key)
 {
-    return new SimpleTextBoundary(&TextBoundaryData::kLineBreakData);
+    // Produces a line-break iterator for "key", or NULL when the break-rules
+    // data can't be opened or has no memory image.
+    //
+    // WARNING: only the default rules file and the alternate rules file for
+    // Thai are handled here; this will have to be made fully general later.
+    UnicodeString language;
+    const bool_t isThai = (key.getLanguage(language) == UnicodeString("th", (char*)0));
+
+    UErrorCode status = U_ZERO_ERROR;
+    UDataMemory* data = udata_open(NULL, "brk", isThai ? "line_th" : "line", &status);
+    if (U_FAILURE(status)) {
+        return NULL;
+    }
+
+    const void* image = udata_getMemory(data);
+    if (image == NULL) {
+        return NULL;
+    }
+
+    // every language other than Thai is served by the rules alone
+    if (!isThai) {
+        return new RuleBasedBreakIterator(image);
+    }
+
+    // Thai also needs the dictionary, addressed as <data directory><file name>
+    const char* dictionaryName = "thaidict.brk";
+    const char* dataDir = u_getDataDirectory();
+    char* fullPath = new char[strlen(dataDir) + strlen(dictionaryName) + 1];
+    strcpy(fullPath, dataDir);
+    strcat(fullPath, dictionaryName);
+
+    BreakIterator* result = new DictionaryBasedBreakIterator(image, fullPath);
+    delete [] fullPath;
+    return result;
 }

 // -------------------------------------
@ -56,7 +125,24 @@ BreakIterator::createLineInstance(const Locale& key)
 BreakIterator*
 BreakIterator::createCharacterInstance(const Locale& key)
 {
-    return new SimpleTextBoundary(&TextBoundaryData::kCharacterBreakData);
+    // Produces a character-break iterator from the "char" rules data, or NULL
+    // when that data can't be opened or has no memory image.
+    //
+    // WARNING: "key" is not consulted here -- every locale currently gets the
+    // default rules file.  This will have to be made fully general later.
+    UErrorCode status = U_ZERO_ERROR;
+    UDataMemory* data = udata_open(NULL, "brk", "char", &status);
+    if (U_FAILURE(status)) {
+        return NULL;
+    }
+
+    const void* image = udata_getMemory(data);
+    if (image == NULL) {
+        return NULL;
+    }
+
+    return new RuleBasedBreakIterator(image);
 }

 // -------------------------------------
@ -65,7 +151,24 @@ BreakIterator::createCharacterInstance(const Locale& key)
 BreakIterator*
 BreakIterator::createSentenceInstance(const Locale& key)
 {
-    return new SimpleTextBoundary(&TextBoundaryData::kSentenceBreakData);
+    // Produces a sentence-break iterator from the "sent" rules data, or NULL
+    // when that data can't be opened or has no memory image.
+    //
+    // WARNING: "key" is not consulted here -- every locale currently gets the
+    // default rules file.  This will have to be made fully general later.
+    UErrorCode status = U_ZERO_ERROR;
+    UDataMemory* data = udata_open(NULL, "brk", "sent", &status);
+    if (U_FAILURE(status)) {
+        return NULL;
+    }
+
+    const void* image = udata_getMemory(data);
+    if (image == NULL) {
+        return NULL;
+    }
+
+    return new RuleBasedBreakIterator(image);
 }

 // -------------------------------------
--- a/icu4c/source/i18n/dbbi.cpp
+++ b/icu4c/source/i18n/dbbi.cpp
@ -0,0 +1,439 @@
+/*
+**********************************************************************
+*   Copyright (C) 1999 IBM Corp. All rights reserved.
+**********************************************************************
+*   Date        Name        Description
+*   12/1/99    rgillam     Complete port from Java.
+**********************************************************************
+*/
+
+#include "dbbi.h"
+#include "dbbi_tbl.h"
+#include "uvector.h"
+
+char DictionaryBasedBreakIterator::fgClassID = 0;
+
+//=======================================================================
+// constructors
+//=======================================================================
+
+/**
+ * Constructs a DictionaryBasedBreakIterator.  The base class is constructed
+ * with a null tables image; the iterator's tables are instead a newly created
+ * DictionaryBasedBreakIteratorTables built from the two arguments, on which
+ * this iterator takes one reference.  The break-position cache starts empty.
+ * @param tablesImage        Passed through to the tables object.
+ * @param dictionaryFilename Passed through to the tables object.
+ */
+DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(const void* tablesImage,
+                                                           char* dictionaryFilename)
+    : RuleBasedBreakIterator((const void*)NULL)
+    , dictionaryCharCount(0)
+    , cachedBreakPositions(NULL)
+    , numCachedBreakPositions(0)
+    , positionInCache(0)
+{
+    tables = new DictionaryBasedBreakIteratorTables(tablesImage, dictionaryFilename);
+    tables->addReference();
+}
+
+//=======================================================================
+// boilerplate
+//=======================================================================
+
+/**
+ * Destructor.  Frees the array of cached break positions, if there is one.
+ */
+DictionaryBasedBreakIterator::~DictionaryBasedBreakIterator()
+{
+    delete [] cachedBreakPositions;
+}
+
+/**
+ * Assignment operator.  Sets this iterator to have the same behavior,
+ * and iterate over the same text, as the one passed in.  Any cached
+ * break positions held by this iterator are discarded; the other
+ * iterator's cache is not copied.  Assigning an iterator to itself
+ * is a no-op.
+ * @param that The iterator to copy from
+ * @return A reference to this iterator
+ */
+DictionaryBasedBreakIterator&
+DictionaryBasedBreakIterator::operator=(const DictionaryBasedBreakIterator& that) {
+    // self-assignment must not throw away our own cache and position
+    if (this == &that) {
+        return *this;
+    }
+
+    // the cached break positions belong to the state we're about to replace,
+    // so drop them before taking on the other iterator's state
+    reset();
+    RuleBasedBreakIterator::operator=(that);
+    return *this;
+}
+
+/**
+ * Returns a newly-constructed DictionaryBasedBreakIterator, made with the
+ * copy constructor from this one.  The caller receives a pointer to a
+ * heap-allocated object.
+ *
+ * NOTE(review): no copy constructor is defined in this file.  If the class
+ * relies on the compiler-generated one, cachedBreakPositions would be
+ * shared between the two objects and freed twice -- confirm against the
+ * class declaration.
+ */
+BreakIterator*
+DictionaryBasedBreakIterator::clone() const {
+    return new DictionaryBasedBreakIterator(*this);
+}
+
+//=======================================================================
+// BreakIterator overrides
+//=======================================================================
+
+/**
+ * Moves the iterator back to the boundary preceding the current one.
+ * @return The position of the last boundary position before the
+ * current iteration position
+ */
+int32_t
+DictionaryBasedBreakIterator::previous()
+{
+    // without a cache, or when already sitting on the cache's first entry,
+    // throw the cache away and let the inherited previous() do the work.
+    // That call may rebuild the cache, in which case we have to record
+    // where in the new cache we ended up.
+    if (cachedBreakPositions == NULL || positionInCache <= 0) {
+        reset();
+        int32_t inherited = RuleBasedBreakIterator::previous();
+        if (cachedBreakPositions != NULL) {
+            positionInCache = numCachedBreakPositions - 2;
+        }
+        return inherited;
+    }
+
+    // still inside the range covered by the cache: step back one entry
+    --positionInCache;
+    const int32_t boundary = cachedBreakPositions[positionInCache];
+    text->setIndex(boundary);
+    return boundary;
+}
+
+/**
+ * Moves the iterator to the last boundary position that comes before
+ * the specified position.
+ * @param offset The position to begin searching from
+ * @return The position of the last boundary before "offset".  DONE is
+ * returned if there is no text or "offset" is past the end of the text;
+ * the text's starting offset is returned if "offset" is before it.
+ */
+int32_t
+DictionaryBasedBreakIterator::preceding(int32_t offset)
+{
+    // out-of-range requests are answered without moving the iterator
+    if (text == NULL || offset > text->endIndex()) {
+        return BreakIterator::DONE;
+    }
+    if (offset < text->startIndex()) {
+        return text->startIndex();
+    }
+
+    // the cache can only answer for offsets that lie after its first entry
+    // and no later than its last one
+    const bool_t cacheCoversOffset = cachedBreakPositions != NULL
+            && offset > cachedBreakPositions[0]
+            && offset <= cachedBreakPositions[numCachedBreakPositions - 1];
+
+    // otherwise dump the cache and defer to the inherited routine (which
+    // will eventually call other routines in this class that may refresh
+    // the cache)
+    if (!cacheCoversOffset) {
+        reset();
+        return RuleBasedBreakIterator::preceding(offset);
+    }
+
+    // find the first cached position at or beyond "offset"; the boundary
+    // we want is the entry just before that one
+    int32_t index = 0;
+    while (index < numCachedBreakPositions && cachedBreakPositions[index] < offset) {
+        ++index;
+    }
+    positionInCache = index - 1;
+    text->setIndex(cachedBreakPositions[positionInCache]);
+    return text->getIndex();
+}
+
+/**
+ * Moves the iterator to the first boundary position that comes after
+ * the specified position.
+ * @param offset The position to begin searching forward from
+ * @return The position of the first boundary after "offset".  DONE is
+ * returned if there is no text or "offset" is past the end of the text;
+ * the text's starting offset is returned if "offset" is before it.
+ */
+int32_t
+DictionaryBasedBreakIterator::following(int32_t offset)
+{
+    // out-of-range requests are answered without moving the iterator
+    if (text == NULL || offset > text->endIndex()) {
+        return BreakIterator::DONE;
+    }
+    if (offset < text->startIndex()) {
+        return text->startIndex();
+    }
+
+    // the cache can only answer for offsets from its first entry up to,
+    // but not including, its last one
+    const bool_t cacheCoversOffset = cachedBreakPositions != NULL
+            && offset >= cachedBreakPositions[0]
+            && offset < cachedBreakPositions[numCachedBreakPositions - 1];
+
+    // otherwise dump the cache and defer to the inherited following(),
+    // which will call other methods in this class that may refresh the cache
+    if (!cacheCoversOffset) {
+        reset();
+        return RuleBasedBreakIterator::following(offset);
+    }
+
+    // the answer is the first cached position strictly beyond "offset"
+    int32_t index = 0;
+    while (index < numCachedBreakPositions && cachedBreakPositions[index] <= offset) {
+        ++index;
+    }
+    positionInCache = index;
+    text->setIndex(cachedBreakPositions[positionInCache]);
+    return text->getIndex();
+}
+
+/**
+ * This is the implementation function for next().  Boundaries are served
+ * from the cache of break positions while it lasts; when it runs out, the
+ * inherited handleNext() finds the next tentative boundary and the
+ * dictionary is used to subdivide the range if it qualifies.
+ * @return The next boundary position
+ */
+int32_t
+DictionaryBasedBreakIterator::handleNext()
+{
+    // with no cache, or with the cache's last entry already reached, the
+    // cache has to be dumped and possibly regenerated
+    const bool_t cacheUsedUp = cachedBreakPositions == NULL
+            || positionInCache == numCachedBreakPositions - 1;
+
+    if (cacheUsedUp) {
+        // the inherited handleNext() gives a tentative return value;
+        // dictionaryCharCount is zeroed first so that afterwards it reflects
+        // only the dictionary characters passed over on the way there
+        const int32_t rangeStart = text->getIndex();
+        dictionaryCharCount = 0;
+        const int32_t tentative = RuleBasedBreakIterator::handleNext();
+
+        // the dictionary only comes into play when more than one dictionary
+        // character was passed over and the range is longer than one position
+        const bool_t needsDictionary = dictionaryCharCount > 1
+                && tentative - rangeStart > 1;
+
+        // otherwise the inherited result stands and the cache is dumped
+        if (!needsDictionary) {
+            reset();
+            return tentative;
+        }
+
+        // regenerate the cached break positions for the new range
+        divideUpDictionaryRange(rangeStart, tentative);
+    }
+
+    if (cachedBreakPositions == NULL) {
+        return -9999;   // SHOULD NEVER GET HERE!
+    }
+
+    // the cache was regenerated (or existed all along): advance to the
+    // next break position in it and return that
+    ++positionInCache;
+    const int32_t boundary = cachedBreakPositions[positionInCache];
+    text->setIndex(boundary);
+    return boundary;
+}
+
+/**
+ * Discards the cached break positions and returns the cache bookkeeping
+ * (entry count, position within the cache, and the dictionary character
+ * count) to zero.
+ */
+void
+DictionaryBasedBreakIterator::reset()
+{
+    positionInCache = 0;
+    dictionaryCharCount = 0;
+    numCachedBreakPositions = 0;
+
+    delete [] cachedBreakPositions;
+    cachedBreakPositions = NULL;
+}
+
+/**
+ * This is the function that actually implements the dictionary-based
+ * algorithm.  Given the endpoints of a range of text, it uses the
+ * dictionary to determine the positions of any boundaries in this
+ * range.  It stores all the boundary positions it discovers in
+ * cachedBreakPositions so that we only have to do this work once
+ * for each time we enter the range.  Any cache that already exists
+ * when this function is called is freed and replaced.
+ * @param startPos The beginning of the range to divide; it becomes the
+ * first entry in the cache
+ * @param endPos The end of the range to divide; it becomes the last
+ * entry in the cache
+ */
+void
+DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t endPos)
+{
+    // to avoid casts throughout the rest of this function
+    DictionaryBasedBreakIteratorTables* tables
+            = (DictionaryBasedBreakIteratorTables*)(this->tables);
+
+    // the range we're dividing may begin or end with non-dictionary characters
+    // (i.e., for line breaking, we may have leading or trailing punctuation
+    // that needs to be kept with the word).  Seek from the beginning of the
+    // range to the first dictionary character
+    // NOTE(review): this loop is not bounded by endPos; it appears to rely on
+    // the caller only passing ranges that contain dictionary characters
+    text->setIndex(startPos);
+    UChar c = text->current();
+    int category = tables->lookupCategory(c, this);
+    while (category == IGNORE || !tables->categoryFlags[category]) {
+        c = text->next();
+        category = tables->lookupCategory(c, this);
+    }
+
+
+    // initialize.  We maintain two stacks: currentBreakPositions contains
+    // the list of break positions that will be returned if we successfully
+    // finish traversing the whole range now.  possibleBreakPositions lists
+    // all other possible word ends we've passed along the way.  (Whenever
+    // we reach an error [a sequence of characters that can't begin any word
+    // in the dictionary], we back up, possibly delete some breaks from
+    // currentBreakPositions, move a break from possibleBreakPositions
+    // to currentBreakPositions, and start over from there.  This process
+    // continues in this way until we either successfully make it all the way
+    // across the range, or exhaust all of our combinations of break
+    // positions.) wrongBreakPositions is used to keep track of paths we've
+    // tried on previous iterations.  As the iterator backs up further and
+    // further, this saves us from having to follow each possible path
+    // through the text all the way to the error (hopefully avoiding many
+    // future recursive calls as well).
+    UStack currentBreakPositions;
+    UStack possibleBreakPositions;
+    UVector wrongBreakPositions;
+
+    // the dictionary is implemented as a trie, which is treated as a state
+    // machine.  -1 represents the end of a legal word.  Every word in the
+    // dictionary is represented by a path from the root node to -1.  A path
+    // that ends in state 0 is an illegal combination of characters.
+    int16_t state = 0;
+
+    // these two variables are used for error handling.  We keep track of the
+    // farthest we've gotten through the range being divided, and the combination
+    // of breaks that got us that far.  If we use up all possible break
+    // combinations, the text contains an error or a word that's not in the
+    // dictionary.  In this case, we "bless" the break positions that got us the
+    // farthest as real break positions, and then start over from scratch with
+    // the character where the error occurred.
+    int32_t farthestEndPoint = text->getIndex();
+    UStack bestBreakPositions;
+    bool_t bestBreakPositionsInitialized = FALSE;
+
+    // initialize (we always exit the loop with a break statement)
+    c = text->current();
+    while (true) {
+
+        // if we can transition to state "-1" from our current state, we're
+        // on the last character of a legal word.  Push that position onto
+        // the possible-break-positions stack
+        if (tables->dictionary.at(state, (int32_t)0) == -1) {
+            possibleBreakPositions.push((void*)text->getIndex());
+        }
+
+        // look up the new state to transition to in the dictionary
+        state = tables->dictionary.at(state, c);
+
+        // if the character we're sitting on causes us to transition to
+        // the "end of word" state, then it was a non-dictionary character
+        // and we've successfully traversed the whole range.  Drop out
+        // of the loop.
+        if (state == -1) {
+            currentBreakPositions.push((void*)text->getIndex());
+            break;
+        }
+
+        // if the character we're sitting on causes us to transition to
+        // the error state, or if we've gone off the end of the range
+        // without transitioning to the "end of word" state, we've hit
+        // an error...
+        else if (state == 0 || text->getIndex() >= endPos) {
+
+            // if this is the farthest we've gotten, take note of it in
+            // case there's an error in the text
+            if (text->getIndex() > farthestEndPoint) {
+                farthestEndPoint = text->getIndex();
+                bestBreakPositions.removeAllElements();
+                bestBreakPositionsInitialized = TRUE;
+                for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
+                    bestBreakPositions.push(currentBreakPositions.elementAt(i));
+                }
+            }
+
+            // wrongBreakPositions is a list of all break positions we've tried starting
+            // that didn't allow us to traverse all the way through the text.  Every time
+            // we pop a break position off of currentBreakPositions, we put it into
+            // wrongBreakPositions to avoid trying it again later.  If we make it to this
+            // spot, we're either going to back up to a break in possibleBreakPositions
+            // and try starting over from there, or we've exhausted all possible break
+            // positions and are going to do the fallback procedure.  This loop prevents
+            // us from messing with anything in possibleBreakPositions that didn't work as
+            // a starting point the last time we tried it (this is to prevent a bunch of
+            // repetitive checks from slowing down some extreme cases)
+            while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
+                        possibleBreakPositions.peek())) {
+                possibleBreakPositions.pop();
+            }
+
+            // if we've used up all possible break-position combinations, there's
+            // an error or an unknown word in the text.  In this case, we start
+            // over, treating the farthest character we've reached as the beginning
+            // of the range, and "blessing" the break positions that got us that
+            // far as real break positions
+            if (possibleBreakPositions.isEmpty()) {
+                if (bestBreakPositionsInitialized) {
+                    currentBreakPositions.removeAllElements();
+                    for (int32_t i = 0; i < bestBreakPositions.size(); i++) {
+                        currentBreakPositions.push(bestBreakPositions.elementAt(i));
+                    }
+                    bestBreakPositions.removeAllElements();
+                    if (farthestEndPoint < endPos) {
+                        text->setIndex(farthestEndPoint + 1);
+                    }
+                    else {
+                        break;
+                    }
+                }
+                else {
+                    if ((currentBreakPositions.isEmpty()
+                            || (int32_t)currentBreakPositions.peek() != text->getIndex())
+                            && text->getIndex() != startPos) {
+                        currentBreakPositions.push((void*)text->getIndex());
+                    }
+                    text->next();
+                    currentBreakPositions.push((void*)text->getIndex());
+                }
+            }
+
+            // if we still have more break positions we can try, then promote the
+            // last break in possibleBreakPositions into currentBreakPositions,
+            // and get rid of all entries in currentBreakPositions that come after
+            // it.  Then back up to that position and start over from there (i.e.,
+            // treat that position as the beginning of a new word)
+            else {
+                int32_t temp = (int32_t)possibleBreakPositions.pop();
+                void* temp2 = NULL;
+                while (!currentBreakPositions.isEmpty() && temp <
+                       (int32_t)currentBreakPositions.peek()) {
+                    temp2 = currentBreakPositions.pop();
+                    wrongBreakPositions.addElement(temp2);
+                }
+                currentBreakPositions.push((void*)temp);
+                text->setIndex((int32_t)currentBreakPositions.peek());
+            }
+
+            // re-sync "c" for the next go-round, and drop out of the loop if
+            // we've made it off the end of the range
+            c = text->current();
+            if (text->getIndex() >= endPos) {
+                break;
+            }
+        }
+
+        // if we didn't hit any exceptional conditions on this last iteration,
+        // just advance to the next character and loop
+        else {
+            c = text->next();
+        }
+    }
+
+    // dump the last break position in the list, and replace it with the actual
+    // end of the range (which may be the same character, or may be further on
+    // because the range actually ended with non-dictionary characters we want to
+    // keep with the word)
+    if (!currentBreakPositions.isEmpty()) {
+        currentBreakPositions.pop();
+    }
+    currentBreakPositions.push((void*)endPos);
+
+    // create a regular array to hold the break positions and copy
+    // the break positions from the stack to the array (in addition,
+    // our starting position goes into this array as a break position).
+    // This array becomes the cache of break positions used by next()
+    // and previous(), so this is where we actually refresh the cache.
+    // handleNext() can call us while the previous cache is still allocated
+    // (when it has just walked off the cache's last entry), so free the old
+    // array first instead of leaking it.  Deleting a null pointer is harmless.
+    delete [] cachedBreakPositions;
+    cachedBreakPositions = new int32_t[currentBreakPositions.size() + 1];
+    numCachedBreakPositions = currentBreakPositions.size() + 1;
+    cachedBreakPositions[0] = startPos;
+
+    for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
+        cachedBreakPositions[i + 1] = (int32_t)currentBreakPositions.elementAt(i);
+    }
+    positionInCache = 0;
+}
--- a/icu4c/source/i18n/dbbi.h
+++ b/icu4c/source/i18n/dbbi.h
@ -0,0 +1,201 @@
+/*
+**********************************************************************
+*   Copyright (C) 1999 IBM Corp. All rights reserved.
+**********************************************************************
+*   Date        Name        Description
+*   12/1/99    rgillam     Complete port from Java.
+**********************************************************************
+*/
+
+#ifndef DBBI_H
+#define DBBI_H
+
+#include "rbbi.h"
+
+/**
+ * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
+ * to further subdivide ranges of text beyond what is possible using just the
+ * state-table-based algorithm.  This is necessary, for example, to handle
+ * word and line breaking in Thai, which doesn't use spaces between words.  The
+ * state-table-based algorithm used by RuleBasedBreakIterator is used to divide
+ * up text as far as possible, and then contiguous ranges of letters are
+ * repeatedly compared against a list of known words (i.e., the dictionary)
+ * to divide them up into words.
+ *
+ * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
+ * but adds one more special substitution name: &lt;dictionary&gt;.  This substitution
+ * name is used to identify characters in words in the dictionary.  The idea is that
+ * if the iterator passes over a chunk of text that includes two or more characters
+ * in a row that are included in &lt;dictionary&gt;, it goes back through that range and
+ * derives additional break positions (if possible) using the dictionary.
+ *
+ * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
+ * file.  It follows a prescribed search path to locate the dictionary (right now,
+ * it looks for it in /com/ibm/text/resources in each directory in the classpath,
+ * and won't find it in JAR files, but this location is likely to change).  The
+ * dictionary file is in a serialized binary format.  We have a very primitive (and
+ * slow) BuildDictionaryFile utility for creating dictionary files, but aren't
+ * currently making it public.  Contact us for help.
+ *
+ * NOTE(review): the search-path description above (classpath, JAR files) reads as
+ * if it was carried over unchanged from the Java version of this class; it
+ * presumably does not describe how this C++ port locates the file -- confirm.
+ */
+class U_I18N_API DictionaryBasedBreakIterator : public RuleBasedBreakIterator {
+
+private:
+    /**
+     * a temporary hiding place for the number of dictionary characters in the
+     * last range passed over by next()
+     */
+    int32_t dictionaryCharCount;
+
+    /**
+     * when a range of characters is divided up using the dictionary, the break
+     * positions that are discovered are stored here, preventing us from having
+     * to use either the dictionary or the state table again until the iterator
+     * leaves this range of text
+     */
+    int32_t* cachedBreakPositions;
+
+    /**
+     * The number of elements in cachedBreakPositions
+     */
+    int32_t numCachedBreakPositions;
+
+    /**
+     * if cachedBreakPositions is not null, this indicates which item in the
+     * cache the current iteration position refers to
+     */
+    int32_t positionInCache;
+
+    /**
+     * Class ID (declared as a char; see getStaticClassID() for how it is used)
+     */
+    static char fgClassID;
+
+public:
+    //=======================================================================
+    // constructors (from a tables image and the filename of a dictionary file)
+    //=======================================================================
+
+DictionaryBasedBreakIterator(const void* tablesImage, char* dictionaryFilename);
+
+    //=======================================================================
+    // boilerplate
+    //=======================================================================
+
+    /**
+     * Destructor
+     */
+    virtual ~DictionaryBasedBreakIterator();
+
+    /**
+     * Assignment operator.  Sets this iterator to have the same behavior,
+     * and iterate over the same text, as the one passed in.
+     */
+    DictionaryBasedBreakIterator& operator=(const DictionaryBasedBreakIterator& that);
+
+    /**
+     * Returns a newly-constructed break iterator with the same
+     * behavior, and iterating over the same text, as this one.
+     * The result is handed back through a BreakIterator pointer.
+     */
+    virtual BreakIterator* clone() const;
+
+    //=======================================================================
+    // BreakIterator overrides
+    //=======================================================================
+    /**
+     * Advances the iterator backwards, to the last boundary preceding this one.
+     * @return The position of the last boundary position preceding this one.
+     */
+    virtual int32_t previous();
+
+    /**
+     * Sets the iterator to refer to the first boundary position following
+     * the specified position.
+     * @param offset The position from which to begin searching for a break position.
+     * @return The position of the first break after the current position.
+     */
+    virtual int32_t following(int32_t offset);
+
+    /**
+     * Sets the iterator to refer to the last boundary position before the
+     * specified position.
+     * @param offset The position to begin searching for a break from.
+     * @return The position of the last boundary before the starting position.
+     */
+    virtual int32_t preceding(int32_t offset);
+
+    /**
+     * Returns a unique class ID POLYMORPHICALLY.  Pure virtual override.
+     * This method is to implement a simple version of RTTI, since not all
+     * C++ compilers support genuine RTTI.  Polymorphic operator==() and
+     * clone() methods call this method.
+     *
+     * @return          The class ID for this object. All objects of a
+     *                  given class have the same class ID.  Objects of
+     *                  other classes have different class IDs.
+     */
+    virtual UClassID getDynamicClassID() const;
+
+    /**
+     * Returns the class ID for this class.  This is useful only for
+     * comparing to a return value from getDynamicClassID().  For example:
+     *
+     *      Base* polymorphic_pointer = createPolymorphicObject();
+     *      if (polymorphic_pointer->getDynamicClassID() ==
+     *          Derived::getStaticClassID()) ...
+     *
+     * @return          The class ID for all objects of this class.
+     */
+    static UClassID getStaticClassID();
+
+protected:
+    //=======================================================================
+    // implementation
+    //=======================================================================
+    /**
+     * This method is the actual implementation of the next() method.  All iteration
+     * vectors through here.  This method initializes the state machine to state 1
+     * and advances through the text character by character until we reach the end
+     * of the text or the state machine transitions to state 0.  We update our return
+     * value every time the state machine passes through a possible end state.
+     */
+    virtual int32_t handleNext();
+
+    /**
+     * dumps the cache of break positions (usually in response to a change in
+     * position of some sort)
+     */
+    virtual void reset();
+
+private:
+    /**
+     * This is the function that actually implements the dictionary-based
+     * algorithm.  Given the endpoints of a range of text, it uses the
+     * dictionary to determine the positions of any boundaries in this
+     * range.  It stores all the boundary positions it discovers in
+     * cachedBreakPositions so that we only have to do this work once
+     * for each time we enter the range.
+     * @param startPos The offset at which the range begins.
+     * @param endPos The offset at which the range ends.
+     */
+    void divideUpDictionaryRange(int32_t startPos, int32_t endPos);
+
+    /**
+     * Used by the tables object to increment the count of dictionary characters
+     * during iteration
+     */
+    void bumpDictionaryCharCount();
+
+    friend class DictionaryBasedBreakIteratorTables;
+};
+
+// Polymorphic class-ID query.  It must report this class's own ID rather than
+// the base class's: the declaration promises that objects of different
+// classes have different class IDs, and code that compares
+// getDynamicClassID() values could otherwise not tell a
+// DictionaryBasedBreakIterator from a plain RuleBasedBreakIterator.
+inline UClassID DictionaryBasedBreakIterator::getDynamicClassID() const {
+    return DictionaryBasedBreakIterator::getStaticClassID();
+}
+
+// The class ID is the address of the static fgClassID member; the value
+// stored in that member plays no part in the result.
+inline UClassID DictionaryBasedBreakIterator::getStaticClassID() {
+    char* const idAddress = &fgClassID;
+    return (UClassID)idAddress;
+}
+
+// Records that one more dictionary character has been passed over.
+inline void DictionaryBasedBreakIterator::bumpDictionaryCharCount() {
+    dictionaryCharCount += 1;
+}
+
+#endif
--- a/icu4c/source/i18n/dbbi_bld.cpp
+++ b/icu4c/source/i18n/dbbi_bld.cpp
@ -0,0 +1,64 @@
+/**
+ * The Builder class for DictionaryBasedBreakIterator inherits almost all of
+ * its functionality from the Builder class for RuleBasedBreakIterator, but
+ * extends it with extra logic to handle the "<dictionary>" token
+ *
+ * NOTE(review): although this file is named dbbi_bld.cpp, its content is
+ * Java source (see "extends", "super." and "boolean[]" below); it is
+ * presumably a placeholder awaiting the C++ port -- confirm before adding
+ * it to a build.
+ */
+protected class Builder extends RuleBasedBreakIterator.Builder {
+
+    /**
+     * A CharSet that contains all the characters represented in the dictionary
+     * (empty until a "<dictionary>" substitution has been seen), together with
+     * the expression text it was parsed from.
+     */
+    private CharSet dictionaryChars = new CharSet();
+    private String dictionaryExpression = "";
+
+    /**
+     * No special initialization
+     */
+    public Builder() {
+    }
+
+    /**
+     * We override handleSpecialSubstitution() to add logic to handle
+     * the <dictionary> tag.  If we see a substitution named "<dictionary>",
+     * parse the substitution expression and store the result in
+     * dictionaryChars.  The inherited handling always runs first.  An
+     * expression that begins with '(' is reported through error().
+     * NOTE(review): charAt(0) assumes replaceWith is never empty -- confirm.
+     */
+    protected void handleSpecialSubstitution(String replace, String replaceWith,
+                                             int startPos, String description) {
+        super.handleSpecialSubstitution(replace, replaceWith, startPos, description);
+
+        if (replace.equals("<dictionary>")) {
+            if (replaceWith.charAt(0) == '(') {
+                error("Dictionary group can't be enclosed in (", startPos, description);
+            }
+            dictionaryExpression = replaceWith;
+            dictionaryChars = CharSet.parseString(replaceWith);
+        }
+    }
+
+    /**
+     * The other half of the logic to handle the dictionary characters happens here.
+     * After the inherited builder has derived the real character categories, we
+     * set up the categoryFlags array in the iterator.  This array contains "true"
+     * for every character category that includes a dictionary character, i.e.
+     * whose CharSet has a non-empty intersection with dictionaryChars.
+     */
+    protected void buildCharCategories(Vector tempRuleList) {
+        super.buildCharCategories(tempRuleList);
+
+        categoryFlags = new boolean[categories.size()];
+        for (int i = 0; i < categories.size(); i++) {
+            CharSet cs = (CharSet)categories.elementAt(i);
+            if (!(cs.intersection(dictionaryChars).empty())) {
+                categoryFlags[i] = true;
+            }
+        }
+    }
+
+    // This function is actually called by RuleBasedBreakIterator.buildCharCategories(),
+    // which is called by the function above.  This gives us a way to create a separate
+    // character category for the dictionary characters even when RuleBasedBreakIterator
+    // isn't making a distinction.  It registers dictionaryChars under dictionaryExpression.
+    protected void mungeExpressionList(Hashtable expressions) {
+        expressions.put(dictionaryExpression, dictionaryChars);
+    }
+}
--- a/icu4c/source/i18n/dbbi_bld.h
+++ b/icu4c/source/i18n/dbbi_bld.h
@ -0,0 +1,64 @@
+/**
+ * The Builder class for DictionaryBasedBreakIterator inherits almost all of
+ * its functionality from the Builder class for RuleBasedBreakIterator, but
+ * extends it with extra logic to handle the "<dictionary>" token
+ *
+ * NOTE(review): although this file is named dbbi_bld.h, its content is
+ * Java source (see "extends", "super." and "boolean[]" below); it is
+ * presumably a placeholder awaiting the C++ port -- confirm before
+ * including it from C++ code.
+ */
+protected class Builder extends RuleBasedBreakIterator.Builder {
+
+    /**
+     * A CharSet that contains all the characters represented in the dictionary
+     * (empty until a "<dictionary>" substitution has been seen), together with
+     * the expression text it was parsed from.
+     */
+    private CharSet dictionaryChars = new CharSet();
+    private String dictionaryExpression = "";
+
+    /**
+     * No special initialization
+     */
+    public Builder() {
+    }
+
+    /**
+     * We override handleSpecialSubstitution() to add logic to handle
+     * the <dictionary> tag.  If we see a substitution named "<dictionary>",
+     * parse the substitution expression and store the result in
+     * dictionaryChars.  The inherited handling always runs first.  An
+     * expression that begins with '(' is reported through error().
+     * NOTE(review): charAt(0) assumes replaceWith is never empty -- confirm.
+     */
+    protected void handleSpecialSubstitution(String replace, String replaceWith,
+                                             int startPos, String description) {
+        super.handleSpecialSubstitution(replace, replaceWith, startPos, description);
+
+        if (replace.equals("<dictionary>")) {
+            if (replaceWith.charAt(0) == '(') {
+                error("Dictionary group can't be enclosed in (", startPos, description);
+            }
+            dictionaryExpression = replaceWith;
+            dictionaryChars = CharSet.parseString(replaceWith);
+        }
+    }
+
+    /**
+     * The other half of the logic to handle the dictionary characters happens here.
+     * After the inherited builder has derived the real character categories, we
+     * set up the categoryFlags array in the iterator.  This array contains "true"
+     * for every character category that includes a dictionary character, i.e.
+     * whose CharSet has a non-empty intersection with dictionaryChars.
+     */
+    protected void buildCharCategories(Vector tempRuleList) {
+        super.buildCharCategories(tempRuleList);
+
+        categoryFlags = new boolean[categories.size()];
+        for (int i = 0; i < categories.size(); i++) {
+            CharSet cs = (CharSet)categories.elementAt(i);
+            if (!(cs.intersection(dictionaryChars).empty())) {
+                categoryFlags[i] = true;
+            }
+        }
+    }
+
+    // This function is actually called by RuleBasedBreakIterator.buildCharCategories(),
+    // which is called by the function above.  This gives us a way to create a separate
+    // character category for the dictionary characters even when RuleBasedBreakIterator
+    // isn't making a distinction.  It registers dictionaryChars under dictionaryExpression.
+    protected void mungeExpressionList(Hashtable expressions) {
+        expressions.put(dictionaryExpression, dictionaryChars);
+    }
+}
--- a/icu4c/source/i18n/dbbi_tbl.cpp
+++ b/icu4c/source/i18n/dbbi_tbl.cpp
@ -0,0 +1,59 @@
+/*
+**********************************************************************
+*   Copyright (C) 1999 IBM Corp. All rights reserved.
+**********************************************************************
+*   Date        Name        Description
+*   12/1/99    rgillam     Complete port from Java.
+**********************************************************************
+*/
+
+#include "dbbi_tbl.h"
+#include "dbbi.h"
+
+//=======================================================================
+// constructor
+//=======================================================================
+
+// Builds the tables from a memory image.  The base class consumes the
+// RuleBasedBreakIterator part of the image, the dictionary is opened from
+// dictionaryFilename, and categoryFlags is pointed at the flag array that
+// lives inside the image itself (nothing is copied).
+DictionaryBasedBreakIteratorTables::DictionaryBasedBreakIteratorTables(
+                                 const void* tablesImage,
+                                 char* dictionaryFilename)
+: RuleBasedBreakIteratorTables(tablesImage),
+  dictionary(dictionaryFilename)
+{
+    // The index entries are offsets that this code has always narrowed to
+    // int32_t.  Read them as int32_t values instead of through a
+    // pointer-typed view: an array of pointers only lines up with the data
+    // where sizeof(void*) == 4, and converting a pointer to int32_t is
+    // rejected outright by 64-bit compilers.
+    const int32_t* tablesIdx = (const int32_t*)tablesImage;
+
+    // we know the offset into the memory image where the DBBI stuff
+    // starts is stored in element 8 of the array.  There should be
+    // a way for the RBBI constructor to give us this, but there
+    // isn't a good one.
+    const int8_t* dbbiImage = (const int8_t*)tablesImage + tablesIdx[8];
+    const int32_t* dbbiIdx = (const int32_t*)dbbiImage;
+
+    // element 0 of the DBBI index is the offset of the category flags,
+    // measured from the start of the DBBI section
+    categoryFlags = (int8_t*)(dbbiImage + dbbiIdx[0]);
+}
+
+//=======================================================================
+// boilerplate
+//=======================================================================
+
+/**
+ * Destructor.  Frees the category-flags array, but only when ownTables is
+ * set.
+ * NOTE(review): the constructor in this file points categoryFlags into the
+ * caller's tables image rather than at an array allocated with new[], so
+ * ownTables is presumably never true for objects built that way -- confirm
+ * where ownTables gets set, since delete [] on a pointer into the image
+ * would be invalid.
+ */
+DictionaryBasedBreakIteratorTables::~DictionaryBasedBreakIteratorTables() {
+    if (ownTables)
+        delete [] categoryFlags;
+}
+
+// Looks up the break category of "c" by delegating to the inherited
+// implementation.  The only thing this override adds is bookkeeping: when
+// the category is one of those flagged as occurring in the dictionary, the
+// iterator's dictionary-character count is bumped.  The category returned
+// is always exactly what the base class reported.
+int32_t
+DictionaryBasedBreakIteratorTables::lookupCategory(UChar c,
+                                                   BreakIterator* bi) const {
+    const int32_t category = RuleBasedBreakIteratorTables::lookupCategory(c, bi);
+
+    // ignore characters have no entry in categoryFlags, so leave early
+    if (category == RuleBasedBreakIterator::IGNORE) {
+        return category;
+    }
+
+    if (categoryFlags[category]) {
+        DictionaryBasedBreakIterator* owner = (DictionaryBasedBreakIterator*)bi;
+        owner->bumpDictionaryCharCount();
+    }
+    return category;
+}
--- a/icu4c/source/i18n/dbbi_tbl.h
+++ b/icu4c/source/i18n/dbbi_tbl.h
@ -0,0 +1,79 @@
+/*
+**********************************************************************
+*   Copyright (C) 1999 IBM Corp. All rights reserved.
+**********************************************************************
+*   Date        Name        Description
+*   12/1/99    rgillam     Complete port from Java.
+**********************************************************************
+*/
+
+#ifndef DBBI_TBL_H
+#define DBBI_TBL_H
+
+#include "rbbi_tbl.h"
+#include "brkdict.h"
+
+/**
+ * This subclass of RuleBasedBreakIteratorTables contains the additional
+ * static data that is used by DictionaryBasedBreakIterator.  This comprises
+ * the dictionary itself and an array of flags that indicate which character
+ * categories are in the dictionary.
+ *
+ * The constructors and the destructor are all private, so objects of this
+ * class can only be created and destroyed by the class itself and by its
+ * friend, DictionaryBasedBreakIterator.
+ *
+ * @author Richard Gillam
+ */
+class DictionaryBasedBreakIteratorTables : public RuleBasedBreakIteratorTables {
+
+private:
+    /**
+     * a list of known words that is used to divide up contiguous ranges of letters,
+     * stored in a compressed, indexed, format that offers fast access
+     */
+    BreakDictionary dictionary;
+
+    /**
+     * a list of flags indicating which character categories are contained in
+     * the dictionary file (this is used to determine which ranges of characters
+     * to apply the dictionary to); indexed by character category
+     */
+    int8_t* categoryFlags;
+
+    //=======================================================================
+    // constructor (from a tables image and the filename of a dictionary file)
+    //=======================================================================
+
+    DictionaryBasedBreakIteratorTables(const void* tablesImage,
+                                       char* dictionaryFilename);
+                                 
+    /**
+     * The copy constructor is declared private and not implemented.
+     * THIS CLASS MAY NOT BE COPIED.
+     */
+    DictionaryBasedBreakIteratorTables(const DictionaryBasedBreakIteratorTables& that);
+
+    //=======================================================================
+    // boilerplate
+    //=======================================================================
+
+    /**
+     * Destructor
+     */
+    virtual ~DictionaryBasedBreakIteratorTables();
+
+    /**
+     * The assignment operator is declared private and not implemented.
+     * THIS CLASS MAY NOT BE COPIED.
+     */
+    DictionaryBasedBreakIteratorTables& operator=(
+            const DictionaryBasedBreakIteratorTables& that);
+
+protected:
+    /**
+     * Looks up a character's category (i.e., its category for breaking purposes,
+     * not its Unicode category)
+     * @param c The character to look up.
+     * @param bi The break iterator on whose behalf the lookup is made.
+     * @return The character's break category.
+     */
+    virtual int32_t lookupCategory(UChar c, BreakIterator* bi) const;
+
+    friend class DictionaryBasedBreakIterator;
+};
+
+#endif
--- a/icu4c/source/i18n/i18n.dsp
+++ b/icu4c/source/i18n/i18n.dsp
@ -69,7 +69,7 @@ LINK32=link.exe
 # PROP Ignore_Export_Lib 0
 # PROP Target_Dir ""
 # ADD BASE CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "I18N_EXPORTS" /YX /FD /GZ /c
-# ADD CPP /nologo /MDd /Za /W3 /Gm /GX /ZI /Od /I "..\..\include" /I "..\..\source\common" /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "I18N_EXPORTS" /D "U_I18N_IMPLEMENTATION" /FR /YX /FD /GZ /c
+# ADD CPP /nologo /MDd /Za /W3 /Gm /GX /ZI /Od /I "..\..\include" /I "..\..\source\common" /D "_WINDOWS" /D "_USRDLL" /D "I18N_EXPORTS" /D "U_I18N_IMPLEMENTATION" /D "WIN32" /D "_DEBUG" /D "_MBCS" /D "UDATA_MAP" /FR /YX /FD /GZ /c
 # ADD BASE MTL /nologo /D "_DEBUG" /mktyplib203 /win32
 # ADD MTL /nologo /D "_DEBUG" /mktyplib203 /win32
 # ADD BASE RSC /l 0x409 /d "_DEBUG"
@ -92,6 +92,10 @@ LINK32=link.exe
 # PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
 # Begin Source File

+SOURCE=.\brkdict.cpp
+# End Source File
+# Begin Source File
+
 SOURCE=.\brkiter.cpp
 # End Source File
 # Begin Source File
@ -100,10 +104,6 @@ SOURCE=.\calendar.cpp
 # End Source File
 # Begin Source File

-SOURCE=.\chbkdat.cpp
-# End Source File
-# Begin Source File
-
 SOURCE=.\choicfmt.cpp
 # End Source File
 # Begin Source File
@ -132,6 +132,14 @@ SOURCE=.\datefmt.cpp
 # End Source File
 # Begin Source File

+SOURCE=.\dbbi.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=.\dbbi_tbl.cpp
+# End Source File
+# Begin Source File
+
 SOURCE=.\dcfmtsym.cpp
 # End Source File
 # Begin Source File
@ -161,10 +169,6 @@ SOURCE=.\hextouni.cpp
 # End Source File
 # Begin Source File

-SOURCE=.\lnbkdat.cpp
-# End Source File
-# Begin Source File
-
 SOURCE=.\mergecol.cpp
 # End Source File
 # Begin Source File
@ -181,6 +185,14 @@ SOURCE=.\ptnentry.cpp
 # End Source File
 # Begin Source File

+SOURCE=.\rbbi.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=.\rbbi_tbl.cpp
+# End Source File
+# Begin Source File
+
 SOURCE=.\rbt.cpp
 # End Source File
 # Begin Source File
@ -205,18 +217,10 @@ SOURCE=.\simpletz.cpp
 # End Source File
 # Begin Source File

-SOURCE=.\simtxbd.cpp
-# End Source File
-# Begin Source File
-
 SOURCE=.\smpdtfmt.cpp
 # End Source File
 # Begin Source File

-SOURCE=.\snbkdat.cpp
-# End Source File
-# Begin Source File
-
 SOURCE=.\sortkey.cpp
 # End Source File
 # Begin Source File
@ -241,10 +245,6 @@ SOURCE=.\translit.cpp
 # End Source File
 # Begin Source File

-SOURCE=.\txtbdat.cpp
-# End Source File
-# Begin Source File
-
 SOURCE=.\txtbdry.cpp
 # End Source File
 # Begin Source File
@ -269,10 +269,6 @@ SOURCE=.\umsg.cpp
 # End Source File
 # Begin Source File

-SOURCE=.\unicdcm.cpp
-# End Source File
-# Begin Source File
-
 SOURCE=.\unifltlg.cpp
 # End Source File
 # Begin Source File
@ -291,20 +287,16 @@ SOURCE=.\unitohex.cpp

 SOURCE=.\unum.cpp
 # End Source File
-# Begin Source File
-
-SOURCE=.\wdbkdat.cpp
-# End Source File
-# Begin Source File
-
-SOURCE=.\wdbktbl.cpp
-# End Source File
 # End Group
 # Begin Group "Header Files"

 # PROP Default_Filter "h;hpp;hxx;hm;inl"
 # Begin Source File

+SOURCE=.\brkdict.h
+# End Source File
+# Begin Source File
+
 SOURCE=.\unicode\brkiter.h

 !IF  "$(CFG)" == "i18n - Win32 Release"
@ -502,6 +494,14 @@ InputPath=.\unicode\datefmt.h
 # End Source File
 # Begin Source File

+SOURCE=.\dbbi.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\dbbi_tbl.h
+# End Source File
+# Begin Source File
+
 SOURCE=.\unicode\dcfmtsym.h

 !IF  "$(CFG)" == "i18n - Win32 Release"
@ -811,7 +811,7 @@ SOURCE=.\rbbi.h
 # End Source File
 # Begin Source File

-SOURCE=.\rbbi_bld.h
+SOURCE=.\rbbi_tbl.h
 # End Source File
 # Begin Source File

@ -885,10 +885,6 @@ InputPath=.\unicode\simpletz.h
 # End Source File
 # Begin Source File

-SOURCE=.\simtxbd.h
-# End Source File
-# Begin Source File
-
 SOURCE=.\unicode\smpdtfmt.h

 !IF  "$(CFG)" == "i18n - Win32 Release"
@ -943,10 +939,6 @@ InputPath=.\unicode\sortkey.h
 # End Source File
 # Begin Source File

-SOURCE=.\spclmap.h
-# End Source File
-# Begin Source File
-
 SOURCE=.\tables.h
 # End Source File
 # Begin Source File
@ -1036,10 +1028,6 @@ InputPath=.\unicode\translit.h
 # End Source File
 # Begin Source File

-SOURCE=.\txtbdat.h
-# End Source File
-# Begin Source File
-
 SOURCE=.\txtbdry.h
 # End Source File
 # Begin Source File
@ -1179,10 +1167,6 @@ InputPath=.\unicode\umsg.h
 # End Source File
 # Begin Source File

-SOURCE=.\unicdcm.h
-# End Source File
-# Begin Source File
-
 SOURCE=.\unicode\unifilt.h

 !IF  "$(CFG)" == "i18n - Win32 Release"
@ -1319,10 +1303,6 @@ InputPath=.\unicode\unum.h

 !ENDIF 

-# End Source File
-# Begin Source File
-
-SOURCE=.\wdbktbl.h
 # End Source File
 # End Group
 # Begin Group "Resource Files"
--- a/icu4c/source/i18n/rbbi.cpp
+++ b/icu4c/source/i18n/rbbi.cpp
@ -4,98 +4,237 @@
 *   and others. All rights reserved.                                 *
 **********************************************************************
 *   Date        Name        Description
-*   10/22/99    alan        Creation.
+*   11/11/99    rgillam     Complete port from Java.
 **********************************************************************
 */

 #include "rbbi.h"
-#include "rbbi_bld.h"
+#include "schriter.h"

 /**
 * A token used as a character-category value to identify ignore characters
 */
-int8_t RuleBasedBreakIterator::IGNORE = -1;
+int8_t
+RuleBasedBreakIterator::IGNORE = -1;

 /**
 * The state number of the starting state
 */
-int16_t RuleBasedBreakIterator::START_STATE = 1;
+int16_t
+RuleBasedBreakIterator::START_STATE = 1;

 /**
 * The state-transition value indicating "stop"
 */
-int16_t RuleBasedBreakIterator::STOP_STATE = 0;
+int16_t
+RuleBasedBreakIterator::STOP_STATE = 0;
+
+/**
+ * Class ID.  (value is irrelevant; address is important)
+ */
+char
+RuleBasedBreakIterator::fgClassID = 0;

 //=======================================================================
 // constructors
 //=======================================================================

 /**
- * Constructs a RuleBasedBreakIterator according to the description
- * provided.  If the description is malformed, throws an
- * IllegalArgumentException.  Normally, instead of constructing a
- * RuleBasedBreakIterator directory, you'll use the factory methods
- * on BreakIterator to create one indirectly from a description
- * in the framework's resource files.  You'd use this when you want
- * special behavior not provided by the built-in iterators.
+ * Constructs a RuleBasedBreakIterator that uses the already-created
+ * tables object that is passed in as a parameter.
 */
-RuleBasedBreakIterator::RuleBasedBreakIterator(const UnicodeString& description) {
-    this.description = description;
-    
-    // the actual work is done by the Builder class
-    Builder builder;
-    builder.buildBreakIterator(*this, description);
+RuleBasedBreakIterator::RuleBasedBreakIterator(RuleBasedBreakIteratorTables* tables)
+: tables(tables),  // NOTE(review): no addReference() here, unlike the image-based
+  text(NULL)       // constructor -- confirm the caller's reference is being adopted
+{                  // (text stays NULL until adoptText() or setText() supplies one)
+}
+
+// Builds a break iterator whose internal tables live in a memory-mapped
+// file obtained through the udata interface; "image" points at the first
+// byte of that file.  A NULL image leaves the iterator without tables.
+// In either case the iterator starts out with no text.
+RuleBasedBreakIterator::RuleBasedBreakIterator(const void* image)
+: tables(NULL),
+  text(NULL)
+{
+    if (image == NULL)
+        return;
+
+    // wrap the image in a tables object and register ourselves as a user
+    tables = new RuleBasedBreakIteratorTables(image);
+    tables->addReference();
+}
+
+/**
+ * Copy constructor.  Will produce a break iterator with the same behavior,
+ * and which iterates over the same text, as the one passed in.  The tables
+ * are shared (one more reference is taken on them) and the text iterator
+ * is cloned.  Either may be NULL in the source iterator -- no text set
+ * yet, or constructed from a NULL image -- and that state is copied as-is
+ * instead of being dereferenced.
+ */
+RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& that)
+: tables(that.tables),
+  text(that.text != NULL ? that.text->clone() : NULL)
+{
+    if (tables != NULL)
+        tables->addReference();
 }

 //=======================================================================
 // boilerplate
 //=======================================================================
 /**
- * Clones this iterator.
- * @return A newly-constructed RuleBasedBreakIterator with the same
- * behavior as this one.
+ * Destructor
 */
-RuleBasedBreakIterator* RuleBasedBreakIterator::clone(void) const {
+RuleBasedBreakIterator::~RuleBasedBreakIterator() {
+    // text may still be NULL; deleting a null pointer is a no-op
+    delete text;
+
+    // tables is NULL when the iterator was constructed from a NULL image,
+    // in which case there is no reference to give back
+    if (tables != NULL)
+        tables->removeReference();
+}
+
+/**
+ * Assignment operator.  Sets this iterator to have the same behavior,
+ * and iterate over the same text, as the one passed in.  The text iterator
+ * is cloned and the tables are shared by reference.  Self-assignment is a
+ * no-op, and a source iterator without text or without tables is copied
+ * as such.
+ * @param that The iterator to copy from.
+ * @return A reference to this iterator.
+ */
+RuleBasedBreakIterator&
+RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
+    // without this check we would delete the very text we are about to clone
+    if (this == &that)
+        return *this;
+
+    // clone before releasing the old iterator; "that" may have no text yet
+    CharacterIterator* newText = (that.text != NULL) ? that.text->clone() : NULL;
+    delete text;
+    text = newText;
+
+    // take the new reference before dropping the old one, so tables shared
+    // by both iterators never lose a reference in between
+    if (that.tables != NULL)
+        that.tables->addReference();
+    if (tables != NULL)
+        tables->removeReference();
+    tables = that.tables;
+
+    return *this;
+}
+
+/**
+ * Returns a newly-constructed RuleBasedBreakIterator with the same
+ * behavior, and iterating over the same text, as this one.  The copy is
+ * made with the copy constructor and allocated with new; this function
+ * keeps no pointer to it.
+ */
+BreakIterator*
+RuleBasedBreakIterator::clone(void) const {
     return new RuleBasedBreakIterator(*this);
 }

 /**
- * Returns true if both BreakIterators are of the same class, have the same
- * rules, and iterate over the same text.
+ * Equality operator.  Returns TRUE if both BreakIterators are of the
+ * same class, have the same behavior, and iterate over the same text.
 */
-bool_t RuleBasedBreakIterator::operator==(const RuleBasedBreakIterator& that) {
-    return description.equals(((RuleBasedBreakIterator)that).description)
-        && text.equals(((RuleBasedBreakIterator)that).text);
+bool_t
+RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
+    // iterators of different classes are never equal
+    if (that.getDynamicClassID() != getDynamicClassID())
+        return FALSE;
+
+    const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&)that;
+
+    // Either side may have no text yet (text == NULL).  Two iterators
+    // without text match; one with and one without do not, and neither
+    // pointer is dereferenced in that case.
+    bool_t sameText = (that2.text == text)
+            || (that2.text != NULL && text != NULL && *that2.text == *text);
+    if (!sameText)
+        return FALSE;
+
+    // the same reasoning applies to the tables, which are NULL for an
+    // iterator constructed from a NULL image
+    return (that2.tables == tables)
+            || (that2.tables != NULL && tables != NULL && *that2.tables == *tables);
+}
+
+/**
+ * Compute a hash code for this BreakIterator
+ * @return A hash code: the hash code of the tables, or 0 when the iterator
+ * has no tables (it was constructed from a NULL image).
+ */
+int32_t
+RuleBasedBreakIterator::hashCode(void) const {
+    // guard against the table-less state instead of dereferencing NULL
+    return (tables != NULL) ? tables->hashCode() : 0;
 }

 /**
 * Returns the description used to create this iterator
 */
-UnicodeString RuleBasedBreakIterator::toString(void) {
-    return description;
-}
-
-/**
- * Compute a hashcode for this BreakIterator
- * @return A hash code
- */
-int32_t RuleBasedBreakIterator::hashCode(void) {
-    return description.hashCode();
+const UnicodeString&
+RuleBasedBreakIterator::getRules() const {
+    return tables->getRules();  // delegates to the tables; NOTE(review): assumes tables is non-NULL -- confirm
 }

 //=======================================================================
 // BreakIterator overrides
 //=======================================================================
+
+/**
+ * Gives read access to the CharacterIterator this break iterator uses
+ * internally -- the real one, not a copy.  Changing the state of that
+ * iterator can have undefined consequences; clone it first if it has to
+ * be changed.
+ * @return An iterator over the text being analyzed.
+ */
+const CharacterIterator&
+RuleBasedBreakIterator::getText() const {
+    if (text != NULL)
+        return *text;
+
+    // No text has been set yet, so there is nothing to hand out.  Create an
+    // iterator over the empty string and keep it as our text from now on,
+    // which means casting away const on this object.
+    RuleBasedBreakIterator* self = (RuleBasedBreakIterator*)this;
+    self->text = new StringCharacterIterator("");
+    return *self->text;
+}
+
+/**
+ * Hands back a freshly allocated CharacterIterator that the caller is to
+ * take ownership of: a clone of the current text iterator, or an iterator
+ * over the empty string when no text has been set.
+ * THIS FUNCTION SHOULD NOT BE HERE.  IT'S HERE BECAUSE BreakIterator DEFINES
+ * IT AS PURE VIRTUAL, FORCING RBBI TO IMPLEMENT IT.  IT SHOULD BE REMOVED
+ * FROM *BOTH* CLASSES.
+ */
+CharacterIterator*
+RuleBasedBreakIterator::createText() const {
+    if (text != NULL)
+        return text->clone();
+
+    return new StringCharacterIterator("");
+}
+
+
+/**
+ * Set the iterator to analyze a new piece of text.  This function resets
+ * the current iteration position to the beginning of the text.
+ * @param newText An iterator over the text to analyze.  The break iterator
+ * takes ownership of it and deletes the iterator it held before.  May be
+ * NULL, which leaves the break iterator with no text (the state it is
+ * constructed in).
+ */
+void
+RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
+    reset();
+
+    // being handed the iterator we already own must not delete it out from
+    // under ourselves
+    if (newText != text) {
+        delete text;
+        text = newText;
+    }
+
+    // nothing to rewind when the caller took the text away
+    if (text != NULL)
+        text->first();
+}
+
+/**
+ * Points the iterator at a new string to analyze and moves the current
+ * iteration position to the beginning of that string.
+ * @param newText The text to analyze.
+ */
+void
+RuleBasedBreakIterator::setText(const UnicodeString& newText) {
+    reset();
+
+    if (text == NULL || text->getDynamicClassID()
+            != StringCharacterIterator::getStaticClassID()) {
+        // nothing reusable: replace whatever we had with a fresh iterator
+        delete text;
+        text = new StringCharacterIterator(newText);
+        text->first();
+        return;
+    }
+
+    // we already own a StringCharacterIterator, so re-target it in place
+    // instead of allocating another one
+    ((StringCharacterIterator*)text)->setText(newText);
+}
+
+/**
+ * Set the iterator to analyze a new piece of text.  This function resets
+ * the current iteration position to the beginning of the text.  It simply
+ * forwards to the overload that takes a UnicodeString reference.
+ * @param newText The text to analyze.  Must not be NULL: the pointer is
+ * dereferenced without a check.
+ * THIS FUNCTION SHOULD NOT BE HERE.  IT'S HERE BECAUSE BreakIterator DEFINES
+ * IT AS PURE VIRTUAL, FORCING RBBI TO IMPLEMENT IT.  IT SHOULD BE REMOVED
+ * FROM *BOTH* CLASSES.
+ */
+void
+RuleBasedBreakIterator::setText(const UnicodeString* newText) {
+    setText(*newText);
+}
+
 /**
 * Sets the current iteration position to the beginning of the text.
 * (i.e., the CharacterIterator's starting offset).
 * @return The offset of the beginning of the text.
 */
 int32_t RuleBasedBreakIterator::first(void) {
-    CharacterIterator t = getText();
+    reset();
+    if (text == NULL)  // no text has been set on this iterator yet
+        return BreakIterator::DONE;
 
-    t.first();
-    return t.getIndex();
+    text->first();
+    return text->getIndex();  // report wherever first() left the text iterator
 }

 /**
@ -104,12 +243,16 @@ int32_t RuleBasedBreakIterator::first(void) {
 * @return The text's past-the-end offset.
 */
 int32_t RuleBasedBreakIterator::last(void) {
-    CharacterIterator t = getText();
-
+    reset();    // lets subclasses drop cached state; a no-op in this class
+    if (text == NULL)
+        return BreakIterator::DONE;    // no text to iterate over
+    
    // I'm not sure why, but t.last() returns the offset of the last character,
    // rather than the past-the-end offset
-    t.setIndex(t.getEndIndex());
-    return t.getIndex();
+
+    int32_t pos = text->endIndex();    // use endIndex() instead of last(), for the reason given above
+    text->setIndex(pos);
+    return pos;
 }

 /**
@ -148,9 +291,8 @@ int32_t RuleBasedBreakIterator::next(void) {
 */
 int32_t RuleBasedBreakIterator::previous(void) {
    // if we're already sitting at the beginning of the text, return DONE
-    CharacterIterator text = getText();
-    if (current() == text.getBeginIndex())
-        return BreakIterator.DONE;
+    if (text == NULL || current() == text->startIndex())    // also DONE when no text has been set
+        return BreakIterator::DONE;

    // set things up.  handlePrevious() will back us up to some valid
    // break position before the current position (we back our internal
@ -158,21 +300,21 @@ int32_t RuleBasedBreakIterator::previous(void) {
    // the current position), but not necessarily the last one before
    // where we started
    int32_t start = current();
-    text.previous();
+    text->previous();
    int32_t lastResult = handlePrevious();
    int32_t result = lastResult;
    
    // iterate forward from the known break position until we pass our
    // starting point.  The last break position before the starting
    // point is our return value
-    while (result != BreakIterator.DONE && result < start) {
+    while (result != BreakIterator::DONE && result < start) {
        lastResult = result;
        result = handleNext();
    }
    
    // set the current iteration position to be the last break position
    // before where we started, and then return that value
-    text.setIndex(lastResult);
+    text->setIndex(lastResult);
    return lastResult;
 }

@ -184,16 +326,20 @@ int32_t RuleBasedBreakIterator::previous(void) {
 */
 int32_t RuleBasedBreakIterator::following(int32_t offset) {
    // if the offset passed in is already past the end of the text,
-    // just return DONE
-    CharacterIterator text = getText();
-    if (offset == text.getEndIndex())
-        return BreakIterator.DONE;
+    // just return DONE; if it's before the beginning, return the
+    // text's starting offset
+    if (text == NULL || offset >= text->endIndex()) {    // no text, or nothing left after offset
+        return BreakIterator::DONE;
+    }
+    else if (offset < text->startIndex()) {    // offset lies before the text
+        return text->startIndex();
+    }

    // otherwise, set our internal iteration position (temporarily)
    // to the position passed in.  If this is the _beginning_ position,
    // then we can just use next() to get our return value
-    text.setIndex(offset);
-    if (offset == text.getBeginIndex())
+    text->setIndex(offset);
+    if (offset == text->startIndex())
        return handleNext();

    // otherwise, we have to sync up first.  Use handlePrevious() to back
@ -204,7 +350,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
    // from here until we've passed the starting position.  The position
    // we stop on will be the first break position after the specified one.
    int32_t result = handlePrevious();
-    while (result != BreakIterator.DONE && result <= offset)
+    while (result != BreakIterator::DONE && result <= offset)
        result = handleNext();
    return result;
 }
@ -216,11 +362,20 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
 * @return The position of the last boundary before the starting position.
 */
 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
+    // if there is no text, or the offset passed in is past the end of the
+    // text, just return DONE (an offset equal to endIndex() is accepted);
+    // if it's before the beginning, return the text's starting offset
+    if (text == NULL || offset > text->endIndex()) {
+        return BreakIterator::DONE;
+    }
+    else if (offset < text->startIndex()) {
+        return text->startIndex();
+    }
+    
    // if we start by updating the current iteration position to the
    // position specified by the caller, we can just use previous()
    // to carry out this operation
-    CharacterIterator text = getText();
-    text.setIndex(offset);
+    text->setIndex(offset);
    return previous();
 }

@ -232,10 +387,15 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
 * @return True if "offset" is a boundary position.
 */
 bool_t RuleBasedBreakIterator::isBoundary(int32_t offset) {
-    // 0 is always a boundary position (I suspect this code is wrong; I think
-    // we're supposed to be comparing "offset" against text.getBeginIndex(). )
-    if (offset == 0)
+    // the beginning index of the iterator is always a boundary position by definition
+    if (text == NULL || offset == text->startIndex()) {
        return TRUE;
+    }
+
+    // out-of-range indexes are never boundary positions
+    else if (offset < text->startIndex() || offset > text->endIndex()) {
+        return FALSE;
+    }
        
    // otherwise, we can use following() on the position before the specified
    // one and return true of the position we get back is the one the user
@ -248,38 +408,14 @@ bool_t RuleBasedBreakIterator::isBoundary(int32_t offset) {
 * Returns the current iteration position.
 * @return The current iteration position.
 */
-int32_t RuleBasedBreakIterator::current(void) {
-    return getText().getIndex();
+int32_t RuleBasedBreakIterator::current(void) const {
+    // with no text attached there is no position to report
+    if (text == NULL) {
+        return BreakIterator::DONE;
+    }
+
+    // otherwise the position is simply wherever the character iterator sits
+    return text->getIndex();
 }

-/**
- * Return a CharacterIterator over the text being analyzed.  This version
- * of this method returns the actual CharacterIterator we're using internally.
- * Changing the state of this iterator can have undefined consequences.  If
- * you need to change it, clone it first.
- * @return An iterator over the text being analyzed.
- */
-CharacterIterator RuleBasedBreakIterator::getText(void) {
-    // The iterator is initialized pointing to no text at all, so if this
-    // function is called while we're in that state, we have to fudge an
-    // an iterator to return.
-    if (text == 0)
-        text = new StringCharacterIterator("");
-    return text;
-}
-
-/**
- * Set the iterator to analyze a new piece of text.  This function resets
- * the current iteration position to the beginning of the text.
- * @param newText An iterator over the text to analyze.
- */
-void RuleBasedBreakIterator::setText(CharacterIterator newText) {
-    text = newText;
-    text.first();
-}
 //=======================================================================
 // implementation
 //=======================================================================
+
 /**
 * This method is the actual implementation of the next() method.  All iteration
 * vectors through here.  This method initializes the state machine to state 1
@ -289,38 +425,82 @@ void RuleBasedBreakIterator::setText(CharacterIterator newText) {
 */
 int32_t RuleBasedBreakIterator::handleNext(void) {
    // if we're already at the end of the text, return DONE.
-    CharacterIterator text = getText();
-    if (text.getIndex() == text.getEndIndex())
-        return BreakIterator.DONE;
+    if (text == NULL || tables == NULL || text->getIndex() == text->endIndex())
+        return BreakIterator::DONE;

    // no matter what, we always advance at least one character forward
-    int32_t result = text.getIndex() + 1;
+    int32_t result = text->getIndex() + 1;
+    int32_t lookaheadResult = 0;
    
    // begin in state 1
    int32_t state = START_STATE;
    int32_t category;
-    UChar c = text.current();
+    UChar c = text->current();
+    UChar lastC = c;
+    // lastCPos must always be the position of lastC.  lastC starts out as the
+    // character at the current position, so lastCPos has to start there too
+    // (not at 0), or the mandatory-break fix-up below could move the iterator
+    // to offset 1 no matter where we started.
+    int32_t lastCPos = text->getIndex();
+

    // loop until we reach the end of the text or transition to state 0
-    while (c != CharacterIterator.DONE && state != STOP_STATE) {
+    while (c != CharacterIterator::DONE && state != STOP_STATE) {

        // look up the current character's character category (which tells us
        // which column in the state table to look at)
-        category = lookupCategory(c);
+        category = tables->lookupCategory(c, this);
        
        // if the character isn't an ignore character, look up a state
        // transition in the state table
        if (category != IGNORE) {
-            state = lookupState(state, category);
+            state = tables->lookupState(state, category);
        }
        
-        // if the state we've just transitioned to is an accepting state,
+        // if the state we've just transitioned to is a lookahead state,
+        // (but not also an end state), save its position.  If it's
+        // both a lookahead state and an end state, update the break position
+        // to the last saved lookahead-state position
+        if (tables->isLookaheadState(state)) {
+            if (tables->isEndState(state)) {
+                result = lookaheadResult;
+            }
+            else {
+                lookaheadResult = text->getIndex() + 1;
+            }
+        }
+
+        // otherwise, if the state we've just transitioned to is an accepting state,
        // update our return value to be the current iteration position
-        if (endStates[state])
-            result = text.getIndex() + 1;
-        c = text.next();
+        else {
+            if (tables->isEndState(state)) {
+                result = text->getIndex() + 1;
+            }
+        }
+            
+        // keep track of the last "real" character we saw.  If this character isn't an
+        // ignore character, take note of it and its position in the text
+        if (category != IGNORE && state != STOP_STATE) {
+            lastC = c;
+            lastCPos = text->getIndex();
+        }
+        c = text->next();
    }
-    text.setIndex(result);
+
+    // if we've run off the end of the text, and the very last character took us into
+    // a lookahead state, advance the break position to the lookahead position
+    // (the theory here is that if there are no characters at all after the lookahead
+    // position, that always matches the lookahead criteria)
+    if (c == CharacterIterator::DONE && lookaheadResult == text->endIndex()) {
+        result = lookaheadResult;
+    }
+        
+    // if the last character we saw before the one that took us into the stop state
+    // was a mandatory breaking character, then the break position goes right after it
+    // (this is here so that breaks come before, rather than after, a string of
+    // ignore characters when they follow a mandatory break character)
+    else if (lastC == 0x0a || lastC == 0x0d || lastC == 0x0c || lastC == 0x2028
+            || lastC == 0x2029) {
+        result = lastCPos + 1;
+    }
+
+    text->setIndex(result);
    return result;
 }

@ -332,27 +512,29 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
 * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
 */
 int32_t RuleBasedBreakIterator::handlePrevious(void) {
-    CharacterIterator text = getText();
+    if (text == NULL || tables == NULL)
+        return 0;    // NOTE(review): returns 0 here, whereas handleNext() returns BreakIterator::DONE for the same condition -- confirm intended
+    
    int32_t state = START_STATE;
    int32_t category = 0;
    int32_t lastCategory = 0;
-    UChar c = text.current();
+    UChar c = text->current();
    
    // loop until we reach the beginning of the text or transition to state 0
-    while (c != CharacterIterator.DONE && state != STOP_STATE) {
+    while (c != CharacterIterator::DONE && state != STOP_STATE) {

        // save the last character's category and look up the current
        // character's category
        lastCategory = category;
-        category = lookupCategory(c);
+        category = tables->lookupCategory(c, this);
        
        // if the current character isn't an ignore character, look up a
        // state transition in the backwards state table
        if (category != IGNORE)
-            state = lookupBackwardState(state, category);
+            state = tables->lookupBackwardState(state, category);
            
        // then advance one character backwards
-        c = text.previous();
+        c = text->previous();
    }
    
    // if we didn't march off the beginning of the text, we're either one or two
@ -360,35 +542,19 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
    // previous() at the end of the loop above, and another because the character
    // that takes us into the stop state will always be the character BEFORE
    // the break position.)
-    if (c != CharacterIterator.DONE) {
+    if (c != CharacterIterator::DONE) {
        if (lastCategory != IGNORE)
-            text.setIndex(text.getIndex() + 2);
+            text->setIndex(text->getIndex() + 2);
        else
-            text.next();
+            text->next();
    }
-    return text.getIndex();
+
+    return text->getIndex();    // the iterator is left sitting on the position we return
 }

-/**
- * Looks up a character's category (i.e., its category for breaking purposes,
- * not its Unicode category)
- */
-int32_t RuleBasedBreakIterator::lookupCategory(UChar c) {
-    return UCharCategoryTable.elementAt(c);
-}
-
-/**
- * Given a current state and a character category, looks up the
- * next state to transition to in the state table.
- */
-int32_t RuleBasedBreakIterator::lookupState(int32_t state, int32_t category) {
-    return stateTable[state * numCategories + category];
-}
-
-/**
- * Given a current state and a character category, looks up the
- * next state to transition to in the backwards state table.
- */
-int32_t RuleBasedBreakIterator::lookupBackwardState(int32_t state, int32_t category) {
-    return backwardsStateTable[state * numCategories + category];
+void
+RuleBasedBreakIterator::reset()
+{
+    // Deliberately empty: the base-class version of this hook does nothing.
+    // It is virtual so that subclasses can override it with their own reset behavior.
 }
--- a/icu4c/source/i18n/rbbi.h
+++ b/icu4c/source/i18n/rbbi.h
@ -3,12 +3,18 @@
 **********************************************************************
 *   Date        Name        Description
 *   10/22/99    alan        Creation.
+*   11/11/99    rgillam     Complete port from Java.
 **********************************************************************
 */

 #ifndef RBBI_H
 #define RBBI_H

+#include "utypes.h"
+#include "rbbi_tbl.h"
+#include "unicode/brkiter.h"
+#include "filestrm.h"
+
 /**
 * <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
 * 
@ -173,17 +179,15 @@
 *
 * @author Richard Gillam
 */
-class RuleBasedBreakIterator {
-
-protected:
+class U_I18N_API RuleBasedBreakIterator : public BreakIterator {

+public:
    /**
     * A token used as a character-category value to identify ignore characters
     */
    static int8_t IGNORE;

 private:
-
    /**
     * The state number of the starting state
     */
@ -194,92 +198,130 @@ private:
     */
    static int16_t STOP_STATE;

-    /**
-     * The textual description this iterator was created from
-     */
-    UnicodeString description;
-
-    /**
-     * A table that indexes from character values to character category numbers
-     */
-    CompactByteArray charCategoryTable;
-
-    /**
-     * The table of state transitions used for forward iteration
-     */
-    int16_t* stateTable;
-
-    /**
-     * The table of state transitions used to sync up the iterator with the
-     * text in backwards and random-access iteration
-     */
-    int16_t* backwardsStateTable;
-
-    /**
-     * A list of flags indicating which states in the state table are accepting
-     * ("end") states
-     */
-    bool_t* endStates;
-
-    /**
-     * The number of character categories (and, thus, the number of columns in
-     * the state tables)
-     */
-    int32_t numCategories;
-
+protected:
    /**
     * The character iterator through which this BreakIterator accesses the text
     */
-    CharacterIterator text;
+    CharacterIterator* text;

+    /**
+     * The data tables this iterator uses to determine the break positions
+     */
+    RuleBasedBreakIteratorTables* tables;
+
+private:
+    /**
+     * Class ID
+     */
+    static char fgClassID;
+
+public:
    //=======================================================================
    // constructors
    //=======================================================================
-
-public:
+    
+// This constructor uses the udata interface to create a BreakIterator whose
+// internal tables live in a memory-mapped file.  "image" is a pointer to the
+// beginning of that file.
+RuleBasedBreakIterator(const void* image);

    /**
-     * Constructs a RuleBasedBreakIterator according to the description
-     * provided.  If the description is malformed, throws an
-     * IllegalArgumentException.  Normally, instead of constructing a
-     * RuleBasedBreakIterator directory, you'll use the factory methods
-     * on BreakIterator to create one indirectly from a description
-     * in the framework's resource files.  You'd use this when you want
-     * special behavior not provided by the built-in iterators.
+     * Copy constructor.  Will produce a break iterator with the same behavior,
+     * and which iterates over the same text, as the one passed in.
     */
-    RuleBasedBreakIterator(UnicodeString description);
+    RuleBasedBreakIterator(const RuleBasedBreakIterator& that);

    //=======================================================================
    // boilerplate
    //=======================================================================
-public:

    /**
-     * Clones this iterator.
-     * @return A newly-constructed RuleBasedBreakIterator with the same
-     * behavior as this one.
+     * Destructor
     */
-    virtual Object clone(void);
+    virtual ~RuleBasedBreakIterator();

    /**
-     * Returns true if both BreakIterators are of the same class, have the same
-     * rules, and iterate over the same text.
+     * Assignment operator.  Sets this iterator to have the same behavior,
+     * and iterate over the same text, as the one passed in.
     */
-    virtual bool_t equals(Object that);
+    RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);
+
+    /**
+     * Equality operator.  Returns TRUE if both BreakIterators are of the
+     * same class, have the same behavior, and iterate over the same text.
+     */
+    virtual bool_t operator==(const BreakIterator& that) const;
+
+    /**
+     * Not-equal operator.  If operator== returns TRUE, this returns FALSE,
+     * and vice versa.
+     */
+    bool_t operator!=(const BreakIterator& that) const;
+
+    /**
+     * Returns a newly-constructed RuleBasedBreakIterator with the same
+     * behavior, and iterating over the same text, as this one.
+     */
+    virtual BreakIterator* clone(void) const;
+
+    /**
+     * Compute a hash code for this BreakIterator
+     * @return A hash code
+     */
+    virtual int32_t hashCode() const;

    /**
     * Returns the description used to create this iterator
     */
-    virtual UnicodeString toString(void);
+    virtual const UnicodeString& getRules() const;

-    /**
-     * Compute a hashcode for this BreakIterator
-     * @return A hash code
-     */
-    virtual int32_t hashCode(void);
    //=======================================================================
    // BreakIterator overrides
    //=======================================================================
+
+    /**
+     * Return a CharacterIterator over the text being analyzed.  This version
+     * of this method returns the actual CharacterIterator we're using internally.
+     * Changing the state of this iterator can have undefined consequences.  If
+     * you need to change it, clone it first.
+     * @return An iterator over the text being analyzed.
+     */
+    virtual const CharacterIterator& getText() const;
+
+    /**
+     * Returns a newly-created CharacterIterator that the caller is to take
+     * ownership of.
+     * THIS FUNCTION SHOULD NOT BE HERE.  IT'S HERE BECAUSE BreakIterator DEFINES
+     * IT AS PURE VIRTUAL, FORCING RBBI TO IMPLEMENT IT.  IT SHOULD BE REMOVED
+     * FROM *BOTH* CLASSES.
+     */
+    virtual CharacterIterator* createText() const;
+
+    /**
+     * Set the iterator to analyze a new piece of text.  This function resets
+     * the current iteration position to the beginning of the text.
+     * @param newText An iterator over the text to analyze.  The BreakIterator
+     * takes ownership of the character iterator.  The caller MUST NOT delete it!
+     */
+    virtual void adoptText(CharacterIterator* newText);
+
+    /**
+     * Set the iterator to analyze a new piece of text.  This function resets
+     * the current iteration position to the beginning of the text.
+     * @param newText The text to analyze.
+     */
+    virtual void setText(const UnicodeString& newText);
+
+    /**
+     * Set the iterator to analyze a new piece of text.  This function resets
+     * the current iteration position to the beginning of the text.
+     * @param newText The text to analyze.
+     * THIS FUNCTION SHOULD NOT BE HERE.  IT'S HERE BECAUSE BreakIterator DEFINES
+     * IT AS PURE VIRTUAL, FORCING RBBI TO IMPLEMENT IT.  IT SHOULD BE REMOVED
+     * FROM *BOTH* CLASSES.
+     */
+    virtual void setText(const UnicodeString* newText);
+
    /**
     * Sets the current iteration position to the beginning of the text.
     * (i.e., the CharacterIterator's starting offset).
@ -346,28 +388,36 @@ public:
     * Returns the current iteration position.
     * @return The current iteration position.
     */
-    virtual int32_t current(void);
+    virtual int32_t current(void) const;

    /**
-     * Return a CharacterIterator over the text being analyzed.  This version
-     * of this method returns the actual CharacterIterator we're using internally.
-     * Changing the state of this iterator can have undefined consequences.  If
-     * you need to change it, clone it first.
-     * @return An iterator over the text being analyzed.
+     * Returns a unique class ID POLYMORPHICALLY.  Pure virtual override.
+     * This method is to implement a simple version of RTTI, since not all
+     * C++ compilers support genuine RTTI.  Polymorphic operator==() and
+     * clone() methods call this method.
+     *
+     * @return          The class ID for this object. All objects of a
+     *                  given class have the same class ID.  Objects of
+     *                  other classes have different class IDs.
     */
-    virtual CharacterIterator getText(void);
+    virtual UClassID getDynamicClassID() const;

    /**
-     * Set the iterator to analyze a new piece of text.  This function resets
-     * the current iteration position to the beginning of the text.
-     * @param newText An iterator over the text to analyze.
+     * Returns the class ID for this class.  This is useful only for
+     * comparing to a return value from getDynamicClassID().  For example:
+     *
+     *      Base* polymorphic_pointer = createPolymorphicObject();
+     *      if (polymorphic_pointer->getDynamicClassID() ==
+     *          Derived::getStaticClassID()) ...
+     *
+     * @return          The class ID for all objects of this class.
     */
-    virtual void setText(CharacterIterator newText);
+    static UClassID getStaticClassID();
+
+protected:
    //=======================================================================
    // implementation
    //=======================================================================
-protected:
-
    /**
     * This method is the actual implementation of the next() method.  All iteration
     * vectors through here.  This method initializes the state machine to state 1
@ -387,22 +437,33 @@ protected:
    virtual int32_t handlePrevious(void);

    /**
-     * Looks up a character's category (i.e., its category for breaking purposes,
-     * not its Unicode category)
+     * Dumps caches and performs other actions associated with a complete change
+     * in text or iteration position.  This function is a no-op in RuleBasedBreakIterator,
+     * but subclasses can and do override it.
     */
-    virtual int32_t lookupCategory(UChar c);
+    virtual void reset();
+
+private:

    /**
-     * Given a current state and a character category, looks up the
-     * next state to transition to in the state table.
+     * Constructs a RuleBasedBreakIterator that uses the already-created
+     * tables object that is passed in as a parameter.
     */
-    virtual int32_t lookupState(int32_t state, int32_t category);
+    RuleBasedBreakIterator(RuleBasedBreakIteratorTables* tables);

-    /**
-     * Given a current state and a character category, looks up the
-     * next state to transition to in the backwards state table.
-     */
-    virtual int32_t lookupBackwardState(int32_t state, int32_t category);
+    friend class BreakIterator;
 };

+// Inequality is defined purely as the negation of the (virtual) operator==.
+inline bool_t RuleBasedBreakIterator::operator!=(const BreakIterator& that) const {
+    const bool_t same = this->operator==(that);
+    return !same;
+}
+
+// The polymorphic class ID of this class is the same value as its static class ID.
+inline UClassID RuleBasedBreakIterator::getDynamicClassID() const {
+    return getStaticClassID();
+}
+
+// The address of the private static member fgClassID serves as the class's ID.
+inline UClassID RuleBasedBreakIterator::getStaticClassID() {
+    return (UClassID)&fgClassID;
+}
+
 #endif
--- a/icu4c/source/i18n/rbbi_bld.cpp
+++ b/icu4c/source/i18n/rbbi_bld.cpp
--- a/icu4c/source/i18n/rbbi_bld.h
+++ b/icu4c/source/i18n/rbbi_bld.h
@ -2,8 +2,7 @@
 * Copyright © {1999}, International Business Machines Corporation and others. All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
-*   10/22/99    alan        Creation.  This is an internal header; it
-*                           shall not be exported.
+*   12/15/99    rgillam     Port from Java.
 **********************************************************************
 */

@ -11,9 +10,12 @@
 #define RBBI_BLD_H

 #include "rbbi.h"
+#include "rbbi_tbl.h"
 #include "unicode/uniset.h"
 #include "uvector.h"

+class ExpressionList;
+
 //=======================================================================
 // RuleBasedBreakIterator.Builder
 //=======================================================================
@ -42,18 +44,37 @@
 class RuleBasedBreakIteratorBuilder {

 protected:
+    /**
+     * The iterator we're constructing.
+     */
+    RuleBasedBreakIterator& iterator;

+    /**
+     * The tables object for the iterator we're constructing.
+     */
+    RuleBasedBreakIteratorTables* tables;
+
+    /**
+     * A temporary place to hold the rules as they're being processed.
+     */
+    UVector tempRuleList;
+    
    /**
     * A temporary holding place used for calculating the character categories.
     * This object contains UnicodeSet objects.
     */
    UVector categories;

+    /**
+     * The number of categories (and thus the number of columns in the finished state tables)
+     */
+    int32_t numCategories;
+
    /**
     * A table used to map parts of regexp text to lists of character categories,
     * rather than having to figure them out from scratch each time
     */
-    Hashtable expressions;
+    ExpressionList* expressions;

    /**
     * A temporary holding place for the list of ignore characters
@ -104,18 +125,56 @@ protected:
     */
    bool_t clearLoopingStates;

+    /**
+     * A place where an error message can be stored if we get a parse error.
+     * The error message is never displayed anywhere, so this is useful pretty
+     * much only in conjunction with a debugger.
+     */
+    UnicodeString errorMessage;
+
+    /**
+     * A bit mask used to indicate a bit in the table's flags column that marks a
+     * state as an accepting state.
+     */
+    static const int32_t END_STATE_FLAG /*= 0x8000*/;
+
+    /**
+     * A bit mask used to indicate a bit in the table's flags column that marks a
+     * state as one the builder shouldn't loop to any looping states
+     */
+    static const int32_t DONT_LOOP_FLAG /*= 0x4000*/;
+
+    /**
+     * A bit mask used to indicate a bit in the table's flags column that marks a
+     * state as a lookahead state.
+     */
+    static const int32_t LOOKAHEAD_STATE_FLAG /*= 0x2000*/;
+
+    /**
+     * A bit mask representing the union of the mask values listed above.
+     * Used for clearing or masking off the flag bits.
+     */
+    static const int32_t ALL_FLAGS /*= END_STATE_FLAG | LOOKAHEAD_STATE_FLAG
+            | DONT_LOOP_FLAG*/;
+
 public:

    /**
-     * No special construction is required for the Builder.
+     * The Builder class contains a reference to the iterator it's supposed to build.
     */
-    RuleBasedBreakIteratorBuilder();
+    RuleBasedBreakIteratorBuilder(RuleBasedBreakIterator& iteratorToBuild);
+
+    /**
+     * Destructor.
+     */
+    ~RuleBasedBreakIteratorBuilder();

    /**
     * This is the main function for setting up the BreakIterator's tables.  It
-     * just UVectors different parts of the job off to other functions.
+     * just vectors different parts of the job off to other functions.
     */
-    virtual void buildBreakIterator(void);
+    virtual void buildBreakIterator(const UnicodeString&    description,
+                                    UErrorCode& err);

 private:

@ -127,7 +186,8 @@ private:
     * <li>Perform variable-name substitutions (so that no one else sees variable names)
     * </ul>
     */
-    virtual UVector buildRuleList(UnicodeString description);
+    virtual void buildRuleList(UnicodeString& description,
+                               UErrorCode& err);

 protected:

@ -138,8 +198,11 @@ protected:
     * find-and-replace of the variable name with its text.  (The variable text
     * must be enclosed in either [] or () for this to work.)
     */
-    virtual UnicodeString processSubstitution(UnicodeString substitutionRule, UnicodeString description,
-                    int32_t startPos);
+    virtual void processSubstitution(UnicodeString& description,
+                                     UTextOffset ruleStart,
+                                     UTextOffset ruleEnd,
+                                     UTextOffset startPos,
+                                     UErrorCode& err);

    /**
     * This function defines a protocol for handling substitution names that
@ -150,8 +213,17 @@ protected:
     * that which is done by the normal substitution-processing code is done
     * here.
     */
-    virtual void handleSpecialSubstitution(UnicodeString replace, UnicodeString replaceWith,
-                int32_t startPos, UnicodeString description);
+    virtual void handleSpecialSubstitution(const UnicodeString& replace,
+                                           const UnicodeString& replaceWith,
+                                           int32_t startPos,
+                                           const UnicodeString& description,
+                                           UErrorCode& err);
+
+    /**
+     * This function provides a hook for subclasses to mess with the character
+     * category table.
+     */
+    virtual void mungeExpressionList();

    /**
     * This function builds the character category table.  On entry,
@ -161,7 +233,7 @@ protected:
     * character category numbers everywhere a literal character or a [] expression
     * originally occurred.
     */
-    virtual void buildCharCategories(UVector tempRuleList);
+    virtual void buildCharCategories(UErrorCode& err);

 private:

@ -170,7 +242,7 @@ private:
     * work is done in parseRule(), which is called once for each rule in the
     * description.
     */
-    virtual void buildStateTable(UVector tempRuleList);
+    virtual void buildStateTable(UErrorCode& err);

    /**
     * This is where most of the work really happens.  This routine parses a single
@ -179,7 +251,8 @@ private:
     * throughout the whole operation, although some ugly postprocessing is needed
     * to handle the *? token.
     */
-    virtual void parseRule(UnicodeString rule, bool_t forward);
+    virtual void parseRule(const UnicodeString& rule,
+                           bool_t               forward);

    /**
     * Update entries in the state table, and merge states when necessary to keep
@ -189,9 +262,9 @@ private:
     * list of the columns that need updating.
     * @param newValue Update the cells specfied above to contain this value
     */
-    virtual void updateStateTable(UVector rows,
-                                  UnicodeString pendingChars,
-                                  int16_t newValue);
+    virtual void updateStateTable(const UVector&       rows,
+                                  const UnicodeString& pendingChars,
+                                  int16_t              newValue);

    /**
     * The real work of making the state table deterministic happens here.  This function
@ -213,9 +286,9 @@ private:
     * (itself a copy of the decision point list from parseRule()).  Newly-created
     * states get added to the decision point list if their "parents" were on it.
     */
-    virtual void mergeStates(int32_t rowNum,
+    virtual void mergeStates(int32_t  rowNum,
                             int16_t* newValues,
-                             UVector rowsBeingUpdated);
+                             const UVector& rowsBeingUpdated);

    /**
     * The merge list is a list of pairs of rows that have been merged somewhere in
@ -236,7 +309,8 @@ private:
     * @param endStates The list of states to treat as end states (states that
     * can exit the loop).
     */
-    virtual void setLoopingStates(UVector newLoopingStates, UVector endStates);
+    virtual void setLoopingStates(const UVector* newLoopingStates,
+                                  const UVector& endStates);

    /**
     * This removes "ending states" and states reachable from them from the
@ -264,7 +338,7 @@ private:
     * table and any additional rules (identified by the ! on the front)
     * supplied in the description
     */
-    virtual void buildBackwardsStateTable(UVector tempRuleList);
+    virtual void buildBackwardsStateTable(UErrorCode& err);

 protected:

@ -276,7 +350,9 @@ protected:
     * discovered
     * @param context The string containing the error
     */
-    virtual void error(UnicodeString message, int32_t position, UnicodeString context);
+    virtual void setUpErrorMessage(const UnicodeString& message,
+                                   int32_t position,
+                                   const UnicodeString& context);
 };

 #endif
--- a/icu4c/source/i18n/rbbi_tbl.cpp
+++ b/icu4c/source/i18n/rbbi_tbl.cpp
@ -0,0 +1,128 @@
+/*
+**********************************************************************
+*   Copyright (C) 1999 IBM Corp. All rights reserved.
+**********************************************************************
+*   Date        Name        Description
+*   11/11/99    rgillam     Complete port from Java.
+**********************************************************************
+*/
+
+#include "rbbi_tbl.h"
+#include "cmemory.h"
+
+//=======================================================================
+// constructor
+//=======================================================================
+
+/**
+ * Constructs a tables object on top of a pre-built memory image.  Nothing
+ * is copied out of the image: the table pointers set up here point into
+ * it, and ownTables is FALSE so the destructor does not delete them.
+ *
+ * @param image Start of the memory image.  It begins with an index whose
+ * first entry is the number of categories and whose next seven entries are
+ * byte offsets, measured from the start of the image, of the description,
+ * the two arrays handed to ucmp8_openAdopt(), the forward state table, the
+ * backwards state table, the end-state flags and the lookahead-state flags.
+ * NOTE(review): the index is read as 32-bit entries, which matches what the
+ * previous pointer-based read produced on 32-bit platforms -- confirm
+ * against the tool that writes the image.
+ */
+RuleBasedBreakIteratorTables::RuleBasedBreakIteratorTables(const void* image)
+: refCount(0),
+  ownTables(FALSE)
+{
+    // Read the index as int32_t values rather than as pointers: the entries
+    // are a count and offsets, not addresses, and indexing through a
+    // pointer-sized type (then truncating to int32_t) picks up the wrong
+    // entries wherever pointers are wider than 32 bits.
+    const int32_t* im = (const int32_t*)(image);
+    const int8_t* base = (const int8_t*)(image);
+
+    // the memory image begins with an index that gives the offsets into the
+    // image for each of the fields in the BreakIteratorTables object--
+    // use those to initialize the tables object (it will end up pointing
+    // into the memory image for everything)
+    numCategories = im[0];
+    description = UnicodeString(TRUE, (UChar*)(im[1] + base), -1);
+    charCategoryTable = ucmp8_openAdopt((uint16_t*)(im[2] + base),
+            (int8_t*)(im[3] + base), 0);
+    stateTable = (int16_t*)(im[4] + base);
+    backwardsStateTable = (int16_t*)(im[5] + base);
+    endStates = (int8_t*)(im[6] + base);
+    lookaheadStates = (int8_t*)(im[7] + base);
+}
+
+/**
+ * Constructs an empty tables object that owns its tables (ownTables is
+ * TRUE).  A RuleBasedBreakIteratorBuilder is expected to fill in the
+ * members afterwards.
+ */
+RuleBasedBreakIteratorTables::RuleBasedBreakIteratorTables()
+: refCount(0),
+  ownTables(TRUE),
+  charCategoryTable(0),
+  stateTable(0),
+  backwardsStateTable(0),
+  endStates(0),
+  lookaheadStates(0),
+  numCategories(0)
+{
+    // The table pointers are explicitly nulled above: raw pointers are not
+    // initialized implicitly, and because ownTables is TRUE the destructor
+    // will delete [] them even if the builder never got as far as
+    // filling them in.
+}
+
+//=======================================================================
+// boilerplate
+//=======================================================================
+
+/**
+ * Destructor.  What gets released depends on ownTables: when it is FALSE
+ * only the CompactByteArray object itself is freed; when it is TRUE the
+ * four table arrays are deleted and the category table is closed.
+ */
+RuleBasedBreakIteratorTables::~RuleBasedBreakIteratorTables() {
+    if (!ownTables) {
+        // free just the object, without going through ucmp8_close()
+        uprv_free(charCategoryTable);
+        return;
+    }
+
+    delete [] stateTable;
+    delete [] backwardsStateTable;
+    delete [] endStates;
+    delete [] lookaheadStates;
+    ucmp8_close(charCategoryTable);
+}
+
+/**
+ * Equality operator.  Two tables objects compare equal when the rule
+ * descriptions they hold are equal; no other member is examined.
+ */
+bool_t
+RuleBasedBreakIteratorTables::operator==(const RuleBasedBreakIteratorTables& that) const {
+    return this->description == that.description;
+}
+
+/**
+ * Computes a hash code for these tables.  The value is the hash code of
+ * the rule description.
+ * @return A hash code
+ */
+int32_t
+RuleBasedBreakIteratorTables::hashCode() const {
+    const int32_t descriptionHash = description.hashCode();
+    return descriptionHash;
+}
+
+//=======================================================================
+// implementation
+//=======================================================================
+/**
+ * Maps a character to its break category (the category used for breaking
+ * purposes, not the character's Unicode category) by looking it up in
+ * charCategoryTable.  The BreakIterator argument is not used here.
+ */
+int32_t
+RuleBasedBreakIteratorTables::lookupCategory(UChar c, BreakIterator* ignored) const {
+    const int32_t category = ucmp8_get(charCategoryTable, c);
+    return category;
+}
+
+/**
+ * Returns the forward state table entry for the given current state and
+ * character category, i.e. the next state to transition to.  The table is
+ * indexed as a flat array with numCategories entries per state.
+ */
+int32_t
+RuleBasedBreakIteratorTables::lookupState(int32_t state, int32_t category) const {
+    const int32_t cell = state * numCategories + category;
+    return stateTable[cell];
+}
+
+/**
+ * Returns the backwards state table entry for the given current state and
+ * character category, i.e. the next state to transition to.  The table is
+ * indexed as a flat array with numCategories entries per state.
+ */
+int32_t
+RuleBasedBreakIteratorTables::lookupBackwardState(int32_t state, int32_t category) const {
+    const int32_t cell = state * numCategories + category;
+    return backwardsStateTable[cell];
+}
+
+/**
+ * Returns the endStates flag for the specified state; a nonzero flag
+ * marks an accepting state.
+ */
+bool_t
+RuleBasedBreakIteratorTables::isEndState(int32_t state) const {
+    const int8_t flag = endStates[state];
+    return flag;
+}
+
+/**
+ * Returns the lookaheadStates flag for the specified state; a nonzero
+ * flag marks a lookahead state.
+ */
+bool_t
+RuleBasedBreakIteratorTables::isLookaheadState(int32_t state) const {
+    const int8_t flag = lookaheadStates[state];
+    return flag;
+}
--- a/icu4c/source/i18n/rbbi_tbl.h
+++ b/icu4c/source/i18n/rbbi_tbl.h
@ -0,0 +1,213 @@
+/*
+**********************************************************************
+*   Copyright (C) 1999 IBM Corp. All rights reserved.
+**********************************************************************
+*   Date        Name        Description
+*   11/11/99    rgillam     Complete port from Java.
+**********************************************************************
+*/
+
+#ifndef RBBI_TBL_H
+#define RBBI_TBL_H
+
+#include "ucmp8.h"
+#include "utypes.h"
+#include "unistr.h"
+#include "unicode/brkiter.h"
+#include "filestrm.h"
+
+/**
+ * This class contains the internal static tables that are used by the
+ * RuleBasedBreakIterator.  Once created, these tables are immutable,
+ * so they can be shared among all break iterators using a particular
+ * set of rules.  This class uses a reference-counting scheme to
+ * manage the sharing.
+ *
+ * Nothing in this class is public: every member is private or protected,
+ * and the class is used through the friend classes declared at the end
+ * of the class body (RuleBasedBreakIterator and
+ * DictionaryBasedBreakIterator).
+ *
+ * @author Richard Gillam
+ */
+class RuleBasedBreakIteratorTables {
+
+private:
+    /**
+     * The number of RuleBasedBreakIterators using this object.
+     * Incremented by addReference() and decremented by removeReference().
+     */
+    int16_t refCount;
+
+protected:
+    /**
+     * Whether or not we own the storage for the tables (the tables may be
+     * stored in a memory-mapped file).  When this is TRUE the destructor
+     * deletes the table arrays; when it is FALSE it does not.
+     */
+    bool_t ownTables;
+
+private:
+    /**
+     * The textual description that was used to create these tables.
+     * operator==() and hashCode() are both based on this member alone.
+     */
+    UnicodeString description;
+
+    /**
+     * A table that indexes from character values to character category numbers
+     */
+    CompactByteArray* charCategoryTable;
+
+    /**
+     * The table of state transitions used for forward iteration.
+     * Stored as a flat array with numCategories entries per state.
+     */
+    int16_t* stateTable;
+
+    /**
+     * The table of state transitions used to sync up the iterator with the
+     * text in backwards and random-access iteration.
+     * Stored as a flat array with numCategories entries per state.
+     */
+    int16_t* backwardsStateTable;
+
+    /**
+     * A list of flags indicating which states in the state table are accepting
+     * ("end") states
+     */
+    int8_t* endStates;
+
+    /**
+     * A list of flags indicating which states in the state table are
+     * lookahead states (states which turn lookahead on and off)
+     */
+    int8_t* lookaheadStates;
+
+    /**
+     * The number of character categories (and, thus, the number of columns in
+     * the state tables)
+     */
+    int32_t numCategories;
+
+    //=======================================================================
+    // constructor
+    //=======================================================================
+
+    /**
+     * Constructors.  The default constructor creates a tables object that
+     * owns its tables (ownTables is TRUE) and relies on a
+     * RuleBasedBreakIteratorBuilder to fill in the members.  The
+     * constructor taking a memory image creates a tables object whose
+     * tables point into that image (ownTables is FALSE).
+     */
+protected:
+    RuleBasedBreakIteratorTables();
+    
+    RuleBasedBreakIteratorTables(const void* image);
+
+private:
+    /**
+     * The copy constructor is declared private so that this class cannot
+     * be copied; rbbi_tbl.cpp provides no definition for it.
+     * THIS CLASS MAY NOT BE COPIED.
+     */
+    RuleBasedBreakIteratorTables(const RuleBasedBreakIteratorTables& that);
+
+    //=======================================================================
+    // boilerplate
+    //=======================================================================
+
+protected:
+    /**
+     * Destructor.  Protected: objects of this class are deleted from
+     * within removeReference() when the reference count drops to zero.
+     */
+    virtual ~RuleBasedBreakIteratorTables();
+
+private:
+    /**
+     * The assignment operator is declared private so that this class
+     * cannot be assigned; rbbi_tbl.cpp provides no definition for it.
+     * THIS CLASS MAY NOT BE COPIED.
+     */
+    RuleBasedBreakIteratorTables& operator=(const RuleBasedBreakIteratorTables& that);
+
+    /**
+     * Equality operator.  Returns TRUE if both tables objects have equal
+     * rule descriptions; no other member is compared.
+     */
+    virtual bool_t operator==(const RuleBasedBreakIteratorTables& that) const;
+
+    /**
+     * Not-equal operator.  If operator== returns TRUE, this returns FALSE,
+     * and vice versa.
+     */
+    bool_t operator!=(const RuleBasedBreakIteratorTables& that) const;
+
+    /**
+     * Compute a hash code for these tables (the hash code of the rule
+     * description).
+     * @return A hash code
+     */
+    virtual int32_t hashCode() const;
+
+    /**
+     * Returns the description used to create these tables
+     */
+    const UnicodeString& getRules() const;
+
+    //=======================================================================
+    // reference counting
+    //=======================================================================
+    
+    /**
+     * increments the reference count.
+     */
+    void addReference();
+
+    /**
+     * decrements the reference count and deletes the object if it reaches zero
+     * (or goes below zero).  The object must not be touched after this call
+     * unless the caller knows other references remain.
+     */
+    void removeReference();
+
+protected:
+    //=======================================================================
+    // implementation
+    //=======================================================================
+    /**
+     * Looks up a character's category (i.e., its category for breaking purposes,
+     * not its Unicode category).  The implementation in this class does not
+     * use the BreakIterator argument.
+     */
+    virtual int32_t lookupCategory(UChar c, BreakIterator* bi) const;
+
+    /**
+     * Given a current state and a character category, looks up the
+     * next state to transition to in the state table.
+     */
+    virtual int32_t lookupState(int32_t state, int32_t category) const;
+
+    /**
+     * Given a current state and a character category, looks up the
+     * next state to transition to in the backwards state table.
+     */
+    virtual int32_t lookupBackwardState(int32_t state, int32_t category) const;
+
+    /**
+     * Returns true if the specified state is an accepting state.
+     */
+    virtual bool_t isEndState(int32_t state) const;
+
+    /**
+     * Returns true if the specified state is a lookahead state.
+     */
+    virtual bool_t isLookaheadState(int32_t state) const;
+
+    friend class RuleBasedBreakIterator;
+    friend class DictionaryBasedBreakIterator;
+};
+
+/**
+ * Not-equal operator: the logical negation of operator==.
+ */
+inline bool_t
+RuleBasedBreakIteratorTables::operator!=(const RuleBasedBreakIteratorTables& that) const {
+    const bool_t same = operator==(that);
+    return !same;
+}
+
+/**
+ * Returns a reference to the rule description held by this object.
+ */
+inline const UnicodeString&
+RuleBasedBreakIteratorTables::getRules() const {
+    return this->description;
+}
+
+/**
+ * Records one more user of this tables object.
+ */
+inline void
+RuleBasedBreakIteratorTables::addReference() {
+    refCount++;
+}
+
+/**
+ * Records one fewer user of this tables object, and deletes the object
+ * once the count is no longer positive.
+ */
+inline void
+RuleBasedBreakIteratorTables::removeReference() {
+    --refCount;
+    if (refCount > 0)
+        return;
+
+    // that was the last reference
+    delete this;
+}
+
+#endif
--- a/icu4c/source/i18n/ubrk.cpp
+++ b/icu4c/source/i18n/ubrk.cpp
@ -79,6 +79,31 @@ ubrk_close(UBreakIterator *bi)
  delete (BreakIterator*) bi;
 }

+/**
+ * Points an existing break iterator at a new piece of text.
+ *
+ * @param bi         The break iterator to modify.
+ * @param text       The new text.  May be null only when textLength is 0.
+ * @param textLength The number of UChars in text, or -1 if text is
+ *                   NUL-terminated (the length is then taken with u_strlen).
+ * @param status     Error code.  Nothing is done if it already indicates a
+ *                   failure.  Set to U_ILLEGAL_ARGUMENT_ERROR for a null
+ *                   iterator, a null text with a nonzero length, or a length
+ *                   below -1; set to U_MEMORY_ALLOCATION_ERROR if a new
+ *                   character iterator is needed and cannot be allocated.
+ */
+U_CAPI void
+ubrk_setText(UBreakIterator* bi,
+             const UChar*    text,
+             int32_t         textLength,
+             UErrorCode*     status)
+{
+  // a null status cannot be reported through, so just bail out
+  if (status == 0 || U_FAILURE(*status)) return;
+
+  // everything below dereferences bi and (for a nonzero length) text
+  if (bi == 0 || textLength < -1 || (text == 0 && textLength != 0)) {
+    *status = U_ILLEGAL_ARGUMENT_ERROR;
+    return;
+  }
+
+  BreakIterator* breakIter = (BreakIterator*)bi;
+  const CharacterIterator& biText = breakIter->getText();
+
+  int32_t textLen = (textLength == -1 ? u_strlen(text) : textLength);
+  if (biText.getDynamicClassID() == UCharCharacterIterator::getStaticClassID()) {
+      // the iterator already walks a UChar array: retarget it in place
+      // instead of allocating a replacement
+      ((UCharCharacterIterator&)biText).setText(text, textLen);
+  }
+  else {
+      // some other kind of character iterator: hand the break iterator
+      // a fresh one over the new text
+      UCharCharacterIterator *iter = new UCharCharacterIterator(text, textLen);
+      if(iter == 0) {
+        *status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+      }
+      breakIter->adoptText(iter);
+  }
+}
+
 U_CAPI UTextOffset
 ubrk_current(const UBreakIterator *bi)
 {
--- a/icu4c/source/i18n/unicode/brkiter.h
+++ b/icu4c/source/i18n/unicode/brkiter.h
@ -177,53 +177,73 @@ public:
     * BreakIterator, as the argument.  Text is considered the same if
     * it contains the same characters, it need not be the same
     * object, and styles are not considered.
+     * @stable
     */
    virtual bool_t operator==(const BreakIterator&) const = 0;

+    /**
+     * Returns the complement of the result of operator==
+     * @stable
+     */
    bool_t operator!=(const BreakIterator& rhs) const { return !operator==(rhs); }

    /**
     * Return a polymorphic copy of this object.  This is an abstract
     * method which subclasses implement.
+     * @stable
     */
    virtual BreakIterator* clone(void) const = 0;

    /**
     * Return a polymorphic class ID for this object. Different subclasses
     * will return distinct unequal values.
+     * @stable
     */
    virtual UClassID getDynamicClassID(void) const = 0;

+    /**
+     * Return a CharacterIterator over the text being analyzed.
+     * @draft
+     */
+    virtual const CharacterIterator& getText() const = 0;
+
    /**
     * Get the text for which this object is finding the boundaries.
+     * @draft
     */
    virtual CharacterIterator* createText(void) const = 0;

    /**
     * Change the text over which this operates. The text boundary is
     * reset to the start.
+     * [This function should be modified to take a const UnicodeString& argument.]
+     * @deprecated
     */
    virtual void  setText(const UnicodeString* it) = 0;

    /**
     * Change the text over which this operates. The text boundary is
     * reset to the start.
+     * @stable
     */
    virtual void  adoptText(CharacterIterator* it) = 0;

    /**
     * DONE is returned by previous() and next() after all valid
     * boundaries have been returned.
+     * @stable
     */
    static const UTextOffset DONE;

    /**
     * Return the index of the first character in the text being scanned.
+     * @stable
     */
    virtual UTextOffset first(void) = 0;

    /**
     * Return the index immediately BEYOND the last character in the text being scanned.
+     * @stable
     */
    virtual UTextOffset last(void) = 0;

@ -231,6 +251,7 @@ public:
     * Return the boundary preceding the current boundary.
     * @return The character index of the previous text boundary or DONE if all
     * boundaries have been returned.
+     * @stable
     */
    virtual UTextOffset previous(void) = 0;

@ -238,6 +259,7 @@ public:
     * Return the boundary following the current boundary.
     * @return The character index of the next text boundary or DONE if all
     * boundaries have been returned.
+     * @stable
     */
    virtual UTextOffset next(void) = 0;

@ -245,6 +267,7 @@ public:
     * Return character index of the text boundary that was most recently
     * returned by next(), previous(), first(), or last()
     * @return The boundary most recently returned.
+     * @stable
     */
    virtual UTextOffset current(void) const = 0;

@ -254,6 +277,7 @@ public:
     * the value BreakIterator.DONE
     * @param offset the offset to begin scanning.
     * @return The first boundary after the specified offset.
+     * @stable
     */
    virtual UTextOffset following(UTextOffset offset) = 0;

@ -263,6 +287,7 @@ public:
     * the value BreakIterator.DONE
     * @param offset the offset to begin scanning.
     * @return The first boundary before the specified offset.
+     * @stable
     */
    virtual UTextOffset preceding(UTextOffset offset) = 0;
 
@ -270,6 +295,7 @@ public:
     * Return true if the specfied position is a boundary position.
     * @param offset the offset to check.
     * @return True if "offset" is a boundary position.
+     * @stable
     */
    virtual bool_t isBoundary(UTextOffset offset) = 0;

@ -280,6 +306,7 @@ public:
     * and positive values move to later boundaries.
     * @return The index of the nth boundary from the current position, or
     * DONE if there are fewer than |n| boundaries in the specfied direction.
+     * @stable
     */
    virtual UTextOffset next(int32_t n) = 0;

@ -290,6 +317,7 @@ public:
     * @param where the locale. If a specific WordBreak is not
     * avaliable for the specified locale, a default WordBreak is returned.
     * @return A BreakIterator for word-breaks
+     * @stable
     */
    static BreakIterator* createWordInstance(const Locale& where = Locale::getDefault());

@ -302,6 +330,7 @@ public:
     * @param where the locale. If a specific LineBreak is not
     * avaliable for the specified locale, a default LineBreak is returned.
     * @return A BreakIterator for line-breaks
+     * @stable
     */
    static BreakIterator* createLineInstance(const Locale& where = Locale::getDefault());

@ -312,6 +341,7 @@ public:
     * @param where the locale. If a specific character break is not
     * avaliable for the specified locale, a default character break is returned.
     * @return A BreakIterator for character-breaks
+     * @stable
     */
    static BreakIterator* createCharacterInstance(const Locale& where = Locale::getDefault());

@ -321,6 +351,7 @@ public:
     * @param where the locale. If a specific SentenceBreak is not
     * avaliable for the specified locale, a default SentenceBreak is returned.
     * @return A BreakIterator for sentence-breaks
+     * @stable
     */
    static BreakIterator* createSentenceInstance(const Locale& where = Locale::getDefault());

@ -328,6 +359,7 @@ public:
     * Get the set of Locales for which TextBoundaries are installed
     * @param count the output parameter of number of elements in the locale list
     * @return available locales
+     * @stable
     */
    static const Locale* getAvailableLocales(int32_t& count);

@ -338,6 +370,7 @@ public:
     * @param name the fill-in parameter of the return value
     * Uses best match.
     * @return user-displayable name
+     * @stable
     */
    static UnicodeString& getDisplayName(const Locale& objectLocale,
                                         const Locale& displayLocale,
@ -349,6 +382,7 @@ public:
     * @param objectLocale must be from getMatchingLocales
     * @param name the fill-in parameter of the return value
     * @return user-displayable name
+     * @stable
     */
    static UnicodeString& getDisplayName(const Locale& objectLocale,
                                         UnicodeString& name);
--- a/icu4c/source/i18n/unicode/ubrk.h
+++ b/icu4c/source/i18n/unicode/ubrk.h
@ -178,6 +178,7 @@ typedef enum UBreakIteratorType UBreakIteratorType;
 * @param status A UErrorCode to receive any errors.
 * @return A UBreakIterator for the specified locale.
 * @see ubrk_openRules
+ * @stable
 */
 U_CAPI UBreakIterator*
 ubrk_open(UBreakIteratorType type,
@ -196,6 +197,7 @@ ubrk_open(UBreakIteratorType type,
 * @param status A UErrorCode to receive any errors.
 * @return A UBreakIterator for the specified rules.
 * @see ubrk_open
+ * @stable
 */
 U_CAPI UBreakIterator*
 ubrk_openRules(const UChar *rules,
@ -208,16 +210,28 @@ ubrk_openRules(const UChar *rules,
 * Close a UBreakIterator.
 * Once closed, a UBreakIterator may no longer be used.
 * @param bi The break iterator to close.
+ * @stable
 */
 U_CAPI void
 ubrk_close(UBreakIterator *bi);

+/**
+ * Sets an existing iterator to point to a new piece of text.
+ * NOTE(review): when the iterator is already working on a UChar array the
+ * implementation retargets it with UCharCharacterIterator::setText(), which
+ * stores the pointer it is given, so the text buffer presumably has to
+ * stay valid for as long as the iterator uses it -- confirm.
+ *
+ * @param bi The break iterator to use.
+ * @param text The new text to be analyzed.
+ * @param textLength The number of UChars in text, or -1 if text is
+ * NUL-terminated.
+ * @param status A UErrorCode to receive any errors.  If it already
+ * indicates a failure on entry the function does nothing; it is set to
+ * U_MEMORY_ALLOCATION_ERROR if a new character iterator is needed and
+ * cannot be allocated.
+ * @stable
+ */
+U_CAPI void
+ubrk_setText(UBreakIterator* bi,
+             const UChar*    text,
+             int32_t         textLength,
+             UErrorCode*     status);
+
 /**
 * Determine the most recently-returned text boundary.
 * 
 * @param bi The break iterator to use.
 * @return The character index most recently returned by \Ref{ubrk_next}, \Ref{ubrk_previous}, 
 * \Ref{ubrk_first}, or \Ref{ubrk_last}.
+ * @stable
 */
 U_CAPI UTextOffset
 ubrk_current(const UBreakIterator *bi);
@ -229,6 +243,7 @@ ubrk_current(const UBreakIterator *bi);
 * @return The character index of the next text boundary, or UBRK_DONE
 * if all text boundaries have been returned.
 * @see ubrk_previous
+ * @stable
 */
 U_CAPI UTextOffset
 ubrk_next(UBreakIterator *bi);
@ -240,6 +255,7 @@ ubrk_next(UBreakIterator *bi);
 * @return The character index of the preceding text boundary, or UBRK_DONE
 * if all text boundaries have been returned.
 * @see ubrk_next
+ * @stable
 */
 U_CAPI UTextOffset
 ubrk_previous(UBreakIterator *bi);
@ -250,6 +266,7 @@ ubrk_previous(UBreakIterator *bi);
 * @param bi The break iterator to use.
 * @return The character index of the first character in the text being scanned.
 * @see ubrk_last
+ * @stable
 */
 U_CAPI UTextOffset
 ubrk_first(UBreakIterator *bi);
@ -262,6 +279,7 @@ ubrk_first(UBreakIterator *bi);
 * @return The character offset immediately <EM>beyond</EM> the last character in the
 * text being scanned.
 * @see ubrk_first
+ * @stable
 */
 U_CAPI UTextOffset
 ubrk_last(UBreakIterator *bi);
@ -273,6 +291,7 @@ ubrk_last(UBreakIterator *bi);
 * @param offset The offset to begin scanning.
 * @return The text boundary preceding offset, or UBRK_DONE.
 * @see ubrk_following
+ * @stable
 */
 U_CAPI UTextOffset
 ubrk_preceding(UBreakIterator *bi,
@ -285,6 +304,7 @@ ubrk_preceding(UBreakIterator *bi,
 * @param offset The offset to begin scanning.
 * @return The text boundary following offset, or UBRK_DONE.
 * @see ubrk_preceding
+ * @stable
 */
 U_CAPI UTextOffset
 ubrk_following(UBreakIterator *bi,
@ -297,6 +317,7 @@ ubrk_following(UBreakIterator *bi,
 * @param index The index of the desired locale.
 * @return A locale for which number text breaking information is available, or 0 if none.
 * @see ubrk_countAvailable
+* @stable
 */
 U_CAPI const char*
 ubrk_getAvailable(int32_t index);
@ -307,6 +328,7 @@ ubrk_getAvailable(int32_t index);
 * calls to \Ref{ubrk_getAvailable}.
 * @return The number of locales for which text breaking information is available.
 * @see ubrk_getAvailable
+* @stable
 */
 U_CAPI int32_t
 ubrk_countAvailable(void);
--- a/icu4c/source/test/cintltst/cregrtst.c
+++ b/icu4c/source/test/cintltst/cregrtst.c
@ -1023,6 +1023,7 @@ AllocateTextBoundary();

    /* in addition to the other invariants, a line-break iterator should make sure that:
       it doesn't break around the non-breaking characters */
+    e = ubrk_open(UBRK_LINE, "en_US", work, u_strlen(work), &status);
    errorCount=0;
    status=U_ZERO_ERROR;
    u_strcpy(noBreak, CharsToUCharArray("\\u00a0\\u2007\\u2011\\ufeff"));
@ -1035,9 +1036,8 @@ AllocateTextBoundary();
        for (j = 0; j < u_strlen(noBreak); j++) {
            work[1] = noBreak[j];
            for (k = 0; k < u_strlen(s); k++) {
-                work[2] = s[k];
-                
-                e = ubrk_open(UBRK_LINE, "en_US", work, u_strlen(work), &status);
+                work[2] = s[k];                
+                ubrk_setText(e, work, u_strlen(work), &status);
                if(U_FAILURE(status)){
                log_err("FAIL: Error in opening the word break Iterator in testLineInvaiants:\n %s\n", myErrorName(status));
                return;
@ -1530,7 +1530,8 @@ void doBreakInvariantTest(UBreakIteratorType type, UChar* testChars)
      
    u_strcpy(breaks, CharsToUCharArray("\r\n\\u2029\\u2028"));
    
-
+    tb = ubrk_open(type, "en_US", work, u_strlen(work), &status);
+    
    for (i = 0; i < u_strlen(breaks); i++) {
        work[1] = breaks[i];
        for (j = 0; j < u_strlen(testChars); j++) {
@ -1545,7 +1546,7 @@ void doBreakInvariantTest(UBreakIteratorType type, UChar* testChars)
                    continue;

                work[2] = testChars[k];
-                tb=ubrk_open(type, "en_US", work, u_strlen(work), &status);
+                ubrk_setText(tb, work, u_strlen(work), &status);
                if(U_FAILURE(status)){
                    log_err("ERROR in opening the breakIterator in doVariant Function: %s\n", myErrorName(status));
                }
@ -1582,12 +1583,14 @@ void doOtherInvariantTest(UBreakIteratorType type , UChar* testChars)
    
    log_verbose("doOtherInvariantTest text of length: %d\n", u_strlen(testChars));
    
+    tb = ubrk_open(type, "en_us", work, u_strlen(work), &status);
+    
    /* a break should never occur between CR and LF */
    for (i = 0; i < u_strlen(testChars); i++) {
        work[0] = testChars[i];
        for (j = 0; j < u_strlen(testChars); j++) {
            work[3] = testChars[j];
-            tb=ubrk_open(type, "en_US", work, u_strlen(work), &status);
+            ubrk_setText(tb, work, u_strlen(work), &status);
                if(U_FAILURE(status)){
                    log_err("ERROR in opening the breakIterator in doVariant Function: %s\n", myErrorName(status));
                    }
@ -1601,7 +1604,7 @@ void doOtherInvariantTest(UBreakIteratorType type , UChar* testChars)
                }
        }
    }
-    ubrk_close(tb);
+
    /* a break should never occur before a non-spacing mark, unless the preceding
       character is CR, LF, PS, or LS */
    u_uastrcpy(work,"aaaa");
@ -1616,7 +1619,7 @@ void doOtherInvariantTest(UBreakIteratorType type , UChar* testChars)
                (u_charType(c) != U_ENCLOSING_MARK))
                continue;
            work[2] = c;
-            tb=ubrk_open(type, "en_US", work, u_strlen(work), &status);
+            ubrk_setText(tb, work, u_strlen(work), &status);
                if(U_FAILURE(status)){
                    log_err("ERROR in opening the breakIterator in doOtherVariant Function %s\n", myErrorName(status));
                    }
@ -1630,6 +1633,7 @@ void doOtherInvariantTest(UBreakIteratorType type , UChar* testChars)
                }
        }
    }
+    ubrk_close(tb);
 }

 void sample(UBreakIterator* tb, UChar* text)
--- a/icu4c/source/test/intltest/ittxtbd.cpp
+++ b/icu4c/source/test/intltest/ittxtbd.cpp
--- a/icu4c/source/test/intltest/ittxtbd.h
+++ b/icu4c/source/test/intltest/ittxtbd.h
@ -32,121 +32,59 @@ public:
    ~IntlTestTextBoundary();
    
    void runIndexedTest( int32_t index, bool_t exec, char* &name, char* par = NULL );
-
    /**
-     * Test sentence break using doForwardSelectionTest
+     * Test sentence break using generalIteratorTest()
     **/
-    void TestForwardSentenceSelection(void);
+	void TestSentenceIteration(void);
    /**
-     * Test sentence break using doFirstSelectionTest
+     * Test word break using generalIteratorTest()
     **/
-    void TestFirstSentenceSelection(void);
+	void TestWordIteration(void);
    /**
-     * Test sentence break using doLastSelectionTest
+     * Test line break using generalIteratorTest()
+     **/ 
+	void TestLineIteration(void);
+    /**
+     * Test character break using generalIteratorTest()
     **/
-    void TestLastSentenceSelection(void);
+	void TestCharacterIteration(void);
    /**
-     * Test sentence break using doBackwardSelectionTest
+     * Test sentence break invariants
     **/
-    void TestBackwardSentenceSelection(void);
-    /**
-     * Test sentence break using doForwardIndexSelectionTest
+    void TestSentenceInvariants(void);
+     /**
+     * Test word break Invariants using generalIteratorTest()
+     **/ 
+    void TestWordInvariants(void);
+     /**
+     * Test line break Invariants using generalIteratorTest()
     **/
-    void TestForwardSentenceIndexSelection(void);
-    /**
-     * Test sentence break using doBackwardIndexSelectionTest
+    void TestLineInvariants(void);
+     /**
+     * Test character break Invariants using generalIteratorTest()
     **/
-    void TestBackwardSentenceIndexSelection(void);
-    /**
-     * Test sentence break using doMultipleSelectionTest
+    void TestCharacterInvariants(void);
+     /**
+     * Test Japanese line break Invariants using generalIteratorTest()
     **/
-    void TestSentenceMultipleSelection(void);
-    /**
-     * Test word break using doForwardSelectionTest
+	void TestJapaneseLineBreak(void);
+     /**
+     * Test Thai line break using generalIteratorTest()
     **/
-    void TestForwardWordSelection(void);
-    /**
-     * Test word break using doFirstSelectionTest
+	void TestThaiLineBreak(void);
+     /**
+     * Test mixed Thai (Thai with other languages like English) line break using generalIteratorTest()
     **/
-    void TestFirstWordSelection(void);
+	void TestMixedThaiLineBreak(void);
    /**
-     * Test word break using doLastSelectionTest
+     * Test Thai Line break with Maiyamok using generalIteratorTest()
+	 * The Thai maiyamok character is a shorthand symbol that means "repeat the previous
+     * word".  Instead of appearing as a word unto itself, however, it's kept together
+     * with the word before it
     **/
-    void TestLastWordSelection(void);
+	void TestMaiyamok(void);
    /**
-     * Test word break using doBackwardSelectionTest
-     **/
-    void TestBackwardWordSelection(void);
-    /**
-     * Test word break using doForwardIndexSelectionTest
-     **/
-    void TestForwardWordIndexSelection(void);
-    /**
-     * Test word break using doBackwardIndexSelectionTest
-     **/
-    void TestBackwardWordIndexSelection(void);
-    /**
-     * Test word break using doMultipleSelectionTest
-     **/
-    void TestWordMultipleSelection(void);
-    /**
-     * Test line break using doLastSelectionTest
-     **/
-    void TestForwardLineSelection(void);
-    /**
-     * Test line break using doFirstSelectionTest
-     **/
-    void TestFirstLineSelection(void);
-    /**
-     * Test line break using doLastSelectionTest
-     **/
-    void TestLastLineSelection(void);
-    /**
-     * Test line break using doBackwardSelectionTest
-     **/
-    void TestBackwardLineSelection(void);
-    /**
-     * Test line break using doForwardIndexSelectionTest
-     **/
-    void TestForwardLineIndexSelection(void);
-    /**
-     * Test line break using doBackwardIndexSelectionTest
-     **/
-    void TestBackwardLineIndexSelection(void);
-    /**
-     * Test line break using doMultipleSelectionTest
-     **/
-    void TestLineMultipleSelection(void);
-    /**
-     * Test word break using doForwardIndexSelectionTest
-     **/
-    void TestForwardCharacterSelection(void);
-    /**
-     * Test character break using doFirstSelectionTest
-     **/
-    void TestFirstCharacterSelection(void);
-    /**
-     * Test character break using doLastSelectionTest
-     **/
-    void TestLastCharacterSelection(void);
-    /**
-     * Test character break using doBackwardSelectionTest
-     **/
-    void TestBackwardCharacterSelection(void);
-    /**
-     * Test character break using doForwardIndexSelectionTest
-     **/
-    void TestForwardCharacterIndexSelection(void);
-    /**
-     * Test character break using doBackwardIndexSelectionTest
-     **/
-    void TestBackwardCharacterIndexSelection(void);
-    /**
-     * Test character break using doMultipleSelectionTest
-     **/
-    void TestCharacterMultipleSelection(void);
-    /**
-     * test behaviour of BrakIteraor on an empty string
+     * test behaviour of BreakIterator on an empty string
     **/
    void TestEmptyString(void);
    /**
@ -162,20 +100,14 @@ public:
     **/
    void TestPreceding(void);

-    void TestJapaneseLineBreak(void);
-
    void TestBug4153072(void);
-
-    void TestEndBehavior(void);
-
-    void TestSentenceInvariants(void);
-
-    void TestWordInvariants(void);
+    /**
+	 * Test End Behaviour
+	 * @bug 4068137
+	 **/
+	void TestEndBehaviour(void);
    
-    void TestLineInvariants(void);
-
-    void TestCharacterInvariants(void);
-
+/***********************/
 private:
    /**
     * internal methods to prepare test data
@ -184,62 +116,68 @@ private:
    void addTestSentenceData(void);
    void addTestLineData(void);
    void addTestCharacterData(void);
-
    UnicodeString createTestData(Enumeration* e);
-
-
+	
    /**
-     * Perform tests of BreakIterator forward functionality 
-     * on different kinds of iterators (word, sentence, line and character)
+     * Perform tests of BreakIterator forward and backward functionality 
+     * on different kinds of iterators (word, sentence, line and character).
+	 * It tests the methods first(), next(), current(), preceding(), following(),
+	 * previous() and isBoundary().
+	 * It makes use of internal functions to achieve this.
     **/
-    void doForwardSelectionTest(BreakIterator& iterator, UnicodeString& testText, Vector* result);
+    void generalIteratorTest(BreakIterator& bi, Vector* expectedResult);
    /**
-     * Perform tests of BreakIterator backward functionality 
-     * on different kinds of iterators (word, sentence, line and character)
-     **/
-    void doBackwardSelectionTest(BreakIterator& iterator, UnicodeString& testText, Vector* result);
+	 * Internal method to perform iteration and test the first() and next() functions
+	 **/
+	Vector* testFirstAndNext(BreakIterator& bi, UnicodeString& text);
    /**
-     * Perform tests of BreakIterator first selection functionality 
-     * on different kinds of iterators (word, sentence, line and character)
-     **/
-    void doFirstSelectionTest(BreakIterator& iterator, UnicodeString& testText, Vector* result);
+	 * Internal method to perform iteration and test the last() and previous() functions
+	 **/
+    Vector* testLastAndPrevious(BreakIterator& bi, UnicodeString& text);
+	/**
+	 * Internal method to perform iteration and test the following() function
+	 **/
+	void testFollowing(BreakIterator& bi, UnicodeString& text, int32_t *boundaries);
    /**
-     * Perform tests of BreakIterator last selection functionality 
-     * on different kinds of iterators (word, sentence, line and character)
-     **/
-    void doLastSelectionTest(BreakIterator& iterator, UnicodeString& testText, Vector* result);
+	 * Internal method to perform iteration and test the preceding() function
+	 **/
+	void testPreceding(BreakIterator& bi, UnicodeString& text, int32_t *boundaries);
+	/**
+	 * Internal method to perform iteration and test the isBoundary() function
+	 **/
+	void testIsBoundary(BreakIterator& bi, UnicodeString& text, int32_t *boundaries);
+    /** 
+	 * Internal method which compares the expected results with the actual results.
+	 **/
+	void compareFragmentLists(UnicodeString& f1Name, UnicodeString& f2Name, Vector* f1, Vector* f2);
    /**
-     * Perform tests of BreakIterator forward index functionality 
-     * on different kinds of iterators (word, sentence, line and character)
-     **/
-    void doForwardIndexSelectionTest(BreakIterator& iterator, UnicodeString& testText, Vector* result);
-    /**
-     * Perform tests of BreakIterator backward index functionality 
-     * on different kinds of iterators (word, sentence, line and character)
-     **/
-    void doBackwardIndexSelectionTest(BreakIterator& iterator, UnicodeString& testText, Vector* result);
-    /**
-     * Perform tests of BreakIterator multiple selection functionality 
+     * Internal method to perform tests of BreakIterator multiple selection functionality 
     * on different kinds of iterators (word, sentence, line and character)
     **/
    void doMultipleSelectionTest(BreakIterator& iterator, UnicodeString& testText);
    /**
-     * Perform tests with short sample code
+     * Internal method to perform tests of BreakIterator break invariants
+     * on different kinds of iterators (word, sentence, line and character)
     **/
-    void sample(BreakIterator& tb, UnicodeString& text, UnicodeString& title);
-
    void doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars);
-
+    /**
+     * Internal method to perform tests of BreakIterator other invariants 
+     * on different kinds of iterators (word, sentence, line and character)
+     **/
    void doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars);
-
+    /**
+     * Perform tests with short sample code
+     **/ 
+	void sample(BreakIterator& tb, UnicodeString& text, UnicodeString& title);
+	/**
+	 * The vectors holding test data for testing
+	 * different kinds of iterators (word, sentence, line and character)
+	 **/
    Vector* lineSelectionData;
-    UnicodeString testLineText;
    Vector* sentenceSelectionData;
-    UnicodeString testSentenceText;
    Vector* wordSelectionData;
-    UnicodeString testWordText;
    Vector* characterSelectionData;
-    UnicodeString testCharacterText;
+    
    static const UChar cannedTestArray[];
    static UnicodeString *cannedTestChars;
 };
--- a/icu4c/source/tools/makedata.mak
+++ b/icu4c/source/tools/makedata.mak
@ -11,7 +11,7 @@
 !IF "$(CFG)" == ""
 CFG=Debug
 !MESSAGE No configuration specified. Defaulting to common - Win32 Debug.
-!ENDIF 
+!ENDIF

 !IF [cl.exe]
 !MESSAGE Could not find build tools!
@ -24,7 +24,7 @@ CFG=Debug
 #Let's see if user has given us a path to ICU
 #This could be found according to the path to makefile, but for now it is this way
 !MESSAGE ICUP=$(ICUP)
-!IF "$(ICUP)"=="" 
+!IF "$(ICUP)"==""
 !ERROR Can't find path!
 !ELSE
 ICUDATA=$(ICUP)\icu\data
@ -38,47 +38,47 @@ LINK32 = link.exe
 LINK32_FLAGS = /out:"$(ICUDATA)/icudata.dll" /DLL /NOENTRY /base:"0x4ad00000" /comment:" Copyright (C) 1999 International Business Machines Corporation and others.  All Rights Reserved. "
 CPP_FLAGS = /I$(ICUP)\icu\include /GD /c

-#Here we test if configuration is given 
+#Here we test if configuration is given
 !IF "$(CFG)" != "Release" && "$(CFG)" != "release" && "$(CFG)" != "Debug" && "$(CFG)" != "debug"
 !MESSAGE Invalid configuration "$(CFG)" specified.
 !MESSAGE You can specify a configuration when running NMAKE
 !MESSAGE by defining the macro CFG on the command line. For example:
-!MESSAGE 
+!MESSAGE
 !MESSAGE NMAKE /f "makedata.mak" CFG="Debug"
-!MESSAGE 
+!MESSAGE
 !MESSAGE Possible choices for configuration are:
-!MESSAGE 
+!MESSAGE
 !MESSAGE "Release"
 !MESSAGE "Debug"
-!MESSAGE 
+!MESSAGE
 !ERROR An invalid configuration is specified.
-!ENDIF 
+!ENDIF

 # This appears in original Microsofts makefiles
 !IF "$(OS)" == "Windows_NT"
 NULL=
-!ELSE 
+!ELSE
 NULL=nul
-!ENDIF 
+!ENDIF

 PATH = $(PATH);$(ICUP)\icu\bin\$(CFG)

 # Suffixes for data files
 .SUFFIXES : .ucm .cnv .dll .dat .col .res .txt .c

-# We're including a list of ucm files. There are two lists, one is essential 'ucmfiles.mk' and 
+# We're including a list of ucm files. There are two lists, one is essential 'ucmfiles.mk' and
 # the other is optional 'ucmlocal.mk'
 !IF EXISTS("$(ICUTOOLS)\makeconv\ucmfiles.mk")
 !INCLUDE "$(ICUTOOLS)\makeconv\ucmfiles.mk"
 !IF EXISTS("$(ICUTOOLS)\makeconv\ucmlocal.mk")
 !INCLUDE "$(ICUTOOLS)\makeconv\ucmlocal.mk"
 $(UCM_SOURCE)=$(UCM_SOURCE) $(UCM_SOURCE_LOCAL)
-!ELSE 
+!ELSE
 #!MESSAGE Warning: cannot find "ucmlocal.mk"
 !ENDIF
-!ELSE 
+!ELSE
 !ERROR ERROR: cannot find "ucmfiles.mk"
-!ENDIF 
+!ENDIF

 # According to the read files, we will generate CNV and C files
 CNV_FILES=$(UCM_SOURCE:.ucm=.cnv)
@ -91,12 +91,12 @@ OBJ_CNV_FILES = $(C_CNV_FILES:.c=.obj)
 !IF EXISTS("$(ICUTOOLS)\genrb\genrblocal.mk")
 !INCLUDE "$(ICUTOOLS)\genrb\genrblocal.mk"
 GENRB_SOURCE=$(GENRB_SOURCE) $(GENRB_SOURCE_LOCAL)
-!ELSE 
+!ELSE
 #!MESSAGE Warning: cannot find "genrblocal.mk"
 !ENDIF
-!ELSE 
+!ELSE
 !ERROR ERROR: cannot find "genrbfiles.mk"
-!ENDIF 
+!ENDIF
 RB_FILES = $(GENRB_SOURCE:.txt=.res)

 # Read list of resource bundle files for colation
@ -105,48 +105,81 @@ RB_FILES = $(GENRB_SOURCE:.txt=.res)
 !IF EXISTS("$(ICUTOOLS)\gencol\gencollocal.mk")
 !INCLUDE "$(ICUTOOLS)\gencol\gencollocal.mk"
 GENCOL_SOURCE=$(GENCOL_SOURCE) $(GENCOL_SOURCE_LOCAL)
-!ELSE 
+!ELSE
 #!MESSAGE Warning: cannot find "gencollocal.mk"
 !ENDIF
-!ELSE 
+!ELSE
 !ERROR ERROR: cannot find "gencolfiles.mk"
-!ENDIF 
+!ENDIF
 COL_FILES = $(GENCOL_SOURCE:.txt=.col)

+
 # This target should build all the data files
 ALL : GODATA $(RB_FILES) $(CNV_FILES) $(COL_FILES) icudata.dll icudata.dat GOBACK
 	@echo All targets are up to date
-	
-CPP_SOURCES = $(C_CNV_FILES) unames_dat.c cnvalias_dat.c tz_dat.c 
+
+BRK_FILES = sent.brk char.brk line.brk word.brk line_th.brk word_th.brk
+BRK_CSOURCES = $(BRK_FILES:.brk=_brk.c)
+
+CPP_SOURCES = $(C_CNV_FILES) unames_dat.c cnvalias_dat.c tz_dat.c $(BRK_CSOURCES)
 LINK32_OBJS = $(CPP_SOURCES:.c=.obj)

 # target for DLL
 icudata.dll : $(LINK32_OBJS) $(CNV_FILES)
-	@echo Creating DLL file 
+	@echo Creating DLL file
 	@cd $(ICUDATA)
 	@$(LINK32) @<<
 $(LINK32_FLAGS) $(LINK32_OBJS)
 <<

+$(ICUDATA)\sent.brk : $(ICUDATA)\sentLE.brk
+    copy $(ICUDATA)\sentLE.brk $(ICUDATA)\sent.brk
+
+$(ICUDATA)\char.brk : $(ICUDATA)\charLE.brk
+    copy $(ICUDATA)\charLE.brk $(ICUDATA)\char.brk
+
+$(ICUDATA)\line.brk : $(ICUDATA)\lineLE.brk
+    copy $(ICUDATA)\lineLE.brk $(ICUDATA)\line.brk
+
+$(ICUDATA)\word.brk : $(ICUDATA)\wordLE.brk
+    copy $(ICUDATA)\wordLE.brk $(ICUDATA)\word.brk
+
+$(ICUDATA)\line_th.brk : $(ICUDATA)\line_thLE.brk
+    copy $(ICUDATA)\line_thLE.brk $(ICUDATA)\line_th.brk
+
+$(ICUDATA)\word_th.brk : $(ICUDATA)\word_thLE.brk
+    copy $(ICUDATA)\word_thLE.brk $(ICUDATA)\word_th.brk
+
 # target for memory mapped file
-icudata.dat : $(CNV_FILES) unames.dat cnvalias.dat tz.dat 
+icudata.dat : $(CNV_FILES) unames.dat cnvalias.dat tz.dat
 	@echo Creating memory-mapped file
 	@cd $(ICUDATA)
 	@$(ICUTOOLS)\gencmn\$(CFG)\gencmn 1000000 <<
 $(ICUDATA)\unames.dat
 $(ICUDATA)\cnvalias.dat
 $(ICUDATA)\tz.dat
+$(ICUDATA)\sent.brk
+$(ICUDATA)\char.brk
+$(ICUDATA)\line.brk
+$(ICUDATA)\word.brk
+$(ICUDATA)\line_th.brk
+$(ICUDATA)\word_th.brk
 $(CNV_FILES:.cnv =.cnv
 )
 <<

-# nothing works without this target, but we're making 
+# nothing works without this target, but we're making
 # these files while creating converters
 $(C_CNV_FILES) : $(CNV_FILES)
 	@$(ICUTOOLS)\genccode\$(CFG)\genccode $(CNV_FILES)

+# Generate the C source files for the break-iterator data files with genccode.
+# NOTE(review): BRK_FILES lacks the $(ICUDATA) prefix used by the copy-rule targets above - confirm they match.
+$(BRK_CSOURCES) : $(BRK_FILES)
+	@$(ICUTOOLS)\genccode\$(CFG)\genccode $(BRK_FILES)
+
 # utility to send us to the right dir
-GODATA : 
+GODATA :
 	@cd $(ICUDATA)

 # utility to get us back to the right dir
@ -164,8 +197,15 @@ CLEAN :
 	-@erase "cnvalias*.*"
 	-@erase "tz*.*"
 	-@erase "ibm*_cnv.c"
+	-@erase "*_brk.c"
 	-@erase "icudata.*"
 	-@erase "*.obj"
+	-@erase "sent.brk"
+	-@erase "char.brk"
+	-@erase "line.brk"
+	-@erase "word.brk"
+	-@erase "line_th.brk"
+	-@erase "word_th.brk"
 	@cd $(TEST)
 	-@erase "*.res"
 	@cd $(ICUTOOLS)
@ -184,7 +224,7 @@ CLEAN :
 	@$(ICUTOOLS)\makeconv\$(CFG)\makeconv $<
 #	@$(ICUTOOLS)\genccode\$(CFG)\genccode $(CNV_FILES)

-# Inference rule for creating collation files - 
+# Inference rule for creating collation files -
 # this should be integrated in genrb
 .txt.col::
 	@echo Making Collation files
@ -203,7 +243,7 @@ unames.dat : UnicodeData-3.0.0.txt
 	@echo Creating data file for Unicode Names
 	@$(ICUTOOLS)\gennames\$(CFG)\gennames -v- -c- UnicodeData-3.0.0.txt

-unames_dat.c : unames.dat 
+unames_dat.c : unames.dat
 	@echo Creating C source file for Unicode Names
 	@$(ICUTOOLS)\genccode\$(CFG)\genccode $(ICUDATA)\$?

@ -211,8 +251,8 @@ unames_dat.c : unames.dat
 cnvalias.dat : convrtrs.txt
 	@echo Creating data file for Converter Aliases
 	@$(ICUTOOLS)\gencnval\$(CFG)\gencnval -c-
-	
-cnvalias_dat.c : cnvalias.dat 
+
+cnvalias_dat.c : cnvalias.dat
 	@echo Creating C source file for Converter Aliases
 	@$(ICUTOOLS)\genccode\$(CFG)\genccode $(ICUDATA)\$?