ICU-45 Initial check in of rbbi files. Do not compile yet, but handing code over to R. Gillam.

X-SVN-Rev: 131
1999-10-27 16:34:57 +00:00 · 1999-10-27 16:34:57 +00:00 · cc8631bf1e
commit cc8631bf1e
parent 72ad425a57
4 changed files with 2712 additions and 0 deletions
--- a/icu4c/source/i18n/rbbi.cpp
+++ b/icu4c/source/i18n/rbbi.cpp
@ -0,0 +1,393 @@
+/*
+**********************************************************************
+*   Copyright (C) 1999 Alan Liu and others. All rights reserved.
+**********************************************************************
+*   Date        Name        Description
+*   10/22/99    alan        Creation.
+**********************************************************************
+*/
+
+#include "rbbi.h"
+#include "rbbi_bld.h"
+
+/**
+ * A token used as a character-category value to identify ignore characters.
+ * handleNext() and handlePrevious() skip the state-table lookup for any
+ * character whose category equals this value.
+ */
+int8_t RuleBasedBreakIterator::IGNORE = -1;
+
+/**
+ * The state number of the starting state.  Both handleNext() and
+ * handlePrevious() initialize their state machine to this value.
+ */
+int16_t RuleBasedBreakIterator::START_STATE = 1;
+
+/**
+ * The state-transition value indicating "stop".  The scanning loops in
+ * handleNext() and handlePrevious() end once the state equals this value.
+ */
+int16_t RuleBasedBreakIterator::STOP_STATE = 0;
+
+//=======================================================================
+// constructors
+//=======================================================================
+
+/**
+ * Constructs a RuleBasedBreakIterator according to the description
+ * provided.  Normally, instead of constructing a RuleBasedBreakIterator
+ * directly, you'll use the factory methods on BreakIterator to create
+ * one indirectly from a description in the framework's resource files.
+ * You'd use this when you want special behavior not provided by the
+ * built-in iterators.
+ *
+ * NOTE(review): the documentation inherited from the Java version says a
+ * malformed description raises IllegalArgumentException; how Builder
+ * reports errors in this C++ port is not visible here -- confirm.
+ *
+ * @param description The textual rules describing the iterator's behavior.
+ */
+RuleBasedBreakIterator::RuleBasedBreakIterator(const UnicodeString& description) {
+    // "this" is a pointer in C++, so members are reached with "->"
+    // (the Java-style "this.description" does not compile).  The explicit
+    // qualification is needed because the parameter shadows the member.
+    this->description = description;
+
+    // the actual work is done by the Builder class
+    Builder builder;
+    builder.buildBreakIterator(*this, description);
+}
+
+//=======================================================================
+// boilerplate
+//=======================================================================
+/**
+ * Creates a copy of this iterator.
+ * @return A heap-allocated RuleBasedBreakIterator produced from this object
+ * by the copy constructor.
+ */
+RuleBasedBreakIterator* RuleBasedBreakIterator::clone() const {
+    RuleBasedBreakIterator* copy = new RuleBasedBreakIterator(*this);
+    return copy;
+}
+
+/**
+ * Returns true if both iterators have the same rules and iterate over the
+ * same text.
+ * @param that The iterator to compare against.
+ * @return True if the descriptions and the text iterators compare equal.
+ */
+bool_t RuleBasedBreakIterator::operator==(const RuleBasedBreakIterator& that) {
+    // "that" already has the right type, so no cast is needed; the Java-style
+    // by-value casts would have copy-constructed a temporary iterator for
+    // each comparison.  Equality is expressed with operator== rather than
+    // the Java equals() method.
+    return description == that.description
+        && text == that.text;
+}
+
+/**
+ * Gives back the rule description this iterator was built from.
+ * @return A copy of the description string.
+ */
+UnicodeString RuleBasedBreakIterator::toString() {
+    UnicodeString result = description;
+    return result;
+}
+
+/**
+ * Computes a hash code for this BreakIterator.
+ * @return The hash code of the description string.
+ */
+int32_t RuleBasedBreakIterator::hashCode() {
+    int32_t hash = description.hashCode();
+    return hash;
+}
+
+//=======================================================================
+// BreakIterator overrides
+//=======================================================================
+/**
+ * Sets the current iteration position to the beginning of the text.
+ * (i.e., the CharacterIterator's starting offset).
+ * NOTE(review): getText() is declared to return a CharacterIterator by
+ * value, so "t" below looks like a copy and repositioning it may not move
+ * the iterator this object holds -- confirm the intended return type.
+ * @return The offset of the beginning of the text.
+ */
+int32_t RuleBasedBreakIterator::first() {
+    CharacterIterator t = getText();
+
+    t.first();
+    return t.getIndex();
+}
+
+/**
+ * Sets the current iteration position to the end of the text.
+ * (i.e., the CharacterIterator's ending offset).
+ * NOTE(review): getText() is declared to return a CharacterIterator by
+ * value, so "t" below looks like a copy and repositioning it may not move
+ * the iterator this object holds -- confirm the intended return type.
+ * @return The text's past-the-end offset.
+ */
+int32_t RuleBasedBreakIterator::last() {
+    CharacterIterator t = getText();
+
+    // t.last() is reported to land on the last character rather than on the
+    // past-the-end offset, so the end index is set explicitly instead
+    t.setIndex(t.getEndIndex());
+    return t.getIndex();
+}
+
+/**
+ * Moves the iterator the given number of boundaries forward or backward.
+ * A positive count steps forward with handleNext(); a negative count steps
+ * backward with previous(); a count of zero leaves the position untouched.
+ * @param n The number of steps to move.  The sign indicates the direction
+ * (negative is backwards, and positive is forwards).
+ * @return The result of the last step taken, or the current position when
+ * n is zero.
+ */
+int32_t RuleBasedBreakIterator::next(int32_t n) {
+    int32_t boundary = current();
+
+    // positive counts walk forward one boundary at a time
+    for (; n > 0; --n) {
+        boundary = handleNext();
+    }
+
+    // negative counts walk backward one boundary at a time
+    for (; n < 0; ++n) {
+        boundary = previous();
+    }
+    return boundary;
+}
+
+/**
+ * Moves the iterator forward to the next boundary position by delegating
+ * to handleNext().
+ * @return Whatever handleNext() reports: the next boundary position.
+ */
+int32_t RuleBasedBreakIterator::next() {
+    int32_t boundary = handleNext();
+    return boundary;
+}
+
+/**
+ * Advances the iterator backwards, to the last boundary preceding this one.
+ * @return The position of the last boundary position preceding this one, or
+ * BreakIterator::DONE if the iterator is already at the beginning of the text.
+ */
+int32_t RuleBasedBreakIterator::previous() {
+    // if we're already sitting at the beginning of the text, return DONE
+    // (DONE is a static member, so C++ needs "::" rather than Java's ".")
+    CharacterIterator text = getText();
+    if (current() == text.getBeginIndex())
+        return BreakIterator::DONE;
+
+    // set things up.  handlePrevious() will back us up to some valid
+    // break position before the current position (we back our internal
+    // iterator up one step to prevent handlePrevious() from returning
+    // the current position), but not necessarily the last one before
+    // where we started
+    int32_t start = current();
+    text.previous();
+    int32_t lastResult = handlePrevious();
+    int32_t result = lastResult;
+
+    // iterate forward from the known break position until we pass our
+    // starting point.  The last break position before the starting
+    // point is our return value
+    while (result != BreakIterator::DONE && result < start) {
+        lastResult = result;
+        result = handleNext();
+    }
+
+    // set the current iteration position to be the last break position
+    // before where we started, and then return that value
+    text.setIndex(lastResult);
+    return lastResult;
+}
+
+/**
+ * Sets the iterator to refer to the first boundary position following
+ * the specified position.
+ * @param offset The position from which to begin searching for a break
+ * position.
+ * @return The position of the first break after the specified position, or
+ * BreakIterator::DONE if the offset is at or past the end of the text.
+ */
+int32_t RuleBasedBreakIterator::following(int32_t offset) {
+    // if the offset passed in is at or past the end of the text, just
+    // return DONE.  (">=" rather than "==" so that an offset beyond the end
+    // is rejected here instead of being handed to setIndex() below.)
+    CharacterIterator text = getText();
+    if (offset >= text.getEndIndex())
+        return BreakIterator::DONE;
+
+    // otherwise, set our internal iteration position (temporarily)
+    // to the position passed in.  If this is the _beginning_ position,
+    // then we can just use next() to get our return value
+    text.setIndex(offset);
+    if (offset == text.getBeginIndex())
+        return handleNext();
+
+    // otherwise, we have to sync up first.  Use handlePrevious() to back
+    // us up to a known break position before the specified position (if
+    // we can determine that the specified position is a break position,
+    // we don't back up at all).  This may or may not be the last break
+    // position at or before our starting position.  Advance forward
+    // from here until we've passed the starting position.  The position
+    // we stop on will be the first break position after the specified one.
+    int32_t result = handlePrevious();
+    while (result != BreakIterator::DONE && result <= offset)
+        result = handleNext();
+    return result;
+}
+
+/**
+ * Sets the iterator to refer to the last boundary position before the
+ * specified position.
+ * NOTE(review): getText() is declared to return a CharacterIterator by
+ * value, so the setIndex() call below may act on a copy rather than on the
+ * iterator previous() reads -- confirm the intended return type.
+ * @param offset The position to begin searching for a break from.
+ * @return The position of the last boundary before the starting position.
+ */
+int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
+    // if we start by updating the current iteration position to the
+    // position specified by the caller, we can just use previous()
+    // to carry out this operation
+    CharacterIterator text = getText();
+    text.setIndex(offset);
+    return previous();
+}
+
+/**
+ * Returns true if the specified position is a boundary position.  When the
+ * answer has to be computed with following(), the iterator is left wherever
+ * following() puts it.
+ * @param offset the offset to check.
+ * @return True if "offset" is a boundary position; false if it is not, or
+ * if it lies outside the text.
+ */
+bool_t RuleBasedBreakIterator::isBoundary(int32_t offset) {
+    int32_t begin = getText().getBeginIndex();
+    int32_t end = getText().getEndIndex();
+
+    // positions outside the text can never be boundaries, and must not be
+    // passed on to following(), which would use them as an iterator index
+    if (offset < begin || offset > end)
+        return FALSE;
+
+    // the beginning of the text is always a boundary position.  (This is
+    // the text's begin index, which is not necessarily 0.)
+    if (offset == begin)
+        return TRUE;
+
+    // otherwise, we can use following() on the position before the specified
+    // one and return true if the position we get back is the one the user
+    // specified
+    return following(offset - 1) == offset;
+}
+
+/**
+ * Reports where the iterator currently is.
+ * @return The index of the character iterator obtained from getText().
+ */
+int32_t RuleBasedBreakIterator::current() {
+    int32_t position = getText().getIndex();
+    return position;
+}
+
+/**
+ * Return a CharacterIterator over the text being analyzed.  This version
+ * of this method returns the actual CharacterIterator we're using internally.
+ * Changing the state of this iterator can have undefined consequences.  If
+ * you need to change it, clone it first.
+ * NOTE(review): the body treats "text" as a pointer (it is compared with 0
+ * and assigned the result of new), while rbbi.h declares both the member
+ * and this return type as a CharacterIterator by value -- the intended
+ * type, and who owns the object allocated here, need to be confirmed.
+ * @return An iterator over the text being analyzed.
+ */
+CharacterIterator RuleBasedBreakIterator::getText() {
+    // The iterator is initialized pointing to no text at all, so if this
+    // function is called while we're in that state, we have to fudge
+    // an iterator (over an empty string) to return.
+    if (text == 0)
+        text = new StringCharacterIterator("");
+    return text;
+}
+
+/**
+ * Set the iterator to analyze a new piece of text.  This function resets
+ * the current iteration position to the beginning of the text.
+ * NOTE(review): newText is taken by value and assigned to the "text"
+ * member; whether that preserves the concrete iterator type depends on
+ * how CharacterIterator is declared -- confirm.
+ * @param newText An iterator over the text to analyze.
+ */
+void RuleBasedBreakIterator::setText(CharacterIterator newText) {
+    text = newText;
+    text.first();
+}
+//=======================================================================
+// implementation
+//=======================================================================
+/**
+ * This method is the actual implementation of the next() method.  All iteration
+ * vectors through here.  This method initializes the state machine to state 1
+ * and advances through the text character by character until we reach the end
+ * of the text or the state machine transitions to state 0.  We update our return
+ * value every time the state machine passes through a possible end state.
+ * @return The position of the next boundary, or BreakIterator::DONE if the
+ * iterator is already at the end of the text.
+ */
+int32_t RuleBasedBreakIterator::handleNext() {
+    // if we're already at the end of the text, return DONE.
+    // (DONE is a static member, so C++ needs "::" rather than Java's ".")
+    CharacterIterator text = getText();
+    if (text.getIndex() == text.getEndIndex())
+        return BreakIterator::DONE;
+
+    // no matter what, we always advance at least one character forward
+    int32_t result = text.getIndex() + 1;
+
+    // begin in state 1
+    int32_t state = START_STATE;
+    int32_t category;
+    UChar c = text.current();
+
+    // loop until we reach the end of the text or transition to state 0
+    while (c != CharacterIterator::DONE && state != STOP_STATE) {
+
+        // look up the current character's character category (which tells us
+        // which column in the state table to look at)
+        category = lookupCategory(c);
+
+        // if the character isn't an ignore character, look up a state
+        // transition in the state table
+        if (category != IGNORE) {
+            state = lookupState(state, category);
+        }
+
+        // if the state we've just transitioned to is an accepting state,
+        // update our return value to be the current iteration position
+        if (endStates[state])
+            result = text.getIndex() + 1;
+        c = text.next();
+    }
+    text.setIndex(result);
+    return result;
+}
+
+/**
+ * This method backs the iterator back up to a "safe position" in the text.
+ * This is a position that we know, without any context, must be a break position.
+ * The various calling methods then iterate forward from this safe position to
+ * the appropriate position to return.  (For more information, see the description
+ * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
+ * @return The index the text iterator is left at after backing up.
+ */
+int32_t RuleBasedBreakIterator::handlePrevious() {
+    CharacterIterator text = getText();
+    int32_t state = START_STATE;
+    int32_t category = 0;
+    int32_t lastCategory = 0;
+    UChar c = text.current();
+
+    // loop until we reach the beginning of the text or transition to state 0
+    // (DONE is a static member, so C++ needs "::" rather than Java's ".")
+    while (c != CharacterIterator::DONE && state != STOP_STATE) {
+
+        // save the last character's category and look up the current
+        // character's category
+        lastCategory = category;
+        category = lookupCategory(c);
+
+        // if the current character isn't an ignore character, look up a
+        // state transition in the backwards state table
+        if (category != IGNORE)
+            state = lookupBackwardState(state, category);
+
+        // then advance one character backwards
+        c = text.previous();
+    }
+
+    // if we didn't march off the beginning of the text, we're either one or two
+    // positions away from the real break position.  (One because of the call to
+    // previous() at the end of the loop above, and another because the character
+    // that takes us into the stop state will always be the character BEFORE
+    // the break position.)
+    if (c != CharacterIterator::DONE) {
+        if (lastCategory != IGNORE)
+            text.setIndex(text.getIndex() + 2);
+        else
+            text.next();
+    }
+    return text.getIndex();
+}
+
+/**
+ * Looks up a character's category (i.e., its category for breaking purposes,
+ * not its Unicode category)
+ * @param c The character to classify.
+ * @return The category number stored for c in charCategoryTable.
+ */
+int32_t RuleBasedBreakIterator::lookupCategory(UChar c) {
+    // The member declared in rbbi.h is charCategoryTable; "UCharCategoryTable"
+    // is not a declared name.
+    // NOTE(review): elementAt() is the Java accessor name -- confirm what the
+    // C++ CompactByteArray calls it.
+    return charCategoryTable.elementAt(c);
+}
+
+/**
+ * Finds the forward transition for a (state, category) pair.  The state
+ * table is addressed as a flat array with numCategories entries per state.
+ * @param state The current state number (selects the row).
+ * @param category The character category (selects the column).
+ * @return The entry of stateTable at that row and column.
+ */
+int32_t RuleBasedBreakIterator::lookupState(int32_t state, int32_t category) {
+    int32_t cell = state * numCategories + category;
+    return stateTable[cell];
+}
+
+/**
+ * Finds the backward transition for a (state, category) pair.  The backwards
+ * state table is addressed as a flat array with numCategories entries per
+ * state.
+ * @param state The current state number (selects the row).
+ * @param category The character category (selects the column).
+ * @return The entry of backwardsStateTable at that row and column.
+ */
+int32_t RuleBasedBreakIterator::lookupBackwardState(int32_t state, int32_t category) {
+    int32_t cell = state * numCategories + category;
+    return backwardsStateTable[cell];
+}
--- a/icu4c/source/i18n/rbbi.h
+++ b/icu4c/source/i18n/rbbi.h
@ -0,0 +1,409 @@
+/*
+**********************************************************************
+*   Copyright (C) 1999 Alan Liu and others. All rights reserved.
+**********************************************************************
+*   Date        Name        Description
+*   10/22/99    alan        Creation.
+**********************************************************************
+*/
+
+#ifndef RBBI_H
+#define RBBI_H
+
+/**
+ * <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
+ * 
+ * <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i>
+ * and <i>regular expressions.</i></p>
+ * 
+ * <p>A substitution rule defines a name that can be used in place of an expression. It
+ * consists of a name, which is a string of characters contained in angle brackets, an equals
+ * sign, and an expression. (There can be no whitespace on either side of the equals sign.)
+ * To keep its syntactic meaning intact, the expression must be enclosed in parentheses or
+ * square brackets. A substitution is visible after its definition, and is filled in using
+ * simple textual substitution. Substitution definitions can contain other substitutions, as
+ * long as those substitutions have been defined first. Substitutions are generally used to
+ * make the regular expressions (which can get quite complex) shorter and easier to read.
+ * They typically define either character categories or commonly-used subexpressions.</p>
+ * 
+ * <p>There is one special substitution.&nbsp; If the description defines a substitution
+ * called &quot;&lt;ignore&gt;&quot;, the expression must be a [] expression, and the
+ * expression defines a set of characters (the &quot;<em>ignore characters</em>&quot;) that
+ * will be transparent to the BreakIterator.&nbsp; A sequence of characters will break the
+ * same way it would if any ignore characters it contains are taken out.&nbsp; Break
+ * positions never occur before ignore characters.</p>
+ * 
+ * <p>A regular expression uses a subset of the normal Unix regular-expression syntax, and
+ * defines a sequence of characters to be kept together. With one significant exception, the
+ * iterator uses a longest-possible-match algorithm when matching text to regular
+ * expressions. The iterator also treats descriptions containing multiple regular expressions
+ * as if they were ORed together (i.e., as if they were separated by |).</p>
+ * 
+ * <p>The special characters recognized by the regular-expression parser are as follows:</p>
+ * 
+ * <blockquote>
+ *   <table border="1" width="100%">
+ *     <tr>
+ *       <td width="6%">*</td>
+ *       <td width="94%">Specifies that the expression preceding the asterisk may occur any number
+ *       of times (including not at all).</td>
+ *     </tr>
+ *     <tr>
+ *       <td width="6%">{}</td>
+ *       <td width="94%">Encloses a sequence of characters that is optional.</td>
+ *     </tr>
+ *     <tr>
+ *       <td width="6%">()</td>
+ *       <td width="94%">Encloses a sequence of characters.&nbsp; If followed by *, the sequence
+ *       repeats.&nbsp; Otherwise, the parentheses are just a grouping device and a way to delimit
+ *       the ends of expressions containing |.</td>
+ *     </tr>
+ *     <tr>
+ *       <td width="6%">|</td>
+ *       <td width="94%">Separates two alternative sequences of characters.&nbsp; Either one
+ *       sequence or the other, but not both, matches this expression.&nbsp; The | character can
+ *       only occur inside ().</td>
+ *     </tr>
+ *     <tr>
+ *       <td width="6%">.</td>
+ *       <td width="94%">Matches any character.</td>
+ *     </tr>
+ *     <tr>
+ *       <td width="6%">*?</td>
+ *       <td width="94%">Specifies a non-greedy asterisk.&nbsp; *? works the same way as *, except
+ *       when there is overlap between the last group of characters in the expression preceding the
+ *       * and the first group of characters following the *.&nbsp; When there is this kind of
+ *       overlap, * will match the longest sequence of characters that match the expression before
+ *       the *, and *? will match the shortest sequence of characters matching the expression
+ *       before the *?.&nbsp; For example, if you have &quot;xxyxyyyxyxyxxyxyxyy&quot; in the text,
+ *       &quot;x[xy]*x&quot; will match through to the last x (i.e., &quot;<strong>xxyxyyyxyxyxxyxyx</strong>yy&quot;),
+ *       but &quot;x[xy]*?x&quot; will only match the first two xes (&quot;<strong>xx</strong>yxyyyxyxyxxyxyxyy&quot;).</td>
+ *     </tr>
+ *     <tr>
+ *       <td width="6%">[]</td>
+ *       <td width="94%">Specifies a group of alternative characters.&nbsp; A [] expression will
+ *       match any single character that is specified in the [] expression.&nbsp; For more on the
+ *       syntax of [] expressions, see below.</td>
+ *     </tr>
+ *     <tr>
+ *       <td width="6%">/</td>
+ *       <td width="94%">Specifies where the break position should go if text matches this
+ *       expression.&nbsp; (e.g., &quot;[a-z]&#42;/[:Zs:]*1&quot; will match if the iterator sees a run
+ *       of letters, followed by a run of whitespace, followed by a digit, but the break position
+ *       will actually go before the whitespace).&nbsp; Expressions that don't contain / put the
+ *       break position at the end of the matching text.</td>
+ *     </tr>
+ *     <tr>
+ *       <td width="6%">\</td>
+ *       <td width="94%">Escape character.&nbsp; The \ itself is ignored, but causes the next
+ *       character to be treated as a literal character.&nbsp; This has no effect for many
+ *       characters, but for the characters listed above, this deprives them of their special
+ *       meaning.&nbsp; (There are no special escape sequences for Unicode characters, or tabs and
+ *       newlines; these are all handled by a higher-level protocol.&nbsp; In a Java string,
+ *       &quot;\n&quot; will be converted to a literal newline character by the time the
+ *       regular-expression parser sees it.&nbsp; Of course, this means that \ sequences that are
+ *       visible to the regexp parser must be written as \\ when inside a Java string.)&nbsp; All
+ *       characters in the ASCII range except for letters, digits, and control characters are
+ *       reserved characters to the parser and must be preceded by \ even if they currently don't
+ *       mean anything.</td>
+ *     </tr>
+ *     <tr>
+ *       <td width="6%">!</td>
+ *       <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp
+ *       parser that this expression specifies the backwards-iteration behavior of the iterator,
+ *       and not its normal iteration behavior.&nbsp; This is generally only used in situations
+ *       where the automatically-generated backwards-iteration behavior doesn't produce
+ *       satisfactory results and must be supplemented with extra client-specified rules.</td>
+ *     </tr>
+ *     <tr>
+ *       <td width="6%"><em>(all others)</em></td>
+ *       <td width="94%">All other characters are treated as literal characters, which must match
+ *       the corresponding character(s) in the text exactly.</td>
+ *     </tr>
+ *   </table>
+ * </blockquote>
+ * 
+ * <p>Within a [] expression, a number of other special characters can be used to specify
+ * groups of characters:</p>
+ * 
+ * <blockquote>
+ *   <table border="1" width="100%">
+ *     <tr>
+ *       <td width="6%">-</td>
+ *       <td width="94%">Specifies a range of matching characters.&nbsp; For example
+ *       &quot;[a-p]&quot; matches all lowercase Latin letters from a to p (inclusive).&nbsp; The -
+ *       sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a
+ *       language's alphabetical order: &quot;[a-z]&quot; doesn't include capital letters, nor does
+ *       it include accented letters such as a-umlaut.</td>
+ *     </tr>
+ *     <tr>
+ *       <td width="6%">::</td>
+ *       <td width="94%">A pair of colons containing a one- or two-letter code matches all
+ *       characters in the corresponding Unicode category.&nbsp; The two-letter codes are the same
+ *       as the two-letter codes in the Unicode database (for example, &quot;[:Sc::Sm:]&quot;
+ *       matches all currency symbols and all math symbols).&nbsp; Specifying a one-letter code is
+ *       the same as specifying all two-letter codes that begin with that letter (for example,
+ *       &quot;[:L:]&quot; matches all letters, and is equivalent to
+ *       &quot;[:Lu::Ll::Lo::Lm::Lt:]&quot;).&nbsp; Anything other than a valid two-letter Unicode
+ *       category code or a single letter that begins a Unicode category code is illegal within
+ *       colons.</td>
+ *     </tr>
+ *     <tr>
+ *       <td width="6%">[]</td>
+ *       <td width="94%">[] expressions can nest.&nbsp; This has no effect, except when used in
+ *       conjunction with the ^ token.</td>
+ *     </tr>
+ *     <tr>
+ *       <td width="6%">^</td>
+ *       <td width="94%">Excludes the character (or the characters in the [] expression) following
+ *       it from the group of characters.&nbsp; For example, &quot;[a-z^p]&quot; matches all Latin
+ *       lowercase letters except p.&nbsp; &quot;[:L:^[\u4e00-\u9fff]]&quot; matches all letters
+ *       except the Han ideographs.</td>
+ *     </tr>
+ *     <tr>
+ *       <td width="6%"><em>(all others)</em></td>
+ *       <td width="94%">All other characters are treated as literal characters.&nbsp; (For
+ *       example, &quot;[aeiou]&quot; specifies just the letters a, e, i, o, and u.)</td>
+ *     </tr>
+ *   </table>
+ * </blockquote>
+ * 
+ * <p>For a more complete explanation, see <a
+ * href="http://www.ibm.com/java/education/boundaries/boundaries.html">http://www.ibm.com/java/education/boundaries/boundaries.html</a>.
+ * &nbsp; For examples, see the resource data (which is annotated).</p>
+ *
+ * @author Richard Gillam
+ */
+class RuleBasedBreakIterator {
+    // NOTE(review): the documentation describes this class as a subclass of
+    // BreakIterator, but no base class is declared here -- confirm whether
+    // the inheritance is still to be added.
+
+protected:
+
+    /**
+     * A token used as a character-category value to identify ignore characters
+     */
+    static int8_t IGNORE;
+
+private:
+
+    /**
+     * The state number of the starting state
+     */
+    static int16_t START_STATE;
+
+    /**
+     * The state-transition value indicating "stop"
+     */
+    static int16_t STOP_STATE;
+
+    /**
+     * The textual description this iterator was created from
+     */
+    UnicodeString description;
+
+    /**
+     * A table that indexes from character values to character category numbers
+     */
+    CompactByteArray charCategoryTable;
+
+    /**
+     * The table of state transitions used for forward iteration
+     */
+    int16_t* stateTable;
+
+    /**
+     * The table of state transitions used to sync up the iterator with the
+     * text in backwards and random-access iteration
+     */
+    int16_t* backwardsStateTable;
+
+    /**
+     * A list of flags indicating which states in the state table are accepting
+     * ("end") states
+     */
+    bool_t* endStates;
+
+    /**
+     * The number of character categories (and, thus, the number of columns in
+     * the state tables)
+     */
+    int32_t numCategories;
+
+    /**
+     * The character iterator through which this BreakIterator accesses the text.
+     * NOTE(review): getText() in rbbi.cpp compares this member with 0 and
+     * assigns it the result of new, which suggests a pointer was intended --
+     * confirm the type.
+     */
+    CharacterIterator text;
+
+    //=======================================================================
+    // constructors
+    //=======================================================================
+
+public:
+
+    /**
+     * Constructs a RuleBasedBreakIterator according to the description
+     * provided.  Normally, instead of constructing a RuleBasedBreakIterator
+     * directly, you'll use the factory methods on BreakIterator to create
+     * one indirectly from a description in the framework's resource files.
+     * You'd use this when you want special behavior not provided by the
+     * built-in iterators.
+     * NOTE(review): the Java documentation says a malformed description
+     * raises IllegalArgumentException; the C++ error path is not visible
+     * here -- confirm.
+     * @param description The textual rules describing the iterator's behavior.
+     */
+    RuleBasedBreakIterator(const UnicodeString& description);
+
+    //=======================================================================
+    // boilerplate
+    //=======================================================================
+public:
+
+    /**
+     * Clones this iterator.
+     * @return A newly-constructed RuleBasedBreakIterator with the same
+     * behavior as this one.
+     */
+    virtual RuleBasedBreakIterator* clone() const;
+
+    /**
+     * Returns true if both BreakIterators have the same rules and iterate
+     * over the same text.
+     */
+    virtual bool_t operator==(const RuleBasedBreakIterator& that);
+
+    /**
+     * Returns the description used to create this iterator
+     */
+    virtual UnicodeString toString();
+
+    /**
+     * Compute a hashcode for this BreakIterator
+     * @return A hash code
+     */
+    virtual int32_t hashCode();
+    //=======================================================================
+    // BreakIterator overrides
+    //=======================================================================
+    /**
+     * Sets the current iteration position to the beginning of the text.
+     * (i.e., the CharacterIterator's starting offset).
+     * @return The offset of the beginning of the text.
+     */
+    virtual int32_t first();
+
+    /**
+     * Sets the current iteration position to the end of the text.
+     * (i.e., the CharacterIterator's ending offset).
+     * @return The text's past-the-end offset.
+     */
+    virtual int32_t last();
+
+    /**
+     * Advances the iterator either forward or backward the specified number of steps.
+     * Negative values move backward, and positive values move forward.  This is
+     * equivalent to repeatedly calling next() or previous().
+     * @param n The number of steps to move.  The sign indicates the direction
+     * (negative is backwards, and positive is forwards).
+     * @return The character offset of the boundary position n boundaries away from
+     * the current one.
+     */
+    virtual int32_t next(int32_t n);
+
+    /**
+     * Advances the iterator to the next boundary position.
+     * @return The position of the first boundary after this one.
+     */
+    virtual int32_t next();
+
+    /**
+     * Advances the iterator backwards, to the last boundary preceding this one.
+     * @return The position of the last boundary position preceding this one.
+     */
+    virtual int32_t previous();
+
+    /**
+     * Sets the iterator to refer to the first boundary position following
+     * the specified position.
+     * @param offset The position from which to begin searching for a break position.
+     * @return The position of the first break after the specified position.
+     */
+    virtual int32_t following(int32_t offset);
+
+    /**
+     * Sets the iterator to refer to the last boundary position before the
+     * specified position.
+     * @param offset The position to begin searching for a break from.
+     * @return The position of the last boundary before the starting position.
+     */
+    virtual int32_t preceding(int32_t offset);
+
+    /**
+     * Returns true if the specified position is a boundary position.  As a side
+     * effect, leaves the iterator pointing to the first boundary position at
+     * or after "offset".
+     * @param offset the offset to check.
+     * @return True if "offset" is a boundary position.
+     */
+    virtual bool_t isBoundary(int32_t offset);
+
+    /**
+     * Returns the current iteration position.
+     * @return The current iteration position.
+     */
+    virtual int32_t current();
+
+    /**
+     * Return a CharacterIterator over the text being analyzed.  This version
+     * of this method returns the actual CharacterIterator we're using internally.
+     * Changing the state of this iterator can have undefined consequences.  If
+     * you need to change it, clone it first.
+     * @return An iterator over the text being analyzed.
+     */
+    virtual CharacterIterator getText();
+
+    /**
+     * Set the iterator to analyze a new piece of text.  This function resets
+     * the current iteration position to the beginning of the text.
+     * @param newText An iterator over the text to analyze.
+     */
+    virtual void setText(CharacterIterator newText);
+    //=======================================================================
+    // implementation
+    //=======================================================================
+protected:
+
+    /**
+     * This method is the actual implementation of the next() method.  All iteration
+     * vectors through here.  This method initializes the state machine to state 1
+     * and advances through the text character by character until we reach the end
+     * of the text or the state machine transitions to state 0.  We update our return
+     * value every time the state machine passes through a possible end state.
+     */
+    virtual int32_t handleNext();
+
+    /**
+     * This method backs the iterator back up to a "safe position" in the text.
+     * This is a position that we know, without any context, must be a break position.
+     * The various calling methods then iterate forward from this safe position to
+     * the appropriate position to return.  (For more information, see the description
+     * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
+     */
+    virtual int32_t handlePrevious();
+
+    /**
+     * Looks up a character's category (i.e., its category for breaking purposes,
+     * not its Unicode category)
+     */
+    virtual int32_t lookupCategory(UChar c);
+
+    /**
+     * Given a current state and a character category, looks up the
+     * next state to transition to in the state table.
+     */
+    virtual int32_t lookupState(int32_t state, int32_t category);
+
+    /**
+     * Given a current state and a character category, looks up the
+     * next state to transition to in the backwards state table.
+     */
+    virtual int32_t lookupBackwardState(int32_t state, int32_t category);
+};
+
+#endif
--- a/icu4c/source/i18n/rbbi_bld.cpp
+++ b/icu4c/source/i18n/rbbi_bld.cpp
--- a/icu4c/source/i18n/rbbi_bld.h
+++ b/icu4c/source/i18n/rbbi_bld.h
@ -0,0 +1,283 @@
+/*
+**********************************************************************
+*   Copyright (C) 1999 Alan Liu and others. All rights reserved.
+**********************************************************************
+*   Date        Name        Description
+*   10/22/99    alan        Creation.  This is an internal header; it
+*                           shall not be exported.
+**********************************************************************
+*/
+
+#ifndef RBBI_BLD_H
+#define RBBI_BLD_H
+
+#include "rbbi.h"
+#include "uniset.h"
+#include "uvector.h"
+
+//=======================================================================
+// RuleBasedBreakIteratorBuilder
+//=======================================================================
+/**
+ * The Builder class has the job of constructing a RuleBasedBreakIterator from a
+ * textual description.  A Builder is constructed by RuleBasedBreakIterator's
+ * constructor, which uses it to construct the iterator itself and then throws it
+ * away.
+ * <p>The construction logic is separated out into its own class for two primary
+ * reasons:
+ * <ul><li>The construction logic is quite complicated and large.  Separating it
+ * out into its own class means the code must only be loaded into memory while a
+ * RuleBasedBreakIterator is being constructed, and can be purged after that.
+ * <li>There is a fair amount of state that must be maintained throughout the
+ * construction process that is not needed by the iterator after construction.
+ * Separating this state out into another class prevents all of the functions that
+ * construct the iterator from having to have really long parameter lists,
+ * (hopefully) contributing to readability and maintainability.</ul>
+ * <p>NOTE(review): earlier wording of this comment described Builder as an inner
+ * class of RuleBasedBreakIterator, chosen so that it would have direct access to
+ * the iterator's private members without a "back door" that other classes could
+ * also use.  In this header the class is declared at namespace scope as
+ * RuleBasedBreakIteratorBuilder; how it is granted access to the iterator's
+ * private members (presumably a friend declaration in rbbi.h) is not visible
+ * here -- TODO confirm.
+ */
+class RuleBasedBreakIteratorBuilder {
+
+protected:
+
+    /**
+     * A temporary holding place used for calculating the character categories.
+     * This object contains UnicodeSet objects.
+     */
+    UVector categories;
+
+    /**
+     * A table used to map parts of regexp text to lists of character categories,
+     * rather than having to figure them out from scratch each time
+     */
+    Hashtable expressions;
+
+    /**
+     * A temporary holding place for the list of ignore characters
+     */
+    UnicodeSet ignoreChars;
+
+    /**
+     * A temporary holding place where the forward state table is built
+     */
+    UVector tempStateTable;
+
+    /**
+     * A list of all the states that have to be filled in with transitions to the
+     * next state that is created.  Used when building the state table from the
+     * regular expressions.
+     */
+    UVector decisionPointList;
+
+    /**
+     * A stack for holding decision point lists.  This is used to handle nested
+     * parentheses and braces in regexps.
+     */
+    UStack decisionPointStack;
+
+    /**
+     * A list of states that loop back on themselves.  Used to handle .*?
+     */
+    UVector loopingStates;
+
+    /**
+     * Looping states actually have to be backfilled later in the process
+     * than everything else.  This is where the list of states to backfill
+     * is accumulated.  This is also used to handle .*?
+     */
+    UVector statesToBackfill;
+
+    /**
+     * A list mapping pairs of state numbers for states that are to be combined
+     * to the state number of the state representing their combination.  Used
+     * in the process of making the state table deterministic to prevent
+     * infinite recursion.
+     */
+    UVector mergeList;
+
+    /**
+     * A flag that is used to indicate when the list of looping states can
+     * be reset.
+     */
+    bool_t clearLoopingStates;
+
+public:
+
+    /**
+     * No special construction is required for the Builder.
+     */
+    RuleBasedBreakIteratorBuilder();
+
+    /**
+     * Virtual destructor.  This class declares virtual functions and is meant
+     * to be subclassed (see handleSpecialSubstitution()), so destroying a
+     * subclass instance through a pointer to this class must run the
+     * subclass's destructor.
+     */
+    virtual ~RuleBasedBreakIteratorBuilder() {}
+
+    /**
+     * This is the main function for setting up the BreakIterator's tables.  It
+     * just vectors different parts of the job off to other functions.
+     * <p>NOTE(review): the call site in RuleBasedBreakIterator's constructor
+     * (rbbi.cpp) passes the iterator and the description as arguments, which
+     * does not match this zero-argument declaration; the two still need to be
+     * reconciled.
+     */
+    virtual void buildBreakIterator();
+
+private:
+
+    /**
+     * This function has three main purposes:
+     * <ul><li>Perform general syntax checking on the description, so the rest of the
+     * build code can assume that it's parsing a legal description.
+     * <li>Split the description into separate rules
+     * <li>Perform variable-name substitutions (so that no one else sees variable names)
+     * </ul>
+     * @param description The textual description of the break rules
+     * @return The list of separate rules (presumably with substitutions already
+     * applied, per the purposes above -- TODO confirm against the implementation)
+     */
+    virtual UVector buildRuleList(UnicodeString description);
+
+protected:
+
+    /**
+     * This function performs variable-name substitutions.  First it does syntax
+     * checking on the variable-name definition.  If it's syntactically valid, it
+     * then goes through the remainder of the description and does a simple
+     * find-and-replace of the variable name with its text.  (The variable text
+     * must be enclosed in either [] or () for this to work.)
+     */
+    virtual UnicodeString processSubstitution(UnicodeString substitutionRule, UnicodeString description,
+                    int32_t startPos);
+
+    /**
+     * This function defines a protocol for handling substitution names that
+     * are "special," i.e., that have some property beyond just being
+     * substitutions.  At the RuleBasedBreakIterator level, we have one
+     * special substitution name, "<ignore>".  Subclasses can override this
+     * function to add more.  Any special processing that has to go on beyond
+     * that which is done by the normal substitution-processing code is done
+     * here.
+     */
+    virtual void handleSpecialSubstitution(UnicodeString replace, UnicodeString replaceWith,
+                int32_t startPos, UnicodeString description);
+
+    /**
+     * This function builds the character category table.  On entry,
+     * tempRuleList is a UVector of break rules that has had variable names substituted.
+     * On exit, the charCategoryTable data member has been initialized to hold the
+     * character category table, and tempRuleList's rules have been munged to contain
+     * character category numbers everywhere a literal character or a [] expression
+     * originally occurred.
+     */
+    virtual void buildCharCategories(UVector tempRuleList);
+
+private:
+
+    /**
+     * This is the function that builds the forward state table.  Most of the real
+     * work is done in parseRule(), which is called once for each rule in the
+     * description.
+     */
+    virtual void buildStateTable(UVector tempRuleList);
+
+    /**
+     * This is where most of the work really happens.  This routine parses a single
+     * rule in the rule description, adding and modifying states in the state
+     * table according to the new expression.  The state table is kept deterministic
+     * throughout the whole operation, although some ugly postprocessing is needed
+     * to handle the *? token.
+     * @param rule The text of the rule to parse
+     * @param forward Presumably true when the rule belongs to the forward state
+     * table (compare finishBuildingStateTable()) -- TODO confirm
+     */
+    virtual void parseRule(UnicodeString rule, bool_t forward);
+
+    /**
+     * Update entries in the state table, and merge states when necessary to keep
+     * the table deterministic.
+     * @param rows The list of rows that need updating (the decision point list)
+     * @param pendingChars A character category list, encoded in a UnicodeString.
+     * This is the list of the columns that need updating.
+     * @param newValue Update the cells specified above to contain this value
+     */
+    virtual void updateStateTable(UVector rows,
+                                  UnicodeString pendingChars,
+                                  int16_t newValue);
+
+    /**
+     * The real work of making the state table deterministic happens here.  This function
+     * merges a state in the state table (specified by rowNum) with a state that is
+     * passed in (newValues).  The basic process is to copy the nonzero cells in newValues
+     * into the state in the state table (we'll call that oldValues).  If there's a
+     * collision (i.e., if the same cell has a nonzero value in both states, and it's
+     * not the SAME value), then we have to reconcile the collision.  We do this by
+     * creating a new state, adding it to the end of the state table, and using this
+     * function recursively to merge the original two states into a single, combined
+     * state.  This process may happen recursively (i.e., each successive level may
+     * involve collisions).  To prevent infinite recursion, we keep a log of merge
+     * operations.  Any time we're merging two states we've merged before, we can just
+     * supply the row number for the result of that merge operation rather than creating
+     * a new state just like it.
+     * @param rowNum The row number in the state table of the state to be updated
+     * @param newValues The state to merge it with.
+     * @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable()
+     * (itself a copy of the decision point list from parseRule()).  Newly-created
+     * states get added to the decision point list if their "parents" were on it.
+     */
+    virtual void mergeStates(int32_t rowNum,
+                             int16_t* newValues,
+                             UVector rowsBeingUpdated);
+
+    /**
+     * The merge list is a list of pairs of rows that have been merged somewhere in
+     * the process of building this state table, along with the row number of the
+     * row containing the merged state.  This function looks up a pair of row numbers
+     * and returns the row number of the row they combine into.  (It returns 0 if
+     * this pair of rows isn't in the merge list.)
+     * @param a The row number of one of the two merged rows
+     * @param b The row number of the other merged row
+     * @return The row number of the combined state, or 0 if the pair isn't in
+     * the merge list
+     */
+    virtual int32_t searchMergeList(int32_t a, int32_t b);
+
+    /**
+     * This function is used to update the list of current looping states (i.e.,
+     * states that are controlled by a *? construct).  It backfills values from
+     * the looping states into unpopulated cells of the states that are currently
+     * marked for backfilling, and then updates the list of looping states to be
+     * the new list
+     * @param newLoopingStates The list of new looping states
+     * @param endStates The list of states to treat as end states (states that
+     * can exit the loop).
+     */
+    virtual void setLoopingStates(UVector newLoopingStates, UVector endStates);
+
+    /**
+     * This removes "ending states" and states reachable from them from the
+     * list of states to backfill.
+     * @param baseState The row number of the state to remove from the backfill list
+     */
+    virtual void eliminateBackfillStates(int32_t baseState);
+
+    /**
+     * This function completes the backfilling process by actually doing the
+     * backfilling on the states that are marked for it
+     */
+    virtual void backfillLoopingStates();
+
+    /**
+     * This function completes the state-table-building process by doing several
+     * postprocessing steps and copying everything into its final resting place
+     * in the iterator itself
+     * @param forward True if we're working on the forward state table
+     */
+    virtual void finishBuildingStateTable(bool_t forward);
+
+    /**
+     * This function builds the backward state table from the forward state
+     * table and any additional rules (identified by the ! on the front)
+     * supplied in the description
+     */
+    virtual void buildBackwardsStateTable(UVector tempRuleList);
+
+protected:
+
+    /**
+     * Throws an IllegalArgumentException representing a syntax error in the rule
+     * description.  The exception's message contains some debugging information.
+     * <p>NOTE(review): this function is declared to return void and no exception
+     * type is declared in this header, so how the error is actually reported in
+     * this C++ version cannot be told from here -- TODO confirm and update this
+     * comment.
+     * @param message A message describing the problem
+     * @param position The position in the description where the problem was
+     * discovered
+     * @param context The string containing the error
+     */
+    virtual void error(UnicodeString message, int32_t position, UnicodeString context);
+};
+
+#endif