ICU-2077 RBBI: review comments incorporated. (incomplete, more to come.)

X-SVN-Rev: 9612
2002-08-08 00:39:13 +00:00 · 2002-08-08 00:39:13 +00:00 · 6df1676310
commit 6df1676310
parent 48acf9f6dd
9 changed files with 287 additions and 197 deletions
--- a/icu4c/source/common/brkiter.cpp
+++ b/icu4c/source/common/brkiter.cpp
@ -37,7 +37,7 @@ const int32_t BreakIterator::DONE = (int32_t)-1;

 // -------------------------------------

-// Creates a simple text boundary for word breaks.
+// Creates a break iterator for word breaks.
 BreakIterator*
 BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
 {
@ -49,31 +49,32 @@ BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)

    if (U_FAILURE(status))
        return NULL;
+
    if (!uprv_strcmp(key.getLanguage(), "th"))
    {
        filename = "word_th";
    }

    UDataMemory* file = udata_open(NULL, "brk", filename, &status);
+    if (U_FAILURE(status)) {
+        return NULL;
+    }
+    // The UDataMemory is adopted by the break iterator.

-    if (U_SUCCESS(status)) {
-        if(!uprv_strcmp(filename, "word_th")) {
-            filename = "thaidict.brk";
-            result = new DictionaryBasedBreakIterator(file, filename, status);
-            /* test for NULL */
-            if(result == 0) {
-                status = U_MEMORY_ALLOCATION_ERROR;
-                return 0;
-            }
-        }
-        else {
-            result = new RuleBasedBreakIterator(file, status);
-            /* test for NULL */
-            if(result == 0) {
-                status = U_MEMORY_ALLOCATION_ERROR;
-                return 0;
-            }
-        }
+    if(!uprv_strcmp(filename, "word_th")) {
+        filename = "thaidict.brk";
+        result = new DictionaryBasedBreakIterator(file, filename, status);
+    }
+    else {
+        result = new RuleBasedBreakIterator(file, status);
+    }
+    if (result == NULL) {
+        udata_close(file);
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
+    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
+        delete result;
+        result = NULL;
    }

    return result;
@ -81,7 +82,7 @@ BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)

 // -------------------------------------

-// Creates a simple text boundary for line breaks.
+// Creates a break iterator for line breaks.
 BreakIterator*
 BreakIterator::createLineInstance(const Locale& key, UErrorCode& status)
 {
@ -93,39 +94,39 @@ BreakIterator::createLineInstance(const Locale& key, UErrorCode& status)

    if (U_FAILURE(status))
        return NULL;
+
    if (!uprv_strcmp(key.getLanguage(), "th"))
    {
        filename = "line_th";
    }

    UDataMemory* file = udata_open(NULL, "brk", filename, &status);
-
-    if (!U_FAILURE(status)) {
-        if (!uprv_strcmp(key.getLanguage(), "th")) {
-            filename = "thaidict.brk";
-            result = new DictionaryBasedBreakIterator(file, filename, status);
-            /* test for NULL */
-            if(result == 0) {
-                status = U_MEMORY_ALLOCATION_ERROR;
-                return 0;
-            }
-        }
-        else {
-            result = new RuleBasedBreakIterator(file, status);
-            /* test for NULL */
-            if(result == 0) {
-                status = U_MEMORY_ALLOCATION_ERROR;
-                return 0;
-            }
-        }
+    if (U_FAILURE(status)) {
+        return NULL;
    }
+    // The UDataMemory is adopted by the break iterator.

+    if (!uprv_strcmp(key.getLanguage(), "th")) {
+        filename = "thaidict.brk";
+        result = new DictionaryBasedBreakIterator(file, filename, status);
+    }
+    else {
+        result = new RuleBasedBreakIterator(file, status);
+    }
+    if (result == NULL) {
+        udata_close(file);
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
+    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
+        delete result;
+        result = NULL;
+    }
    return result;
 }

 // -------------------------------------

-// Creates a simple text boundary for character breaks.
+// Creates a break iterator for character breaks.
 BreakIterator*
 BreakIterator::createCharacterInstance(const Locale& /* key */, UErrorCode& status)
 {
@ -138,22 +139,26 @@ BreakIterator::createCharacterInstance(const Locale& /* key */, UErrorCode& stat
    if (U_FAILURE(status))
        return NULL;
    UDataMemory* file = udata_open(NULL, "brk", filename, &status);
-
-    if (!U_FAILURE(status)) {
-        result = new RuleBasedBreakIterator(file, status);
-        /* test for NULL */
-        if(result == 0) {
-            status = U_MEMORY_ALLOCATION_ERROR;
-            return 0;
-        }
+    if (U_FAILURE(status)) {
+        return NULL;
    }
+    // The UDataMemory is adopted by the break iterator.

+    result = new RuleBasedBreakIterator(file, status);
+    if (result == NULL) {
+        udata_close(file);
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
+    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
+        delete result;
+        result = NULL;
+    }
    return result;
 }

 // -------------------------------------

-// Creates a simple text boundary for sentence breaks.
+// Creates a break iterator for sentence breaks.
 BreakIterator*
 BreakIterator::createSentenceInstance(const Locale& /*key */, UErrorCode& status)
 {
@ -166,14 +171,19 @@ BreakIterator::createSentenceInstance(const Locale& /*key */, UErrorCode& status
    if (U_FAILURE(status))
        return NULL;
    UDataMemory* file = udata_open(NULL, "brk", filename, &status);
+    if (U_FAILURE(status)) {
+        return NULL;
+    }
+    // The UDataMemory is adopted by the break iterator.

-    if (!U_FAILURE(status)) {
-        result = new RuleBasedBreakIterator(file, status);
-        /* test for NULL */
-            if(result == 0) {
-                status = U_MEMORY_ALLOCATION_ERROR;
-                return 0;
-            }
+    result = new RuleBasedBreakIterator(file, status);
+    if (result == NULL) {
+        udata_close(file);
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
+    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
+        delete result;
+        result = NULL;
    }

    return result;
@ -181,7 +191,7 @@ BreakIterator::createSentenceInstance(const Locale& /*key */, UErrorCode& status

 // -------------------------------------

-// Creates a simple text boundary for title casing breaks.
+// Creates a break iterator for title casing breaks.
 BreakIterator*
 BreakIterator::createTitleInstance(const Locale& /* key */, UErrorCode& status)
 {
@ -194,14 +204,19 @@ BreakIterator::createTitleInstance(const Locale& /* key */, UErrorCode& status)
    if (U_FAILURE(status))
        return NULL;
    UDataMemory* file = udata_open(NULL, "brk", filename, &status);
+    if (U_FAILURE(status)) {
+        return NULL;
+    }
+    // The UDataMemory is adopted by the break iterator.

-    if (!U_FAILURE(status)) {
-        result = new RuleBasedBreakIterator(file, status);
-        /* test for NULL */
-        if(result == 0) {
-            status = U_MEMORY_ALLOCATION_ERROR;
-            return 0;
-        }
+    result = new RuleBasedBreakIterator(file, status);
+    if (result == NULL) {
+        udata_close(file);
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
+    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
+        delete result;
+        result = NULL;
    }

    return result;
@ -234,11 +249,11 @@ BreakIterator::getDisplayName(const Locale& objectLocale,
    return objectLocale.getDisplayName(displayLocale, name);
 }

-// -------------------------------------
-
-// Needed because we declare the copy constructor (in order to prevent synthesizing one) and
-// so the default constructor is no longer synthesized.
-
+// ------------------------------------------
+//
+// Default constructor and destructor
+//
+//-------------------------------------------
 BreakIterator::BreakIterator()
 {
    fBufferClone = FALSE;
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@ -18,6 +18,7 @@
 #include "rbbirb.h"
 #include "filestrm.h"
 #include "cmemory.h"
+#include "cstring.h"

 #include "uassert.h"

@ -25,8 +26,7 @@ U_NAMESPACE_BEGIN


 static const int16_t START_STATE = 1;     // The state number of the starting state
-
-static const int16_t STOP_STATE = 0;      // The state-transition value indicating "stop"
+static const int16_t STOP_STATE  = 0;     // The state-transition value indicating "stop"

 /**
 * Class ID.  (value is irrelevant; address is important)
@ -86,6 +86,10 @@ RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
    if (U_FAILURE(status)) {return;};
    RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
        RBBIRuleBuilder::createRuleBasedBreakIterator(rules, parseError, status);
+    // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
+    //        creates and returns a complete RBBI.  From here, in a constructor, we
+    //        can't just return the object created by the builder factory, hence
+    //        the assignment of the factory created object to "this".
    if (U_SUCCESS(status)) {
        *this = *bi;
        delete bi;
@ -118,16 +122,15 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& oth
 }


-//=======================================================================
-// boilerplate
-//=======================================================================
 /**
 * Destructor
 */
 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
    delete fText;
+    fText = NULL;
    if (fData != NULL) {
        fData->removeReference();
+        fData = NULL;
    }
 }

@ -163,6 +166,7 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
 //-----------------------------------------------------------------------------
 //
 //    init()      Shared initialization routine.   Used by all the constructors.
+//                Initializes all fields, leaving the object in a consistent state.
 //
 //-----------------------------------------------------------------------------
 UBool RuleBasedBreakIterator::fTrace = FALSE;
@ -179,7 +183,7 @@ void RuleBasedBreakIterator::init() {
    if (debugInitDone == FALSE) {
 #ifdef RBBI_DEBUG
        char *debugEnv = getenv("U_RBBIDEBUG");
-        if (debugEnv && strstr(debugEnv, "trace")) {
+        if (debugEnv && uprv_strstr(debugEnv, "trace")) {
            fTrace = TRUE;
        }
 #endif
@ -268,7 +272,7 @@ RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
    reset();
    delete fText;
    fText = newText;
-    fText->first();
+    this->first();
 }

 /**
@ -286,8 +290,8 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) {
    else {
        delete fText;
        fText = new StringCharacterIterator(newText);
-        fText->first();
    }
+    this->first();
 }


@ -435,11 +439,14 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
    fLastBreakTagValid = TRUE;
    if (fText == NULL || offset >= fText->endIndex()) {
        // fText->setToEnd();
-        return BreakIterator::DONE;
+        // return BreakIterator::DONE;
+        last();
+        return next();
    }
    else if (offset < fText->startIndex()) {
        // fText->setToStart();
-        return fText->startIndex();
+        // return fText->startIndex();
+        return first();
    }

    // otherwise, set our internal iteration position (temporarily)
@ -476,10 +483,11 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
    // just return DONE; if it's before the beginning, return the
    // text's starting offset
    if (fText == NULL || offset > fText->endIndex()) {
-        return BreakIterator::DONE;
+        // return BreakIterator::DONE;
+        return last();
    }
    else if (offset < fText->startIndex()) {
-        return fText->startIndex();
+        return first();
    }

    // if we start by updating the current iteration position to the
@ -499,19 +507,25 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
    // the beginning index of the iterator is always a boundary position by definition
    if (fText == NULL || offset == fText->startIndex()) {
+        first();       // For side effects on current position, tag values.
        return TRUE;
    }

    // out-of-range indexes are never boundary positions
-    else if (offset < fText->startIndex() || offset > fText->endIndex()) {
+    if (offset < fText->startIndex()) {
+        first();       // For side effects on current position, tag values.
+        return FALSE;
+    }
+
+    if (offset > fText->endIndex()) {
+        last();        // For side effects on current position, tag values.
        return FALSE;
    }

    // otherwise, we can use following() on the position before the specified
-    // one and return true of the position we get back is the one the user
+    // one and return true if the position we get back is the one the user
    // specified
-    else
-        return following(offset - 1) == offset;
+    return following(offset - 1) == offset;
 }

 /**
@ -555,7 +569,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
    int32_t result = fText->getIndex() + 1;
    int32_t lookaheadResult = 0;

-    // begin in state 1
+    // Initialize the state machine.  Begin in state 1
    int32_t            state           = START_STATE;
    int16_t            category;
    UChar32            c               = fText->current32();
@ -565,16 +579,19 @@ int32_t RuleBasedBreakIterator::handleNext(void) {

    fLastBreakTag = 0;

-    row = (RBBIStateTableRow *)
+    row = (RBBIStateTableRow *)    // Point to starting row of state table.
        (fData->fForwardTable->fTableData + (fData->fForwardTable->fRowLen * state));
+
+    // Character Category fetch for starting character.
+    //    See comments on character category code within loop, below.
    UTRIE_GET16(&fData->fTrie, c, category);
    if ((category & 0x4000) != 0)  {
          fDictionaryCharCount++;
          category &= ~0x4000;
        }

-      // loop until we reach the end of the text or transition to state 0
-      for (;;) {
+    // loop until we reach the end of the text or transition to state 0
+    for (;;) {
        if (c == CharacterIterator::DONE && fText->hasNext()==FALSE) {
            // Note: CharacterIterator::DONE is 0xffff, which is also a legal
            //       character value.  Check for DONE first, because it's quicker,
@ -586,15 +603,16 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
        // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
        //        not the size of the character going in.
        //
-        //  And off bit 14, which flags use of a dictionary for dictionary based
-        //    iterators, but should be ignored here.
        UTRIE_GET16(&fData->fTrie, c, category);

        // Check the dictionary bit in the character's category.
-        //    Counter is only used by dictionary based iterators.
+        //    Counter is only used by dictionary based iterators (subclasses).
+        //    Chars that need to be handled by a dictionary have a flag bit set
+        //    in their category values.
        //
        if ((category & 0x4000) != 0)  {
            fDictionaryCharCount++;
+            //  Mask (AND) off the dictionary flag bit.
            category &= ~0x4000;
        }

@ -616,6 +634,8 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
        // Get the next character.  Doing it here positions the iterator
        //    to the correct position for recording matches in the code that
        //    follows.
+        //  TODO:  16 bit next, and a 16 bit TRIE lookup, with escape code
+        //         for non-BMP chars, would be faster.
        c = fText->next32();

        if (row->fAccepting == 0 && row->fLookAhead == 0) {
@ -636,7 +656,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
        if (row->fAccepting == 0 && row->fLookAhead != 0) {
            // Lookahead match point.  Remember it, but only if no other rule has
            //                         unconitionally matched up to this point.
-            // TODO:  handle case where there's a pending match from a different rule
+            // TODO:  handle case where there's a pending match from a different rule -
            //        where lookaheadStatus != 0  && lookaheadStatus != row->fLookAhead.
            int32_t  r = fText->getIndex();
            if (r > result) {
@ -672,6 +692,7 @@ continueOn:
    // a lookahead state, advance the break position to the lookahead position
    // (the theory here is that if there are no characters at all after the lookahead
    // position, that always matches the lookahead criteria)
+    //   TODO:  is this really the right behavior?
    if (c == CharacterIterator::DONE &&
        fText->hasNext()==FALSE &&
        lookaheadResult == fText->endIndex()) {
@ -694,8 +715,9 @@ continueOn:
 //      This method backs the iterator back up to a "safe position" in the text.
 //      This is a position that we know, without any context, must be a break position.
 //      The various calling methods then iterate forward from this safe position to
-//      the appropriate position to return.  (For more information, see the description
-//      of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
+//      the appropriate position to return.
+//
+//      The logic of this function is very similar to handleNext(), above.
 //
 //-----------------------------------------------------------------------------------
 int32_t RuleBasedBreakIterator::handlePrevious(void) {
@ -833,18 +855,27 @@ RuleBasedBreakIterator::reset()

 //-------------------------------------------------------------------------------
 //
-//   getRuleStatus()
+//   getRuleStatus()   Return the break rule tag associated with the current
+//                     iterator position.  If the iterator arrived at its current
+//                     position by iterating forwards, the value will have been
+//                     cached by the handleNext() function.
+//
+//                     If no cached status value is available, the status is
+//                     found by doing a previous() followed by a next(), which
+//                     leaves the iterator where it started, and computes the
+//                     status while doing the next().
 //
 //-------------------------------------------------------------------------------
 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
-    // If the break tag value is unkown, back the iterator up, then move
-    //   forward again.  Moving forward will set the fLastBreakTag value correctly.
    RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
    if (fLastBreakTagValid == FALSE) {
-        if (current() == fText->startIndex()) {
+        //  No cached status is available.
+        if (fText == NULL || current() == fText->startIndex()) {
+            //  At start of text, or there is no text.  Status is always zero.
            nonConstThis->fLastBreakTag = 0;
            nonConstThis->fLastBreakTagValid = TRUE;
        } else {
+            //  Not at start of text.  Find status the tedious way.
            int32_t pa = current();
            nonConstThis->previous();
            int32_t pb = nonConstThis->next();
@ -857,7 +888,7 @@ int32_t  RuleBasedBreakIterator::getRuleStatus() const {

 //-------------------------------------------------------------------------------
 //
-//   getFlattenedData      Access to the compiled form of the rules,
+//   getBinaryRules        Access to the compiled form of the rules,
 //                         for use by build system tools that save the data
 //                         for standard iterator types.
 //
@ -868,7 +899,7 @@ const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {

    if (fData != NULL) {
        retPtr = (const uint8_t *)fData->fHeader;
-         length = fData->fHeader->fLength;
+        length = fData->fHeader->fLength;
    }
    return retPtr;
 }
--- a/icu4c/source/common/rbbidata.cpp
+++ b/icu4c/source/common/rbbidata.cpp
@ -1,8 +1,8 @@
 /*
-**********************************************************************
+***************************************************************************
 *   Copyright (C) 1999-2002 International Business Machines Corporation   *
-*   and others. All rights reserved.                                 *
-**********************************************************************
+*   and others. All rights reserved.                                      *
+***************************************************************************
 */

 #include "unicode/utypes.h"
@ -156,7 +156,7 @@ int32_t  RBBIDataWrapper::hashCode() {
 //
 //-----------------------------------------------------------------------------
 void RBBIDataWrapper::removeReference() {
-    if (umtx_atomic_dec(&fRefCount) == 0) {  
+    if (umtx_atomic_dec(&fRefCount) == 0) {
        delete this;
    }
 };
@ -221,9 +221,4 @@ void  RBBIDataWrapper::printData() {



-
-
-
-
-
 U_NAMESPACE_END
--- a/icu4c/source/common/rbbisetb.h
+++ b/icu4c/source/common/rbbisetb.h
@ -35,7 +35,8 @@ U_NAMESPACE_BEGIN
 //     All of them are strung together in a linked list, which is kept in order
 //     (by character)
 //
-struct RangeDescriptor : public UObject {
+class RangeDescriptor : public UObject {
+public:
    UChar32            fStartChar;      // Start of range, unicode 32 bit value.
    UChar32            fEndChar;        // End of range, unicode 32 bit value.
    int32_t            fNum;            // runtime-mapped input value for this range.
--- a/icu4c/source/common/ubrk.cpp
+++ b/icu4c/source/common/ubrk.cpp
@ -94,22 +94,27 @@ ubrk_openRules(  const UChar        *rules,
                       UParseError  *parseErr,
                       UErrorCode   *status)  {

-    BreakIterator *result = 0;
+    if (status == NULL || U_FAILURE(*status)){
+        return 0;
+    }

+    BreakIterator *result = 0;
    UnicodeString ruleString(rules, rulesLength);
    result = RBBIRuleBuilder::createRuleBasedBreakIterator(ruleString, *parseErr, *status);
    if(U_FAILURE(*status)) {
        return 0;
    }

-    UCharCharacterIterator *iter = 0;
-    iter = new UCharCharacterIterator(text, textLength);
-    if(iter == 0) {
-        *status = U_MEMORY_ALLOCATION_ERROR;
-        delete result;
-        return 0;
+    if (text != NULL) {
+        UCharCharacterIterator *iter = 0;
+        iter = new UCharCharacterIterator(text, textLength);
+        if(iter == 0) {
+            *status = U_MEMORY_ALLOCATION_ERROR;
+            delete result;
+            return 0;
+        }
+        result->adoptText(iter);
    }
-    result->adoptText(iter);
    return (UBreakIterator *)result;
 }

@ -243,7 +248,7 @@ ubrk_countAvailable()
 }


-U_CAPI  UBool U_EXPORT2 
+U_CAPI  UBool U_EXPORT2
 ubrk_isBoundary(UBreakIterator *bi, int32_t offset)
 {
    return ((BreakIterator *)bi)->isBoundary(offset);
--- a/icu4c/source/common/unicode/brkiter.h
+++ b/icu4c/source/common/unicode/brkiter.h
@ -1,10 +1,10 @@
 /*
 *****************************************************************************************
-*   Copyright (C) 1997-2001, International Business Machines
+*   Copyright (C) 1997-2002, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *****************************************************************************************
 *
-* File BRKITER.H
+* File brkiter.h
 *
 * Modification History:
 *
@ -65,13 +65,13 @@ U_NAMESPACE_BEGIN
 * <P>
 * Helper function to output text
 * <pre>
- * \code 
+ * \code
 *    void printTextRange( BreakIterator& iterator, int32_t start, int32_t end )
 *    {
 *        UnicodeString textBuffer, temp;
 *        CharacterIterator *strIter = iterator.createText();
 *        strIter->getText(temp);
- *        cout << " " << start << " " << end << " |" 
+ *        cout << " " << start << " " << end << " |"
 *             << temp.extractBetween(start, end, textBuffer)
 *             << "|" << endl;
 *        delete strIter;
@ -149,7 +149,7 @@ U_NAMESPACE_BEGIN
 *           BreakIterator* boundary;
 *           UnicodeString stringToExamine("Aaa bbb ccc. Ddd eee fff.");
 *           cout << "Examining: " << stringToExamine << endl;
- * 
+ *
 *           //print each sentence in forward and reverse order
 *           boundary = BreakIterator::createSentenceInstance( Locale::US );
 *           boundary->setText(stringToExamine);
@ -158,7 +158,7 @@ U_NAMESPACE_BEGIN
 *           cout << "----- backward: ----------" << endl;
 *           printEachBackward(*boundary);
 *           delete boundary;
- * 
+ *
 *           //print each word in order
 *           boundary = BreakIterator::createWordInstance();
 *           boundary->setText(stringToExamine);
@ -173,7 +173,7 @@ U_NAMESPACE_BEGIN
 *           //print word at charpos 10
 *           cout << "----- at pos 10: ---------" << endl;
 *           printAt(*boundary, 10 );
- * 
+ *
 *           delete boundary;
 *       }
 * \endcode
@ -222,6 +222,8 @@ public:

    /**
     * Return a CharacterIterator over the text being analyzed.
+     * Changing the state of the returned iterator can have undefined consequences
+     * on the operation of the break iterator.  If you need to change it, clone it first.
     * @stable
     */
    virtual const CharacterIterator& getText(void) const = 0;
@ -278,8 +280,7 @@ public:
    virtual int32_t next(void) = 0;

    /**
-     * Return character index of the text boundary that was most recently
-     * returned by next(), previous(), first(), or last()
+     * Return character index of the current iterator position within the text.
     * @return The boundary most recently returned.
     * @stable
     */
@ -304,9 +305,11 @@ public:
     * @stable
     */
    virtual int32_t preceding(int32_t offset) = 0;
- 
+
    /**
     * Return true if the specfied position is a boundary position.
+     * As a side effect, the current position of the iterator is set
+     * to the first boundary position at or following the specified offset.
     * @param offset the offset to check.
     * @return True if "offset" is a boundary position.
     * @stable
@ -328,22 +331,22 @@ public:
     * Create BreakIterator for word-breaks using the given locale.
     * Returns an instance of a BreakIterator implementing word breaks.
     * WordBreak is useful for word selection (ex. double click)
-     * @param where the locale. 
+     * @param where the locale.
     * @param status the error code
-     * @return A BreakIterator for word-breaks.  The UErrorCode& status 
+     * @return A BreakIterator for word-breaks.  The UErrorCode& status
     * parameter is used to return status information to the user.
     * To check whether the construction succeeded or not, you should check
     * the value of U_SUCCESS(err).  If you wish more detailed information, you
     * can check for informational error results which still indicate success.
-     * U_USING_FALLBACK_ERROR indicates that a fall back locale was used.  For
+     * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
     * example, 'de_CH' was requested, but nothing was found there, so 'de' was
-     * used.  U_USING_DEFAULT_ERROR indicates that the default locale data was
+     * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
     * used; neither the requested locale nor any of its fall back locales
     * could be found.
     * The caller owns the returned object and is responsible for deleting it.
     * @stable
     */
-    static BreakIterator* createWordInstance(const Locale& where, 
+    static BreakIterator* createWordInstance(const Locale& where,
                                                   UErrorCode& status);

    /**
@ -354,84 +357,84 @@ public:
     * LineBreak is useful for word wrapping text.
     * @param where the locale.
     * @param status The error code.
-     * @return A BreakIterator for line-breaks.  The UErrorCode& status 
+     * @return A BreakIterator for line-breaks.  The UErrorCode& status
     * parameter is used to return status information to the user.
     * To check whether the construction succeeded or not, you should check
     * the value of U_SUCCESS(err).  If you wish more detailed information, you
     * can check for informational error results which still indicate success.
-     * U_USING_FALLBACK_ERROR indicates that a fall back locale was used.  For
+     * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
     * example, 'de_CH' was requested, but nothing was found there, so 'de' was
-     * used.  U_USING_DEFAULT_ERROR indicates that the default locale data was
+     * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
     * used; neither the requested locale nor any of its fall back locales
     * could be found.
     * The caller owns the returned object and is responsible for deleting it.
     * @stable
     */
-    static BreakIterator* createLineInstance(const Locale& where, 
+    static BreakIterator* createLineInstance(const Locale& where,
                                                   UErrorCode& status);

    /**
     * Create BreakIterator for character-breaks using specified locale
     * Returns an instance of a BreakIterator implementing character breaks.
     * Character breaks are boundaries of combining character sequences.
-     * @param where the locale. 
+     * @param where the locale.
     * @param status The error code.
-     * @return A BreakIterator for character-breaks.  The UErrorCode& status 
+     * @return A BreakIterator for character-breaks.  The UErrorCode& status
     * parameter is used to return status information to the user.
     * To check whether the construction succeeded or not, you should check
     * the value of U_SUCCESS(err).  If you wish more detailed information, you
     * can check for informational error results which still indicate success.
-     * U_USING_FALLBACK_ERROR indicates that a fall back locale was used.  For
+     * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
     * example, 'de_CH' was requested, but nothing was found there, so 'de' was
-     * used.  U_USING_DEFAULT_ERROR indicates that the default locale data was
+     * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
     * used; neither the requested locale nor any of its fall back locales
     * could be found.
     * The caller owns the returned object and is responsible for deleting it.
     * @stable
     */
-    static BreakIterator* createCharacterInstance(const Locale& where, 
+    static BreakIterator* createCharacterInstance(const Locale& where,
                                                        UErrorCode& status);

    /**
     * Create BreakIterator for sentence-breaks using specified locale
     * Returns an instance of a BreakIterator implementing sentence breaks.
-     * @param where the locale. 
+     * @param where the locale.
     * @param status The error code.
-     * @return A BreakIterator for sentence-breaks.  The UErrorCode& status 
+     * @return A BreakIterator for sentence-breaks.  The UErrorCode& status
     * parameter is used to return status information to the user.
     * To check whether the construction succeeded or not, you should check
     * the value of U_SUCCESS(err).  If you wish more detailed information, you
     * can check for informational error results which still indicate success.
-     * U_USING_FALLBACK_ERROR indicates that a fall back locale was used.  For
+     * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
     * example, 'de_CH' was requested, but nothing was found there, so 'de' was
-     * used.  U_USING_DEFAULT_ERROR indicates that the default locale data was
+     * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
     * used; neither the requested locale nor any of its fall back locales
     * could be found.
     * The caller owns the returned object and is responsible for deleting it.
     * @stable
     */
-    static BreakIterator* createSentenceInstance(const Locale& where, 
+    static BreakIterator* createSentenceInstance(const Locale& where,
                                                       UErrorCode& status);

    /**
     * Create BreakIterator for title-casing breaks using the specified locale
     * Returns an instance of a BreakIterator implementing title breaks.
-     * @param where the locale. 
+     * @param where the locale.
     * @param status The error code.
-     * @return A BreakIterator for title-breaks.  The UErrorCode& status 
+     * @return A BreakIterator for title-breaks.  The UErrorCode& status
     * parameter is used to return status information to the user.
     * To check whether the construction succeeded or not, you should check
     * the value of U_SUCCESS(err).  If you wish more detailed information, you
     * can check for informational error results which still indicate success.
-     * U_USING_FALLBACK_ERROR indicates that a fall back locale was used.  For
+     * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
     * example, 'de_CH' was requested, but nothing was found there, so 'de' was
-     * used.  U_USING_DEFAULT_ERROR indicates that the default locale data was
+     * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
     * used; neither the requested locale nor any of its fall back locales
     * could be found.
     * The caller owns the returned object and is responsible for deleting it.
-     * @stable
+     * @draft ICU 2.1
     */
-    static BreakIterator* createTitleInstance(const Locale& where, 
+    static BreakIterator* createTitleInstance(const Locale& where,
                                                       UErrorCode& status);

    /**
@ -469,24 +472,30 @@ public:
    /**
     * Thread safe client-buffer-based cloning operation
     *    Do NOT call delete on a safeclone, since 'new' is not used to create it.
-     * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated. 
+     * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
     * If buffer is not large enough, new memory will be allocated.
-     * @param BufferSize reference to size of allocated space. 
-     * If BufferSize == 0, a sufficient size for use in cloning will 
+     * @param BufferSize reference to size of allocated space.
+     * If BufferSize == 0, a sufficient size for use in cloning will
     * be returned ('pre-flighting')
-     * If BufferSize is not enough for a stack-based safe clone, 
+     * If BufferSize is not enough for a stack-based safe clone,
     * new memory will be allocated.
     * @param status to indicate whether the operation went on smoothly or there were errors
-     *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were 
+     *  An informational status value, U_SAFECLONE_ALLOCATED_WARNING, is used if any allocations were
     *  necessary.
     * @return pointer to the new clone
-     *  
-     * @draft ICU 1.8
+     *
+     * @stable
     */
    virtual BreakIterator *  createBufferClone(void *stackBuffer,
                                               int32_t &BufferSize,
                                               UErrorCode &status) = 0;

+    /**
+     *   Determine whether the BreakIterator was created in user memory by
+     *   createBufferClone(), and thus should not be deleted.  Such objects
+     *   must be closed by an explicit call to the destructor (not delete).
+     *  @stable
+     */
    inline UBool isBufferClone(void);


--- a/icu4c/source/common/unicode/rbbi.h
+++ b/icu4c/source/common/unicode/rbbi.h
@ -24,9 +24,9 @@ struct UTrie;
 U_NAMESPACE_BEGIN

 struct RBBIDataHeader;
-class RuleBasedBreakIteratorTables;
-class BreakIterator;
-class RBBIDataWrapper;
+class  RuleBasedBreakIteratorTables;
+class  BreakIterator;
+class  RBBIDataWrapper;



@ -37,10 +37,6 @@ class RBBIDataWrapper;
 * <p>See the ICU User Guide for information on Break Iterator Rules.</p>
 *
 */
-
-
-
-
 class U_COMMON_API RuleBasedBreakIterator : public BreakIterator {

 protected:
@ -74,7 +70,7 @@ protected:
    uint32_t           fDictionaryCharCount;

    //
-    // Debugging flag.
+    // Debugging flag.  Trace operation of state machine when true.
    //
    static UBool        fTrace;

@ -117,7 +113,8 @@ protected:
 public:

    /** Default constructor.  Creates an empty shell of an iterator, with no
-     *  rules or text to iterate over.   Object can subsequently be assigned.
+     *  rules or text to iterate over.   Object can subsequently be assigned to.
+     *  @draft ICU 2.2
     */
    RuleBasedBreakIterator();

@ -134,12 +131,14 @@ public:
     * @param parseError  In the event of a syntax error in the rules, provides the location
     *                    within the rules of the problem.
     * @param status Information on any errors encountered.
+     *  @draft ICU 2.2
     */
    RuleBasedBreakIterator( const UnicodeString    &rules,
                             UParseError           &parseError,
                             UErrorCode            &status);
    /**
     * Destructor
+     *  @stable
     */
    virtual ~RuleBasedBreakIterator();

@ -148,6 +147,7 @@ public:
     * and iterate over the same text, as the one passed in.
     * @param that The RuleBasedBreakItertor passed in
     * @return the newly created RuleBasedBreakIterator
+     *  @stable
     */
    RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);

@ -157,6 +157,7 @@ public:
     * @param that The BreakIterator to be compared for equality
     * @Return TRUE if both BreakIterators are of the
     * same class, have the same behavior, and iterate over the same text.
+     *  @stable
     */
    virtual UBool operator==(const BreakIterator& that) const;

@ -165,6 +166,7 @@ public:
     * and vice versa.
     * @param that The BreakIterator to be compared for inequality
     * @return TRUE if both BreakIterators are not same.
+     *  @stable
     */
    UBool operator!=(const BreakIterator& that) const;

@ -175,18 +177,21 @@ public:
     *   will correctly clone (copy) a derived class.
     * clone() is thread safe.  Multiple threads may simultaeneously
     * clone the same source break iterator.
+     *  @stable
     */
    virtual BreakIterator* clone() const;

    /**
     * Compute a hash code for this BreakIterator
     * @return A hash code
+     *  @stable
     */
    virtual int32_t hashCode(void) const;

    /**
     * Returns the description used to create this iterator
     * @return the description used to create this iterator
+     *  @stable
     */
    virtual const UnicodeString& getRules(void) const;

@ -200,6 +205,7 @@ public:
     * Changing the state of this iterator can have undefined consequences.  If
     * you need to change it, clone it first.
     * @return An iterator over the text being analyzed.
+     *  @stable
     */
    virtual const CharacterIterator& getText(void) const;

@ -209,6 +215,7 @@ public:
     * the current iteration position to the beginning of the text.
     * @param newText An iterator over the text to analyze.  The BreakIterator
     * takes ownership of the character iterator.  The caller MUST NOT delete it!
+     *  @stable
     */
    virtual void adoptText(CharacterIterator* newText);

@ -216,6 +223,7 @@ public:
     * Set the iterator to analyze a new piece of text.  This function resets
     * the current iteration position to the beginning of the text.
     * @param newText The text to analyze.
+     *  @stable
     */
    virtual void setText(const UnicodeString& newText);

@ -223,6 +231,7 @@ public:
     * Sets the current iteration position to the beginning of the text.
     * (i.e., the CharacterIterator's starting offset).
     * @return The offset of the beginning of the text.
+     *  @stable
     */
    virtual int32_t first(void);

@ -230,6 +239,7 @@ public:
     * Sets the current iteration position to the end of the text.
     * (i.e., the CharacterIterator's ending offset).
     * @return The text's past-the-end offset.
+     *  @stable
     */
    virtual int32_t last(void);

@ -241,18 +251,21 @@ public:
     * (negative is backwards, and positive is forwards).
     * @return The character offset of the boundary position n boundaries away from
     * the current one.
+     *  @stable
     */
    virtual int32_t next(int32_t n);

    /**
     * Advances the iterator to the next boundary position.
     * @return The position of the first boundary after this one.
+     *  @stable
     */
    virtual int32_t next(void);

    /**
     * Moves the iterator backwards, to the last boundary preceding this one.
     * @return The position of the last boundary position preceding this one.
+     *  @stable
     */
    virtual int32_t previous(void);

@ -261,6 +274,7 @@ public:
     * the specified position.
     * @param offset The position from which to begin searching for a break position.
     * @return The position of the first break after the current position.
+     *  @stable
     */
    virtual int32_t following(int32_t offset);

@ -269,6 +283,7 @@ public:
     * specified position.
     * @param offset The position to begin searching for a break from.
     * @return The position of the last boundary before the starting position.
+     *  @stable
     */
    virtual int32_t preceding(int32_t offset);

@ -278,12 +293,14 @@ public:
     * or after "offset".
     * @param offset the offset to check.
     * @return True if "offset" is a boundary position.
+     *  @stable
     */
    virtual UBool isBoundary(int32_t offset);

    /**
     * Returns the current iteration position.
     * @return The current iteration position.
+     * @stable
     */
    virtual int32_t current(void) const;

@ -295,6 +312,7 @@ public:
     * status, a default value of 0 is returned.
     * @return the status from the break rule that determined the most recently
     * returned break position.
+     * @draft ICU 2.2
     */
    virtual int32_t getRuleStatus() const;

@ -336,7 +354,7 @@ public:
     *                     buffer size, but do not clone the object.  If the
     *                     size was too small (but not zero), allocate heap
     *                     storage for the cloned object.
-     * 
+     *
     * @param status       Error status.  U_SAFECLONE_ALLOCATED_WARNING will be
     *                     returned if the the provided buffer was too small, and
     *                     the clone was therefore put on the heap.
@ -344,6 +362,7 @@ public:
     * @return  Pointer to the clone object.  This may differ from the stackBuffer
     *          address if the byte alignment of the stack buffer was not suitable
     *          or if the stackBuffer was too small to hold the clone.
+     * @stable
     */
    virtual BreakIterator *  createBufferClone(void *stackBuffer,
                                               int32_t &BufferSize,
@ -365,6 +384,7 @@ public:
     * @return   A pointer to the binary (compiled) rule data.  The storage
     *           belongs to the RulesBasedBreakIterator object, not the
     *           caller, and must not be modified or deleted.
+     * @internal
     */
    virtual const uint8_t *getBinaryRules(uint32_t &length);

--- a/icu4c/source/common/unicode/ubrk.h
+++ b/icu4c/source/common/unicode/ubrk.h
@ -47,7 +47,7 @@
 * typically starts of words, that should be set to Title Case
 * when title casing the text.
 * <P>
- * 
+ *
 * This is the interface for all text boundaries.
 * <P>
 * Examples:
@ -204,15 +204,27 @@ typedef enum UBreakIteratorType UBreakIteratorType;
 *  than for single individual values.
 */
 enum UWordBreak {
+    /** Tag value for "words" that do not fit into any of the other categories.
+     *  Includes spaces and most punctuation. */
    UBRK_WORD_NONE           = 0,
+    /** Upper bound for tags for uncategorized words. */
    UBRK_WORD_NONE_LIMIT     = 100,
+    /** Tag value for words that appear to be numbers, lower limit.    */
    UBRK_WORD_NUMBER         = 100,
+    /** Tag value for words that appear to be numbers, upper limit.    */
    UBRK_WORD_NUMBER_LIMIT   = 200,
+    /** Tag value for words that contain letters, excluding
+     *  hiragana, katakana or ideographic characters, lower limit.    */
    UBRK_WORD_LETTER         = 200,
+    /** Tag value for words containing letters, upper limit  */
    UBRK_WORD_LETTER_LIMIT   = 300,
-    UBRK_WORD_HIRAKATA       = 300,
-    UBRK_WORD_HIRAKATA_LIMIT = 400,
+    /** Tag value for words containing kana characters, lower limit */
+    UBRK_WORD_KANA           = 300,
+    /** Tag value for words containing kana characters, upper limit */
+    UBRK_WORD_KANA_LIMIT     = 400,
+    /** Tag value for words containing ideographic characters, lower limit */
    UBRK_WORD_IDEO           = 400,
+    /** Tag value for words containing ideographic characters, upper limit */
    UBRK_WORD_IDEO_LIMIT     = 500
 };
 typedef enum UWordBreak UWordBreak;
@ -232,7 +244,7 @@ typedef enum UWordBreak UWordBreak;
 * @see ubrk_openRules
 * @stable
 */
-U_CAPI UBreakIterator* U_EXPORT2 
+U_CAPI UBreakIterator* U_EXPORT2
 ubrk_open(UBreakIteratorType type,
      const char *locale,
      const UChar *text,
@ -252,9 +264,9 @@ ubrk_open(UBreakIteratorType type,
 * @param status A UErrorCode to receive any errors.
 * @return A UBreakIterator for the specified rules.
 * @see ubrk_open
- * @draft
+ * @draft ICU 2.2
 */
-U_CAPI UBreakIterator* U_EXPORT2 
+U_CAPI UBreakIterator* U_EXPORT2
 ubrk_openRules(const UChar     *rules,
               int32_t         rulesLength,
               const UChar     *text,
@ -276,9 +288,9 @@ ubrk_openRules(const UChar     *rules,
 * @param status to indicate whether the operation went on smoothly or there were errors
 *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.
 * @return pointer to the new clone
- * @draft ICU 1.8
+ * @stable
 */
-U_CAPI UBreakIterator * U_EXPORT2 
+U_CAPI UBreakIterator * U_EXPORT2
 ubrk_safeClone(
          const UBreakIterator *bi,
          void *stackBuffer,
@ -293,7 +305,7 @@ ubrk_safeClone(
 * @param bi The break iterator to close.
 * @stable
 */
-U_CAPI void U_EXPORT2 
+U_CAPI void U_EXPORT2
 ubrk_close(UBreakIterator *bi);

 /**
@ -304,7 +316,7 @@ ubrk_close(UBreakIterator *bi);
 * @param status The error code
 * @stable
 */
-U_CAPI void U_EXPORT2 
+U_CAPI void U_EXPORT2
 ubrk_setText(UBreakIterator* bi,
             const UChar*    text,
             int32_t         textLength,
@ -318,7 +330,7 @@ ubrk_setText(UBreakIterator* bi,
 * \Ref{ubrk_first}, or \Ref{ubrk_last}.
 * @stable
 */
-U_CAPI int32_t U_EXPORT2 
+U_CAPI int32_t U_EXPORT2
 ubrk_current(const UBreakIterator *bi);

 /**
@ -330,7 +342,7 @@ ubrk_current(const UBreakIterator *bi);
 * @see ubrk_previous
 * @stable
 */
-U_CAPI int32_t U_EXPORT2 
+U_CAPI int32_t U_EXPORT2
 ubrk_next(UBreakIterator *bi);

 /**
@ -342,7 +354,7 @@ ubrk_next(UBreakIterator *bi);
 * @see ubrk_next
 * @stable
 */
-U_CAPI int32_t U_EXPORT2 
+U_CAPI int32_t U_EXPORT2
 ubrk_previous(UBreakIterator *bi);

 /**
@ -353,7 +365,7 @@ ubrk_previous(UBreakIterator *bi);
 * @see ubrk_last
 * @stable
 */
-U_CAPI int32_t U_EXPORT2 
+U_CAPI int32_t U_EXPORT2
 ubrk_first(UBreakIterator *bi);

 /**
@ -366,7 +378,7 @@ ubrk_first(UBreakIterator *bi);
 * @see ubrk_first
 * @stable
 */
-U_CAPI int32_t U_EXPORT2 
+U_CAPI int32_t U_EXPORT2
 ubrk_last(UBreakIterator *bi);

 /**
@ -378,7 +390,7 @@ ubrk_last(UBreakIterator *bi);
 * @see ubrk_following
 * @stable
 */
-U_CAPI int32_t U_EXPORT2 
+U_CAPI int32_t U_EXPORT2
 ubrk_preceding(UBreakIterator *bi,
           int32_t offset);

@ -391,7 +403,7 @@ ubrk_preceding(UBreakIterator *bi,
 * @see ubrk_preceding
 * @stable
 */
-U_CAPI int32_t U_EXPORT2 
+U_CAPI int32_t U_EXPORT2
 ubrk_following(UBreakIterator *bi,
           int32_t offset);

@ -404,7 +416,7 @@ ubrk_following(UBreakIterator *bi,
 * @see ubrk_countAvailable
 * @stable
 */
-U_CAPI const char* U_EXPORT2 
+U_CAPI const char* U_EXPORT2
 ubrk_getAvailable(int32_t index);

 /**
@ -415,7 +427,7 @@ ubrk_getAvailable(int32_t index);
 * @see ubrk_getAvailable
 * @stable
 */
-U_CAPI int32_t U_EXPORT2 
+U_CAPI int32_t U_EXPORT2
 ubrk_countAvailable(void);


@ -426,8 +438,9 @@ ubrk_countAvailable(void);
 * @param bi The break iterator to use.
 * @param offset the offset to check.
 * @return True if "offset" is a boundary position.
+* @stable
 */
-U_CAPI  UBool U_EXPORT2 
+U_CAPI  UBool U_EXPORT2
 ubrk_isBoundary(UBreakIterator *bi, int32_t offset);

 /**
@ -437,6 +450,7 @@ ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
 * status, a default value of 0 is returned.
 * <p>
 * For word break iterators, the possible values are defined in enum UWordBreak.
+ * @draft ICU 2.2
 */
 U_CAPI  int32_t U_EXPORT2
 ubrk_getRuleStatus(UBreakIterator *bi);
--- a/icu4c/source/test/intltest/rbbiapts.cpp
+++ b/icu4c/source/test/intltest/rbbiapts.cpp
@ -654,12 +654,12 @@ void RBBIAPITest::TestWordStatus() {
     int32_t tag_lo[]  = {UBRK_WORD_NONE,     UBRK_WORD_LETTER, UBRK_WORD_NONE,    UBRK_WORD_LETTER,
                          UBRK_WORD_NONE,     UBRK_WORD_NUMBER, UBRK_WORD_NONE,
                          UBRK_WORD_IDEO,     UBRK_WORD_IDEO,   UBRK_WORD_NONE,
-                          UBRK_WORD_HIRAKATA, UBRK_WORD_NONE,   UBRK_WORD_HIRAKATA};
+                          UBRK_WORD_KANA,     UBRK_WORD_NONE,   UBRK_WORD_KANA};

-     int32_t tag_hi[]  = {UBRK_WORD_NONE_LIMIT,     UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT,    UBRK_WORD_LETTER_LIMIT,
-                          UBRK_WORD_NONE_LIMIT,     UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT,
-                          UBRK_WORD_IDEO_LIMIT,     UBRK_WORD_IDEO_LIMIT,   UBRK_WORD_NONE_LIMIT,
-                          UBRK_WORD_HIRAKATA_LIMIT, UBRK_WORD_NONE_LIMIT,   UBRK_WORD_HIRAKATA_LIMIT};
+     int32_t tag_hi[]  = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,
+                          UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT,
+                          UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT,   UBRK_WORD_NONE_LIMIT,
+                          UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT,   UBRK_WORD_KANA_LIMIT};

     UErrorCode status=U_ZERO_ERROR;