ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
2003-11-05 23:50:39 +00:00 · 2003-11-05 23:50:39 +00:00 · 469c2d5b76
commit 469c2d5b76
parent e7251a2b04
10 changed files with 904 additions and 309 deletions
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@ -405,45 +405,51 @@ int32_t RuleBasedBreakIterator::previous(void) {
        return BreakIterator::DONE;
    }

-    // set things up.  handlePrevious() will back us up to some valid
-    // break position before the current position (we back our internal
-    // iterator up one step to prevent handlePrevious() from returning
-    // the current position), but not necessarily the last one before
-    // where we started
    int32_t start = current();
-    fText->previous32();
-    int32_t lastResult    = handlePrevious();
-    int32_t result        = lastResult;
-    int32_t lastTag       = 0;
-    UBool   breakTagValid = FALSE;
-
-    // iterate forward from the known break position until we pass our
-    // starting point.  The last break position before the starting
-    // point is our return value
    for (;;) {
-        result         = handleNext();
-        if (result == BreakIterator::DONE || result >= start) {
-            break;
+        // set things up. handlePrevious() will back us up to a safe position 
+        // before the current position to at most 2 breaks beyond. the 
+        // backwards rules may occasionally move the position to less than a 
+        // break beyond
+        
+        int32_t safe = handlePrevious();
+        return safe;
+        /*** int32_t result = handleNext();
+        // moving forward to a boundary.
+        if (result < start) {
+            fLastBreakTag      = 0;   // for use by getRuleStatus()
+            fLastBreakTagValid = TRUE;  // handlenext called
+            /// return lastResult;
+            return result;
        }
-        lastResult     = result;
-        lastTag        = fLastBreakTag;
-        breakTagValid  = TRUE;
+        else {
+            fText->setIndex(safe);
+            if (safe == fText->startIndex()) {
+                // if we are at the start of the text and result == start
+                // this means that we are already at the previous break
+                fLastBreakTag      = 0;   // for use by getRuleStatus()
+                fLastBreakTagValid = FALSE;
+                return safe;
+            }
+        }
+        ***/
+        /// lastResult     = result;
+        /// lastTag        = fLastBreakTag;
+        /// breakTagValid  = TRUE;
+        
+        // fLastBreakTag wants to have the value for section of text preceding
+        // the result position that we are to return (in lastResult.)  If
+        // the backwards rules overshot and the above loop had to do two or more
+        // handleNext()s to move up to the desired return position, we will have a valid
+        // tag value. But, if handlePrevious() took us to exactly the correct result positon,
+        // we wont have a tag value for that position, which is only set by handleNext().
+
+
+        /// fText->setIndex(lastResult);
+        /// fLastBreakTag      = lastTag;       // for use by getRuleStatus()
+        /// fLastBreakTagValid = breakTagValid;
+        /// return lastResult;
    }
-
-    // fLastBreakTag wants to have the value for section of text preceding
-    // the result position that we are to return (in lastResult.)  If
-    // the backwards rules overshot and the above loop had to do two or more
-    //  handleNext()s to move up to the desired return position, we will have a valid
-    //  tag value.  But, if handlePrevious() took us to exactly the correct result positon,
-    //  we wont have a tag value for that position, which is only set by handleNext().
-
-
-    // set the current iteration position to be the last break position
-    // before where we started, and then return that value
-    fText->setIndex(lastResult);
-    fLastBreakTag      = lastTag;       // for use by getRuleStatus()
-    fLastBreakTagValid = breakTagValid;
-    return lastResult;
 }


@ -476,9 +482,11 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
    // otherwise, set our internal iteration position (temporarily)
    // to the position passed in.  If this is the _beginning_ position,
    // then we can just use next() to get our return value
-    fText->setIndex(offset);
-    if (offset == fText->startIndex())
-        return handleNext();
+    /// todo synwee 
+    /// fText->setIndex(offset);
+    fText->setIndex(fText->startIndex());
+    /// if (offset == fText->startIndex())
+    ///    return handleNext();

    // otherwise, we have to sync up first.  Use handlePrevious() to back
    // us up to a known break position before the specified position (if
@ -488,7 +496,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
    // from here until we've passed the starting position.  The position
    // we stop on will be the first break position after the specified one.

-    int32_t result = previous();
+    int32_t result = fText->startIndex();/// previous();
    while (result != BreakIterator::DONE && result <= offset) {
        result = next();
    }
@ -517,8 +525,17 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
    // if we start by updating the current iteration position to the
    // position specified by the caller, we can just use previous()
    // to carry out this operation
-    fText->setIndex(offset);
-    return previous();
+    /// todo synwee
+    /// fText->setIndex(offset);
+
+    /// return previous();
+    int32_t result = fText->endIndex();
+    fText->setIndex(result);
+    while (result != BreakIterator::DONE && result >= offset) {
+        result = next();
+    }
+
+    return result;
 }

 /**
@ -679,6 +696,35 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
            goto continueOn;
        }

+        if (row->fAccepting != 0 && row->fLookAhead != 0) {
+            // Lookahead match is completed.  Set the result accordingly, but only
+            //   if no other rule has matched further in the mean time.
+            ///
+            if (lookaheadResult >= result) {
+                // U_ASSERT(row->fAccepting == lookaheadStatus);   // TODO:  handle this case
+                //    of overlapping lookahead matches.
+                result          = lookaheadResult;
+                fLastBreakTag   = lookaheadTag;
+                lookaheadStatus = 0;
+                /// i think we have to back up to read the lookahead character again
+                fText->setIndex(lookaheadResult);
+                /// TODO: this is a simple hack since reverse rules only have simple
+                /// lookahead rules that we can definitely break out from.
+                /// we need to make the lookahead rules not chain eventually.
+                return result;
+            }
+            int32_t  r = fText->getIndex();
+            if (r > result) {
+                ///
+                result = r;
+                lookaheadResult = r;
+                lookaheadStatus = row->fLookAhead;
+                lookaheadTag   = row->fTag;
+            }
+
+            goto continueOn;
+        }
+
        if (row->fAccepting == -1) {
            // Match found, common case, no lookahead involved.
            //    (It's possible that some lookahead rule matched here also,
@ -695,24 +741,9 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
            // TODO:  handle case where there's a pending match from a different rule -
            //        where lookaheadStatus != 0  && lookaheadStatus != row->fLookAhead.
            int32_t  r = fText->getIndex();
-            if (r > result) {
-                lookaheadResult = r;
-                lookaheadStatus = row->fLookAhead;
-                lookaheadTag   = row->fTag;
-            }
-            goto continueOn;
-        }
-
-        if (row->fAccepting != 0 && row->fLookAhead != 0) {
-            // Lookahead match is completed.  Set the result accordingly, but only
-            //   if no other rule has matched further in the mean time.
-            if (lookaheadResult > result) {
-                U_ASSERT(row->fAccepting == lookaheadStatus);   // TODO:  handle this case
-                //    of overlapping lookahead matches.
-                result          = lookaheadResult;
-                fLastBreakTag   = lookaheadTag;
-                lookaheadStatus = 0;
-            }
+            lookaheadResult = r;
+            lookaheadStatus = row->fLookAhead;
+            lookaheadTag   = row->fTag;
            goto continueOn;
        }

@ -722,7 +753,7 @@ continueOn:
            // We have advanced through the string until it is certain that no
            //   longer match is possible, no matter what characters follow.
            break;
-        }
+        } 
    }

    // The state machine is done.  Check whether it found a match...
@ -749,7 +780,9 @@ continueOn:
 //  handlePrevious()
 //
 //      This method backs the iterator back up to a "safe position" in the text.
-//      This is a position that we know, without any context, must be a break position.
+//      This is a position that we know, without any context, may be any position
+//      not more than 2 breaks away. Occasionally, the position may be less than
+//      one break away.
 //      The various calling methods then iterate forward from this safe position to
 //      the appropriate position to return.
 //
@ -760,18 +793,24 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
    if (fText == NULL || fData == NULL) {
        return 0;
    }
+    // break tag is no longer valid after icu switched to exact backwards
+    // positioning.
+    fLastBreakTagValid = FALSE;
    if (fData->fReverseTable == NULL) {
        return fText->setToStart();
    }

-    int32_t            state           = START_STATE;
+    int32_t            state              = START_STATE;
    int32_t            category;
-    int32_t            lastCategory    = 0;
-    int32_t            result          = fText->getIndex();
-    int32_t            lookaheadStatus = 0;
-    int32_t            lookaheadResult = 0;
-    int32_t            lookaheadTag    = 0;
-    UChar32            c               = fText->current32();
+    int32_t            lastCategory       = 0;
+    UBool              hasPassedStartText = !fText->hasPrevious(); 
+    UChar32            c                  = fText->previous32();
+    // previous character
+    int32_t            result             = fText->getIndex(); 
+    int32_t            lookaheadStatus = 0;//[]   = {0, 0, 0, 0, 0};
+    int32_t            lookaheadResult = 0;//[]   = {0, 0, 0, 0, 0};
+    int32_t            lookaheadTag = 0;//[]      = {0, 0, 0, 0, 0};
+    int32_t            lookaheadCount = 0;
    RBBIStateTableRow *row;

    row = (RBBIStateTableRow *)
@ -788,7 +827,9 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {

    // loop until we reach the beginning of the text or transition to state 0
    for (;;) {
-        if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
+        // if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
+        if (hasPassedStartText) { 
+            // if we have already considered the start of the text
            break;
        }

@ -825,9 +866,39 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
            goto continueOn;
        }

+        if (row->fAccepting != 0 && row->fLookAhead != 0) {
+            // Lookahead match is completed.  Set the result accordingly, but only
+            //   if no other rule has matched further in the mean time.
+            if (row->fAccepting == lookaheadStatus) { ///lookaheadResult > 0 && lookaheadResult <= result) {
+                /// what on earth is this? 
+                /// U_ASSERT(row->fAccepting == lookaheadStatus);   // TODO:  handle this case
+                //    of overlapping lookahead matches.
+                result          = lookaheadResult;
+                fLastBreakTag   = lookaheadTag;
+                lookaheadStatus = 0;
+                /// i think we have to back up to read the lookahead character again
+                fText->setIndex(lookaheadResult);
+                /// TODO: this is a simple hack since reverse rules only have simple
+                /// lookahead rules that we can definitely break out from.
+                /// we need to make the lookahead rules not chain eventually.
+                return result;
+            }
+
+            int32_t  r = fText->getIndex();
+            if (r < result) {
+                result = r;
+                lookaheadResult = r;
+                lookaheadStatus = row->fLookAhead;
+                lookaheadTag   = row->fTag;
+            }
+            goto continueOn;
+        }
+
        if (row->fAccepting == -1) {
            // Match found, common case, no lookahead involved.
            result = fText->getIndex();
+            /// added
+            fLastBreakTag   = row->fTag;   // Remember the break status (tag) value.
            lookaheadStatus = 0;     // clear out any pending look-ahead matches.
            goto continueOn;
        }
@ -837,43 +908,32 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
            //                         has unconditionally matched to this point.
            // TODO:  handle case where there's a pending match from a different rule
            //        where lookaheadStatus != 0  && lookaheadStatus != row->fLookAhead.
+            // 
            int32_t  r = fText->getIndex();
-            if (r > result) {
-                lookaheadResult = r;
-                lookaheadStatus = row->fLookAhead;
-                lookaheadTag    = row->fTag;
-            }
-            goto continueOn;
-        }
-
-        if (row->fAccepting != 0 && row->fLookAhead != 0) {
-            // Lookahead match is completed.  Set the result accordingly, but only
-            //   if no other rule has matched further in the mean time.
-            if (lookaheadResult > result) {
-                U_ASSERT(row->fAccepting == lookaheadStatus);   // TODO:  handle this case
-                //    of overlapping lookahead matches.
-                result          = lookaheadResult;
-                fLastBreakTag   = lookaheadTag;
-                lookaheadStatus = 0;
-            }
+            lookaheadResult = r;
+            lookaheadStatus = row->fLookAhead;
+            lookaheadTag    = row->fTag; 
            goto continueOn;
        }

 continueOn:
-        if (state == STOP_STATE) {
+        if (state == STOP_STATE) { /// && lookaheadStatus == 0) {
            break;
        }

        // then advance one character backwards
+        hasPassedStartText = !fText->hasPrevious(); 
        c = fText->previous32();
    }

    // Note:  the result postion isn't what is returned to the user by previous(),
    //        but where the implementation of previous() turns around and
    //        starts iterating forward again.
-    if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
-        result = fText->startIndex();
-    }
+    // if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
+    /// if (hasPassedStartText) && row->fLookAhead != 0) {
+        /// return fText->setToStart();
+        /// return result;
+    /// }
    fText->setIndex(result);

    return result;
--- a/icu4c/source/data/brkitr/char.txt
+++ b/icu4c/source/data/brkitr/char.txt
@ -36,10 +36,11 @@ $HangulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+;
 #
 $CR $LF;
 ([^$Control] | $HangulSyllable) $Extend*;
-.;


 #
 #  Reverse Rule, back up to the beginning of some preceding grapheme cluster.
 #
-! ($Extend | $V | $T )*   ($LF $CR | ($LV | $LVT)?$L* | .);
+$BackHangulSyllable = $L+ | ($T* ($V+$LV? | $LV | $LVT) $L*) | $T+;
+$BackOneCluster = ($LF $CR) | ($Extend* ([^$Control] | $BackHangulSyllable));
+! $BackOneCluster; 
--- a/icu4c/source/data/brkitr/line.txt
+++ b/icu4c/source/data/brkitr/line.txt
@ -12,7 +12,7 @@
 #  Character Classes defined by TR 14.
 #

-!!chain ; 
+!!chain; 
 !!LBCMNoChain;

 $AI = [:LineBreak =  Ambiguous:];
@ -136,17 +136,17 @@ $LB5NonBreaks $CM* [$SP $ZW];
 #                                   $SP $CM needs to behave like $ID.
 #                                   X   $CM needs to behave like X, where X is not $SP.
 #                                   $CM not covered by the above needs to behave like $AL
-[$LB5NonBreaks] $CM+;    #  Stick together any combining sequences that don't match other rules.
+$LB5NonBreaks $CM+;    #  Stick together any combining sequences that don't match other rules.

 # LB 8     
-[$LB5NonBreaks] $CM* $CL;
-[$LB5NonBreaks] $CM* $EX;
-[$LB5NonBreaks] $CM* $IS;
-[$LB5NonBreaks] $CM* $SY;
+$LB5NonBreaks $CM* $CL;
+$LB5NonBreaks $CM* $EX;
+$LB5NonBreaks $CM* $IS;
+$LB5NonBreaks $CM* $SY;

 # LB 9
 $OPcm $SP* .?;         
-$OPcm $SP* [$LB5NonBreaks] $CM*;
+$OPcm $SP* $LB5NonBreaks $CM*;

 # LB 10
 $QUcm $SP* $OPcm;
@ -159,24 +159,24 @@ $CLcm $SP* $NScm;

 # LB 11b
 $LB5NonBreaks $CM* $GLcm .?;
-$LB5NonBreaks $CM* $GLcm [$LB5NonBreaks] $CM*;
+$LB5NonBreaks $CM* $GLcm $LB5NonBreaks $CM*;
 $GLcm $LB3NonBreaks?;
-$GLcm [$LB5NonBreaks] $CM*;
+$GLcm $LB5NonBreaks $CM*;

 # LB 12
-$LB12NonBreaks = [[$LB5NonBreaks] - [$SP]];
+$LB12NonBreaks = [$LB5NonBreaks - $SP];

 # LB 14
 $LB12NonBreaks $CM* $QUcm+ .?;
-$LB12NonBreaks $CM* $QUcm+ [$LB5NonBreaks] $CM*;
+$LB12NonBreaks $CM* $QUcm+ $LB5NonBreaks $CM*;
 $SP $CM+            $QUcm+ .?;                      # LB7a  SP CM+ behaves as ID
-$SP $CM+            $QUcm+ [$LB5NonBreaks] $CM*;
+$SP $CM+            $QUcm+ $LB5NonBreaks $CM*;

 $QUcm $LB3NonBreaks?;
-$QUcm [$LB5NonBreaks] $CM*;    # Don't let a combining mark go onto $CR, $BK, etc.
+$QUcm $LB5NonBreaks $CM*;    # Don't let a combining mark go onto $CR, $BK, etc.
 
 # LB 14a
-$LB14NonBreaks = [[$LB12NonBreaks] - [$CB]];
+$LB14NonBreaks = [$LB12NonBreaks - $CB];
 $LB14CanBreakAfter = $LB14NonBreaks $CM* | $SP $CM+;


@ -216,6 +216,112 @@ $CM* $ALcm+;    # The $CM* is from rule 7C, and unattached CM is treated as AL
 #            Note that the initial .. is to back over both halves of a CR/LF sequence
 #            at the current position.
 #
+
 !!reverse;
-!. . [^$LF $CR $NL $BK]* [$BK $CR $LF $NL];
-#!.*;
+# !. . [^$LF $CR $NL $BK]* [$BK $CR $LF $NL];
+
+! $CM+ $ALPlus;
+! $CM+ $BA;
+! $CM+ $BB;
+! $CM+ $B2;
+! $CM+ $CL;
+! $CM+ $EX;
+! $CM+ $GL;
+! $CM+ $HY;
+! $CM+ $ID;
+! $CM+ $IN;
+! $CM+ $IS;
+! $CM+ $NS;
+! $CM+ $NU;
+! $CM+ $OP;
+! $CM+ $PO;
+! $CM+ $PR;
+! $CM+ $QU;
+! $CM+ $SP;
+! $CM+ $SY;
+
+# LB 3
+
+! ($BK | $CR | $LF | $NL) $LB3NonBreaks?;   
+! ($BK | $CR | $LF | $NL) $CM* $LB5NonBreaks;
+! $LF $CR;
+
+# LB 4         x SP
+#              x ZW
+! [$SP $ZW] $LB3NonBreaks;
+! [$SP $ZW] $CM* $LB5NonBreaks;
+
+# LB 5         Break after zero width space
+
+# LB 7     Combining marks.  TODO:  get it right!
+#                                   $SP $CM needs to behave like $ID.
+#                                   X   $CM needs to behave like X, where X is not $SP.
+#                                   $CM not covered by the above needs to behave like $AL
+! $CM+ $LB5NonBreaks;    #  Stick together any combining sequences that don't match other rules.
+
+# LB 8     
+! $CL $CM* $LB5NonBreaks;
+! $EX $CM* $LB5NonBreaks;
+! $IS $CM* $LB5NonBreaks;
+! $SY $CM* $LB5NonBreaks;
+
+# LB 9
+! .? $SP* $CM* $OP;         
+! $CM* $LB5NonBreaks $SP* $CM* $OP;
+
+# LB 10
+! $CM* $OP $SP* $CM* $QU;
+
+# LB 11
+! $CM* $NS $SP* $CM* $CL;
+
+# LB 11a
+! ($CM* $B2)+;
+
+# LB 11b
+! .? $CM* $GL $CM* $LB5NonBreaks;
+! $CM* $LB5NonBreaks $CM* $GL $CM* $LB5NonBreaks;
+! $LB3NonBreaks? $CM* $GL;
+! $CM* $LB5NonBreaks $CM* $GL;
+
+# LB 12
+
+# LB 14
+! .? ($CM* $QU)+ $CM* $LB12NonBreaks;
+! $CM* $LB5NonBreaks ($CM* $QU)+ $CM* $LB12NonBreaks;
+! .? ($CM* $QU)+ $CM+ $SP; # LB7a  SP CM+ behaves as ID
+! $CM* $LB5NonBreaks ($CM* $QU)+ $CM+ $SP;
+
+! $LB3NonBreaks? $CM* $QU;
+! $CM* $LB5NonBreaks $CM* $QU;    # Don't let a combining mark go onto $CR, $BK, etc.
+ 
+# LB 14a
+$BackLB14CanBreakAfter = ($CM* [$LB14NonBreaks - $CM]) | ($CM+ $SP);
+
+# LB 15
+! ($CM* $BA | $CM* $HY | $CM* $NS) $BackLB14CanBreakAfter;   
+! ($CM* $BA | $CM* $HY | $CM* $NS) $CM+ / [$BK $CR $LF $NL $ZW];   
+! [^$CB] $CM* $BB;
+! $CM* [^$CB $CR $LF $BK $NL $ZW] $CM* $BB;  
+
+# LB 16
+! $CM* $IN $CM* $ALPlus;
+! $CM* $IN $CM+ / [$BK $CR $LF $NL $ZW];     #  by rule 7c, any otherwise unattached CM behaves as AL
+! $CM* $IN $CM* $ID;
+! $CM* $IN $CM+ $SP; # by rule 7a, $SP $CM behaves like ID
+! $CM* $IN $CM* $IN;
+! $CM* $IN $CM* $NU;
+
+# $LB 17
+! $CM* $PO ($CM* $ID | $CM+ $SP);
+! $CM* $NU ($CM* $ALPlus)+; # includes $LB19
+! ($CM* $NU)+;
+! ($CM* $NU)+ $CM+ / [$BK $CR $LF $NL $ZW];        # Rule 7c
+! ($CM* $ALPlus)+ $CM* $NU;
+
+# LB 18
+! ($CM* $PO)? ($CM* $CL)? ($CM* $NU | $CM* $IS)* $CM* $NU ($CM* $OP | $CM* $HY)? ($CM* $PR)?;
+
+# LB 19
+! ($CM* $ALPlus)+;
+! ($CM* $ALPlus)+ $CM+ / [$BK $CR $LF $NL $ZW];    # The $CM* is from rule 7C, and unattached CM is treated as AL
--- a/icu4c/source/data/brkitr/sent.txt
+++ b/icu4c/source/data/brkitr/sent.txt
@ -9,6 +9,7 @@
 #      These rules are based on TR 29 version 4.0.0
 #
    
+!!chain;

 #
 # Character categories as defined in TR 29
@ -30,59 +31,85 @@ $Term  = [\u0021 \u003F \u0589 \u061F \u06D4 \u0700 \u0701 \u0702 \u0964 \u1362
 $Close   = [[:Open_Punctuation:] [:Close_Punctuation:] [:Linebreak = Quotation:] -
           [[:name = HEBREW PUNCTUATION GERESH:] $ATerm $Term]];
           
-           
+$Extend  = [[:Grapheme_Extend = TRUE:]]; 

-# Define extended forms of the character classes,
-#   incorporate grapheme cluster + format chars.
+$ATermEx = $ATerm $Extend*;
+$NumericEx = $Numeric $Extend*;
+$UpperEx = $Upper $Extend*;
+$CloseEx = $Close $Extend*;
+$SpEx = $Sp $Extend*;
+$LowerEx = $Lower $Extend*;
+$TermEx = $Term $Extend*;

-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
-$ATermEx    = $ATerm   $Extend* $Format*;
-$NumericEx  = $Numeric $Extend* $Format*;
-$UpperEx    = $Upper   $Extend* $Format*;
-$TermEx     = $Term    $Extend* $Format*;
+# rule 6

-#
-#  $SepSeq keeps together CRLF as a separator.  (CRLF is a grapheme cluster)
-#
-$SepSeq  = $Sep | \u000d\u000a;
+$ATermEx $Format* $NumericEx;         

-# $InteriorChars are those that never trigger a following break.
-$InteriorChars = [^$Term $ATerm $Sep];   #Note:  includes Extend and Format chars
+# rule 7
+
+$UpperEx $ATermEx $Format* $UpperEx;
+
+# rule 8
+
+$ATermEx $Format* $CloseEx* $Format* $SpEx $Format* 
+    [^$OLetter $Upper $Lower $Sep]* $Extend* $Format* $LowerEx;
+
+# rule 9 forced to exit by / [^$Close $Sp]
+
+($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* $Sep;
+($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($CloseEx | $SpEx) / [^$Close $Sp];
+
+# rule 10 forced to exit by / [^$Sp];


-# Rule 6.  Match an ATerm (.) that does not cause a break because a number immediately follows it.
-$NumberFollows = $InteriorChars* $ATermEx $NumericEx;
+($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($SpEx $Format*)* $Sep;
+($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($SpEx $Format*)* $SpEx / [^$Sp];


-# Rule 7.  $UppersSurround   Match a no-break sentence fragment containing a . surrounded by Uppers
-$UppersSurround = $InteriorChars* $UpperEx $ATermEx $UpperEx;
+# rule 11 partly included in rule 9 and 10
+$TermEx;
+$ATermEx;

-# Rule 8   Matches a sentence fragment containing "." that should not cause a sentence break,
-#          because a lower case word follows the period.
-$LowerWordFollows  = $InteriorChars* $ATermEx $Close* $Sp* [^$OLetter $Upper $Lower $Sep]* $Lower;
+# rule 12

-# Rules 3, 9, 10, 11
-#                       Matches a simple sentence, or the trailing part of a complex sentence,
-#                       where a simple sentence contains no interior "."s.
-$TermEndSequence   = $InteriorChars* ($TermEx | $ATermEx) $Close* $Sp* $SepSeq?;
-$EndSequence       = $InteriorChars* $SepSeq?;
+([^$Term $ATerm $Sep] $Extend*)+;
+([^$Term $ATerm $Sep] $Extend* $Format*)+ ($Term | $ATerm | $Sep);

-
-
-# Put them all together.  
-($NumberFollows | $UppersSurround |  $LowerWordFollows)*  $TermEndSequence{0};   # status = UBRK_SENTENCE_TERM
-($NumberFollows | $UppersSurround |  $LowerWordFollows)*  $EndSequence{100};     # status = UBRK_SENTENCE_SEP
-
-     
 #
 #  Reverse Rules
 #
-$EndGorp                  = ($Term | $ATerm | $Sep | $Close | $Extend | $Format | $Sp);
-$RevEndSequence           = $EndGorp* $InteriorChars* $EndGorp* | $Sep [^$ATerm $Term]*;
-$ReverseLowerWordFollows  = $Lower [^$OLetter $Upper $Lower $Sep]* $ATerm $InteriorChars*;
-$ReverseUpperSurround     = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper $InteriorChars*;
-$ReverseNumberFollows     = $Numeric $Format* $Extend* $ATerm $InteriorChars*;

-! $RevEndSequence ($ReverseLowerWordFollows | $ReverseUpperSurround | $ReverseNumberFollows)* .?;
-#! .*;
- 
+$BackATermEx = $Extend* $ATerm;
+$BackNumericEx = $Extend* $Numeric;
+$BackUpperEx = $Extend* $Upper;
+$BackCloseEx = $Extend* $Close;
+$BackSpEx = $Extend* $Sp;
+$BackLowerEx = $Extend* $Lower;
+$BackTermEx = $Extend* $Term;
+
+# rule 3 
+
+! $Sep .;
+
+# rule 6
+
+! $BackNumericEx $Format* $BackATermEx;         
+
+# rule 7
+
+! $BackUpperEx $Format* $BackATermEx $BackUpperEx;
+
+# rule 8
+
+! $BackLowerEx $Format* $Extend* [^$OLetter $Upper $Lower $Sep]* $Format* 
+    $BackSpEx $Format* $BackCloseEx* $Format* $BackATermEx;
+
+# rules 9, 10, 11, 12
+
+$Any = [^$Term $ATerm $Sep];
+$Safe = [^$Term $ATerm $Sep $Sp $Close];
+$BackEnd = ($BackSpEx $Format*)* ($BackCloseEx $Format*)* ($BackTermEx | $BackATermEx);
+! $BackEnd;
+! $BackEnd? $Any* $Safe;
+! $BackEnd? $Any* $Close / ($BackSpEx $Format*)+ ($BackTermEx | $BackATermEx);
+! $BackEnd? $Any* $Sp / $Sep; 
--- a/icu4c/source/data/brkitr/title.txt
+++ b/icu4c/source/data/brkitr/title.txt
@ -11,7 +11,7 @@ $NotCased        = [^ $Cased];
 #
 #  If the iterator was not stopped on a cased character, advance it to the first cased char
 #
-($NotCased | $CaseIgnorable)*;
+$NotCased+;

 #
 #  If the iterator starts on a cased item, advance through all adjacent cased items plus
@ -22,5 +22,11 @@ $Cased ($Cased | $CaseIgnorable)* $NotCased*;
 #
 #  Reverse Rules
 #
-!$NotCased* ($Cased | $CaseIgnorable)* $NotCased?;

+! $NotCased+;
+
+#
+#  If the iterator starts on a cased item, advance through all adjacent cased items plus
+#    any non-cased stuff, to reach the start of the next word.
+#
+! $NotCased* ($Cased | $CaseIgnorable)* $Cased;
--- a/icu4c/source/data/brkitr/word.txt
+++ b/icu4c/source/data/brkitr/word.txt
@ -1,25 +1,28 @@
 #
-#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
-#       All Rights Reserved.
+# Copyright (C) 2002-2003, 
+# International Business Machines Corporation and others.
+# All Rights Reserved.
 #
-#   file:  word.txt   
+# file:  word.txt   
 #
-#   ICU Word Break Rules
+# ICU Word Break Rules
 #      See Unicode Standard Annex #29.
 #      These rules are based on Version 4.0.0, dated 2003-04-17
 #

-
-
-####################################################################################
+##############################################################################
 #
 #  Character class definitions from TR 29
 #
-####################################################################################
-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] 
-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
+##############################################################################
+
+!!chain;
+
+$Katakana  = [[:Script = KATAKANA:] 
+              [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] 
+              [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
+              [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
+              [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];


 $ALetter   = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:] 
@ -28,122 +31,127 @@ $ALetter   = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
                           - [:Script = Thai:]
                           - [:Script = Lao:]
                           - [:Script = Hiragana:]];
+
+$ABaseLetter = [$ALetter - [:Grapheme_Extend = TRUE:]];
+$ACMLetter   = [$ALetter & [:Grapheme_Extend = TRUE:]];
                           
-$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]  [:name = HEBREW PUNCTUATION GERSHAYIM:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]];  
+$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]  
+              [:name = HEBREW PUNCTUATION GERSHAYIM:]
+              [:name = RIGHT SINGLE QUOTATION MARK:] 
+			  [:name = HYPHENATION POINT:]];  
              
 $MidNumLet = [[:name = FULL STOP:] [:name = COLON:]];

 $MidNum    = [[:LineBreak = Infix_Numeric:] - $MidNumLet];
 $Numeric   = [:LineBreak = Numeric:];

-
 #
 #  Character Class Definitions.
 #    The names are those from TR29.
 #
-$CR         = \u000d;
-$LF         = \u000a;
-$Control    = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 

+$CR      = \u000d;
+$LF      = \u000a;
+$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
+$Extend  = [[:Grapheme_Extend = TRUE:]]; 
+$Format  = [[:Cf:]];  
+$Hiragana = [:Hiragana:];
+$Ideographic = [:IDEOGRAPHIC:];

-
-
-####################################################################################
-#
-#  Word Break Rules.    Definitions and Rules specific to word break begin Here. 
-#
-####################################################################################
-
-$Format    = [[:Cf:]];  
-
-# ALetter2  -  There are some characters, e.g. \u0fa9, that are both combining marks ($Extend)  
-#              and alphabetic (ALetter).  $ALetter2 is ALetter from the Unicode TR, less all such chars.
-#              We need this because of sequences of the form
-#                 <Letter>  <MidLetter> <alpha combining mark>  <Numeric>
-#              Rule 3 says treat graphme clusters as a unit, as their first character.
-#                 The <MidLetter> <alpha combining mark> thus should be treated as just <MidLetter>
-#              Rules for this are awkward, because the sequence
-#                 <Letter> <MidLetter> <Letter> <Numeric>
-#              should not break, but the sequence
-#                 <Letter> <MidLetter> <Numeric>
-#              should break after the <Letter>. 
-$ALetter2   = [$ALetter - $Extend];
-
-# Rule 3:  Treat a grapheme cluster as if it were a single character.
-#          Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
-#          because we don't need to find the boundaries between adjacent syllables -
-#          they won't be word boundaries.
-#
-
-
-#
-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the base char.
-#
-$ALetterEx    = $ALetter   $Extend*; 
-$ALetter2Ex   = $ALetter2  $Extend*;
-$NumericEx    = $Numeric   $Extend*;
-$MidNumEx     = $MidNum    $Extend*;
-$MidNumLetEx  = $MidNumLet $Extend*;
-$MidLetterEx  = $MidLetter $Extend*;
-$KatakanaEx   = $Katakana  $Extend*;
-
-
-#
-#  Numbers.  Rules 8, 11, 12 form the TR.
-#
-$NumberSequence = $NumericEx ($Format* ($MidNumEx | $MidNumLetEx)? $Format* $NumericEx)*;
-$NumberSequence {100};
-
-#
-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
-#     - must include at least one letter. 
-#     - may include both letters and numbers.
-#     - may include  MideLetter, MidNumber punctuation.
-#
-#$LetterSequence   = $ALetterEx ($Format* ($MidLetterEx | $MidNumLetEx)? $Format* $ALetterEx)*;     # rules #6, #7
-$WordGlue          = $MidLetterEx | $MidNumLetEx;
-$MidWordFragment   = ($WordGlue $ALetter2Ex | $WordGlue $Format+ $ALetterEx);
-$WordSequence      = $ALetterEx ($Format* ($ALetterEx | $MidWordFragment))*;
-$WordSequence2     = $ALetter2Ex ($Format* ($ALetterEx | $MidWordFragment))*;
-$WordTail          = ($Format* $NumberSequence $Format+ $WordSequence) |  ($Format* $NumberSequence $WordSequence2?);
-($NumberSequence $Format+)? $WordSequence  $WordTail* {200};
-($NumberSequence)?          $WordSequence2 $WordTail* {200};
-
-#
-#  Do not break between Katakana.   Rule #13.
-#
-$KatakanaEx ($Format* $KatakanaEx)* {300};
-[:Hiragana:] $Extend* {300};
-
-#
-#  Ideographic Characters.  Stand by themselves as words.
-#                           Separated from the "Everything Else" rule, below, only so that they
-#                           can be tagged with a return value.   TODO:  is this what we want?
-#
-[:IDEOGRAPHIC:] $Extend* {400};
-
-#
-#  Everything Else, with no tag.
-#                   Non-Control chars combine with $Extend (combining) chars.
-#                   Controls are do not.
-#
-[^$Control [:Ideographic:]] $Extend*;
 $CR $LF;

-#
-#  Reverse Rules.   Back up over any of the chars that can group together.
-#                   (Reverse rules do not need to be exact; they can back up  too far,
-#                   but must back up at least enough, and must stop on a boundary.)
-#
+# rule 3 and 4

-# NonStarters are the set of all characters that can appear at the 2nd - nth position of
-#    a word.   (They may also be the first.)   The reverse rule skips over these, until it
-#    reaches something that can only be the start (and probably only) char in a "word".
-#    A space or punctuation meets the test.
-#
-$NonStarters = [$Numeric $ALetter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format];
+$ALetterEx     = $ALetter     $Extend*; 
+$ABaseLetterEx = $ABaseLetter $Extend*; 
+$NumericEx     = $Numeric     $Extend*;
+$MidNumEx      = $MidNum      $Extend*;
+$MidNumLetEx   = $MidNumLet   $Extend*;
+$MidLetterEx   = $MidLetter   $Extend*;
+$KatakanaEx    = $Katakana    $Extend*;

-#!.*;
-! ($NonStarters* | \n \r) .;
+[^$Format $Hiragana $Ideographic] $Extend* [$Extend - $ALetter];
+# letters should be left alone 
+[^$Format $ALetter $Numeric $Hiragana $Ideographic] $Extend* $ACMLetter / [^$Extend];
+$NumericEx $ACMLetter / $MidLetter;
+
+# rule 5
+
+$ALetterEx ($Format* $ALetterEx)* {200};
+
+# rule 6 and 7
+
+$ALetterEx $Format* ($MidLetterEx | $MidNumLetEx) $ABaseLetterEx {200}; 
+$ALetterEx $Format* ($MidLetterEx | $MidNumLetEx) $Format+ $ALetterEx {200}; 
+
+# rule 8
+
+$NumericEx ($Format* $NumericEx)* {100};
+
+# rule 9
+
+$ALetterEx $Format* $NumericEx {200};  
+
+# rule 10
+
+$NumericEx $Format* $ALetterEx {200};
+
+# rule 11 and 12 
+
+$NumericEx $Format* ($MidNumEx | $MidNumLetEx) $Format* $NumericEx {100}; 
+
+# rule 13
+
+$KatakanaEx ($Format* $KatakanaEx)* {300};
+$Hiragana $Extend* {300} / [^$Extend];
+$Ideographic $Extend* {400} / [^$Extend];
+
+# reverse rules!!
+!!reverse;
+
+$BackALetterEx     = $Extend* $ALetter;
+$BackABaseLetterEx = $Extend* $ABaseLetter;
+$BackACMLetterEx   = $Extend* $ACMLetter;
+$BackNumericEx     = $Extend* $Numeric;
+$BackMidNumEx      = $Extend* $MidNum;
+$BackMidNumLetEx   = $Extend* $MidNumLet;
+$BackMidLetterEx   = $Extend* $MidLetter;
+$BackKatakanaEx    = $Extend* $Katakana;
+
+! $LF $CR;
+
+! $Extend+ [^$Format];
+
+# rule 5
+
+$BackEndACMLetter = $Format+ $Extend* [^$ALetter $Numeric $MidLetter $MidNumLet];
+! $BackALetterEx $Format* $BackABaseLetterEx;
+! $BackALetterEx $Format* $BackACMLetterEx / $BackEndACMLetter;
+
+# rule 6 and 7
+
+! $BackABaseLetterEx ($BackMidLetterEx | $BackMidNumLetEx) $Format* $BackABaseLetterEx; 
+! $BackABaseLetterEx ($BackMidLetterEx | $BackMidNumLetEx) $Format* $BackACMLetterEx / $BackEndACMLetter; 
+! $BackALetterEx $Format+ ($BackMidLetterEx | $BackMidNumLetEx) $Format* $BackABaseLetterEx; 
+! $BackALetterEx $Format+ ($BackMidLetterEx | $BackMidNumLetEx) $Format* $BackACMLetterEx / $BackEndACMLetter;
+
+# rule 8
+
+! $BackNumericEx $Format* $BackNumericEx; 
+
+# rule 9
+
+! $BackNumericEx $Format* (($BackNumericEx | $BackALetterEx) $Format*)* $BackABaseLetterEx;
+! $BackNumericEx $Format* $BackACMLetterEx / $BackEndACMLetter; ## problem here
+
+# rule 10
+
+! ($BackALetterEx $Format*)+ $BackNumericEx;
+
+# rule 11 and 12
+
+! $BackNumericEx $Format* ($BackMidNumEx | $BackMidNumLetEx) $Format* $BackNumericEx;
+
+# rule 13
+
+! $BackKatakanaEx $Format* $BackKatakanaEx;
--- a/icu4c/source/data/makedata.mak
+++ b/icu4c/source/data/makedata.mak
@ -283,25 +283,25 @@ $(BRK_FILES:.brk =.brk
 BRKDEPS = "$(ICUBLD)\$(ICUDT)uprops.icu" "$(ICUBLD)\$(ICUDT)unames.icu" "$(ICUBLD)\$(ICUDT)pnames.icu" "$(ICUBLD)\$(ICUDT)unorm.icu"

 $(ICUDT)char.brk : "$(ICUBRK)\char.txt" $(BRKDEPS)
-	genbrk -r "$(ICUBRK)\char.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
+	genbrk -c -r "$(ICUBRK)\char.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"

 $(ICUDT)word.brk : "$(ICUBRK)\word.txt" $(BRKDEPS)
-	genbrk -r "$(ICUBRK)\word.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
+	genbrk -c -r "$(ICUBRK)\word.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"

 $(ICUDT)line.brk : "$(ICUBRK)\line.txt" $(BRKDEPS)
-	genbrk -r "$(ICUBRK)\line.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
+	genbrk -c -r "$(ICUBRK)\line.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"

 $(ICUDT)sent.brk : "$(ICUBRK)\sent.txt" $(BRKDEPS)
-	genbrk -r "$(ICUBRK)\sent.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
+	genbrk -c -r "$(ICUBRK)\sent.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"

 $(ICUDT)title.brk : "$(ICUBRK)\title.txt" $(BRKDEPS)
-	genbrk -r "$(ICUBRK)\title.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
+	genbrk -c -r "$(ICUBRK)\title.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"

 $(ICUDT)word_th.brk : "$(ICUBRK)\word_th.txt" $(BRKDEPS)
-	genbrk -r "$(ICUBRK)\word_th.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
+	genbrk -c -r "$(ICUBRK)\word_th.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"

 $(ICUDT)line_th.brk : "$(ICUBRK)\line_th.txt" $(BRKDEPS)
-	genbrk -r "$(ICUBRK)\line_th.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"
+	genbrk -c -r "$(ICUBRK)\line_th.txt" -o $@ -d"$(ICUBLD)" -i "$(ICUBLD)\\"


 # utility target to send us to the right dir
--- a/icu4c/source/test/intltest/rbbiapts.cpp
+++ b/icu4c/source/test/intltest/rbbiapts.cpp
@ -839,6 +839,23 @@ void RBBIAPITest::TestRoundtripRules() {
 void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
 {
    if (exec) logln((UnicodeString)"TestSuite RuleBasedBreakIterator API ");
+    switch (index) {
+     //   case 0: name = "TestConstruction"; if (exec) TestConstruction(); break;
+        case  0: name = "TestCloneEquals"; if (exec) TestCloneEquals(); break;
+        case  1: name = "TestgetRules"; if (exec) TestgetRules(); break;
+        case  2: name = "TestHashCode"; if (exec) TestHashCode(); break;
+        case  3: name = "TestGetSetAdoptText"; if (exec) TestGetSetAdoptText(); break;
+        case  4: name = "extra"; break;   /* Extra */
+        case  5: name = "TestBuilder"; if (exec) TestBuilder(); break;
+        case  6: name = "TestQuoteGrouping"; if (exec) TestQuoteGrouping(); break;
+        case  7: name = "TestRuleStatus"; if (exec) TestRuleStatus(); break;
+        case  8: name = "TestBug2190"; if (exec) TestBug2190(); break;
+        case  9: name = "TestRegistration"; if (exec) TestRegistration(); break;
+        case 10: name = "TestBoilerPlate"; if (exec) TestBoilerPlate(); break;
+
+        default: name = ""; break; /*needed to end loop*/
+    }
+    /*** TODO synwee
    switch (index) {
     //   case 0: name = "TestConstruction"; if (exec) TestConstruction(); break;
        case  0: name = "TestCloneEquals"; if (exec) TestCloneEquals(); break;
@ -846,8 +863,8 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
        case  2: name = "TestHashCode"; if (exec) TestHashCode(); break;
        case  3: name = "TestGetSetAdoptText"; if (exec) TestGetSetAdoptText(); break;
        case  4: name = "TestIteration"; if (exec) TestIteration(); break;
-        case  5: name = "extra"; break;   /* Extra */
-        case  6: name = "extra"; break;   /* Extra */
+        case  5: name = "extra"; break;   // Extra
+        case  6: name = "extra"; break;   // Extra
        case  7: name = "TestBuilder"; if (exec) TestBuilder(); break;
        case  8: name = "TestQuoteGrouping"; if (exec) TestQuoteGrouping(); break;
        case  9: name = "TestRuleStatus"; if (exec) TestRuleStatus(); break;
@ -856,8 +873,9 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
        case 12: name = "TestBoilerPlate"; if (exec) TestBoilerPlate(); break;
        case 13: name = "TestRoundtripRules"; if (exec) TestRoundtripRules(); break;

-        default: name = ""; break; /*needed to end loop*/
+        default: name = ""; break; // needed to end loop
    }
+    ***/
 }

 //---------------------------------------------
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -22,6 +22,7 @@
 #include "unicode/schriter.h"
 #include "unicode/uniset.h"
 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
+#include "unicode/ustring.h"

 #include "intltest.h"
 #include "rbbitst.h"
@ -292,6 +293,41 @@ void RBBITest::TestStatusReturn() {
 }


+static void printStringBreaks(UnicodeString ustr, int expected[],
+                              int expectedcount)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    char name[100];
+    printf("code    alpha extend alphanum type line name\n");
+    for (int j = 0; j < ustr.length(); j ++) {
+        if (expectedcount > 0) {
+            for (int k = 0; k < expectedcount; k ++) {
+                if (j == expected[k]) {
+                    printf("------------------------------------------------ %d\n",
+                           j);
+                }
+            }
+        }
+        UChar32 c = ustr.char32At(j);
+        if (c > 0xffff) {
+            j ++;
+        }
+        u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
+        printf("%7x %5d %6d %8d %4s %4s %s\n", c, 
+                           u_isUAlphabetic(c), 
+                           u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
+                           u_isalnum(c), 
+                           u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, 
+                                                  u_charType(c), 
+                                                  U_SHORT_PROPERTY_NAME), 
+                           u_getPropertyValueName(UCHAR_LINE_BREAK, 
+                                                  u_getIntPropertyValue(c, 
+                                                             UCHAR_LINE_BREAK), 
+                                                  U_SHORT_PROPERTY_NAME),
+                           name);
+    }
+}
+
 void RBBITest::TestThaiLineBreak() {
    UErrorCode status = U_ZERO_ERROR;
    BITestData thaiLineSelection(status);
@ -517,36 +553,55 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
    if (exec) logln("TestSuite RuleBasedBreakIterator: ");
    switch (index) {

+        case 0: name = "TestJapaneseLineBreak";
+            if(exec) TestJapaneseLineBreak();                 break;
+        case 1: name = "TestStatusReturn";
+            if(exec) TestStatusReturn();                       break;
+
+        case 2: name = "TestLineBreakData";
+            if(exec) TestLineBreakData();                      break;
+        case 3: name = "TestEmptyString";
+            if(exec) TestEmptyString();                        break;
+
+        case 4: name = "TestGetAvailableLocales";
+            if(exec) TestGetAvailableLocales();                break;
+
+        case 5: name = "TestGetDisplayName";
+            if(exec) TestGetDisplayName();                     break;
+
+        case 6: name = "TestEndBehaviour";
+            if(exec) TestEndBehaviour();                       break;
+        case 7: name = "TestBug4153072";
+            if(exec) TestBug4153072();                         break;
+        case 8: name = "TestWordBoundary";
+             if(exec) TestWordBoundary();                 break;
+        default: name = ""; break; //needed to end loop
+    }
+    /*** TODO synwee
+    switch (index) {
        case 0: name = "TestExtended";
             if(exec) TestExtended();                          break;
-        case 1: name = "TestJapaneseLineBrea";
+        case 1: name = "TestJapaneseLineBreak";
            if(exec) TestJapaneseLineBreak();                 break;
        case 2: name = "TestStatusReturn";
            if(exec) TestStatusReturn();                       break;

        case 3: name = "TestLineBreakData";
            if(exec) TestLineBreakData();                      break;
-        case 4: name = "TestSentenceInvariants";
-            if(exec) TestSentenceInvariants();                 break;
-        case 5: name = "TestCharacterInvariants";
-            if(exec) TestCharacterInvariants();                break;
-        case 6: name = "TestWordInvariants";
-            if(exec) TestWordInvariants();                     break;
-
-        case 7: name = "TestEmptyString";
+        case 4: name = "TestEmptyString";
            if(exec) TestEmptyString();                        break;

-        case 8: name = "TestGetAvailableLocales";
+        case 5: name = "TestGetAvailableLocales";
            if(exec) TestGetAvailableLocales();                break;

-        case 9: name = "TestGetDisplayName";
+        case 6: name = "TestGetDisplayName";
            if(exec) TestGetDisplayName();                     break;

-        case 10: name = "TestEndBehaviour";
+        case 7: name = "TestEndBehaviour";
            if(exec) TestEndBehaviour();                       break;
-        case 11: name = "TestBug4153072";
+        case 8: name = "TestBug4153072";
            if(exec) TestBug4153072();                         break;
-        case 12: name = "TestMonkey";
+        case 9: name = "TestMonkey";
             if(exec) {
 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
               TestMonkey(params);
@ -556,18 +611,25 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
             }
             break;

-        case 13: name = "TestThaiLineBreak";
+        case 10: name = "TestThaiLineBreak";
             if(exec) TestThaiLineBreak();                     break;
-        case 14: name = "TestMixedThaiLineBreak";
+        case 11: name = "TestMixedThaiLineBreak";
             if(exec) TestMixedThaiLineBreak();                break;
-        case 15: name = "TestMaiyamok";
+        case 12: name = "TestMaiyamok";
             if(exec) TestMaiyamok();                          break;
-        case 16: name = "TestThaiWordBreak";
+        case 13: name = "TestThaiWordBreak";
             if(exec) TestThaiWordBreak();                     break;
-
-
+        case 14: name = "TestWordBreaks";
+             if(exec) TestWordBreaks();                   break;
+        case 15: name = "TestLineBreaks";
+             if(exec) TestLineBreaks();                   break;
+        case 16: name = "TestWordBoundary";
+             if(exec) TestWordBoundary();                 break;
+        case 17: name = "TestSentBreaks";
+             if(exec) TestSentBreaks();                 break;
        default: name = ""; break; //needed to end loop
    }
+    ***/
 }


@ -918,6 +980,7 @@ void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)
                    }
                }
                if (!seen2) {
+                    printStringBreaks(work, NULL, 0); 
                    errln("No Break between \\U%04x and \\U%04x", c1, c2);
                    errCount++;
                    if (errCount >= 75)
@ -1201,14 +1264,18 @@ void RBBITest::executeTest(TestParams *t) {
        //  and this one.
        for (i=prevBP+1; i<bp; i++) {
            if (t->expectedBreaks->elementAti(i) != 0) {
-                errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
+                int expected[] = {0, i};
+                printStringBreaks(t->dataToBreak, expected, 2);
+                errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
            }
        }

        // Check that the break we did find was expected
        if (t->expectedBreaks->elementAti(bp) == 0) {
-            errln("Forward Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
+            int expected[] = {0, bp};
+            printStringBreaks(t->dataToBreak, expected, 2);
+            errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
        } else {
            // The break was expected.
@ -1219,7 +1286,7 @@ void RBBITest::executeTest(TestParams *t) {
            }
            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
            if (rs != expectedTagVal) {
-                errln("Incorrect status for break.  Pos=%4d  File line,col= %4d,%4d.\n"
+                errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
                      "          Actual, Expected status = %4d, %4d",
                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp), rs, expectedTagVal);
            }
@ -1232,7 +1299,7 @@ void RBBITest::executeTest(TestParams *t) {
    // Verify that there were no missed expected breaks after the last one found
    for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
        if (t->expectedBreaks->elementAti(i) != 0) {
-            errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
+            errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
        }
    }
@ -1271,7 +1338,7 @@ void RBBITest::executeTest(TestParams *t) {
            }
            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
            if (rs != expectedTagVal) {
-                errln("Incorrect status for break.  Pos=%4d  File line,col= %4d,%4d.\n"
+                errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
                      "          Actual, Expected status = %4d, %4d",
                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp), rs, expectedTagVal);
            }
@ -2601,6 +2668,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
    int32_t    nextCPPos; //  Index of the code point following "pos."
                          //     May point to a combining mark.
    int32_t    tPos;      //  temp value.
+    UChar32    c;

    if (startPos >= fText->length()) {
        return -1;
@ -2699,7 +2767,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {

        nextCPPos = fText->moveIndex32(pos, 1);
        nextPos   = nextCPPos;
-        UChar32 c = fText->char32At(nextPos);
+        c = fText->char32At(nextPos);
        rule67Adjust(pos,     &thisChar, &nextPos, &c);

        // If the loop is still warming up - if we haven't shifted the initial
@ -2742,8 +2810,20 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
            if (fSP->contains(fText->char32At(tPos)) == FALSE || tPos == 0) {
                goto fall_through_9;
            }
+        }
+        /***
+        for (tPos=prevPos; ; tPos = fText->moveIndex32(tPos, -1)) {
+            if (fOP->contains(fText->char32At(tPos))) {
+                break;
+            }
+            if ((fSP->contains(fText->char32At(tPos)) || 
+                fCM->contains(fText->char32At(tPos))) == FALSE 
+                || tPos == 0) {
+                goto fall_through_9;
+            }
            
        }
+        ***/
        // We match OP SP* x
        //   No break at this postion.
        //   Continue the outer loop.
@ -2932,6 +3012,277 @@ static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t d
 }
 #endif

+void RBBITest::TestWordBreaks(void)
+{
+    // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
+    Locale        locale("en");
+    UErrorCode    status = U_ZERO_ERROR;
+    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
+    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
+    UChar         str[25]; 
+    char          *strlist[] = 
+    {"\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
+    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
+    "\\u2027\\U000e0067\\u0a47\\u00b7",
+    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
+    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
+    "\\u0589\\U000e006e\\u0a42\\U000104a5",
+    "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
+    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
+    "\\u0027\\u11af\\U000e0057\\u0602",
+    "\\U0001d7f2\\U000e007\\u0004\\u0589",
+    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
+    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
+    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
+    "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
+    "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
+    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
+    "\\u0233\\U000e0020\\u0a69\\u0d6a",
+    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
+    "\\u58f4\\U000e0049\\u20e7\\u2027",
+    "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
+    "\\ua183\\u102d\\u0bec\\u003a",
+    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
+    "\\u003a\\u0e57\\u0fad\\u002e",
+    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
+    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
+    "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
+    "\\u003a\\u0664\\u00b7\\u1fba",
+    "\\u003b\\u0027\\u00b7\\u47a3",
+    "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
+    };
+    for (int loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
+        printf("looping %d\n", loop);
+        u_unescape(strlist[loop], str, 25);
+        UnicodeString ustr(str);
+        // RBBICharMonkey monkey;
+        RBBIWordMonkey monkey;
+
+        int expected[20];
+        int forward[20];
+        int expectedcount = 0;
+
+        monkey.setText(ustr);
+        for (int i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
+            expected[expectedcount ++] = i;
+        }
+
+        int count = 0;
+        bi->setText(ustr);
+        for (int i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
+            forward[count] = i;
+            if (count > 20 || expected[count] != i) {
+                 errln("happy break forward test failed: expected %d but got %d", 
+                       expected[count], i);
+            }
+            count ++;
+        }
+        if (count != expectedcount) {
+            printStringBreaks(ustr, expected, expectedcount);
+            errln("happy break test failed: missed a match");
+            break;
+        }
+        for (int i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
+            count --;
+            if (forward[count] != i) {
+                printStringBreaks(ustr, expected, expectedcount);
+                errln("happy break test reverse failed: expected %d but got %d", 
+                      forward[count], i);
+                break;
+            }
+        }
+        if (count != 0) {
+            errln("happy break test failed: missed a match");
+        }
+    }
+}
+
+void RBBITest::TestWordBoundary(void)
+{
+    // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
+    Locale        locale("en");
+    UErrorCode    status = U_ZERO_ERROR;
+    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
+    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
+    UChar         str[20]; 
+    char          *strlist[] = 
+    {"\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
+    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
+    "\\u2027\\U000e0067\\u0a47\\u00b7",
+    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
+    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
+    "\\u0589\\U000e006e\\u0a42\\U000104a5",
+    "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
+    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
+    "\\u0027\\u11af\\U000e0057\\u0602",
+    "\\U0001d7f2\\U000e007\\u0004\\u0589",
+    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
+    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
+    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
+    "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
+    "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
+    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
+    "\\u0233\\U000e0020\\u0a69\\u0d6a",
+    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
+    "\\u58f4\\U000e0049\\u20e7\\u2027",
+    "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
+    "\\ua183\\u102d\\u0bec\\u003a",
+    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
+    "\\u003a\\u0e57\\u0fad\\u002e",
+    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
+    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
+    "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
+    "\\u003a\\u0664\\u00b7\\u1fba",
+    "\\u003b\\u0027\\u00b7\\u47a3",
+    };
+    for (int loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
+        printf("looping %d\n", loop);
+        u_unescape(strlist[loop], str, 20);
+        UnicodeString ustr(str);
+        int forward[20];
+        int count = 0;
+        
+        bi->setText(ustr);
+        int prev = 0;
+        for (int i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
+            forward[count ++] = i;
+            if (i > prev) {
+                for (int j = prev + 1; j < i; j ++) {
+                    if (bi->isBoundary(j)) {
+                        printStringBreaks(ustr, forward, count);
+                        errln("happy boundary test failed: expected %d not a boundary", 
+                               j);
+                        break;
+                    }
+                }
+            }
+            if (!bi->isBoundary(i)) {
+                printStringBreaks(ustr, forward, count);
+                errln("happy boundary test failed: expected %d a boundary", 
+                       i);
+                break;
+            }
+            prev = i;
+        }
+    }
+}
+
+void RBBITest::TestLineBreaks(void)
+{
+    Locale        locale("en");
+    UErrorCode    status = U_ZERO_ERROR;
+    BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
+    UChar         str[20]; 
+    char          *strlist[] = 
+    {"\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
+     "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
+     "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
+     "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
+     "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
+     "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
+     "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
+     "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
+     "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
+     "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
+     "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
+     "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
+     "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
+     "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
+     "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
+     "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
+     "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
+    };
+    for (int loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
+        printf("looping %d\n", loop);
+        u_unescape(strlist[loop], str, 20);
+        UnicodeString ustr(str);
+        // RBBICharMonkey monkey;
+        RBBILineMonkey monkey;
+
+        int expected[20];
+        int forward[20];
+        int expectedcount = 0;
+
+        monkey.setText(ustr);
+        for (int i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
+            expected[expectedcount ++] = i;
+        }
+
+        int count = 0;
+        bi->setText(ustr);
+        for (int i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
+            forward[count] = i;
+            if (count < expectedcount && expected[count] != i) {
+                 errln("happy break forward test failed: expected %d but got %d", 
+                       expected[count], i);
+            }
+            count ++;
+        }
+        if (count != expectedcount) {
+            printStringBreaks(ustr, expected, expectedcount);
+            errln("happy break test failed: missed %d match", 
+                  expectedcount - count);
+        }
+        for (int i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
+            count --;
+            if (forward[count] != i) {
+                printStringBreaks(ustr, expected, expectedcount);
+                errln("happy break test reverse failed: expected %d but got %d", 
+                      forward[count], i);
+                break;
+            }
+        }
+        if (count != 0) {
+            errln("happy break test failed: missed a match");
+        }
+    }
+}
+
+void RBBITest::TestSentBreaks(void)
+{
+    Locale        locale("en");
+    UErrorCode    status = U_ZERO_ERROR;
+    BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
+    UChar         str[100]; 
+    char          *strlist[] = 
+    {"This\n",
+     "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
+     "\"Sentence ending with a quote.\" Bye.",
+     "  (This is it).  Testing the sentence iterator. \"This isn't it.\"", 
+     "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
+     "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
+     "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
+     "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
+     "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
+    };
+    for (int loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
+        printf("looping %d\n", loop);
+        u_unescape(strlist[loop], str, 100);
+        UnicodeString ustr(str);
+        
+        int forward[20];
+
+        int count = 0;
+        bi->setText(ustr);
+        for (int i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
+            forward[count ++] = i;
+        }
+        int tempcount = count;
+        for (int i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
+            tempcount --;
+            if (forward[tempcount] != i) {
+                printStringBreaks(ustr, forward, count);
+                errln("happy break test reverse failed: expected %d but got %d", 
+                      forward[tempcount], i);
+                break;
+            }
+        }
+        if (tempcount != 0) {
+            errln("happy break test failed: missed a match");
+        }
+    }
+}
+
 void RBBITest::TestMonkey(char *params) {
 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

@ -3119,7 +3470,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint
            const char *errorType = NULL;
            if  (forwardBreaks[i] != expectedBreaks[i]) {
                errorType = "next()";
-            } else if (reverseBreaks[i] != expectedBreaks[i]) {
+            } else if (reverseBreaks[i] != forwardBreaks[i]) {
                errorType = "previous()";
            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
                errorType = "isBoundary()";
@ -3135,23 +3486,39 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint
                int startContext = i;
                for (;;) {
                    if (startContext==0) { break; }
-                    startContext--;
+                    startContext --;
                    if (expectedBreaks[startContext] != 0) {break;}
                }

                // End of range is two expected breaks past the start position.
-                int endContext = i+1;
+                int endContext = i + 1;
                int ci;
                for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
                    for (;;) {
                        if (endContext >= testText.length()) {break;}
                        if (expectedBreaks[endContext-1] != 0) { break;}
-                        endContext++;
+                        endContext ++;
                    }
                }

                // Format looks like   "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
                UnicodeString errorText = "<data>";
+                /***
+                if (strcmp(errorType, "previous()") == 0) {
+                    startContext = 0;
+                    int j = i;
+                    while (true) {
+                        if (reverseBreaks[j ++] != 0) {
+                            printf("%d\n", j);
+                            break;
+                        }
+                        if (j % 100 == 0) {
+                            printf("continue %d\n", j);
+                        }
+                    }
+                    endContext = j - 1;
+                }
+                ***/
                for (ci=startContext; ci<endContext;) {
                    UnicodeString hexChars("0123456789abcdef");
                    UChar32  c;
@ -3181,7 +3548,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint
                errorText.append("</data>\n");

                // Output the error
-                char  charErrorTxt[100];
+                char  charErrorTxt[500];
                UErrorCode status = U_ZERO_ERROR;
                errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
                charErrorTxt[sizeof(charErrorTxt)-1] = 0;
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@ -70,8 +70,10 @@ public:
    UChar *ReadAndConvertFile(const char *fileName, int &ulen, UErrorCode &status);
    void executeTest(TestParams *);

-    
- 
+    void TestWordBreaks();
+    void TestWordBoundary();
+    void TestLineBreaks();
+    void TestSentBreaks();
    
    
 /***********************/