ICU-45 RBBI copyright notices, AIX portability, comments

X-SVN-Rev: 8963
2002-06-27 21:14:47 +00:00 · 2002-06-27 21:14:47 +00:00 · e56b99a590
commit e56b99a590
parent 59029844b7
9 changed files with 171 additions and 213 deletions
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@ -1,3 +1,8 @@
+//
+//  file:  rbbi.cpp  Contains the implementation of the rule based break iterator
+//                   runtime engine and the API implementation for
+//                   class RuleBasedBreakIterator
+//
 /*
 **********************************************************************
 *   Copyright (C) 1999-2002 International Business Machines Corporation   *
@ -5,6 +10,7 @@
 **********************************************************************
 */

+
 #include "unicode/rbbi.h"
 #include "unicode/schriter.h"
 #include "unicode/udata.h"
@ -151,12 +157,12 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
 UBool RuleBasedBreakIterator::fTrace = FALSE;
 void RuleBasedBreakIterator::init() {
    static UBool debugInitDone = FALSE;
-    
+
    fText                = NULL;
    fData                = NULL;
    fCharMappings        = NULL;
    fLastBreakTag        = 0;
-    fDictionaryCharCount = 0;   
+    fDictionaryCharCount = 0;

    if (debugInitDone == FALSE) {
        char *debugEnv = getenv("U_RBBIDEBUG");
@ -190,7 +196,7 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
    if (that.getDynamicClassID() != getDynamicClassID())
        return FALSE;

-    
+
    const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&)that;
    UBool r = (that2.fText == fText);
    r |= (*that2.fText == *fText);
@ -229,7 +235,7 @@ RuleBasedBreakIterator::getRules() const {
 const CharacterIterator&
 RuleBasedBreakIterator::getText() const {
    RuleBasedBreakIterator* nonConstThis = (RuleBasedBreakIterator*)this;
-    
+
    // The iterator is initialized pointing to no text at all, so if this
    // function is called while we're in that state, we have to fudge an
    // an iterator to return.
@ -252,7 +258,7 @@ RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
 }

 /**
- * Set the iterator to analyze a new piece of text.  This function resets 
+ * Set the iterator to analyze a new piece of text.  This function resets
 * the current iteration position to the beginning of the text.
 * @param newText An iterator over the text to analyze.
 */
@ -295,7 +301,7 @@ int32_t RuleBasedBreakIterator::last(void) {
    reset();
    if (fText == NULL)
        return BreakIterator::DONE;
-    
+
    // I'm not sure why, but t.last() returns the offset of the last character,
    // rather than the past-the-end offset

@ -352,7 +358,7 @@ int32_t RuleBasedBreakIterator::previous(void) {
    fText->previous32();
    int32_t lastResult = handlePrevious();
    int32_t result = lastResult;
-    
+
    // iterate forward from the known break position until we pass our
    // starting point.  The last break position before the starting
    // point is our return value
@ -360,7 +366,7 @@ int32_t RuleBasedBreakIterator::previous(void) {
        lastResult = result;
        result = handleNext();
    }
-    
+
    // set the current iteration position to be the last break position
    // before where we started, and then return that value
    fText->setIndex(lastResult);
@ -420,7 +426,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
    else if (offset < fText->startIndex()) {
        return fText->startIndex();
    }
-    
+
    // if we start by updating the current iteration position to the
    // position specified by the caller, we can just use previous()
    // to carry out this operation
@ -445,7 +451,7 @@ UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
    else if (offset < fText->startIndex() || offset > fText->endIndex()) {
        return FALSE;
    }
-        
+
    // otherwise, we can use following() on the position before the specified
    // one and return true of the position we get back is the one the user
    // specified
@ -462,7 +468,7 @@ int32_t RuleBasedBreakIterator::current(void) const {
 }

 //=======================================================================
-// implementation 
+// implementation
 //=======================================================================


@ -487,11 +493,11 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
    // no matter what, we always advance at least one character forward
    int32_t result = fText->getIndex() + 1;
    int32_t lookaheadResult = 0;
-    
+
    // begin in state 1
    int32_t            state           = START_STATE;
    int16_t            category;
-    UChar32            c               = fText->current32();  
+    UChar32            c               = fText->current32();
    RBBIStateTableRow *row;
    int32_t            lookaheadStatus = 0;
    int32_t            lookaheadTag    = 0;
@ -505,7 +511,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
          fDictionaryCharCount++;
          category &= ~0x4000;
        }
-  
+
      // loop until we reach the end of the text or transition to state 0
      for (;;) {
        if (c == CharacterIterator::DONE ) {
@ -542,17 +548,17 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
        state = row->fNextState[category];
        row = (RBBIStateTableRow *)
            (fData->fForwardTable->fTableData + (fData->fForwardTable->fRowLen * state));
-        
+
        // Get the next character.  Doing it here positions the iterator
        //    to the correct position for recording matches in the code that
        //    follows.
        c = fText->next32();
-        
+
        if (row->fAccepting == 0 && row->fLookAhead == 0) {
            // No match, nothing of interest happening, common case.
            goto continueOn;
        }
-        
+
        if (row->fAccepting == -1) {
            // Match found, common case, no lookahead involved.
            //    (It's possible that some lookahead rule matched here also,
@ -562,7 +568,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
            fLastBreakTag   = row->fTag;   // Remember the break status (tag) value.
            goto continueOn;
        }
-        
+
        if (row->fAccepting == 0 && row->fLookAhead != 0) {
            // Lookahead match point.  Remember it, but only if no other rule has
            //                         unconitionally matched up to this point.
@ -594,7 +600,7 @@ continueOn:
        if (state == STOP_STATE) {
            break;
        }
-        
+
        // c = fText->next32();
    }

@ -605,7 +611,7 @@ continueOn:
    if (c == CharacterIterator::DONE && lookaheadResult == fText->endIndex()) {
        result = lookaheadResult;
    }
-        
+

    fText->setIndex(result);
    if (fTrace) {
@ -626,13 +632,13 @@ continueOn:
 //
 //-----------------------------------------------------------------------------------
 int32_t RuleBasedBreakIterator::handlePrevious(void) {
-    if (fText == NULL || fData == NULL) { 
+    if (fText == NULL || fData == NULL) {
        return 0;
    }
    if (fData->fReverseTable == NULL) {
        return fText->setToStart();
    }
-        
+
    int32_t            state           = START_STATE;
    int32_t            category;
    int32_t            lastCategory    = 0;
@ -650,11 +656,11 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
        fDictionaryCharCount++;
        category &= ~0x4000;
    }
-    
+
    if (fTrace) {
        printf("Handle Prev   pos   char  state category  \n");
    }
-    
+
    // loop until we reach the beginning of the text or transition to state 0
    for (;;) {
        if (c == CharacterIterator::DONE) {
@ -693,14 +699,14 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
            // No match, nothing of interest happening, common case.
            goto continueOn;
        }
-        
+
        if (row->fAccepting == -1) {
            // Match found, common case, no lookahead involved.
            result = fText->getIndex();
            lookaheadStatus = 0;     // clear out any pending look-ahead matches.
            goto continueOn;
        }
-        
+
        if (row->fAccepting == 0 && row->fLookAhead != 0) {
            // Lookahead match point.  Remember it, but only if no other rule
            //                         has unconditionally matched to this point.
@ -714,7 +720,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
            }
            goto continueOn;
        }
-        
+
        if (row->fAccepting != 0 && row->fLookAhead != 0) {
            // Lookahead match is completed.  Set the result accordingly, but only
            //   if no other rule has matched further in the mean time.
@ -732,18 +738,18 @@ continueOn:
        if (state == STOP_STATE) {
            break;
        }
-            
+
        // then advance one character backwards
        c = fText->previous32();
    }
-    
-    // Note:  the result postion isn't what is returned to the user by previous(), 
-    //        but where the implementation of previous() turns around and 
+
+    // Note:  the result position isn't what is returned to the user by previous(),
+    //        but where the implementation of previous() turns around and
    //        starts iterating forward again.
    if (c == CharacterIterator::DONE) {
        result = fText->startIndex();
-    } 
-    fText->setIndex(result);  
+    }
+    fText->setIndex(result);

    return result;
 }
@ -808,7 +814,7 @@ BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
    }

    //
-    //  If user buffer size is zero this is a preflight operation to 
+    //  If user buffer size is zero this is a preflight operation to
    //    obtain the needed buffer size, allowing for worst case misalignment.
    //
    if (bufferSize == 0) {
@ -859,7 +865,7 @@ BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
        clone->fBufferClone = TRUE;
    }

-    return clone;    
+    return clone;
 }


--- a/icu4c/source/common/rbbicst.pl
+++ b/icu4c/source/common/rbbicst.pl
@ -1,6 +1,21 @@
 #
 #  rbbicst   Compile the RBBI rule paser state table data into initialized C data.
+#            Usage:
+#                   cd icu/source/common
+#                   perl rbbicst.pl < rbbirpt.txt > rbbirpt.h
 #
+#             The output file, rbbirpt.h, is included by some of the .cpp rbbi
+#             implementation files.   This perl script is NOT run as part
+#             of a normal ICU build.  It is run by hand when needed, and the
+#             rbbirpt.h generated file is put back into cvs.
+#
+#             See rbbirpt.txt for a description of the input format for this script.
+#
+#*********************************************************************
+#   Copyright (C) 2002 International Business Machines Corporation   *
+#   and others. All rights reserved.                                 *
+#*********************************************************************
+

 $num_states = 1;     # Always the state number for the line being compiled.
 $line_num  = 0;      # The line number in the input file.
@ -180,10 +195,14 @@ die if ($errors>0);
 print "//---------------------------------------------------------------------------------\n";
 print "//\n";
 print "// Generated Header File.  Do not edit by hand.\n";
-print "//    This file contains the state table for RBBI rule parser.\n";
+print "//    This file contains the state table for the ICU Rule Based Break Iterator\n";
+print "//    rule parser.\n";
 print "//    It is generated by the Perl script \"rbbicst.pl\" from\n";
 print "//    the rule parser state definitions file \"rbbirpt.txt\".\n";
 print "//\n";
+print "//   Copyright (C) 2002 International Business Machines Corporation \n";
+print "//   and others. All rights reserved.  \n";
+print "//\n";
 print "//---------------------------------------------------------------------------------\n";
 print "#ifndef RBBIRPT_H\n";
 print "#define RBBIRPT_H\n";
@ -257,7 +276,7 @@ for ($state=1; $state < $num_states; $state++) {
    print "    , {$state_func_name[$state],";
    if ($state_literal_chars[$state] ne "") {
        $c = $state_literal_chars[$state];
-        printf(" %d /*$c*/,", ord($c));   #TODO:  use numeric value, so EBCDIC machines are ok.
+        printf(" %d /* $c */,", ord($c));   #  use numeric value, so EBCDIC machines are ok.
    }else {
        print " $charClasses{$state_char_class[$state]},";
    }
--- a/icu4c/source/common/rbbirb.cpp
+++ b/icu4c/source/common/rbbirb.cpp
@ -201,8 +201,8 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
    //
    //   Generate the DFA state transition table.
    //
-    builder.fForwardTables = new RBBITableBuilder(&builder, builder.fForwardTree);
-    builder.fReverseTables = new RBBITableBuilder(&builder, builder.fReverseTree);
+    builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
+    builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
    builder.fForwardTables->build();
    builder.fReverseTables->build();
    if (U_FAILURE(status)) {
--- a/icu4c/source/common/rbbirpt.h
+++ b/icu4c/source/common/rbbirpt.h
@ -1,10 +1,14 @@
 //---------------------------------------------------------------------------------
 //
 // Generated Header File.  Do not edit by hand.
-//    This file contains the state table for RBBI rule parser.
+//    This file contains the state table for the ICU Rule Based Break Iterator
+//    rule parser.
 //    It is generated by the Perl script "rbbicst.pl" from
 //    the rule parser state definitions file "rbbirpt.txt".
 //
+//   Copyright (C) 2002 International Business Machines Corporation 
+//   and others. All rights reserved.  
+//
 //---------------------------------------------------------------------------------
 #ifndef RBBIRPT_H
 #define RBBIRPT_H
@ -71,87 +75,87 @@ struct RBBIRuleTableEl gRuleParseStateTable[] = {
    {doNOP, 0, 0, 0, TRUE}
    , {doExprStart, 254, 12, 8, FALSE}     //  1      start
    , {doNOP, 130, 1,0,  TRUE}     //  2 
-    , {doExprStart, 36 /*$*/, 70, 80, FALSE}     //  3 
-    , {doReverseDir, 33 /*!*/, 11,0,  TRUE}     //  4 
-    , {doNOP, 59 /*;*/, 1,0,  TRUE}     //  5 
+    , {doExprStart, 36 /* $ */, 70, 80, FALSE}     //  3 
+    , {doReverseDir, 33 /* ! */, 11,0,  TRUE}     //  4 
+    , {doNOP, 59 /* ; */, 1,0,  TRUE}     //  5 
    , {doNOP, 252, 0,0,  FALSE}     //  6 
    , {doExprStart, 255, 12, 8, FALSE}     //  7 
-    , {doEndOfRule, 59 /*;*/, 1,0,  TRUE}     //  8      break-rule-end
+    , {doEndOfRule, 59 /* ; */, 1,0,  TRUE}     //  8      break-rule-end
    , {doNOP, 130, 8,0,  TRUE}     //  9 
    , {doRuleError, 255, 85,0,  FALSE}     //  10 
    , {doExprStart, 255, 12, 8, FALSE}     //  11      reverse-rule
    , {doRuleChar, 254, 21,0,  TRUE}     //  12      term
    , {doNOP, 130, 12,0,  TRUE}     //  13 
    , {doRuleChar, 129, 21,0,  TRUE}     //  14 
-    , {doNOP, 91 /*[*/, 76, 21, FALSE}     //  15 
-    , {doLParen, 40 /*(*/, 12, 21, TRUE}     //  16 
-    , {doNOP, 36 /*$*/, 70, 20, FALSE}     //  17 
-    , {doDotAny, 46 /*.*/, 21,0,  TRUE}     //  18 
+    , {doNOP, 91 /* [ */, 76, 21, FALSE}     //  15 
+    , {doLParen, 40 /* ( */, 12, 21, TRUE}     //  16 
+    , {doNOP, 36 /* $ */, 70, 20, FALSE}     //  17 
+    , {doDotAny, 46 /* . */, 21,0,  TRUE}     //  18 
    , {doRuleError, 255, 85,0,  FALSE}     //  19 
    , {doCheckVarDef, 255, 21,0,  FALSE}     //  20      term-var-ref
-    , {doUnaryOpStar, 42 /***/, 25,0,  TRUE}     //  21      expr-mod
-    , {doUnaryOpPlus, 43 /*+*/, 25,0,  TRUE}     //  22 
-    , {doUnaryOpQuestion, 63 /*?*/, 25,0,  TRUE}     //  23 
+    , {doUnaryOpStar, 42 /* * */, 25,0,  TRUE}     //  21      expr-mod
+    , {doUnaryOpPlus, 43 /* + */, 25,0,  TRUE}     //  22 
+    , {doUnaryOpQuestion, 63 /* ? */, 25,0,  TRUE}     //  23 
    , {doNOP, 255, 25,0,  FALSE}     //  24 
    , {doExprCatOperator, 254, 12,0,  FALSE}     //  25      expr-cont
    , {doNOP, 130, 25,0,  TRUE}     //  26 
    , {doExprCatOperator, 129, 12,0,  FALSE}     //  27 
-    , {doExprCatOperator, 91 /*[*/, 12,0,  FALSE}     //  28 
-    , {doExprCatOperator, 40 /*(*/, 12,0,  FALSE}     //  29 
-    , {doExprCatOperator, 36 /*$*/, 12,0,  FALSE}     //  30 
-    , {doExprCatOperator, 46 /*.*/, 12,0,  FALSE}     //  31 
-    , {doExprCatOperator, 47 /*/*/, 37,0,  FALSE}     //  32 
-    , {doExprCatOperator, 123 /*{*/, 49,0,  TRUE}     //  33 
-    , {doExprOrOperator, 124 /*|*/, 12,0,  TRUE}     //  34 
-    , {doExprRParen, 41 /*)*/, 255,0,  TRUE}     //  35 
+    , {doExprCatOperator, 91 /* [ */, 12,0,  FALSE}     //  28 
+    , {doExprCatOperator, 40 /* ( */, 12,0,  FALSE}     //  29 
+    , {doExprCatOperator, 36 /* $ */, 12,0,  FALSE}     //  30 
+    , {doExprCatOperator, 46 /* . */, 12,0,  FALSE}     //  31 
+    , {doExprCatOperator, 47 /* / */, 37,0,  FALSE}     //  32 
+    , {doExprCatOperator, 123 /* { */, 49,0,  TRUE}     //  33 
+    , {doExprOrOperator, 124 /* | */, 12,0,  TRUE}     //  34 
+    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  35 
    , {doExprFinished, 255, 255,0,  FALSE}     //  36 
-    , {doSlash, 47 /*/*/, 39,0,  TRUE}     //  37      look-ahead
+    , {doSlash, 47 /* / */, 39,0,  TRUE}     //  37      look-ahead
    , {doNOP, 255, 85,0,  FALSE}     //  38 
    , {doExprCatOperator, 254, 12,0,  FALSE}     //  39      expr-cont-no-slash
    , {doNOP, 130, 25,0,  TRUE}     //  40 
    , {doExprCatOperator, 129, 12,0,  FALSE}     //  41 
-    , {doExprCatOperator, 91 /*[*/, 12,0,  FALSE}     //  42 
-    , {doExprCatOperator, 40 /*(*/, 12,0,  FALSE}     //  43 
-    , {doExprCatOperator, 36 /*$*/, 12,0,  FALSE}     //  44 
-    , {doExprCatOperator, 46 /*.*/, 12,0,  FALSE}     //  45 
-    , {doExprOrOperator, 124 /*|*/, 12,0,  TRUE}     //  46 
-    , {doExprRParen, 41 /*)*/, 255,0,  TRUE}     //  47 
+    , {doExprCatOperator, 91 /* [ */, 12,0,  FALSE}     //  42 
+    , {doExprCatOperator, 40 /* ( */, 12,0,  FALSE}     //  43 
+    , {doExprCatOperator, 36 /* $ */, 12,0,  FALSE}     //  44 
+    , {doExprCatOperator, 46 /* . */, 12,0,  FALSE}     //  45 
+    , {doExprOrOperator, 124 /* | */, 12,0,  TRUE}     //  46 
+    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  47 
    , {doExprFinished, 255, 255,0,  FALSE}     //  48 
    , {doNOP, 130, 49,0,  TRUE}     //  49      tag-open
    , {doStartTagValue, 128, 52,0,  FALSE}     //  50 
    , {doTagExpectedError, 255, 85,0,  FALSE}     //  51 
    , {doNOP, 130, 56,0,  TRUE}     //  52      tag-value
-    , {doNOP, 125 /*}*/, 56,0,  FALSE}     //  53 
+    , {doNOP, 125 /* } */, 56,0,  FALSE}     //  53 
    , {doTagDigit, 128, 52,0,  TRUE}     //  54 
    , {doTagExpectedError, 255, 85,0,  FALSE}     //  55 
    , {doNOP, 130, 56,0,  TRUE}     //  56      tag-close
-    , {doTagValue, 125 /*}*/, 59,0,  TRUE}     //  57 
+    , {doTagValue, 125 /* } */, 59,0,  TRUE}     //  57 
    , {doTagExpectedError, 255, 85,0,  FALSE}     //  58 
    , {doExprCatOperator, 254, 12,0,  FALSE}     //  59      expr-cont-no-tag
    , {doNOP, 130, 59,0,  TRUE}     //  60 
    , {doExprCatOperator, 129, 12,0,  FALSE}     //  61 
-    , {doExprCatOperator, 91 /*[*/, 12,0,  FALSE}     //  62 
-    , {doExprCatOperator, 40 /*(*/, 12,0,  FALSE}     //  63 
-    , {doExprCatOperator, 36 /*$*/, 12,0,  FALSE}     //  64 
-    , {doExprCatOperator, 46 /*.*/, 12,0,  FALSE}     //  65 
-    , {doExprCatOperator, 47 /*/*/, 37,0,  FALSE}     //  66 
-    , {doExprOrOperator, 124 /*|*/, 12,0,  TRUE}     //  67 
-    , {doExprRParen, 41 /*)*/, 255,0,  TRUE}     //  68 
+    , {doExprCatOperator, 91 /* [ */, 12,0,  FALSE}     //  62 
+    , {doExprCatOperator, 40 /* ( */, 12,0,  FALSE}     //  63 
+    , {doExprCatOperator, 36 /* $ */, 12,0,  FALSE}     //  64 
+    , {doExprCatOperator, 46 /* . */, 12,0,  FALSE}     //  65 
+    , {doExprCatOperator, 47 /* / */, 37,0,  FALSE}     //  66 
+    , {doExprOrOperator, 124 /* | */, 12,0,  TRUE}     //  67 
+    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  68 
    , {doExprFinished, 255, 255,0,  FALSE}     //  69 
-    , {doStartVariableName, 36 /*$*/, 72,0,  TRUE}     //  70      scan-var-name
+    , {doStartVariableName, 36 /* $ */, 72,0,  TRUE}     //  70      scan-var-name
    , {doNOP, 255, 85,0,  FALSE}     //  71 
    , {doNOP, 132, 74,0,  TRUE}     //  72      scan-var-start
    , {doVariableNameExpectedErr, 255, 85,0,  FALSE}     //  73 
    , {doNOP, 131, 74,0,  TRUE}     //  74      scan-var-body
    , {doEndVariableName, 255, 255,0,  FALSE}     //  75 
-    , {doScanUnicodeSet, 91 /*[*/, 255,0,  TRUE}     //  76      scan-unicode-set
-    , {doScanUnicodeSet, 112 /*p*/, 255,0,  TRUE}     //  77 
-    , {doScanUnicodeSet, 80 /*P*/, 255,0,  TRUE}     //  78 
+    , {doScanUnicodeSet, 91 /* [ */, 255,0,  TRUE}     //  76      scan-unicode-set
+    , {doScanUnicodeSet, 112 /* p */, 255,0,  TRUE}     //  77 
+    , {doScanUnicodeSet, 80 /* P */, 255,0,  TRUE}     //  78 
    , {doNOP, 255, 85,0,  FALSE}     //  79 
    , {doNOP, 130, 80,0,  TRUE}     //  80      assign-or-rule
-    , {doStartAssign, 61 /*=*/, 12, 83, TRUE}     //  81 
+    , {doStartAssign, 61 /* = */, 12, 83, TRUE}     //  81 
    , {doNOP, 255, 20, 8, FALSE}     //  82 
-    , {doEndAssign, 59 /*;*/, 1,0,  TRUE}     //  83      assign-end
+    , {doEndAssign, 59 /* ; */, 1,0,  TRUE}     //  83      assign-end
    , {doRuleErrorAssignExpr, 255, 85,0,  FALSE}     //  84 
    , {doExit, 255, 85,0,  TRUE}     //  85      errorDeath
 };
--- a/icu4c/source/common/rbbisetb.cpp
+++ b/icu4c/source/common/rbbisetb.cpp
@ -1,18 +1,20 @@
 //
 //  rbbisetb.cpp
+//
 /*
 **********************************************************************
-*   Copyright (c) 2001, International Business Machines
-*   Corporation and others.  All Rights Reserved.
+*   Copyright (C) 2002 International Business Machines Corporation   *
+*   and others. All rights reserved.                                 *
 **********************************************************************
 */
 //
-//  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules.
+//  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules
+//                   (part of the rule building process.)
 //
 //      Starting with the rules parse tree from the scanner,
 //
 //                   -  Enumerate the set of UnicodeSets that are referenced
-//                      by the RBBI rules. 
+//                      by the RBBI rules.
 //                   -  compute a set of non-overlapping character ranges
 //                      with all characters within a range belonging to the same
 //                      set of input uniocde sets.
@ -62,10 +64,10 @@ RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
 //   Destructor
 //
 //------------------------------------------------------------------------
-RBBISetBuilder::~RBBISetBuilder() 
+RBBISetBuilder::~RBBISetBuilder()
 {
    RangeDescriptor   *nextRangeDesc;
-    
+
    // Walk through & delete the linked list of RangeDescriptors
    for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) {
        RangeDescriptor *r = nextRangeDesc;
@ -227,7 +229,7 @@ void RBBISetBuilder::build() {

    if (fRB->fDebugEnv && strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
    if (fRB->fDebugEnv && strstr(fRB->fDebugEnv, "esets")) {printSets();}
-    
+
    //
    // Build the Trie table for mapping UChar32 values to the corresponding
    //   range group number
@ -278,7 +280,7 @@ utrie_serialize(fTrie,
                TRUE,                    // Reduce to 16 bits
                fStatus);
 }
-    
+
 //------------------------------------------------------------------------
 //
 //  addValToSets     Add a runtime-mapped input value to each uset from a
@ -291,7 +293,7 @@ utrie_serialize(fTrie,
 //
 //                   The "logically equivalent expression" is the tree for an
 //                   or-ing together of all of the symbols that go into the set.
-//                   
+//
 //------------------------------------------------------------------------
 void  RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {
    int32_t       ix;
@ -354,7 +356,7 @@ void RBBISetBuilder::printRanges() {
                if (varRef != NULL  &&  varRef->fType == RBBINode::varRef) {
                    setName = varRef->fText;
                }
-            } 
+            }
            RBBINode::printUnicodeString(setName); printf("  ");
        }
        printf("\n");
@ -373,7 +375,7 @@ void RBBISetBuilder::printRangeGroups() {
    RangeDescriptor       *tRange;
    int                    i;
    int                    lastPrintedGroupNum = 0;
-    
+
    printf("\nRanges grouped by Unicode Set Membership...\n");
    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
        int groupNum = rlRange->fNum & 0xbfff;
@ -382,7 +384,7 @@ void RBBISetBuilder::printRangeGroups() {
            printf("%2i  ", groupNum);

            if (rlRange->fNum & 0x4000) { printf(" <DICT> ");};
-            
+
            for (i=0; i<rlRange->fIncludesSets->size(); i++) {
                RBBINode       *usetNode    = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
                UnicodeString   setName = "anon";
@ -392,8 +394,8 @@ void RBBISetBuilder::printRangeGroups() {
                    if (varRef != NULL  &&  varRef->fType == RBBINode::varRef) {
                        setName = varRef->fText;
                    }
-                } 
-                RBBINode::printUnicodeString(setName); printf(" "); 
+                }
+                RBBINode::printUnicodeString(setName); printf(" ");
            }

            i = 0;
@ -410,7 +412,7 @@ void RBBISetBuilder::printRangeGroups() {
    }
    printf("\n");
 }
-    
+


 //------------------------------------------------------------------------
@ -440,7 +442,7 @@ void RBBISetBuilder::printSets() {
            if (varRef != NULL  &&  varRef->fType == RBBINode::varRef) {
                setName = varRef->fText;
            }
-        } 
+        }
        RBBINode::printUnicodeString(setName);
        printf("   ");
        RBBINode::printUnicodeString(usetNode->fText);
--- a/icu4c/source/common/rbbistbl.cpp
+++ b/icu4c/source/common/rbbistbl.cpp
@ -1,12 +1,12 @@
 //
 //  file:  rbbistbl.cpp    Implementation of the ICU RBBISymbolTable class
 //
-
-/********************************************************************
- * COPYRIGHT:
- * Copyright (c) 1997-2001, International Business Machines Corporation and
- * others. All Rights Reserved.
- ********************************************************************/
+/*
+**********************************************************************
+*   Copyright (C) 2002 International Business Machines Corporation   *
+*   and others. All rights reserved.                                 *
+**********************************************************************
+*/

 #include "unicode/unistr.h"
 #include "unicode/uniset.h"
--- a/icu4c/source/common/rbbitblb.cpp
+++ b/icu4c/source/common/rbbitblb.cpp
@ -4,7 +4,7 @@

 /*
 **********************************************************************
-*   Copyright (c) 2001, International Business Machines
+*   Copyright (c) 2002, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
@ -20,8 +20,8 @@
 #include <assert.h>


-RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode *&rootNode) :
- fTree(rootNode) {
+RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
+ fTree(*rootNode) {
    fRB             = rb;
    fStatus         = fRB->fStatus;
    fDStates        = new UVector(*fStatus);
--- a/icu4c/source/common/rbbitblb.h
+++ b/icu4c/source/common/rbbitblb.h
@ -4,7 +4,7 @@

 /*
 **********************************************************************
-*   Copyright (c) 2001, International Business Machines
+*   Copyright (c) 2002, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
@ -21,6 +21,7 @@
 U_NAMESPACE_BEGIN

 class RBBIRuleScanner;
+class RBBIRuleBuilder;

 //
 //  class RBBITableBuilder is part of the RBBI rule compiler.
@ -33,9 +34,7 @@ class RBBIRuleScanner;

 class RBBITableBuilder : public UObject {
 public:
-    // TODO:  add a root node param to the constructor.  We're going to have two
-    //        builders, one for the forward table, and one for the reverse table.
-    RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode *&rootNode);
+    RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode);
    ~RBBITableBuilder();

    void     build();
@ -46,7 +45,7 @@ public:
                                        //     the specified location.

    //  TODO:  add getter function(s) for the built table.
-    
+
 private:
    void     calcNullable(RBBINode *n);
    void     calcFirstPos(RBBINode *n);
@ -71,7 +70,7 @@ private:
 private:
    RBBIRuleBuilder  *fRB;
    RBBINode         *&fTree;              // The root node of the parse tree to build a
-                                           //   table for.  
+                                           //   table for.
    UErrorCode       *fStatus;

    UVector          *fDStates;            //  D states (Aho's terminology)
--- a/icu4c/source/common/unicode/rbbi.h
+++ b/icu4c/source/common/unicode/rbbi.h
@ -1,5 +1,9 @@
 /*
-* Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.
+***************************************************************************
+*   Copyright (C) 1999-2002 International Business Machines Corporation   *
+*   and others. All rights reserved.                                      *
+***************************************************************************
+
 **********************************************************************
 *   Date        Name        Description
 *   10/22/99    alan        Creation.
@ -28,26 +32,18 @@ class BreakIterator;
 /**
 * <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
 *
- * <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i>
+ * <p>There are two kinds of rules, which are separated by semicolons: <i>variable definitions</i>
 * and <i>regular expressions.</i></p>
 *
- * <p>A substitution rule defines a name that can be used in place of an expression. It
- * consists of a name, which is a string of characters contained in angle brackets, an equals
- * sign, and an expression. (There can be no whitespace on either side of the equals sign.)
- * To keep its syntactic meaning intact, the expression must be enclosed in parentheses or
- * square brackets. A substitution is visible after its definition, and is filled in using
- * simple textual substitution. Substitution definitions can contain other substitutions, as
- * long as those substitutions have been defined first. Substitutions are generally used to
- * make the regular expressions (which can get quite complex) shorted and easier to read.
+ * <p>A variable definition defines a variable name that can be used in subsequent expressions.
+ * It consists of a name preceded by a dollar sign, an equals
+ * sign, and an expression.
+ * A $variable is visible after its definition.
+ * Variable definitions can contain other variables, as
+ * long as those variables have been defined first. Variables are generally used to
+ * make the regular expressions (which can get quite complex) shorter and easier to read.
 * They typically define either character categories or commonly-used subexpressions.</p>
 *
- * <p>There is one special substitution.&nbsp; If the description defines a substitution
- * called &quot;&lt;ignore&gt;&quot;, the expression must be a [] expression, and the
- * expression defines a set of characters (the &quot;<em>ignore characters</em>&quot;) that
- * will be transparent to the BreakIterator.&nbsp; A sequence of characters will break the
- * same way it would if any ignore characters it contains are taken out.&nbsp; Break
- * positions never occur befoer ignore characters.</p>
- *
 * <p>A regular expression uses a subset of the normal Unix regular-expression syntax, and
 * defines a sequence of characters to be kept together. With one significant exception, the
 * iterator uses a longest-possible-match algorithm when matching text to regular
@ -64,10 +60,6 @@ class BreakIterator;
 *       of times (including not at all).</td>
 *     </tr>
 *     <tr>
- *       <td width="6%">{}</td>
- *       <td width="94%">Encloses a sequence of characters that is optional.</td>
- *     </tr>
- *     <tr>
 *       <td width="6%">()</td>
 *       <td width="94%">Encloses a sequence of characters.&nbsp; If followed by *, the sequence
 *       repeats.&nbsp; Otherwise, the parentheses are just a grouping device and a way to delimit
@ -76,29 +68,17 @@ class BreakIterator;
 *     <tr>
 *       <td width="6%">|</td>
 *       <td width="94%">Separates two alternative sequences of characters.&nbsp; Either one
- *       sequence or the other, but not both, matches this expression.&nbsp; The | character can
- *       only occur inside ().</td>
+ *       sequence or the other, but not both, matches this expression.</td>
 *     </tr>
 *     <tr>
 *       <td width="6%">.</td>
 *       <td width="94%">Matches any character.</td>
 *     </tr>
 *     <tr>
- *       <td width="6%">*?</td>
- *       <td width="94%">Specifies a non-greedy asterisk.&nbsp; *? works the same way as *, except
- *       when there is overlap between the last group of characters in the expression preceding the
- *       * and the first group of characters following the *.&nbsp; When there is this kind of
- *       overlap, * will match the longest sequence of characters that match the expression before
- *       the *, and *? will match the shortest sequence of characters matching the expression
- *       before the *?.&nbsp; For example, if you have &quot;xxyxyyyxyxyxxyxyxyy&quot; in the text,
- *       &quot;x[xy]*x&quot; will match through to the last x (i.e., &quot;<strong>xxyxyyyxyxyxxyxyx</strong>yy&quot;,
- *       but &quot;x[xy]*?x&quot; will only match the first two xes (&quot;<strong>xx</strong>yxyyyxyxyxxyxyxyy&quot;).</td>
- *     </tr>
- *     <tr>
 *       <td width="6%">[]</td>
- *       <td width="94%">Specifies a group of alternative characters.&nbsp; A [] expression will
+ *       <td width="94%">Specifies a set of characters.&nbsp; A [] expression will
 *       match any single character that is specified in the [] expression.&nbsp; For more on the
- *       syntax of [] expressions, see below.</td>
+ *       syntax of [] expressions, see the ICU User Guide description of UnicodeSet.</td>
 *     </tr>
 *     <tr>
 *       <td width="6%">/</td>
@ -111,24 +91,16 @@ class BreakIterator;
 *     <tr>
 *       <td width="6%">\</td>
 *       <td width="94%">Escape character.&nbsp; The \ itself is ignored, but causes the next
- *       character to be treated as literal character.&nbsp; This has no effect for many
- *       characters, but for the characters listed above, this deprives them of their special
- *       meaning.&nbsp; (There are no special escape sequences for Unicode characters, or tabs and
- *       newlines; these are all handled by a higher-level protocol.&nbsp; In a Java string,
- *       &quot;\n&quot; will be converted to a literal newline character by the time the
- *       regular-expression parser sees it.&nbsp; Of course, this means that \ sequences that are
- *       visible to the regexp parser must be written as \\ when inside a Java string.)&nbsp; All
- *       characters in the ASCII range except for letters, digits, and control characters are
- *       reserved characters to the parser and must be preceded by \ even if they currently don't
- *       mean anything.</td>
+ *       character to be treated as a literal character.&nbsp;  Except for letters and numbers,
+ *       characters in the ASCII range must be escaped to be considered as literals.</td>
 *     </tr>
 *     <tr>
 *       <td width="6%">!</td>
 *       <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp
 *       parser that this expression specifies the backwards-iteration behavior of the iterator,
- *       and not its normal iteration behavior.&nbsp; This is generally only used in situations
- *       where the automatically-generated backwards-iteration brhavior doesn't produce
- *       satisfactory results and must be supplemented with extra client-specified rules.</td>
+ *       and not its normal iteration behavior.&nbsp;  The backwards rules must move the
+ *       iterator to a safe position at or before the previous break position; forwards rules
+ *       will then be used to find the exact previous position.</td>
 *     </tr>
 *     <tr>
 *       <td width="6%"><em>(all others)</em></td>
@ -137,52 +109,6 @@ class BreakIterator;
 *     </tr>
 *   </table>
 * </blockquote>
- *
- * <p>Within a [] expression, a number of other special characters can be used to specify
- * groups of characters:</p>
- *
- * <blockquote>
- *   <table border="1" width="100%">
- *     <tr>
- *       <td width="6%">-</td>
- *       <td width="94%">Specifies a range of matching characters.&nbsp; For example
- *       &quot;[a-p]&quot; matches all lowercase Latin letters from a to p (inclusive).&nbsp; The -
- *       sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a
- *       language's alphabetical order: &quot;[a-z]&quot; doesn't include capital letters, nor does
- *       it include accented letters such as a-umlaut.</td>
- *     </tr>
- *     <tr>
- *       <td width="6%">::</td>
- *       <td width="94%">A pair of colons containing a one- or two-letter code matches all
- *       characters in the corresponding Unicode category.&nbsp; The two-letter codes are the same
- *       as the two-letter codes in the Unicode database (for example, &quot;[:Sc::Sm:]&quot;
- *       matches all currency symbols and all math symbols).&nbsp; Specifying a one-letter code is
- *       the same as specifying all two-letter codes that begin with that letter (for example,
- *       &quot;[:L:]&quot; matches all letters, and is equivalent to
- *       &quot;[:Lu::Ll::Lo::Lm::Lt:]&quot;).&nbsp; Anything other than a valid two-letter Unicode
- *       category code or a single letter that begins a Unicode category code is illegal within
- *       colons.</td>
- *     </tr>
- *     <tr>
- *       <td width="6%">[]</td>
- *       <td width="94%">[] expressions can nest.&nbsp; This has no effect, except when used in
- *       conjunction with the ^ token.</td>
- *     </tr>
- *     <tr>
- *       <td width="6%">^</td>
- *       <td width="94%">Excludes the character (or the characters in the [] expression) following
- *       it from the group of characters.&nbsp; For example, &quot;[a-z^p]&quot; matches all Latin
- *       lowercase letters except p.&nbsp; &quot;[:L:^[\u4e00-\u9fff]]&quot; matches all letters
- *       except the Han ideographs.</td>
- *     </tr>
- *     <tr>
- *       <td width="6%"><em>(all others)</em></td>
- *       <td width="94%">All other characters are treated as literal characters.&nbsp; (For
- *       example, &quot;[aeiou]&quot; specifies just the letters a, e, i, o, and u.)</td>
- *     </tr>
- *   </table>
- * </blockquote>
- *
 */


@ -201,7 +127,9 @@ protected:
    //
    RBBIDataWrapper    *fData;
    UTrie              *fCharMappings;
-    int32_t             fLastBreakTag;      // Rule {tag} value for the most recent match.
+
+    // Rule {tag} value for the most recent match.
+    int32_t             fLastBreakTag;

    //
    // Counter for the number of characters encountered with the "dictionary"
@ -215,7 +143,7 @@ protected:
    // Debugging flag.
    //
    static UBool        fTrace;
-    
+


 private:
@ -228,7 +156,7 @@ protected:
    //=======================================================================
    // constructors
    //=======================================================================
-     
+
     // This constructor uses the udata interface to create a BreakIterator whose
     // internal tables live in a memory-mapped file.  "image" is a pointer to the
     // beginning of that file.
@ -248,7 +176,7 @@ protected:
     friend class BreakIterator;


-     
+
 public:

     /** Default constructor.  Creates an empty shell of an iterator, with no
@ -500,7 +428,7 @@ protected:
      * Return true if the category lookup for this char
      * indicates that it is in the set of dictionary lookup chars.
      * This function is intended for use by dictionary based break iterators.
-      */               
+      */
    virtual UBool isDictionaryChar(UChar32);

    /**
@ -513,7 +441,7 @@ protected:



-    
+
 //----------------------------------------------------------------------------------
 //
 //   Inline Functions Definitions ...