ICU-1117 add getRuleStatus() to RBBI

X-SVN-Rev: 8956
2002-06-27 01:50:22 +00:00 · 2002-06-27 01:50:22 +00:00 · 878c84b1d2
commit 878c84b1d2
parent 37792a8277
11 changed files with 116 additions and 28 deletions
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@ -155,7 +155,7 @@ void RuleBasedBreakIterator::init() {
    fText                = NULL;
    fData                = NULL;
    fCharMappings        = NULL;
-    fLastBreakStatus     = 0;
+    fLastBreakTag        = 0;
    fDictionaryCharCount = 0;   

    if (debugInitDone == FALSE) {
@ -489,11 +489,14 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
    int32_t lookaheadResult = 0;
    
    // begin in state 1
-    int32_t            state    = START_STATE;
+    int32_t            state           = START_STATE;
    int16_t            category;
-    UChar32            c        = fText->current32();  
+    UChar32            c               = fText->current32();  
    RBBIStateTableRow *row;
    int32_t            lookaheadStatus = 0;
+    int32_t            lookaheadTag    = 0;
+
+    fLastBreakTag = 0;

    row = (RBBIStateTableRow *)
        (fData->fForwardTable->fTableData + (fData->fForwardTable->fRowLen * state));
@ -550,10 +553,13 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
            goto continueOn;
        }
        
-        if (row->fAccepting != 0 && row->fLookAhead == 0) {
+        if (row->fAccepting == -1) {
            // Match found, common case, no lookahead involved.
-            result = fText->getIndex();
-            lookaheadStatus = 0;     // clear out any pending look-ahead matches.
+            //    (It's possible that some lookahead rule matched here also,
+            //     but since there's an unconditional match, we'll favor that.)
+            result          = fText->getIndex();
+            lookaheadStatus = 0;           // clear out any pending look-ahead matches.
+            fLastBreakTag   = row->fTag;   // Remember the break status (tag) value.
            goto continueOn;
        }
        
@ -566,6 +572,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
            if (r > result) {
                lookaheadResult = r;
                lookaheadStatus = row->fLookAhead;
+                lookaheadTag   = row->fTag;
            }
            goto continueOn;
        }
@ -576,7 +583,8 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
            if (lookaheadResult > result) {
                assert(row->fAccepting == lookaheadStatus);   // TODO:  handle this case
                //    of overlapping lookahead matches.
-                result = lookaheadResult;
+                result          = lookaheadResult;
+                fLastBreakTag   = lookaheadTag;
                lookaheadStatus = 0;
            }
            goto continueOn;
@ -631,6 +639,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
    int32_t            result          = fText->getIndex();
    int32_t            lookaheadStatus = 0;
    int32_t            lookaheadResult = 0;
+    int32_t            lookaheadTag    = 0;
    UChar32            c               = fText->current32();
    RBBIStateTableRow *row;

@ -685,7 +694,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
            goto continueOn;
        }
        
-        if (row->fAccepting != 0 && row->fLookAhead == 0) {
+        if (row->fAccepting == -1) {
            // Match found, common case, no lookahead involved.
            result = fText->getIndex();
            lookaheadStatus = 0;     // clear out any pending look-ahead matches.
@ -694,13 +703,14 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
        
        if (row->fAccepting == 0 && row->fLookAhead != 0) {
            // Lookahead match point.  Remember it, but only if no other rule
-            //   has unconditinally matched to this point.
+            //                         has unconditionally matched to this point.
            // TODO:  handle case where there's a pending match from a different rule
            //        where lookaheadStatus != 0  && lookaheadStatus != row->fLookAhead.
            int32_t  r = fText->getIndex();
            if (r > result) {
                lookaheadResult = r;
                lookaheadStatus = row->fLookAhead;
+                lookaheadTag    = row->fTag;
            }
            goto continueOn;
        }
@ -711,7 +721,8 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
            if (lookaheadResult > result) {
                assert(row->fAccepting == lookaheadStatus);   // TODO:  handle this case
                //    of overlapping lookahead matches.
-                result = lookaheadResult;
+                result          = lookaheadResult;
+                fLastBreakTag   = lookaheadTag;
                lookaheadStatus = 0;
            }
            goto continueOn;
@ -752,8 +763,8 @@ RuleBasedBreakIterator::reset()
 //   getRuleStatus()
 //
 //-------------------------------------------------------------------------------
-int16_t  RuleBasedBreakIterator::getRuleStatus() const {
-    return fLastBreakStatus;
+int32_t  RuleBasedBreakIterator::getRuleStatus() const {
+    return fLastBreakTag;
 }


@ -764,13 +775,13 @@ int16_t  RuleBasedBreakIterator::getRuleStatus() const {
 //                         for standard iterator types.
 //
 //-------------------------------------------------------------------------------
-const uint8_t  *RuleBasedBreakIterator::getFlattenedData(uint32_t *length) {
+const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
    const uint8_t  *retPtr = NULL;
-    *length = 0;
+    length = 0;

    if (fData != NULL) {
        retPtr = (const uint8_t *)fData->fHeader;
-         *length = fData->fHeader->fLength;
+         length = fData->fHeader->fLength;
    }
    return retPtr;
 }
--- a/icu4c/source/common/rbbirb.cpp
+++ b/icu4c/source/common/rbbirb.cpp
@ -164,10 +164,12 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {



+//----------------------------------------------------------------------------------------
 //
-//  RulesBasedBreakIterator, construct from source rules that are passed in
-//                           in a UnicodeString
+//  createRuleBasedBreakIterator    construct from source rules that are passed in
+//                                  in a UnicodeString
 //
+//----------------------------------------------------------------------------------------
 BreakIterator * 
 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
                                    UParseError      &parseError,
--- a/icu4c/source/common/rbbirpt.h
+++ b/icu4c/source/common/rbbirpt.h
@ -101,7 +101,7 @@ struct RBBIRuleTableEl gRuleParseStateTable[] = {
    , {doExprCatOperator, 36 /*$*/, 12,0,  FALSE}     //  30 
    , {doExprCatOperator, 46 /*.*/, 12,0,  FALSE}     //  31 
    , {doExprCatOperator, 47 /*/*/, 37,0,  FALSE}     //  32 
-    , {doExprCatOperator, 123 /*{*/, 49,0,  FALSE}     //  33 
+    , {doExprCatOperator, 123 /*{*/, 49,0,  TRUE}     //  33 
    , {doExprOrOperator, 124 /*|*/, 12,0,  TRUE}     //  34 
    , {doExprRParen, 41 /*)*/, 255,0,  TRUE}     //  35 
    , {doExprFinished, 255, 255,0,  FALSE}     //  36 
--- a/icu4c/source/common/rbbirpt.txt
+++ b/icu4c/source/common/rbbirpt.txt
@ -129,7 +129,7 @@ expr-cont:
    '$'                     term                                    doExprCatOperator
    '.'                     term                                    doExprCatOperator
    '/'                     look-ahead                              doExprCatOperator
-    '{'                     tag-open                                doExprCatOperator
+    '{'                  n  tag-open                                doExprCatOperator
    '|'                  n  term                                    doExprOrOperator
    ')'                  n  pop                                     doExprRParen
    default                 pop                                     doExprFinished
--- a/icu4c/source/common/rbbiscan.cpp
+++ b/icu4c/source/common/rbbiscan.cpp
@ -443,7 +443,7 @@ UBool RBBIRuleScanner::doParseActions(EParseAction action,
    case doStartTagValue:
        // Scanned a '{', the opening delimiter for a tag value within a rule.
        n = pushNewNode(RBBINode::tag);
-        n->fVal   = 0;
+        n->fVal      = 0;
        n->fFirstPos = fScanIndex;
        n->fLastPos  = fNextIndex;
        break;
@ -451,13 +451,15 @@ UBool RBBIRuleScanner::doParseActions(EParseAction action,
    case doTagDigit:
        // Just scanned a decimal digit that's part of a tag value
        {
+            n = fNodeStack[fNodeStackPtr];
            uint32_t v = u_charDigitValue(fC.fChar);
            assert(v >= 0);
-            n->fVal *= v;
+            n->fVal = n->fVal*10 + v;
            break;
        }

    case doTagValue:
+        n = fNodeStack[fNodeStackPtr];
        n->fLastPos = fNextIndex;
        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
        break;
@ -952,6 +954,19 @@ void RBBIRuleScanner::parse() {

    }

+    //
+    // If there were NO user specified reverse rules, set up the equivalent of ".*;"
+    //
+    if (fRB->fReverseTree == NULL) {
+        fRB->fReverseTree  = pushNewNode(RBBINode::opStar);
+        RBBINode  *operand = pushNewNode(RBBINode::setRef);
+        findSetFor(kAny, operand);
+        fRB->fReverseTree->fLeftChild = operand;
+        operand->fParent              = fRB->fReverseTree;
+        fNodeStackPtr -= 2;
+    }
+
+
    //
    // Parsing of the input RBBI rules is complete.
    // We now have a parse tree for the rule expressions
--- a/icu4c/source/common/rbbitblb.cpp
+++ b/icu4c/source/common/rbbitblb.cpp
@ -111,6 +111,7 @@ void  RBBITableBuilder::build() {
    buildStateTable();
    flagAcceptingStates();
    flagLookAheadStates();
+    flagTaggedStates();
    if (fRB->fDebugEnv && strstr(fRB->fDebugEnv, "states")) {printStates();};

 }
--- a/icu4c/source/common/unicode/rbbi.h
+++ b/icu4c/source/common/unicode/rbbi.h
@ -201,7 +201,7 @@ protected:
    //
    RBBIDataWrapper    *fData;
    UTrie              *fCharMappings;
-    int16_t             fLastBreakStatus;
+    int32_t             fLastBreakTag;      // Rule {tag} value for the most recent match.

    //
    // Counter for the number of characters encountered with the "dictionary"
@ -414,7 +414,7 @@ protected:
     * within brackets, {123}, for example.  For rules that do not specify a
     * status, a default value of 0 is returned.
     */
-    virtual int16_t getRuleStatus() const;
+    virtual int32_t getRuleStatus() const;

    /**
     * Returns a unique class ID POLYMORPHICALLY.  Pure virtual override.
@ -446,17 +446,20 @@ protected:


    /**
-     * Return the flattened form of compiled break rules,
+     * Return the binary form of compiled break rules,
     * which can then be used to create a new break iterator at some
     * time in the future.  Creating a break iterator in this way
     * is much faster than building one from the source form of the
     * break rules.
     *
-     * @return   A pointer to the flattened rule data.  The storage
+     * The binary data can only be used with the same version of ICU
+     *  and on the same platform type (processor endian-ness).
+     *
+     * @return   A pointer to the binary (compiled) rule data.  The storage
     *           belongs to the RulesBasedBreakIterator object, no the
     *           caller, and must not be modified or deleted.
     */
-    virtual const uint8_t *getFlattenedData(uint32_t *length);
+    virtual const uint8_t *getBinaryRules(uint32_t &length);


 #ifdef RBBI_DEBUG
--- a/icu4c/source/test/intltest/rbbiapts.cpp
+++ b/icu4c/source/test/intltest/rbbiapts.cpp
@ -610,6 +610,7 @@ void RBBIAPITest::TestBuilder() {
         bi->setText(testString1);
         doBoundaryTest(*bi, testString1, bounds1);
     }
+     delete bi;
 }


--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -348,7 +348,7 @@ void RBBITest::TestDefaultRuleBasedWordIteration()
 //    delete rbbi;
 }
 //--------------------------------------------------------------------
-//tests default rules based word iteration
+//tests default rules based sentence iteration
 //--------------------------------------------------------------------
 static const UChar kParagraphSeparator[] = {0x2029, 0};
 static const UChar kLineSeparator[]      = {0x2028, 0};
@ -766,6 +766,53 @@ void RBBITest::TestTitleBreak()
    delete titleData;
 }

+
+//-----------------------------------------------------------------------------------
+//
+//   Test for status {tag} return value from break rules.
+//        TODO:  a more thorough test.
+//
+//-----------------------------------------------------------------------------------
+void RBBITest::TestStatusReturn() {
+     UnicodeString rulesString1 = "$Letters = [:L:];\n"
+                                  "$Numbers = [:N:];\n"
+                                  "$Letters+{1};\n"
+                                  "$Numbers+{2};\n"
+                                  "Help\\ {4}/me\\!;\n"
+                                  "[^$Letters $Numbers];\n"
+                                  "!.*;\n";
+     UnicodeString testString1  = "abc123..abc Help me Help me!";
+                                // 01234567890123456789012345678
+     int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
+     int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
+
+     UErrorCode status=U_ZERO_ERROR;
+     UParseError    parseError;
+     
+     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
+     if(U_FAILURE(status)) {
+         errln("FAIL : in construction");
+     } else {
+         int32_t  pos;
+         int32_t  i = 0;
+         bi->setText(testString1);
+         for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
+             if (pos != bounds1[i]) {
+                 errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
+                 break;
+             }
+
+             int tag = bi->getRuleStatus();
+             if (tag != brkStatus[i]) {
+                 errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
+                 break;
+             }
+             i++;
+         }
+     }
+     delete bi;
+}
+
 /*
 //Bug: if there is no word break before and after danda when it is followed by a space
 void RBBITest::TestDanda()
@ -1039,6 +1086,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
            if(exec) TestHindiWordBreak();                     break;
        case 6: name = "TestTitleBreak";
            if(exec) TestTitleBreak();                         break;
+        case 7: name = "TestStatusReturn";
+            if(exec) TestStatusReturn();                       break;

 //      case 6: name = "TestDanda()";
 //           if(exec) TestDanda();                             break;
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@ -55,6 +55,12 @@ public:
     * Tests Title Case break iteration
     **/  
    void TestTitleBreak(void);
+
+    /**
+     * Tests rule status return values
+     **/  
+    void TestStatusReturn();
+
    /**
    * Test Hindi Danda i.e make sure we have a break point before and after danda 
    **/ 
--- a/icu4c/source/tools/genbrk/genbrk.cpp
+++ b/icu4c/source/tools/genbrk/genbrk.cpp
@ -191,7 +191,7 @@ int  main(int argc, char **argv) {
    //
    uint32_t        outDataSize;
    const uint8_t  *outData;
-    outData = bi->getFlattenedData(&outDataSize);
+    outData = bi->getBinaryRules(outDataSize);


    //