ICU-45 RBBI Bug: fix handling of \uffff in data to be iterated.

Add test for same. Add test for new line break test data file. X-SVN-Rev: 9464
2002-07-31 19:05:33 +00:00 · 2002-07-31 19:05:33 +00:00 · 2a51b28692
commit 2a51b28692
parent 7ab5b0d8dc
3 changed files with 230 additions and 7 deletions
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@ -573,7 +573,10 @@ int32_t RuleBasedBreakIterator::handleNext(void) {

      // loop until we reach the end of the text or transition to state 0
      for (;;) {
-        if (c == CharacterIterator::DONE ) {
+        if (c == CharacterIterator::DONE && fText->hasNext()==FALSE) {
+            // Note: CharacterIterator::DONE is 0xffff, which is also a legal
+            //       character value.  Check for DONE first, because it's quicker,
+            //       but also need to check fText->hasNext() to be certain.
            break;
        }
        // look up the current character's character category, which tells us
@ -667,9 +670,11 @@ continueOn:
    // a lookahead state, advance the break position to the lookahead position
    // (the theory here is that if there are no characters at all after the lookahead
    // position, that always matches the lookahead criteria)
-    if (c == CharacterIterator::DONE && lookaheadResult == fText->endIndex()) {
-        result          = lookaheadResult;
-        fLastBreakTag   = lookaheadTag;
+    if (c == CharacterIterator::DONE &&
+        fText->hasNext()==FALSE &&
+        lookaheadResult == fText->endIndex()) {
+            result          = lookaheadResult;
+            fLastBreakTag   = lookaheadTag;
    }


@ -723,7 +728,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {

    // loop until we reach the beginning of the text or transition to state 0
    for (;;) {
-        if (c == CharacterIterator::DONE) {
+        if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
            break;
        }

@ -806,7 +811,7 @@ continueOn:
    // Note:  the result postion isn't what is returned to the user by previous(),
    //        but where the implementation of previous() turns around and
    //        starts iterating forward again.
-    if (c == CharacterIterator::DONE) {
+    if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
        result = fText->startIndex();
    }
    fText->setIndex(result);
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -11,11 +11,13 @@


 #include "intltest.h"
+#include "unicode/utypes.h"
 #include "unicode/brkiter.h"
 #include "unicode/rbbi.h"
 #include "unicode/unicode.h"
+#include "unicode/uchar.h"
+#include "unicode/utf16.h"
 #include <stdio.h>
-#include "unicode/utypes.h"
 #include "rbbitst.h"
 #include <string.h>
 #include "unicode/schriter.h"
@ -271,6 +273,11 @@ void RBBITest::TestDefaultRuleBasedCharacterIteration()
    ADD_DATACHUNK(chardata, "\\udbff\\udfff", 0, status);
    ADD_DATACHUNK(chardata, "x", 0, status);

+    // 0xffff is a legal character, and should not stop the break iterator early.
+    //    (Requires special casing in implementation, which is why it gets a test.)
+    ADD_DATACHUNK(chardata, "\\uffff", 0, status);
+    ADD_DATACHUNK(chardata, "\\uffff", 0, status);
+

    if(U_FAILURE(status)){
        errln("FAIL : in BITestData construction");
@ -1153,6 +1160,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
            if(exec) TestTitleBreak();                         break;
        case 7: name = "TestStatusReturn";
            if(exec) TestStatusReturn();                       break;
+        case 8: name = "TestLineBreakData";
+            if(exec) TestLineBreakData();                       break;

 //      case 6: name = "TestDanda()";
 //           if(exec) TestDanda();                             break;
@ -1414,4 +1423,208 @@ void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestD
 }


+//
+//  Token level scanner for the Unicode Line Break Test Data file.
+//      Return the next token, as follows:
+//          >= 0:       a UChar32 character, scanned from hex in the file.
+//          -1:         a break position, a division sign in the file.
+//          -2:         end of rule.  A new line in the file.
+//          -3:         end of file.  No more rules.
+//          -4:         Error
+//
+//   The scanner
+//       strips comments, ('#' to end of line)
+//       Recognizes CR, CR/LF and LF as new lines.
+//       Skips over spaces, tabs and multiplication signs (0xD7, "don't break here") in the data.
+//
+//  State carried between calls to the line break test data scanner.
+struct ScanState {
+    int32_t     fPeekChar;   // Character read ahead but not yet consumed.
+                             //   Only meaningful while fPeeked is TRUE.
+    UBool       fPeeked;     // TRUE when fPeekChar holds the next character to scan.
+    int32_t     fLineNum;    // Number of new lines scanned so far.
+    FILE        *fFile;      // Input file.  Left NULL here; the caller opens and closes it.
+    // All members are given a defined starting value, including fPeekChar.
+    ScanState() :fPeekChar(0), fPeeked(FALSE), fLineNum(0), fFile(NULL) {}
+};
+
+//  Literal characters that are of interest.  In hex to keep EBCDIC based machines happy.
+//  The data itself is latin-1 on all platforms.
+//  The type is spelled out because standard C++ does not allow implicit int.
+static const int32_t chSpace  = 0x20;   // space
+static const int32_t chTab    = 0x09;   // horizontal tab
+static const int32_t chCR     = 0x0D;   // carriage return
+static const int32_t chLF     = 0x0A;   // line feed
+static const int32_t chHash   = 0x23;   // '#', starts a comment
+static const int32_t chMult   = 0xD7;   // latin-1 multiplication sign, no break here
+static const int32_t chDivide = 0xF7;   // latin-1 division sign, break expected here
+
+//  Scan and return the next token from the line break test data file.
+//     s        scanner state: the open file, the one character push-back
+//              slot, and the running new line count.
+//     returns  >= 0 for a character value scanned from hex digits,
+//              -1 for a break position, -2 for a new line, -3 for end of
+//              file, -4 for an unexpected character.
+static int32_t   nextLBDToken(ScanState *s) {
+    // Keep consuming latin-1 characters until one of them yields a token.
+    for (;;) {
+        // Take the pushed-back character if there is one, else read a fresh one.
+        int32_t ch;
+        if (s->fPeeked) {
+            s->fPeeked = FALSE;
+            ch = s->fPeekChar;
+        } else {
+            ch = getc(s->fFile);
+        }
+
+        switch (ch) {
+        case EOF:
+            // Nothing more in the file.
+            return -3;
+
+        case chSpace:
+        case chTab:
+        case chMult:
+            // White space, or the multiply sign that marks a no-break position.
+            //   Neither is reported to the caller; go look at the next character.
+            continue;
+
+        case chDivide:
+            // An expected break position.
+            return -1;
+
+        case chCR:
+            // CR, or CR/LF, is a single new line.  Read one character ahead;
+            //   an LF is swallowed, anything else is pushed back for the next call.
+            s->fLineNum++;
+            s->fPeekChar = getc(s->fFile);
+            if (s->fPeekChar != chLF) {
+                s->fPeeked = TRUE;
+            }
+            return -2;
+
+        case chLF:
+            s->fLineNum++;
+            return -2;
+
+        case chHash:
+            // Comment.  Discard up to, but not including, the line terminator
+            //   (or EOF), push that terminator back, and resume scanning with it.
+            do {
+                ch = getc(s->fFile);
+            } while (ch != EOF && ch != chCR && ch != chLF);
+            s->fPeekChar = ch;
+            s->fPeeked   = TRUE;
+            continue;
+
+        default:
+            break;
+        }
+
+        // Whatever is left must be the first digit of a hex character value.
+        int32_t digit = u_digit(ch, 16);
+        if (digit < 0) {
+            // Not a hex digit either.  Unexpected character.
+            return -4;
+        }
+
+        // Accumulate hex digits.  The first non-digit ends the number and is
+        //   pushed back so that the next call sees it.
+        int32_t value = digit;
+        for (ch = getc(s->fFile); (digit = u_digit(ch, 16)) >= 0; ch = getc(s->fFile)) {
+            value = (value << 4) + digit;
+        }
+        s->fPeekChar = ch;
+        s->fPeeked   = TRUE;
+        return value;
+    }
+}
+
+
+
+//
+//  TestLineBreakData
+//      Data driven test of the line break iterator.  Each line of the file
+//      "LBTest.txt" (opened with a relative path) is scanned with nextLBDToken()
+//      into a test string plus a list of expected break positions.  The break
+//      positions reported by the iterator are then compared against that list.
+//      When the data file cannot be opened the test is skipped with an
+//      informational message.
+//
+void RBBITest::TestLineBreakData() {
+
+    UErrorCode      status = U_ZERO_ERROR;
+    UnicodeString   testString;
+    UVector         expectedBreaks(status);
+    ScanState       ss;
+    int32_t         tok;
+
+    BreakIterator *bi = BreakIterator::createLineInstance(Locale::getDefault(), status);
+    if (U_FAILURE(status)) {
+        errln("Failure creating break iterator");
+        return;
+    }
+
+    const char * lbdfName = "LBTest.txt";
+
+    // Open the test data file.
+    //   TODO:  a proper way to handle this data.
+    ss.fFile = fopen(lbdfName, "rb");
+    if (ss.fFile == NULL) {
+        infoln("Unable to open Line Break Test Data file.  Skipping test.");
+        delete bi;      // Don't leak the iterator when the test is skipped.
+        return;
+    }
+
+    // Loop once per line from the test data file.
+    for (;;) {
+        // 1-based number of the line about to be scanned.  ss.fLineNum counts the
+        //   new lines consumed so far, so it only catches up once this line's
+        //   terminator has been read, and never does for a last line ended by EOF.
+        int32_t lineNum = ss.fLineNum + 1;
+
+        // Zero out test data from previous line.
+        testString.truncate(0);
+        expectedBreaks.removeAllElements();
+
+        // Read one test's (line's) worth of data from the file.
+        //   Loop once per token on the input file line.
+        for(;;)  {
+            tok = nextLBDToken(&ss);
+
+            // If we scanned a character number in the file.
+            //   save it in the test data array.
+            if (tok >= 0) {
+                testString.append((UChar32)tok);
+                continue;
+            }
+
+            // If we scanned a break position in the data, record it.
+            if (tok == -1) {
+                expectedBreaks.addElement(testString.length(), status);
+                continue;
+            }
+
+            // If we scanned a new line, or EOF
+            //    drop out of scan loop and run the test case.
+            if (tok == -2 || tok == -3) {break;};
+
+            // None of above.  Error.
+            errln("Failure:  Unrecognized data format,  test file line %d", lineNum);
+            break;
+        }
+
+        // If this line from the test data file actually contained test data,
+        //   run the test.
+        if (testString.length() > 0) {
+            int32_t pos;                 // Break Position in the test string
+            int32_t expectedI = 0;       // Index of expected break position in vector of same.
+            int32_t expectedPos;         // Expected break position (index into test string)
+
+            bi->setText(testString);
+            pos = bi->first();       // TODO:  break iterators always return a match at pos 0.
+            pos = bi->next();        //        Line Break TR says no match at position 0.
+                                     //        Resolve.
+
+            for (; pos != BreakIterator::DONE; ) {
+                // The iterator reported more breaks than the data lists.  Check this
+                //   before indexing, so that we never read past the end of the vector.
+                if (expectedI >= expectedBreaks.size()) {
+                    errln("Failure: Test file line %d, unexpected break found at position %d",
+                        lineNum, pos);
+                    break;
+                }
+                expectedPos = expectedBreaks.elementAti(expectedI);
+                if (pos < expectedPos) {
+                    errln("Failure: Test file line %d, unexpected break found at position %d",
+                        lineNum, pos);
+                    break;
+                }
+                if (pos > expectedPos) {
+                    errln("Failure: Test file line %d, failed to find break at position %d",
+                        lineNum, expectedPos);
+                    break;
+                }
+                pos = bi->next();
+                expectedI++;
+            }
+        }
+
+        // If we've hit EOF on the input file, we're done.
+        if (tok == -3) {
+            break;
+        }
+
+    }
+
+    fclose(ss.fFile);
+    delete bi;
+
+}

--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@ -62,6 +62,11 @@ public:
     **/  
    void TestStatusReturn();

+    /**
+     * Run the Unicode Line Break test data.
+     **/  
+    void TestLineBreakData();
+
    /**
    * Test Hindi Danda i.e make sure we have a break point before and after danda 
    **/