ICU-2093 Update LineBreak tests (work in progress)

X-SVN-Rev: 12032
2003-05-21 06:07:18 +00:00 · 2003-05-21 06:07:18 +00:00 · d11f9e993b
commit d11f9e993b
parent 26640c070f
2 changed files with 7 additions and 314 deletions
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -236,170 +236,9 @@ static const int T_IDEO   = 400;
 #define deadSA   "\\u0938\\u094d"
 #define visarga  "\\u0903"                /*devanagari visarga looks like a english colon*/
 // Character-boundary test over Devanagari (Hindi) sample text.
 //
 // Fills a BITestData table through a sequence of ADD_DATACHUNK calls (every
 // chunk in this test passes 0 as its third argument), creates a character
 // break iterator for the default locale, and hands both to
 // generalIteratorTest().  If the UErrorCode is in a failure state after the
 // iterator is created, the test logs "FAIL : in construction" and returns
 // without running the comparison.
 //
 // NOTE(review): ADD_DATACHUNK, BITestData and generalIteratorTest() are defined
 // outside this section.  Presumably each chunk is the text expected between two
 // adjacent boundaries, which would make the call order the test data itself --
 // confirm against their definitions before reordering anything.
 void RBBITest::TestHindiCharacterBreak()
 {
    UErrorCode status= U_ZERO_ERROR;
    BITestData hindicharData(status);
    ADD_DATACHUNK(hindicharData, NULL, 0, status);           // Break at start of data
    // Devanagari characters for Hindi support
    ADD_DATACHUNK(hindicharData, "\\u0906", 0, status);                    // Devanagari AA
    // Hindi character break should make sure that it
    // doesn't break in-between a vowel sign and a chandrabindu
    ADD_DATACHUNK(hindicharData, "\\u000a", 0, status);                   // Force break so following can appear stand-alone.
    ADD_DATACHUNK(hindicharData, "\\u093e\\u0901", 0, status);            // Devanagari vowel sign AA + chandrabindu
    ADD_DATACHUNK(hindicharData, "\\u0906\\u0901", 0, status);            // Devanagari AA + chandrabindu
    ADD_DATACHUNK(hindicharData, "\\u0915", 0, status);                   // Devanagari KA
    ADD_DATACHUNK(hindicharData, "\\u093e\\u0901", 0, status);            // Devanagari AA vowel sign + chandrabindu
    ADD_DATACHUNK(hindicharData, "\\u0916\\u0947", 0, status);              // Devanagari KHA + vowel sign E
    ADD_DATACHUNK(hindicharData, "\\u0938\\u0941\\u0902", 0, status);        // Devanagari SA + vowel sign U + anusvara (bindu)
    ADD_DATACHUNK(hindicharData, "\\u0926", 0, status);                    // Devanagari consonant DA
    ADD_DATACHUNK(hindicharData, "\\u0930", 0, status);                    // Devanagari consonant RA
    ADD_DATACHUNK(hindicharData, "\\u0939", 0, status);                    // Devanagari consonant HA, as its own chunk
    // The dependent vowel sign below is a separate chunk from the HA above.
    // The original comment called it "AI"; NOTE(review): U+094C appears in the
    // Unicode charts as VOWEL SIGN AU -- confirm which was intended.
    ADD_DATACHUNK(hindicharData, "\\u094c", 0, status);                    // dependent vowel sign, as its own chunk
    ADD_DATACHUNK(hindicharData, "\\u0964", 0, status);                    // Devanagari danda
    ADD_DATACHUNK(hindicharData, "\\u0950", 0, status);                    // Devanagari OM
    ADD_DATACHUNK(hindicharData, "\\u0915\\u0943", 0, status);              // Devanagari KA + dependent vowel RI -> KRI
    // Dependent half-forms.   2002-8-7:  New Char Break rules no longer join the half-sequences.
    // The half-form prefix macro of each chunk below is commented out, so only
    // the base consonant is added; the trailing comments describe the
    // combination the chunk originally stood for.
    ADD_DATACHUNK(hindicharData, /* halfSA */ "\\u0924", 0, status);             // (halfSA +) base consonant TA -> STA
    ADD_DATACHUNK(hindicharData, /* halfSA */ "\\u0925", 0, status);             // (halfSA +) base consonant THA -> STHA
    ADD_DATACHUNK(hindicharData, /* halfSA */ "\\u092e", 0, status);             // (halfSA +) base consonant MA -> SMA
    ADD_DATACHUNK(hindicharData, /* halfCHA */ "\\u091b", 0, status);            // (halfCHA +) base consonant CHHA -> CHHHA
    ADD_DATACHUNK(hindicharData, /* halfNA */ "\\u0917", 0, status);             // (halfNA +) base consonant GA -> NGA
    // The single halfPA chunk (PA + virama + zero width joiner + base consonant YA -> PYA),
    // kept below as a comment, is replaced by the three chunks that follow it.
    // ADD_DATACHUNK(hindicharData, "\\u092a\\u094d\\u200d\\u092f", 0, status);   //halfPA(PA+virama+zerowidthjoiner+base consonant YA->PYA
    ADD_DATACHUNK(hindicharData, "\\u092a\\u094d", 0, status);   // PA + virama
    ADD_DATACHUNK(hindicharData, "\\u200d", 0, status);          // zero width joiner
    ADD_DATACHUNK(hindicharData, "\\u092f", 0, status);          // base consonant YA
    // Consonant RA rules ----------
    // If the dead consonant RA precedes either a consonant or an independent vowel,
    // then it is replaced by its superscript non-spacing mark.
    // (The deadRA prefix is commented out in the chunks below.)
    ADD_DATACHUNK(hindicharData, /* deadRA */  "\\u0915", 0, status);             // (deadRA +) Devanagari consonant KA -> KA + superRA
    ADD_DATACHUNK(hindicharData, /* deadRA */  "\\u0923", 0, status);             // (deadRA +) Devanagari consonant NNA -> NNA + superRA
    ADD_DATACHUNK(hindicharData, /* deadRA */  "\\u0917", 0, status);             // (deadRA +) Devanagari consonant GA -> GA + superRA
    //  ADD_DATACHUNK(hindicharData, deadRA+ "\\u0960", 0);           //deadRA+devanagari consonant RRI->RRI+superRA
    // If any dead consonant (other than dead RA) precedes the consonant RA, then
    // it is replaced with its nominal form and RA is replaced by the subscript non-spacing mark.
    // (The dead-consonant prefix is commented out in the chunks below.)
    ADD_DATACHUNK(hindicharData, /* deadPHA */  "\\u0930", 0, status);            // (deadPHA +) Devanagari consonant RA -> PHA + subRA
    ADD_DATACHUNK(hindicharData, /* deadPA */  "\\u0930", 0, status);             // (deadPA +) Devanagari consonant RA -> PA + subRA
    ADD_DATACHUNK(hindicharData, /* deadTTHA */  "\\u0930", 0, status);           // (deadTTHA +) Devanagari consonant RA -> TTHA + subRA
    ADD_DATACHUNK(hindicharData, /* deadTA */  "\\u0930", 0, status);             // (deadTA +) RA -> TRA
    // The single deadSHA + RA chunk, kept below as a comment, is replaced by the two chunks after it.
    // ADD_DATACHUNK(hindicharData, "\\u0936\\u094d\\u0930", 0, status);         //deadSHA(SHA+virama)+RA->SHRA
    ADD_DATACHUNK(hindicharData, "\\u0936\\u094d", 0, status);         // deadSHA (SHA + virama)
    ADD_DATACHUNK(hindicharData, "\\u0930", 0, status);         // RA
    // Conjunct ligatures
    //    2002-08-7   virama no longer forces joining.
    // The single deadKA + SSHA chunk, kept below as a comment, is replaced by the two chunks after it.
    // ADD_DATACHUNK(hindicharData, "\\u0915\\u094d\\u0937", 0, status);         //deadKA(KA+virama) followed by SSHA wraps up into a single character KSSHA
    ADD_DATACHUNK(hindicharData, "\\u0915\\u094d", 0, status);         // deadKA (KA + virama)
    ADD_DATACHUNK(hindicharData, "\\u0937", 0, status);         // SSHA
    ADD_DATACHUNK(hindicharData, /* deadTA */ "\\u0924", 0, status);              // (deadTA +) TA wraps up into glyph TTHA
    //ADD_DATACHUNK(hindicharData, "\\u0926\\u094d\\u0935", 0, status);         //deadDA(DA+virama)+VA wraps up into DVA
    //ADD_DATACHUNK(hindicharData, "\\u091c\\u094d\\u091e", 0, status);         //deadJA(JA+virama)+NYA wraps up into JNYA
    // Create the iterator under test; it is deleted below after the comparison.
    RuleBasedBreakIterator *e=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
    if(U_FAILURE(status)){
        errln("FAIL : in construction");
        return;
    }
    generalIteratorTest(*e, hindicharData);
    delete e;
 }
 // Word-boundary test over Devanagari (Hindi) sample text.
 //
 // Fills a BITestData table through a sequence of ADD_DATACHUNK calls, creates
 // a word break iterator for the default locale, and hands both to
 // generalIteratorTest().  In this table the Devanagari letter chunks pass 200
 // as the third argument, the Devanagari digit chunks pass 100, and the
 // punctuation, space, currency-sign and line-ending chunks pass 0.
 // If the UErrorCode is in a failure state after the iterator is created, the
 // test logs "FAIL : in construction" and returns without running the comparison.
 //
 // NOTE(review): ADD_DATACHUNK, BITestData, generalIteratorTest() and the
 // halfNA / halfSA / deadPA / deadRA / deadTA macros are defined outside this
 // section.  Presumably each chunk is the text expected between two adjacent
 // boundaries and the third argument is the expected status tag for that
 // boundary -- confirm against their definitions.
 void RBBITest::TestHindiWordBreak()
 {
    UErrorCode status= U_ZERO_ERROR;
    BITestData hindiWordData(status);
    // Hindi
    ADD_DATACHUNK(hindiWordData, NULL, 0, status);           // Break at start of data
    ADD_DATACHUNK(hindiWordData, "\\u0917\\u092a\\u00ad\\u0936\\u092a", 200, status);
    ADD_DATACHUNK(hindiWordData, "!", 0, status);
    ADD_DATACHUNK(hindiWordData, "\\u092f\\u0939", 200, status);
    ADD_DATACHUNK(hindiWordData, " ", 0, status);
    ADD_DATACHUNK(hindiWordData, "\\u0939\\u093f" halfNA "\\u0926\\u0940", 200, status);
    ADD_DATACHUNK(hindiWordData, " ", 0, status);
    ADD_DATACHUNK(hindiWordData, "\\u0939\\u0948", 200, status);
    // Danda is similar to a full stop; it is a Hindi phrase separator.
    // Make sure it breaks before danda and after danda when it is followed by a space.
    // The danda chunk itself is disabled because the iterator did not break there:
    //ADD_DATACHUNK(hindiWordData, "\\u0964", 0);   //fails here doesn't break at danda
    ADD_DATACHUNK(hindiWordData, " ", 0, status);
    ADD_DATACHUNK(hindiWordData, "\\u0905\\u093e\\u092a", 200, status);
    ADD_DATACHUNK(hindiWordData, " ", 0, status);
    ADD_DATACHUNK(hindiWordData, "\\u0938\\u093f\\u0916\\u094b\\u0917\\u0947", 200, status);
    ADD_DATACHUNK(hindiWordData, "?", 0, status);
    ADD_DATACHUNK(hindiWordData, "\n", 0, status);
    ADD_DATACHUNK(hindiWordData, ":", 0, status);
    ADD_DATACHUNK(hindiWordData, deadPA "\\u0930\\u093e\\u092f" visarga, 200, status);    // no break before visarga
    ADD_DATACHUNK(hindiWordData, " ", 0, status);
    ADD_DATACHUNK(hindiWordData, "\\u0935" deadRA "\\u0937\\u093e", 200, status);
    ADD_DATACHUNK(hindiWordData, "\r\n", 0, status);
    ADD_DATACHUNK(hindiWordData, deadPA  "\\u0930\\u0915\\u093e\\u0936", 200, status);     // deadPA + RA + KA + vowel AA + SHA -> prakash
    ADD_DATACHUNK(hindiWordData, ",", 0, status);
    ADD_DATACHUNK(hindiWordData, "\\u0924\\u0941\\u092e\\u093e\\u0930\\u094b", 200, status);
    ADD_DATACHUNK(hindiWordData, " ", 0, status);
    ADD_DATACHUNK(hindiWordData, "\\u092e\\u093f" deadTA "\\u0930", 200, status);       // MA + vowel I + deadTA + RA
    ADD_DATACHUNK(hindiWordData, " ", 0, status);
    ADD_DATACHUNK(hindiWordData, "\\u0915\\u093e", 200, status);
    ADD_DATACHUNK(hindiWordData, " ", 0, status);
    ADD_DATACHUNK(hindiWordData, "\\u092a" deadTA "\\u0930", 200, status);            // PA + deadTA + RA
    ADD_DATACHUNK(hindiWordData, " ", 0, status);
    ADD_DATACHUNK(hindiWordData, "\\u092a\\u095d\\u094b", 200, status);
    // Second disabled danda chunk, for the same reason as the one above:
    // ADD_DATACHUNK(hindiWordData, "\\u0964", 0); //fails here doesn't break at danda
    ADD_DATACHUNK(hindiWordData, " ", 0, status);
    ADD_DATACHUNK(hindiWordData, deadSA deadTA "\\u0930\\u093f", 200, status);       // deadSA + deadTA + RA + vowel I -> sthri
    ADD_DATACHUNK(hindiWordData, ".", 0, status);
    ADD_DATACHUNK(hindiWordData, " ", 0, status);
    ADD_DATACHUNK(hindiWordData, "\\u0968\\u0966.\\u0969\\u096f", 100, status);            // Hindi numbers
    ADD_DATACHUNK(hindiWordData, " ", 0, status);
    ADD_DATACHUNK(hindiWordData, "\\u0967\\u0966\\u0966.\\u0966\\u0966", 100, status);     // number followed by the currency-sign chunk below (post-numeric)
    ADD_DATACHUNK(hindiWordData, "\\u20a8", 0, status);                                   // currency sign U+20A8 as its own chunk
    ADD_DATACHUNK(hindiWordData, "\\u0967,\\u0967\\u0966\\u0966.\\u0966\\u0966", 100, status); // number preceded by the currency-sign chunk above (pre-numeric)
    ADD_DATACHUNK(hindiWordData, " ", 0, status);
    ADD_DATACHUNK(hindiWordData, "\\u0905\\u092e\\u091c", 200, status);
    ADD_DATACHUNK(hindiWordData, "\n", 0, status);
    ADD_DATACHUNK(hindiWordData, halfSA "\\u0935\\u0924\\u0902" deadTA "\\u0930", 200, status);
    ADD_DATACHUNK(hindiWordData, "\r", 0, status);
    // Create the iterator under test; it is deleted below after the comparison.
    RuleBasedBreakIterator *e=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status);
    if(U_FAILURE(status)){
        errln("FAIL : in construction");
        return;
    }
    generalIteratorTest(*e, hindiWordData);
    delete e;
 }
 void RBBITest::TestTitleBreak()
 {
    UErrorCode status= U_ZERO_ERROR;
    RuleBasedBreakIterator* titleI=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createTitleInstance(Locale::getDefault(), status);
    if(U_FAILURE(status)){
          errln("FAIL : in construction");
          return;
    }
    BITestData titleData(status);
    ADD_DATACHUNK(titleData, NULL, 0, status);           // Break at start of data
    ADD_DATACHUNK(titleData, "   ", 0, status);
    ADD_DATACHUNK(titleData, "This ", 0, status);
    ADD_DATACHUNK(titleData, "is ", 0, status);
    ADD_DATACHUNK(titleData, "a ", 0, status);
    ADD_DATACHUNK(titleData, "simple ", 0, status);
    ADD_DATACHUNK(titleData, "sample ", 0, status);
    ADD_DATACHUNK(titleData, "sentence. ", 0, status);
    ADD_DATACHUNK(titleData, "This ", 0, status);
    generalIteratorTest(*titleI, titleData);
    delete titleI;
 }
 //-----------------------------------------------------------------------------------
@ -682,12 +521,12 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
             break;
        case 3: name = "";
             break;
-        case 4: name = "TestHindiCharacterBreak";
+        case 4: name = "";
-            if(exec) TestHindiCharacterBreak();                break;
+            break;
-        case 5: name = "TestHindiWordBreak";
+        case 5: name = "";
-            if(exec) TestHindiWordBreak();                     break;
+            break;
-        case 6: name = "TestTitleBreak";
+        case 6: name = "";
-            if(exec) TestTitleBreak();                         break;
+            break;
        case 7: name = "TestStatusReturn";
            if(exec) TestStatusReturn();                       break;
@ -1042,139 +881,6 @@ void RBBITest::TestSentenceInvariants()
 }
 // Invariant checks for the line break iterator.
 //
 // The entire body is currently compiled out with "#if 0" (it predates the
 // TR 14 based rules), so as built this function does nothing.  When enabled
 // it:
 //   1. creates a line break iterator for Locale::getUS() and runs
 //      doBreakInvariantTest() and doOtherInvariantTest() over the canned test
 //      characters plus some extra punctuation and CJK characters;
 //   2. for every three-character string  <test char><non-breaking char><test char>
 //      reports an error if a boundary is returned at offset 1 or 2 (except
 //      a boundary at offset 1 that follows a space);
 //   3. for every three-character string  <test char><dash><test char>  looks
 //      for a boundary at offset 2; the error report for a missing boundary is
 //      itself commented out, so this part reports nothing.
 //
 // The test gives up after 75 reported errors.  The iterator is released on
 // that path as well as at the normal end of the function.
 void RBBITest::TestLineInvariants()
 {
 #if 0        // TestLineInvariants() needs to be updated to reflect TR 14 rules.
    UErrorCode status = U_ZERO_ERROR;
    BreakIterator *e = BreakIterator::createLineInstance(Locale::getUS(), status);
    if (U_FAILURE(status))
    {
        errln("Failed to create the BreakIterator for default locale in TestLineInvariants.\n");
        return;
    }
    UnicodeString s = CharsToUnicodeString(".,;:\\u3001\\u3002\\u3041\\u3042\\u3043\\u3044\\u3045\\u30a3\\u4e00\\u4e01\\u4e02");
    UnicodeString testChars = *cannedTestChars + s;
    doBreakInvariantTest(*e, testChars);
    doOtherInvariantTest(*e, testChars);
    int32_t errCount = 0, testCharsLen, noBreakLen, dashesLen;
    int32_t i, j, k;
    // in addition to the other invariants, a line-break iterator should make sure that:
    // it doesn't break around the non-breaking characters,
    // EXCEPT breaking after a space takes precedence over not breaking before
    //        a non-breaking char.  So says TR 14.
    UnicodeString noBreak = CharsToUnicodeString("\\u00a0\\u2007\\u2011\\ufeff");
    UnicodeString work("aaa");
    testCharsLen = testChars.length();
    noBreakLen = noBreak.length();
    for (i = 0; i < testCharsLen; i++) {
        UChar c = testChars[i];
        // Line/paragraph terminators and control characters are not used as the leading character.
        if (c == '\r' || c == '\n' || c == 0x2029 || c == 0x2028 || c == 0x0003 ||
            u_charType(c) == U_CONTROL_CHAR) {
            continue;
        }
        work[0] = c;
        for (j = 0; j < noBreakLen; j++) {
            work[1] = noBreak[j];
            for (k = 0; k < testCharsLen; k++) {
                work[2] = testChars[k];
                e->setText(work);
                for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) {
                    // Only the two interior offsets are checked.  Skipping the
                    // others up front also keeps work[l - 1] and work[l] inside
                    // the three-character string (l == 0 would index -1 and
                    // l == 3 would index one past the end).
                    if (l != 1 && l != 2) {
                        continue;
                    }
                    UChar c1 = work[l - 1];
                    UChar c2 = work[l];
                    // A break right after a leading space is allowed.
                    if (c1 == 0x20 && l == 1) {
                        continue;
                    }
                    errln("Got break between U+" + UCharToUnicodeString(c1) + 
                        " and U+" + UCharToUnicodeString(c2));
                    errCount++;
                    if (errCount >= 75) {
                        // Release the iterator before bailing out; returning
                        // without this would leak it.
                        delete e;
                        return;
                    }
                }
            }
        }
    }
    // it does break after hyphens (Rule 15B from TR 14
    //  (unless they're followed by a digit, a non-spacing mark,
    // a currency symbol, a non-breaking space, or a line or paragraph separator
    //  or something of class BA, HY, NS, QU, GL, CL, EX, IS or SY from TR14 when the hyphen is U+002D
    // This test is sufficiently screwed up that I'm largely disabling it.  TODO:  fix it.  06/12/2002  AGH
    //
    UnicodeString dashes = CharsToUnicodeString("-\\u00ad\\u2010\\u2012\\u2013\\u2014");
    dashesLen = dashes.length();
    for (i = 0; i < testCharsLen; i++) {
        work[0] = testChars[i];
        for (j = 0; j < dashesLen; j++) {
            UChar c1 = work[1] = dashes[j];
            for (k = 0; k < testCharsLen; k++) {
                UChar c2 = work[2] = testChars[k];
                int8_t type = u_charType(c2);
                // Skip following characters after which no break is expected.
                if (type == U_DECIMAL_DIGIT_NUMBER ||
                    type == U_OTHER_NUMBER ||
                    type == U_NON_SPACING_MARK ||
                    type == U_ENCLOSING_MARK ||
                    type == U_CURRENCY_SYMBOL ||
                    type == U_SPACE_SEPARATOR ||
                    type == U_DASH_PUNCTUATION ||
                    type == U_CONTROL_CHAR ||
                    type == U_FORMAT_CHAR ||
                    c2 == '\n'   || c2 == '\r'   || c2 == 0x2028 || c2 == 0x2029 ||
                    c2 == 0x0003 || c2 == 0x00a0 || c2 == 0x2007 || c2 == 0x2011 ||
                    c2 == 0xfeff)
                {
                    continue;
                }
                // If c1 == hyphen-minus, and ...
                if (c1 == 0x002d  &&  (
                       c2 == 0x0021  ||   // !
                       c2 == 0x002c  ||   // ,
                       c2 == 0x002d  ||   // -
                       c2 == 0x002e  ||   // .   (TR 14 class IS)
                       c2 == 0x0029  ||   // )
                       c2 == 0x003a  ||   // :
                       c2 == 0x003b  ||   // ;   (TR 14 class IS)
                       c2 == 0x005d  ||   // ]
                       c2 == 0x007c  ||   // |   (TR 14 class BA, rule 15)
                       c2 == 0x007d  ||   // }
                       c2 == 0x0903  ||   // Devanagari sign visarga, combining, what's it doing in this test?
                       c2 == 0x093E  ||   // Devanagari , combining, what's it doing in this test?
                       c2 == 0x093F  ||   // Devanagari , combining, what's it doing in this test?
                       c2 == 0x0940  ||   // Devanagari , combining, what's it doing in this test?
                       c2 == 0x0949  ||   // Devanagari , combining, what's it doing in this test?
                       c2 == 0x0f3b  ||   // Tibetan closing bracket
                       c2 == 0x3001  ||   // CJK closing bracket
                       c2 == 0x3002       // CJK closing bracket
                      )) {
                    continue;
                }
                e->setText(work);
                // Look for a boundary directly after the dash (offset 2).
                UBool saw2 = FALSE;
                for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) {
                    if (l == 2) {
                        saw2 = TRUE;
                        break;
                    }
                }
                if (!saw2) {
                    // TODO:  This test is completely out of sync with the spec.  Fix it.
                    // errln("Didn't get break between U+" + UCharToUnicodeString(work[1]) + 
                    //    " and U+" + UCharToUnicodeString(work[2]));
                    // errCount++;
                    // if (errCount >= 75)
                    //    return;
                }
            }
        }
    }
    delete e;
 #endif
 }
 void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@ -35,19 +35,7 @@ public:
    virtual ~RBBITest();
    void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
-    /**
+ 
     * Tests Hindi(Devanagiri) character iteration
     **/  
    void TestHindiCharacterBreak(void);
    /**
     * Tests Hindi(Devanagiri) word iteration
     **/  
    void TestHindiWordBreak(void);
    /**
     * Tests Title Case break iteration
     **/  
    void TestTitleBreak(void);
    /**
     * Tests rule status return values
     **/  
@ -65,7 +53,6 @@ public:
    void TestSentenceInvariants();
    void TestCharacterInvariants();
    void TestWordInvariants();
    void TestLineInvariants();
    void TestEmptyString();
    void TestGetAvailableLocales();
    void TestGetDisplayName();