ICU-2093 Update LineBreak tests (work in progress)

X-SVN-Rev: 12032
2003-05-21 06:07:18 +00:00 · 2003-05-21 06:07:18 +00:00 · d11f9e993b
commit d11f9e993b
parent 26640c070f
2 changed files with 7 additions and 314 deletions
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -236,170 +236,9 @@ static const int T_IDEO   = 400;
 #define deadSA   "\\u0938\\u094d"
 #define visarga  "\\u0903"                /*devanagari visarga looks like a english colon*/

-void RBBITest::TestHindiCharacterBreak()
-{
-    UErrorCode status= U_ZERO_ERROR;
-    BITestData hindicharData(status);
-    ADD_DATACHUNK(hindicharData, NULL, 0, status);           // Break at start of data
-    //devanagari characters for Hindi support
-    ADD_DATACHUNK(hindicharData, "\\u0906", 0, status);                    //devanagari AA
-
-    //hindi character break should make sure that it
-    // doesn't break in-between a vowelsign and a chandrabindu
-
-    ADD_DATACHUNK(hindicharData, "\\u000a", 0, status);                   // Force break so following can appear stand-alone.
-    ADD_DATACHUNK(hindicharData, "\\u093e\\u0901", 0, status);            //devanagari vowelsign AA+ chandrabindu
-    ADD_DATACHUNK(hindicharData, "\\u0906\\u0901", 0, status);            // Devanagari AA + chandrabindu
-    ADD_DATACHUNK(hindicharData, "\\u0915", 0, status);                   // Devanagari KA 
-    ADD_DATACHUNK(hindicharData, "\\u093e\\u0901", 0, status);            // Devanagari AA vowelsign + chandrabindu


-    ADD_DATACHUNK(hindicharData, "\\u0916\\u0947", 0, status);              //devanagari KHA+vowelsign E
-    ADD_DATACHUNK(hindicharData, "\\u0938\\u0941\\u0902", 0, status);        //devanagari SA+vowelsign U + anusvara(bindu)
-    ADD_DATACHUNK(hindicharData, "\\u0926", 0, status);                    //devanagari consonant DA
-    ADD_DATACHUNK(hindicharData, "\\u0930", 0, status);                    //devanagari consonant RA
-    ADD_DATACHUNK(hindicharData, "\\u0939", 0, status);                    //devanagari consonant HA+
-    ADD_DATACHUNK(hindicharData, "\\u094c", 0, status);                    //           +dependent vowel sign AI
-    ADD_DATACHUNK(hindicharData, "\\u0964", 0, status);                    //devanagari danda
-    ADD_DATACHUNK(hindicharData, "\\u0950", 0, status);                    //devanagari OM
-    ADD_DATACHUNK(hindicharData, "\\u0915\\u0943", 0, status);              //devanagari KA+dependent vowel RI->KRI

-    //dependent half-forms.   2002-8-7:  New Char Break rules no longer join the half-sequences.
-    ADD_DATACHUNK(hindicharData, /* halfSA */ "\\u0924", 0, status);             //halfSA+base consonant TA->STA
-    ADD_DATACHUNK(hindicharData, /* halfSA */ "\\u0925", 0, status);             //halfSA+base consonant THA->STHA
-    ADD_DATACHUNK(hindicharData, /* halfSA */ "\\u092e", 0, status);             //halfSA+base consonant MA->SMA
-    ADD_DATACHUNK(hindicharData, /* halfCHA */ "\\u091b", 0, status);            //halfCHA+base consonant CHHA->CHHHA
-    ADD_DATACHUNK(hindicharData, /* halfNA */ "\\u0917", 0, status);             //halfNA+base consonant GA->NGA
-    // ADD_DATACHUNK(hindicharData, "\\u092a\\u094d\\u200d\\u092f", 0, status);   //halfPA(PA+virama+zerowidthjoiner+base consonant YA->PYA
-    ADD_DATACHUNK(hindicharData, "\\u092a\\u094d", 0, status);   //halfPA(PA+virama+zerowidthjoiner+base consonant YA->PYA
-    ADD_DATACHUNK(hindicharData, "\\u200d", 0, status);          //halfPA(PA+virama+zerowidthjoiner+base consonant YA->PYA
-    ADD_DATACHUNK(hindicharData, "\\u092f", 0, status);          //halfPA(PA+virama+zerowidthjoiner+base consonant YA->PYA
-
-
-    //consonant RA rules ----------
-    //if the dead consonant RA precedes either a consonant or an independent vowel,
-    //then it is replaced by its superscript non-spacing mark
-    ADD_DATACHUNK(hindicharData, /* deadRA */  "\\u0915", 0, status);             //deadRA+devanagari consonant KA->KA+superRA
-    ADD_DATACHUNK(hindicharData, /* deadRA */  "\\u0923", 0, status);             //deadRA+devanagari consonant NNA->NNA+superRA
-    ADD_DATACHUNK(hindicharData, /* deadRA */  "\\u0917", 0, status);             //deadRA+devanagari consonant GA->GA+superRA
-    //  ADD_DATACHUNK(hindicharData, deadRA+ "\\u0960", 0);           //deadRA+devanagari cosonant RRI->RRI+superRA
-
-    //if any dead consonant(other than dead RA)precedes the consonant RA, then
-    //it is replaced with its nominal forma nd RA is replaced by the subscript non-spacing mark.
-    ADD_DATACHUNK(hindicharData, /* deadPHA */  "\\u0930", 0, status);            //deadPHA+devanagari consonant RA->PHA+subRA
-    ADD_DATACHUNK(hindicharData, /* deadPA */  "\\u0930", 0, status);             //deadPA+devanagari consonant RA->PA+subRA
-    ADD_DATACHUNK(hindicharData, /* deadTTHA */  "\\u0930", 0, status);           //deadTTHA+devanagari consonant RA->TTHA+subRA
-    ADD_DATACHUNK(hindicharData, /* deadTA */  "\\u0930", 0, status);             //deadTA+RA->TRA
-    // ADD_DATACHUNK(hindicharData, "\\u0936\\u094d\\u0930", 0, status);         //deadSHA(SHA+virama)+RA->SHRA
-    ADD_DATACHUNK(hindicharData, "\\u0936\\u094d", 0, status);         //deadSHA(SHA+virama)+RA->SHRA
-    ADD_DATACHUNK(hindicharData, "\\u0930", 0, status);         //deadSHA(SHA+virama)+RA->SHRA
-
-    //conjuct ligatures
-    //    2002-08-7   virma no longer forces joining.
-    // ADD_DATACHUNK(hindicharData, "\\u0915\\u094d\\u0937", 0, status);         //deadKA(KA+virama) followed by SSHA wraps up into a single character KSSHA
-    ADD_DATACHUNK(hindicharData, "\\u0915\\u094d", 0, status);         //deadKA(KA+virama) followed by SSHA wraps up into a single character KSSHA
-    ADD_DATACHUNK(hindicharData, "\\u0937", 0, status);         //deadKA(KA+virama) followed by SSHA wraps up into a single character KSSHA
-    ADD_DATACHUNK(hindicharData, /* deadTA */ "\\u0924", 0, status);              //deadTA+TA wraps up into glyph TTHA
-    //ADD_DATACHUNK(hindicharData, "\\u0926\\u094d\\u0935", 0, status);         //deadDA(DA+virama)+VA wraps up into DVA
-    //ADD_DATACHUNK(hindicharData, "\\u091c\\u094d\\u091e", 0, status);         //deadJA(JA+virama)+NYA wraps up into JNYA
-
-    RuleBasedBreakIterator *e=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
-    if(U_FAILURE(status)){
-        errln("FAIL : in construction");
-        return;
-    }
-    generalIteratorTest(*e, hindicharData);
-    delete e;
-}
-
-void RBBITest::TestHindiWordBreak()
-{
-    UErrorCode status= U_ZERO_ERROR;
-    BITestData hindiWordData(status);
-
-    //hindi
-    ADD_DATACHUNK(hindiWordData, NULL, 0, status);           // Break at start of data
-    ADD_DATACHUNK(hindiWordData, "\\u0917\\u092a\\u00ad\\u0936\\u092a", 200, status);
-    ADD_DATACHUNK(hindiWordData, "!", 0, status);
-    ADD_DATACHUNK(hindiWordData, "\\u092f\\u0939", 200, status);
-    ADD_DATACHUNK(hindiWordData, " ", 0, status);
-    ADD_DATACHUNK(hindiWordData, "\\u0939\\u093f" halfNA "\\u0926\\u0940", 200, status);
-    ADD_DATACHUNK(hindiWordData, " ", 0, status);
-    ADD_DATACHUNK(hindiWordData, "\\u0939\\u0948", 200, status);
-    //danda is similar to full stop. danda is a hindi phrase seperator
-    //Make sure it breaks before danda and after danda when it is followed by a space
-    //ADD_DATACHUNK(hindiWordData, "\\u0964", 0);   //fails here doesn't break at danda
-    ADD_DATACHUNK(hindiWordData, " ", 0, status);
-    ADD_DATACHUNK(hindiWordData, "\\u0905\\u093e\\u092a", 200, status);
-    ADD_DATACHUNK(hindiWordData, " ", 0, status);
-    ADD_DATACHUNK(hindiWordData, "\\u0938\\u093f\\u0916\\u094b\\u0917\\u0947", 200, status);
-    ADD_DATACHUNK(hindiWordData, "?", 0, status);
-    ADD_DATACHUNK(hindiWordData, "\n", 0, status);
-    ADD_DATACHUNK(hindiWordData, ":", 0, status);
-    ADD_DATACHUNK(hindiWordData, deadPA "\\u0930\\u093e\\u092f" visarga, 200, status);    //no break before visarga
-    ADD_DATACHUNK(hindiWordData, " ", 0, status);
-    ADD_DATACHUNK(hindiWordData, "\\u0935" deadRA "\\u0937\\u093e", 200, status);
-    ADD_DATACHUNK(hindiWordData, "\r\n", 0, status);
-    ADD_DATACHUNK(hindiWordData, deadPA  "\\u0930\\u0915\\u093e\\u0936", 200, status);     //deadPA+RA+KA+vowel AA+SHA -> prakash
-    ADD_DATACHUNK(hindiWordData, ",", 0, status);
-    ADD_DATACHUNK(hindiWordData, "\\u0924\\u0941\\u092e\\u093e\\u0930\\u094b", 200, status);
-    ADD_DATACHUNK(hindiWordData, " ", 0, status);
-    ADD_DATACHUNK(hindiWordData, "\\u092e\\u093f" deadTA "\\u0930", 200, status);       //MA+vowel I+ deadTA + RA
-    ADD_DATACHUNK(hindiWordData, " ", 0, status);
-    ADD_DATACHUNK(hindiWordData, "\\u0915\\u093e", 200, status);
-    ADD_DATACHUNK(hindiWordData, " ", 0, status);
-    ADD_DATACHUNK(hindiWordData, "\\u092a" deadTA "\\u0930", 200, status);            //PA + deadTA + RA
-    ADD_DATACHUNK(hindiWordData, " ", 0, status);
-    ADD_DATACHUNK(hindiWordData, "\\u092a\\u095d\\u094b", 200, status);
-    // ADD_DATACHUNK(hindiWordData, "\\u0964", 0); //fails here doesn't break at danda
-    ADD_DATACHUNK(hindiWordData, " ", 0, status);
-    ADD_DATACHUNK(hindiWordData, deadSA deadTA "\\u0930\\u093f", 200, status);       //deadSA+deadTA+RA+vowel I->sthri
-    ADD_DATACHUNK(hindiWordData, ".", 0, status);
-    ADD_DATACHUNK(hindiWordData, " ", 0, status);
-    ADD_DATACHUNK(hindiWordData, "\\u0968\\u0966.\\u0969\\u096f", 100, status);            //hindi numbers
-    ADD_DATACHUNK(hindiWordData, " ", 0, status);
-    ADD_DATACHUNK(hindiWordData, "\\u0967\\u0966\\u0966.\\u0966\\u0966", 100, status);     //postnumeric
-    ADD_DATACHUNK(hindiWordData, "\\u20a8", 0, status);
-    ADD_DATACHUNK(hindiWordData, "\\u0967,\\u0967\\u0966\\u0966.\\u0966\\u0966", 100, status); //pre-number India currency symbol Rs.\\u20aD
-    ADD_DATACHUNK(hindiWordData, " ", 0, status);
-    ADD_DATACHUNK(hindiWordData, "\\u0905\\u092e\\u091c", 200, status);
-    ADD_DATACHUNK(hindiWordData, "\n", 0, status);
-    ADD_DATACHUNK(hindiWordData, halfSA "\\u0935\\u0924\\u0902" deadTA "\\u0930", 200, status);
-    ADD_DATACHUNK(hindiWordData, "\r", 0, status);
-
-    RuleBasedBreakIterator *e=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status);
-    if(U_FAILURE(status)){
-        errln("FAIL : in construction");
-        return;
-    }
-    generalIteratorTest(*e, hindiWordData);
-    delete e;
-}
-
-
-void RBBITest::TestTitleBreak()
-{
-    UErrorCode status= U_ZERO_ERROR;
-    RuleBasedBreakIterator* titleI=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createTitleInstance(Locale::getDefault(), status);
-    if(U_FAILURE(status)){
-          errln("FAIL : in construction");
-          return;
-    }
-
-    BITestData titleData(status);
-    ADD_DATACHUNK(titleData, NULL, 0, status);           // Break at start of data
-    ADD_DATACHUNK(titleData, "   ", 0, status);
-    ADD_DATACHUNK(titleData, "This ", 0, status);
-    ADD_DATACHUNK(titleData, "is ", 0, status);
-    ADD_DATACHUNK(titleData, "a ", 0, status);
-    ADD_DATACHUNK(titleData, "simple ", 0, status);
-    ADD_DATACHUNK(titleData, "sample ", 0, status);
-    ADD_DATACHUNK(titleData, "sentence. ", 0, status);
-    ADD_DATACHUNK(titleData, "This ", 0, status);
-
-    generalIteratorTest(*titleI, titleData);
-    delete titleI;
-}


 //-----------------------------------------------------------------------------------
@@ -682,12 +521,12 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
             break;
        case 3: name = "";
             break;
-        case 4: name = "TestHindiCharacterBreak";
-            if(exec) TestHindiCharacterBreak();                break;
-        case 5: name = "TestHindiWordBreak";
-            if(exec) TestHindiWordBreak();                     break;
-        case 6: name = "TestTitleBreak";
-            if(exec) TestTitleBreak();                         break;
+        case 4: name = "";
+            break;
+        case 5: name = "";
+            break;
+        case 6: name = "";
+            break;
        case 7: name = "TestStatusReturn";
            if(exec) TestStatusReturn();                       break;

@@ -1042,139 +881,6 @@ void RBBITest::TestSentenceInvariants()
 }


-void RBBITest::TestLineInvariants()
-{
-#if 0        // TestLineInvariants() needs to be updated to reflect TR 14 rules.
-    UErrorCode status = U_ZERO_ERROR;
-    BreakIterator *e = BreakIterator::createLineInstance(Locale::getUS(), status);
-    if (U_FAILURE(status))
-    {
-        errln("Failed to create the BreakIterator for default locale in TestLineInvariants.\n");
-        return;
-    }
-    UnicodeString s = CharsToUnicodeString(".,;:\\u3001\\u3002\\u3041\\u3042\\u3043\\u3044\\u3045\\u30a3\\u4e00\\u4e01\\u4e02");
-    UnicodeString testChars = *cannedTestChars + s;
-    doBreakInvariantTest(*e, testChars);
-    doOtherInvariantTest(*e, testChars);
-
-    int32_t errCount = 0, testCharsLen, noBreakLen, dashesLen;
-    int32_t i, j, k;
-
-    // in addition to the other invariants, a line-break iterator should make sure that:
-    // it doesn't break around the non-breaking characters,
-    // EXCEPT breaking after a space takes precedence over not breaking before
-    //        an non-breaking char.  So says TR 14.
-    UnicodeString noBreak = CharsToUnicodeString("\\u00a0\\u2007\\u2011\\ufeff");
-    UnicodeString work("aaa");
-    testCharsLen = testChars.length();
-    noBreakLen = noBreak.length();
-    for (i = 0; i < testCharsLen; i++) {
-        UChar c = testChars[i];
-        if (c == '\r' || c == '\n' || c == 0x2029 || c == 0x2028 || c == 0x0003 ||
-            u_charType(c) == U_CONTROL_CHAR) {
-            continue;
-        }
-        work[0] = c;
-        for (j = 0; j < noBreakLen; j++) {
-            work[1] = noBreak[j];
-            for (k = 0; k < testCharsLen; k++) {
-                work[2] = testChars[k];
-                e->setText(work);
-                for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) {
-                    UChar c1 = work[l - 1];
-                    UChar c2 = work[l];
-                    if (c1 == 0x20 && l == 1) {
-                        continue;
-                    }
-                    if (l == 1 || l == 2) {
-                        errln("Got break between U+" + UCharToUnicodeString(c1) + 
-                            " and U+" + UCharToUnicodeString(c2));
-                        errCount++;
-                        if (errCount >= 75)
-                            return;
-                    }
-                }
-            }
-        }
-    }
-
-    // it does break after hyphens (Rule 15B from TR 14
-    //  (unless they're followed by a digit, a non-spacing mark,
-    // a currency symbol, a non-breaking space, or a line or paragraph separator
-    //  or something of class BA, HY, NS, QU, GL, CL, EX, IS or SY from TR14 when the hyphen is /u002d
-
-    // This test is sufficiently screwed up that I'm largely disabling it.  TODO:  fix it.  06/12/2002  AGH
-    //
-    UnicodeString dashes = CharsToUnicodeString("-\\u00ad\\u2010\\u2012\\u2013\\u2014");
-    dashesLen = dashes.length();
-    for (i = 0; i < testCharsLen; i++) {
-        work[0] = testChars[i];
-        for (j = 0; j < dashesLen; j++) {
-            UChar c1 = work[1] = dashes[j];
-            for (k = 0; k < testCharsLen; k++) {
-                UChar c2 = work[2] = testChars[k];
-                int8_t type = u_charType(c2);
-                if (type == U_DECIMAL_DIGIT_NUMBER ||
-                    type == U_OTHER_NUMBER ||
-                    type == U_NON_SPACING_MARK ||
-                    type == U_ENCLOSING_MARK ||
-                    type == U_CURRENCY_SYMBOL ||
-                    type == U_SPACE_SEPARATOR ||
-                    type == U_DASH_PUNCTUATION ||
-                    type == U_CONTROL_CHAR ||
-                    type == U_FORMAT_CHAR ||
-                    c2 == '\n'   || c2 == '\r'   || c2 == 0x2028 || c2 == 0x2029 ||
-                    c2 == 0x0003 || c2 == 0x00a0 || c2 == 0x2007 || c2 == 0x2011 ||
-                    c2 == 0xfeff)
-                {
-                    continue;
-                }
-                // If c1 == hyphen-minus, and ...
-                if (c1 == 0x002d  &&  (
-                       c2 == 0x0021  ||   // !
-                       c2 == 0x002c  ||   // ,
-                       c2 == 0x002d  ||   // -
-                       c2 == 0x002e  ||   // .   (TR 14 class IS)
-                       c2 == 0x0029  ||   // )
-                       c2 == 0x003a  ||   // :
-                       c2 == 0x003b  ||   // ;   (TR 14 class IS)
-                       c2 == 0x005d  ||   // ]
-                       c2 == 0x007c  ||   // |   (TR 14 class BA, rule 15)
-                       c2 == 0x007d  ||   // }
-                       c2 == 0x0903  ||   // Devanagari sign visarga, combining, what's it doing in this test?
-                       c2 == 0x093E  ||   // Devanagari , combining, what's it doing in this test?
-                       c2 == 0x093F  ||   // Devanagari , combining, what's it doing in this test?
-                       c2 == 0x0940  ||   // Devanagari , combining, what's it doing in this test?
-                       c2 == 0x0949  ||   // Devanagari , combining, what's it doing in this test?
-                       c2 == 0x0f3b  ||   // Tibetan closing bracket
-                       c2 == 0x3001  ||   // CJK closing bracket
-                       c2 == 0x3002       // CJK closing bracket
-                      )) {
-                    continue;
-                }
-
-                e->setText(work);
-                UBool saw2 = FALSE;
-                for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) {
-                    if (l == 2) {
-                        saw2 = TRUE;
-                        break;
-                    }
-                }
-                if (!saw2) {
-                    // TODO:  This test is completely out of sync with the spec.  Fix it.
-                    // errln("Didn't get break between U+" + UCharToUnicodeString(work[1]) + 
-                    //    " and U+" + UCharToUnicodeString(work[2]));
-                    // errCount++;
-                    // if (errCount >= 75)
-                    //    return;
-                }
-            }
-        }
-    }
-    delete e;
-#endif
-}


 void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@@ -35,18 +35,6 @@ public:
    virtual ~RBBITest();

    void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
-    /**
-     * Tests Hindi(Devanagiri) character iteration
-     **/  
-    void TestHindiCharacterBreak(void);
-    /**
-     * Tests Hindi(Devanagiri) word iteration
-     **/  
-    void TestHindiWordBreak(void);
-    /**
-     * Tests Title Case break iteration
-     **/  
-    void TestTitleBreak(void);
 
    /**
     * Tests rule status return values
@@ -65,7 +53,6 @@ public:
    void TestSentenceInvariants();
    void TestCharacterInvariants();
    void TestWordInvariants();
-    void TestLineInvariants();
    void TestEmptyString();
    void TestGetAvailableLocales();
    void TestGetDisplayName();