ICU-2093 Update LineBreak tests (work in progress)

X-SVN-Rev: 12032
This commit is contained in:
Andy Heninger 2003-05-21 06:07:18 +00:00
parent 26640c070f
commit d11f9e993b
2 changed files with 7 additions and 314 deletions

View File

@ -236,170 +236,9 @@ static const int T_IDEO = 400;
#define deadSA "\\u0938\\u094d" #define deadSA "\\u0938\\u094d"
#define visarga "\\u0903" /*devanagari visarga looks like a english colon*/ #define visarga "\\u0903" /*devanagari visarga looks like a english colon*/
/**
 * Character-break test over Devanagari (Hindi) text.
 *
 * Fills a BITestData object with a sequence of data chunks via ADD_DATACHUNK,
 * creates a character break iterator for the default locale, and hands both
 * to generalIteratorTest().  If the iterator cannot be created the test
 * reports "FAIL : in construction" and returns.
 *
 * NOTE(review): each chunk is presumably the text expected between two
 * consecutive boundaries, and the third macro argument (always 0 here) the
 * expected status value for that boundary -- confirm against the definition
 * of ADD_DATACHUNK / BITestData, which is not part of this function.
 */
void RBBITest::TestHindiCharacterBreak()
{
UErrorCode status= U_ZERO_ERROR;
BITestData hindicharData(status);
ADD_DATACHUNK(hindicharData, NULL, 0, status); // Break at start of data
//devanagari characters for Hindi support
ADD_DATACHUNK(hindicharData, "\\u0906", 0, status); //devanagari AA
//hindi character break should make sure that it
// doesn't break in-between a vowelsign and a chandrabindu
ADD_DATACHUNK(hindicharData, "\\u000a", 0, status); // Force break so following can appear stand-alone.
ADD_DATACHUNK(hindicharData, "\\u093e\\u0901", 0, status); //devanagari vowelsign AA+ chandrabindu
ADD_DATACHUNK(hindicharData, "\\u0906\\u0901", 0, status); // Devanagari AA + chandrabindu
ADD_DATACHUNK(hindicharData, "\\u0915", 0, status); // Devanagari KA
ADD_DATACHUNK(hindicharData, "\\u093e\\u0901", 0, status); // Devanagari AA vowelsign + chandrabindu
ADD_DATACHUNK(hindicharData, "\\u0916\\u0947", 0, status); //devanagari KHA+vowelsign E
ADD_DATACHUNK(hindicharData, "\\u0938\\u0941\\u0902", 0, status); //devanagari SA+vowelsign U + anusvara(bindu)
ADD_DATACHUNK(hindicharData, "\\u0926", 0, status); //devanagari consonant DA
ADD_DATACHUNK(hindicharData, "\\u0930", 0, status); //devanagari consonant RA
ADD_DATACHUNK(hindicharData, "\\u0939", 0, status); //devanagari consonant HA+
ADD_DATACHUNK(hindicharData, "\\u094c", 0, status); // +dependent vowel sign AU (U+094C)
ADD_DATACHUNK(hindicharData, "\\u0964", 0, status); //devanagari danda
ADD_DATACHUNK(hindicharData, "\\u0950", 0, status); //devanagari OM
ADD_DATACHUNK(hindicharData, "\\u0915\\u0943", 0, status); //devanagari KA+dependent vowel RI->KRI
//dependent half-forms. 2002-8-7: New Char Break rules no longer join the half-sequences.
// The half-form prefixes are therefore left out of the chunks below (kept only
// as inline comments); each chunk holds just the base consonant.
ADD_DATACHUNK(hindicharData, /* halfSA */ "\\u0924", 0, status); //halfSA+base consonant TA->STA
ADD_DATACHUNK(hindicharData, /* halfSA */ "\\u0925", 0, status); //halfSA+base consonant THA->STHA
ADD_DATACHUNK(hindicharData, /* halfSA */ "\\u092e", 0, status); //halfSA+base consonant MA->SMA
ADD_DATACHUNK(hindicharData, /* halfCHA */ "\\u091b", 0, status); //halfCHA+base consonant CHHA->CHHHA
ADD_DATACHUNK(hindicharData, /* halfNA */ "\\u0917", 0, status); //halfNA+base consonant GA->NGA
// The single joined chunk below was replaced by the three separate chunks that follow it.
// ADD_DATACHUNK(hindicharData, "\\u092a\\u094d\\u200d\\u092f", 0, status); //halfPA(PA+virama+zerowidthjoiner+base consonant YA->PYA
ADD_DATACHUNK(hindicharData, "\\u092a\\u094d", 0, status); //halfPA(PA+virama+zerowidthjoiner+base consonant YA->PYA
ADD_DATACHUNK(hindicharData, "\\u200d", 0, status); //halfPA(PA+virama+zerowidthjoiner+base consonant YA->PYA
ADD_DATACHUNK(hindicharData, "\\u092f", 0, status); //halfPA(PA+virama+zerowidthjoiner+base consonant YA->PYA
//consonant RA rules ----------
//if the dead consonant RA precedes either a consonant or an independent vowel,
//then it is replaced by its superscript non-spacing mark
ADD_DATACHUNK(hindicharData, /* deadRA */ "\\u0915", 0, status); //deadRA+devanagari consonant KA->KA+superRA
ADD_DATACHUNK(hindicharData, /* deadRA */ "\\u0923", 0, status); //deadRA+devanagari consonant NNA->NNA+superRA
ADD_DATACHUNK(hindicharData, /* deadRA */ "\\u0917", 0, status); //deadRA+devanagari consonant GA->GA+superRA
// ADD_DATACHUNK(hindicharData, deadRA+ "\\u0960", 0); //deadRA+devanagari consonant RRI->RRI+superRA
//if any dead consonant (other than dead RA) precedes the consonant RA, then
//it is replaced with its nominal form and RA is replaced by the subscript non-spacing mark.
ADD_DATACHUNK(hindicharData, /* deadPHA */ "\\u0930", 0, status); //deadPHA+devanagari consonant RA->PHA+subRA
ADD_DATACHUNK(hindicharData, /* deadPA */ "\\u0930", 0, status); //deadPA+devanagari consonant RA->PA+subRA
ADD_DATACHUNK(hindicharData, /* deadTTHA */ "\\u0930", 0, status); //deadTTHA+devanagari consonant RA->TTHA+subRA
ADD_DATACHUNK(hindicharData, /* deadTA */ "\\u0930", 0, status); //deadTA+RA->TRA
// ADD_DATACHUNK(hindicharData, "\\u0936\\u094d\\u0930", 0, status); //deadSHA(SHA+virama)+RA->SHRA
ADD_DATACHUNK(hindicharData, "\\u0936\\u094d", 0, status); //deadSHA(SHA+virama)+RA->SHRA
ADD_DATACHUNK(hindicharData, "\\u0930", 0, status); //deadSHA(SHA+virama)+RA->SHRA
//conjunct ligatures
// 2002-08-7 virama no longer forces joining.
// ADD_DATACHUNK(hindicharData, "\\u0915\\u094d\\u0937", 0, status); //deadKA(KA+virama) followed by SSHA wraps up into a single character KSSHA
ADD_DATACHUNK(hindicharData, "\\u0915\\u094d", 0, status); //deadKA(KA+virama) followed by SSHA wraps up into a single character KSSHA
ADD_DATACHUNK(hindicharData, "\\u0937", 0, status); //deadKA(KA+virama) followed by SSHA wraps up into a single character KSSHA
ADD_DATACHUNK(hindicharData, /* deadTA */ "\\u0924", 0, status); //deadTA+TA wraps up into glyph TTHA
//ADD_DATACHUNK(hindicharData, "\\u0926\\u094d\\u0935", 0, status); //deadDA(DA+virama)+VA wraps up into DVA
//ADD_DATACHUNK(hindicharData, "\\u091c\\u094d\\u091e", 0, status); //deadJA(JA+virama)+NYA wraps up into JNYA
// Create the character break iterator for the default locale; status also
// carries any error recorded by the ADD_DATACHUNK calls above.
RuleBasedBreakIterator *e=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
if(U_FAILURE(status)){
errln("FAIL : in construction");
return;
}
// Run the shared iterator test over the chunk data, then release the iterator.
generalIteratorTest(*e, hindicharData);
delete e;
}
/**
 * Word-break test over Devanagari (Hindi) text.
 *
 * Fills a BITestData object with alternating word, punctuation and
 * whitespace chunks via ADD_DATACHUNK, creates a word break iterator for the
 * default locale, and hands both to generalIteratorTest().  If the iterator
 * cannot be created the test reports "FAIL : in construction" and returns.
 *
 * The third macro argument is 200 for the Devanagari word chunks, 100 for
 * the Devanagari digit chunks and 0 for punctuation/whitespace chunks.
 * NOTE(review): presumably this is the expected break status for the chunk
 * -- confirm against the definition of ADD_DATACHUNK / BITestData.
 *
 * halfNA, halfSA, deadPA, deadRA, deadTA, deadSA and visarga are
 * string-literal macros defined elsewhere in the file; they are concatenated
 * with the adjacent literals by the compiler.
 */
void RBBITest::TestHindiWordBreak()
{
UErrorCode status= U_ZERO_ERROR;
BITestData hindiWordData(status);
//hindi
ADD_DATACHUNK(hindiWordData, NULL, 0, status); // Break at start of data
ADD_DATACHUNK(hindiWordData, "\\u0917\\u092a\\u00ad\\u0936\\u092a", 200, status);
ADD_DATACHUNK(hindiWordData, "!", 0, status);
ADD_DATACHUNK(hindiWordData, "\\u092f\\u0939", 200, status);
ADD_DATACHUNK(hindiWordData, " ", 0, status);
ADD_DATACHUNK(hindiWordData, "\\u0939\\u093f" halfNA "\\u0926\\u0940", 200, status);
ADD_DATACHUNK(hindiWordData, " ", 0, status);
ADD_DATACHUNK(hindiWordData, "\\u0939\\u0948", 200, status);
//danda is similar to full stop. danda is a hindi phrase separator
//Make sure it breaks before danda and after danda when it is followed by a space
//ADD_DATACHUNK(hindiWordData, "\\u0964", 0); //fails here doesn't break at danda
ADD_DATACHUNK(hindiWordData, " ", 0, status);
ADD_DATACHUNK(hindiWordData, "\\u0905\\u093e\\u092a", 200, status);
ADD_DATACHUNK(hindiWordData, " ", 0, status);
ADD_DATACHUNK(hindiWordData, "\\u0938\\u093f\\u0916\\u094b\\u0917\\u0947", 200, status);
ADD_DATACHUNK(hindiWordData, "?", 0, status);
ADD_DATACHUNK(hindiWordData, "\n", 0, status);
ADD_DATACHUNK(hindiWordData, ":", 0, status);
ADD_DATACHUNK(hindiWordData, deadPA "\\u0930\\u093e\\u092f" visarga, 200, status); //no break before visarga
ADD_DATACHUNK(hindiWordData, " ", 0, status);
ADD_DATACHUNK(hindiWordData, "\\u0935" deadRA "\\u0937\\u093e", 200, status);
ADD_DATACHUNK(hindiWordData, "\r\n", 0, status);
ADD_DATACHUNK(hindiWordData, deadPA "\\u0930\\u0915\\u093e\\u0936", 200, status); //deadPA+RA+KA+vowel AA+SHA -> prakash
ADD_DATACHUNK(hindiWordData, ",", 0, status);
ADD_DATACHUNK(hindiWordData, "\\u0924\\u0941\\u092e\\u093e\\u0930\\u094b", 200, status);
ADD_DATACHUNK(hindiWordData, " ", 0, status);
ADD_DATACHUNK(hindiWordData, "\\u092e\\u093f" deadTA "\\u0930", 200, status); //MA+vowel I+ deadTA + RA
ADD_DATACHUNK(hindiWordData, " ", 0, status);
ADD_DATACHUNK(hindiWordData, "\\u0915\\u093e", 200, status);
ADD_DATACHUNK(hindiWordData, " ", 0, status);
ADD_DATACHUNK(hindiWordData, "\\u092a" deadTA "\\u0930", 200, status); //PA + deadTA + RA
ADD_DATACHUNK(hindiWordData, " ", 0, status);
ADD_DATACHUNK(hindiWordData, "\\u092a\\u095d\\u094b", 200, status);
// ADD_DATACHUNK(hindiWordData, "\\u0964", 0); //fails here doesn't break at danda
ADD_DATACHUNK(hindiWordData, " ", 0, status);
ADD_DATACHUNK(hindiWordData, deadSA deadTA "\\u0930\\u093f", 200, status); //deadSA+deadTA+RA+vowel I->sthri
ADD_DATACHUNK(hindiWordData, ".", 0, status);
ADD_DATACHUNK(hindiWordData, " ", 0, status);
ADD_DATACHUNK(hindiWordData, "\\u0968\\u0966.\\u0969\\u096f", 100, status); //hindi numbers
ADD_DATACHUNK(hindiWordData, " ", 0, status);
ADD_DATACHUNK(hindiWordData, "\\u0967\\u0966\\u0966.\\u0966\\u0966", 100, status); //postnumeric
ADD_DATACHUNK(hindiWordData, "\\u20a8", 0, status);
ADD_DATACHUNK(hindiWordData, "\\u0967,\\u0967\\u0966\\u0966.\\u0966\\u0966", 100, status); //pre-number India currency symbol Rs.\\u20aD
ADD_DATACHUNK(hindiWordData, " ", 0, status);
ADD_DATACHUNK(hindiWordData, "\\u0905\\u092e\\u091c", 200, status);
ADD_DATACHUNK(hindiWordData, "\n", 0, status);
ADD_DATACHUNK(hindiWordData, halfSA "\\u0935\\u0924\\u0902" deadTA "\\u0930", 200, status);
ADD_DATACHUNK(hindiWordData, "\r", 0, status);
// Create the word break iterator for the default locale; status also carries
// any error recorded by the ADD_DATACHUNK calls above.
RuleBasedBreakIterator *e=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status);
if(U_FAILURE(status)){
errln("FAIL : in construction");
return;
}
// Run the shared iterator test over the chunk data, then release the iterator.
generalIteratorTest(*e, hindiWordData);
delete e;
}
void RBBITest::TestTitleBreak()
{
UErrorCode status= U_ZERO_ERROR;
RuleBasedBreakIterator* titleI=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createTitleInstance(Locale::getDefault(), status);
if(U_FAILURE(status)){
errln("FAIL : in construction");
return;
}
BITestData titleData(status);
ADD_DATACHUNK(titleData, NULL, 0, status); // Break at start of data
ADD_DATACHUNK(titleData, " ", 0, status);
ADD_DATACHUNK(titleData, "This ", 0, status);
ADD_DATACHUNK(titleData, "is ", 0, status);
ADD_DATACHUNK(titleData, "a ", 0, status);
ADD_DATACHUNK(titleData, "simple ", 0, status);
ADD_DATACHUNK(titleData, "sample ", 0, status);
ADD_DATACHUNK(titleData, "sentence. ", 0, status);
ADD_DATACHUNK(titleData, "This ", 0, status);
generalIteratorTest(*titleI, titleData);
delete titleI;
}
//----------------------------------------------------------------------------------- //-----------------------------------------------------------------------------------
@ -682,12 +521,12 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
break; break;
case 3: name = ""; case 3: name = "";
break; break;
case 4: name = "TestHindiCharacterBreak"; case 4: name = "";
if(exec) TestHindiCharacterBreak(); break; break;
case 5: name = "TestHindiWordBreak"; case 5: name = "";
if(exec) TestHindiWordBreak(); break; break;
case 6: name = "TestTitleBreak"; case 6: name = "";
if(exec) TestTitleBreak(); break; break;
case 7: name = "TestStatusReturn"; case 7: name = "TestStatusReturn";
if(exec) TestStatusReturn(); break; if(exec) TestStatusReturn(); break;
@ -1042,139 +881,6 @@ void RBBITest::TestSentenceInvariants()
} }
/**
 * Invariant checks for the line break iterator.
 *
 * The whole body is currently compiled out (#if 0) because it has not been
 * brought in line with the TR 14 rules.  When enabled it:
 *   1. creates a line break iterator for Locale::getUS();
 *   2. runs doBreakInvariantTest() and doOtherInvariantTest() over the canned
 *      test characters plus some extra punctuation / CJK characters;
 *   3. checks that no break is reported on either side of a non-breaking
 *      character placed between two test characters (a break at offset 1
 *      is tolerated when the first character is a space);
 *   4. walks every <test char, dash, test char> triple looking for a break
 *      after the dash; the failure report for this step is itself commented
 *      out, so step 4 currently reports nothing.
 *
 * Error reporting stops after 75 failures.
 */
void RBBITest::TestLineInvariants()
{
#if 0 // TestLineInvariants() needs to be updated to reflect TR 14 rules.
    UErrorCode status = U_ZERO_ERROR;
    BreakIterator *e = BreakIterator::createLineInstance(Locale::getUS(), status);
    if (U_FAILURE(status))
    {
        errln("Failed to create the BreakIterator for default locale in TestLineInvariants.\n");
        return;
    }
    UnicodeString s = CharsToUnicodeString(".,;:\\u3001\\u3002\\u3041\\u3042\\u3043\\u3044\\u3045\\u30a3\\u4e00\\u4e01\\u4e02");
    UnicodeString testChars = *cannedTestChars + s;
    doBreakInvariantTest(*e, testChars);
    doOtherInvariantTest(*e, testChars);

    int32_t errCount = 0, testCharsLen, noBreakLen, dashesLen;
    int32_t i, j, k;

    // in addition to the other invariants, a line-break iterator should make sure that:
    // it doesn't break around the non-breaking characters,
    // EXCEPT breaking after a space takes precedence over not breaking before
    // a non-breaking char. So says TR 14.
    UnicodeString noBreak = CharsToUnicodeString("\\u00a0\\u2007\\u2011\\ufeff");
    UnicodeString work("aaa");   // three-character scratch string, overwritten in place
    testCharsLen = testChars.length();
    noBreakLen = noBreak.length();
    for (i = 0; i < testCharsLen; i++) {
        UChar c = testChars[i];
        // Skip leading characters that are line/paragraph terminators or controls.
        if (c == '\r' || c == '\n' || c == 0x2029 || c == 0x2028 || c == 0x0003 ||
            u_charType(c) == U_CONTROL_CHAR) {
            continue;
        }
        work[0] = c;
        for (j = 0; j < noBreakLen; j++) {
            work[1] = noBreak[j];
            for (k = 0; k < testCharsLen; k++) {
                work[2] = testChars[k];
                e->setText(work);
                for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) {
                    // NOTE(review): for l == 0 this indexes work[-1], and for
                    // l == 3 it indexes work[3]; the values are only used when
                    // l is 1 or 2.  Relies on UnicodeString indexing tolerating
                    // out-of-range offsets -- confirm.
                    UChar c1 = work[l - 1];
                    UChar c2 = work[l];
                    // A break right after a leading space is allowed.
                    if (c1 == 0x20 && l == 1) {
                        continue;
                    }
                    if (l == 1 || l == 2) {
                        errln("Got break between U+" + UCharToUnicodeString(c1) +
                              " and U+" + UCharToUnicodeString(c2));
                        errCount++;
                        if (errCount >= 75) {
                            // Release the iterator before bailing out; the
                            // early return previously leaked it.
                            delete e;
                            return;
                        }
                    }
                }
            }
        }
    }

    // it does break after hyphens (Rule 15B from TR 14)
    // (unless they're followed by a digit, a non-spacing mark,
    // a currency symbol, a non-breaking space, or a line or paragraph separator
    // or something of class BA, HY, NS, QU, GL, CL, EX, IS or SY from TR14 when the hyphen is /u002d
    // This test is sufficiently screwed up that I'm largely disabling it. TODO: fix it. 06/12/2002 AGH
    //
    UnicodeString dashes = CharsToUnicodeString("-\\u00ad\\u2010\\u2012\\u2013\\u2014");
    dashesLen = dashes.length();
    for (i = 0; i < testCharsLen; i++) {
        work[0] = testChars[i];
        for (j = 0; j < dashesLen; j++) {
            UChar c1 = work[1] = dashes[j];
            for (k = 0; k < testCharsLen; k++) {
                UChar c2 = work[2] = testChars[k];
                int8_t type = u_charType(c2);
                // Skip following characters for which no break after the dash is expected.
                if (type == U_DECIMAL_DIGIT_NUMBER ||
                    type == U_OTHER_NUMBER ||
                    type == U_NON_SPACING_MARK ||
                    type == U_ENCLOSING_MARK ||
                    type == U_CURRENCY_SYMBOL ||
                    type == U_SPACE_SEPARATOR ||
                    type == U_DASH_PUNCTUATION ||
                    type == U_CONTROL_CHAR ||
                    type == U_FORMAT_CHAR ||
                    c2 == '\n' || c2 == '\r' || c2 == 0x2028 || c2 == 0x2029 ||
                    c2 == 0x0003 || c2 == 0x00a0 || c2 == 0x2007 || c2 == 0x2011 ||
                    c2 == 0xfeff)
                {
                    continue;
                }
                // If c1 == hyphen-minus, and ...
                if (c1 == 0x002d && (
                    c2 == 0x0021 || // !
                    c2 == 0x002c || // ,
                    c2 == 0x002d || // -
                    c2 == 0x002e || // . (TR 14 class IS)
                    c2 == 0x0029 || // )
                    c2 == 0x003a || // :
                    c2 == 0x003b || // ; (TR 14 class IS)
                    c2 == 0x005d || // ]
                    c2 == 0x007c || // | (TR 14 class BA, rule 15)
                    c2 == 0x007d || // }
                    c2 == 0x0903 || // Devanagari sign visarga, combining, what's it doing in this test?
                    c2 == 0x093E || // Devanagari , combining, what's it doing in this test?
                    c2 == 0x093F || // Devanagari , combining, what's it doing in this test?
                    c2 == 0x0940 || // Devanagari , combining, what's it doing in this test?
                    c2 == 0x0949 || // Devanagari , combining, what's it doing in this test?
                    c2 == 0x0f3b || // Tibetan closing bracket
                    c2 == 0x3001 || // CJK closing bracket
                    c2 == 0x3002    // CJK closing bracket
                    )) {
                    continue;
                }
                e->setText(work);
                // Look for a boundary at offset 2, i.e. directly after the dash.
                UBool saw2 = FALSE;
                for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) {
                    if (l == 2) {
                        saw2 = TRUE;
                        break;
                    }
                }
                if (!saw2) {
                    // TODO: This test is completely out of sync with the spec. Fix it.
                    // NOTE: if this report is re-enabled, "delete e;" must be
                    // added before the early return below.
                    // errln("Didn't get break between U+" + UCharToUnicodeString(work[1]) +
                    // " and U+" + UCharToUnicodeString(work[2]));
                    // errCount++;
                    // if (errCount >= 75)
                    // return;
                }
            }
        }
    }
    delete e;
#endif
}
void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars) void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)

View File

@ -35,19 +35,7 @@ public:
virtual ~RBBITest(); virtual ~RBBITest();
void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL ); void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
/**
* Tests Hindi (Devanagari) character iteration
**/
void TestHindiCharacterBreak(void);
/**
* Tests Hindi (Devanagari) word iteration
**/
void TestHindiWordBreak(void);
/**
* Tests Title Case break iteration
**/
void TestTitleBreak(void);
/** /**
* Tests rule status return values * Tests rule status return values
**/ **/
@ -65,7 +53,6 @@ public:
void TestSentenceInvariants(); void TestSentenceInvariants();
void TestCharacterInvariants(); void TestCharacterInvariants();
void TestWordInvariants(); void TestWordInvariants();
void TestLineInvariants();
void TestEmptyString(); void TestEmptyString();
void TestGetAvailableLocales(); void TestGetAvailableLocales();
void TestGetDisplayName(); void TestGetDisplayName();