ICU-2093 Update LineBreak tests (work in progress)
X-SVN-Rev: 12032
This commit is contained in:
parent
26640c070f
commit
d11f9e993b
@ -236,170 +236,9 @@ static const int T_IDEO = 400;
|
||||
#define deadSA "\\u0938\\u094d" /* dead SA: SA (U+0938) followed by virama (U+094D) */
|
||||
#define visarga "\\u0903" /* Devanagari visarga; it looks like an English colon */
|
||||
|
||||
void RBBITest::TestHindiCharacterBreak()
|
||||
{
|
||||
UErrorCode status= U_ZERO_ERROR;
|
||||
BITestData hindicharData(status);
|
||||
ADD_DATACHUNK(hindicharData, NULL, 0, status); // Break at start of data
|
||||
//devanagari characters for Hindi support
|
||||
ADD_DATACHUNK(hindicharData, "\\u0906", 0, status); //devanagari AA
|
||||
|
||||
//hindi character break should make sure that it
|
||||
// doesn't break in-between a vowelsign and a chandrabindu
|
||||
|
||||
ADD_DATACHUNK(hindicharData, "\\u000a", 0, status); // Force break so following can appear stand-alone.
|
||||
ADD_DATACHUNK(hindicharData, "\\u093e\\u0901", 0, status); //devanagari vowelsign AA+ chandrabindu
|
||||
ADD_DATACHUNK(hindicharData, "\\u0906\\u0901", 0, status); // Devanagari AA + chandrabindu
|
||||
ADD_DATACHUNK(hindicharData, "\\u0915", 0, status); // Devanagari KA
|
||||
ADD_DATACHUNK(hindicharData, "\\u093e\\u0901", 0, status); // Devanagari AA vowelsign + chandrabindu
|
||||
|
||||
|
||||
ADD_DATACHUNK(hindicharData, "\\u0916\\u0947", 0, status); //devanagari KHA+vowelsign E
|
||||
ADD_DATACHUNK(hindicharData, "\\u0938\\u0941\\u0902", 0, status); //devanagari SA+vowelsign U + anusvara(bindu)
|
||||
ADD_DATACHUNK(hindicharData, "\\u0926", 0, status); //devanagari consonant DA
|
||||
ADD_DATACHUNK(hindicharData, "\\u0930", 0, status); //devanagari consonant RA
|
||||
ADD_DATACHUNK(hindicharData, "\\u0939", 0, status); //devanagari consonant HA+
|
||||
ADD_DATACHUNK(hindicharData, "\\u094c", 0, status); // +dependent vowel sign AI
|
||||
ADD_DATACHUNK(hindicharData, "\\u0964", 0, status); //devanagari danda
|
||||
ADD_DATACHUNK(hindicharData, "\\u0950", 0, status); //devanagari OM
|
||||
ADD_DATACHUNK(hindicharData, "\\u0915\\u0943", 0, status); //devanagari KA+dependent vowel RI->KRI
|
||||
|
||||
//dependent half-forms. 2002-8-7: New Char Break rules no longer join the half-sequences.
|
||||
ADD_DATACHUNK(hindicharData, /* halfSA */ "\\u0924", 0, status); //halfSA+base consonant TA->STA
|
||||
ADD_DATACHUNK(hindicharData, /* halfSA */ "\\u0925", 0, status); //halfSA+base consonant THA->STHA
|
||||
ADD_DATACHUNK(hindicharData, /* halfSA */ "\\u092e", 0, status); //halfSA+base consonant MA->SMA
|
||||
ADD_DATACHUNK(hindicharData, /* halfCHA */ "\\u091b", 0, status); //halfCHA+base consonant CHHA->CHHHA
|
||||
ADD_DATACHUNK(hindicharData, /* halfNA */ "\\u0917", 0, status); //halfNA+base consonant GA->NGA
|
||||
// ADD_DATACHUNK(hindicharData, "\\u092a\\u094d\\u200d\\u092f", 0, status); //halfPA(PA+virama+zerowidthjoiner+base consonant YA->PYA
|
||||
ADD_DATACHUNK(hindicharData, "\\u092a\\u094d", 0, status); //halfPA(PA+virama+zerowidthjoiner+base consonant YA->PYA
|
||||
ADD_DATACHUNK(hindicharData, "\\u200d", 0, status); //halfPA(PA+virama+zerowidthjoiner+base consonant YA->PYA
|
||||
ADD_DATACHUNK(hindicharData, "\\u092f", 0, status); //halfPA(PA+virama+zerowidthjoiner+base consonant YA->PYA
|
||||
|
||||
|
||||
//consonant RA rules ----------
|
||||
//if the dead consonant RA precedes either a consonant or an independent vowel,
|
||||
//then it is replaced by its superscript non-spacing mark
|
||||
ADD_DATACHUNK(hindicharData, /* deadRA */ "\\u0915", 0, status); //deadRA+devanagari consonant KA->KA+superRA
|
||||
ADD_DATACHUNK(hindicharData, /* deadRA */ "\\u0923", 0, status); //deadRA+devanagari consonant NNA->NNA+superRA
|
||||
ADD_DATACHUNK(hindicharData, /* deadRA */ "\\u0917", 0, status); //deadRA+devanagari consonant GA->GA+superRA
|
||||
// ADD_DATACHUNK(hindicharData, deadRA+ "\\u0960", 0); //deadRA+devanagari cosonant RRI->RRI+superRA
|
||||
|
||||
//if any dead consonant(other than dead RA)precedes the consonant RA, then
|
||||
//it is replaced with its nominal forma nd RA is replaced by the subscript non-spacing mark.
|
||||
ADD_DATACHUNK(hindicharData, /* deadPHA */ "\\u0930", 0, status); //deadPHA+devanagari consonant RA->PHA+subRA
|
||||
ADD_DATACHUNK(hindicharData, /* deadPA */ "\\u0930", 0, status); //deadPA+devanagari consonant RA->PA+subRA
|
||||
ADD_DATACHUNK(hindicharData, /* deadTTHA */ "\\u0930", 0, status); //deadTTHA+devanagari consonant RA->TTHA+subRA
|
||||
ADD_DATACHUNK(hindicharData, /* deadTA */ "\\u0930", 0, status); //deadTA+RA->TRA
|
||||
// ADD_DATACHUNK(hindicharData, "\\u0936\\u094d\\u0930", 0, status); //deadSHA(SHA+virama)+RA->SHRA
|
||||
ADD_DATACHUNK(hindicharData, "\\u0936\\u094d", 0, status); //deadSHA(SHA+virama)+RA->SHRA
|
||||
ADD_DATACHUNK(hindicharData, "\\u0930", 0, status); //deadSHA(SHA+virama)+RA->SHRA
|
||||
|
||||
//conjuct ligatures
|
||||
// 2002-08-7 virma no longer forces joining.
|
||||
// ADD_DATACHUNK(hindicharData, "\\u0915\\u094d\\u0937", 0, status); //deadKA(KA+virama) followed by SSHA wraps up into a single character KSSHA
|
||||
ADD_DATACHUNK(hindicharData, "\\u0915\\u094d", 0, status); //deadKA(KA+virama) followed by SSHA wraps up into a single character KSSHA
|
||||
ADD_DATACHUNK(hindicharData, "\\u0937", 0, status); //deadKA(KA+virama) followed by SSHA wraps up into a single character KSSHA
|
||||
ADD_DATACHUNK(hindicharData, /* deadTA */ "\\u0924", 0, status); //deadTA+TA wraps up into glyph TTHA
|
||||
//ADD_DATACHUNK(hindicharData, "\\u0926\\u094d\\u0935", 0, status); //deadDA(DA+virama)+VA wraps up into DVA
|
||||
//ADD_DATACHUNK(hindicharData, "\\u091c\\u094d\\u091e", 0, status); //deadJA(JA+virama)+NYA wraps up into JNYA
|
||||
|
||||
RuleBasedBreakIterator *e=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
|
||||
if(U_FAILURE(status)){
|
||||
errln("FAIL : in construction");
|
||||
return;
|
||||
}
|
||||
generalIteratorTest(*e, hindicharData);
|
||||
delete e;
|
||||
}
|
||||
|
||||
void RBBITest::TestHindiWordBreak()
|
||||
{
|
||||
UErrorCode status= U_ZERO_ERROR;
|
||||
BITestData hindiWordData(status);
|
||||
|
||||
//hindi
|
||||
ADD_DATACHUNK(hindiWordData, NULL, 0, status); // Break at start of data
|
||||
ADD_DATACHUNK(hindiWordData, "\\u0917\\u092a\\u00ad\\u0936\\u092a", 200, status);
|
||||
ADD_DATACHUNK(hindiWordData, "!", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, "\\u092f\\u0939", 200, status);
|
||||
ADD_DATACHUNK(hindiWordData, " ", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, "\\u0939\\u093f" halfNA "\\u0926\\u0940", 200, status);
|
||||
ADD_DATACHUNK(hindiWordData, " ", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, "\\u0939\\u0948", 200, status);
|
||||
//danda is similar to full stop. danda is a hindi phrase seperator
|
||||
//Make sure it breaks before danda and after danda when it is followed by a space
|
||||
//ADD_DATACHUNK(hindiWordData, "\\u0964", 0); //fails here doesn't break at danda
|
||||
ADD_DATACHUNK(hindiWordData, " ", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, "\\u0905\\u093e\\u092a", 200, status);
|
||||
ADD_DATACHUNK(hindiWordData, " ", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, "\\u0938\\u093f\\u0916\\u094b\\u0917\\u0947", 200, status);
|
||||
ADD_DATACHUNK(hindiWordData, "?", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, "\n", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, ":", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, deadPA "\\u0930\\u093e\\u092f" visarga, 200, status); //no break before visarga
|
||||
ADD_DATACHUNK(hindiWordData, " ", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, "\\u0935" deadRA "\\u0937\\u093e", 200, status);
|
||||
ADD_DATACHUNK(hindiWordData, "\r\n", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, deadPA "\\u0930\\u0915\\u093e\\u0936", 200, status); //deadPA+RA+KA+vowel AA+SHA -> prakash
|
||||
ADD_DATACHUNK(hindiWordData, ",", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, "\\u0924\\u0941\\u092e\\u093e\\u0930\\u094b", 200, status);
|
||||
ADD_DATACHUNK(hindiWordData, " ", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, "\\u092e\\u093f" deadTA "\\u0930", 200, status); //MA+vowel I+ deadTA + RA
|
||||
ADD_DATACHUNK(hindiWordData, " ", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, "\\u0915\\u093e", 200, status);
|
||||
ADD_DATACHUNK(hindiWordData, " ", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, "\\u092a" deadTA "\\u0930", 200, status); //PA + deadTA + RA
|
||||
ADD_DATACHUNK(hindiWordData, " ", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, "\\u092a\\u095d\\u094b", 200, status);
|
||||
// ADD_DATACHUNK(hindiWordData, "\\u0964", 0); //fails here doesn't break at danda
|
||||
ADD_DATACHUNK(hindiWordData, " ", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, deadSA deadTA "\\u0930\\u093f", 200, status); //deadSA+deadTA+RA+vowel I->sthri
|
||||
ADD_DATACHUNK(hindiWordData, ".", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, " ", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, "\\u0968\\u0966.\\u0969\\u096f", 100, status); //hindi numbers
|
||||
ADD_DATACHUNK(hindiWordData, " ", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, "\\u0967\\u0966\\u0966.\\u0966\\u0966", 100, status); //postnumeric
|
||||
ADD_DATACHUNK(hindiWordData, "\\u20a8", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, "\\u0967,\\u0967\\u0966\\u0966.\\u0966\\u0966", 100, status); //pre-number India currency symbol Rs.\\u20aD
|
||||
ADD_DATACHUNK(hindiWordData, " ", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, "\\u0905\\u092e\\u091c", 200, status);
|
||||
ADD_DATACHUNK(hindiWordData, "\n", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, halfSA "\\u0935\\u0924\\u0902" deadTA "\\u0930", 200, status);
|
||||
ADD_DATACHUNK(hindiWordData, "\r", 0, status);
|
||||
|
||||
RuleBasedBreakIterator *e=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status);
|
||||
if(U_FAILURE(status)){
|
||||
errln("FAIL : in construction");
|
||||
return;
|
||||
}
|
||||
generalIteratorTest(*e, hindiWordData);
|
||||
delete e;
|
||||
}
|
||||
|
||||
|
||||
void RBBITest::TestTitleBreak()
|
||||
{
|
||||
UErrorCode status= U_ZERO_ERROR;
|
||||
RuleBasedBreakIterator* titleI=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createTitleInstance(Locale::getDefault(), status);
|
||||
if(U_FAILURE(status)){
|
||||
errln("FAIL : in construction");
|
||||
return;
|
||||
}
|
||||
|
||||
BITestData titleData(status);
|
||||
ADD_DATACHUNK(titleData, NULL, 0, status); // Break at start of data
|
||||
ADD_DATACHUNK(titleData, " ", 0, status);
|
||||
ADD_DATACHUNK(titleData, "This ", 0, status);
|
||||
ADD_DATACHUNK(titleData, "is ", 0, status);
|
||||
ADD_DATACHUNK(titleData, "a ", 0, status);
|
||||
ADD_DATACHUNK(titleData, "simple ", 0, status);
|
||||
ADD_DATACHUNK(titleData, "sample ", 0, status);
|
||||
ADD_DATACHUNK(titleData, "sentence. ", 0, status);
|
||||
ADD_DATACHUNK(titleData, "This ", 0, status);
|
||||
|
||||
generalIteratorTest(*titleI, titleData);
|
||||
delete titleI;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
@ -682,12 +521,12 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
||||
break;
|
||||
case 3: name = "";
|
||||
break;
|
||||
case 4: name = "TestHindiCharacterBreak";
|
||||
if(exec) TestHindiCharacterBreak(); break;
|
||||
case 5: name = "TestHindiWordBreak";
|
||||
if(exec) TestHindiWordBreak(); break;
|
||||
case 6: name = "TestTitleBreak";
|
||||
if(exec) TestTitleBreak(); break;
|
||||
case 4: name = "";
|
||||
break;
|
||||
case 5: name = "";
|
||||
break;
|
||||
case 6: name = "";
|
||||
break;
|
||||
case 7: name = "TestStatusReturn";
|
||||
if(exec) TestStatusReturn(); break;
|
||||
|
||||
@ -1042,139 +881,6 @@ void RBBITest::TestSentenceInvariants()
|
||||
}
|
||||
|
||||
|
||||
/**
 * Invariant checks for the line-break iterator.
 *
 * The whole body is currently compiled out with "#if 0" because it has not
 * been brought in line with the TR 14 rules, so calling this function does
 * nothing.  When enabled, the body:
 *   1. runs doBreakInvariantTest() and doOtherInvariantTest() on the canned
 *      test characters plus some extra punctuation and CJK characters;
 *   2. reports any boundary next to a non-breaking character placed between
 *      two test characters (a boundary at offset 1 after a space is allowed);
 *   3. scans for a boundary after each dash character; the reporting for a
 *      missing boundary is itself commented out (see the TODO below).
 *
 * The iterator is released on every exit path taken after it has been
 * created successfully, including the early exit once 75 errors were reported.
 */
void RBBITest::TestLineInvariants()
{
#if 0 // TestLineInvariants() needs to be updated to reflect TR 14 rules.
    UErrorCode status = U_ZERO_ERROR;
    BreakIterator *e = BreakIterator::createLineInstance(Locale::getUS(), status);
    if (U_FAILURE(status))
    {
        errln("Failed to create the BreakIterator for default locale in TestLineInvariants.\n");
        return;
    }
    UnicodeString s = CharsToUnicodeString(".,;:\\u3001\\u3002\\u3041\\u3042\\u3043\\u3044\\u3045\\u30a3\\u4e00\\u4e01\\u4e02");
    UnicodeString testChars = *cannedTestChars + s;
    doBreakInvariantTest(*e, testChars);
    doOtherInvariantTest(*e, testChars);

    int32_t errCount = 0, testCharsLen, noBreakLen, dashesLen;
    int32_t i, j, k;

    // In addition to the other invariants, a line-break iterator should make sure that
    // it doesn't break around the non-breaking characters,
    // EXCEPT breaking after a space takes precedence over not breaking before
    // a non-breaking char.  So says TR 14.
    UnicodeString noBreak = CharsToUnicodeString("\\u00a0\\u2007\\u2011\\ufeff");
    UnicodeString work("aaa");   // three-character scratch string: <test char><no-break or dash><test char>
    testCharsLen = testChars.length();
    noBreakLen = noBreak.length();
    for (i = 0; i < testCharsLen; i++) {
        UChar c = testChars[i];
        // Line/paragraph terminators and control characters are not used as the leading character.
        if (c == '\r' || c == '\n' || c == 0x2029 || c == 0x2028 || c == 0x0003 ||
            u_charType(c) == U_CONTROL_CHAR) {
            continue;
        }
        work[0] = c;
        for (j = 0; j < noBreakLen; j++) {
            work[1] = noBreak[j];
            for (k = 0; k < testCharsLen; k++) {
                work[2] = testChars[k];
                e->setText(work);
                for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) {
                    // Only boundaries at offsets 1 and 2 are adjacent to the non-breaking
                    // character.  Skipping every other boundary up front also avoids
                    // indexing work[l - 1] with l == 0 for the boundary from first().
                    if (l != 1 && l != 2) {
                        continue;
                    }
                    UChar c1 = work[l - 1];
                    UChar c2 = work[l];
                    // A break right after a leading space is permitted.
                    if (c1 == 0x20 && l == 1) {
                        continue;
                    }
                    errln("Got break between U+" + UCharToUnicodeString(c1) +
                          " and U+" + UCharToUnicodeString(c2));
                    errCount++;
                    if (errCount >= 75) {
                        delete e;   // release the iterator before giving up early
                        return;
                    }
                }
            }
        }
    }

    // It does break after hyphens (Rule 15B from TR 14)
    // (unless they're followed by a digit, a non-spacing mark,
    // a currency symbol, a non-breaking space, or a line or paragraph separator,
    // or something of class BA, HY, NS, QU, GL, CL, EX, IS or SY from TR14 when the hyphen is \u002d).

    // This test is sufficiently screwed up that I'm largely disabling it.  TODO: fix it.  06/12/2002  AGH
    //
    UnicodeString dashes = CharsToUnicodeString("-\\u00ad\\u2010\\u2012\\u2013\\u2014");
    dashesLen = dashes.length();
    for (i = 0; i < testCharsLen; i++) {
        work[0] = testChars[i];
        for (j = 0; j < dashesLen; j++) {
            UChar c1 = work[1] = dashes[j];
            for (k = 0; k < testCharsLen; k++) {
                UChar c2 = work[2] = testChars[k];
                int8_t type = u_charType(c2);
                // Following characters after which no break is required.
                if (type == U_DECIMAL_DIGIT_NUMBER ||
                    type == U_OTHER_NUMBER ||
                    type == U_NON_SPACING_MARK ||
                    type == U_ENCLOSING_MARK ||
                    type == U_CURRENCY_SYMBOL ||
                    type == U_SPACE_SEPARATOR ||
                    type == U_DASH_PUNCTUATION ||
                    type == U_CONTROL_CHAR ||
                    type == U_FORMAT_CHAR ||
                    c2 == '\n' || c2 == '\r' || c2 == 0x2028 || c2 == 0x2029 ||
                    c2 == 0x0003 || c2 == 0x00a0 || c2 == 0x2007 || c2 == 0x2011 ||
                    c2 == 0xfeff)
                {
                    continue;
                }
                // If c1 == hyphen-minus, and ...
                if (c1 == 0x002d && (
                        c2 == 0x0021 ||   // !
                        c2 == 0x002c ||   // ,
                        c2 == 0x002d ||   // -
                        c2 == 0x002e ||   // .  (TR 14 class IS)
                        c2 == 0x0029 ||   // )
                        c2 == 0x003a ||   // :
                        c2 == 0x003b ||   // ;  (TR 14 class IS)
                        c2 == 0x005d ||   // ]
                        c2 == 0x007c ||   // |  (TR 14 class BA, rule 15)
                        c2 == 0x007d ||   // }
                        c2 == 0x0903 ||   // Devanagari sign visarga, combining, what's it doing in this test?
                        c2 == 0x093E ||   // Devanagari, combining, what's it doing in this test?
                        c2 == 0x093F ||   // Devanagari, combining, what's it doing in this test?
                        c2 == 0x0940 ||   // Devanagari, combining, what's it doing in this test?
                        c2 == 0x0949 ||   // Devanagari, combining, what's it doing in this test?
                        c2 == 0x0f3b ||   // Tibetan closing bracket
                        c2 == 0x3001 ||   // CJK closing bracket
                        c2 == 0x3002      // CJK closing bracket
                    )) {
                    continue;
                }

                e->setText(work);
                UBool saw2 = FALSE;
                for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) {
                    if (l == 2) {
                        saw2 = TRUE;
                        break;
                    }
                }
                if (!saw2) {
                    // TODO: This test is completely out of sync with the spec.  Fix it.
                    // NOTE: when this is re-enabled, the early return must release the iterator.
                    //   errln("Didn't get break between U+" + UCharToUnicodeString(work[1]) +
                    //         " and U+" + UCharToUnicodeString(work[2]));
                    //   errCount++;
                    //   if (errCount >= 75) {
                    //       delete e;
                    //       return;
                    //   }
                }
            }
        }
    }
    delete e;
#endif
}
|
||||
|
||||
|
||||
void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)
|
||||
|
@ -35,19 +35,7 @@ public:
|
||||
virtual ~RBBITest();
|
||||
|
||||
void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
|
||||
/**
|
||||
* Tests Hindi (Devanagari) character iteration
|
||||
**/
|
||||
void TestHindiCharacterBreak(void);
|
||||
/**
|
||||
* Tests Hindi (Devanagari) word iteration
|
||||
**/
|
||||
void TestHindiWordBreak(void);
|
||||
/**
|
||||
* Tests title-case break iteration on a short English sample
|
||||
**/
|
||||
void TestTitleBreak(void);
|
||||
|
||||
|
||||
/**
|
||||
* Tests rule status return values
|
||||
**/
|
||||
@ -65,7 +53,6 @@ public:
|
||||
void TestSentenceInvariants();
|
||||
void TestCharacterInvariants();
|
||||
void TestWordInvariants();
|
||||
void TestLineInvariants();
|
||||
void TestEmptyString();
|
||||
void TestGetAvailableLocales();
|
||||
void TestGetDisplayName();
|
||||
|
Loading…
Reference in New Issue
Block a user