ICU-2093 RBBI Tests updated; title break rules tweaked
X-SVN-Rev: 12025
This commit is contained in:
parent
0026f2d005
commit
1b2b7444d8
@ -8,22 +8,16 @@ $CaseIgnorable = [[:Mn:][:Me:][:Cf:][:Lm:][:Sk:] \u0027 \u00AD \u2019];
|
||||
$OtherUpperCase = [\u2160-\u216f \u24b6-\u24cf];
|
||||
$OtherLowerCase = [\u02b0-\u02b8 \u02c0-\u02c1 \u02e0-\u02e4 \u0345\u037a \u2170-\u217f \u24d0-\u24e9];
|
||||
$Cased = [[:Lu:][:Lt:][:Ll:] $OtherUpperCase $OtherLowerCase - $CaseIgnorable];
|
||||
$NotCased = [^ $Cased $CaseIgnorable];
|
||||
|
||||
#
|
||||
# If the iterator was not stopped on a cased character, advance it to the first cased char
|
||||
#
|
||||
($NotCased | $CaseIgnorable)*;
|
||||
$NotCased = [^ $Cased];
|
||||
|
||||
#
|
||||
# If the iterator starts on a cased item, advance through all adjacent cased items plus
|
||||
# any non-cased stuff, to reach the start of the next word.
|
||||
# any non-cased stuff, to reach the start of the next (cased) word.
|
||||
#
|
||||
$Cased ($Cased | $CaseIgnorable)* $NotCased*;
|
||||
|
||||
($Cased | $CaseIgnorable)* $NotCased*;
|
||||
|
||||
#
|
||||
# Reverse Rules
|
||||
#
|
||||
!$NotCased* ($Cased | $CaseIgnorable)* $NotCased*;
|
||||
!$NotCased* ($Cased | $CaseIgnorable)* $NotCased?;
|
||||
|
||||
|
@ -214,388 +214,12 @@ RBBITest::~RBBITest() {
|
||||
delete cannedTestChars;
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
//tests default rules based character iteration
|
||||
//--------------------------------------------------------------------
|
||||
void RBBITest::TestDefaultRuleBasedCharacterIteration()
|
||||
{
|
||||
// RuleBasedBreakIterator* rbbi=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance();
|
||||
logln((UnicodeString)"Testing the RBBI for character iteration by using default rules");
|
||||
//fetch the rules used to create the above RuleBasedBreakIterator
|
||||
// UnicodeString defaultRules=rbbi->getRules();
|
||||
// RuleBasedCharacterIterator charIterDefault = new RuleBasedBreakIterator(defaultRules);
|
||||
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
RuleBasedBreakIterator* charIterDefault=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
|
||||
if(U_FAILURE(status)){
|
||||
errln("FAIL : in construction");
|
||||
return;
|
||||
}
|
||||
|
||||
BITestData chardata(status);
|
||||
|
||||
ADD_DATACHUNK(chardata, NULL, 0, status); // Starting break
|
||||
ADD_DATACHUNK(chardata, "H", 0, status);
|
||||
ADD_DATACHUNK(chardata, "e", 0, status);
|
||||
ADD_DATACHUNK(chardata, "l", 0, status);
|
||||
ADD_DATACHUNK(chardata, "l", 0, status);
|
||||
ADD_DATACHUNK(chardata, "o", 0, status);
|
||||
ADD_DATACHUNK(chardata, "e\\u0301", 0, status); //acuteE
|
||||
ADD_DATACHUNK(chardata, "&", 0, status);
|
||||
ADD_DATACHUNK(chardata, "e\\u0303", 0, status); //tildaE
|
||||
|
||||
ADD_DATACHUNK(chardata, "S\\u0300", 0, status); //graveS
|
||||
ADD_DATACHUNK(chardata, "i\\u0301", 0, status); // acuteBelowI
|
||||
ADD_DATACHUNK(chardata, "m", 0, status);
|
||||
ADD_DATACHUNK(chardata, "p", 0, status);
|
||||
ADD_DATACHUNK(chardata, "l", 0, status);
|
||||
ADD_DATACHUNK(chardata, "e\\u0301", 0, status); // acuteE
|
||||
ADD_DATACHUNK(chardata, " ", 0, status);
|
||||
ADD_DATACHUNK(chardata, "s", 0, status);
|
||||
ADD_DATACHUNK(chardata, "a\\u0302", 0, status); // circumflexA
|
||||
ADD_DATACHUNK(chardata, "m", 0, status);
|
||||
ADD_DATACHUNK(chardata, "p", 0, status);
|
||||
ADD_DATACHUNK(chardata, "l", 0, status);
|
||||
ADD_DATACHUNK(chardata, "e\\u0303", 0, status); // tildeE
|
||||
ADD_DATACHUNK(chardata, ".", 0, status);
|
||||
ADD_DATACHUNK(chardata, "w", 0, status);
|
||||
ADD_DATACHUNK(chardata, "a\\u0302", 0, status); // circumflexA
|
||||
ADD_DATACHUNK(chardata, "w", 0, status);
|
||||
ADD_DATACHUNK(chardata, "a", 0, status);
|
||||
ADD_DATACHUNK(chardata, "f", 0, status);
|
||||
ADD_DATACHUNK(chardata, "q", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\n", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\r", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\r\n", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\n", 0, status);
|
||||
|
||||
//devanagiri characters for Hindi support
|
||||
ADD_DATACHUNK(chardata, "\\u0906", 0, status); //devanagiri AA
|
||||
//ADD_DATACHUNK(chardata, "\\u093e\\u0901", 0); //devanagiri vowelsign AA+ chandrabindhu
|
||||
ADD_DATACHUNK(chardata, "\\u0906\\u0901", 0, status); // Devanagari AA + chandrabindu
|
||||
ADD_DATACHUNK(chardata, "\\u0915", 0, status); // Devanagari KA + AA vowelsign + chandrabindu
|
||||
ADD_DATACHUNK(chardata, "\\u093e\\u0901", 0, status); // Devanagari KA + AA vowelsign + chandrabindu
|
||||
|
||||
ADD_DATACHUNK(chardata, "\\u0916\\u0947", 0, status); //devanagiri KHA+vowelsign E
|
||||
ADD_DATACHUNK(chardata, "\\u0938\\u0941\\u0902", 0, status); //devanagiri SA+vowelsign U + anusvara(bindu)
|
||||
ADD_DATACHUNK(chardata, "\\u0926", 0, status); //devanagiri consonant DA
|
||||
ADD_DATACHUNK(chardata, "\\u0930", 0, status); //devanagiri consonant RA
|
||||
ADD_DATACHUNK(chardata, "\\u0939", 0, status); //devanagiri HA+vowel sign AI
|
||||
ADD_DATACHUNK(chardata, "\\u094c", 0, status); //devanagiri HA+vowel sign AI
|
||||
ADD_DATACHUNK(chardata, "\\u0964", 0, status); //devanagiri danda
|
||||
//end hindi characters
|
||||
ADD_DATACHUNK(chardata, "A\\u0302", 0, status); //circumflexA
|
||||
ADD_DATACHUNK(chardata, "i\\u0301", 0, status); //acuteBelowI
|
||||
// conjoining jamo->..
|
||||
ADD_DATACHUNK(chardata, "\\u1109\\u1161\\u11bc", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\u1112\\u1161\\u11bc", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\n", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\r\n", 0, status); //keep CRLF sequences together
|
||||
ADD_DATACHUNK(chardata, "S\\u0300", 0, status); //graveS
|
||||
ADD_DATACHUNK(chardata, "i\\u0301", 0, status); //acuteBelowI
|
||||
ADD_DATACHUNK(chardata, "!", 0, status);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// What follows is a string of Korean characters (I found it in the Yellow Pages
|
||||
// ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
|
||||
// it correctly), first as precomposed syllables, and then as conjoining jamo.
|
||||
// Both sequences should be semantically identical and break the same way.
|
||||
// precomposed syllables...
|
||||
ADD_DATACHUNK(chardata, "\\uc0c1", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\ud56d", 0, status);
|
||||
ADD_DATACHUNK(chardata, " ", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\ud55c", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\uc778", 0, status);
|
||||
ADD_DATACHUNK(chardata, " ", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\uc5f0", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\ud569", 0, status);
|
||||
ADD_DATACHUNK(chardata, " ", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\uc7a5", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\ub85c", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\uad50", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\ud68c", 0, status);
|
||||
ADD_DATACHUNK(chardata, " ", 0, status);
|
||||
// conjoining jamo...
|
||||
ADD_DATACHUNK(chardata, "\\u1109\\u1161\\u11bc", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\u1112\\u1161\\u11bc", 0, status);
|
||||
ADD_DATACHUNK(chardata, " ", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\u1112\\u1161\\u11ab", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\u110b\\u1175\\u11ab", 0, status);
|
||||
ADD_DATACHUNK(chardata, " ", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\u110b\\u1167\\u11ab", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\u1112\\u1161\\u11b8", 0, status);
|
||||
ADD_DATACHUNK(chardata, " ", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\u110c\\u1161\\u11bc", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\u1105\\u1169", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\u1100\\u116d", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\u1112\\u116c", 0, status);
|
||||
|
||||
// Surrogate pairs stay together
|
||||
ADD_DATACHUNK(chardata, "\\ud800\\udc00", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\udbff\\udfff", 0, status);
|
||||
ADD_DATACHUNK(chardata, "x", 0, status);
|
||||
|
||||
// 0xffff is a legal character, and should not stop the break iterator early.
|
||||
// (Requires special casing in implementation, which is why it gets a test.)
|
||||
ADD_DATACHUNK(chardata, "\\uffff", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\uffff", 0, status);
|
||||
ADD_DATACHUNK(chardata, " ", 0, status);
|
||||
ADD_DATACHUNK(chardata, "a", 0, status);
|
||||
|
||||
// Regression test for bug 1889
|
||||
ADD_DATACHUNK(chardata, "\\u0f40\\u0f7d", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\u0000", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\\u0f7e", 0, status);
|
||||
// \u0f7d\u0000\u0f7e
|
||||
|
||||
if(U_FAILURE(status)){
|
||||
errln("FAIL : in BITestData construction");
|
||||
return;
|
||||
}
|
||||
// Run the test...
|
||||
generalIteratorTest(*charIterDefault, chardata);
|
||||
|
||||
delete charIterDefault;
|
||||
// delete rbbi;
|
||||
}
|
||||
|
||||
// Expected status values passed as the third argument of ADD_DATACHUNK in
// TestDefaultRuleBasedWordIteration(); chunks that are neither of these
// categories use 0 there.
// NOTE(review): presumably these mirror the rule status values reported by
// the default word break rules -- confirm against the rule source.
static const int T_NUMBER = 100;   // chunks made of digits, e.g. "123.456"
|
||||
static const int T_LETTER = 200;   // chunks made of letters, e.g. "Write"
|
||||
static const int T_H_OR_K = 300;   // Hiragana or Katakana chunks
|
||||
static const int T_IDEO = 400;     // single ideograph chunks
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
//tests default rules based word iteration
|
||||
//--------------------------------------------------------------------
|
||||
void RBBITest::TestDefaultRuleBasedWordIteration()
|
||||
{
|
||||
logln((UnicodeString)"Testing the RBBI for word iteration using default rules");
|
||||
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
RuleBasedBreakIterator* wordIterDefault=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status);
|
||||
if(U_FAILURE(status)){
|
||||
errln("FAIL : in construction");
|
||||
return;
|
||||
}
|
||||
|
||||
BITestData worddata(status);
|
||||
ADD_DATACHUNK(worddata, NULL, 0, status);
|
||||
ADD_DATACHUNK(worddata, "Write", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "wordrules", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, ".", 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "123.456", T_NUMBER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "alpha\\u00adbeta\\u00adgamma", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\u092f\\u0939", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\u0939\\u093f" halfNA "\\u0926\\u0940", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\u0939\\u0948", T_LETTER, status);
|
||||
//ADD_DATACHUNK(worddata, "\\u0964", 0); //danda followed by a space "\u0964->danda: hindi phrase seperator"
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\u0905\\u093e\\u092a", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\u0938\\u093f\\u0916\\u094b\\u0917\\u0947", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, "?", 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\u0968\\u0966.\\u0969\\u096f", T_NUMBER, status); //hindi numbers
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\u0967\\u0966\\u0966.\\u0966\\u0966", T_NUMBER, status); //postnumeric
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\u20a8", 0, status); // //pre-number India currency symbol Rs->\\u20aD
|
||||
ADD_DATACHUNK(worddata, "\\u0967,\\u0967\\u0966\\u0966.\\u0966\\u0966", T_NUMBER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\u0905\\u092e\\u091c", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, "\n", 0, status);
|
||||
ADD_DATACHUNK(worddata, halfSA "\\u0935\\u0924\\u0902" deadTA "\\u0930", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, "\r", 0, status);
|
||||
ADD_DATACHUNK(worddata, "It's", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "$", 0, status);
|
||||
ADD_DATACHUNK(worddata, "30.10", T_NUMBER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "12,34", T_NUMBER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\u00A2", 0, status); //cent sign
|
||||
ADD_DATACHUNK(worddata, "\\u00A3", 0, status); //pound sign
|
||||
ADD_DATACHUNK(worddata, "\\u00A4", 0, status); //currency sign
|
||||
ADD_DATACHUNK(worddata, "\\u00A5", 0, status); //yen sign
|
||||
ADD_DATACHUNK(worddata, "alpha\\u05f3beta\\u05f4gamma", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "Badges", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, "?", 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "BADGES", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, "!", 0, status);
|
||||
ADD_DATACHUNK(worddata, "?", 0, status);
|
||||
ADD_DATACHUNK(worddata, "!", 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "We", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "don't", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "need", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "no", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "STINKING", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "BADGES", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, "!", 0, status);
|
||||
ADD_DATACHUNK(worddata, "!", 0, status);
|
||||
|
||||
ADD_DATACHUNK(worddata, "1000,233,456.000", T_NUMBER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
|
||||
ADD_DATACHUNK(worddata, "1,23.322", T_NUMBER, status);
|
||||
ADD_DATACHUNK(worddata, "%", 0, status);
|
||||
ADD_DATACHUNK(worddata, "123.1222", T_NUMBER, status);
|
||||
ADD_DATACHUNK(worddata, "$", 0, status);
|
||||
ADD_DATACHUNK(worddata, "123,000.20", T_NUMBER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
|
||||
ADD_DATACHUNK(worddata, "179.01", T_NUMBER, status);
|
||||
ADD_DATACHUNK(worddata, "%", 0, status);
|
||||
ADD_DATACHUNK(worddata, "X", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "Now", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, "\r", 0, status);
|
||||
ADD_DATACHUNK(worddata, "is", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, "\n", 0, status);
|
||||
ADD_DATACHUNK(worddata, "the", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, "\r\n", 0, status);
|
||||
ADD_DATACHUNK(worddata, "time", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\uc5f0\\ud569", T_LETTER, status); // Hangul Syllables
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\uc7a5\\ub85c\\uad50\\ud68c", T_LETTER, status); // Hangul
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
// conjoining jamo...
|
||||
ADD_DATACHUNK(worddata, "\\u1109\\u1161\\u11bc\\u1112\\u1161\\u11bc", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\u1112\\u1161\\u11ab\\u110b\\u1175\\u11ab", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "Hello", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, ",", 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "how", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "are", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "you", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
|
||||
// Words containing non-BMP letters
|
||||
ADD_DATACHUNK(worddata, "abc\\U00010300", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "abc\\U0001044D", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "abc\\U0001D433", T_LETTER, status); //MATHEMATICAL BOLD SMALL Z
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "abc\\U0001D7C9", T_LETTER, status); //MATHEMATICAL SANS-SERIF BOLD ITALIC PI
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
|
||||
ADD_DATACHUNK(worddata, "abc", T_LETTER, status); // same test outside of letter range.
|
||||
ADD_DATACHUNK(worddata, "\\U0001D800", 0, status);
|
||||
ADD_DATACHUNK(worddata, "def", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, "\\U0001D3FF", 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
|
||||
// Hiragana & Katakana stay together, but separates from each other and Latin.
|
||||
// TODO: Hira and Kata ranges from UnicodeSet differ slightly from
|
||||
// what's in Unicode Scripts file. Investigate.
|
||||
ADD_DATACHUNK(worddata, "abc", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, "\\u3041", T_H_OR_K, status); // Hiragana
|
||||
ADD_DATACHUNK(worddata, "\\u3094\\u0301", T_H_OR_K, status); // Hiragana
|
||||
ADD_DATACHUNK(worddata, "\\u309d", T_H_OR_K, status); // Hiragana
|
||||
ADD_DATACHUNK(worddata, "\\u30a1\\u30fd\\uff66\\uff9d", T_H_OR_K, status); // Katakana
|
||||
ADD_DATACHUNK(worddata, "def", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, "#", 0, status);
|
||||
|
||||
// Words with interior formatting characters
|
||||
ADD_DATACHUNK(worddata, "def\\u0301\\u070Fabc", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
|
||||
// to test for bug #4097779
|
||||
ADD_DATACHUNK(worddata, "aa\\u0300a", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
|
||||
// to test for bug #4098467
|
||||
// What follows is a string of Korean characters (I found it in the Yellow Pages
|
||||
// ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
|
||||
// it correctly), first as precomposed syllables, and then as conjoining jamo.
|
||||
// Both sequences should be semantically identical and break the same way.
|
||||
// precomposed syllables...
|
||||
ADD_DATACHUNK(worddata, "\\uc0c1\\ud56d", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\ud55c\\uc778", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\uc5f0\\ud569", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\uc7a5\\ub85c\\uad50\\ud68c", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
// conjoining jamo...
|
||||
ADD_DATACHUNK(worddata, "\\u1109\\u1161\\u11bc\\u1112\\u1161\\u11bc", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\u1112\\u1161\\u11ab\\u110b\\u1175\\u11ab", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\u110b\\u1167\\u11ab\\u1112\\u1161\\u11b8", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\u110c\\u1161\\u11bc\\u1105\\u1169\\u1100\\u116d\\u1112\\u116c", T_LETTER, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
|
||||
// this is a test for bug #4117554: the ideographic iteration mark (U+3005) should
|
||||
// count as a Kanji character for the purposes of word breaking
|
||||
ADD_DATACHUNK(worddata, "abc", T_LETTER, status);
|
||||
// Unicode TR29: Ideographs do NOT group together into words.
|
||||
//wordSelectionData->addElement(CharsToUnicodeString("\\u4e01\\u4e02\\u3005\\u4e03\\u4e03"));
|
||||
ADD_DATACHUNK(worddata, "\\u4e01", T_IDEO, status);
|
||||
ADD_DATACHUNK(worddata, "\\u4e02", T_IDEO, status);
|
||||
ADD_DATACHUNK(worddata, "\\u3005", T_LETTER, status); // TODO: 3005 is ideographic iteration mark
|
||||
// Treating as letter is according to TR.
|
||||
// Check whether this is really intended.
|
||||
ADD_DATACHUNK(worddata, "\\u4e03", T_IDEO, status);
|
||||
ADD_DATACHUNK(worddata, "\\u4e03", T_IDEO, status);
|
||||
ADD_DATACHUNK(worddata, "abc", T_LETTER, status);
|
||||
|
||||
//
|
||||
// Try some words from other scripts.
|
||||
//
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\u0391\\u0392\\u0393", T_LETTER, status); // Greek
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\u0411\\u0412\\u0413", T_LETTER, status); // Cyrillic
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\u05D0\\u05D1\\u05D2\\u0593", T_LETTER, status); // Hebrew
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\u0627\\u0628\\u062A", T_LETTER, status); // Arabic
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\u0661\\u0662\\u0663", T_NUMBER, status); // Arabic
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\\u10A0\\u10A1\\u10A2", T_LETTER, status); // Georgian
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "ABC", T_LETTER, status); // Latin
|
||||
|
||||
|
||||
if (U_FAILURE(status)){
|
||||
errln("FAIL : in BITestData construction");
|
||||
return;
|
||||
}
|
||||
|
||||
generalIteratorTest(*wordIterDefault, worddata);
|
||||
|
||||
delete wordIterDefault;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@ -824,257 +448,6 @@ void RBBITest::TestStatusReturn() {
|
||||
delete bi;
|
||||
}
|
||||
|
||||
/*
|
||||
//Bug: if there is no word break before and after danda when it is followed by a space
|
||||
void RBBITest::TestDanda()
|
||||
{
|
||||
Vector *hindiWordData = new Vector();
|
||||
//hindi
|
||||
ADD_DATACHUNK(hindiWordData, CharsToUnicodeString("\\u092f\\u0939"), 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, " ", 0, status);
|
||||
//Danda is similar to a full stop; it is a Hindi phrase separator.
|
||||
//Make sure there is a word break before and after danda when it is followed by a space
|
||||
//following fail----
|
||||
ADD_DATACHUNK(hindiWordData, CharsToUnicodeString("\\u0939\\u0948"), 0, status);
|
||||
// ADD_DATACHUNK(hindiWordData, CharsToUnicodeString("\\u0964"), 0); // devanagari danda
|
||||
ADD_DATACHUNK(hindiWordData, " ", 0, status);
|
||||
ADD_DATACHUNK(hindiWordData, CharsToUnicodeString("\\u092f\\u0939"), 0, status);
|
||||
// ADD_DATACHUNK(hindiWordData, CharsToUnicodeString("\\u0965"), 0); //devanagari double danda
|
||||
ADD_DATACHUNK(hindiWordData, " ", 0, status);
|
||||
|
||||
RuleBasedBreakIterator* e=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance();
|
||||
generalIteratorTest(*e, hindiWordData);
|
||||
delete e;
|
||||
delete hindiWordData;
|
||||
}
|
||||
|
||||
//Make sure the character wrapping is done correctly
|
||||
void RBBITest::TestHindiCharacterWrapping()
|
||||
{
|
||||
Vector *hindicharData = new Vector();
|
||||
//if the dead consonant RA precedes either a consonant or an independent vowel,
|
||||
//then it is replaced by its superscript non-spacing mark
|
||||
ADD_DATACHUNK(hindicharData, deadRA+ CharsToUnicodeString("\\u0917"), 0, status); //deadRA+devanagari consonant GA->GA+superRA
|
||||
//following fail----
|
||||
// ADD_DATACHUNK(hindicharData, deadRA+ CharsToUnicodeString("\\u0960"), 0); //deadRA+devanagari RRI->RRI+superRA
|
||||
|
||||
RuleBasedBreakIterator* e=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance();
|
||||
generalIteratorTest(*e, hindicharData);
|
||||
delete e;
|
||||
delete hindicharData;
|
||||
|
||||
}*/
|
||||
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------
|
||||
//adds rules for Telugu support and tests the behaviour of the character iterator of RBBI
|
||||
//----------------------------------------------------------------------------------
|
||||
/*void RBBITest::TestTeluguRuleBasedCharacterIteration()
|
||||
{
|
||||
logln((UnicodeString)"Testing the RBBI by adding rules for Telugu(Indian Language) Support");
|
||||
//get the default rules
|
||||
RuleBasedBreakIterator *rb= (RuleBasedBreakIterator*)BreakIterator::createCharacterInstance();
|
||||
//additional rules for Telugu(Indian Language) support
|
||||
UnicodeString crules1 = rb->getRules() + //default rules +
|
||||
"<telvirama>=[\\u0c4d];" + //telugu virama
|
||||
"<telVowelSign>=[\\u0c3e-\\u0c44\\u0c46\\u0c47\\u0c48\\u0c4a\\u0c4b\\u0c4c];" + //telugu dependent vowel signs
|
||||
"<telConsonant>=[\\u0c15-\\u0c28\\u0c2a-\\u0c33\\u0c35-\\u0c39];" + //telugu consonants
|
||||
"<telCharEnd>=[\\u0c02\\u0c03\\u0c55\\u0c56];" + //to create half forms and dead forms
|
||||
"<telConjunct>=({<telConsonant><telvirama>{<zwj>}}<telConsonant>);" +
|
||||
"<telConjunct>{<telVowelSign>}{<telCharEnd>};";
|
||||
RuleBasedBreakIterator charIter=null;
|
||||
charIter = new RuleBasedBreakIterator(crules1);
|
||||
|
||||
Vector *chardata = new Vector();
|
||||
//behaviour of telugu characters from specified rules
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0c15"), 0, status); //telugu consonant KA
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0c30\\u0c47"), 0, status); //telugu consonant RA+telugu dependent vowel EE
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0c1b\\u0c3e"), 0, status); //telugu consonant CHA+telegu depenednt vowel AA
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0c17\\u0c48"), 0, status); //telegu consonant GA+teleugu dependent vowel AI
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0c17\\u0c46\\u0c56"), 0, status); //telugu consonant GA+telugu dependent vowel sign E+telugu AI length mark
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0c28\\u0c4d\\u200d\\u0c28"), 0, status); //telugu consonant NA+telugu virama+zwj=>halfNA+NA->NNA(dependent half-form)
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0c17\\u0c4d\\u0c30"), 0, status); //GA+deadRA(RA+telvirama)->GA+subRA->GRA
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0c66"), 0, status); //telugu digit
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0c37\\u0c4d\\u0c15"), 0, status); //deadSSA(SSA+telvirama)+KA+subSSA->KSHA
|
||||
//behaviour of other characters from default rules
|
||||
ADD_DATACHUNK(chardata, "h", 0, status);
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("A\\u0302"), 0, status); // circumflexA
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("i\\u0301"), 0, status); // acuteBelowI
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u1109\\u1161\\u11bc"), 0, status);
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u1112\\u1161\\u11bc"), 0, status);
|
||||
ADD_DATACHUNK(chardata, "\n", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\r\n", 0, status);
|
||||
|
||||
generalIteratorTest(charIter, chardata);
|
||||
|
||||
delete charIter;
|
||||
delete charData;
|
||||
delete rb;
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
//tests the behaviour of character iteration of RBBI with custom rules
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
void RBBITest::TestCustomRuleBasedCharacterIteration()
|
||||
{
|
||||
logln((UnicodeString)"Testing the RBBI by using custom rules for character iteration");
|
||||
|
||||
UnicodeString crules2="<ignore>=[e];" + //ignore the character "e"
|
||||
".;" +
|
||||
"<devVowelSign>=[\\u093e-\\u094c\\u0962\\u0963];" + //devanagiri vowel = \\u093e tp \\u094c and \\u0962.\\u0963
|
||||
"<devConsonant>=[\\u0915-\\u0939];" + //devanagiri consonant = \\u0915 to \\u0939
|
||||
"<devConsonant>{<devVowelSign>};" ; //break at all places except the following
|
||||
//devanagiri consonants+ devanagiri vowelsign
|
||||
|
||||
RuleBasedCharacterIterator charIterCustom = new RuleBasedBreakIterator(crules2);
|
||||
Vector *chardata = new Vector();
|
||||
ADD_DATACHUNK(chardata, "He", 0, status); //ignores 'e'
|
||||
ADD_DATACHUNK(chardata, "l", 0, status);
|
||||
ADD_DATACHUNK(chardata, "l", 0, status);
|
||||
ADD_DATACHUNK(chardata, "oe", 0, status); //ignores 'e' hence wraps it into 'o' instead of wrapping with
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0301"), 0, status); //'\\u0301' to form 'acuteE '
|
||||
ADD_DATACHUNK(chardata, "&e", 0, status); //ignores 'e' hence wraps it into '&' instead of wrapping with
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0303"), 0, status); //'\\u0303 to form 'tildaE'
|
||||
//devanagiri characters
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0906"), 0, status); //devanagiri AA
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u093e"), 0, status); //devanagiri vowelsign AA:--breaks at \\u0901 which is devanagiri
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0901"), 0, status); //chandra bindhu since it is not mentioned in the rules
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0916\\u0947"), 0, status); //devanagiri KHA+vowelsign E
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0938\\u0941"), 0, status); //devanagiri SA+vowelsign U : - breaks at
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0902"), 0, status); //\\u0902 devanagiri anusvara since it is not mentioned in the rules
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0926"), 0, status); //devanagiri consonant DA
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0930"), 0, status); //devanagiri consonant RA
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0939\\u094c"), 0, status); //devanagiri HA+vowel sign AI
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0964"), 0, status); //devanagiri danda
|
||||
// devanagiri chracters end
|
||||
ADD_DATACHUNK(chardata, "A", 0, status); //breaks in between since it is not mentioned in the rules
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0302"), 0, status); // circumflexA
|
||||
ADD_DATACHUNK(chardata, "i", 0, status); //breaks in between since not mentioned in the rules
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0301"), 0, status); // acuteBelowI
|
||||
//Rules don't support conjoining jamo->->..
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u1109"), 0, status); //break at every character since rules
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u1161"), 0, status); //don't support conjoining jamo
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u11bc"), 0, status);
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u1112"), 0, status);
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u1161"), 0, status);
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u11bc"), 0, status);
|
||||
ADD_DATACHUNK(chardata, "\n", 0, status);
|
||||
ADD_DATACHUNK(chardata, "\r", 0, status); //doesn't keep CRLF together since rules do not mention it
|
||||
ADD_DATACHUNK(chardata, "\n", 0, status);
|
||||
ADD_DATACHUNK(chardata, "S", 0, status); //graveS
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0300"), 0, status); //breaks in between since it is not mentioned in the rules
|
||||
ADD_DATACHUNK(chardata, "i", 0, status); //acuteBelowI
|
||||
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0301"), 0, status); //breaks in between since it is not mentioned in the rules
|
||||
generalIteratorTest(charIterCustom, chardata);
|
||||
|
||||
delete charIterCustom;
|
||||
delete chardata;
|
||||
}*/
|
||||
/*//--------------------------------------------------------------------
|
||||
//tests custom rules based word iteration
|
||||
//--------------------------------------------------------------------
|
||||
void RBBITest::TestCustomRuleBasedWordIteration(){
|
||||
logln("(UnicodeString)Testing the RBBI by using custom rules for word iteration");
|
||||
UnicodeString wrules1="<ignore>=[:Mn::Me::Cf:];" + //ignore non-spacing marks, enclosing marks, and format characters,
|
||||
"<danda>=[\\u0964\\u0965];" + //Hindi Phrase separator
|
||||
"<let>=[:L::Mc:];" + //uppercase(Lu), lowercase(Ll), titlecase(Lt), modifier(Lm) letters, Mc-combining space mark
|
||||
"<mid-word>=[:Pd:\\\"\\\'\\.];" + //dashes, quotation, apostrophes, period
|
||||
"<ls>=[\\n\\u000c\\u2028\\u2029];" + //line separators: LF, FF, PS, and LS
|
||||
"<ws>=[:Zs:\\t];" + //all space separators and the tab character
|
||||
"<word>=((<let><let>*(<mid-word><let><let>*)*));" +
|
||||
".;" + //break after every character, with the following exceptions
|
||||
"{<word>};" +
|
||||
"<ws>*{\\r}{<ls>}{<danda>};" ;
|
||||
|
||||
RuleBasedBreakIterator wordIterCustom = new RuleBasedBreakIterator(wrules1);
|
||||
Vector *worddata = new Vector();
|
||||
ADD_DATACHUNK(worddata, "Write", 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "wordrules", 0, status);
|
||||
ADD_DATACHUNK(worddata, ".", 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
//play with hindi
|
||||
ADD_DATACHUNK(worddata, CharsToUnicodeString("\\u092f\\u0939"), 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, CharsToUnicodeString("\\u0939\\u093f") + halfNA + CharsToUnicodeString("\\u0926\\u0940"), 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, CharsToUnicodeString("\\u0939\\u0948"), 0, status);
|
||||
ADD_DATACHUNK(worddata, CharsToUnicodeString("\\u0964"), 0, status); //Danda is similar to full stop-> Danda followed by a space
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, CharsToUnicodeString("\\u0905\\u093e\\u092a"), 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, CharsToUnicodeString("\\u0938\\u093f\\u0916\\u094b\\u0917\\u0947"), 0, status);
|
||||
ADD_DATACHUNK(worddata, "?", 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "It's", 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "$", 0, status);
|
||||
ADD_DATACHUNK(worddata, "3", 0, status);
|
||||
ADD_DATACHUNK(worddata, "0", 0, status);
|
||||
ADD_DATACHUNK(worddata, ".", 0, status);
|
||||
ADD_DATACHUNK(worddata, "1", 0, status);
|
||||
ADD_DATACHUNK(worddata, "0", 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
// ADD_DATACHUNK(worddata, " ", 0);
|
||||
generalIteratorTest(wordIterCustom, worddata);
|
||||
|
||||
delete wordIterCustom;
|
||||
delete worddata;
|
||||
}
|
||||
//-------------------------------------------------------------------------------
|
||||
//adds extra rules to deal with abbreviations (limited) and tests the word iteration
|
||||
//-------------------------------------------------------------------------------
|
||||
void RBBITest::TestAbbrRuleBasedWordIteration()
|
||||
{
|
||||
logln((UnicodeString)"Testing the RBBI for word iteration by adding rules to support abbreviation");
|
||||
RuleBasedBreakIterator *rb =(RuleBasedBreakIterator*)BreakIterator::createWordInstance();
|
||||
|
||||
UnicodeString wrules2="<abbr>=((Mr.)|(Mrs.)|(Ms.)|(Dr.)|(U.S.));" + // abbreviations.
|
||||
rb->getRules() +
|
||||
"{(<abbr><ws>)*<word>};";
|
||||
RuleBasedBreakIterator wordIter=null;
|
||||
//try{
|
||||
wordIter = new RuleBasedBreakIterator(wrules2);
|
||||
// }catch(IllegalArgumentException iae){
|
||||
// errln("ERROR: failed construction illegal rules");
|
||||
// }
|
||||
Vector *worddata = new Vector();
|
||||
ADD_DATACHUNK(worddata, "Mr. George", 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "is", 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "from", 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "U.S. Navy", 0, status);
|
||||
ADD_DATACHUNK(worddata, ".", 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "His", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\n", 0, status);
|
||||
ADD_DATACHUNK(worddata, "friend", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\t", 0, status);
|
||||
ADD_DATACHUNK(worddata, "Dr. Steven", 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "married", 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "Ms. Benneth", 0, status);
|
||||
ADD_DATACHUNK(worddata, "!", 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "Mrs. Johnson", 0, status);
|
||||
ADD_DATACHUNK(worddata, "\r\n", 0, status);
|
||||
ADD_DATACHUNK(worddata, "paid", 0, status);
|
||||
ADD_DATACHUNK(worddata, " ", 0, status);
|
||||
ADD_DATACHUNK(worddata, "$2,400.00", 0, status);
|
||||
generalIteratorTest(wordIter, worddata);
|
||||
|
||||
delete wordIter;
|
||||
delete worddata;
|
||||
delete rb;
|
||||
} */
|
||||
|
||||
|
||||
|
||||
void RBBITest::TestThaiLineBreak() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
@ -1301,12 +674,12 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
||||
if (exec) logln("TestSuite RuleBasedBreakIterator: ");
|
||||
switch (index) {
|
||||
|
||||
case 0: name = "TestDefaultRuleBasedCharacterIteration";
|
||||
if(exec) TestDefaultRuleBasedCharacterIteration(); break;
|
||||
case 1: name = "TestExtended";
|
||||
case 0: name = "TestExtended";
|
||||
if(exec) TestExtended(); break;
|
||||
case 2: name = "TestDefaultRuleBasedWordIteration";
|
||||
if(exec) TestDefaultRuleBasedWordIteration(); break;
|
||||
case 1: name = "";
|
||||
break;
|
||||
case 2: name = "";
|
||||
break;
|
||||
case 3: name = "";
|
||||
break;
|
||||
case 4: name = "TestHindiCharacterBreak";
|
||||
|
@ -35,14 +35,6 @@ public:
|
||||
virtual ~RBBITest();
|
||||
|
||||
void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
|
||||
/**
|
||||
* Tests default rules based character iteration
|
||||
**/
|
||||
void TestDefaultRuleBasedCharacterIteration(void);
|
||||
/**
|
||||
* Tests default rules based word iteration
|
||||
**/
|
||||
void TestDefaultRuleBasedWordIteration(void);
|
||||
/**
|
||||
* Tests Hindi (Devanagari) character iteration
|
||||
**/
|
||||
|
115
icu4c/source/test/testdata/rbbitst.txt
vendored
115
icu4c/source/test/testdata/rbbitst.txt
vendored
@ -39,10 +39,12 @@
|
||||
|
||||
# Surrogates
|
||||
<data>•\U00011000•\U00010020•\U00010000\N{COMBINING MACRON}•</data>
|
||||
<data>•\ud800\udc00•\udbff\udfff•a•</data>
|
||||
|
||||
# Extend (Combining chars) combine.
|
||||
<data>•A\N{COMBINING GRAVE ACCENT}•B•</data>
|
||||
<data>•\N{GREEK SMALL LETTER MU}\N{COMBINING LOW LINE}\N{COMBINING HORN}•</data>
|
||||
<data>•a\u0301•b\u0302•c\u0303•d\u0304•e\u0305•f\u0306•g\u0307•h\u0308•i\u0309•</data>
|
||||
|
||||
<data>•a\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304•</data>
|
||||
|
||||
@ -64,11 +66,22 @@
|
||||
# Hindi combining chars. (An old test)
|
||||
<data>•भ••ा•\u0930•\u0924• •\u0938\u0941\u0902•\u0926•\u0930•
|
||||
•\u0939•\u094c•\u0964•</data>
|
||||
<data>•\u0916\u0947•\u0938\u0941\u0902•\u0926•\u0930•\u0939•\u094c•\u0964•</data>
|
||||
|
||||
|
||||
# Bug 1587. Tamil. \u0baa\u0bc1 should be two separate characters, even though
|
||||
# Hyangmi would prefer that it be one.
|
||||
<data>•\u0baa•\u0bc1•\u0baa•\u0bc1•</data>
|
||||
|
||||
# Regression test for bug 1889
|
||||
<data>•\u0f40\u0f7d•\u0000•\u0f7e•</data>
|
||||
|
||||
|
||||
# 0xffff is a legal character, and should not stop the break iterator early.
|
||||
# (Requires special casing in implementation, which is why it gets a test.)
|
||||
<data>•\uffff•\uffff• •a•</data>
|
||||
|
||||
|
||||
########################################################################################
|
||||
#
|
||||
#
|
||||
@ -90,18 +103,73 @@
|
||||
#
|
||||
|
||||
<word>
|
||||
<data>•This<200> •is<200> •a<200> •word<200> •break<200>.• •Isn't<200> •it<200>?• •2.25<100></data>
|
||||
<data>•This<200> •is<200> •a<200> •word<200> •break<200>.• • •Isn't<200> •it<200>?• •2.25<100></data>
|
||||
|
||||
|
||||
<sent>
|
||||
<data>•This\n•</data>
|
||||
<data>•Hello! •how are you? •I'am fine. •Thankyou. •How are you \
|
||||
doing? •This\n• costs $20,00,000. •</data>
|
||||
|
||||
<line>
|
||||
<data>•Hello! •how\r\n• •(are)\r• •you? •I'am •fine- •Thankyou. •foo\u00a0bar
|
||||
•How, •are, •you? •This, •costs •$20,00,000.•</data>
|
||||
#
|
||||
# Data originally from TestDefaultRuleBasedWordIteration()
|
||||
#
|
||||
<data>•Write<200> •wordrules<200>.• •123.456<100> •alpha\u00adbeta\u00adgamma<200> •\u092f\u0939<200> •</data>
|
||||
<data>• •\u0939\u093f\u0928\u094d\u200d\u0926\u0940<200> •\u0939\u0948<200> •\u0905\u093e\u092a<200> •\u0938\u093f\u0916\u094b\u0917\u0947<200>?•</data>
|
||||
|
||||
#Hindi Numbers
|
||||
<data>• •\u0968\u0966.\u0969\u096f<100> •\u0967\u0966\u0966.\u0966\u0966<100> •\N{RUPEE SIGN}•\u0967,\u0967\u0966\u0966.\u0966\u0966<100> • •\u0905\u092e\u091c<200>\n•</data>
|
||||
|
||||
<data>•\u0938\u094d\u200d\u0935\u0924\u0902deadTA\u0930<200>\r•It's<200> •$•30.10<100> •12,34<100>¢•£•¤•¥•alpha\u05f3beta\u05f4gamma<200> •</data>
|
||||
|
||||
<data>•Badges<200>?• •BADGES<200>!•?•!• •We<200> •don't<200> •need<200> •no<200> •STINKING<200> •BADGES<200>!•!•1000,233,456.000<100> •1,23.322<100>%•123.1222<100>$•123,000.20<100> •179.01<100>%•X<200> •Now<200>\r•is<200>\n•the<200>\r\n•time<200> •</data>
|
||||
|
||||
#Hangul
|
||||
<data>•\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •Hello<200>,• •how<200> •are<200> •you<200> •</data>
|
||||
|
||||
|
||||
# Words containing non-BMP letters
|
||||
<data>•abc\U00010300<200> •abc\N{DESERET SMALL LETTER ENG}<200> •abc\N{MATHEMATICAL BOLD SMALL Z}<200> •abc\N{MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL}<200> •</data>
|
||||
|
||||
# Unassigned code points
|
||||
<data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data>
|
||||
|
||||
# Hiragana & Katakana stay together, but separate from each other and from Latin.
|
||||
<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data>
|
||||
|
||||
# Words with interior formatting characters
|
||||
<data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}<200> •</data>
|
||||
|
||||
# to test for bug #4097779
|
||||
<data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>
|
||||
|
||||
|
||||
# to test for bug #4098467
|
||||
# What follows is a string of Korean characters (I found it in the Yellow Pages
|
||||
# ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
|
||||
# it correctly), first as precomposed syllables, and then as conjoining jamo.
|
||||
# Both sequences should be semantically identical and break the same way.
|
||||
# precomposed syllables...
|
||||
<data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data>
|
||||
|
||||
<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data>
|
||||
|
||||
#
|
||||
# Try some words from other scripts.
|
||||
#
|
||||
|
||||
# Try some words from other scripts.
|
||||
# Greek, Cyrillic, Hebrew, Arabic, Arabic, Georgian, Latin
|
||||
#
|
||||
<data>•ΑΒΓ<200> •БВГ<200> •אבג֓<200> •ابت<200> •١٢٣<100> •\u10A0\u10A1\u10A2<200> •ABC<200> •</data>
|
||||
|
||||
|
||||
|
||||
#
|
||||
# Hindi word break tests, imported from the old RBBI tests.
|
||||
# An historical note: a much earlier version of ICU break iterators had a number
|
||||
# of special case rules for Hindi, which were tested by an earlier version of
|
||||
# this test data. The current RBBI rules do not special case Hindi in
|
||||
# any way, making this test data much less significant.
|
||||
#
|
||||
<data>•\u0917\u092a\u00ad\u0936\u092a<200>!•\u092f\u0939<200> •\u0939\u093f\u0928\u094d\u200d\u0926\u0940<200> •\u0939\u0948<200> •\u0905\u093e\u092a<200> •\u0938\u093f\u0916\u094b\u0917\u0947<200>?•\n•:•\u092a\u094d\u0930\u093e\u092f\u0903<200>
|
||||
•\u0935\u0930\u094d\u0937\u093e<200>\r\n•\u092a\u094d\u0930\u0915\u093e\u0936<200>,•\u0924\u0941\u092e\u093e\u0930\u094b<200> •\u092e\u093f\u0924\u094d\u0930<200> •\u0915\u093e<200> •\u092a\u0924\u094d\u0930<200> •\u092a\u095d\u094b<200> •\u0938\u094d\u0924\u094d\u0930\u093f<200>.• •\u0968\u0966.\u0969\u096f<100> •\u0967\u0966\u0966.\u0966\u0966<100>\u20a8•\u0967,\u0967\u0966\u0966.\u0966\u0966<100> •\u0905\u092e\u091c<200>\n•\u0938\u094d\u200d\u0935\u0924\u0902\u0924\u094d\u0930<200>\r•</data>
|
||||
|
||||
########################################################################################
|
||||
#
|
||||
@ -117,6 +185,13 @@ doing? •This\n• costs $20,00,000. •</data>
|
||||
#
|
||||
<sent>
|
||||
|
||||
|
||||
<sent>
|
||||
<data>•This\n•</data>
|
||||
<data>•Hello! •how are you? •I'am fine. •Thankyou. •How are you \
|
||||
doing? •This\n• costs $20,00,000. •</data>
|
||||
|
||||
|
||||
# Sentence ending in a quote.
|
||||
<data>•"Sentence ending with a quote." •Bye.•</data>
|
||||
|
||||
@ -246,6 +321,10 @@ What is the proper use of the abbreviation pp.•? •Yes, I am definatelly 12"
|
||||
<data>•Multi-•Level •example •of •a •semi-•idiotic •non-•sensical •(non-•important) •sentence.
|
||||
•Hi •Hello •How\n•are\r•you\u2028•fine.\t•good. •Now\r•is\n•the\r\n•time\n•\r•for\r•\r•all•</data>
|
||||
|
||||
<line>
|
||||
<data>•Hello! •how\r\n• •(are)\r• •you? •I'am •fine- •Thankyou. •foo\u00a0bar
|
||||
•How, •are, •you? •This, •costs •$20,00,000.•</data>
|
||||
|
||||
# test for bug #4068133
|
||||
#
|
||||
<data>•\u96f6•\u4e00\u3002•\u4e8c\u3001•\u4e09\u3002\u3001•\u56db\u3001\u3002\u3001•\u4e94,•\u516d.•\u4e03.\u3001,\u3002•\u516b•</data>
|
||||
@ -287,3 +366,23 @@ What is the proper use of the abbreviation pp.•? •Yes, I am definatelly 12"
|
||||
# Greek, Cyrillic, Hebrew, Arabic, Arabic, Georgian, Latin
|
||||
#
|
||||
<data>•ΑΒΓ •БВГ •אבג֓ •ابت •١٢٣ •\u10A0\u10A1\u10A2 •ABC •</data>
|
||||
|
||||
|
||||
########################################################################################
|
||||
#
|
||||
#
|
||||
# T i t l e B o u n d a r y T e s t s
|
||||
#
|
||||
#
|
||||
##########################################################################################
|
||||
<title>
|
||||
<data>•Here •is •a •short •sample •sentence. •And •another.•</data>
|
||||
<data>•HERE •IS •A •SHORT •SAMPLE •SENTENCE. •AND •ANOTHER.•</data>
|
||||
<data>• •Start •and •end •with •spaces •</data>
|
||||
<data>•Include 123 456 ^& •some 54332 •numbers 4445•abc123•abc •ending 1223 •</data>
|
||||
|
||||
<data>•Combining\u0301 \u0301•ma\u0306rks •bye •</data>
|
||||
<data>•123 •Start •with •a •number.•</data>
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user