ICU-2093 RBBI Tests updated; title break rules tweaked

X-SVN-Rev: 12025
This commit is contained in:
Andy Heninger 2003-05-20 18:38:41 +00:00
parent 0026f2d005
commit 1b2b7444d8
4 changed files with 116 additions and 658 deletions

View File

@ -8,22 +8,16 @@ $CaseIgnorable = [[:Mn:][:Me:][:Cf:][:Lm:][:Sk:] \u0027 \u00AD \u2019];
$OtherUpperCase = [\u2160-\u216f \u24b6-\u24cf];
$OtherLowerCase = [\u02b0-\u02b8 \u02c0-\u02c1 \u02e0-\u02e4 \u0345\u037a \u2170-\u217f \u24d0-\u24e9];
$Cased = [[:Lu:][:Lt:][:Ll:] $OtherUpperCase $OtherLowerCase - $CaseIgnorable];
$NotCased = [^ $Cased $CaseIgnorable];
#
# If the iterator was not stopped on a cased character, advance it to the first cased char
#
($NotCased | $CaseIgnorable)*;
$NotCased = [^ $Cased];
#
# If the iterator starts on a cased item, advance through all adjacent cased items plus
# any non-cased stuff, to reach the start of the next word.
# any non-cased stuff, to reach the start of the next (cased) word.
#
$Cased ($Cased | $CaseIgnorable)* $NotCased*;
($Cased | $CaseIgnorable)* $NotCased*;
#
# Reverse Rules
#
!$NotCased* ($Cased | $CaseIgnorable)* $NotCased*;
!$NotCased* ($Cased | $CaseIgnorable)* $NotCased?;

View File

@ -214,388 +214,12 @@ RBBITest::~RBBITest() {
delete cannedTestChars;
}
//--------------------------------------------------------------------
//tests default rules based character iteration
//--------------------------------------------------------------------
// Runs the character break iterator created for the default locale against a
// fixed table of text chunks.
//
// The chunks are accumulated in a BITestData object through ADD_DATACHUNK and
// then handed, together with the iterator, to generalIteratorTest().  Each
// chunk is presumably the text expected between two adjacent boundaries
// (see the "stay together" notes below) -- confirm against generalIteratorTest().
//
// The iterator is owned by this function and is released on every path that
// is reached after it has been successfully created.
void RBBITest::TestDefaultRuleBasedCharacterIteration()
{
    // RuleBasedBreakIterator* rbbi=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance();
    logln((UnicodeString)"Testing the RBBI for character iteration by using default rules");
    //fetch the rules used to create the above RuleBasedBreakIterator
    // UnicodeString defaultRules=rbbi->getRules();
    // RuleBasedCharacterIterator charIterDefault = new RuleBasedBreakIterator(defaultRules);
    UErrorCode status=U_ZERO_ERROR;
    RuleBasedBreakIterator* charIterDefault=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
    if(U_FAILURE(status)){
        errln("FAIL : in construction");
        return;
    }

    BITestData chardata(status);
    ADD_DATACHUNK(chardata, NULL, 0, status); // Starting break
    ADD_DATACHUNK(chardata, "H", 0, status);
    ADD_DATACHUNK(chardata, "e", 0, status);
    ADD_DATACHUNK(chardata, "l", 0, status);
    ADD_DATACHUNK(chardata, "l", 0, status);
    ADD_DATACHUNK(chardata, "o", 0, status);
    ADD_DATACHUNK(chardata, "e\\u0301", 0, status); //acuteE
    ADD_DATACHUNK(chardata, "&", 0, status);
    ADD_DATACHUNK(chardata, "e\\u0303", 0, status); //tildeE
    ADD_DATACHUNK(chardata, "S\\u0300", 0, status); //graveS
    ADD_DATACHUNK(chardata, "i\\u0301", 0, status); // acuteBelowI
    ADD_DATACHUNK(chardata, "m", 0, status);
    ADD_DATACHUNK(chardata, "p", 0, status);
    ADD_DATACHUNK(chardata, "l", 0, status);
    ADD_DATACHUNK(chardata, "e\\u0301", 0, status); // acuteE
    ADD_DATACHUNK(chardata, " ", 0, status);
    ADD_DATACHUNK(chardata, "s", 0, status);
    ADD_DATACHUNK(chardata, "a\\u0302", 0, status); // circumflexA
    ADD_DATACHUNK(chardata, "m", 0, status);
    ADD_DATACHUNK(chardata, "p", 0, status);
    ADD_DATACHUNK(chardata, "l", 0, status);
    ADD_DATACHUNK(chardata, "e\\u0303", 0, status); // tildeE
    ADD_DATACHUNK(chardata, ".", 0, status);
    ADD_DATACHUNK(chardata, "w", 0, status);
    ADD_DATACHUNK(chardata, "a\\u0302", 0, status); // circumflexA
    ADD_DATACHUNK(chardata, "w", 0, status);
    ADD_DATACHUNK(chardata, "a", 0, status);
    ADD_DATACHUNK(chardata, "f", 0, status);
    ADD_DATACHUNK(chardata, "q", 0, status);
    ADD_DATACHUNK(chardata, "\n", 0, status);
    ADD_DATACHUNK(chardata, "\r", 0, status);
    ADD_DATACHUNK(chardata, "\r\n", 0, status);
    ADD_DATACHUNK(chardata, "\n", 0, status);

    // Devanagari characters for Hindi support
    ADD_DATACHUNK(chardata, "\\u0906", 0, status); // Devanagari AA
    //ADD_DATACHUNK(chardata, "\\u093e\\u0901", 0); // Devanagari vowel sign AA + chandrabindu
    ADD_DATACHUNK(chardata, "\\u0906\\u0901", 0, status); // Devanagari AA + chandrabindu
    ADD_DATACHUNK(chardata, "\\u0915", 0, status); // Devanagari KA (listed as its own chunk)
    ADD_DATACHUNK(chardata, "\\u093e\\u0901", 0, status); // Devanagari vowel sign AA + chandrabindu
    ADD_DATACHUNK(chardata, "\\u0916\\u0947", 0, status); // Devanagari KHA + vowel sign E
    ADD_DATACHUNK(chardata, "\\u0938\\u0941\\u0902", 0, status); // Devanagari SA + vowel sign U + anusvara (bindu)
    ADD_DATACHUNK(chardata, "\\u0926", 0, status); // Devanagari consonant DA
    ADD_DATACHUNK(chardata, "\\u0930", 0, status); // Devanagari consonant RA
    ADD_DATACHUNK(chardata, "\\u0939", 0, status); // Devanagari HA (listed as its own chunk)
    ADD_DATACHUNK(chardata, "\\u094c", 0, status); // Devanagari vowel sign AU (U+094C)
    ADD_DATACHUNK(chardata, "\\u0964", 0, status); // Devanagari danda
    // end Hindi characters

    ADD_DATACHUNK(chardata, "A\\u0302", 0, status); //circumflexA
    ADD_DATACHUNK(chardata, "i\\u0301", 0, status); //acuteBelowI
    // conjoining jamo->..
    ADD_DATACHUNK(chardata, "\\u1109\\u1161\\u11bc", 0, status);
    ADD_DATACHUNK(chardata, "\\u1112\\u1161\\u11bc", 0, status);
    ADD_DATACHUNK(chardata, "\n", 0, status);
    ADD_DATACHUNK(chardata, "\r\n", 0, status); //keep CRLF sequences together
    ADD_DATACHUNK(chardata, "S\\u0300", 0, status); //graveS
    ADD_DATACHUNK(chardata, "i\\u0301", 0, status); //acuteBelowI
    ADD_DATACHUNK(chardata, "!", 0, status);

    // What follows is a string of Korean characters (I found it in the Yellow Pages
    // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
    // it correctly), first as precomposed syllables, and then as conjoining jamo.
    // Both sequences should be semantically identical and break the same way.
    // precomposed syllables...
    ADD_DATACHUNK(chardata, "\\uc0c1", 0, status);
    ADD_DATACHUNK(chardata, "\\ud56d", 0, status);
    ADD_DATACHUNK(chardata, " ", 0, status);
    ADD_DATACHUNK(chardata, "\\ud55c", 0, status);
    ADD_DATACHUNK(chardata, "\\uc778", 0, status);
    ADD_DATACHUNK(chardata, " ", 0, status);
    ADD_DATACHUNK(chardata, "\\uc5f0", 0, status);
    ADD_DATACHUNK(chardata, "\\ud569", 0, status);
    ADD_DATACHUNK(chardata, " ", 0, status);
    ADD_DATACHUNK(chardata, "\\uc7a5", 0, status);
    ADD_DATACHUNK(chardata, "\\ub85c", 0, status);
    ADD_DATACHUNK(chardata, "\\uad50", 0, status);
    ADD_DATACHUNK(chardata, "\\ud68c", 0, status);
    ADD_DATACHUNK(chardata, " ", 0, status);
    // conjoining jamo...
    ADD_DATACHUNK(chardata, "\\u1109\\u1161\\u11bc", 0, status);
    ADD_DATACHUNK(chardata, "\\u1112\\u1161\\u11bc", 0, status);
    ADD_DATACHUNK(chardata, " ", 0, status);
    ADD_DATACHUNK(chardata, "\\u1112\\u1161\\u11ab", 0, status);
    ADD_DATACHUNK(chardata, "\\u110b\\u1175\\u11ab", 0, status);
    ADD_DATACHUNK(chardata, " ", 0, status);
    ADD_DATACHUNK(chardata, "\\u110b\\u1167\\u11ab", 0, status);
    ADD_DATACHUNK(chardata, "\\u1112\\u1161\\u11b8", 0, status);
    ADD_DATACHUNK(chardata, " ", 0, status);
    ADD_DATACHUNK(chardata, "\\u110c\\u1161\\u11bc", 0, status);
    ADD_DATACHUNK(chardata, "\\u1105\\u1169", 0, status);
    ADD_DATACHUNK(chardata, "\\u1100\\u116d", 0, status);
    ADD_DATACHUNK(chardata, "\\u1112\\u116c", 0, status);

    // Surrogate pairs stay together
    ADD_DATACHUNK(chardata, "\\ud800\\udc00", 0, status);
    ADD_DATACHUNK(chardata, "\\udbff\\udfff", 0, status);
    ADD_DATACHUNK(chardata, "x", 0, status);

    // 0xffff is a legal character, and should not stop the break iterator early.
    // (Requires special casing in implementation, which is why it gets a test.)
    ADD_DATACHUNK(chardata, "\\uffff", 0, status);
    ADD_DATACHUNK(chardata, "\\uffff", 0, status);
    ADD_DATACHUNK(chardata, " ", 0, status);
    ADD_DATACHUNK(chardata, "a", 0, status);

    // Regression test for bug 1889
    ADD_DATACHUNK(chardata, "\\u0f40\\u0f7d", 0, status);
    ADD_DATACHUNK(chardata, "\\u0000", 0, status);
    ADD_DATACHUNK(chardata, "\\u0f7e", 0, status);
    // \u0f7d\u0000\u0f7e

    if(U_FAILURE(status)){
        errln("FAIL : in BITestData construction");
        // The iterator was created successfully above; release it so that
        // this early exit does not leak it.
        delete charIterDefault;
        return;
    }

    // Run the test...
    generalIteratorTest(*charIterDefault, chardata);
    delete charIterDefault;
    // delete rbbi;
}
// Tag values passed as the second argument of ADD_DATACHUNK in the word
// iteration test data.  In that table T_NUMBER accompanies numeric chunks,
// T_LETTER alphabetic words, T_H_OR_K Hiragana/Katakana runs and T_IDEO
// single ideographs; chunks such as spaces and punctuation use 0.
// NOTE(review): presumably these are the rule status values the word break
// rules are expected to report for each chunk -- confirm against the rules
// and generalIteratorTest().
static const int T_NUMBER = 100;
static const int T_LETTER = 200;
static const int T_H_OR_K = 300;
static const int T_IDEO = 400;
//--------------------------------------------------------------------
//tests default rules based word iteration
//--------------------------------------------------------------------
// Runs the word break iterator created for the default locale against a
// fixed table of text chunks.
//
// Each ADD_DATACHUNK call appends one chunk plus a tag (0, T_NUMBER, T_LETTER,
// T_H_OR_K or T_IDEO) to a BITestData object, which is then handed, together
// with the iterator, to generalIteratorTest().  Each chunk is presumably the
// text expected between two adjacent word boundaries -- confirm against
// generalIteratorTest().
//
// The iterator is owned by this function and is released on every path that
// is reached after it has been successfully created.
void RBBITest::TestDefaultRuleBasedWordIteration()
{
    logln((UnicodeString)"Testing the RBBI for word iteration using default rules");
    UErrorCode status=U_ZERO_ERROR;
    RuleBasedBreakIterator* wordIterDefault=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status);
    if(U_FAILURE(status)){
        errln("FAIL : in construction");
        return;
    }

    BITestData worddata(status);
    ADD_DATACHUNK(worddata, NULL, 0, status);
    ADD_DATACHUNK(worddata, "Write", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "wordrules", T_LETTER, status);
    ADD_DATACHUNK(worddata, ".", 0, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "123.456", T_NUMBER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "alpha\\u00adbeta\\u00adgamma", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u092f\\u0939", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u0939\\u093f" halfNA "\\u0926\\u0940", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u0939\\u0948", T_LETTER, status);
    //ADD_DATACHUNK(worddata, "\\u0964", 0); //danda followed by a space "\u0964->danda: hindi phrase separator"
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u0905\\u093e\\u092a", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u0938\\u093f\\u0916\\u094b\\u0917\\u0947", T_LETTER, status);
    ADD_DATACHUNK(worddata, "?", 0, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u0968\\u0966.\\u0969\\u096f", T_NUMBER, status); //hindi numbers
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u0967\\u0966\\u0966.\\u0966\\u0966", T_NUMBER, status); //postnumeric
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u20a8", 0, status); // //pre-number India currency symbol Rs->\\u20aD
    ADD_DATACHUNK(worddata, "\\u0967,\\u0967\\u0966\\u0966.\\u0966\\u0966", T_NUMBER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u0905\\u092e\\u091c", T_LETTER, status);
    ADD_DATACHUNK(worddata, "\n", 0, status);
    ADD_DATACHUNK(worddata, halfSA "\\u0935\\u0924\\u0902" deadTA "\\u0930", T_LETTER, status);
    ADD_DATACHUNK(worddata, "\r", 0, status);
    ADD_DATACHUNK(worddata, "It's", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "$", 0, status);
    ADD_DATACHUNK(worddata, "30.10", T_NUMBER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "12,34", T_NUMBER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u00A2", 0, status); //cent sign
    ADD_DATACHUNK(worddata, "\\u00A3", 0, status); //pound sign
    ADD_DATACHUNK(worddata, "\\u00A4", 0, status); //currency sign
    ADD_DATACHUNK(worddata, "\\u00A5", 0, status); //yen sign
    ADD_DATACHUNK(worddata, "alpha\\u05f3beta\\u05f4gamma", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "Badges", T_LETTER, status);
    ADD_DATACHUNK(worddata, "?", 0, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "BADGES", T_LETTER, status);
    ADD_DATACHUNK(worddata, "!", 0, status);
    ADD_DATACHUNK(worddata, "?", 0, status);
    ADD_DATACHUNK(worddata, "!", 0, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "We", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "don't", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "need", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "no", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "STINKING", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "BADGES", T_LETTER, status);
    ADD_DATACHUNK(worddata, "!", 0, status);
    ADD_DATACHUNK(worddata, "!", 0, status);
    ADD_DATACHUNK(worddata, "1000,233,456.000", T_NUMBER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "1,23.322", T_NUMBER, status);
    ADD_DATACHUNK(worddata, "%", 0, status);
    ADD_DATACHUNK(worddata, "123.1222", T_NUMBER, status);
    ADD_DATACHUNK(worddata, "$", 0, status);
    ADD_DATACHUNK(worddata, "123,000.20", T_NUMBER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "179.01", T_NUMBER, status);
    ADD_DATACHUNK(worddata, "%", 0, status);
    ADD_DATACHUNK(worddata, "X", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "Now", T_LETTER, status);
    ADD_DATACHUNK(worddata, "\r", 0, status);
    ADD_DATACHUNK(worddata, "is", T_LETTER, status);
    ADD_DATACHUNK(worddata, "\n", 0, status);
    ADD_DATACHUNK(worddata, "the", T_LETTER, status);
    ADD_DATACHUNK(worddata, "\r\n", 0, status);
    ADD_DATACHUNK(worddata, "time", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\uc5f0\\ud569", T_LETTER, status); // Hangul Syllables
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\uc7a5\\ub85c\\uad50\\ud68c", T_LETTER, status); // Hangul
    ADD_DATACHUNK(worddata, " ", 0, status);
    // conjoining jamo...
    ADD_DATACHUNK(worddata, "\\u1109\\u1161\\u11bc\\u1112\\u1161\\u11bc", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u1112\\u1161\\u11ab\\u110b\\u1175\\u11ab", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "Hello", T_LETTER, status);
    ADD_DATACHUNK(worddata, ",", 0, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "how", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "are", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "you", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);

    // Words containing non-BMP letters
    ADD_DATACHUNK(worddata, "abc\\U00010300", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "abc\\U0001044D", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "abc\\U0001D433", T_LETTER, status); //MATHEMATICAL BOLD SMALL Z
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "abc\\U0001D7C9", T_LETTER, status); //MATHEMATICAL SANS-SERIF BOLD ITALIC PI
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "abc", T_LETTER, status); // same test outside of letter range.
    ADD_DATACHUNK(worddata, "\\U0001D800", 0, status);
    ADD_DATACHUNK(worddata, "def", T_LETTER, status);
    ADD_DATACHUNK(worddata, "\\U0001D3FF", 0, status);
    ADD_DATACHUNK(worddata, " ", 0, status);

    // Hiragana & Katakana stay together, but separates from each other and Latin.
    // TODO: Hira and Kata ranges from UnicodeSet differ slightly from
    // what's in Unicode Scripts file. Investigate.
    ADD_DATACHUNK(worddata, "abc", T_LETTER, status);
    ADD_DATACHUNK(worddata, "\\u3041", T_H_OR_K, status); // Hiragana
    ADD_DATACHUNK(worddata, "\\u3094\\u0301", T_H_OR_K, status); // Hiragana
    ADD_DATACHUNK(worddata, "\\u309d", T_H_OR_K, status); // Hiragana
    ADD_DATACHUNK(worddata, "\\u30a1\\u30fd\\uff66\\uff9d", T_H_OR_K, status); // Katakana
    ADD_DATACHUNK(worddata, "def", T_LETTER, status);
    ADD_DATACHUNK(worddata, "#", 0, status);

    // Words with interior formatting characters
    ADD_DATACHUNK(worddata, "def\\u0301\\u070Fabc", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);

    // to test for bug #4097779
    ADD_DATACHUNK(worddata, "aa\\u0300a", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);

    // to test for bug #4098467
    // What follows is a string of Korean characters (I found it in the Yellow Pages
    // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
    // it correctly), first as precomposed syllables, and then as conjoining jamo.
    // Both sequences should be semantically identical and break the same way.
    // precomposed syllables...
    ADD_DATACHUNK(worddata, "\\uc0c1\\ud56d", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\ud55c\\uc778", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\uc5f0\\ud569", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\uc7a5\\ub85c\\uad50\\ud68c", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    // conjoining jamo...
    ADD_DATACHUNK(worddata, "\\u1109\\u1161\\u11bc\\u1112\\u1161\\u11bc", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u1112\\u1161\\u11ab\\u110b\\u1175\\u11ab", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u110b\\u1167\\u11ab\\u1112\\u1161\\u11b8", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u110c\\u1161\\u11bc\\u1105\\u1169\\u1100\\u116d\\u1112\\u116c", T_LETTER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);

    // this is a test for bug #4117554: the ideographic iteration mark (U+3005) should
    // count as a Kanji character for the purposes of word breaking
    ADD_DATACHUNK(worddata, "abc", T_LETTER, status);
    // Unicode TR29: Ideographs do NOT group together into words.
    //wordSelectionData->addElement(CharsToUnicodeString("\\u4e01\\u4e02\\u3005\\u4e03\\u4e03"));
    ADD_DATACHUNK(worddata, "\\u4e01", T_IDEO, status);
    ADD_DATACHUNK(worddata, "\\u4e02", T_IDEO, status);
    ADD_DATACHUNK(worddata, "\\u3005", T_LETTER, status); // TODO: 3005 is ideographic iteration mark
    // Treating as letter is according to TR.
    // Check whether this is really intended.
    ADD_DATACHUNK(worddata, "\\u4e03", T_IDEO, status);
    ADD_DATACHUNK(worddata, "\\u4e03", T_IDEO, status);
    ADD_DATACHUNK(worddata, "abc", T_LETTER, status);

    //
    // Try some words from other scripts.
    //
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u0391\\u0392\\u0393", T_LETTER, status); // Greek
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u0411\\u0412\\u0413", T_LETTER, status); // Cyrillic
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u05D0\\u05D1\\u05D2\\u0593", T_LETTER, status); // Hebrew
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u0627\\u0628\\u062A", T_LETTER, status); // Arabic
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u0661\\u0662\\u0663", T_NUMBER, status); // Arabic
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u10A0\\u10A1\\u10A2", T_LETTER, status); // Georgian
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "ABC", T_LETTER, status); // Latin

    if (U_FAILURE(status)){
        errln("FAIL : in BITestData construction");
        // The iterator was created successfully above; release it so that
        // this early exit does not leak it.
        delete wordIterDefault;
        return;
    }

    generalIteratorTest(*wordIterDefault, worddata);
    delete wordIterDefault;
}
@ -824,257 +448,6 @@ void RBBITest::TestStatusReturn() {
delete bi;
}
/*
//Bug: if there is no word break before and after danda when it is followed by a space
void RBBITest::TestDanda()
{
Vector *hindiWordData = new Vector();
//hindi
ADD_DATACHUNK(hindiWordData, CharsToUnicodeString("\\u092f\\u0939"), 0, status);
ADD_DATACHUNK(hindiWordData, " ", 0, status);
//Danda is similar to full stop, danda is a hindi phrase seperator.
//Make sure there is a word break before and after danda when it is followed by a space
//following fail----
ADD_DATACHUNK(hindiWordData, CharsToUnicodeString("\\u0939\\u0948"), 0, status);
// ADD_DATACHUNK(hindiWordData, CharsToUnicodeString("\\u0964"), 0); // devanagari danda
ADD_DATACHUNK(hindiWordData, " ", 0, status);
ADD_DATACHUNK(hindiWordData, CharsToUnicodeString("\\u092f\\u0939"), 0, status);
// ADD_DATACHUNK(hindiWordData, CharsToUnicodeString("\\u0965"), 0); //devanagari double danda
ADD_DATACHUNK(hindiWordData, " ", 0, status);
RuleBasedBreakIterator* e=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance();
generalIteratorTest(*e, hindiWordData);
delete e;
delete hindiWordData;
}
//Make sure the character wrapping is done correctly
void RBBITest::TestHindiCharacterWrapping()
{
Vector *hindicharData = new Vector();
//if the dead consonant RA precedes either a consonant or an independent vowel,
//then it is replaced by its superscript non-spacing mark
ADD_DATACHUNK(hindicharData, deadRA+ CharsToUnicodeString("\\u0917"), 0, status); //deadRA+devanagari consonant GA->GA+superRA
//following fail----
// ADD_DATACHUNK(hindicharData, deadRA+ CharsToUnicodeString("\\u0960"), 0); //deadRA+devanagari RRI->RRI+superRA
RuleBasedBreakIterator* e=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance();
generalIteratorTest(*e, hindicharData);
delete e;
delete hindicharData;
}*/
//----------------------------------------------------------------------------------
//adds rules for Telugu support and tests the behaviour of the character iterator of RBBI
//----------------------------------------------------------------------------------
/*void RBBITest::TestTeluguRuleBasedCharacterIteration()
{
logln((UnicodeString)"Testing the RBBI by adding rules for Telugu(Indian Language) Support");
//get the default rules
RuleBasedBreakIterator *rb= (RuleBasedBreakIterator*)BreakIterator::createCharacterInstance();
//additional rules for Telugu(Indian Language) support
UnicodeString crules1 = rb->getRules() + //default rules +
"<telvirama>=[\\u0c4d];" + //telugu virama
"<telVowelSign>=[\\u0c3e-\\u0c44\\u0c46\\u0c47\\u0c48\\u0c4a\\u0c4b\\u0c4c];" + //telugu dependent vowel signs
"<telConsonant>=[\\u0c15-\\u0c28\\u0c2a-\\u0c33\\u0c35-\\u0c39];" + //telugu consonants
"<telCharEnd>=[\\u0c02\\u0c03\\u0c55\\u0c56];" + //to create half forms and dead forms
"<telConjunct>=({<telConsonant><telvirama>{<zwj>}}<telConsonant>);" +
"<telConjunct>{<telVowelSign>}{<telCharEnd>};";
RuleBasedBreakIterator charIter=null;
charIter = new RuleBasedBreakIterator(crules1);
Vector *chardata = new Vector();
//behaviour of telugu characters from specified rules
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0c15"), 0, status); //telugu consonant KA
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0c30\\u0c47"), 0, status); //telugu consonant RA+telugu dependent vowel EE
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0c1b\\u0c3e"), 0, status); //telugu consonant CHA+telegu depenednt vowel AA
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0c17\\u0c48"), 0, status); //telegu consonant GA+teleugu dependent vowel AI
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0c17\\u0c46\\u0c56"), 0, status); //telugu consonant GA+telugu dependent vowel sign E+telugu AI length mark
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0c28\\u0c4d\\u200d\\u0c28"), 0, status); //telugu consonant NA+telugu virama+zwj=>halfNA+NA->NNA(dependent half-form)
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0c17\\u0c4d\\u0c30"), 0, status); //GA+deadRA(RA+telvirama)->GA+subRA->GRA
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0c66"), 0, status); //telugu digit
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0c37\\u0c4d\\u0c15"), 0, status); //deadSSA(SSA+telvirama)+KA+subSSA->KSHA
//behaviour of other characters from default rules
ADD_DATACHUNK(chardata, "h", 0, status);
ADD_DATACHUNK(chardata, CharsToUnicodeString("A\\u0302"), 0, status); // circumflexA
ADD_DATACHUNK(chardata, CharsToUnicodeString("i\\u0301"), 0, status); // acuteBelowI
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u1109\\u1161\\u11bc"), 0, status);
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u1112\\u1161\\u11bc"), 0, status);
ADD_DATACHUNK(chardata, "\n", 0, status);
ADD_DATACHUNK(chardata, "\r\n", 0, status);
generalIteratorTest(charIter, chardata);
delete charIter;
delete charData;
delete rb;
}
//--------------------------------------------------------------------
//tests the behaviour of character iteration of RBBI with custom rules
//--------------------------------------------------------------------
void RBBITest::TestCustomRuleBasedCharacterIteration()
{
logln((UnicodeString)"Testing the RBBI by using custom rules for character iteration");
UnicodeString crules2="<ignore>=[e];" + //ignore the character "e"
".;" +
"<devVowelSign>=[\\u093e-\\u094c\\u0962\\u0963];" + //devanagiri vowel = \\u093e tp \\u094c and \\u0962.\\u0963
"<devConsonant>=[\\u0915-\\u0939];" + //devanagiri consonant = \\u0915 to \\u0939
"<devConsonant>{<devVowelSign>};" ; //break at all places except the following
//devanagiri consonants+ devanagiri vowelsign
RuleBasedCharacterIterator charIterCustom = new RuleBasedBreakIterator(crules2);
Vector *chardata = new Vector();
ADD_DATACHUNK(chardata, "He", 0, status); //ignores 'e'
ADD_DATACHUNK(chardata, "l", 0, status);
ADD_DATACHUNK(chardata, "l", 0, status);
ADD_DATACHUNK(chardata, "oe", 0, status); //ignores 'e' hence wraps it into 'o' instead of wrapping with
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0301"), 0, status); //'\\u0301' to form 'acuteE '
ADD_DATACHUNK(chardata, "&e", 0, status); //ignores 'e' hence wraps it into '&' instead of wrapping with
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0303"), 0, status); //'\\u0303 to form 'tildaE'
//devanagiri characters
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0906"), 0, status); //devanagiri AA
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u093e"), 0, status); //devanagiri vowelsign AA:--breaks at \\u0901 which is devanagiri
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0901"), 0, status); //chandra bindhu since it is not mentioned in the rules
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0916\\u0947"), 0, status); //devanagiri KHA+vowelsign E
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0938\\u0941"), 0, status); //devanagiri SA+vowelsign U : - breaks at
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0902"), 0, status); //\\u0902 devanagiri anusvara since it is not mentioned in the rules
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0926"), 0, status); //devanagiri consonant DA
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0930"), 0, status); //devanagiri consonant RA
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0939\\u094c"), 0, status); //devanagiri HA+vowel sign AI
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0964"), 0, status); //devanagiri danda
// devanagiri chracters end
ADD_DATACHUNK(chardata, "A", 0, status); //breaks in between since it is not mentioned in the rules
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0302"), 0, status); // circumflexA
ADD_DATACHUNK(chardata, "i", 0, status); //breaks in between since not mentioned in the rules
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0301"), 0, status); // acuteBelowI
//Rules don't support conjoining jamo->->..
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u1109"), 0, status); //break at every character since rules
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u1161"), 0, status); //don't support conjoining jamo
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u11bc"), 0, status);
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u1112"), 0, status);
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u1161"), 0, status);
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u11bc"), 0, status);
ADD_DATACHUNK(chardata, "\n", 0, status);
ADD_DATACHUNK(chardata, "\r", 0, status); //doesn't keep CRLGF together since rules do not mention it
ADD_DATACHUNK(chardata, "\n", 0, status);
ADD_DATACHUNK(chardata, "S", 0, status); //graveS
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0300"), 0, status); //breaks in between since it is not mentioned in the rules
ADD_DATACHUNK(chardata, "i", 0, status); //acuteBelowI
ADD_DATACHUNK(chardata, CharsToUnicodeString("\\u0301"), 0, status); //breaks in between since it is not mentioned in the rules
generalIteratorTest(charIterCustom, chardata);
delete charIterCustom;
delete chardata;
}*/
/*//--------------------------------------------------------------------
//tests custom rules based word iteration
//--------------------------------------------------------------------
void RBBITest::TestCustomRuleBasedWordIteration(){
logln("(UnicodeString)Testing the RBBI by using custom rules for word iteration");
UnicodeString wrules1="<ignore>=[:Mn::Me::Cf:];" + //ignore non-spacing marks, enclosing marks, and format characters,
"<danda>=[\\u0964\\u0965];" + //Hindi Phrase seperator
"<let>=[:L::Mc:];" + //uppercase(Lu), lowercase(Ll), titlecase(Lt), modifier(Lm) letters, Mc-combining space mark
"<mid-word>=[:Pd:\\\"\\\'\\.];" + //dashes, quotation, apostraphes, period
"<ls>=[\\n\\u000c\\u2028\\u2029];" + //line separators: LF, FF, PS, and LS
"<ws>=[:Zs:\\t];" + //all space separators and the tab character
"<word>=((<let><let>*(<mid-word><let><let>*)*));" +
".;" + //break after every character, with the following exceptions
"{<word>};" +
"<ws>*{\\r}{<ls>}{<danda>};" ;
RuleBasedBreakIterator wordIterCustom = new RuleBasedBreakIterator(wrules1);
Vector *worddata = new Vector();
ADD_DATACHUNK(worddata, "Write", 0, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "wordrules", 0, status);
ADD_DATACHUNK(worddata, ".", 0, status);
ADD_DATACHUNK(worddata, " ", 0, status);
//play with hindi
ADD_DATACHUNK(worddata, CharsToUnicodeString("\\u092f\\u0939"), 0, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, CharsToUnicodeString("\\u0939\\u093f") + halfNA + CharsToUnicodeString("\\u0926\\u0940"), 0, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, CharsToUnicodeString("\\u0939\\u0948"), 0, status);
ADD_DATACHUNK(worddata, CharsToUnicodeString("\\u0964"), 0, status); //Danda is similar to full stop-> Danda followed by a space
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, CharsToUnicodeString("\\u0905\\u093e\\u092a"), 0, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, CharsToUnicodeString("\\u0938\\u093f\\u0916\\u094b\\u0917\\u0947"), 0, status);
ADD_DATACHUNK(worddata, "?", 0, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "It's", 0, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "$", 0, status);
ADD_DATACHUNK(worddata, "3", 0, status);
ADD_DATACHUNK(worddata, "0", 0, status);
ADD_DATACHUNK(worddata, ".", 0, status);
ADD_DATACHUNK(worddata, "1", 0, status);
ADD_DATACHUNK(worddata, "0", 0, status);
ADD_DATACHUNK(worddata, " ", 0, status);
// ADD_DATACHUNK(worddata, " ", 0);
generalIteratorTest(wordIterCustom, worddata);
delete wordIterCustom;
delete worddata;
}
//-------------------------------------------------------------------------------
// Adds extra rules to handle a limited set of abbreviations (Mr., Mrs., Ms.,
// Dr., U.S.) on top of the default word rules, then tests word iteration.
// NOTE(review): the block-comment terminator that follows this function's
// closing brace shows that the function sits inside a block comment, so it is
// presumably not compiled as written.
//-------------------------------------------------------------------------------
void RBBITest::TestAbbrRuleBasedWordIteration()
{
logln((UnicodeString)"Testing the RBBI for word iteration by adding rules to support abbreviation");
// Create the stock word-break iterator; its rule text is reused below.
RuleBasedBreakIterator *rb =(RuleBasedBreakIterator*)BreakIterator::createWordInstance();
// Rule source = <abbr> definition + the stock word rules + one extra rule that
// references <abbr>, <ws> and <word>. The expected chunks below (for example
// "Mr. George") keep an abbreviation together with the word that follows it.
UnicodeString wrules2="<abbr>=((Mr.)|(Mrs.)|(Ms.)|(Dr.)|(U.S.));" + // abbreviations.
rb->getRules() +
"{(<abbr><ws>)*<word>};";
// NOTE(review): "null" and a non-pointer variable assigned from "new" look like
// leftovers from a Java version of this test - confirm before re-enabling.
RuleBasedBreakIterator wordIter=null;
//try{
wordIter = new RuleBasedBreakIterator(wrules2);
// }catch(IllegalArgumentException iae){
// errln("ERROR: failed construction illegal rules");
// }
// Expected segmentation: each ADD_DATACHUNK entry is one chunk of the test text.
// NOTE(review): no declaration of "status" is visible in this function.
Vector *worddata = new Vector();
ADD_DATACHUNK(worddata, "Mr. George", 0, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "is", 0, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "from", 0, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "U.S. Navy", 0, status);
ADD_DATACHUNK(worddata, ".", 0, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "His", 0, status);
ADD_DATACHUNK(worddata, "\n", 0, status);
ADD_DATACHUNK(worddata, "friend", 0, status);
ADD_DATACHUNK(worddata, "\t", 0, status);
ADD_DATACHUNK(worddata, "Dr. Steven", 0, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "married", 0, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "Ms. Benneth", 0, status);
ADD_DATACHUNK(worddata, "!", 0, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "Mrs. Johnson", 0, status);
ADD_DATACHUNK(worddata, "\r\n", 0, status);
ADD_DATACHUNK(worddata, "paid", 0, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "$2,400.00", 0, status);
// Presumably checks the iterator's boundaries against the chunk list - the
// helper is defined outside this view.
generalIteratorTest(wordIter, worddata);
// Release everything allocated by this test.
delete wordIter;
delete worddata;
delete rb;
} */
void RBBITest::TestThaiLineBreak() {
UErrorCode status = U_ZERO_ERROR;
@ -1301,12 +674,12 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
if (exec) logln("TestSuite RuleBasedBreakIterator: ");
switch (index) {
case 0: name = "TestDefaultRuleBasedCharacterIteration";
if(exec) TestDefaultRuleBasedCharacterIteration(); break;
case 1: name = "TestExtended";
case 0: name = "TestExtended";
if(exec) TestExtended(); break;
case 2: name = "TestDefaultRuleBasedWordIteration";
if(exec) TestDefaultRuleBasedWordIteration(); break;
case 1: name = "";
break;
case 2: name = "";
break;
case 3: name = "";
break;
case 4: name = "TestHindiCharacterBreak";

View File

@ -35,14 +35,6 @@ public:
virtual ~RBBITest();
void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
/**
* Tests default rules based character iteration
**/
void TestDefaultRuleBasedCharacterIteration(void);
/**
* Tests default rules based word iteration
**/
void TestDefaultRuleBasedWordIteration(void);
/**
* Tests Hindi (Devanagari) character iteration
**/

View File

@ -39,10 +39,12 @@
# Surrogates
<data>•\U00011000•\U00010020•\U00010000\N{COMBINING MACRON}•</data>
<data>•\ud800\udc00•\udbff\udfff•a•</data>
# Extend (Combining chars) combine.
<data>•A\N{COMBINING GRAVE ACCENT}•B•</data>
<data>•\N{GREEK SMALL LETTER MU}\N{COMBINING LOW LINE}\N{COMBINING HORN}•</data>
<data>•a\u0301•b\u0302•c\u0303•d\u0304•e\u0305•f\u0306•g\u0307•h\u0308•i\u0309•</data>
<data>•a\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304•</data>
@ -64,11 +66,22 @@
# Hindi combining chars. (An old test)
<data>•भ••ा•\u0930•\u0924• •\u0938\u0941\u0902•\u0926•\u0930•
•\u0939•\u094c•\u0964•</data>
<data>•\u0916\u0947•\u0938\u0941\u0902•\u0926•\u0930•\u0939•\u094c•\u0964•</data>
# Bug 1587. Tamil. \u0baa\u0bc1 should be two separate characters, even though
# Hyangmi would prefer that it be one.
<data>•\u0baa•\u0bc1•\u0baa•\u0bc1•</data>
# Regression test for bug 1889
<data>•\u0f40\u0f7d•\u0000•\u0f7e•</data>
# 0xffff is a legal character, and should not stop the break iterator early.
# (Requires special casing in implementation, which is why it gets a test.)
<data>•\uffff•\uffff• •a•</data>
########################################################################################
#
#
@ -90,18 +103,73 @@
#
<word>
<data>•This<200> •is<200> •a<200> •word<200> •break<200>.• •Isn't<200> •it<200>?• •2.25<100></data>
<data>•This<200> •is<200> •a<200> •word<200> •break<200>.• •Isn't<200> •it<200>?• •2.25<100></data>
<sent>
<data>•This\n•</data>
<data>•Hello! •how are you? •I'am fine. •Thankyou. •How are you \
doing? •This\n• costs $20,00,000. •</data>
<line>
<data>•Hello! •how\r\n• •(are)\r• •you? •I'am •fine- •Thankyou. •foo\u00a0bar
•How, •are, •you? •This, •costs •$20,00,000.•</data>
#
# Data originally from TestDefaultRuleBasedWordIteration()
#
<data>•Write<200> •wordrules<200>.• •123.456<100> •alpha\u00adbeta\u00adgamma<200> •\u092f\u0939<200> •</data>
<data>• •\u0939\u093f\u0928\u094d\u200d\u0926\u0940<200> •\u0939\u0948<200> •\u0905\u093e\u092a<200> •\u0938\u093f\u0916\u094b\u0917\u0947<200>?•</data>
#Hindi Numbers
<data>• •\u0968\u0966.\u0969\u096f<100> •\u0967\u0966\u0966.\u0966\u0966<100> •\N{RUPEE SIGN}•\u0967,\u0967\u0966\u0966.\u0966\u0966<100> • •\u0905\u092e\u091c<200>\n•</data>
<data>•\u0938\u094d\u200d\u0935\u0924\u0902deadTA\u0930<200>\r•It's<200> •$•30.10<100> •12,34<100>¢•£•¤•¥•alpha\u05f3beta\u05f4gamma<200> •</data>
<data>•Badges<200>?• •BADGES<200>!•?•!• •We<200> •don't<200> •need<200> •no<200> •STINKING<200> •BADGES<200>!•!•1000,233,456.000<100> •1,23.322<100>%•123.1222<100>$•123,000.20<100> •179.01<100>%•X<200> •Now<200>\r•is<200>\n•the<200>\r\n•time<200> •</data>
#Hangul
<data>•\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •Hello<200>,• •how<200> •are<200> •you<200> •</data>
# Words containing non-BMP letters
<data>•abc\U00010300<200> •abc\N{DESERET SMALL LETTER ENG}<200> •abc\N{MATHEMATICAL BOLD SMALL Z}<200> •abc\N{MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL}<200> •</data>
# Unassigned code points
<data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data>
# Hiragana & Katakana stay together, but separate from each other and from Latin.
<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data>
# Words with interior formatting characters
<data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}<200> •</data>
# to test for bug #4097779
<data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>
# to test for bug #4098467
# What follows is a string of Korean characters (I found it in the Yellow Pages
# ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
# it correctly), first as precomposed syllables, and then as conjoining jamo.
# Both sequences should be semantically identical and break the same way.
# precomposed syllables...
<data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data>
<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data>
#
# Try some words from other scripts.
#
# Try some words from other scripts.
# Greek, Cyrillic, Hebrew, Arabic (letters), Arabic-Indic digits, Georgian, Latin
#
<data>•ΑΒΓ<200> •БВГ<200> •אבג֓<200> •ابت<200> •١٢٣<100> •\u10A0\u10A1\u10A2<200> •ABC<200> •</data>
#
# Hindi word break tests, imported from the old RBBI tests.
# An historical note: a much earlier version of ICU break iterators had a number
# of special case rules for Hindi, which were tested by an earlier version of
# this test data. The current RBBI rules do not special case Hindi in
# any way, making this test data much less significant.
#
<data>•\u0917\u092a\u00ad\u0936\u092a<200>!•\u092f\u0939<200> •\u0939\u093f\u0928\u094d\u200d\u0926\u0940<200> •\u0939\u0948<200> •\u0905\u093e\u092a<200> •\u0938\u093f\u0916\u094b\u0917\u0947<200>?•\n•:•\u092a\u094d\u0930\u093e\u092f\u0903<200>
•\u0935\u0930\u094d\u0937\u093e<200>\r\n•\u092a\u094d\u0930\u0915\u093e\u0936<200>,•\u0924\u0941\u092e\u093e\u0930\u094b<200> •\u092e\u093f\u0924\u094d\u0930<200> •\u0915\u093e<200> •\u092a\u0924\u094d\u0930<200> •\u092a\u095d\u094b<200> •\u0938\u094d\u0924\u094d\u0930\u093f<200>.• •\u0968\u0966.\u0969\u096f<100> •\u0967\u0966\u0966.\u0966\u0966<100>\u20a8•\u0967,\u0967\u0966\u0966.\u0966\u0966<100> •\u0905\u092e\u091c<200>\n•\u0938\u094d\u200d\u0935\u0924\u0902\u0924\u094d\u0930<200>\r•</data>
########################################################################################
#
@ -117,6 +185,13 @@ doing? •This\n• costs $20,00,000. •</data>
#
<sent>
<sent>
<data>•This\n•</data>
<data>•Hello! •how are you? •I'am fine. •Thankyou. •How are you \
doing? •This\n• costs $20,00,000. •</data>
# Sentence ending in a quote.
<data>•"Sentence ending with a quote." •Bye.•</data>
@ -246,6 +321,10 @@ What is the proper use of the abbreviation pp.•? •Yes, I am definatelly 12"
<data>•Multi-•Level •example •of •a •semi-•idiotic •non-•sensical •(non-•important) •sentence.
•Hi •Hello •How\n•are\r•you\u2028•fine.\t•good. •Now\r•is\n•the\r\n•time\n•\r•for\r•\r•all•</data>
<line>
<data>•Hello! •how\r\n• •(are)\r• •you? •I'am •fine- •Thankyou. •foo\u00a0bar
•How, •are, •you? •This, •costs •$20,00,000.•</data>
# test for bug #4068133
#
<data>•\u96f6•\u4e00\u3002•\u4e8c\u3001•\u4e09\u3002\u3001•\u56db\u3001\u3002\u3001•\u4e94,•\u516d.•\u4e03.\u3001,\u3002•\u516b•</data>
@ -287,3 +366,23 @@ What is the proper use of the abbreviation pp.•? •Yes, I am definatelly 12"
# Greek, Cyrillic, Hebrew, Arabic (letters), Arabic-Indic digits, Georgian, Latin
#
<data>•ΑΒΓ •БВГ •אבג֓ •ابت •١٢٣ •\u10A0\u10A1\u10A2 •ABC •</data>
########################################################################################
#
#
# T i t l e B o u n d a r y T e s t s
#
#
##########################################################################################
<title>
<data>•Here •is •a •short •sample •sentence. •And •another.•</data>
<data>•HERE •IS •A •SHORT •SAMPLE •SENTENCE. •AND •ANOTHER.•</data>
<data>• •Start •and •end •with •spaces •</data>
<data>•Include 123 456 ^& •some 54332 •numbers 4445•abc123•abc •ending 1223 •</data>
<data>•Combining\u0301 \u0301•ma\u0306rks •bye •</data>
<data>•123 •Start •with •a •number.•</data>