ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests

into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
This commit is contained in:
Andy Heninger 2002-08-27 19:10:11 +00:00
parent f0bd5ecb0d
commit 4a1d1083b0
6 changed files with 927 additions and 1882 deletions

View File

@ -306,10 +306,6 @@ SOURCE=.\ittrans.cpp
# End Source File
# Begin Source File
SOURCE=.\ittxtbd.cpp
# End Source File
# Begin Source File
SOURCE=.\itutil.cpp
# End Source File
# Begin Source File
@ -683,10 +679,6 @@ SOURCE=.\ittrans.h
# End Source File
# Begin Source File
SOURCE=.\ittxtbd.h
# End Source File
# Begin Source File
SOURCE=.\itutil.h
# End Source File
# Begin Source File

View File

@ -1,5 +1,5 @@
/********************************************************************
* COPYRIGHT:
* COPYRIGHT:
* Copyright (c) 1998-2001, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -19,7 +19,6 @@
#include "itutil.h"
#include "tscoll.h"
#include "ittxtbd.h"
#include "itformat.h"
#include "itconv.h"
#include "ittrans.h"
@ -42,15 +41,15 @@
void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par )
{
switch (index) {
case 0: name = "utility";
if (exec) {
case 0: name = "utility";
if (exec) {
logln("TestSuite Utilities---"); logln();
IntlTestUtilities test;
callTest( test, par );
}
break;
case 1: name = "normalize";
case 1: name = "normalize";
if (exec) {
logln("TestSuite Normalize---"); logln();
IntlTestNormalize test;
@ -58,7 +57,7 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam
}
break;
case 2: name = "collate";
case 2: name = "collate";
if (exec) {
logln("TestSuite Collator---"); logln();
IntlTestCollator test;
@ -66,15 +65,11 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam
}
break;
case 3: name = "textbounds";
if (exec) {
logln("TestSuite TextBoundary---"); logln();
IntlTestTextBoundary test;
callTest( test, par );
}
case 3: name = "unused";
// Used to be text bounds.
break;
case 4: name = "format";
case 4: name = "format";
if (exec) {
logln("TestSuite Format---"); logln();
IntlTestFormat test;
@ -82,7 +77,7 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam
}
break;
case 5: name = "translit";
case 5: name = "translit";
if (exec) {
logln("TestSuite Transliterator---"); logln();
IntlTestTransliterator test;
@ -90,7 +85,7 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam
}
break;
case 6: name = "rbbi";
case 6: name = "rbbi";
if (exec) {
logln("TestSuite RuleBasedBreakIterator---"); logln();
IntlTestRBBI test;
@ -114,7 +109,7 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam
/* Only the C API exists */
#ifdef ICU_UNICODECONVERTER_USE_DEPRECATES
case 9: name = "convert";
case 9: name = "convert";
if (exec) {
logln("TestSuite Convert---"); logln();
IntlTestConvert test;

File diff suppressed because it is too large Load Diff

View File

@ -1,182 +0,0 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2001, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
#ifndef _INTLTESTTEXTBOUNDARY
#define _INTLTESTTEXTBOUNDARY
#include "intltest.h"
#include "unicode/brkiter.h"
class Vector;
class Enumeration;
/**
* Test the BreakIterator class and indirectly all related classes.
*
* Declares one public test method per break-iterator scenario, followed by
* the private helpers and canned data that those tests share.
* NOTE(review): only the declaration is visible in this header; the per-method
* notes below come from the original comments and the method names and should
* be confirmed against the implementation file.
*/
class IntlTestTextBoundary: public IntlTest {
public:
IntlTestTextBoundary();
virtual ~IntlTestTextBoundary();
/**
* Indexed test dispatcher. Takes a test index, an exec flag, a name
* out-parameter and an optional parameter string (defaults to NULL).
**/
void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
/**
* Test sentence break using generalIteratorTest()
**/
void TestSentenceIteration(void);
/**
* Test word break using generalIteratorTest()
**/
void TestWordIteration(void);
/**
* Test line break using generalIteratorTest()
**/
void TestLineIteration(void);
/**
* Test character break using generalIteratorTest()
**/
void TestCharacterIteration(void);
/**
* Test sentence break invariants.
* NOTE(review): the original comment read "using ()" with the helper name
* missing -- confirm which helper the implementation calls.
**/
void TestSentenceInvariants(void);
/**
* Test word break invariants.
* NOTE(review): the original comment said "sentence break Invariants using
* generalIteratorTest()", apparently copied from a neighbour -- confirm.
**/
void TestWordInvariants(void);
/**
* Test line break invariants.
* NOTE(review): the original comment said "sentence break Invariants using
* generalIteratorTest()", apparently copied from a neighbour -- confirm.
**/
void TestLineInvariants(void);
/**
* Test character break invariants.
* NOTE(review): the original comment said "sentence break Invariants using
* generalIteratorTest()", apparently copied from a neighbour -- confirm.
**/
void TestCharacterInvariants(void);
/**
* Test Japanese line break.
* NOTE(review): the original comment said "line break Invariants using
* generalIteratorTest()" -- confirm against the implementation.
**/
void TestJapaneseLineBreak(void);
/**
* Test Thai line break using generalIteratorTest()
**/
void TestThaiLineBreak(void);
/**
* Test mixed Thai (Thai with other languages like English) line break using generalIteratorTest()
**/
void TestMixedThaiLineBreak(void);
/**
* Test Thai Line break with Maiyamok using generalIteratorTest()
* The Thai maiyamok character is a shorthand symbol that means "repeat the previous
* word". Instead of appearing as a word unto itself, however, it's kept together
* with the word before it
**/
void TestMaiyamok(void);
/**
* Test Thai word break using generalIteratorTest()
**/
void TestThaiWordBreak(void);
/**
* Test behaviour of BreakIterator on an empty string
**/
void TestEmptyString(void);
/**
* Test BreakIterator::getAvailableLocales
**/
void TestGetAvailableLocales(void);
/**
* Test BreakIterator::getDisplayName
**/
void TestGetDisplayName(void);
/**
* Test methods preceding, following and isBoundary
**/
void TestPreceding(void);
/**
* NOTE(review): undocumented in the original; presumably a regression test
* for bug 4153072 (from the name) -- confirm.
**/
void TestBug4153072(void);
/**
* Test End Behaviour
* @bug 4068137
**/
void TestEndBehaviour(void);
/***********************/
private:
/**
* Internal methods to prepare test data
**/
void addTestWordData(void);
void addTestSentenceData(void);
void addTestLineData(void);
void addTestCharacterData(void);
UnicodeString createTestData(Enumeration* e);
/**
* Perform tests of BreakIterator forward and backward functionality
* on different kinds of iterators (word, sentence, line and character).
* It tests the methods first(), next(), current(), preceding(), following()
* previous() and isBoundary().
* It makes use of internal functions to achieve this.
**/
void generalIteratorTest(BreakIterator& bi, Vector* expectedResult);
/**
* Internal method to perform iteration and test the first() and next() functions
**/
Vector* testFirstAndNext(BreakIterator& bi, UnicodeString& text);
/**
* Internal method to perform iteration and test the last() and previous() functions
**/
Vector* testLastAndPrevious(BreakIterator& bi, UnicodeString& text);
/**
* Internal method to perform iteration and test the following() function
**/
void testFollowing(BreakIterator& bi, UnicodeString& text, int32_t *boundaries);
/**
* Internal method to perform iteration and test the preceding() function
**/
void testPreceding(BreakIterator& bi, UnicodeString& text, int32_t *boundaries);
/**
* Internal method to perform iteration and test the isBoundary() function
**/
void testIsBoundary(BreakIterator& bi, UnicodeString& text, int32_t *boundaries);
/**
* Internal method which does the comparison of expected and actual results.
**/
void compareFragmentLists(UnicodeString& f1Name, UnicodeString& f2Name, Vector* f1, Vector* f2);
/**
* Internal method to perform tests of BreakIterator multiple selection functionality
* on different kinds of iterators (word, sentence, line and character)
**/
void doMultipleSelectionTest(BreakIterator& iterator, UnicodeString& testText);
/**
* Internal method to perform tests of BreakIterator break Invariants
* on different kinds of iterators (word, sentence, line and character)
**/
void doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars);
/**
* Internal method to perform tests of BreakIterator other invariants
* on different kinds of iterators (word, sentence, line and character)
**/
void doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars);
/**
* Perform tests with short sample code
**/
void sample(BreakIterator& tb, UnicodeString& text, UnicodeString& title);
/**
* The vectors holding test data for testing
* different kinds of iterators (word, sentence, line and character)
**/
Vector* lineSelectionData;
Vector* sentenceSelectionData;
Vector* wordSelectionData;
Vector* characterSelectionData;
/**
* Canned test characters shared by all instances (static).
* NOTE(review): presumably consumed by the invariant tests -- confirm.
**/
static const UChar cannedTestArray[];
static UnicodeString *cannedTestChars;
};
#endif

View File

@ -166,11 +166,24 @@ void BITestData::clearResults() {
}
//--------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------
//
// RBBITest
// Canned Test Characters
//
//--------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------
// Canned UChar code units used as test input. The list ends with 0x0000,
// which serves as the terminator when the RBBITest constructor wraps the
// array in a UnicodeString.
static const UChar cannedTestArray[] = {
0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
};
// Shared canned test string (U+0000 followed by cannedTestArray's contents).
// Allocated by the RBBITest constructor and released by its destructor; the
// invariant tests dereference it, so it is only valid while an RBBITest exists.
static UnicodeString* cannedTestChars = 0;
#define halfNA "\\u0928\\u094d\\u200d"
#define halfSA "\\u0938\\u094d\\u200d"
@ -178,7 +191,23 @@ void BITestData::clearResults() {
#define halfKA "\\u0915\\u094d\\u200d"
#define deadTA "\\u0924\\u094d"
//--------------------------------------------------------------------------------------
//
// RBBITest constructor and destructor
//
//--------------------------------------------------------------------------------------
// Builds the shared canned test string: a leading U+0000 followed by the
// contents of cannedTestArray.
RBBITest::RBBITest() {
    UnicodeString temp(cannedTestArray);

    // cannedTestChars is a file-level static shared by every RBBITest
    // instance. Release whatever an earlier instance left behind so that
    // constructing a second RBBITest does not leak the first allocation
    // (deleting a null pointer is a no-op).
    delete cannedTestChars;
    cannedTestChars = new UnicodeString();

    // The array's trailing 0x0000 only terminates it, so U+0000 is added
    // explicitly -- presumably so that NUL itself is part of the test text.
    *cannedTestChars += (UChar)0x0000;
    *cannedTestChars += temp;
}

// Releases the shared canned test string.
RBBITest::~RBBITest() {
    delete cannedTestChars;
    // Reset the shared pointer so a later destructor (or constructor) of
    // another instance sees "nothing allocated" instead of a dangling pointer
    // that it would delete a second time.
    cannedTestChars = 0;
}
//--------------------------------------------------------------------
//tests default rules based character iteration
@ -209,6 +238,32 @@ void RBBITest::TestDefaultRuleBasedCharacterIteration()
ADD_DATACHUNK(chardata, "e\\u0301", 0, status); //acuteE
ADD_DATACHUNK(chardata, "&", 0, status);
ADD_DATACHUNK(chardata, "e\\u0303", 0, status); //tildaE
ADD_DATACHUNK(chardata, "S\\u0300", 0, status); //graveS
ADD_DATACHUNK(chardata, "i\\u0301", 0, status); // acuteBelowI
ADD_DATACHUNK(chardata, "m", 0, status);
ADD_DATACHUNK(chardata, "p", 0, status);
ADD_DATACHUNK(chardata, "l", 0, status);
ADD_DATACHUNK(chardata, "e\\u0301", 0, status); // acuteE
ADD_DATACHUNK(chardata, " ", 0, status);
ADD_DATACHUNK(chardata, "s", 0, status);
ADD_DATACHUNK(chardata, "a\\u0302", 0, status); // circumflexA
ADD_DATACHUNK(chardata, "m", 0, status);
ADD_DATACHUNK(chardata, "p", 0, status);
ADD_DATACHUNK(chardata, "l", 0, status);
ADD_DATACHUNK(chardata, "e\\u0303", 0, status); // tildeE
ADD_DATACHUNK(chardata, ".", 0, status);
ADD_DATACHUNK(chardata, "w", 0, status);
ADD_DATACHUNK(chardata, "a\\u0302", 0, status); // circumflexA
ADD_DATACHUNK(chardata, "w", 0, status);
ADD_DATACHUNK(chardata, "a", 0, status);
ADD_DATACHUNK(chardata, "f", 0, status);
ADD_DATACHUNK(chardata, "q", 0, status);
ADD_DATACHUNK(chardata, "\n", 0, status);
ADD_DATACHUNK(chardata, "\r", 0, status);
ADD_DATACHUNK(chardata, "\r\n", 0, status);
ADD_DATACHUNK(chardata, "\n", 0, status);
//devanagiri characters for Hindi support
ADD_DATACHUNK(chardata, "\\u0906", 0, status); //devanagiri AA
//ADD_DATACHUNK(chardata, "\\u093e\\u0901", 0); //devanagiri vowelsign AA+ chandrabindhu
@ -233,6 +288,10 @@ void RBBITest::TestDefaultRuleBasedCharacterIteration()
ADD_DATACHUNK(chardata, "i\\u0301", 0, status); //acuteBelowI
ADD_DATACHUNK(chardata, "!", 0, status);
// What follows is a string of Korean characters (I found it in the Yellow Pages
// ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
// it correctly), first as precomposed syllables, and then as conjoining jamo.
@ -354,6 +413,8 @@ void RBBITest::TestDefaultRuleBasedWordIteration()
ADD_DATACHUNK(worddata, "$", 0, status);
ADD_DATACHUNK(worddata, "30.10", T_NUMBER, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "12,34", T_NUMBER, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "\\u00A2", 0, status); //cent sign
ADD_DATACHUNK(worddata, "\\u00A3", 0, status); //pound sign
ADD_DATACHUNK(worddata, "\\u00A4", 0, status); //currency sign
@ -365,14 +426,33 @@ void RBBITest::TestDefaultRuleBasedWordIteration()
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "BADGES", T_LETTER, status);
ADD_DATACHUNK(worddata, "!", 0, status);
ADD_DATACHUNK(worddata, "?", 0, status);
ADD_DATACHUNK(worddata, "!", 0, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "We", T_LETTER, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "don't", T_LETTER, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "need", T_LETTER, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "no", T_LETTER, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "STINKING", T_LETTER, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "BADGES", T_LETTER, status);
ADD_DATACHUNK(worddata, "!", 0, status);
ADD_DATACHUNK(worddata, "!", 0, status);
ADD_DATACHUNK(worddata, "1000,233,456.000", T_NUMBER, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "1,23.322", T_NUMBER, status);
ADD_DATACHUNK(worddata, "%", 0, status);
ADD_DATACHUNK(worddata, "123.1222", T_NUMBER, status);
ADD_DATACHUNK(worddata, "$", 0, status);
ADD_DATACHUNK(worddata, "123,000.20", T_NUMBER, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "179.01", T_NUMBER, status);
ADD_DATACHUNK(worddata, "%", 0, status);
ADD_DATACHUNK(worddata, "X", T_LETTER, status);
@ -428,12 +508,54 @@ void RBBITest::TestDefaultRuleBasedWordIteration()
ADD_DATACHUNK(worddata, "\\u3094\\u0301", T_H_OR_K, status); // Hiragana
ADD_DATACHUNK(worddata, "\\u309d", T_H_OR_K, status); // Hiragana
ADD_DATACHUNK(worddata, "\\u30a1\\u30fd\\uff66\\uff9d", T_H_OR_K, status); // Katakana
// ADD_DATACHUNK(worddata, "def", T_LETTER, status); // TODO why does this fail???
ADD_DATACHUNK(worddata, ".", 0, status);
ADD_DATACHUNK(worddata, "def", T_LETTER, status);
ADD_DATACHUNK(worddata, "#", 0, status);
// Words with interior formatting characters
ADD_DATACHUNK(worddata, "def\\u0301\\u070Fabc", T_LETTER, status);
ADD_DATACHUNK(worddata, " ", 0, status);
// to test for bug #4097779
ADD_DATACHUNK(worddata, "aa\\u0300a", T_LETTER, status);
ADD_DATACHUNK(worddata, " ", 0, status);
// to test for bug #4098467
// What follows is a string of Korean characters (I found it in the Yellow Pages
// ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
// it correctly), first as precomposed syllables, and then as conjoining jamo.
// Both sequences should be semantically identical and break the same way.
// precomposed syllables...
ADD_DATACHUNK(worddata, "\\uc0c1\\ud56d", T_LETTER, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "\\ud55c\\uc778", T_LETTER, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "\\uc5f0\\ud569", T_LETTER, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "\\uc7a5\\ub85c\\uad50\\ud68c", T_LETTER, status);
ADD_DATACHUNK(worddata, " ", 0, status);
// conjoining jamo...
ADD_DATACHUNK(worddata, "\\u1109\\u1161\\u11bc\\u1112\\u1161\\u11bc", T_LETTER, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "\\u1112\\u1161\\u11ab\\u110b\\u1175\\u11ab", T_LETTER, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "\\u110b\\u1167\\u11ab\\u1112\\u1161\\u11b8", T_LETTER, status);
ADD_DATACHUNK(worddata, " ", 0, status);
ADD_DATACHUNK(worddata, "\\u110c\\u1161\\u11bc\\u1105\\u1169\\u1100\\u116d\\u1112\\u116c", T_LETTER, status);
ADD_DATACHUNK(worddata, " ", 0, status);
// this is a test for bug #4117554: the ideographic iteration mark (U+3005) should
// count as a Kanji character for the purposes of word breaking
ADD_DATACHUNK(worddata, "abc", T_LETTER, status);
// Unicode TR29: Ideographs do NOT group together into words.
//wordSelectionData->addElement(CharsToUnicodeString("\\u4e01\\u4e02\\u3005\\u4e03\\u4e03"));
ADD_DATACHUNK(worddata, "\\u4e01", T_IDEO, status);
ADD_DATACHUNK(worddata, "\\u4e02", T_IDEO, status);
ADD_DATACHUNK(worddata, "\\u3005", T_LETTER, status); // TODO: 3005 is ideographic iteration mark
// Treating as letter is according to TR.
// Check whether this is really intended.
ADD_DATACHUNK(worddata, "\\u4e03", T_IDEO, status);
ADD_DATACHUNK(worddata, "\\u4e03", T_IDEO, status);
ADD_DATACHUNK(worddata, "abc", T_LETTER, status);
if (U_FAILURE(status)){
errln("FAIL : in BITestData construction");
@ -531,6 +653,40 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
// opening punctuation
ADD_DATACHUNK(sentdata, "How do you do?", 0, status);
ADD_DATACHUNK(sentdata, "(fine).", 0, status);
// test for bug #4158381: Don't break sentence after period if it isn't
// followed by a space
ADD_DATACHUNK(sentdata, "Test <code>Flags.Flag</code> class. ", 0, status);
ADD_DATACHUNK(sentdata, "Another test.\\u2029", 0, status);
// test for bug #4158381: No breaks when there are no terminators around
ADD_DATACHUNK(sentdata, "<P>Provides a set of &quot;lightweight&quot; (all-java<FONT SIZE=\"-2\">"
"<SUP>TM</SUP></FONT> language) components that, to the maximum degree possible,"
"work the same on all platforms. ", 0, status);
ADD_DATACHUNK(sentdata, "Another test.\\u2029", 0, status);
// test for bug #4143071: Make sure sentences that end with digits
// work right
ADD_DATACHUNK(sentdata, "Today is the 27th of May, 1998. ", 0, status);
ADD_DATACHUNK(sentdata, "Tomorrow with be 28 May 1998. ", 0, status);
ADD_DATACHUNK(sentdata, "The day after will be the 30th.\\u2029", 0, status);
// test for bug #4152416: Make sure sentences ending with a capital
// letter are treated correctly
// Unicode TR29 reverses above bug: Don't break a sentence if the last word begins with an upper case letter.
ADD_DATACHUNK(sentdata, "The type of all primitive <code>boolean</code> values accessed in the "
"target VM. Calls to xxx will return an implementor of this interface. \\u2029", 0, status);
// test for bug #4152117: Make sure sentence breaking is handling
// punctuation correctly [COULD NOT REPRODUCE THIS BUG, BUT TEST IS
// HERE TO MAKE SURE IT DOESN'T CROP UP]
ADD_DATACHUNK(sentdata, "Constructs a randomly generated BigInteger, uniformly distributed "
"over the range <tt>0</tt> to <tt>(2<sup>numBits</sup> - 1)</tt>, inclusive. ", 0, status);
ADD_DATACHUNK(sentdata, "The uniformity of the distribution assumes that a fair source of random bits "
"is provided in <tt>rnd</tt>. ", 0, status);
ADD_DATACHUNK(sentdata, "Note that this constructor always constructs a non-negative biginteger. \n", 0, status);
ADD_DATACHUNK(sentdata, "Ahh abc. \n", 0, status);
//sentence breaks for hindi which used Devanagari script
//make sure there is sentence break after ?,danda(hindi phrase separator),fullstop followed by space and no break after \n \r
ADD_DATACHUNK(sentdata, "\\u0928\\u092e" halfSA
@ -1198,6 +1354,222 @@ void RBBITest::TestAbbrRuleBasedWordIteration()
delete rb;
} */
void RBBITest::TestThaiLineBreak() {
UErrorCode status = U_ZERO_ERROR;
BITestData thaiLineSelection(status);
// \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that
// represents elided letters at the end of a long word. It should be bound to
// the end of the word and not treated as an independent punctuation mark.
ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data
ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
// ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
// ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
// the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
// the one time where the paiyannoi occurs somewhere other than at the end
// of a word is in the Thai abbrevation for "etc.", which both begins and
// ends with a paiyannoi
ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
Locale("th"), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n");
return;
}
generalIteratorTest(*e, thaiLineSelection);
}
void RBBITest::TestMixedThaiLineBreak()
{
UErrorCode status = U_ZERO_ERROR;
BITestData thaiLineSelection(status);
ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data
// Arabic numerals should always be separated from surrounding Thai text
/*
ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status);
thaiLineSelection->addElement("39");
ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status);
// words in non-Thai scripts should always be separated from surrounding Thai text
ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e14", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e2d\\u0e1a", 0, status);
thaiLineSelection->addElement("Java");
ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e19", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e04\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21 ", 0, status);
// Thai numerals should always be separated from the text surrounding them
ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e53\\u0e59", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status);
// Thai text should interact correctly with punctuation and symbols
ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21", 0, status);
// ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28", 0, status);
// ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e17\\u0e22)", 0, status);
ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28\\u0e44\\u0e17\\u0e22)", 0, status);
// I believe the commented-out reading above to be the correct one, but this is what passes with our current dictionary
ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e33\\u0e01\\u0e31\\u0e14", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e1b\\u0e34\\u0e14", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e15\\u0e31\\u0e27\"", 0, status);
*/
// The Unicode Linebreak TR says do not break before or after quotes.
// So this test is changed ot not break around the quote.
// TODO: should Thai break around the around the quotes, like the original behavior here?
// ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\"", 0, status);
// ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e38\\u0e48\\u0e19", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\""
"\\u0e23\\u0e38\\u0e48\\u0e19", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e14\\u0e37\\u0e2d\\u0e19\\u0e21\\u0e34.", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e22.", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e35\\u0e49", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e32\\u0e04\\u0e32", 0, status);
ADD_DATACHUNK(thaiLineSelection, "$200", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e48\\u0e32", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19 ", 0, status);
ADD_DATACHUNK(thaiLineSelection, "(\"\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\").", 0, status);
RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n");
return;
}
generalIteratorTest(*e, thaiLineSelection);
}
void RBBITest::TestMaiyamok()
{
UErrorCode status = U_ZERO_ERROR;
BITestData thaiLineSelection(status);
ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data
// the Thai maiyamok character is a shorthand symbol that means "repeat the previous
// word". Instead of appearing as a word unto itself, however, it's kept together
// with the word before it
ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07\\u0e40\\u0e17\\u0e1e", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35\\u0e22\\u0e07", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
Locale("th"), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n");
return;
}
generalIteratorTest(*e, thaiLineSelection);
delete e;
}
void RBBITest::TestThaiWordBreak() {
UErrorCode status = U_ZERO_ERROR;
BITestData thaiWordSelection(status);
ADD_DATACHUNK(thaiWordSelection, NULL, 0, status); // Break at start of data
ADD_DATACHUNK(thaiWordSelection, "\\u0E1A\\u0E17", 0, status); //2
ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E35\\u0E48", 0, status); //5
ADD_DATACHUNK(thaiWordSelection, "\\u0E51", 0, status); //6
ADD_DATACHUNK(thaiWordSelection, "\\u0E1E\\u0E32\\u0E22\\u0E38", 0, status); //10
ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E0B\\u0E42\\u0E04\\u0E25\\u0E19", 0, status); //16
ADD_DATACHUNK(thaiWordSelection, "\\u000D\\u000A", 0, status); //18
// This is the correct result
//ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E14\\u0E42\\u0E23\\u0E18\\u0E35", 0, status); //24
//ADD_DATACHUNK(thaiWordSelection, "\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22", 0, status); //29
// and this is what the dictionary does...
ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E14", 0, status); // 20
ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E23\\u0E18\\u0E35\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22", 0, status); //29
ADD_DATACHUNK(thaiWordSelection, "\\u0E2D\\u0E22\\u0E39\\u0E48", 0, status); //33
// This is the correct result
//ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E48\\u0E32\\u0E21", 0, status); //37
//ADD_DATACHUNK(thaiWordSelection, "\\u0E01\\u0E25\\u0E32\\u0E07", 0, status); //41
// and this is what the dictionary does
ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E48\\u0E32\\u0E21\\u0E01\\u0E25\\u0E32\\u0E07", 0, status); //41
ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E38\\u0E48\\u0E07", 0, status); //45
ADD_DATACHUNK(thaiWordSelection, "\\u0E43\\u0E2B\\u0E0D\\u0E48", 0, status); //49
ADD_DATACHUNK(thaiWordSelection, "\\u0E43\\u0E19", 0, status); //51
// This is the correct result
//ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E04\\u0E19\\u0E0B\\u0E31\\u0E2A", 0, status); //57
//ADD_DATACHUNK(thaiWordSelection, "\\u0E01\\u0E31\\u0E1A", 0, status); //60
// and this is what the dictionary does
ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E04\\u0E19", 0, status); // 54
ADD_DATACHUNK(thaiWordSelection, "\\u0E0B\\u0E31\\u0E2A\\u0E01\\u0E31\\u0E1A", 0, status); //60
ADD_DATACHUNK(thaiWordSelection, "\\u0E25\\u0E38\\u0E07", 0, status); //63
// This is the correct result
//ADD_DATACHUNK(thaiWordSelection, "\\u0E40\\u0E2E\\u0E19\\u0E23\\u0E35", 0, status); //68
//ADD_DATACHUNK(thaiWordSelection, "\\u0E0A\\u0E32\\u0E27", 0, status); //71
//ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E23\\u0E48", 0, status); //74
//ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E25\\u0E30", 0, status); //77
// and this is what the dictionary does
ADD_DATACHUNK(thaiWordSelection, "\\u0E40\\u0E2E", 0, status); // 65
ADD_DATACHUNK(thaiWordSelection, "\\u0E19\\u0E23\\u0E35\\u0E0A\\u0E32\\u0E27\\u0E44\\u0E23\\u0E48\\u0E41\\u0E25\\u0E30", 0, status); //77
RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
Locale("th"), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for Thai locale in TestThaiWordBreak.\n");
return;
}
generalIteratorTest(*e, thaiWordSelection);
delete e;
}
//---------------------------------------------
// runIndexedTest
//---------------------------------------------
@ -1223,11 +1595,42 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
if(exec) TestTitleBreak(); break;
case 7: name = "TestStatusReturn";
if(exec) TestStatusReturn(); break;
case 8: name = "TestLineBreakData";
if(exec) TestLineBreakData(); break;
// case 6: name = "TestDanda()";
// if(exec) TestDanda(); break;
case 8: name = "TestLineBreakData";
if(exec) TestLineBreakData(); break;
case 9: name = "TestSentenceInvariants";
if(exec) TestSentenceInvariants(); break;
case 10: name = "TestCharacterInvariants";
if(exec) TestCharacterInvariants(); break;
case 11: name = "TestWordInvariants";
if(exec) TestWordInvariants(); break;
case 12: name = "TestEmptyString";
if(exec) TestEmptyString(); break;
case 13: name = "TestGetAvailableLocales";
if(exec) TestGetAvailableLocales(); break;
case 14: name = "TestGetDisplayName";
if(exec) TestGetDisplayName(); break;
case 15: name = "TestEndBehaviour";
if(exec) TestEndBehaviour(); break;
case 16: name = "TestBug4153072";
if(exec) TestBug4153072(); break;
case 17: name = "TestJapaneseLineBreak()";
if(exec) TestJapaneseLineBreak(); break;
case 18: name = "TestThaiLineBreak()";
if(exec) TestThaiLineBreak(); break;
case 19: name = "TestMixedThaiLineBreak()";
if(exec) TestMixedThaiLineBreak(); break;
case 20: name = "TestMaiyamok()";
if(exec) TestMaiyamok(); break;
case 21: name = "TestThaiWordBreak()";
if(exec) TestThaiWordBreak(); break;
// case 7: name = "TestHindiCharacterWrapping()";
// if(exec) TestHindiCharacterWrapping(); break;
// case 8: name = "TestCustomRuleBasedWordIteration";
@ -1486,6 +1889,488 @@ void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestD
}
//--------------------------------------------------------------------------------------------
//
// Break Iterator Invariants Tests
//
//--------------------------------------------------------------------------------------------
void RBBITest::TestCharacterInvariants()
{
UErrorCode status = U_ZERO_ERROR;
BreakIterator *e = BreakIterator::createCharacterInstance(Locale::getDefault(), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for default locale in TestCharacterInvariants.\n");
return;
}
UnicodeString s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");
doBreakInvariantTest(*e, s);
s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");
doOtherInvariantTest(*e, s);
delete e;
}
void RBBITest::TestWordInvariants()
{
UErrorCode status = U_ZERO_ERROR;
BreakIterator *e = BreakIterator::createWordInstance(Locale::getDefault(), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for default locale in TestWordInvariants.\n");
return;
}
UnicodeString s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");
doBreakInvariantTest(*e, s);
s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");
doOtherInvariantTest(*e, s);
delete e;
}
void RBBITest::TestSentenceInvariants()
{
UErrorCode status = U_ZERO_ERROR;
BreakIterator *e = BreakIterator::createSentenceInstance(Locale::getDefault(), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for default locale in TestSentenceInvariant.\n");
return;
}
UnicodeString s = *cannedTestChars + CharsToUnicodeString(".,\\u3001\\u3002\\u3041\\u3042\\u3043\\ufeff");
doOtherInvariantTest(*e, s);
delete e;
}
void RBBITest::TestLineInvariants()
{
UErrorCode status = U_ZERO_ERROR;
BreakIterator *e = BreakIterator::createLineInstance(Locale::getUS(), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for default locale in TestLineInvariants.\n");
return;
}
UnicodeString s = CharsToUnicodeString(".,;:\\u3001\\u3002\\u3041\\u3042\\u3043\\u3044\\u3045\\u30a3\\u4e00\\u4e01\\u4e02");
UnicodeString testChars = *cannedTestChars + s;
doBreakInvariantTest(*e, testChars);
doOtherInvariantTest(*e, testChars);
int32_t errCount = 0, testCharsLen, noBreakLen, dashesLen;
int32_t i, j, k;
// in addition to the other invariants, a line-break iterator should make sure that:
// it doesn't break around the non-breaking characters,
// EXCEPT breaking after a space takes precedence over not breaking before
// an non-breaking char. So says TR 14.
UnicodeString noBreak = CharsToUnicodeString("\\u00a0\\u2007\\u2011\\ufeff");
UnicodeString work("aaa");
testCharsLen = testChars.length();
noBreakLen = noBreak.length();
for (i = 0; i < testCharsLen; i++) {
UChar c = testChars[i];
if (c == '\r' || c == '\n' || c == 0x2029 || c == 0x2028 || c == 0x0003 ||
u_charType(c) == U_CONTROL_CHAR) {
continue;
}
work[0] = c;
for (j = 0; j < noBreakLen; j++) {
work[1] = noBreak[j];
for (k = 0; k < testCharsLen; k++) {
work[2] = testChars[k];
e->setText(work);
for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) {
UChar c1 = work[l - 1];
UChar c2 = work[l];
if (c1 == 0x20 && l == 1) {
continue;
}
if (l == 1 || l == 2) {
errln("Got break between U+" + UCharToUnicodeString(c1) +
" and U+" + UCharToUnicodeString(c2));
errCount++;
if (errCount >= 75)
return;
}
}
}
}
}
// it does break after hyphens (Rule 15B from TR 14
// (unless they're followed by a digit, a non-spacing mark,
// a currency symbol, a non-breaking space, or a line or paragraph separator
// or something of class BA, HY, NS, QU, GL, CL, EX, IS or SY from TR14 when the hyphen is /u002d
// This test is sufficiently screwed up that I'm largely disabling it. TODO: fix it. 06/12/2002 AGH
//
UnicodeString dashes = CharsToUnicodeString("-\\u00ad\\u2010\\u2012\\u2013\\u2014");
dashesLen = dashes.length();
for (i = 0; i < testCharsLen; i++) {
work[0] = testChars[i];
for (j = 0; j < dashesLen; j++) {
UChar c1 = work[1] = dashes[j];
for (k = 0; k < testCharsLen; k++) {
UChar c2 = work[2] = testChars[k];
int8_t type = u_charType(c2);
if (type == U_DECIMAL_DIGIT_NUMBER ||
type == U_OTHER_NUMBER ||
type == U_NON_SPACING_MARK ||
type == U_ENCLOSING_MARK ||
type == U_CURRENCY_SYMBOL ||
type == U_SPACE_SEPARATOR ||
type == U_DASH_PUNCTUATION ||
type == U_CONTROL_CHAR ||
type == U_FORMAT_CHAR ||
c2 == '\n' || c2 == '\r' || c2 == 0x2028 || c2 == 0x2029 ||
c2 == 0x0003 || c2 == 0x00a0 || c2 == 0x2007 || c2 == 0x2011 ||
c2 == 0xfeff)
{
continue;
}
// If c1 == hyphen-minus, and ...
if (c1 == 0x002d && (
c2 == 0x0021 || // !
c2 == 0x002c || // ,
c2 == 0x002d || // -
c2 == 0x002e || // . (TR 14 class IS)
c2 == 0x0029 || // )
c2 == 0x003a || // :
c2 == 0x003b || // ; (TR 14 class IS)
c2 == 0x005d || // ]
c2 == 0x007c || // | (TR 14 class BA, rule 15)
c2 == 0x007d || // }
c2 == 0x0903 || // Devanagari sign visarga, combining, what's it doing in this test?
c2 == 0x093E || // Devanagari , combining, what's it doing in this test?
c2 == 0x093F || // Devanagari , combining, what's it doing in this test?
c2 == 0x0940 || // Devanagari , combining, what's it doing in this test?
c2 == 0x0949 || // Devanagari , combining, what's it doing in this test?
c2 == 0x0f3b || // Tibetan closing bracket
c2 == 0x3001 || // CJK closing bracket
c2 == 0x3002 // CJK closing bracket
)) {
continue;
}
e->setText(work);
UBool saw2 = FALSE;
for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) {
if (l == 2) {
saw2 = TRUE;
break;
}
}
if (!saw2) {
// TODO: This test is completely out of sync with the spec. Fix it.
// errln("Didn't get break between U+" + UCharToUnicodeString(work[1]) +
// " and U+" + UCharToUnicodeString(work[2]));
// errCount++;
// if (errCount >= 75)
// return;
}
}
}
}
delete e;
}
// Verifies that the iterator always reports a break after CR (unless it is
// followed by LF, PS, LS or ETX), LF, PS (U+2029) and LS (U+2028).
//
// Every three-character string <x, breakChar, y> is tried, with x and y drawn
// from testChars, and the iterator is required to report a boundary at
// offset 2.  Each missing boundary is reported with errln(); the function
// returns after 75 failures.
//
//   tb         iterator under test; its text is replaced repeatedly
//   testChars  characters to place before and after the break character
void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)
{
    UnicodeString mandatoryBreaks = CharsToUnicodeString("\r\n\\u2029\\u2028");
    UnicodeString probe("aaa");
    const int32_t numBreakChars = mandatoryBreaks.length();
    const int32_t numTestChars  = testChars.length();
    int32_t failures = 0;

    for (int32_t b = 0; b < numBreakChars; ++b) {
        UChar breakChar = mandatoryBreaks[b];
        probe[1] = breakChar;
        const UBool breakCharIsControl = (u_charType(breakChar) == U_CONTROL_CHAR);

        for (int32_t lead = 0; lead < numTestChars; ++lead) {
            probe[0] = testChars[lead];

            for (int32_t trail = 0; trail < numTestChars; ++trail) {
                UChar follower = testChars[trail];
                probe[2] = follower;

                // CR followed by LF, PS, LS or ETX is not expected to break
                // after the CR, so those combinations are not checked.
                if (breakChar == '\r') {
                    if (follower == '\n' || follower == 0x2029 ||
                        follower == 0x2028 || follower == 0x0003) {
                        continue;
                    }
                }

                // Combining marks don't combine with controls.
                // TODO:  enhance test to verify that the break actually occurs,
                //        not just ignore the case.
                if (breakCharIsControl) {
                    if (u_charType(follower) == U_NON_SPACING_MARK ||
                        u_charType(follower) == U_ENCLOSING_MARK ||
                        u_charType(follower) == U_COMBINING_SPACING_MARK) {
                        continue;
                    }
                }

                // Walk the boundaries until offset 2 shows up or we run out.
                tb.setText(probe);
                int32_t pos = tb.first();
                while (pos != BreakIterator::DONE && pos != 2) {
                    pos = tb.next();
                }

                if (pos != 2) {
                    errln("No break between U+" + UCharToUnicodeString(breakChar)
                          + " and U+" + UCharToUnicodeString(follower));
                    ++failures;
                    if (failures >= 75) {
                        return;
                    }
                }
            }
        }
    }
}
// Verifies two "must not break" invariants for the given iterator:
//   1. no boundary between CR and LF, whatever characters surround the pair;
//   2. no boundary before a non-spacing or enclosing mark, unless the
//      preceding character is CR, LF, PS, LS, ETX, a control or a format
//      character (those predecessors are skipped).
//
// Each violation is reported with errln(); the function returns after 75
// failures counted across both passes.
//
//   tb         iterator under test; its text is replaced repeatedly
//   testChars  characters used to build the probe strings
void RBBITest::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars)
{
    const int32_t numChars = testChars.length();
    int32_t failures = 0;
    int32_t outer, inner;

    // Pass 1: <x, CR, LF, y> -- offset 2 lies between CR and LF.
    UnicodeString crlfProbe("a\r\na");
    for (outer = 0; outer < numChars; ++outer) {
        crlfProbe[0] = testChars[outer];
        for (inner = 0; inner < numChars; ++inner) {
            crlfProbe[3] = testChars[inner];
            tb.setText(crlfProbe);

            int32_t pos = tb.first();
            while (pos != BreakIterator::DONE) {
                if (pos == 2) {
                    errln("Break between CR and LF in string U+" + UCharToUnicodeString(crlfProbe[0]) +
                          ", U+d U+a U+" + UCharToUnicodeString(crlfProbe[3]));
                    ++failures;
                    if (failures >= 75) {
                        return;
                    }
                }
                pos = tb.next();
            }
        }
    }

    // Pass 2: <'a', base, mark, 'a'> -- offset 2 lies between base and mark.
    UnicodeString markProbe("aaaa");
    for (outer = 0; outer < numChars; ++outer) {
        UChar base = testChars[outer];

        // A break before a mark is legitimate after these characters.
        const UBool baseForcesBreak =
            base == '\n' || base == '\r' || base == 0x2029 || base == 0x2028 || base == 0x0003 ||
            u_charType(base) == U_CONTROL_CHAR || u_charType(base) == U_FORMAT_CHAR;
        if (baseForcesBreak) {
            continue;
        }
        markProbe[1] = base;

        for (inner = 0; inner < numChars; ++inner) {
            UChar mark = testChars[inner];
            int8_t markType = u_charType(mark);
            if (markType == U_NON_SPACING_MARK || markType == U_ENCLOSING_MARK) {
                markProbe[2] = mark;
                tb.setText(markProbe);

                int32_t pos = tb.first();
                while (pos != BreakIterator::DONE) {
                    if (pos == 2) {
                        errln("Break between U+" + UCharToUnicodeString(markProbe[1])
                              + " and U+" + UCharToUnicodeString(markProbe[2]));
                        ++failures;
                        if (failures >= 75) {
                            return;
                        }
                    }
                    pos = tb.next();
                }
            }
        }
    }
}
//---------------------------------------------
//
// other tests
//
//---------------------------------------------
void RBBITest::TestEmptyString()
{
UnicodeString text = "";
UErrorCode status = U_ZERO_ERROR;
BITestData x(status);
ADD_DATACHUNK(x, "", 0, status); // Break at start of data
RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n");
return;
}
generalIteratorTest(*bi, x);
delete bi;
}
void RBBITest::TestGetAvailableLocales()
{
int32_t locCount = 0;
const Locale* locList = BreakIterator::getAvailableLocales(locCount);
if (locCount == 0)
errln("getAvailableLocales() returned an empty list!");
// Just make sure that it's returning good memory.
for (int32_t i = 0; i < locCount; ++i) {
logln(locList[i].getName());
}
}
//Testing the BreakIterator::getDisplayName() function
void RBBITest::TestGetDisplayName()
{
UnicodeString result;
BreakIterator::getDisplayName(Locale::getUS(), result);
if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
+ result);
BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
if (result != "French (France)")
errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
+ result);
}
/**
* Test End Behaviour
* @bug 4068137
*/
void RBBITest::TestEndBehaviour()
{
UErrorCode status = U_ZERO_ERROR;
UnicodeString testString("boo.");
BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n");
return;
}
wb->setText(testString);
if (wb->first() != 0)
errln("Didn't get break at beginning of string.");
if (wb->next() != 3)
errln("Didn't get break before period in \"boo.\"");
if (wb->current() != 4 && wb->next() != 4)
errln("Didn't get break at end of string.");
delete wb;
}
/*
* @bug 4153072
*/
void RBBITest::TestBug4153072() {
UErrorCode status = U_ZERO_ERROR;
BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for default locale in TestBug4153072\n");
return;
}
UnicodeString str("...Hello, World!...");
int32_t begin = 3;
int32_t end = str.length() - 3;
UBool dummy;
StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
iter->adoptText(textIterator);
for (int index = -1; index < begin + 1; ++index) {
dummy = iter->isBoundary(index);
if (index < begin && dummy == TRUE) {
errln((UnicodeString)"Didn't handle preceeding correctly with offset = " + index +
" and begin index = " + begin);
}
}
delete iter;
}
/**
* Test Japanese Line Break
* @bug 4095322
*/
void RBBITest::TestJapaneseLineBreak()
{
// Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count
// as opening and closing punctuation for line breaking.
// Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars
// from these tests. 6-13-2002
//
UErrorCode status = U_ZERO_ERROR;
UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
UnicodeString precedingChars = CharsToUnicodeString(
//"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
"([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
UnicodeString followingChars = CharsToUnicodeString(
// ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
// ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
"\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);
int32_t i;
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
return;
}
for (i = 0; i < precedingChars.length(); i++) {
testString[1] = precedingChars[i];
iter->setText(testString);
int32_t j = iter->first();
if (j != 0)
errln("ja line break failure: failed to start at 0");
j = iter->next();
if (j != 1)
errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
+ "' (" + ((int)(precedingChars[i])) + ")");
j = iter->next();
if (j != 3)
errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
+ "' (" + ((int)(precedingChars[i])) + ")");
}
for (i = 0; i < followingChars.length(); i++) {
testString[1] = followingChars[i];
iter->setText(testString);
int j = iter->first();
if (j != 0)
errln("ja line break failure: failed to start at 0");
j = iter->next();
if (j != 2)
errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
+ "' (" + ((int)(followingChars[i])) + ")");
j = iter->next();
if (j != 3)
errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
+ "' (" + ((int)(followingChars[i])) + ")");
}
delete iter;
}
//--------------------------------------------------------------------------------------------
//
// Exhaustive Tests, using Unicode Data Files.
//
//--------------------------------------------------------------------------------------------
//
// Token level scanner for the Unicode Line Break Test Data file.
// Return the next token, as follows:

View File

@ -27,6 +27,9 @@ class BITestData;
class RBBITest: public IntlTest {
public:
RBBITest();
~RBBITest();
void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
/**
* Tests default rules based character iteration
@ -67,6 +70,22 @@ public:
**/
void TestLineBreakData();
void TestSentenceInvariants();
void TestCharacterInvariants();
void TestWordInvariants();
void TestLineInvariants();
void TestEmptyString();
void TestGetAvailableLocales();
void TestGetDisplayName();
void TestEndBehaviour();
void TestBug4153072();
void TestJapaneseLineBreak();
void TestThaiLineBreak();
void TestMixedThaiLineBreak();
void TestMaiyamok();
void TestThaiWordBreak();
/**
* Test Hindi Danda i.e make sure we have a break point before and after danda
**/
@ -136,6 +155,9 @@ private:
**/
void doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td);
void doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars);
void doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars);
};