ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests

into rbbitest. Many tests were replicated in both places. X-SVN-Rev: 9793
2002-08-27 19:10:11 +00:00 · 2002-08-27 19:10:11 +00:00 · 4a1d1083b0
commit 4a1d1083b0
parent f0bd5ecb0d
6 changed files with 927 additions and 1882 deletions
--- a/icu4c/source/test/intltest/intltest.dsp
+++ b/icu4c/source/test/intltest/intltest.dsp
@ -306,10 +306,6 @@ SOURCE=.\ittrans.cpp
 # End Source File
 # Begin Source File

-SOURCE=.\ittxtbd.cpp
-# End Source File
-# Begin Source File
-
 SOURCE=.\itutil.cpp
 # End Source File
 # Begin Source File
@ -683,10 +679,6 @@ SOURCE=.\ittrans.h
 # End Source File
 # Begin Source File

-SOURCE=.\ittxtbd.h
-# End Source File
-# Begin Source File
-
 SOURCE=.\itutil.h
 # End Source File
 # Begin Source File
--- a/icu4c/source/test/intltest/itmajor.cpp
+++ b/icu4c/source/test/intltest/itmajor.cpp
@ -1,5 +1,5 @@
 /********************************************************************
- * COPYRIGHT: 
+ * COPYRIGHT:
 * Copyright (c) 1998-2001, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
@ -19,7 +19,6 @@

 #include "itutil.h"
 #include "tscoll.h"
-#include "ittxtbd.h"
 #include "itformat.h"
 #include "itconv.h"
 #include "ittrans.h"
@ -42,15 +41,15 @@
 void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par )
 {
    switch (index) {
-        case 0: name = "utility"; 
-                if (exec) { 
+        case 0: name = "utility";
+                if (exec) {
                    logln("TestSuite Utilities---"); logln();
                    IntlTestUtilities test;
                    callTest( test, par );
                }
                break;

-        case 1: name = "normalize"; 
+        case 1: name = "normalize";
                if (exec) {
                    logln("TestSuite Normalize---"); logln();
                    IntlTestNormalize test;
@ -58,7 +57,7 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam
                }
                break;

-        case 2: name = "collate"; 
+        case 2: name = "collate";
                if (exec) {
                    logln("TestSuite Collator---"); logln();
                    IntlTestCollator test;
@ -66,15 +65,11 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam
                }
                break;

-        case 3: name = "textbounds"; 
-                if (exec) {
-                    logln("TestSuite TextBoundary---"); logln();
-                    IntlTestTextBoundary test;
-                    callTest( test, par );
-                }
+        case 3: name = "unused";
+                // Used to be text bounds.
                break;

-        case 4: name = "format"; 
+        case 4: name = "format";
                if (exec) {
                    logln("TestSuite Format---"); logln();
                    IntlTestFormat test;
@ -82,7 +77,7 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam
                }
                break;

-        case 5: name = "translit"; 
+        case 5: name = "translit";
                if (exec) {
                    logln("TestSuite Transliterator---"); logln();
                    IntlTestTransliterator test;
@ -90,7 +85,7 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam
                }
                break;

-        case 6: name = "rbbi"; 
+        case 6: name = "rbbi";
                if (exec) {
                    logln("TestSuite RuleBasedBreakIterator---"); logln();
                    IntlTestRBBI test;
@ -114,7 +109,7 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam

 /* Only the C API is exists */
 #ifdef ICU_UNICODECONVERTER_USE_DEPRECATES
-        case 9: name = "convert"; 
+        case 9: name = "convert";
                if (exec) {
                    logln("TestSuite Convert---"); logln();
                    IntlTestConvert test;
--- a/icu4c/source/test/intltest/ittxtbd.cpp
+++ b/icu4c/source/test/intltest/ittxtbd.cpp
--- a/icu4c/source/test/intltest/ittxtbd.h
+++ b/icu4c/source/test/intltest/ittxtbd.h
@ -1,182 +0,0 @@
-/********************************************************************
- * COPYRIGHT: 
- * Copyright (c) 1997-2001, International Business Machines Corporation and
- * others. All Rights Reserved.
- ********************************************************************/
-
-
-#ifndef _INTLTESTTEXTBOUNDARY
-#define _INTLTESTTEXTBOUNDARY
-
-
-#include "intltest.h"
-#include "unicode/brkiter.h"
-
-class Vector;
-class Enumeration;
-
-/**
- * Test the BreakIterator class and indirectly all related classes
- */
-class IntlTestTextBoundary: public IntlTest {
-public:
-    IntlTestTextBoundary();
-    virtual ~IntlTestTextBoundary();
-    
-    void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
-    /**
-     * Test sentence break using generalIteratorTest()
-     **/
-    void TestSentenceIteration(void);
-    /**
-     * Test word break using generalIteratorTest()
-     **/
-    void TestWordIteration(void);
-    /**
-     * Test line break using generalIteratorTest()
-     **/ 
-    void TestLineIteration(void);
-    /**
-     * Test character break using generalIteratorTest()
-     **/
-    void TestCharacterIteration(void);
-    /**
-     * Test sentence break using ()
-     **/
-    void TestSentenceInvariants(void);
-     /**
-     * Test sentence break Invariants using generalIteratorTest()
-     **/ 
-    void TestWordInvariants(void);
-     /**
-     * Test sentence break Invariants using generalIteratorTest()
-     **/
-    void TestLineInvariants(void);
-     /**
-     * Test sentence break Invariants using generalIteratorTest()
-     **/
-    void TestCharacterInvariants(void);
-     /**
-     * Test Japanese line break Invariants using generalIteratorTest()
-     **/
-    void TestJapaneseLineBreak(void);
-     /**
-     * Test Thai line break using generalIteratorTest()
-     **/
-    void TestThaiLineBreak(void);
-     /**
-     * Test Mixed Thai (thai with other languages like english)line break using generalIteratorTest()
-     **/
-    void TestMixedThaiLineBreak(void);
-    /**
-     * Test Thai Line break with Maiyamok using generalIteratorTest()
-     * The Thai maiyamok character is a shorthand symbol that means "repeat the previous
-     * word".  Instead of appearing as a word unto itself, however, it's kept together
-     * with the word before it
-     **/
-    void TestMaiyamok(void);
-     /**
-     * Test Thai word break using generalIteratorTest()
-     **/
-    void TestThaiWordBreak(void);
-    /**
-     * test behaviour of BreakIterator on an empty string
-     **/
-    void TestEmptyString(void);
-    /**
-     * Test BreakIterator::getAvailableLocales
-     **/
-    void TestGetAvailableLocales(void);
-    /**
-     * Test BreakIterator::getDisplayName
-     **/
-    void TestGetDisplayName(void);
-    /**
-     * test methods preceding, following and isBoundary
-     **/
-    void TestPreceding(void);
-
-    void TestBug4153072(void);
-    /**
-     * Test End Behaviour
-     * @bug 4068137
-     **/
-    void TestEndBehaviour(void);
-
-/***********************/
-private:
-    /**
-     * internal methods to prepare test data
-     **/
-    void addTestWordData(void);
-    void addTestSentenceData(void);
-    void addTestLineData(void);
-    void addTestCharacterData(void);
-    UnicodeString createTestData(Enumeration* e);
-
-    /**
-     * Perform tests of BreakIterator forward and backward functionality 
-     * on different kinds of iterators (word, sentence, line and character).
-     * It tests the methods first(), next(), current(), preceding(), following()
-     * previous() and isBoundary().
-     * It makes use of internal functions to achieve this.
-     **/
-    void generalIteratorTest(BreakIterator& bi, Vector* expectedResult);
-    /**
-     * Internal method to perform iteration and test the first() and next() functions
-     **/
-    Vector* testFirstAndNext(BreakIterator& bi, UnicodeString& text);
-    /**
-     * Internal method to perform iteration and test the last() and previous() functions
-     **/
-    Vector* testLastAndPrevious(BreakIterator& bi, UnicodeString& text);
-    /**
-     * Internal method to perform iteration and test the following() function
-     **/
-    void testFollowing(BreakIterator& bi, UnicodeString& text, int32_t *boundaries);
-    /**
-     * Internal method to perform iteration and test the preceding() function
-     **/
-    void testPreceding(BreakIterator& bi, UnicodeString& text, int32_t *boundaries);
-    /**
-     * Internal method to perform iteration and test the isBoundary() function
-     **/
-    void testIsBoundary(BreakIterator& bi, UnicodeString& text, int32_t *boundaries);
-    /** 
-     * Internal method which does the comparision of expected and got results.
-     **/
-    void compareFragmentLists(UnicodeString& f1Name, UnicodeString& f2Name, Vector* f1, Vector* f2);
-    /**
-     * Internal method to perform tests of BreakIterator multiple selection functionality 
-     * on different kinds of iterators (word, sentence, line and character)
-     **/
-    void doMultipleSelectionTest(BreakIterator& iterator, UnicodeString& testText);
-    /**
-     * Internal method to perform tests of BreakIterator break Invariants 
-     * on different kinds of iterators (word, sentence, line and character)
-     **/
-    void doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars);
-    /**
-     * Internal method to perform tests of BreakIterator other invariants 
-     * on different kinds of iterators (word, sentence, line and character)
-     **/
-    void doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars);
-    /**
-     * Perform tests with short sample code
-     **/ 
-    void sample(BreakIterator& tb, UnicodeString& text, UnicodeString& title);
-    /**
-     * The vectors holding test data for testing 
-     * different kinds of iterators( word, sentence, line and character)
-     **/
-    Vector* lineSelectionData;
-    Vector* sentenceSelectionData;
-    Vector* wordSelectionData;
-    Vector* characterSelectionData;
-
-    static const UChar cannedTestArray[];
-    static UnicodeString *cannedTestChars;
-};
-
-
-#endif
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -166,11 +166,24 @@ void BITestData::clearResults() {
 }


-//--------------------------------------------------------------------------------------
+//-----------------------------------------------------------------------------------
 //
-//    RBBITest 
+//    Canned Test Characters
 //
-//--------------------------------------------------------------------------------------
+//-----------------------------------------------------------------------------------
+
+// Table of sample UChar code units, ending with a 0x0000 entry.
+// The RBBITest constructor wraps this array in a UnicodeString (presumably relying on that
+// final 0x0000 as the string terminator -- confirm against the UnicodeString constructor used).
+static const UChar cannedTestArray[] = {
+    0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
+    0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
+    0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
+    0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3, 
+    0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
+    0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
+    0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
+    0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
+};
+
+// File-level string built from cannedTestArray; allocated in the RBBITest constructor and
+// deleted in the RBBITest destructor.  Null until an RBBITest has been constructed.
+static UnicodeString* cannedTestChars = 0;

 #define  halfNA     "\\u0928\\u094d\\u200d"
 #define  halfSA     "\\u0938\\u094d\\u200d"
@ -178,7 +191,23 @@ void BITestData::clearResults() {
 #define  halfKA     "\\u0915\\u094d\\u200d"
 #define  deadTA     "\\u0924\\u094d"

+//--------------------------------------------------------------------------------------
+//
+//    RBBITest    constructor and destructor
+//
+//--------------------------------------------------------------------------------------

+// Constructor: (re)builds the file-level cannedTestChars string as a leading U+0000
+// followed by the contents of cannedTestArray.
+//
+// cannedTestChars is a single static pointer shared by every RBBITest object, so a
+// string left behind by an earlier instance is released here rather than leaked.
+RBBITest::RBBITest() {
+    UnicodeString temp(cannedTestArray);
+    delete cannedTestChars;                 // no-op when the pointer is still null
+    cannedTestChars = new UnicodeString();
+    *cannedTestChars += (UChar)0x0000;
+    *cannedTestChars += temp;
+}
+
+
+// Destructor: frees the shared cannedTestChars string and resets the pointer to null,
+// so that destroying a second RBBITest cannot delete the same string twice.
+RBBITest::~RBBITest() {
+    delete cannedTestChars;
+    cannedTestChars = 0;
+}

 //--------------------------------------------------------------------
 //tests default rules based character iteration
@ -209,6 +238,32 @@ void RBBITest::TestDefaultRuleBasedCharacterIteration()
    ADD_DATACHUNK(chardata, "e\\u0301", 0, status);                   //acuteE
    ADD_DATACHUNK(chardata, "&", 0, status);
    ADD_DATACHUNK(chardata, "e\\u0303", 0, status);                   //tildaE
+
+    ADD_DATACHUNK(chardata, "S\\u0300", 0, status); //graveS
+    ADD_DATACHUNK(chardata, "i\\u0301", 0, status); // acuteBelowI
+    ADD_DATACHUNK(chardata, "m", 0, status);
+    ADD_DATACHUNK(chardata, "p", 0, status);
+    ADD_DATACHUNK(chardata, "l", 0, status);
+    ADD_DATACHUNK(chardata, "e\\u0301", 0, status);  // acuteE
+    ADD_DATACHUNK(chardata, " ", 0, status);
+    ADD_DATACHUNK(chardata, "s", 0, status);
+    ADD_DATACHUNK(chardata, "a\\u0302", 0, status);  // circumflexA
+    ADD_DATACHUNK(chardata, "m", 0, status);
+    ADD_DATACHUNK(chardata, "p", 0, status);
+    ADD_DATACHUNK(chardata, "l", 0, status);
+    ADD_DATACHUNK(chardata, "e\\u0303", 0, status);  // tildeE
+    ADD_DATACHUNK(chardata, ".", 0, status);
+    ADD_DATACHUNK(chardata, "w", 0, status);
+    ADD_DATACHUNK(chardata, "a\\u0302", 0, status);  // circumflexA
+    ADD_DATACHUNK(chardata, "w", 0, status);
+    ADD_DATACHUNK(chardata, "a", 0, status);
+    ADD_DATACHUNK(chardata, "f", 0, status);
+    ADD_DATACHUNK(chardata, "q", 0, status);
+    ADD_DATACHUNK(chardata, "\n", 0, status);
+    ADD_DATACHUNK(chardata, "\r", 0, status);
+    ADD_DATACHUNK(chardata, "\r\n", 0, status);
+    ADD_DATACHUNK(chardata, "\n", 0, status);
+
    //devanagiri characters for Hindi support
    ADD_DATACHUNK(chardata, "\\u0906", 0, status);                    //devanagiri AA
    //ADD_DATACHUNK(chardata, "\\u093e\\u0901", 0);              //devanagiri vowelsign AA+ chandrabindhu
@ -233,6 +288,10 @@ void RBBITest::TestDefaultRuleBasedCharacterIteration()
    ADD_DATACHUNK(chardata, "i\\u0301", 0, status);                   //acuteBelowI
    ADD_DATACHUNK(chardata, "!", 0, status);

+
+
+
+
    // What follows is a string of Korean characters (I found it in the Yellow Pages
    // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
    // it correctly), first as precomposed syllables, and then as conjoining jamo.
@ -354,6 +413,8 @@ void RBBITest::TestDefaultRuleBasedWordIteration()
    ADD_DATACHUNK(worddata, "$", 0, status);
    ADD_DATACHUNK(worddata, "30.10", T_NUMBER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
+    ADD_DATACHUNK(worddata, "12,34", T_NUMBER, status);
+    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "\\u00A2", 0, status); //cent sign
    ADD_DATACHUNK(worddata, "\\u00A3", 0, status); //pound sign
    ADD_DATACHUNK(worddata, "\\u00A4", 0, status); //currency sign
@ -365,14 +426,33 @@ void RBBITest::TestDefaultRuleBasedWordIteration()
    ADD_DATACHUNK(worddata, " ", 0, status);
    ADD_DATACHUNK(worddata, "BADGES", T_LETTER, status);
    ADD_DATACHUNK(worddata, "!", 0, status);
+    ADD_DATACHUNK(worddata, "?", 0, status);
+    ADD_DATACHUNK(worddata, "!", 0, status);
+    ADD_DATACHUNK(worddata, " ", 0, status);
+    ADD_DATACHUNK(worddata, "We", T_LETTER, status);
+    ADD_DATACHUNK(worddata, " ", 0, status);
+    ADD_DATACHUNK(worddata, "don't", T_LETTER, status);
+    ADD_DATACHUNK(worddata, " ", 0, status);
+    ADD_DATACHUNK(worddata, "need", T_LETTER, status);
+    ADD_DATACHUNK(worddata, " ", 0, status);
+    ADD_DATACHUNK(worddata, "no", T_LETTER, status);
+    ADD_DATACHUNK(worddata, " ", 0, status);
+    ADD_DATACHUNK(worddata, "STINKING", T_LETTER, status);
+    ADD_DATACHUNK(worddata, " ", 0, status);
+    ADD_DATACHUNK(worddata, "BADGES", T_LETTER, status);
+    ADD_DATACHUNK(worddata, "!", 0, status);
+    ADD_DATACHUNK(worddata, "!", 0, status);
+
    ADD_DATACHUNK(worddata, "1000,233,456.000", T_NUMBER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
+
    ADD_DATACHUNK(worddata, "1,23.322", T_NUMBER, status);
    ADD_DATACHUNK(worddata, "%", 0, status);
    ADD_DATACHUNK(worddata, "123.1222", T_NUMBER, status);
    ADD_DATACHUNK(worddata, "$", 0, status);
    ADD_DATACHUNK(worddata, "123,000.20", T_NUMBER, status);
    ADD_DATACHUNK(worddata, " ", 0, status);
+
    ADD_DATACHUNK(worddata, "179.01", T_NUMBER, status);
    ADD_DATACHUNK(worddata, "%", 0, status);
    ADD_DATACHUNK(worddata, "X", T_LETTER, status);
@ -428,12 +508,54 @@ void RBBITest::TestDefaultRuleBasedWordIteration()
    ADD_DATACHUNK(worddata, "\\u3094\\u0301", T_H_OR_K, status);   // Hiragana
    ADD_DATACHUNK(worddata, "\\u309d", T_H_OR_K, status);   // Hiragana
    ADD_DATACHUNK(worddata, "\\u30a1\\u30fd\\uff66\\uff9d", T_H_OR_K, status);  // Katakana
-    // ADD_DATACHUNK(worddata, "def", T_LETTER, status);   // TODO  why does this fail???
-    ADD_DATACHUNK(worddata, ".", 0, status);
+    ADD_DATACHUNK(worddata, "def", T_LETTER, status);
+    ADD_DATACHUNK(worddata, "#", 0, status);

    // Words with interior formatting characters
    ADD_DATACHUNK(worddata, "def\\u0301\\u070Fabc", T_LETTER, status);
+    ADD_DATACHUNK(worddata, " ", 0, status);
    
+    // to test for bug #4097779
+    ADD_DATACHUNK(worddata, "aa\\u0300a", T_LETTER, status);
+    ADD_DATACHUNK(worddata, " ", 0, status);
+
+    // to test for bug #4098467
+    // What follows is a string of Korean characters (I found it in the Yellow Pages
+    // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
+    // it correctly), first as precomposed syllables, and then as conjoining jamo.
+    // Both sequences should be semantically identical and break the same way.
+    // precomposed syllables...
+    ADD_DATACHUNK(worddata, "\\uc0c1\\ud56d", T_LETTER, status);
+    ADD_DATACHUNK(worddata, " ", 0, status);
+    ADD_DATACHUNK(worddata, "\\ud55c\\uc778", T_LETTER, status);
+    ADD_DATACHUNK(worddata, " ", 0, status);
+    ADD_DATACHUNK(worddata, "\\uc5f0\\ud569", T_LETTER, status);
+    ADD_DATACHUNK(worddata, " ", 0, status);
+    ADD_DATACHUNK(worddata, "\\uc7a5\\ub85c\\uad50\\ud68c", T_LETTER, status);
+    ADD_DATACHUNK(worddata, " ", 0, status);
+    // conjoining jamo...
+    ADD_DATACHUNK(worddata, "\\u1109\\u1161\\u11bc\\u1112\\u1161\\u11bc", T_LETTER, status);
+    ADD_DATACHUNK(worddata, " ", 0, status);
+    ADD_DATACHUNK(worddata, "\\u1112\\u1161\\u11ab\\u110b\\u1175\\u11ab", T_LETTER, status);
+    ADD_DATACHUNK(worddata, " ", 0, status);
+    ADD_DATACHUNK(worddata, "\\u110b\\u1167\\u11ab\\u1112\\u1161\\u11b8", T_LETTER, status);
+    ADD_DATACHUNK(worddata, " ", 0, status);
+    ADD_DATACHUNK(worddata, "\\u110c\\u1161\\u11bc\\u1105\\u1169\\u1100\\u116d\\u1112\\u116c", T_LETTER, status);
+    ADD_DATACHUNK(worddata, " ", 0, status);
+
+    // this is a test for bug #4117554: the ideographic iteration mark (U+3005) should
+    // count as a Kanji character for the purposes of word breaking
+    ADD_DATACHUNK(worddata, "abc", T_LETTER, status);
+    // Unicode TR29:  Ideographs do NOT group together into words.
+    //wordSelectionData->addElement(CharsToUnicodeString("\\u4e01\\u4e02\\u3005\\u4e03\\u4e03"));
+    ADD_DATACHUNK(worddata, "\\u4e01", T_IDEO, status);
+    ADD_DATACHUNK(worddata, "\\u4e02", T_IDEO, status);
+    ADD_DATACHUNK(worddata, "\\u3005", T_LETTER, status);   // TODO:  3005 is ideographic iteration mark
+                                                            //        Treating as letter is according to TR.
+                                                            //        Check whether this is really intended.
+    ADD_DATACHUNK(worddata, "\\u4e03", T_IDEO, status);
+    ADD_DATACHUNK(worddata, "\\u4e03", T_IDEO, status);
+    ADD_DATACHUNK(worddata, "abc",     T_LETTER, status);

    if (U_FAILURE(status)){
        errln("FAIL : in BITestData construction");
@ -531,6 +653,40 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
      // opening punctuation
      ADD_DATACHUNK(sentdata, "How do you do?", 0, status);
      ADD_DATACHUNK(sentdata, "(fine).", 0, status);
+
+      // test for bug #4158381: Don't break sentence after period if it isn't
+      // followed by a space
+      ADD_DATACHUNK(sentdata, "Test <code>Flags.Flag</code> class.  ", 0, status);
+      ADD_DATACHUNK(sentdata, "Another test.\\u2029", 0, status);
+      
+      // test for bug #4158381: No breaks when there are no terminators around
+      ADD_DATACHUNK(sentdata, "<P>Provides a set of &quot;lightweight&quot; (all-java<FONT SIZE=\"-2\">"
+                              "<SUP>TM</SUP></FONT> language) components that, to the maximum degree possible,"
+                              "work the same on all platforms.  ", 0, status);
+      ADD_DATACHUNK(sentdata, "Another test.\\u2029", 0, status);
+      
+      // test for bug #4143071: Make sure sentences that end with digits
+      // work right
+      ADD_DATACHUNK(sentdata, "Today is the 27th of May, 1998.  ", 0, status);
+      ADD_DATACHUNK(sentdata, "Tomorrow with be 28 May 1998.  ", 0, status);
+      ADD_DATACHUNK(sentdata, "The day after will be the 30th.\\u2029", 0, status);
+      
+      // test for bug #4152416: Make sure sentences ending with a capital
+      // letter are treated correctly
+      // Unicode TR29 reverses above bug:  Don't break a sentence if the last word begins with an upper case letter.
+      ADD_DATACHUNK(sentdata, "The type of all primitive <code>boolean</code> values accessed in the "            
+          "target VM.  Calls to xxx will return an implementor of this interface.  \\u2029", 0, status);
+      
+      // test for bug #4152117: Make sure sentence breaking is handling
+      // punctuation correctly [COULD NOT REPRODUCE THIS BUG, BUT TEST IS
+      // HERE TO MAKE SURE IT DOESN'T CROP UP]
+      ADD_DATACHUNK(sentdata, "Constructs a randomly generated BigInteger, uniformly distributed "
+                              "over the range <tt>0</tt> to <tt>(2<sup>numBits</sup> - 1)</tt>, inclusive.  ", 0, status);
+      ADD_DATACHUNK(sentdata, "The uniformity of the distribution assumes that a fair source of random bits "
+                              "is provided in <tt>rnd</tt>.  ", 0, status);
+      ADD_DATACHUNK(sentdata, "Note that this constructor always constructs a non-negative biginteger.  \n", 0, status);
+      ADD_DATACHUNK(sentdata, "Ahh abc.  \n", 0, status);
+
      //sentence breaks for hindi which used Devanagari script
      //make sure there is sentence break after ?,danda(hindi phrase separator),fullstop followed by space and no break after \n \r
      ADD_DATACHUNK(sentdata,  "\\u0928\\u092e" halfSA
@ -1198,6 +1354,222 @@ void RBBITest::TestAbbrRuleBasedWordIteration()
      delete rb;
 } */

+
+
+// Checks Thai line breaking: builds the expected sequence of line-break chunks in a
+// BITestData, creates a line BreakIterator for the "th" locale, and runs
+// generalIteratorTest() over it.  Reports an error and returns early if the iterator
+// cannot be created.
+void RBBITest::TestThaiLineBreak() {
+    UErrorCode status = U_ZERO_ERROR;
+    BITestData thaiLineSelection(status);
+
+    // \u0e2f-- the Thai paiyannoi character-- isn't a letter.  It's a symbol that
+    // represents elided letters at the end of a long word.  It should be bound to
+    // the end of the word and not treated as an independent punctuation mark.
+
+
+    ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
+//        ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
+//        ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
+    // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
+
+    // the one time where the paiyannoi occurs somewhere other than at the end
+    // of a word is in the Thai abbreviation for "etc.", which both begins and
+    // ends with a paiyannoi
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
+
+    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
+        Locale("th"), status);
+    if (U_FAILURE(status))
+    {
+        errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n");
+        return;
+    }
+
+    generalIteratorTest(*e, thaiLineSelection);
+    // The iterator returned by createLineInstance() belongs to this function; release it,
+    // as TestMaiyamok() and TestThaiWordBreak() do.
+    delete e;
+}
+
+
+
+// Checks line breaking of Thai text mixed with quotes, punctuation, Latin digits and
+// currency symbols: builds the expected chunks in a BITestData, creates a line
+// BreakIterator for the "th" locale, and runs generalIteratorTest() over it.
+// Most of the original expectations are kept below in a commented-out block.
+void RBBITest::TestMixedThaiLineBreak()
+{
+    UErrorCode   status = U_ZERO_ERROR;
+    BITestData   thaiLineSelection(status);
+
+    ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
+
+    // Arabic numerals should always be separated from surrounding Thai text
+/*
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status);
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status);
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status);
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status);
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status);
+        thaiLineSelection->addElement("39");
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status);
+
+        // words in non-Thai scripts should always be separated from surrounding Thai text
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e14", 0, status);
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e2d\\u0e1a", 0, status);
+        thaiLineSelection->addElement("Java");
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e19", 0, status);
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e04\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07", 0, status);
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21 ", 0, status);
+
+        // Thai numerals should always be separated from the text surrounding them
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status);
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status);
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status);
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status);
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status);
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e53\\u0e59", 0, status);
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status);
+
+        // Thai text should interact correctly with punctuation and symbols
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21", 0, status);
+//        ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28", 0, status);
+//        ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e17\\u0e22)", 0, status);
+ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28\\u0e44\\u0e17\\u0e22)", 0, status);
+// I believe the commented-out reading above to be the correct one, but this is what passes with our current dictionary
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e33\\u0e01\\u0e31\\u0e14", 0, status);
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e1b\\u0e34\\u0e14", 0, status);
+        ADD_DATACHUNK(thaiLineSelection, "\\u0e15\\u0e31\\u0e27\"", 0, status);
+*/
+
+    // The Unicode Linebreak TR says do not break before or after quotes.
+    //    So this test is changed to not break around the quote.
+    //    TODO:  should Thai break around the quotes, like the original behavior here?
+//    ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\"", 0, status);
+//    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e38\\u0e48\\u0e19", 0, status);
+      ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\""
+                                                         "\\u0e23\\u0e38\\u0e48\\u0e19", 0, status);
+
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e14\\u0e37\\u0e2d\\u0e19\\u0e21\\u0e34.", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e22.", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e35\\u0e49", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e32\\u0e04\\u0e32", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "$200", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e48\\u0e32", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19 ", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "(\"\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\").", 0, status);
+
+    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
+    if (U_FAILURE(status))
+    {
+        errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n");
+        return;
+    }
+
+
+    generalIteratorTest(*e, thaiLineSelection);
+    // The iterator returned by createLineInstance() belongs to this function; release it,
+    // as TestMaiyamok() and TestThaiWordBreak() do.
+    delete e;
+}
+
+
+// Checks Thai line breaking around the maiyamok character (U+0E46): builds the expected
+// line-break chunks in a BITestData, creates a line BreakIterator for the "th" locale,
+// and runs generalIteratorTest() over it.  Reports an error and returns early if the
+// iterator cannot be created.
+void RBBITest::TestMaiyamok() 
+{
+    UErrorCode status = U_ZERO_ERROR;
+    BITestData   thaiLineSelection(status);
+    ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
+    // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
+    // word".  Instead of appearing as a word unto itself, however, it's kept together
+    // with the word before it
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07\\u0e40\\u0e17\\u0e1e", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35\\u0e22\\u0e07", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
+
+    // status carries any error from the data setup above as well as from iterator creation.
+    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
+        Locale("th"), status); 
+
+    if (U_FAILURE(status))
+    {
+        errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n");
+        return;
+    }
+    generalIteratorTest(*e, thaiLineSelection);
+    // This function owns the iterator it created.
+    delete e;
+}
+
+// Checks Thai word breaking: builds the expected word chunks in a BITestData, creates a
+// word BreakIterator for the "th" locale, and runs generalIteratorTest() over it.
+// Where the dictionary-based result differs from the linguistically correct one, the
+// correct expectation is kept commented out next to the one actually tested.
+// NOTE(review): the trailing numeric comments look like the running end offset of each
+// chunk within the concatenated text -- confirm.
+void RBBITest::TestThaiWordBreak() {
+    UErrorCode status = U_ZERO_ERROR;
+    BITestData   thaiWordSelection(status);
+
+    ADD_DATACHUNK(thaiWordSelection, NULL, 0, status);           // Break at start of data
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E1A\\u0E17", 0, status); //2
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E35\\u0E48", 0, status); //5
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E51", 0, status); //6
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E1E\\u0E32\\u0E22\\u0E38", 0, status); //10
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E0B\\u0E42\\u0E04\\u0E25\\u0E19", 0, status); //16
+    ADD_DATACHUNK(thaiWordSelection, "\\u000D\\u000A", 0, status); //18
+
+    // This is the correct result
+    //ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E14\\u0E42\\u0E23\\u0E18\\u0E35", 0, status); //24
+    //ADD_DATACHUNK(thaiWordSelection, "\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22", 0, status); //29
+
+    // and this is what the dictionary does...
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E14", 0, status); // 20
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E23\\u0E18\\u0E35\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22", 0, status); //29
+
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E2D\\u0E22\\u0E39\\u0E48", 0, status); //33
+
+    // This is the correct result
+    //ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E48\\u0E32\\u0E21", 0, status); //37
+    //ADD_DATACHUNK(thaiWordSelection, "\\u0E01\\u0E25\\u0E32\\u0E07", 0, status); //41
+
+    // and this is what the dictionary does
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E48\\u0E32\\u0E21\\u0E01\\u0E25\\u0E32\\u0E07", 0, status); //41
+
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E38\\u0E48\\u0E07", 0, status); //45
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E43\\u0E2B\\u0E0D\\u0E48", 0, status); //49
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E43\\u0E19", 0, status); //51
+
+    // This is the correct result
+    //ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E04\\u0E19\\u0E0B\\u0E31\\u0E2A", 0, status); //57
+    //ADD_DATACHUNK(thaiWordSelection, "\\u0E01\\u0E31\\u0E1A", 0, status); //60
+
+    // and this is what the dictionary does
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E04\\u0E19", 0, status); // 54
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E0B\\u0E31\\u0E2A\\u0E01\\u0E31\\u0E1A", 0, status); //60
+
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E25\\u0E38\\u0E07", 0, status); //63
+
+    // This is the correct result
+    //ADD_DATACHUNK(thaiWordSelection, "\\u0E40\\u0E2E\\u0E19\\u0E23\\u0E35", 0, status); //68
+    //ADD_DATACHUNK(thaiWordSelection, "\\u0E0A\\u0E32\\u0E27", 0, status); //71
+    //ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E23\\u0E48", 0, status); //74
+    //ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E25\\u0E30", 0, status); //77
+
+    // and this is what the dictionary does
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E40\\u0E2E", 0, status); // 65
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E19\\u0E23\\u0E35\\u0E0A\\u0E32\\u0E27\\u0E44\\u0E23\\u0E48\\u0E41\\u0E25\\u0E30", 0, status); //77
+
+    // status carries any error from the data setup above as well as from iterator creation.
+    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
+        Locale("th"), status); 
+    if (U_FAILURE(status))
+    {
+        errln("Failed to create the BreakIterator for Thai locale in TestThaiWordBreak.\n");
+        return;
+    }
+
+    generalIteratorTest(*e, thaiWordSelection);
+    // This function owns the iterator it created.
+    delete e;
+}
+
+
 //---------------------------------------------
 // runIndexedTest
 //---------------------------------------------
@ -1223,11 +1595,42 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
            if(exec) TestTitleBreak();                         break;
        case 7: name = "TestStatusReturn";
            if(exec) TestStatusReturn();                       break;
-        case 8: name = "TestLineBreakData";
-            if(exec) TestLineBreakData();                       break;

-//      case 6: name = "TestDanda()";
-//           if(exec) TestDanda();                             break;
+        case 8: name = "TestLineBreakData";
+            if(exec) TestLineBreakData();                      break;
+        case 9: name = "TestSentenceInvariants";
+            if(exec) TestSentenceInvariants();                 break;
+        case 10: name = "TestCharacterInvariants";
+            if(exec) TestCharacterInvariants();                break;
+        case 11: name = "TestWordInvariants";
+            if(exec) TestWordInvariants();                     break;
+
+        case 12: name = "TestEmptyString";
+            if(exec) TestEmptyString();                        break;
+
+        case 13: name = "TestGetAvailableLocales";
+            if(exec) TestGetAvailableLocales();                break;
+
+        case 14: name = "TestGetDisplayName";
+            if(exec) TestGetDisplayName();                     break;
+
+        case 15: name = "TestEndBehaviour";
+            if(exec) TestEndBehaviour();                       break;
+        case 16: name = "TestBug4153072";
+            if(exec) TestBug4153072();                         break;
+        case 17: name = "TestJapaneseLineBreak()";
+             if(exec) TestJapaneseLineBreak();                 break;
+
+
+        case 18: name = "TestThaiLineBreak()";
+             if(exec) TestThaiLineBreak();                     break;
+        case 19: name = "TestMixedThaiLineBreak()";
+             if(exec) TestMixedThaiLineBreak();                break;
+        case 20: name = "TestMaiyamok()";
+             if(exec) TestMaiyamok();                          break;
+        case 21: name = "TestThaiWordBreak()";
+             if(exec) TestThaiWordBreak();                     break;
+
 //      case 7: name = "TestHindiCharacterWrapping()";
 //           if(exec) TestHindiCharacterWrapping();            break;
 //      case 8: name = "TestCustomRuleBasedWordIteration";
@ -1486,6 +1889,488 @@ void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestD
 }


+
+//--------------------------------------------------------------------------------------------
+//
+//    Break Iterator Invariants Tests
+//
+//--------------------------------------------------------------------------------------------
+
+/**
+ * Runs the shared break-iterator invariant checks against a character
+ * iterator created for the default locale.  The canned test characters are
+ * extended with a handful of Hangul Jamo code points.
+ */
+void RBBITest::TestCharacterInvariants()
+{
+    UErrorCode ec = U_ZERO_ERROR;
+    BreakIterator *charIter = BreakIterator::createCharacterInstance(Locale::getDefault(), ec);
+    if (U_FAILURE(ec))
+    {
+        errln("Failed to create the BreakIterator for default locale in TestCharacterInvariants.\n");
+        return;
+    }
+
+    // Both helpers only read the test characters, so one string serves both.
+    UnicodeString jamo = CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");
+    UnicodeString testChars = *cannedTestChars + jamo;
+
+    doBreakInvariantTest(*charIter, testChars);
+    doOtherInvariantTest(*charIter, testChars);
+
+    delete charIter;
+}
+
+
+/**
+ * Runs the shared break-iterator invariant checks against a word iterator
+ * created for the default locale.  The canned test characters are extended
+ * with ASCII punctuation plus Hiragana, Katakana and CJK ideograph samples.
+ */
+void RBBITest::TestWordInvariants()
+{
+    UErrorCode ec = U_ZERO_ERROR;
+    BreakIterator *wordIter = BreakIterator::createWordInstance(Locale::getDefault(), ec);
+    if (U_FAILURE(ec))
+    {
+        errln("Failed to create the BreakIterator for default locale in TestWordInvariants.\n");
+        return;
+    }
+
+    // Both helpers only read the test characters, so one string serves both.
+    UnicodeString extras = CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");
+    UnicodeString testChars = *cannedTestChars + extras;
+
+    doBreakInvariantTest(*wordIter, testChars);
+    doOtherInvariantTest(*wordIter, testChars);
+
+    delete wordIter;
+}
+
+
+/**
+ * Runs doOtherInvariantTest() against a sentence iterator created for the
+ * default locale.  Unlike the character and word variants, the
+ * "break must occur" helper (doBreakInvariantTest) is not run here.
+ */
+void RBBITest::TestSentenceInvariants()
+{
+    UErrorCode ec = U_ZERO_ERROR;
+    BreakIterator *sentIter = BreakIterator::createSentenceInstance(Locale::getDefault(), ec);
+    if (U_FAILURE(ec))
+    {
+        errln("Failed to create the BreakIterator for default locale in TestSentenceInvariant.\n");
+        return;
+    }
+
+    UnicodeString extras = CharsToUnicodeString(".,\\u3001\\u3002\\u3041\\u3042\\u3043\\ufeff");
+    UnicodeString testChars = *cannedTestChars + extras;
+    doOtherInvariantTest(*sentIter, testChars);
+
+    delete sentIter;
+}
+
+
+/**
+ * Checks invariants of a line-break iterator created for the US locale.
+ *
+ * Steps:
+ *   1. Runs doBreakInvariantTest() and doOtherInvariantTest() over the canned
+ *      test characters plus some punctuation, kana and CJK samples.
+ *   2. Verifies that no break is reported on either side of a non-breaking
+ *      character placed in the middle of a three-character string, except
+ *      that a break right after a leading space is tolerated.
+ *   3. Walks the "break after dashes" combinations; the failure report for
+ *      this part is currently disabled (see the TODO below).
+ *
+ * Reporting stops once 75 errors have been logged.
+ */
+void RBBITest::TestLineInvariants()
+{
+    UErrorCode status = U_ZERO_ERROR;
+    BreakIterator *e = BreakIterator::createLineInstance(Locale::getUS(), status);
+    if (U_FAILURE(status))
+    {
+        errln("Failed to create the BreakIterator for US locale in TestLineInvariants.\n");
+        return;
+    }
+    UnicodeString s = CharsToUnicodeString(".,;:\\u3001\\u3002\\u3041\\u3042\\u3043\\u3044\\u3045\\u30a3\\u4e00\\u4e01\\u4e02");
+    UnicodeString testChars = *cannedTestChars + s;
+    doBreakInvariantTest(*e, testChars);
+    doOtherInvariantTest(*e, testChars);
+
+    int32_t errCount = 0, testCharsLen, noBreakLen, dashesLen;
+    int32_t i, j, k;
+
+    // in addition to the other invariants, a line-break iterator should make sure that:
+    // it doesn't break around the non-breaking characters,
+    // EXCEPT breaking after a space takes precedence over not breaking before
+    //        a non-breaking char.  So says TR 14.
+    UnicodeString noBreak = CharsToUnicodeString("\\u00a0\\u2007\\u2011\\ufeff");
+    UnicodeString work("aaa");
+    testCharsLen = testChars.length();
+    noBreakLen = noBreak.length();
+    for (i = 0; i < testCharsLen; i++) {
+        UChar c = testChars[i];
+        // Line/paragraph terminators and controls force a break after
+        // themselves, so they cannot be used as the leading character here.
+        if (c == '\r' || c == '\n' || c == 0x2029 || c == 0x2028 || c == 0x0003 ||
+            u_charType(c) == U_CONTROL_CHAR) {
+            continue;
+        }
+        work[0] = c;
+        for (j = 0; j < noBreakLen; j++) {
+            work[1] = noBreak[j];
+            for (k = 0; k < testCharsLen; k++) {
+                work[2] = testChars[k];
+                e->setText(work);
+                for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) {
+                    // Only the two interior positions (either side of the
+                    // non-breaking character) are of interest; filtering
+                    // first also avoids indexing work[-1] when l == 0.
+                    if (l != 1 && l != 2) {
+                        continue;
+                    }
+                    UChar c1 = work[l - 1];
+                    UChar c2 = work[l];
+                    // A break after a leading space is allowed.
+                    if (c1 == 0x20 && l == 1) {
+                        continue;
+                    }
+                    errln("Got break between U+" + UCharToUnicodeString(c1) + 
+                        " and U+" + UCharToUnicodeString(c2));
+                    errCount++;
+                    if (errCount >= 75) {
+                        // Release the iterator before bailing out early.
+                        delete e;
+                        return;
+                    }
+                }
+            }
+        }
+    }
+
+    // it does break after hyphens (Rule 15B from TR 14)
+    //  unless they're followed by a digit, a non-spacing mark,
+    // a currency symbol, a non-breaking space, or a line or paragraph separator
+    //  or something of class BA, HY, NS, QU, GL, CL, EX, IS or SY from TR14 when the hyphen is \u002d
+
+    // This test is sufficiently screwed up that I'm largely disabling it.  TODO:  fix it.  06/12/2002  AGH
+    //
+    UnicodeString dashes = CharsToUnicodeString("-\\u00ad\\u2010\\u2012\\u2013\\u2014");
+    dashesLen = dashes.length();
+    for (i = 0; i < testCharsLen; i++) {
+        work[0] = testChars[i];
+        for (j = 0; j < dashesLen; j++) {
+            UChar c1 = work[1] = dashes[j];
+            for (k = 0; k < testCharsLen; k++) {
+                UChar c2 = work[2] = testChars[k];
+                int8_t type = u_charType(c2);
+                // Skip following characters after which no break is expected.
+                if (type == U_DECIMAL_DIGIT_NUMBER ||
+                    type == U_OTHER_NUMBER ||
+                    type == U_NON_SPACING_MARK ||
+                    type == U_ENCLOSING_MARK ||
+                    type == U_CURRENCY_SYMBOL ||
+                    type == U_SPACE_SEPARATOR ||
+                    type == U_DASH_PUNCTUATION ||
+                    type == U_CONTROL_CHAR ||
+                    type == U_FORMAT_CHAR ||
+                    c2 == '\n'   || c2 == '\r'   || c2 == 0x2028 || c2 == 0x2029 ||
+                    c2 == 0x0003 || c2 == 0x00a0 || c2 == 0x2007 || c2 == 0x2011 ||
+                    c2 == 0xfeff)
+                {
+                    continue;
+                }
+                // If c1 == hyphen-minus, and ...
+                if (c1 == 0x002d  &&  (
+                       c2 == 0x0021  ||   // !
+                       c2 == 0x002c  ||   // ,
+                       c2 == 0x002d  ||   // -
+                       c2 == 0x002e  ||   // .   (TR 14 class IS)
+                       c2 == 0x0029  ||   // )
+                       c2 == 0x003a  ||   // :
+                       c2 == 0x003b  ||   // ;   (TR 14 class IS)
+                       c2 == 0x005d  ||   // ]
+                       c2 == 0x007c  ||   // |   (TR 14 class BA, rule 15)
+                       c2 == 0x007d  ||   // }
+                       c2 == 0x0903  ||   // Devanagari sign visarga, combining, what's it doing in this test?
+                       c2 == 0x093E  ||   // Devanagari , combining, what's it doing in this test?
+                       c2 == 0x093F  ||   // Devanagari , combining, what's it doing in this test?
+                       c2 == 0x0940  ||   // Devanagari , combining, what's it doing in this test?
+                       c2 == 0x0949  ||   // Devanagari , combining, what's it doing in this test?
+                       c2 == 0x0f3b  ||   // Tibetan closing bracket
+                       c2 == 0x3001  ||   // CJK closing bracket
+                       c2 == 0x3002       // CJK closing bracket
+                      )) {
+                    continue;
+                }
+
+                e->setText(work);
+                UBool saw2 = FALSE;
+                for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) {
+                    if (l == 2) {
+                        saw2 = TRUE;
+                        break;
+                    }
+                }
+                if (!saw2) {
+                    // TODO:  This test is completely out of sync with the spec.  Fix it.
+                    // errln("Didn't get break between U+" + UCharToUnicodeString(work[1]) + 
+                    //    " and U+" + UCharToUnicodeString(work[2]));
+                    // errCount++;
+                    // if (errCount >= 75) {
+                    //    delete e;
+                    //    return;
+                    // }
+                }
+            }
+        }
+    }
+    delete e;
+}
+
+
+
+/**
+ * Verifies that the given iterator always reports a break after CR, LF,
+ * paragraph separator and line separator.
+ *
+ * Every three-character string "X B Y" is built, with B taken from the
+ * breaking characters and X, Y taken from testChars, and the iterator must
+ * report a boundary at offset 2.  Reporting stops after 75 errors.
+ *
+ * @param tb         iterator under test; its text is replaced repeatedly.
+ * @param testChars  characters used for the first and third positions.
+ */
+void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)
+{
+    UnicodeString work("aaa");
+    int32_t errCount = 0, testCharsLen = testChars.length(), breaksLen;
+
+    // a break should always occur after CR (unless followed by LF), LF, PS, and LS
+    UnicodeString breaks = CharsToUnicodeString("\r\n\\u2029\\u2028");
+    int32_t i, j;
+
+    breaksLen = breaks.length();
+    for (i = 0; i < breaksLen; i++) {
+        UChar c1 = work[1] = breaks[i];
+        // Depends only on c1, so compute it once per breaking character.
+        const UBool c1IsControl = (u_charType(c1) == U_CONTROL_CHAR);
+        for (j = 0; j < testCharsLen; j++) {
+            work[0] = testChars[j];
+            for (int k = 0; k < testCharsLen; k++) {
+                UChar c2 = work[2] = testChars[k];
+
+                // if a cr is followed by lf, ps, ls or etx, don't do the check (that's
+                // not supposed to work)
+                if (c1 == '\r' && (c2 == '\n' || c2 == 0x2029
+                        || c2 == 0x2028 || c2 == 0x0003))
+                    continue;
+
+                if (c1IsControl) {
+                    const int8_t type2 = u_charType(c2);
+                    if (type2 == U_NON_SPACING_MARK ||
+                        type2 == U_ENCLOSING_MARK ||
+                        type2 == U_COMBINING_SPACING_MARK) {
+                        // Combining marks don't combine with controls.
+                        //  TODO:  enhance test to verify that the break actually occurs,
+                        //         not just ignore the case.
+                        continue;
+                    }
+                }
+
+                tb.setText(work);
+                UBool seen2 = FALSE;
+                for (int l = tb.first(); l != BreakIterator::DONE; l = tb.next()) {
+                    if (l == 2) {
+                        seen2 = TRUE;
+                        break;
+                    }
+                }
+                if (!seen2) {
+                    errln("No break between U+" + UCharToUnicodeString(c1)
+                                + " and U+" + UCharToUnicodeString(c2));
+                    errCount++;
+                    if (errCount >= 75)
+                        return;
+                }
+            }
+        }
+    }
+}
+
+
+
+/**
+ * Verifies two "must not break" invariants for the given iterator:
+ *   - no boundary between CR and LF, whatever surrounds the pair;
+ *   - no boundary before a non-spacing or enclosing mark, unless the
+ *     preceding character is a line/paragraph terminator, a control or a
+ *     format character (those predecessors are skipped).
+ * Reporting stops after 75 errors, counted across both parts.
+ *
+ * @param tb         iterator under test; its text is replaced repeatedly.
+ * @param testChars  characters used to fill the variable positions.
+ */
+void RBBITest::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars)
+{
+    UnicodeString sample("a\r\na");
+    int32_t failures = 0;
+    const int32_t charCount = testChars.length();
+    int32_t outer, inner;
+
+    // Part 1: CR LF in the middle, every pair of test chars around it.
+    for (outer = 0; outer < charCount; outer++) {
+        sample[0] = testChars[outer];
+        for (inner = 0; inner < charCount; inner++) {
+            sample[3] = testChars[inner];
+            tb.setText(sample);
+            for (int32_t pos = tb.first(); pos != BreakIterator::DONE; pos = tb.next()) {
+                if (pos != 2) {
+                    continue;
+                }
+                errln("Break between CR and LF in string U+" + UCharToUnicodeString(sample[0]) + 
+                    ", U+d U+a U+" + UCharToUnicodeString(sample[3]));
+                failures++;
+                if (failures >= 75) {
+                    return;
+                }
+            }
+        }
+    }
+
+    // Part 2: a base character followed by a mark, in positions 1 and 2 of
+    // a four-character string.
+    sample.remove();
+    sample += "aaaa";
+    for (outer = 0; outer < charCount; outer++) {
+        UChar base = testChars[outer];
+        const int8_t baseType = u_charType(base);
+        const UBool forcesBreak =
+            base == '\n' || base == '\r' || base == 0x2029 || base == 0x2028 || base == 0x0003 ||
+            baseType == U_CONTROL_CHAR || baseType == U_FORMAT_CHAR;
+        if (forcesBreak) {
+            continue;
+        }
+        sample[1] = base;
+        for (inner = 0; inner < charCount; inner++) {
+            UChar mark = testChars[inner];
+            const int8_t markType = u_charType(mark);
+            const UBool isMark = (markType == U_NON_SPACING_MARK) || (markType == U_ENCLOSING_MARK);
+            if (!isMark) {
+                continue;
+            }
+            sample[2] = mark;
+            tb.setText(sample);
+            for (int pos = tb.first(); pos != BreakIterator::DONE; pos = tb.next()) {
+                if (pos != 2) {
+                    continue;
+                }
+                errln("Break between U+" + UCharToUnicodeString(sample[1])
+                        + " and U+" + UCharToUnicodeString(sample[2]));
+                failures++;
+                if (failures >= 75) {
+                    return;
+                }
+            }
+        }
+    }
+}
+
+
+
+
+//---------------------------------------------
+//
+//     other tests
+//
+//---------------------------------------------
+/**
+ * Runs the general iterator test with a line-break iterator (default
+ * locale) over test data that consists of a single empty chunk.
+ */
+void RBBITest::TestEmptyString()
+{
+    UErrorCode status = U_ZERO_ERROR;
+
+    BITestData x(status);
+    ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
+    RuleBasedBreakIterator* bi = static_cast<RuleBasedBreakIterator *>(
+        BreakIterator::createLineInstance(Locale::getDefault(), status));
+    if (U_FAILURE(status))
+    {
+        errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n");
+        return;
+    }
+    generalIteratorTest(*bi, x);
+    delete bi;
+}
+
+/**
+ * Verifies that BreakIterator::getAvailableLocales() reports at least one
+ * locale, and logs the name of every entry in the returned list.
+ */
+void RBBITest::TestGetAvailableLocales()
+{
+    int32_t count = 0;
+    const Locale* locales = BreakIterator::getAvailableLocales(count);
+
+    if (count == 0) {
+        errln("getAvailableLocales() returned an empty list!");
+    }
+
+    // Touch every entry so that a bad pointer or count shows up here.
+    int32_t idx = 0;
+    while (idx < count) {
+        logln(locales[idx].getName());
+        ++idx;
+    }
+}
+
+// Testing the BreakIterator::getDisplayName() function.
+// Checks the display name of en_US (only when the default locale is en_US)
+// and the display name of fr_FR as rendered for en_US.
+void RBBITest::TestGetDisplayName()
+{
+    UnicodeString   result;
+
+    // The English expectation is only checked when the default locale is
+    // en_US (presumably because this overload formats for the default
+    // locale -- confirm against the API docs).
+    BreakIterator::getDisplayName(Locale::getUS(), result);
+    if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
+        errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
+                + result + "\"");
+
+    BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
+    if (result != "French (France)")
+        errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
+                + result + "\"");
+}
+/**
+ * Test End Behaviour
+ * Walks a default-locale word iterator over "boo." and expects first() to
+ * return 0, the next boundary to be 3, and the end of the string (4) to be
+ * reached either already or by one more next().
+ * @bug 4068137
+ */
+void RBBITest::TestEndBehaviour()
+{
+    UErrorCode ec = U_ZERO_ERROR;
+    UnicodeString text("boo.");
+    BreakIterator *wordIter = BreakIterator::createWordInstance(Locale::getDefault(), ec);
+    if (U_FAILURE(ec))
+    {
+        errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n");
+        return;
+    }
+    wordIter->setText(text);
+
+    const int32_t start = wordIter->first();
+    if (start != 0) {
+        errln("Didn't get break at beginning of string.");
+    }
+
+    const int32_t beforePeriod = wordIter->next();
+    if (beforePeriod != 3) {
+        errln("Didn't get break before period in \"boo.\"");
+    }
+
+    // Only advance again when the iterator is not already at the end.
+    UBool reachedEnd = (wordIter->current() == 4);
+    if (!reachedEnd) {
+        reachedEnd = (wordIter->next() == 4);
+    }
+    if (!reachedEnd) {
+        errln("Didn't get break at end of string.");
+    }
+
+    delete wordIter;
+}
+/*
+ * @bug 4153072
+ * Gives a word iterator a character iterator restricted to a sub-range of
+ * the string and checks that isBoundary() does not report a boundary for
+ * offsets that lie before the start of that range.
+ */
+void RBBITest::TestBug4153072() {
+    UErrorCode ec = U_ZERO_ERROR;
+    BreakIterator *wordIter = BreakIterator::createWordInstance(Locale::getDefault(), ec);
+    if (U_FAILURE(ec))
+    {
+        errln("Failed to create the BreakIterator for default locale in TestBug4153072\n");
+        return;
+    }
+
+    UnicodeString text("...Hello, World!...");
+    int32_t begin = 3;
+    int32_t end = text.length() - 3;
+
+    // The character iterator is handed over via adoptText() and is not
+    // deleted here.
+    wordIter->adoptText(new StringCharacterIterator(text, begin, end, begin));
+
+    // Probe from one position before offset 0 up to and including 'begin'.
+    for (int index = -1; index <= begin; ++index) {
+        const UBool boundary = wordIter->isBoundary(index);
+        if (boundary == TRUE && index < begin) {
+            errln((UnicodeString)"Didn't handle preceeding correctly with offset = " + index +
+                            " and begin index = " + begin);
+        }
+    }
+    delete wordIter;
+}
+
+
+/**
+ * Test Japanese Line Break
+ *
+ * Uses a three-character string (ideograph, X, ideograph) with a line-break
+ * iterator for the Japan locale.  For each "preceding" character X the
+ * boundaries must be 0, 1, 3 (break before X, none after it); for each
+ * "following" character X they must be 0, 2, 3 (none before X, break
+ * after it).
+ * @bug 4095322
+ */
+void RBBITest::TestJapaneseLineBreak()
+{
+    // Change for Unicode TR 14:  Punctuation characters with categories Pi and Pf do not count
+    //        as opening and closing punctuation for line breaking.
+    //        Also, \u30fc and \u30fe are not counted as hyphens.   Remove these chars
+    //        from these tests.    6-13-2002  
+    //
+    UErrorCode ec = U_ZERO_ERROR;
+    UnicodeString sample = CharsToUnicodeString("\\u4e00x\\u4e8c");
+
+    // Characters that must stay attached to what follows them.
+    UnicodeString openers = CharsToUnicodeString(
+        //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
+        "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
+
+    // Characters that must stay attached to what precedes them.
+    UnicodeString closers = CharsToUnicodeString(
+        // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
+        ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
+        // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
+        ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
+        "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
+
+    BreakIterator *lineIter = BreakIterator::createLineInstance(Locale::getJapan(), ec);
+    if (U_FAILURE(ec))
+    {
+        errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
+        return;
+    }
+
+    int32_t idx;
+
+    // Expected boundaries: 0, 1, 3.
+    for (idx = 0; idx < openers.length(); idx++) {
+        UChar ch = openers[idx];
+        sample[1] = ch;
+        lineIter->setText(sample);
+
+        const int32_t start = lineIter->first();
+        if (start != 0) {
+            errln("ja line break failure: failed to start at 0");
+        }
+        const int32_t second = lineIter->next();
+        if (second != 1) {
+            errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(ch)
+                        + "' (" + ((int)(ch)) + ")");
+        }
+        const int32_t third = lineIter->next();
+        if (third != 3) {
+            errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(ch)
+                        + "' (" + ((int)(ch)) + ")");
+        }
+    }
+
+    // Expected boundaries: 0, 2, 3.
+    for (idx = 0; idx < closers.length(); idx++) {
+        UChar ch = closers[idx];
+        sample[1] = ch;
+        lineIter->setText(sample);
+
+        const int32_t start = lineIter->first();
+        if (start != 0) {
+            errln("ja line break failure: failed to start at 0");
+        }
+        const int32_t second = lineIter->next();
+        if (second != 2) {
+            errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(ch)
+                        + "' (" + ((int)(ch)) + ")");
+        }
+        const int32_t third = lineIter->next();
+        if (third != 3) {
+            errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(ch)
+                        + "' (" + ((int)(ch)) + ")");
+        }
+    }
+
+    delete lineIter;
+}
+
+
+//--------------------------------------------------------------------------------------------
+//
+//     Exhaustive Tests, using Unicode Data Files.
+//
+//--------------------------------------------------------------------------------------------
+
 //
 //  Token level scanner for the Unicode Line Break Test Data file.
 //      Return the next token, as follows:
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@ -27,6 +27,9 @@ class BITestData;
 class RBBITest: public IntlTest {
 public:
  
+    RBBITest();
+    ~RBBITest();
+
    void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
    /**
     * Tests default rules based character iteration
@ -67,6 +70,22 @@ public:
     **/  
    void TestLineBreakData();

+    void TestSentenceInvariants();
+    void TestCharacterInvariants();
+    void TestWordInvariants();
+    void TestLineInvariants();
+    void TestEmptyString();
+    void TestGetAvailableLocales();
+    void TestGetDisplayName();
+    void TestEndBehaviour();
+    void TestBug4153072();
+    void TestJapaneseLineBreak();
+    void TestThaiLineBreak();
+    void TestMixedThaiLineBreak(); 
+    void TestMaiyamok(); 
+    void TestThaiWordBreak();
+    
+    
    /**
    * Test Hindi Danda i.e make sure we have a break point before and after danda 
    **/ 
@ -136,6 +155,9 @@ private:
     **/
    void doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td);

+    void doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars);
+    void doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars);
+
 };