scuffed-code/icu4c/source/test/intltest/rbbitst.cpp

/********************************************************************
 * COPYRIGHT:
 * Copyright (c) 1999-2007, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
/************************************************************************
*   Date        Name        Description
*   12/15/99    Madhu        Creation.
*   01/12/2000  Madhu        Updated for changed API and added new tests
************************************************************************/

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "unicode/utypes.h"
#include "unicode/brkiter.h"
#include "unicode/rbbi.h"
#include "unicode/uchar.h"
#include "unicode/utf16.h"
#include "unicode/ucnv.h"
#include "unicode/schriter.h"
#include "unicode/uniset.h"
#include "unicode/regex.h"        // TODO: make conditional on regexp being built.
#include "unicode/ustring.h"
#include "unicode/utext.h"
#include "intltest.h"
#include "rbbitst.h"
#include <string.h>
#include "uvector.h"
#include "uvectr32.h"
#include "triedict.h"
#include <string.h>
#include <stdio.h>
#include <stdlib.h>

// TEST_ASSERT(x)
//    If the expression x evaluates to false, report a test failure through errln(),
//    including the source file name and line number of the macro invocation.
//    Expands to a braced block that calls errln() unqualified.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// TEST_ASSERT_SUCCESS(errcode)
//    If errcode satisfies U_FAILURE(), report a test failure through errln(), including
//    the source file name, line number and the symbolic name of the error code.
#define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
    errln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}


//---------------------------------------------
// runIndexedTest
//---------------------------------------------

//
//  Maps a test index onto a test name and, when exec is true, runs that test.
//
//    index   - zero-based number of the test being asked about.
//    exec    - when false only the name is reported; when true the test is also run.
//    name    - receives the name of the test; set to "" for an index past the last test.
//    params  - extra parameters, forwarded only to TestMonkey.
//
void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
{
    if (exec) {
        logln("TestSuite RuleBasedBreakIterator: ");
    }

    switch (index) {
    case 0:
        name = "TestBug4153072";
        if (exec) { TestBug4153072(); }
        break;

    case 1:
        name = "TestJapaneseLineBreak";
        if (exec) { TestJapaneseLineBreak(); }
        break;

    case 2:
        name = "TestStatusReturn";
        if (exec) { TestStatusReturn(); }
        break;

    case 3:
        name = "TestUnicodeFiles";
        if (exec) { TestUnicodeFiles(); }
        break;

    case 4:
        name = "TestEmptyString";
        if (exec) { TestEmptyString(); }
        break;

    case 5:
        name = "TestGetAvailableLocales";
        if (exec) { TestGetAvailableLocales(); }
        break;

    case 6:
        name = "TestGetDisplayName";
        if (exec) { TestGetDisplayName(); }
        break;

    case 7:
        name = "TestEndBehaviour";
        if (exec) { TestEndBehaviour(); }
        break;

    case 8:
        name = "TestMixedThaiLineBreak";
        if (exec) { TestMixedThaiLineBreak(); }
        break;

    case 9:
        name = "TestThaiLineBreak";
        if (exec) { TestThaiLineBreak(); }
        break;

    case 10:
        name = "TestMaiyamok";
        if (exec) { TestMaiyamok(); }
        break;

    case 11:
        name = "TestWordBreaks";
        if (exec) { TestWordBreaks(); }
        break;

    case 12:
        name = "TestWordBoundary";
        if (exec) { TestWordBoundary(); }
        break;

    case 13:
        name = "TestLineBreaks";
        if (exec) { TestLineBreaks(); }
        break;

    case 14:
        name = "TestSentBreaks";
        if (exec) { TestSentBreaks(); }
        break;

    case 15:
        name = "TestExtended";
        if (exec) { TestExtended(); }
        break;

    case 16:
        name = "TestMonkey";
        if (exec) {
            // The monkey test is only run when regular expressions are configured in.
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
            TestMonkey(params);
#else
            logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
#endif
        }
        break;

    case 17:
        name = "TestBug3818";
        if (exec) { TestBug3818(); }
        break;

    case 18:
        name = "TestJapaneseWordBreak";
        if (exec) { TestJapaneseWordBreak(); }
        break;

    case 19:
        name = "TestDebug";
        if (exec) { TestDebug(); }
        break;

    case 20:
        name = "TestTrieDict";
        if (exec) { TestTrieDict(); }
        break;

    case 21:
        name = "TestBug5775";
        if (exec) { TestBug5775(); }
        break;

    default:
        // An empty name tells the caller that there are no more tests.
        name = "";
        break;
    }
}


//---------------------------------------------------------------------------
//
//   class BITestData   Holds a set of Break iterator test data and results
//                      Includes
//                         - the string data to be broken
//                         - a vector of the expected break positions.
//                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
//                         - The expected break tag values.
//                         - Vectors of actual break positions and tag values.
//                         - Functions for comparing actual with expected and
//                            reporting errors.
//
//----------------------------------------------------------------------------
class BITestData {
public:
    UnicodeString    fDataToBreak;            // Concatenation of every data chunk added so far.
    UVector          fExpectedBreakPositions; // Expected boundary offsets; one entry per added chunk.
    UVector          fExpectedTags;           // Expected tag value for each expected boundary.
    UVector          fLineNum;                // Source line number recorded for each added chunk.
    UVector          fActualBreakPositions;   // Test Results.
    UVector          fActualTags;             // Tag values recorded alongside the actual boundaries.

    // Constructs the five vectors, passing status to each of their constructors.
    BITestData(UErrorCode &status);

    // Appends data (when non-NULL) to fDataToBreak and records the resulting text length,
    //   tag and lineNum as the next expected result.  Does nothing if status is a failure.
    //   NOTE: status is taken by value, so failures raised inside are not seen by the caller.
    void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);

    // Compares actual against expected results, reporting differences through test->errln().
    void             checkResults(const char *heading, RBBITest *test);

    // Reports a single boundary mismatch, identified by indices into the expected and
    //   actual position vectors.
    void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);

    // Empties fActualBreakPositions and fActualTags; the expected data is left untouched.
    void             clearResults();
};

//
// Constructor.
//
BITestData::BITestData(UErrorCode &status)
    : fExpectedBreakPositions(status),
      fExpectedTags(status),
      fLineNum(status),
      fActualBreakPositions(status),
      fActualTags(status)
{
    // Each vector is constructed with the caller's status.
    // fDataToBreak is default-constructed, so it starts out empty.
}

//
// addDataChunk.   Add a section (non-breaking) piece of data to the test data.
//                 The macro form collects the line number, which is helpful
//                 when tracking down failures.
//
//                 A null data item is inserted at the start of each test's data
//                  to put the starting zero into the data list.  The position saved for
//                  each non-null item is its ending position.
//
#define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
    if (U_FAILURE(status)) {return;}
    if (data != NULL) {
        fDataToBreak.append(CharsToUnicodeString(data));
    }
    fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
    fExpectedTags.addElement(tag, status);
    fLineNum.addElement(lineNum, status);
}


//
//  checkResults.   Compare the actual and expected break positions, report any differences.
//
//
//    heading  - text prefixed to every error message, identifying the test being checked.
//    test     - test object whose errln() receives the error messages.
//
//  The expected and actual position vectors are walked in parallel.  A position mismatch
//  is reported through err(); a matching position whose tag differs is reported directly.
//
void BITestData::checkResults(const char *heading, RBBITest *test) {
    int32_t   expectedIndex = 0;
    int32_t   actualIndex = 0;

    for (;;) {
        // If we've run through both the expected and actual results vectors, we're done.
        //   break out of the loop.
        if (expectedIndex >= fExpectedBreakPositions.size() &&
            actualIndex   >= fActualBreakPositions.size()) {
            break;
        }

        // Expected results are exhausted but actual breaks remain:  each one is an error.
        if (expectedIndex >= fExpectedBreakPositions.size()) {
            err(heading, test, expectedIndex-1, actualIndex);
            actualIndex++;
            continue;
        }

        // Actual results are exhausted but expected breaks remain:  each one is an error.
        if (actualIndex >= fActualBreakPositions.size()) {
            err(heading, test, expectedIndex, actualIndex-1);
            expectedIndex++;
            continue;
        }

        if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
            err(heading, test, expectedIndex, actualIndex);
            // Try to resync the positions of the indices, to avoid a rash of spurious errors.
            if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
                actualIndex++;
            } else {
                expectedIndex++;
            }
            continue;
        }

        // Positions agree; now the tags must agree too.
        if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
            // The line number is stored as an integer, so it must be fetched with
            //   elementAti();  elementAt() would hand a pointer to the %d conversion.
            test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
                heading, fLineNum.elementAti(expectedIndex),
                fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
        }

        actualIndex++;
        expectedIndex++;
    }
}

//
//  err   -  An error was found.  Report it, along with information about where the
//                                incorrectly broken test data appeared in the source file.
//
void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
{
    int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
    int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
    int32_t   o        = 0;
    int32_t   line     = fLineNum.elementAti(expectedIdx);
    if (expectedIdx > 0) {
        // The line numbers are off by one because a premature break occurs somewhere
        //    within the previous item, rather than at the start of the current (expected) item.
        //    We want to report the offset of the unexpected break from the start of
        //      this previous item.
        o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
    }
    if (actual < expected) {
        test->errln("%s unexpected break at offset %d in test item from line %d", heading, o, line);
    } else {
        test->errln("%s Failed to find break at end of item from line %d", heading, line);
    }
}


//
//  clearResults.   Discard the recorded actual results, ready for another test pass.
//                  The test text and the expected results are not touched.
//
void BITestData::clearResults()
{
    fActualTags.removeAllElements();
    fActualBreakPositions.removeAllElements();
}


//-----------------------------------------------------------------------------------
//
//    Canned Test Characters
//
//-----------------------------------------------------------------------------------

// Table of sample UTF-16 code units.  The final 0x0000 terminates the array, which lets
//   the RBBITest constructor hand it to the UnicodeString(const UChar *) constructor.
static const UChar cannedTestArray[] = {
    0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
    0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
    0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
    0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
    0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
    0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
    0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
    0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
};

// String form of the table above, preceded by a U+0000 character.  Allocated by the
//   RBBITest constructor and deleted by the RBBITest destructor.
static UnicodeString* cannedTestChars = 0;

// Escaped Devanagari sequences.  The backslashes are doubled so that the \uXXXX escapes
//   survive C++ compilation as text; presumably they are meant to be expanded later by
//   CharsToUnicodeString(), as the test data chunks in this file are -- confirm at the use sites.
//   "half" forms are consonant + virama (U+094D) + zero width joiner (U+200D);
//   the "dead" form is consonant + virama only.
#define  halfNA     "\\u0928\\u094d\\u200d"
#define  halfSA     "\\u0938\\u094d\\u200d"
#define  halfCHA    "\\u091a\\u094d\\u200d"
#define  halfKA     "\\u0915\\u094d\\u200d"
#define  deadTA     "\\u0924\\u094d"

//--------------------------------------------------------------------------------------
//
//    RBBITest    constructor and destructor
//
//--------------------------------------------------------------------------------------

//
//  Constructor.  Builds the file-level cannedTestChars string:  a single U+0000
//                character followed by the contents of cannedTestArray.
//
RBBITest::RBBITest() {
    cannedTestChars = new UnicodeString();
    cannedTestChars->append((UChar)0x0000);
    cannedTestChars->append(UnicodeString(cannedTestArray));
}


//
//  Destructor.  Releases the file-level cannedTestChars string.
//
RBBITest::~RBBITest() {
    delete cannedTestChars;
    // cannedTestChars is a file-level static, not a member.  Clear it so that no dangling
    //   pointer is left behind and a later destructor call cannot delete it a second time.
    cannedTestChars = 0;
}


// Tag values for use as expected break tags in test data.
//   NOTE(review): the names and values look like they mirror the word-break rule status
//   ranges (number / letter / hiragana-or-katakana / ideograph) -- confirm against ubrk.h.
static const int T_NUMBER = 100;
static const int T_LETTER = 200;
static const int T_H_OR_K = 300;
static const int T_IDEO   = 400;


//--------------------------------------------------------------------
//Testing the BreakIterator for devanagari script
//--------------------------------------------------------------------

// Escaped Devanagari sequences.  A "dead" form is a consonant followed by virama (U+094D).
#define deadRA   "\\u0930\\u094d"         /*deadform RA = devanagari RA + virama*/
#define deadPHA  "\\u092b\\u094d"         /*deadform PHA = devanagari PHA + virama*/
#define deadTTHA "\\u0920\\u094d"         /*deadform TTHA = devanagari TTHA + virama*/
#define deadPA   "\\u092a\\u094d"         /*deadform PA = devanagari PA + virama*/
#define deadSA   "\\u0938\\u094d"         /*deadform SA = devanagari SA + virama*/
#define visarga  "\\u0903"                /*devanagari visarga looks like a english colon*/


//-----------------------------------------------------------------------------------
//
//   Test for status {tag} return value from break rules.
//        TODO:  a more thorough test.
//
//-----------------------------------------------------------------------------------
void RBBITest::TestStatusReturn() {
     UnicodeString rulesString1 = "$Letters = [:L:];\n"
                                  "$Numbers = [:N:];\n"
                                  "$Letters+{1};\n"
                                  "$Numbers+{2};\n"
                                  "Help\\ {4}/me\\!;\n"
                                  "[^$Letters $Numbers];\n"
                                  "!.*;\n";
     UnicodeString testString1  = "abc123..abc Help me Help me!";
                                // 01234567890123456789012345678
     int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
     int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};

     UErrorCode status=U_ZERO_ERROR;
     UParseError    parseError;

     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
     if(U_FAILURE(status)) {
         errln("FAIL : in construction");
     } else {
         int32_t  pos;
         int32_t  i = 0;
         bi->setText(testString1);
         for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
             if (pos != bounds1[i]) {
                 errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
                 break;
             }

             int tag = bi->getRuleStatus();
             if (tag != brkStatus[i]) {
                 errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
                 break;
             }
             i++;
         }
     }
     delete bi;
}


//
//  printStringBreaks   Debugging aid.  Prints to stdout one line per code point of ustr,
//                      showing a selection of its Unicode properties and its name.
//                      A separator line is printed ahead of each string index that
//                      appears in expected[0 .. expectedcount-1].
//
static void printStringBreaks(UnicodeString ustr, int expected[],
                              int expectedcount)
{
    UErrorCode errCode = U_ZERO_ERROR;
    char       charName[100];
    int        idx;
    int        e;

    printf("code    alpha extend alphanum type word sent line name\n");

    for (idx = 0; idx < ustr.length(); idx ++) {
        // Mark every expected position that coincides with this index.
        for (e = 0; e < expectedcount; e ++) {
            if (expected[e] == idx) {
                printf("------------------------------------------------ %d\n",
                       idx);
            }
        }

        UChar32 c = ustr.char32At(idx);
        if (c > 0xffff) {
            // Supplementary code point: step over its second code unit as well.
            idx ++;
        }
        u_charName(c, U_UNICODE_CHAR_NAME, charName, 100, &errCode);

        const char *generalCategory = u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
                                                             u_charType(c),
                                                             U_SHORT_PROPERTY_NAME);
        const char *wordBreak       = u_getPropertyValueName(UCHAR_WORD_BREAK,
                                                             u_getIntPropertyValue(c, UCHAR_WORD_BREAK),
                                                             U_SHORT_PROPERTY_NAME);
        const char *sentenceBreak   = u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
                                                             u_getIntPropertyValue(c, UCHAR_SENTENCE_BREAK),
                                                             U_SHORT_PROPERTY_NAME);
        const char *lineBreak       = u_getPropertyValueName(UCHAR_LINE_BREAK,
                                                             u_getIntPropertyValue(c, UCHAR_LINE_BREAK),
                                                             U_SHORT_PROPERTY_NAME);

        printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
                           u_isUAlphabetic(c),
                           u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
                           u_isalnum(c),
                           generalCategory,
                           wordBreak,
                           sentenceBreak,
                           lineBreak,
                           charName);
    }
}

//
//  Runs the general iterator tests on a Thai line break iterator, using a run of Thai
//  text split into the chunks at whose ends a line break is expected.
//
void RBBITest::TestThaiLineBreak() {
    UErrorCode status = U_ZERO_ERROR;
    BITestData thaiLineSelection(status);

    // \u0e2f-- the Thai paiyannoi character-- isn't a letter.  It's a symbol that
    // represents elided letters at the end of a long word.  It should be bound to
    // the end of the word and not treated as an independent punctuation mark.


    ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
    ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
//        ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
//        ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
    // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
    ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);

    // the one time where the paiyannoi occurs somewhere other than at the end
    // of a word is in the Thai abbreviation for "etc.", which both begins and
    // ends with a paiyannoi
    ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);

    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
        Locale("th"), status);
    // The iterator is dereferenced below, so a NULL result is treated as a failure too,
    //   and whatever was returned is released before bailing out.
    if (U_FAILURE(status) || e == NULL)
    {
        errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n");
        delete e;
        return;
    }

    generalIteratorTest(*e, thaiLineSelection);
    delete e;
}


//
//  Runs the general iterator tests on a Thai line break iterator, using text that mixes
//  Thai words with Arabic numerals, Thai numerals, punctuation and English characters.
//
void RBBITest::TestMixedThaiLineBreak()
{
    UErrorCode   status = U_ZERO_ERROR;
    BITestData   thaiLineSelection(status);

    ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data


    // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
    // start

    ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);

    // @suwit - end of changes


    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
    // The iterator is dereferenced below, so a NULL result is treated as a failure too,
    //   and whatever was returned is released before bailing out.
    if (U_FAILURE(status) || e == NULL)
    {
        errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n");
        delete e;
        return;
    }


    generalIteratorTest(*e, thaiLineSelection);
    delete e;
}


//
//  Runs the general iterator tests on a Thai line break iterator, using text in which
//  the maiyamok character (\u0e46) follows some of the words.
//
void RBBITest::TestMaiyamok()
{
    UErrorCode status = U_ZERO_ERROR;
    BITestData   thaiLineSelection(status);
    ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
    // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
    // word".  Instead of appearing as a word unto itself, however, it's kept together
    // with the word before it
    ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);

    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
        Locale("th"), status);

    // The iterator is dereferenced below, so a NULL result is treated as a failure too,
    //   and whatever was returned is released before bailing out.
    if (U_FAILURE(status) || e == NULL)
    {
        errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n");
        delete e;
        return;
    }
    generalIteratorTest(*e, thaiLineSelection);
    delete e;
}


void RBBITest::TestBug3818() {
    UErrorCode  status = U_ZERO_ERROR;

    // Four Thai words...
    static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
                                           0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
    UnicodeString  thaiStr(thaiWordData);

    RuleBasedBreakIterator* bi =
        (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
    if (U_FAILURE(status) || bi == NULL) {
        errln("Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
        return;
    }
    bi->setText(thaiStr);

    int32_t  startOfSecondWord = bi->following(1);
    if (startOfSecondWord != 4) {
        errln("Fail at file %s, line %d expected start of word at 4, got %d",
            __FILE__, __LINE__, startOfSecondWord);
    }
    startOfSecondWord = bi->following(0);
    if (startOfSecondWord != 4) {
        errln("Fail at file %s, line %d expected start of word at 4, got %d",
            __FILE__, __LINE__, startOfSecondWord);
    }
    delete bi;
}


void RBBITest::TestJapaneseWordBreak() {
    UErrorCode status = U_ZERO_ERROR;
    BITestData   japaneseWordSelection(status);

    ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status);           // Break at start of data
    ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
    ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
    ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
    ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
    ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
    ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12

    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
        Locale("ja"), status);
    if (U_FAILURE(status))
    {
        errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
        return;
    }

    generalIteratorTest(*e, japaneseWordSelection);
    delete e;
}

//
//  TestTrieDict.   Exercises the trie dictionary classes:
//                    - reads the word list "riwords.txt" from the test data directory and
//                      adds every non-comment line to a MutableTrieDictionary;
//                    - compacts it into a CompactTrieDictionary, then clones that back
//                      into a second MutableTrieDictionary;
//                    - builds a second CompactTrieDictionary from the first one's data;
//                    - checks at every stage that the number of words enumerated equals
//                      the number of words read from the file.
//
//                  Every object is declared up front and released in one place, at the
//                  cleanup label, which all of the error paths jump to.
//
void RBBITest::TestTrieDict() {
    UErrorCode      status  = U_ZERO_ERROR;

    //
    //  Open and read the test data file.
    //
    const char *testDataDirectory = IntlTest::getSourceTestData(status);
    char testFileName[1000];
    if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
        errln("Can't open test data.  Path too long.");
        return;
    }
    strcpy(testFileName, testDataDirectory);
    strcat(testFileName, "riwords.txt");

    // Items needing deleting at the end
    MutableTrieDictionary *mutableDict = NULL;
    CompactTrieDictionary *compactDict = NULL;
    UnicodeSet            *breaks      = NULL;
    UChar                 *testFile    = NULL;
    StringEnumeration     *enumer1     = NULL;
    StringEnumeration     *enumer2     = NULL;
    MutableTrieDictionary *mutable2    = NULL;
    StringEnumeration     *cloneEnum   = NULL;
    CompactTrieDictionary *compact2    = NULL;


    const UnicodeString *originalWord = NULL;
    const UnicodeString *cloneWord    = NULL;
    UChar *current;
    UChar *word;
    UChar uc;
    int32_t wordLen;
    int32_t wordCount;
    int32_t testCount;

    int    len;
    testFile = ReadAndConvertFile(testFileName, len, NULL, status);
    if (U_FAILURE(status)) {
        goto cleanup; /* something went wrong, error already output */
    }

    mutableDict = new MutableTrieDictionary(0x0E1C, status);
    if (U_FAILURE(status)) {
        errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
        goto cleanup;
    }

    breaks = new UnicodeSet;
    breaks->add(0x000A);     // Line Feed
    breaks->add(0x000D);     // Carriage Return
    breaks->add(0x2028);     // Line Separator
    breaks->add(0x2029);     // Paragraph Separator

    // Now add each non-comment line of the file as a word.
    //   uc always holds the character just read; current points one past it.
    current = testFile;
    word = current;
    uc = *current++;
    wordLen = 0;
    wordCount = 0;

    while (uc) {
        if (uc == 0x0023) {     // #comment line, skip
            while (uc && !breaks->contains(uc)) {
                uc = *current++;
            }
        }
        else while (uc && !breaks->contains(uc)) {
            ++wordLen;
            uc = *current++;
        }
        if (wordLen > 0) {
            mutableDict->addWord(word, wordLen, status);
            if (U_FAILURE(status)) {
                errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
                goto cleanup;
            }
            wordCount += 1;
        }

        // Find beginning of next line
        while (uc && breaks->contains(uc)) {
            uc = *current++;
        }
        // current is one past uc, so the next word starts at current-1.
        word = current-1;
        wordLen = 0;
    }

    if (wordCount < 50) {
        errln("Word count (%d) unreasonably small\n", wordCount);
        goto cleanup;
    }

    enumer1 = mutableDict->openWords(status);
    if (U_FAILURE(status)) {
        errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
        goto cleanup;
    }

    testCount = 0;
    if (wordCount != (testCount = enumer1->count(status))) {
        errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
            testCount, wordCount, u_errorName(status));
        goto cleanup;
    }

    // Now compact it
    compactDict = new CompactTrieDictionary(*mutableDict, status);
    if (U_FAILURE(status)) {
        errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
        goto cleanup;
    }

    enumer2 = compactDict->openWords(status);
    if (U_FAILURE(status)) {
        errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
        goto cleanup;
    }

    if (wordCount != (testCount = enumer2->count(status))) {
        errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
            testCount, wordCount, u_errorName(status));
        goto cleanup;
    }

    if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) {
        errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same");
    }
    // Both enumerators are finished with.  The pointers are cleared so that the
    //   deletes at the cleanup label do not release them a second time.
    delete enumer1;
    enumer1 = NULL;
    delete enumer2;
    enumer2 = NULL;

    // Now un-compact it
    mutable2 = compactDict->cloneMutable(status);
    if (U_FAILURE(status)) {
        errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
        goto cleanup;
    }

    cloneEnum = mutable2->openWords(status);
    if (U_FAILURE(status)) {
        errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
        goto cleanup;
    }

    if (wordCount != (testCount = cloneEnum->count(status))) {
        errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
            testCount, wordCount, u_errorName(status));
        goto cleanup;
    }

    // Compact original dictionary to clone. Note that we can only compare the same kind of
    // dictionary as the order of the enumerators is not guaranteed to be the same between
    // different kinds
    enumer1 = mutableDict->openWords(status);
    if (U_FAILURE(status)) {
        errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
        goto cleanup;
     }

    originalWord = enumer1->snext(status);
    cloneWord = cloneEnum->snext(status);
    while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
        if (*originalWord != *cloneWord) {
            errln("Original and cloned MutableTrieDictionary word mismatch\n");
            goto cleanup;
        }
        originalWord = enumer1->snext(status);
        cloneWord = cloneEnum->snext(status);
    }

    if (U_FAILURE(status)) {
        errln("Enumeration failed: %s\n", u_errorName(status));
        goto cleanup;
    }

    // Both enumerations must have run out together, leaving both pointers NULL.
    if (originalWord != cloneWord) {
        errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
        goto cleanup;
    }

    // Test the data copying constructor for CompactTrieDict, and the data access APIs.
    compact2 = new CompactTrieDictionary(compactDict->data(), status);
    if (U_FAILURE(status)) {
        errln("CompactTrieDictionary(const void *,...) failed\n");
        goto cleanup;
    }

    if (compact2->dataSize() == 0) {
        errln("CompactTrieDictionary->dataSize() == 0\n");
        goto cleanup;
    }

    // Now count the words via the second dictionary
    delete enumer1;
    enumer1 = compact2->openWords(status);
    if (U_FAILURE(status)) {
        errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
        goto cleanup;
    }

    if (wordCount != (testCount = enumer1->count(status))) {
        errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
            testCount, wordCount, u_errorName(status));
        goto cleanup;
    }

cleanup:
    delete compactDict;
    delete mutableDict;
    delete breaks;
    delete[] testFile;
    delete enumer1;
    // enumer2 is still open if an error path was taken between opening it and its
    //   normal release above;  on every other path it is NULL and this is a no-op.
    delete enumer2;
    delete mutable2;
    delete cloneEnum;
    delete compact2;
}


//----------------------------------------------------------------------------
//
// generalIteratorTest      Exercise one break iterator against one set of
//                          test data using every traversal style covered by
//                          the helper tests below (first/next, last/previous,
//                          following, preceding, isBoundary, next(n)).
//
//                          bi   the iterator under test.
//                          td   the text to break plus the expected results.
//
//----------------------------------------------------------------------------
void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
{
    // All of the helper tests share one signature, so they are driven from a table.
    // The order of the entries is the order in which the helpers are run.
    typedef void (RBBITest::*IteratorCheck)(RuleBasedBreakIterator&, BITestData&);
    static const IteratorCheck checks[] = {
        &RBBITest::testFirstAndNext,
        &RBBITest::testLastAndPrevious,
        &RBBITest::testFollowing,
        &RBBITest::testPreceding,
        &RBBITest::testIsBoundary,
        &RBBITest::doMultipleSelectionTest
    };
    const int32_t checkCount = (int32_t)(sizeof(checks) / sizeof(checks[0]));

    bi.setText(td.fDataToBreak);

    for (int32_t k = 0; k < checkCount; k++) {
        (this->*checks[k])(bi, td);
    }
}


//
//   testFirstAndNext.   Walk the text front to back with first() and then
//                       repeated next() calls, recording every boundary and
//                       its rule status, then compare with the expected data.
//
void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
{
    UErrorCode  ec          = U_ZERO_ERROR;
    int32_t     previousPos = -1;

    logln("Test first and next");
    bi.setText(td.fDataToBreak);
    td.clearResults();

    int32_t pos = bi.first();
    while (pos != RuleBasedBreakIterator::DONE) {
        // Record the boundary and the status of the rule that produced it.
        td.fActualBreakPositions.addElement(pos, ec);
        td.fActualTags.addElement(bi.getRuleStatus(), ec);

        if (pos <= previousPos) {
            // The iterator failed to advance.  Bail out of the loop; the bogus
            //   position was already recorded, so checkResults() will report it.
            break;
        }
        previousPos = pos;
        pos = bi.next();
    }
    td.checkResults("testFirstAndNext", this);
}


//
//  testLastAndPrevious.   Run the iterator backwards, starting with last() and
//                         stepping with previous().  Results are inserted at the
//                         front of the result vectors so that they end up in
//                         ascending order for checkResults().
//
void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
{
    UErrorCode  status = U_ZERO_ERROR;
    int32_t     p;
    int32_t     lastP  = 0x7ffffffe;   // Starting value for the "made progress" check below.
    int32_t     tag;

    // This message previously read "Test first and next" (copied from
    //   testFirstAndNext), which mislabeled the verbose log output.
    logln("Test last and previous");
    bi.setText(td.fDataToBreak);
    td.clearResults();

    for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
        // Save break position.  Insert it at start of vector of results, shoving
        //    already-saved results further towards the end.
        td.fActualBreakPositions.insertElementAt(p, 0, status);
        // bi.previous();   // TODO:  Why does this fix things up????
        // bi.next();
        tag = bi.getRuleStatus();
        td.fActualTags.insertElementAt(tag, 0, status);
        if (p >= lastP) {
            // If the iterator is not making progress, stop.
            //  No need to raise an error here, it'll be detected in the normal check of results.
            break;
        }
        lastP = p;
    }
    td.checkResults("testLastAndPrevious", this);
}


//
//  testFollowing.   Call following(i) for each offset i from the start of the
//                   text onwards, recording each distinct boundary returned,
//                   until the iterator reports DONE.
//
void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
{
    UErrorCode  status    = U_ZERO_ERROR;
    int32_t     prevBreak = -2;   // Sentinel that is never a break position.
                                  //   -1 is unusable because that is DONE.
    int         offset;

    logln("testFollowing():");
    bi.setText(td.fDataToBreak);
    td.clearResults();

    // following() will not hand back the starting boundary, so record first() by hand.
    td.fActualBreakPositions.addElement(bi.first(), status);
    td.fActualTags.addElement(bi.getRuleStatus(), status);

    const int32_t textLength = td.fDataToBreak.length();
    for (offset = 0; offset <= textLength+1; offset++) {
        const int32_t brk = bi.following(offset);
        if (brk == prevBreak) {
            // Same boundary as for the previous offset; nothing new to record.
            continue;
        }
        if (brk == RuleBasedBreakIterator::DONE) {
            break;
        }
        // A boundary we have not seen yet.  Record it along with its status.
        td.fActualBreakPositions.addElement(brk, status);
        td.fActualTags.addElement(bi.getRuleStatus(), status);
        prevBreak = brk;
    }

    // The loop is expected to end via the break above, with offset equal to the
    //   text length.  Any other value means DONE came at the wrong offset.
    if (offset != textLength) {
        errln("testFollowing():  iterator returned DONE prematurely.");
    }

    // Full check of all results.
    td.checkResults("testFollowing", this);
}


//
//  testPreceding.   Call preceding(i) for each offset i from the end of the
//                   text downwards, recording each distinct boundary returned,
//                   until the iterator reports DONE.
//
void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
    UErrorCode  status    = U_ZERO_ERROR;
    int32_t     prevBreak = 0x7ffffffe;   // Sentinel; not expected to be a real break position.
    int         offset;

    logln("testPreceding():");
    bi.setText(td.fDataToBreak);
    td.clearResults();

    // Record the final boundary by hand, then work backwards from the end of the text.
    td.fActualBreakPositions.addElement(bi.last(), status);
    td.fActualTags.addElement(bi.getRuleStatus(), status);

    for (offset = td.fDataToBreak.length(); offset >= -1; offset--) {
        const int32_t brk = bi.preceding(offset);
        if (brk == prevBreak) {
            // Same boundary as for the previous offset; nothing new to record.
            continue;
        }
        if (brk == RuleBasedBreakIterator::DONE) {
            break;
        }
        // A boundary we have not seen yet.  Insert at the front so that the
        //   results end up in ascending order.
        td.fActualBreakPositions.insertElementAt(brk, 0, status);
        prevBreak = brk;
        td.fActualTags.insertElementAt(bi.getRuleStatus(), 0, status);
    }

    // The loop is expected to end via the break above, with offset at zero.
    //   Any other value means DONE came at the wrong offset.
    if (offset != 0) {
        errln("testPreceding():  iterator returned DONE prematurely.");
    }

    // Full check of all results.
    td.checkResults("testPreceding", this);
}


//
//  testIsBoundary.   Ask isBoundary() about every offset from 0 through the
//                    text length (inclusive) and record the offsets it accepts.
//
void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
    UErrorCode  status = U_ZERO_ERROR;

    logln("testIsBoundary():");
    bi.setText(td.fDataToBreak);
    td.clearResults();

    const int32_t textLength = td.fDataToBreak.length();
    for (int offset = 0; offset <= textLength; offset++) {
        if (!bi.isBoundary(offset)) {
            continue;
        }
        // Record the boundary and the rule status the iterator reports for it.
        td.fActualBreakPositions.addElement(offset, status);
        td.fActualTags.addElement(bi.getRuleStatus(), status);
    }
    td.checkResults("testIsBoundary: ", this);
}


//
//  doMultipleSelectionTest.   Check that next(n) on a clone of the iterator
//                             lands on the same boundary as n single steps of
//                             next() (forwards) or previous() (backwards) on
//                             the original.  Also exercises clone(),
//                             operator== and operator!=.
//
void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
{
    iterator.setText(td.fDataToBreak);

    RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
    if (testIterator == NULL) {
        // Nothing below can run without the clone; report and bail out rather than crash.
        errln("doMultipleSelectionTest: clone() returned NULL.");
        return;
    }
    int32_t offset = iterator.first();
    int32_t testOffset;
    int32_t count = 0;

    logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());

    if (*testIterator != iterator)
        errln("clone() or operator!= failed: two clones compared unequal");

    do {
        // Reposition the clone at the start and jump count boundaries in one call.
        testOffset = testIterator->first();
        testOffset = testIterator->next(count);
        if (offset != testOffset)
            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);

        if (offset != RuleBasedBreakIterator::DONE) {
            count++;
            offset = iterator.next();

            // The original has now moved one step past the clone, so the two must differ.
            if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
                errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
                if (count > 10000 || offset == -1) {
                    errln("operator== failed too many times. Stopping test.");
                    if (offset == -1) {
                        errln("Does (RuleBasedBreakIterator::DONE == -1)?");
                    }
                    // Release the clone before the early exit; it used to be leaked here.
                    delete testIterator;
                    return;
                }
            }
        }
    } while (offset != RuleBasedBreakIterator::DONE);

    // now do it backwards...
    offset = iterator.last();
    count = 0;

    do {
        testOffset = testIterator->last();
        testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
        if (offset != testOffset)
            errln(UnicodeString("next(n) and previous() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and previous() had " + offset);

        if (offset != RuleBasedBreakIterator::DONE) {
            count--;
            offset = iterator.previous();
        }
    } while (offset != RuleBasedBreakIterator::DONE);

    delete testIterator;
}


//---------------------------------------------
//
//     other tests
//
//---------------------------------------------
//
//  TestEmptyString.   Run the general iterator tests with a default-locale
//                     line break iterator over an empty string.
//
void RBBITest::TestEmptyString()
{
    UErrorCode status = U_ZERO_ERROR;

    BITestData x(status);
    ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
    RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
    if (U_FAILURE(status))
    {
        errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n");
        delete bi;      // Harmless if NULL; avoids a leak if an object was returned with a failure status.
        return;
    }
    generalIteratorTest(*bi, x);
    delete bi;
}

void RBBITest::TestGetAvailableLocales()
{
    int32_t locCount = 0;
    const Locale* locList = BreakIterator::getAvailableLocales(locCount);

    if (locCount == 0)
        errln("getAvailableLocales() returned an empty list!");
    // Just make sure that it's returning good memory.
    int32_t i;
    for (i = 0; i < locCount; ++i) {
        logln(locList[i].getName());
    }
}

//Testing the BreakIterator::getDisplayName() function
void RBBITest::TestGetDisplayName()
{
    UnicodeString   result;

    BreakIterator::getDisplayName(Locale::getUS(), result);
    if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
        errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
                + result);

    BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
    if (result != "French (France)")
        errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
                + result);
}
/**
 * Test End Behaviour
 * @bug 4068137
 */
void RBBITest::TestEndBehaviour()
{
    UErrorCode status = U_ZERO_ERROR;
    UnicodeString testString("boo.");
    BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
    if (U_FAILURE(status))
    {
        errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n");
        return;
    }
    wb->setText(testString);

    if (wb->first() != 0)
        errln("Didn't get break at beginning of string.");
    if (wb->next() != 3)
        errln("Didn't get break before period in \"boo.\"");
    if (wb->current() != 4 && wb->next() != 4)
        errln("Didn't get break at end of string.");
    delete wb;
}
/*
 * @bug 4153072
 */
void RBBITest::TestBug4153072() {
    UErrorCode status = U_ZERO_ERROR;
    BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
    if (U_FAILURE(status))
    {
        errln("Failed to create the BreakIterator for default locale in TestBug4153072\n");
        return;
    }
    UnicodeString str("...Hello, World!...");
    int32_t begin = 3;
    int32_t end = str.length() - 3;
    UBool onBoundary;

    StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
    iter->adoptText(textIterator);
    int index;
    // Note: with the switch to UText, there is no way to restrict the
    //       iteration range to begin at an index other than zero.
    //       String character iterators created with a non-zero bound are
    //         treated by RBBI as being empty.
    for (index = -1; index < begin + 1; ++index) {
        onBoundary = iter->isBoundary(index);
        if (index == 0?  !onBoundary : onBoundary) {
            errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
                            " and begin index = " + begin);
        }
    }
    delete iter;
}


//
// Test for problem reported by Ashok Matoria on 9 July 2007
//    One.<kSoftHyphen><kSpace>Two.
//
//    Sentence break at start (0) and then on calling next() it breaks at
//   ‘T’ of “Two”. Now, at this point if I do next() and
//    then previous(), it breaks at <kSOftHyphen> instead of ‘T’ of Two.
//
void RBBITest::TestBug5775() {
    UErrorCode status = U_ZERO_ERROR;
    BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT(bi != NULL);

    UnicodeString s("One.\\u00ad Two.");
    //               01234      56789
    s = s.unescape();
    bi->setText(s);
    int pos = bi->next();
    TEST_ASSERT(pos == 6);
    pos = bi->next();
    TEST_ASSERT(pos == 10);
    pos = bi->previous();
    TEST_ASSERT(pos == 6);
    delete bi;
}
    
    
/**
 * Test Japanese Line Break
 *
 * NOTE: the entire body is compiled out with #if 0, so this test currently
 *       does nothing when run.  The disabled code is kept for reference.
 * @bug 4095322
 */
void RBBITest::TestJapaneseLineBreak()
{
#if 0
    // Test needs updating some more...   Dump it for now.


    // Change for Unicode TR 14:  Punctuation characters with categories Pi and Pf do not count
    //        as opening and closing punctuation for line breaking.
    //        Also, \u30fc and \u30fe are not counted as hyphens.   Remove these chars
    //        from these tests.    6-13-2002
    //
    UErrorCode status = U_ZERO_ERROR;
    UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
    UnicodeString precedingChars = CharsToUnicodeString(
        //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
        "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
    UnicodeString followingChars = CharsToUnicodeString(
        // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
        ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
        // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
        ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
        "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
    BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);

    int32_t i;
    if (U_FAILURE(status))
    {
        errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
        return;
    }

    for (i = 0; i < precedingChars.length(); i++) {
        testString.setCharAt(1, precedingChars[i]);
        iter->setText(testString);
        int32_t j = iter->first();
        if (j != 0)
            errln("ja line break failure: failed to start at 0");
        j = iter->next();
        if (j != 1)
            errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
                        + "' (" + ((int)(precedingChars[i])) + ")");
        j = iter->next();
        if (j != 3)
            errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
                        + "' (" + ((int)(precedingChars[i])) + ")");
    }

    for (i = 0; i < followingChars.length(); i++) {
        testString.setCharAt(1, followingChars[i]);
        iter->setText(testString);
        int j = iter->first();
        if (j != 0)
            errln("ja line break failure: failed to start at 0");
        j = iter->next();
        if (j != 2)
            errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
                        + "' (" + ((int)(followingChars[i])) + ")");
        j = iter->next();
        if (j != 3)
            errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
                        + "' (" + ((int)(followingChars[i])) + ")");
    }
    delete iter;
#endif
}


//------------------------------------------------------------------------------
//
//   RBBITest::Extended    Run  RBBI Tests from an external test data file
//
//------------------------------------------------------------------------------

// One test case for executeTest(): an iterator, the text to run it over, and
//   per-offset tables describing the expected outcome.  Filled in by TestExtended()
//   while it parses the test data file.
struct TestParams {
    BreakIterator   *bi;               // Iterator under test.  executeTest() does nothing when NULL.
    UnicodeString    dataToBreak;      // The text handed to bi->setText().
    UVector32       *expectedBreaks;   // Indexed by offset into dataToBreak:
                                       //    0  no break expected at this offset
                                       //   -1  break expected, with a rule status of 0
                                       //    n  break expected, with a rule status of n
    UVector32       *srcLine;          // Indexed by offset: line number in the test file (for error messages).
    UVector32       *srcCol;           // Indexed by offset: column in the test file (for error messages).
};

//
//  executeTest.   Run the iterator in *t over t->dataToBreak, first forwards
//                 (first/next) and then backwards (last/previous).  In each
//                 direction verify that
//                    - every expected break is found,
//                    - no unexpected break is found,
//                    - the rule status at each break matches the expected tag.
//                 Failures are reported with the test file line/column that
//                 the offending offset came from.
//
//                 Does nothing if t->bi is NULL.
//
void RBBITest::executeTest(TestParams *t) {
    int32_t    bp;
    int32_t    prevBP;
    int32_t    i;

    if (t->bi == NULL) {
        return;
    }

    t->bi->setText(t->dataToBreak);
    //
    //  Run the iterator forward
    //
    prevBP = -1;
    for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
        if (prevBP ==  bp) {
            // Fail for lack of forward progress.
            errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
            break;
        }

        // Check that we didn't miss an expected break between the last one
        //  and this one.
        for (i=prevBP+1; i<bp; i++) {
            if (t->expectedBreaks->elementAti(i) != 0) {
                int expected[] = {0, i};
                printStringBreaks(t->dataToBreak, expected, 2);
                errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
            }
        }

        // Check that the break we did find was expected
        if (t->expectedBreaks->elementAti(bp) == 0) {
            int expected[] = {0, bp};
            printStringBreaks(t->dataToBreak, expected, 2);
            errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
        } else {
            // The break was expected.
            //   Check that the {nnn} tag value is correct.
            //   A stored -1 stands for "break expected, rule status 0".
            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
            if (expectedTagVal == -1) {
                expectedTagVal = 0;
            }
            int32_t line = t->srcLine->elementAti(bp);
            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
            if (rs != expectedTagVal) {
                errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
                      "          Actual, Expected status = %4d, %4d",
                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
            }
        }


        prevBP = bp;
    }

    // Verify that there were no missed expected breaks after the last one found
    for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
        if (t->expectedBreaks->elementAti(i) != 0) {
            errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
        }
    }

    //
    //  Run the iterator backwards, verify that the same breaks are found.
    //
    prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
    for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
        if (prevBP ==  bp) {
            // Fail for lack of progress.
            errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
            break;
        }

        // Check that we didn't miss an expected break between the last one
        //  and this one.  (UVector returns zeros for index out of bounds.)
        for (i=prevBP-1; i>bp; i--) {
            if (t->expectedBreaks->elementAti(i) != 0) {
                errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
            }
        }

        // Check that the break we did find was expected
        if (t->expectedBreaks->elementAti(bp) == 0) {
            errln("Reverse Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
                   bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
        } else {
            // The break was expected.
            //   Check that the {nnn} tag value is correct.
            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
            if (expectedTagVal == -1) {
                expectedTagVal = 0;
            }
            int line = t->srcLine->elementAti(bp);
            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
            if (rs != expectedTagVal) {
                errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
                      "          Actual, Expected status = %4d, %4d",
                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
            }
        }

        prevBP = bp;
    }

    // Verify that there were no missed breaks prior to the last one found.
    //   This is still part of the reverse pass; the message used to say "Forward".
    for (i=prevBP-1; i>=0; i--) {
        if (t->expectedBreaks->elementAti(i) != 0) {
            errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
        }
    }
}


void RBBITest::TestExtended() {
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
    UErrorCode      status  = U_ZERO_ERROR;
    Locale          locale("");

    UnicodeString       rules;
    TestParams          tp;
    tp.bi             = NULL;
    tp.expectedBreaks = new UVector32(status);
    tp.srcLine        = new UVector32(status);
    tp.srcCol         = new UVector32(status);

    RegexMatcher      localeMatcher("<locale *([\\p{L}\\p{Nd}_]*) *>", 0, status);
    TEST_ASSERT_SUCCESS(status);


    //
    //  Open and read the test data file.
    //
    const char *testDataDirectory = IntlTest::getSourceTestData(status);
    char testFileName[1000];
    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
        errln("Can't open test data.  Path too long.");
        return;
    }
    strcpy(testFileName, testDataDirectory);
    strcat(testFileName, "rbbitst.txt");

    int    len;
    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
    if (U_FAILURE(status)) {
        return; /* something went wrong, error already output */
    }


    //
    //  Put the test data into a UnicodeString
    //
    UnicodeString testString(FALSE, testFile, len);

    enum EParseState{
        PARSE_COMMENT,
        PARSE_TAG,
        PARSE_DATA,
        PARSE_NUM
    }
    parseState = PARSE_TAG;

    EParseState savedState = PARSE_TAG;

    static const UChar CH_LF        = 0x0a;
    static const UChar CH_CR        = 0x0d;
    static const UChar CH_HASH      = 0x23;
    /*static const UChar CH_PERIOD    = 0x2e;*/
    static const UChar CH_LT        = 0x3c;
    static const UChar CH_GT        = 0x3e;
    static const UChar CH_BACKSLASH = 0x5c;
    static const UChar CH_BULLET    = 0x2022;

    int32_t    lineNum  = 1;
    int32_t    colStart = 0;
    int32_t    column   = 0;
    int32_t    charIdx  = 0;

    int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.

    for (charIdx = 0; charIdx < len; ) {
        status = U_ZERO_ERROR;
        UChar  c = testString.charAt(charIdx);
        charIdx++;
        if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
            // treat CRLF as a unit
            c = CH_LF;
            charIdx++;
        }
        if (c == CH_LF || c == CH_CR) {
            lineNum++;
            colStart = charIdx;
        }
        column = charIdx - colStart + 1;

        switch (parseState) {
        case PARSE_COMMENT:
            if (c == 0x0a || c == 0x0d) {
                parseState = savedState;
            }
            break;

        case PARSE_TAG:
            {
            if (c == CH_HASH) {
                parseState = PARSE_COMMENT;
                savedState = PARSE_TAG;
                break;
            }
            if (u_isUWhiteSpace(c)) {
                break;
            }
            if (testString.compare(charIdx-1, 6, "<word>") == 0) {
                delete tp.bi;
                tp.bi = BreakIterator::createWordInstance(locale,  status);
                charIdx += 5;
                break;
            }
            if (testString.compare(charIdx-1, 6, "<char>") == 0) {
                delete tp.bi;
                tp.bi = BreakIterator::createCharacterInstance(locale,  status);
                charIdx += 5;
                break;
            }
            if (testString.compare(charIdx-1, 6, "<line>") == 0) {
                delete tp.bi;
                tp.bi = BreakIterator::createLineInstance(locale,  status);
                charIdx += 5;
                break;
            }
            if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
                delete tp.bi;
                tp.bi = NULL;
                tp.bi = BreakIterator::createSentenceInstance(locale,  status);
                charIdx += 5;
                break;
            }
            if (testString.compare(charIdx-1, 7, "<title>") == 0) {
                delete tp.bi;
                tp.bi = BreakIterator::createTitleInstance(locale,  status);
                charIdx += 6;
                break;
            }
            if (testString.compare(charIdx-1, 5, "<xgc>") == 0) {
                delete tp.bi;
                tp.bi = BreakIterator::createXGraphemeClusterInstance(locale,  status);
                charIdx += 4;
                break;
            }
                
            // <locale  loc_name>
            localeMatcher.reset(testString);
            if (localeMatcher.lookingAt(charIdx-1, status)) {
                UnicodeString localeName = localeMatcher.group(1, status);
                char localeName8[100];
                localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
                locale = Locale::createFromName(localeName8);
                charIdx += localeMatcher.group(0, status).length();
                TEST_ASSERT_SUCCESS(status);
                break;
            }
            if (testString.compare(charIdx-1, 6, "<data>") == 0) {
                parseState = PARSE_DATA;
                charIdx += 5;
                tp.dataToBreak = "";
                tp.expectedBreaks->removeAllElements();
                tp.srcCol ->removeAllElements();
                tp.srcLine->removeAllElements();
                break;
            }

            errln("line %d: Tag expected in test file.", lineNum);
            parseState = PARSE_COMMENT;
            savedState = PARSE_DATA;
            goto end_test; // Stop the test.
            }
            break;

        case PARSE_DATA:
            if (c == CH_BULLET) {
                int32_t  breakIdx = tp.dataToBreak.length();
                tp.expectedBreaks->setSize(breakIdx+1);
                tp.expectedBreaks->setElementAt(-1, breakIdx);
                tp.srcLine->setSize(breakIdx+1);
                tp.srcLine->setElementAt(lineNum, breakIdx);
                tp.srcCol ->setSize(breakIdx+1);
                tp.srcCol ->setElementAt(column, breakIdx);
                break;
            }

            if (testString.compare(charIdx-1, 7, "</data>") == 0) {
                // Add final entry to mappings from break location to source file position.
                //  Need one extra because last break position returned is after the
                //    last char in the data, not at the last char.
                tp.srcLine->addElement(lineNum, status);
                tp.srcCol ->addElement(column, status);

                parseState = PARSE_TAG;
                charIdx += 6;

                // RUN THE TEST!
                executeTest(&tp);
                break;
            }

            if (testString.compare(charIdx-1, 3, "\\N{") == 0) {
                // Named character, e.g. \N{COMBINING GRAVE ACCENT}
                // Get the code point from the name and insert it into the test data.
                //   (Damn, no API takes names in Unicode  !!!
                //    we've got to take it back to char *)
                int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
                int32_t nameLength = nameEndIdx - (charIdx+2);
                char charNameBuf[200];
                UChar32 theChar = -1;
                if (nameEndIdx != -1) {
                    UErrorCode status = U_ZERO_ERROR;
                    testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
                    charNameBuf[sizeof(charNameBuf)-1] = 0;
                    theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
                    if (U_FAILURE(status)) {
                        theChar = -1;
                    }
                }
                if (theChar == -1) {
                    errln("Error in named character in test file at line %d, col %d",
                        lineNum, column);
                } else {
                    // Named code point was recognized.  Insert it
                    //   into the test data.
                    tp.dataToBreak.append(theChar);
                    while (tp.dataToBreak.length() > tp.srcLine->size()) {
                        tp.srcLine->addElement(lineNum, status);
                        tp.srcCol ->addElement(column, status);
                    }
                }
                if (nameEndIdx > charIdx) {
                    charIdx = nameEndIdx+1;

                }
                break;
            }


            if (testString.compare(charIdx-1, 2, "<>") == 0) {
                charIdx++;
                int32_t  breakIdx = tp.dataToBreak.length();
                tp.expectedBreaks->setSize(breakIdx+1);
                tp.expectedBreaks->setElementAt(-1, breakIdx);
                tp.srcLine->setSize(breakIdx+1);
                tp.srcLine->setElementAt(lineNum, breakIdx);
                tp.srcCol ->setSize(breakIdx+1);
                tp.srcCol ->setElementAt(column, breakIdx);
                break;
            }

            if (c == CH_LT) {
                tagValue   = 0;
                parseState = PARSE_NUM;
                break;
            }

            if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
                parseState = PARSE_COMMENT;
                savedState = PARSE_DATA;
                break;
            }

            if (c == CH_BACKSLASH) {
                // Check for \ at end of line, a line continuation.
                //     Advance over (discard) the newline
                UChar32 cp = testString.char32At(charIdx);
                if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
                    // We have a CR LF
                    //  Need an extra increment of the input ptr to move over both of them
                    charIdx++;
                }
                if (cp == CH_LF || cp == CH_CR) {
                    lineNum++;
                    colStart = charIdx;
                    charIdx++;
                    break;
                }

                // Let unescape handle the back slash.
                cp = testString.unescapeAt(charIdx);
                if (cp != -1) {
                    // Escape sequence was recognized.  Insert the char
                    //   into the test data.
                    tp.dataToBreak.append(cp);
                    while (tp.dataToBreak.length() > tp.srcLine->size()) {
                        tp.srcLine->addElement(lineNum, status);
                        tp.srcCol ->addElement(column, status);
                    }
                    break;
                }


                // Not a recognized backslash escape sequence.
                // Take the next char as a literal.
                //  TODO:  Should this be an error?
                c = testString.charAt(charIdx);
                charIdx = testString.moveIndex32(charIdx, 1);
            }

            // Normal, non-escaped data char.
            tp.dataToBreak.append(c);

            // Save the mapping from offset in the data to line/column numbers in
            //   the original input file.  Will be used for better error messages only.
            //   If there's an expected break before this char, the slot in the mapping
            //     vector will already be set for this char; don't overwrite it.
            if (tp.dataToBreak.length() > tp.srcLine->size()) {
                tp.srcLine->addElement(lineNum, status);
                tp.srcCol ->addElement(column, status);
            }
            break;


        case PARSE_NUM:
            // We are parsing an expected numeric tag value, like <1234>,
            //   within a chunk of data.
            if (u_isUWhiteSpace(c)) {
                break;
            }

            if (c == CH_GT) {
                // Finished the number.  Add the info to the expected break data,
                //   and switch parse state back to doing plain data.
                parseState = PARSE_DATA;
                if (tagValue == 0) {
                    tagValue = -1;
                }
                int32_t  breakIdx = tp.dataToBreak.length();
                tp.expectedBreaks->setSize(breakIdx+1);
                tp.expectedBreaks->setElementAt(tagValue, breakIdx);
                tp.srcLine->setSize(breakIdx+1);
                tp.srcLine->setElementAt(lineNum, breakIdx);
                tp.srcCol ->setSize(breakIdx+1);
                tp.srcCol ->setElementAt(column, breakIdx);
                break;
            }

            if (u_isdigit(c)) {
                tagValue = tagValue*10 + u_charDigitValue(c);
                break;
            }

            errln("Syntax Error in test file at line %d, col %d",
                lineNum, column);
            parseState = PARSE_COMMENT;
            goto end_test; // Stop the test
            break;
        }


        if (U_FAILURE(status)) {
            errln("ICU Error %s while parsing test file at line %d.",
                u_errorName(status), lineNum);
            status = U_ZERO_ERROR;
            goto end_test; // Stop the test
        }

    }

end_test:
    delete tp.bi;
    delete tp.expectedBreaks;
    delete tp.srcLine;
    delete tp.srcCol;
    delete [] testFile;
#endif
}


//-------------------------------------------------------------------------------
//
//    ReadAndConvertFile   Read a text data file, convert it to UChars, and
//    return the data in one big UChar * buffer, which the caller must delete.
//
//    parameters:
//          fileName:   the name of the file, with no directory part.  The test data directory
//                      is assumed.
//          ulen        an out parameter, receives the actual length (in UChars) of the file data.
//          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
//                      specified here.  The BOM, if it exists, will be stripped from the returned data.
//                      Pass NULL for the system default encoding.
//          status
//    returns:
//                      The file data, converted to UChar.
//                      The caller must delete this when done with
//                           delete [] theBuffer;
//
//    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
//           Move this function to some common place.
//
//--------------------------------------------------------------------------------
UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
    // Reads the whole file named by fileName, strips a leading Unicode signature
    // (BOM) if one is detected, and converts the remaining bytes to UChars.
    //
    //   fileName   path handed directly to fopen().
    //   ulen       out: number of UChars produced; 0 whenever NULL is returned.
    //   encoding   converter name; replaced by the BOM's encoding when a BOM is found.
    //   status     in/out ICU error code.  Nothing is done if it is already a failure.
    //
    // Returns a new[]-allocated buffer that the caller must release with delete [],
    // or NULL on error.  A file that is empty or cannot be read completely yields
    // NULL after an errln(), without changing status.
    UChar       *retPtr  = NULL;
    char        *fileBuf = NULL;
    UConverter  *conv    = NULL;
    FILE        *f       = NULL;

    // Declared up front, without initializers, so that none of the gotos below
    //   jumps across an initialization.
    int            fileSize;
    int            amt_read;
    int32_t        signatureLength;
    const char    *fileBufC;
    const char    *bomEncoding;

    ulen = 0;
    if (U_FAILURE(status)) {
        return retPtr;
    }

    //
    //  Open the file.
    //
    f = fopen(fileName, "rb");
    if (f == 0) {
        errln("Error opening test data file %s\n", fileName);
        status = U_FILE_ACCESS_ERROR;
        return NULL;
    }

    //
    //  Read it in.
    //    The size is validated before allocating:  ftell() reports failure as -1,
    //    which must never be used as an array length.
    //
    fseek( f, 0, SEEK_END);
    fileSize = ftell(f);
    if (fileSize <= 0) {
        errln("Error reading test data file.");
        goto cleanUpAndReturn;
    }
    fileBuf = new char[fileSize];
    fseek(f, 0, SEEK_SET);
    amt_read = fread(fileBuf, 1, fileSize, f);
    if (amt_read != fileSize) {
        errln("Error reading test data file.");
        goto cleanUpAndReturn;
    }

    //
    // Look for a Unicode Signature (BOM) on the data just read
    //
    fileBufC = fileBuf;
    bomEncoding = ucnv_detectUnicodeSignature(
        fileBuf, fileSize, &signatureLength, &status);
    if(bomEncoding!=NULL ){
        // Skip the signature bytes and let the BOM decide the encoding.
        fileBufC  += signatureLength;
        fileSize  -= signatureLength;
        encoding = bomEncoding;
    }

    //
    // Open a converter to take the rule file to UTF-16
    //
    conv = ucnv_open(encoding, &status);
    if (U_FAILURE(status)) {
        goto cleanUpAndReturn;
    }

    //
    // Convert the rules to UChar.
    //  Preflight first to determine required buffer size.
    //
    ulen = ucnv_toUChars(conv,
        NULL,           //  dest,
        0,              //  destCapacity,
        fileBufC,
        fileSize,
        &status);
    if (status == U_BUFFER_OVERFLOW_ERROR) {
        // Buffer Overflow is expected from the preflight operation.
        status = U_ZERO_ERROR;

        // One extra UChar of capacity is passed to the converter along with the buffer.
        retPtr = new UChar[ulen+1];
        ucnv_toUChars(conv,
            retPtr,       //  dest,
            ulen+1,
            fileBufC,
            fileSize,
            &status);
    }

cleanUpAndReturn:
    fclose(f);
    delete [] fileBuf;
    ucnv_close(conv);
    if (U_FAILURE(status)) {
        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
        // retPtr came from new[], so it must be released with delete [].
        delete [] retPtr;
        retPtr = 0;
        ulen   = 0;
    }
    return retPtr;
}


//--------------------------------------------------------------------------------------------
//
//   Run tests from each of the boundary test data files distributed by the Unicode Consortium
//
//-------------------------------------------------------------------------------------------
void RBBITest::TestUnicodeFiles() {
    RuleBasedBreakIterator  *bi;
    UErrorCode               status = U_ZERO_ERROR;

    bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getDefault(), status);
    TEST_ASSERT_SUCCESS(status);
    if (U_SUCCESS(status)) {
        runUnicodeTestData("GraphemeBreakTest.txt", bi);
    }
    delete bi;

    bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getDefault(), status);
    TEST_ASSERT_SUCCESS(status);
    if (U_SUCCESS(status)) {
        runUnicodeTestData("WordBreakTest.txt", bi);
    }
    delete bi;

    bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
    TEST_ASSERT_SUCCESS(status);
    if (U_SUCCESS(status)) {
        runUnicodeTestData("SentenceBreakTest.txt", bi);
    }
    delete bi;

    #if 0
    bi =  (RuleBasedBreakIterator *)BreakIterator::createCharInstance(Locale::getDefault(), status);
    TEST_ASSERT_SUCCESS(status);
    if (U_SUCCESS(status)) {
        runUnicodeTestData("LBTest.txt", bi);
    }
    delete bi;
    #endif

}


//--------------------------------------------------------------------------------------------
//
//   Run tests from one of the boundary test data files distributed by the Unicode Consortium
//
//-------------------------------------------------------------------------------------------
void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
    UErrorCode  status = U_ZERO_ERROR;

    //
    //  Open and read the test data file, put it into a UnicodeString.
    //
    const char *testDataDirectory = IntlTest::getSourceTestData(status);
    char testFileName[1000];
    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
        errln("Can't open test data.  Path too long.");
        return;
    }
    strcpy(testFileName, testDataDirectory);
    strcat(testFileName, fileName);

    int    len;
    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT(testFile != NULL);
    if (U_FAILURE(status) || testFile == NULL) {
        return; /* something went wrong, error already output */
    }
    UnicodeString testFileAsString(TRUE, testFile, len);

    //
    //  Parse the test data file using a regular expression.
    //  Each kind of token is recognized in its own capture group; what type of item was scanned
    //     is identified by which group had a match.
    //
    //       Caputure Group #                  1          2            3            4           5
    //       Parses this item:               divide       x       hex digits   comment & nl   unrecognized
    //
    UnicodeString tokenExpr = "(?ms)\\s*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|(#.*?$.)|(.*?$.))";
    RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, 0, status);
    UnicodeString   testString;
    UVector32       breakPositions(status);
    int             lineNumber = 1;
    int             charIndex  = 0;
    TEST_ASSERT_SUCCESS(status);
    if (U_FAILURE(status)) {
        return;
    }

    //
    //  Scan through each test case, building up the string to be broken in testString,
    //   and the positions that should be boundaries in the breakPositions vector.
    //
    while (tokenMatcher.lookingAt(charIndex, status)) {
        if (tokenMatcher.start(1, status) >= 0) {
            // Scanned a divide sign, indicating a break position in the test data.
            if (testString.length()>0) {
                breakPositions.addElement(testString.length(), status);
            }
        }
        else if (tokenMatcher.start(2, status) >= 0) {
            // Scanned an 'x', meaning no break at this position in the test data
            //   Nothing to be done here.
            }
        else if (tokenMatcher.start(3, status) >= 0) {
            // Scanned Hex digits.  Convert them to binary, append to the character data string.
            const UnicodeString &hexNumber = tokenMatcher.group(3, status);
            int length = hexNumber.length();
            if (length<=8) {
                char buf[10];
                hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
                UChar32 c = (UChar32)strtol(buf, NULL, 16);
                if (c<=0x10ffff) {
                    testString.append(c);
                } else {
                    errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
                       fileName, lineNumber);
                }
            } else {
                errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
                       fileName, lineNumber);
             }
        }
        else if (tokenMatcher.start(4, status) >= 0) {
            // Scanned to end of a line, possibly skipping over a comment in the process.
            //   If the line from the file contained test data, run the test now.
            //
            if (testString.length() > 0) {
                checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
            }

            // Clear out this test case.
            //    The string and breakPositions vector will be refilled as the next
            //       test case is parsed.
            testString.remove();
            breakPositions.setSize(0);
            lineNumber++;
        } else {
            // Scanner catchall.  Something unrecognized appeared on the line.
            char token[16];
            UnicodeString uToken = tokenMatcher.group(0, status);
            uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
            token[sizeof(token)-1] = 0;
            errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);

            // Clean up, in preparation for continuing with the next line.
            testString.remove();
            breakPositions.setSize(0);
            lineNumber++;
        }
        TEST_ASSERT_SUCCESS(status);
        if (U_FAILURE(status)) {
            break;
        }
        charIndex = tokenMatcher.end(status);
    }

    delete [] testFile;
 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
}

//--------------------------------------------------------------------------------------------
//
//   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
//                            test data files.  Do only a simple, forward-only check -
//                            this test is mostly to check that ICU and the Unicode
//                            data agree with each other.
//
//--------------------------------------------------------------------------------------------
void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
                         const UnicodeString &testString,   // Text data to be broken
                         UVector32 *breakPositions,         // Positions where breaks should be found.
                         RuleBasedBreakIterator *bi) {
    int32_t pos;                 // Break Position in the test string
    int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
    int32_t expectedPos;         // Expected break position (index into test string)

    bi->setText(testString);
    pos = bi->first();
    pos = bi->next();

    while (pos != BreakIterator::DONE) {
        if (expectedI >= breakPositions->size()) {
            errln("Test file \"%s\", line %d, unexpected break found at position %d",
                testFileName, lineNumber, pos);
            break;
        }
        expectedPos = breakPositions->elementAti(expectedI);
        if (pos < expectedPos) {
            errln("Test file \"%s\", line %d, unexpected break found at position %d",
                testFileName, lineNumber, pos);
            break;
        }
        if (pos > expectedPos) {
            errln("Test file \"%s\", line %d, failed to find break at position %d",
                testFileName, lineNumber, expectedPos);
            break;
        }
        pos = bi->next();
        expectedI++;
    }
    
    if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
        errln("Test file \"%s\", line %d, failed to find break at position %d",
            testFileName, lineNumber, breakPositions->elementAti(expectedI));
    }
}


#if !UCONFIG_NO_REGULAR_EXPRESSIONS
//---------------------------------------------------------------------------------------
//
//   class RBBIMonkeyKind
//
//      Monkey Test for Break Iteration
//      Abstract interface class.   Concrete derived classes independently
//      implement the break rules for different iterator types.
//
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
//
//---------------------------------------------------------------------------------------
class RBBIMonkeyKind {
public:
    // Return a UVector of UnicodeSets, representing the character classes used
    //   for this type of iterator.
    virtual  UVector  *charClasses() = 0;

    // Set the test text on which subsequent calls to next() will operate
    virtual  void      setText(const UnicodeString &s) = 0;

    // Find the next break postion, starting from the prev break position, or from zero.
    // Return -1 after reaching end of string.
    virtual  int32_t   next(int32_t i) = 0;

    virtual ~RBBIMonkeyKind();
    UErrorCode       deferredStatus;


protected:
    RBBIMonkeyKind();

private:
};

RBBIMonkeyKind::RBBIMonkeyKind() {
    deferredStatus = U_ZERO_ERROR;
}

RBBIMonkeyKind::~RBBIMonkeyKind() {
}


//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Similar to standard lib rand() and srand()
//                    Not using library to
//                      1.  Get same results on all platforms.
//                      2.  Get access to current seed, to more easily reproduce failures.
//
//---------------------------------------------------------------------------------------
// Current state of the generator; assign it to replay a sequence.
static uint32_t m_seed = 1;

// Steps the linear congruential generator and returns a value in 0..32767
//   taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;
    return (uint32_t)((m_seed >> 16) & 0x7fff);
}


//------------------------------------------------------------------------------------------
//
//   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
//                             of RBBIMonkeyKind.
//
//------------------------------------------------------------------------------------------
class RBBICharMonkey: public RBBIMonkeyKind {
public:
    RBBICharMonkey();
    virtual          ~RBBICharMonkey();

    // RBBIMonkeyKind interface.
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual  int32_t  next(int32_t i);

private:
    // Vector returned by charClasses(); it refers to the sets below.
    UVector             *fSets;

    UnicodeSet          *fCRLFSet;
    UnicodeSet          *fControlSet;
    UnicodeSet          *fExtendSet;
    UnicodeSet          *fHangulSet;
    UnicodeSet          *fAnySet;

    // Regular expression matcher used by next() to find cluster ends.
    RegexMatcher        *fMatcher;

    // Text most recently passed to setText(); not owned.
    const UnicodeString *fText;
};


// Builds the grapheme cluster matcher and the character class sets returned by
//   charClasses().  Any ICU failure is recorded in deferredStatus.
RBBICharMonkey::RBBICharMonkey() {
    UErrorCode  status = U_ZERO_ERROR;

    fText = NULL;
    fMatcher = new RegexMatcher("\\X", 0, status);     // Pattern to match a grapheme cluster

    fCRLFSet    = new UnicodeSet("[\\r\\n]", status);
    fControlSet = new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]-\\p{Grapheme_Extend}]", status);
    fExtendSet  = new UnicodeSet("[\\p{Grapheme_Extend}]", status);
    // All five Hangul syllable types: L, V, T, LV and LVT.
    //   (The V type was previously missing; L was listed twice.)
    fHangulSet  = new UnicodeSet(
        "[\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=V}\\p{Hangul_Syllable_Type=T}"
         "\\p{Hangul_Syllable_Type=LV}\\p{Hangul_Syllable_Type=LVT}]", status);
    fAnySet     = new UnicodeSet("[\\u0000-\\U0010ffff]", status);

    // fSets stores pointers to the sets above; the destructor deletes each set itself.
    fSets       = new UVector(status);
    fSets->addElement(fCRLFSet,    status);
    fSets->addElement(fControlSet, status);
    fSets->addElement(fExtendSet,  status);
    fSets->addElement(fHangulSet,  status);
    fSets->addElement(fAnySet,     status);
    if (U_FAILURE(status)) {
        deferredStatus = status;
    }
}


// Remember the text and point the matcher at it.
void RBBICharMonkey::setText(const UnicodeString &s) {
    fMatcher->reset(s);
    fText = &s;
}


// Return the end of the first pattern match found starting from index i,
//   or -1 if there is no match or ICU reports an error.
int32_t RBBICharMonkey::next(int32_t i) {
    UErrorCode status = U_ZERO_ERROR;

    int32_t clusterEnd = -1;
    const UBool matched = fMatcher->find(i, status);
    if (matched) {
        clusterEnd = fMatcher->end(status);
    }
    return U_FAILURE(status) ? -1 : clusterEnd;
}


// The character class sets built by the constructor.
UVector  *RBBICharMonkey::charClasses() {
    return fSets;
}


// Releases everything the constructor allocated.  fText is not deleted.
RBBICharMonkey::~RBBICharMonkey() {
    // The vector first, then the sets it refers to.
    delete fSets;

    delete fMatcher;

    delete fCRLFSet;
    delete fControlSet;
    delete fExtendSet;
    delete fHangulSet;
    delete fAnySet;
}

//------------------------------------------------------------------------------------------
//
//   class RBBIWordMonkey      Word Break specific implementation
//                             of RBBIMonkeyKind.
//
//------------------------------------------------------------------------------------------
class RBBIWordMonkey: public RBBIMonkeyKind {
public:
    RBBIWordMonkey();
    virtual          ~RBBIWordMonkey();

    // RBBIMonkeyKind interface.
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual int32_t   next(int32_t i);

private:
    // Vector returned by charClasses(); it refers to the sets below.
    UVector             *fSets;

    // Character classes consulted by the word break rules in next().
    UnicodeSet          *fKatakanaSet;
    UnicodeSet          *fALetterSet;
    UnicodeSet          *fMidLetterSet;
    UnicodeSet          *fMidNumSet;
    UnicodeSet          *fNumericSet;
    UnicodeSet          *fFormatSet;
    UnicodeSet          *fOtherSet;
    UnicodeSet          *fExtendSet;
    UnicodeSet          *fExtendNumLetSet;

    RegexMatcher        *fMatcher;

    // Text most recently passed to setText(); not owned.
    const UnicodeString *fText;
};


// Builds the character class sets used by the word break rules in next() and
//   registers them in fSets.  Any ICU failure is recorded in deferredStatus;
//   if set construction fails, fSets is left empty.
RBBIWordMonkey::RBBIWordMonkey()
{
    UErrorCode  status = U_ZERO_ERROR;

    // Give every pointer member a defined value.  fText is assigned by setText();
    //   this constructor creates no matcher.
    fText            = NULL;
    fMatcher         = NULL;

    fSets            = new UVector(status);

    fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
                         "[\\p{Line_Break = Complex_Context}"
                         "-\\p{Grapheme_Cluster_Break = Extend}"
                         "-\\p{Grapheme_Cluster_Break = Control}]]",      status);
    //fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}]",      status);
    fKatakanaSet     = new UnicodeSet("[\\p{Word_Break = Katakana}-[\\uff9e\\uff9f]]",     status);
    fMidLetterSet    = new UnicodeSet("[\\p{Word_Break = MidLetter}]",    status);
    fMidNumSet       = new UnicodeSet("[\\p{Word_Break = MidNum}]",       status);
    fNumericSet      = new UnicodeSet("[\\p{Word_Break = Numeric}]",      status);
    fFormatSet       = new UnicodeSet("[\\p{Word_Break = Format}]",       status);
    fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]", status);
    //fExtendSet       = new UnicodeSet("[\\p{Word_Break = Extend}]", status);
    fExtendSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}\\uff9e\\uff9f]", status);

    fOtherSet        = new UnicodeSet();
    if(U_FAILURE(status)) {
      deferredStatus = status;
      return;
    }

    // "Other" is everything that belongs to none of the sets above.
    fOtherSet->complement();
    fOtherSet->removeAll(*fKatakanaSet);
    fOtherSet->removeAll(*fALetterSet);
    fOtherSet->removeAll(*fMidLetterSet);
    fOtherSet->removeAll(*fMidNumSet);
    fOtherSet->removeAll(*fNumericSet);
    fOtherSet->removeAll(*fExtendNumLetSet);
    fOtherSet->removeAll(*fFormatSet);
    fOtherSet->removeAll(*fExtendSet);

    fSets->addElement(fALetterSet,   status);
    fSets->addElement(fKatakanaSet,  status);
    fSets->addElement(fMidLetterSet, status);
    fSets->addElement(fMidNumSet,    status);
    fSets->addElement(fNumericSet,   status);
    fSets->addElement(fFormatSet,    status);
    fSets->addElement(fExtendSet,    status);
    fSets->addElement(fOtherSet,     status);
    fSets->addElement(fExtendNumLetSet, status);


    if (U_FAILURE(status)) {
        deferredStatus = status;
    }
}

// Record the text that next() will scan.  Only the address is kept, no copy is made.
void RBBIWordMonkey::setText(const UnicodeString &s)
{
    fText = &s;
}


// Find the next word boundary after prevPos by applying the word break rules
//   directly to the text supplied through setText().
//
//   prevPos   the previous boundary position.
//   returns   the index of the next boundary, or -1 if prevPos is already at
//             or beyond the end of the text.
//
// The rules are tried in order.  A rule that matches "no break" moves on to the
//   next candidate position with continue; reaching the end of the chain means
//   a boundary was found.
int32_t RBBIWordMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.

    // Prev break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by    X(Extend | Format)*   Rule 4
        //   i.e. step over one code point plus any Extend or Format characters after it.
        do {
            p3 = fText->moveIndex32(p3, 1);
            c3 = fText->char32At(p3);
        }
        while (fFormatSet->contains(c3) || fExtendSet->contains(c3));


        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }
        if (p2 == fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        // Rule  (3)   CR x LF
        //     No Extend or Format characters may appear between the CR and LF,
        //     which requires the additional check for p2 immediately following p1.
        //
        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
            continue;
        }

        // Rule (5).   ALetter x ALetter
        if (fALetterSet->contains(c1) &&
            fALetterSet->contains(c2))  {
            continue;
        }

        // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
        //
        //    Also incorporates rule 7 by skipping pos ahead to position of the
        //    terminating ALetter.
        if ( fALetterSet->contains(c1)   &&
             fMidLetterSet->contains(c2) &&
             fALetterSet->contains(c3)) {
            continue;
        }


        // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
        if (fALetterSet->contains(c0) &&
            (fMidLetterSet->contains(c1)  ) &&
            fALetterSet->contains(c2)) {
            continue;
        }

        // Rule (8)    Numeric x Numeric
        if (fNumericSet->contains(c1) &&
            fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (9)    ALetter x Numeric
        if (fALetterSet->contains(c1) &&
            fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (10)    Numeric x ALetter
        if (fNumericSet->contains(c1) &&
            fALetterSet->contains(c2))  {
            continue;
        }

        // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
        if ( fNumericSet->contains(c0) &&
             fMidNumSet->contains(c1)  &&
            fNumericSet->contains(c2)) {
            continue;
        }

        // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
        if (fNumericSet->contains(c1) &&
            fMidNumSet->contains(c2)  &&
            fNumericSet->contains(c3)) {
            continue;
        }

        // Rule (13)  Katakana x Katakana
        if (fKatakanaSet->contains(c1) &&
            fKatakanaSet->contains(c2))  {
            continue;
        }

        // Rule 13a   (ALetter | Numeric | Katakana | ExtendNumLet)  x  ExtendNumLet
        if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
             fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
             fExtendNumLetSet->contains(c2)) {
                continue;
             }

        // Rule 13b   ExtendNumLet  x  (ALetter | Numeric | Katakana)
        if (fExtendNumLetSet->contains(c1) &&
                (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
                fKatakanaSet->contains(c2)))  {
                continue;
             }

        // Rule 14.  Break found here.
        break;
    }

    // The boundary lies immediately before the code point at p2.
    breakPos = p2;
    return breakPos;
}


// The character class sets built by the constructor.
UVector  *RBBIWordMonkey::charClasses()
{
    return fSets;
}


// Releases the vector and every set the constructor allocated.
//   fText and fMatcher are not deleted here.
RBBIWordMonkey::~RBBIWordMonkey()
{
    // The vector first, then the sets it refers to.
    delete fSets;

    delete fALetterSet;
    delete fKatakanaSet;
    delete fMidLetterSet;
    delete fMidNumSet;
    delete fNumericSet;
    delete fFormatSet;
    delete fExtendSet;
    delete fExtendNumLetSet;
    delete fOtherSet;
}


//------------------------------------------------------------------------------------------
//
//   class RBBISentMonkey      Sentence Break specific implementation
//                             of RBBIMonkeyKind.
//
//------------------------------------------------------------------------------------------
class RBBISentMonkey: public RBBIMonkeyKind {
public:
    RBBISentMonkey();
    virtual          ~RBBISentMonkey();

    // RBBIMonkeyKind interface.
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual int32_t   next(int32_t i);

private:
    // Helpers for stepping through fText.
    int               moveBack(int posFrom);
    int               moveForward(int posFrom);
    UChar32           cAt(int pos);

    // Vector returned by charClasses(); it refers to the sets below.
    UVector             *fSets;

    // Character classes consulted by the sentence break rules in next().
    UnicodeSet          *fSepSet;
    UnicodeSet          *fFormatSet;
    UnicodeSet          *fSpSet;
    UnicodeSet          *fLowerSet;
    UnicodeSet          *fUpperSet;
    UnicodeSet          *fOLetterSet;
    UnicodeSet          *fNumericSet;
    UnicodeSet          *fATermSet;
    UnicodeSet          *fSTermSet;
    UnicodeSet          *fCloseSet;
    UnicodeSet          *fOtherSet;
    UnicodeSet          *fExtendSet;

    // Text most recently passed to setText(); not owned.
    const UnicodeString *fText;
};

// Builds the character class sets used by the sentence break rules in next()
//   and registers them in fSets.  Any ICU failure is recorded in deferredStatus;
//   if set construction fails, fSets is left empty.
RBBISentMonkey::RBBISentMonkey()
{
    UErrorCode  status = U_ZERO_ERROR;

    // Give the text pointer a defined value until setText() is called.
    fText            = NULL;

    fSets            = new UVector(status);

    fSepSet          = new UnicodeSet("[\\p{Sentence_Break = Sep}]",     status);
    fFormatSet       = new UnicodeSet("[\\p{Sentence_Break = Format}]",  status);
    fSpSet           = new UnicodeSet("[\\p{Sentence_Break = Sp}]",      status);
    fLowerSet        = new UnicodeSet("[\\p{Sentence_Break = Lower}]",   status);
    fUpperSet        = new UnicodeSet("[\\p{Sentence_Break = Upper}]",   status);
    fOLetterSet      = new UnicodeSet("[\\p{Sentence_Break = OLetter}-[\\uff9e\\uff9f]]", status);
    fNumericSet      = new UnicodeSet("[\\p{Sentence_Break = Numeric}]", status);
    fATermSet        = new UnicodeSet("[\\p{Sentence_Break = ATerm}]",   status);
    fSTermSet        = new UnicodeSet("[\\p{Sentence_Break = STerm}]",   status);
    fCloseSet        = new UnicodeSet("[\\p{Sentence_Break = Close}]",   status);
    fExtendSet       = new UnicodeSet("[\\p{Grapheme_Extend}\\uff9e\\uff9f]", status);
    fOtherSet        = new UnicodeSet();

    if(U_FAILURE(status)) {
      deferredStatus = status;
      return;
    }

    // "Other" is everything that belongs to none of the sets above.
    fOtherSet->complement();
    fOtherSet->removeAll(*fSepSet);
    fOtherSet->removeAll(*fFormatSet);
    fOtherSet->removeAll(*fSpSet);
    fOtherSet->removeAll(*fLowerSet);
    fOtherSet->removeAll(*fUpperSet);
    fOtherSet->removeAll(*fOLetterSet);
    fOtherSet->removeAll(*fNumericSet);
    fOtherSet->removeAll(*fATermSet);
    fOtherSet->removeAll(*fSTermSet);
    fOtherSet->removeAll(*fCloseSet);
    fOtherSet->removeAll(*fExtendSet);

    fSets->addElement(fSepSet,     status);
    fSets->addElement(fFormatSet,  status);

    fSets->addElement(fSpSet,      status);
    fSets->addElement(fLowerSet,   status);
    fSets->addElement(fUpperSet,   status);
    fSets->addElement(fOLetterSet, status);
    fSets->addElement(fNumericSet, status);
    fSets->addElement(fATermSet,   status);
    fSets->addElement(fSTermSet,   status);
    fSets->addElement(fCloseSet,   status);
    fSets->addElement(fOtherSet,   status);
    fSets->addElement(fExtendSet,  status);

    if (U_FAILURE(status)) {
        deferredStatus = status;
    }
}


// Record the text that next() will scan.  Only the address is kept, no copy is made.
void RBBISentMonkey::setText(const UnicodeString &s)
{
    fText = &s;
}

// The character class sets built by the constructor.
UVector  *RBBISentMonkey::charClasses()
{
    return fSets;
}


//  moveBack()   Find the "significant" code point preceding the index i.
//               Skips over ($Extend | $Format)* .
//
int RBBISentMonkey::moveBack(int i) {
    if (i <= 0) {
        return -1;
    }
    UChar32   c;
    int32_t   j = i;
    do {
        j = fText->moveIndex32(j, -1);
        c = fText->char32At(j);
    }
    while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
    return j;

 }


int RBBISentMonkey::moveForward(int i) {
    if (i>=fText->length()) {
        return fText->length();
    }
    UChar32   c;
    int32_t   j = i;
    do {
        j = fText->moveIndex32(j, 1);
        c = cAt(j);
    }
    while (fFormatSet->contains(c) || fExtendSet->contains(c));
    return j;
}

// cAt()   Code point at index pos of the text, or -1 when pos is outside the text.
UChar32 RBBISentMonkey::cAt(int pos) {
    if (pos >= 0 && pos < fText->length()) {
        return fText->char32At(pos);
    }
    return -1;
}

// Find the next sentence boundary after prevPos by applying the sentence break
//   rules directly to the text supplied through setText().
//
//   prevPos   the previous boundary position.
//   returns   the index of the next boundary, or -1 if prevPos is already at
//             or beyond the end of the text.
//
// The rules are tried in order.  A rule that matches "no break" moves on to the
//   next candidate position with continue; a rule that matches "break" leaves
//   the loop with the boundary in p2.
int32_t RBBISentMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    UChar32 c;

    // Prev break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by    X(Extend | Format)*   Rule 4
        p3 = moveForward(p3);
        c3 = cAt(p3);

        // Rule (3)  CR x LF
        if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
            continue;
        }

        // Rule (4).   Sep  <break>
        if (fSepSet->contains(c1)) {
            p2 = p1+1;   // Separators don't combine with Extend or Format.
            break;
        }

        if (p2 >= fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        if (p2 == prevPos) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }

        // Rule (6).   ATerm x Numeric
        if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (7).  Upper ATerm  x  Upper
        if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
            continue;
        }

        // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
        //           Note:  STerm | ATerm are added to the negated part of the expression by a
        //                  note to the Unicode 5.0 documents.
        //   First scan backwards from p1 over any Sp and then any Close characters.
        int p8 = p1;
        while (fSpSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        while (fCloseSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        if (fATermSet->contains(cAt(p8))) {
            // Then scan forwards from p2 to the first character of one of the listed
            //   classes, or to the end of the text (where cAt() returns -1).
            p8=p2;
            for (;;) {
                c = cAt(p8);
                if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
                    fLowerSet->contains(c) || fSepSet->contains(c) ||
                    fATermSet->contains(c) || fSTermSet->contains(c))  {
                    break;
                }
                p8 = moveForward(p8);
            }
            if (fLowerSet->contains(cAt(p8))) {
                continue;
            }
        }

        // Rule 8a   (STerm | ATerm) Close* Sp* x (STerm | ATerm);
        if (fSTermSet->contains(c2) || fATermSet->contains(c2)) {
            p8 = p1;
            while (fSpSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            while (fCloseSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            c = cAt(p8);
            if (fSTermSet->contains(c) || fATermSet->contains(c)) {
                continue;
            }
        }

        // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep)
        int p9 = p1;
        while (fCloseSet->contains(cAt(p9))) {
            p9 = moveBack(p9);
        }
        c = cAt(p9);
        if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
            if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
                continue;
            }
        }

        // Rule (10)  (STerm | ATerm) Close* Sp*  x  (Sp | Sep)
        int p10 = p1;
        while (fSpSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        while (fCloseSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
            if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
                continue;
            }
        }

        // Rule (11)  (STerm | ATerm) Close* Sp*   <break>
        int p11 = p1;
        while (fSpSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fCloseSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
            break;
        }

        //  Rule (12)  Any x Any
        continue;
    }
    // The boundary lies immediately before the code point at p2.
    breakPos = p2;
    return breakPos;
}

// Releases the vector and every set the constructor allocated.
//   fText is not deleted here.
RBBISentMonkey::~RBBISentMonkey()
{
    // The vector first, then the sets it refers to.
    delete fSets;

    delete fSepSet;
    delete fFormatSet;
    delete fSpSet;

    delete fLowerSet;
    delete fUpperSet;
    delete fOLetterSet;
    delete fNumericSet;

    delete fATermSet;
    delete fSTermSet;
    delete fCloseSet;

    delete fOtherSet;
    delete fExtendSet;
}


//-------------------------------------------------------------------------------------------
//
//  RBBILineMonkey
//
//-------------------------------------------------------------------------------------------

// Reference ("monkey") implementation of line breaking.  next() walks the text
// applying the line break rules directly in C++ code, using one UnicodeSet per
// Line_Break property value.
class RBBILineMonkey: public RBBIMonkeyKind {
public:
    RBBILineMonkey();
    virtual          ~RBBILineMonkey();
    // Returns fSets, the vector of character class sets filled in by the constructor.
    virtual  UVector *charClasses();
    // Remembers the text (by pointer, no copy is made) and hands it to the
    // character break iterator and the number matcher.
    virtual  void     setText(const UnicodeString &s);
    // Returns the position of the first line break found after position i,
    // or -1 if i is at or beyond the end of the text.
    virtual  int32_t  next(int32_t i);
    // Helper for next(): applies the combining mark handling of rules LB 9 / LB 10.
    virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
private:
    // Vector of the sets below, returned by charClasses().  It is created without
    // a deleter argument; the destructor deletes the sets individually.
    UVector      *fSets;

    // One set per Line_Break property value, built in the constructor from the
    // pattern [\p{Line_break=<name>}].  Exceptions: fSG is built from the
    // surrogate range [\ud800-\udfff], and fAL additionally receives the
    // contents of fXX, fAI, fSA and fSG.
    UnicodeSet  *fBK;
    UnicodeSet  *fCR;
    UnicodeSet  *fLF;
    UnicodeSet  *fCM;
    UnicodeSet  *fNL;
    UnicodeSet  *fSG;
    UnicodeSet  *fWJ;
    UnicodeSet  *fZW;
    UnicodeSet  *fGL;
    UnicodeSet  *fCB;
    UnicodeSet  *fSP;
    UnicodeSet  *fB2;
    UnicodeSet  *fBA;
    UnicodeSet  *fBB;
    UnicodeSet  *fHY;
    UnicodeSet  *fH2;
    UnicodeSet  *fH3;
    UnicodeSet  *fCL;
    UnicodeSet  *fEX;
    UnicodeSet  *fIN;
    UnicodeSet  *fJL;
    UnicodeSet  *fJV;
    UnicodeSet  *fJT;
    UnicodeSet  *fNS;
    UnicodeSet  *fOP;
    UnicodeSet  *fQU;
    UnicodeSet  *fIS;
    UnicodeSet  *fNU;
    UnicodeSet  *fPO;
    UnicodeSet  *fPR;
    UnicodeSet  *fSY;
    UnicodeSet  *fAI;
    UnicodeSet  *fAL;
    UnicodeSet  *fID;
    UnicodeSet  *fSA;
    UnicodeSet  *fXX;

    // Character break iterator (English locale).  setText() keeps it in sync
    // with the text; none of the member functions of this class queries it.
    BreakIterator  *fCharBI;

    // The text being scanned.  Set by setText(); points at the caller's string,
    // which therefore must outlive any subsequent call to next().
    const UnicodeString  *fText;
    // NOTE(review): not referenced by any member function of this class
    //   (never assigned, read, or freed).
    int32_t              *fOrigPositions;

    // Regular expression matching a whole number sequence; used by the LB 25 check in next().
    RegexMatcher         *fNumberMatcher;
    // NOTE(review): not referenced by any member function of this class
    //   (never assigned, read, or freed).
    RegexMatcher         *fLB11Matcher;
};


//
//  Constructor.
//     Builds one UnicodeSet per Line_Break property value, collects the sets
//     into fSets (the list returned by charClasses()), and creates the number
//     regular expression and the character break iterator.
//
//     Any failure is recorded in deferredStatus rather than reported directly.
//     If building the sets fails, the constructor returns early and fCharBI and
//     fNumberMatcher remain NULL.
//
RBBILineMonkey::RBBILineMonkey()
{
    UErrorCode  status = U_ZERO_ERROR;

    // Put every remaining raw pointer member into a known state first.  The
    // code below never assigns fText, fOrigPositions or fLB11Matcher, and the
    // early-failure return skips fCharBI and fNumberMatcher; without this they
    // would hold indeterminate values.
    fText          = NULL;
    fOrigPositions = NULL;
    fCharBI        = NULL;
    fNumberMatcher = NULL;
    fLB11Matcher   = NULL;

    fSets  = new UVector(status);

    fBK    = new UnicodeSet("[\\p{Line_Break=BK}]", status);
    fCR    = new UnicodeSet("[\\p{Line_break=CR}]", status);
    fLF    = new UnicodeSet("[\\p{Line_break=LF}]", status);
    fCM    = new UnicodeSet("[\\p{Line_break=CM}]", status);
    fNL    = new UnicodeSet("[\\p{Line_break=NL}]", status);
    fWJ    = new UnicodeSet("[\\p{Line_break=WJ}]", status);
    fZW    = new UnicodeSet("[\\p{Line_break=ZW}]", status);
    fGL    = new UnicodeSet("[\\p{Line_break=GL}]", status);
    fCB    = new UnicodeSet("[\\p{Line_break=CB}]", status);
    fSP    = new UnicodeSet("[\\p{Line_break=SP}]", status);
    fB2    = new UnicodeSet("[\\p{Line_break=B2}]", status);
    fBA    = new UnicodeSet("[\\p{Line_break=BA}]", status);
    fBB    = new UnicodeSet("[\\p{Line_break=BB}]", status);
    fHY    = new UnicodeSet("[\\p{Line_break=HY}]", status);
    fH2    = new UnicodeSet("[\\p{Line_break=H2}]", status);
    fH3    = new UnicodeSet("[\\p{Line_break=H3}]", status);
    fCL    = new UnicodeSet("[\\p{Line_break=CL}]", status);
    fEX    = new UnicodeSet("[\\p{Line_break=EX}]", status);
    fIN    = new UnicodeSet("[\\p{Line_break=IN}]", status);
    fJL    = new UnicodeSet("[\\p{Line_break=JL}]", status);
    fJV    = new UnicodeSet("[\\p{Line_break=JV}]", status);
    fJT    = new UnicodeSet("[\\p{Line_break=JT}]", status);
    fNS    = new UnicodeSet("[\\p{Line_break=NS}]", status);
    fOP    = new UnicodeSet("[\\p{Line_break=OP}]", status);
    fQU    = new UnicodeSet("[\\p{Line_break=QU}]", status);
    fIS    = new UnicodeSet("[\\p{Line_break=IS}]", status);
    fNU    = new UnicodeSet("[\\p{Line_break=NU}]", status);
    fPO    = new UnicodeSet("[\\p{Line_break=PO}]", status);
    fPR    = new UnicodeSet("[\\p{Line_break=PR}]", status);
    fSY    = new UnicodeSet("[\\p{Line_break=SY}]", status);
    fAI    = new UnicodeSet("[\\p{Line_break=AI}]", status);
    fAL    = new UnicodeSet("[\\p{Line_break=AL}]", status);
    fID    = new UnicodeSet("[\\p{Line_break=ID}]", status);
    fSA    = new UnicodeSet("[\\p{Line_break=SA}]", status);
    fSG    = new UnicodeSet("[\\ud800-\\udfff]", status);
    fXX    = new UnicodeSet("[\\p{Line_break=XX}]", status);

    if (U_FAILURE(status)) {
        // fCharBI and fNumberMatcher stay NULL (set above).
        deferredStatus = status;
        return;
    }

    fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
    fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
    fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
    fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.

    // Register the character classes returned by charClasses().
    // fXX is not registered separately; its contents were merged into fAL above.
    fSets->addElement(fBK, status);
    fSets->addElement(fCR, status);
    fSets->addElement(fLF, status);
    fSets->addElement(fCM, status);
    fSets->addElement(fNL, status);
    fSets->addElement(fWJ, status);
    fSets->addElement(fZW, status);
    fSets->addElement(fGL, status);
    fSets->addElement(fCB, status);
    fSets->addElement(fSP, status);
    fSets->addElement(fB2, status);
    fSets->addElement(fBA, status);
    fSets->addElement(fBB, status);
    fSets->addElement(fHY, status);
    fSets->addElement(fH2, status);
    fSets->addElement(fH3, status);
    fSets->addElement(fCL, status);
    fSets->addElement(fEX, status);
    fSets->addElement(fIN, status);
    fSets->addElement(fJL, status);
    fSets->addElement(fJT, status);
    fSets->addElement(fJV, status);
    fSets->addElement(fNS, status);
    fSets->addElement(fOP, status);
    fSets->addElement(fQU, status);
    fSets->addElement(fIS, status);
    fSets->addElement(fNU, status);
    fSets->addElement(fPO, status);
    fSets->addElement(fPR, status);
    fSets->addElement(fSY, status);
    fSets->addElement(fAI, status);
    fSets->addElement(fAL, status);
    fSets->addElement(fID, status);
    // NOTE(review): fWJ is already in the list (see above), so it appears twice.
    //   Kept as is because the number and order of entries presumably
    //   determine the random test data generated from a given seed - confirm
    //   before removing.
    fSets->addElement(fWJ, status);
    fSets->addElement(fSA, status);
    fSets->addElement(fSG, status);

    // Pattern for a complete number:
    //    optional prefix (PR | PO), optional (OP | HY), one NU,
    //    any run of (NU | IS | SY), optional CL, optional postfix (PR | PO),
    //    each element optionally followed by combining marks (CM*).
    fNumberMatcher = new RegexMatcher(
        "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
        "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
        "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
        "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
        "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"
        "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?",
        0, status);

    fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);

    if (U_FAILURE(status)) {
        deferredStatus = status;
    }
}


void RBBILineMonkey::setText(const UnicodeString &s) {
    fText       = &s;
    fCharBI->setText(s);
    fNumberMatcher->reset(s);
}

//
//  rule9Adjust
//     Implements Line Break rules LB 9 and LB 10, which cover combining marks
//     and other sequences that must be treated as if they were something
//     other than what they actually are.
//
//     Kept as a separate function because next() applies it twice for each
//     potential break: once to the characters before the position being
//     checked, and once to the text that follows it.
//
//     pos       Index of the character under consideration; -1 means "no
//               character yet", in which case nothing is changed.
//     posChar   In/out: the character at pos.  Replaced by 'A' (an AL
//               character) when it is itself a combining mark.
//     nextPos   In/out: index of the following character; advanced past any
//               combining marks attached to *posChar.
//     nextChar  Out: the character found at the updated *nextPos.
//
void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
    // Warm-up iteration of the main loop in next(): there is no valid
    //   position yet, so leave everything untouched.
    if (pos == -1) {
        return;
    }

    int32_t  scanPos = *nextPos;

    // LB 9  Keep combining sequences together.
    //  Spaces, hard line breaks and zero width space do not take combining
    //  marks.  Note that Line Break CM is different from the normal Grapheme
    //  Extend property.
    const UChar32 baseChar = *posChar;
    const UBool   noMarksAfterBase =
        fSP->contains(baseChar) || fBK->contains(baseChar) || baseChar == 0x0d ||
        baseChar == 0x0a || fNL->contains(baseChar) || fZW->contains(baseChar);

    if (!noMarksAfterBase) {
        // Step over the run of CM characters that follows the base character.
        *nextChar = fText->char32At(scanPos);
        while (fCM->contains(*nextChar)) {
            scanPos   = fText->moveIndex32(scanPos, 1);
            *nextChar = fText->char32At(scanPos);
        }
    }

    // LB 9 Treat X CM* as if it were X.
    //       Nothing more to do for that here.

    // LB 10  Treat any remaining combining mark as AL
    if (fCM->contains(*posChar)) {
        *posChar = 0x41;   // 'A'
    }

    // Report the (possibly advanced) following position and its character
    // back to the caller.  They differ from the incoming values only when a
    // combining sequence was consumed above.
    *nextPos  = scanPos;
    *nextChar = fText->char32At(scanPos);
}


//
//  next
//     Find the first line break following startPos in the text set by setText().
//
//     The rules are tested in order for each candidate position (the boundary
//     between prevChar and thisChar).  Within the loop, "continue" means
//     "no break here, advance to the next position" and "break" means
//     "break found, stop".
//
//     startPos   Index in the text at which to start scanning.
//     returns    -1 if startPos is at or beyond the end of the text; otherwise
//                the value of pos when the loop stops, i.e. the index of the
//                character following the break (the end of the text for LB2).
//
int32_t RBBILineMonkey::next(int32_t startPos) {
    UErrorCode status = U_ZERO_ERROR;
    int32_t    pos;       //  Index of the char following a potential break position
    UChar32    thisChar;  //  Character at above position "pos"

    int32_t    prevPos;   //  Index of the char preceding a potential break position
    UChar32    prevChar;  //  Character at above position.  Note that prevChar
                          //   and thisChar may not be adjacent because combining
                          //   characters between them will be ignored.

    int32_t    nextPos;   //  Index of the next character following pos.
                          //     Usually skips over combining marks.
    int32_t    nextCPPos; //  Index of the code point following "pos."
                          //     May point to a combining mark.
    int32_t    tPos;      //  temp value.
    UChar32    c;

    if (startPos >= fText->length()) {
        return -1;
    }


    // Initial values for loop.  Loop will run the first time without finding breaks,
    //                           while the invalid values shift out and the "this" and
    //                           "prev" positions are filled in with good values.
    pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
    thisChar = prevChar  = 0;
    nextPos  = nextCPPos = startPos;


    // Loop runs once per position in the test text, until a break position
    //  is found.
    for (;;) {
        // Shift the window forward: "this" becomes "prev", "next" becomes "this".
        prevPos   = pos;
        prevChar  = thisChar;

        pos       = nextPos;
        thisChar  = fText->char32At(pos);

        nextCPPos = fText->moveIndex32(pos, 1);
        nextPos   = nextCPPos;

        // Rule LB2 - Break at end of text.
        if (pos >= fText->length()) {
            break;
        }

        // Rule LB 9 - adjust for combining sequences.
        //             We do this one out-of-order because the adjustment does not change anything
        //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
        //             be applied.
        //             The first call may move pos past combining marks that belong to
        //             prevChar; the second does the same for nextPos relative to thisChar.
        //             The value left in c by the second call is not used afterwards.
        rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
        c = fText->char32At(nextPos);
        rule9Adjust(pos,     &thisChar, &nextPos, &c);

        // If the loop is still warming up - if we haven't shifted the initial
        //   -1 positions out of prevPos yet - loop back to advance the
        //    position in the input without any further looking for breaks.
        if (prevPos == -1) {
            continue;
        }

        // LB 4  Always break after hard line breaks,
        if (fBK->contains(prevChar)) {
            break;
        }

        // LB 5  Break after CR, LF, NL, but not inside CR LF
        //       (tested with the literal code points 0x0d, 0x0a, 0x85 rather than
        //        with the fCR / fLF / fNL sets)
        if (prevChar == 0x0d && thisChar == 0x0a) {
            continue;
        }
        if (prevChar == 0x0d ||
            prevChar == 0x0a ||
            prevChar == 0x85)  {
            break;
        }

        // LB 6  Don't break before hard line breaks
        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
            fBK->contains(thisChar)) {
                continue;
        }


        // LB 7  Don't break before spaces or zero-width space.
        if (fSP->contains(thisChar)) {
            continue;
        }

        if (fZW->contains(thisChar)) {
            continue;
        }

        // LB 8  Break after zero width space
        if (fZW->contains(prevChar)) {
            break;
        }

        // LB 9, 10  Already done, at top of loop.
        //


        // LB 11  Do not break before or after WORD JOINER and related characters.
        //    x  WJ
        //    WJ  x
        //
        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
            continue;
        }

        // LB 12
        //    (!SP) x  GL
        //    GL  x
        //    (&& binds tighter than ||, so this reads  (!SP && GL) || GL-before )
        if ((!fSP->contains(prevChar)) && fGL->contains(thisChar) ||
             fGL->contains(prevChar)) {
            continue;
        }


        // LB 13  Don't break before closings.
        //        NU x CL  and NU x IS are not matched here so that they will
        //        fall into LB 17 and the more general number regular expression.
        //        EX is matched regardless of what precedes it.
        //
        if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||
                                        fEX->contains(thisChar) ||
            !fNU->contains(prevChar) && fIS->contains(thisChar) ||
            !fNU->contains(prevChar) && fSY->contains(thisChar))    {
            continue;
        }

        // LB 14 Don't break after OP SP*
        //       Scan backwards, checking for this sequence.
        //       The OP char could include combining marks, so we actually check for
        //           OP CM* SP*
        //       Another Twist: The Rule 67 fixes may have changed a SP CM
        //       sequence into a ID char, so before scanning back through spaces,
        //       verify that prevChar is indeed a space.  The prevChar variable
        //       may differ from fText[prevPos]
        tPos = prevPos;
        if (fSP->contains(prevChar)) {
            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
        }
        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
            tPos=fText->moveIndex32(tPos, -1);
        }
        if (fOP->contains(fText->char32At(tPos))) {
            continue;
        }


        // LB 15    QU SP* x OP
        if (fOP->contains(thisChar)) {
            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
            // Note: this local tPos hides the function-level tPos declared above.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fQU->contains(fText->char32At(tPos))) {
                continue;
            }
        }


        // LB 16   CL SP* x NS
        //    Scan backwards for SP* CM* CL
        if (fNS->contains(thisChar)) {
            // Note: this local tPos hides the function-level tPos declared above.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fCL->contains(fText->char32At(tPos))) {
                continue;
            }
        }


        // LB 17        B2 SP* x B2
        if (fB2->contains(thisChar)) {
            //  Scan backwards, checking for the B2 CM* SP* sequence.
            tPos = prevPos;
            if (fSP->contains(prevChar)) {
                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                    tPos=fText->moveIndex32(tPos, -1);
                }
            }
            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
            if (fB2->contains(fText->char32At(tPos))) {
                continue;
            }
        }


        // LB 18    break after space
        if (fSP->contains(prevChar)) {
            break;
        }

        // LB 19
        //    x   QU
        //    QU  x
        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
            continue;
        }

        // LB 20  Break around a CB
        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
            break;
        }

        // LB 21
        //    x  BA
        //    x  HY
        //    x  NS
        //    BB x
        if (fBA->contains(thisChar) ||
            fHY->contains(thisChar) ||
            fNS->contains(thisChar) ||
            fBB->contains(prevChar) )   {
            continue;
        }

        // LB 22
        //    (AL | ID | IN | NU)  x  IN
        if (fAL->contains(prevChar) && fIN->contains(thisChar) ||
            fID->contains(prevChar) && fIN->contains(thisChar) ||
            fIN->contains(prevChar) && fIN->contains(thisChar) ||
            fNU->contains(prevChar) && fIN->contains(thisChar) )   {
            continue;
        }


        // LB 23    ID x PO
        //          AL x NU
        //          NU x AL
        if (fID->contains(prevChar) && fPO->contains(thisChar) ||
            fAL->contains(prevChar) && fNU->contains(thisChar) ||
            fNU->contains(prevChar) && fAL->contains(thisChar) )   {
            continue;
        }

        // LB 24  Do not break between prefix and letters or ideographs.
        //        PR x ID
        //        PR x AL
        //        PO x AL
        if (fPR->contains(prevChar) && fID->contains(thisChar) ||
            fPR->contains(prevChar) && fAL->contains(thisChar) ||
            fPO->contains(prevChar) && fAL->contains(thisChar) )   {
            continue;
        }


        // LB 25    Numbers
        //          Try to match the number pattern starting at prevPos.
        //          Note that status is only examined when lookingAt() reported a match.
        if (fNumberMatcher->lookingAt(prevPos, status)) {
            if (U_FAILURE(status)) {
                break;
            }
            // Matched a number.  But could have been just a single digit, which would
            //    not represent a "no break here" between prevChar and thisChar
            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
            if (numEndIdx > pos) {
                // Number match includes at least our two chars being checked
                if (numEndIdx > nextPos) {
                    // Number match includes additional chars.  Update pos and nextPos
                    //   so that next loop iteration will continue at the end of the number,
                    //   checking for breaks between last char in number & whatever follows.
                    //   pos is backed up from the end of the match to the last
                    //   character of the number that is not a combining mark.
                    pos = nextPos = numEndIdx;
                    do {
                        pos = fText->moveIndex32(pos, -1);
                        thisChar = fText->char32At(pos);
                    } while (fCM->contains(thisChar));
                }
                continue;
            }
        }


        // LB 26 Do not break a Korean syllable.
        //    JL  x  (JL | JV | H2 | H3)
        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
                                        fJV->contains(thisChar) ||
                                        fH2->contains(thisChar) ||
                                        fH3->contains(thisChar))) {
                                            continue;
                                        }

        //    (JV | H2)  x  (JV | JT)
        if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
                continue;
        }

        //    (JT | H3)  x  JT
        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
            fJT->contains(thisChar)) {
                continue;
        }

        // LB 27 Treat a Korean Syllable Block the same as ID.
        //    (JL | JV | JT | H2 | H3)  x  IN
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fIN->contains(thisChar)) {
                continue;
            }
        //    (JL | JV | JT | H2 | H3)  x  PO
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fPO->contains(thisChar)) {
                continue;
            }
        //    PR  x  (JL | JV | JT | H2 | H3)
        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
                continue;
            }


        // LB 28  Do not break between alphabetics ("at").
        if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
            continue;
        }

        // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
        if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
            continue;
        }

        //LB 30 Do not break between letters, numbers or ordinary symbols and opening or closing punctuation
        //      (AL | NU) x OP
        //       CL x (AL | NU)
        if ((fAL->contains(prevChar) || fNU->contains(prevChar)) &&
              fOP->contains(thisChar)) {
            continue;
        }
        if (fCL->contains(prevChar) &&
            (fAL->contains(thisChar) || fNU->contains(thisChar))) {
            continue;
        }


        // LB 31    Break everywhere else
        break;

    }

    return pos;
}


// Accessor for the list of character class sets assembled by the constructor.
// The returned vector is still owned by this object.
UVector  *RBBILineMonkey::charClasses() {
    UVector *classList = this->fSets;
    return classList;
}


// Destructor.  Frees the class list, then each character class set, then the
// character break iterator and the number matcher.
RBBILineMonkey::~RBBILineMonkey() {
    delete fSets;

    // The sets are released one by one, in this order.
    UnicodeSet *ownedSets[] = {
        fBK, fCR, fLF, fCM, fNL, fWJ, fZW, fGL, fCB, fSP, fB2, fBA,
        fBB, fHY, fH2, fH3, fCL, fEX, fIN, fJL, fJV, fJT, fNS, fOP,
        fQU, fIS, fNU, fPO, fPR, fSY, fAI, fAL, fID, fSA, fSG, fXX
    };
    const int32_t ownedSetCount = (int32_t)(sizeof(ownedSets) / sizeof(ownedSets[0]));
    for (int32_t setIdx = 0; setIdx < ownedSetCount; ++setIdx) {
        delete ownedSets[setIdx];
    }

    delete fCharBI;
    delete fNumberMatcher;
}


//-------------------------------------------------------------------------------------------
//
//   TestMonkey
//
//     params
//       seed=nnnnn        Random number starting seed.
//                         Setting the seed allows errors to be reproduced.
//       loop=nnn          Looping count.  Controls running time.
//                         -1:  run forever.
//                          0 or greater:  run length.
//
//       type = char | word | line | sent | title
//
//-------------------------------------------------------------------------------------------

// Look for "<name> = <integer>" in the parameter string.
//
//   name        Name of the parameter to look for.
//   params      The parameter string.  When the parameter is found, its text
//               is removed from this string.
//   defaultVal  Value to return when the parameter is not present.
//   returns     The integer value of the parameter, or defaultVal.
static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
    UErrorCode status = U_ZERO_ERROR;

    // Turn the parameter name into a pattern: name, '=', optionally signed digits.
    name.append(" *= *(-?\\d+)");
    RegexMatcher m(name, params, 0, status);

    if (!m.find()) {
        // Parameter not present.
        U_ASSERT(U_SUCCESS(status));
        return defaultVal;
    }

    // The param exists.  Copy the digits (capture group 1) out into a char
    // buffer, limiting the number of characters copied, and convert to an int.
    char valString[100];
    const int32_t maxChars   = (int32_t)(sizeof(valString)-2);
    const int32_t digitsFrom = m.start(1, status);
    const int32_t digitsTo   = m.end(1, status);
    int32_t paramLength = digitsTo - digitsFrom;
    if (paramLength > maxChars) {
        paramLength = maxChars;
    }
    params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
    int32_t val = strtol(valString,  NULL, 10);

    // Remove the parameter that was just consumed from the params string.
    m.reset();
    params = m.replaceFirst("", status);

    U_ASSERT(U_SUCCESS(status));
    return val;
}
#endif

// Check a break iterator against a list of expected boundaries.
//
//   test           Test object used to report errors.
//   ustr           The text to iterate over.
//   bi             The break iterator being tested; its text is set to ustr.
//   expected       Expected boundary positions, in ascending order.
//   expectedcount  Number of entries in expected.
//
// Four things are verified, stopping at the first failure:
//   1. first()/next() produce exactly the expected boundaries;
//   2. isBoundary() agrees with the expected boundaries;
//   3. last()/previous() produce the same boundaries in reverse order;
//   4. preceding() returns the expected boundary for each later position.
static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
                                    BreakIterator *bi,
                                    int expected[],
                                    int expectedcount)
{
    const int kMaxBreaks = 50;
    int count = 0;
    int i = 0;
    int forward[kMaxBreaks];
    bi->setText(ustr);
    for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
        if (count >= kMaxBreaks) {
            // Do not write past the end of forward[].
            printStringBreaks(ustr, expected, expectedcount);
            test->errln("break forward test failed: more than %d boundaries found",
                        kMaxBreaks);
            return;
        }
        forward[count] = i;
        if (count < expectedcount && expected[count] != i) {
            test->errln("break forward test failed: expected %d but got %d",
                        expected[count], i);
            break;
        }
        count ++;
    }
    if (count != expectedcount) {
        printStringBreaks(ustr, expected, expectedcount);
        test->errln("break forward test failed: missed %d match",
                    expectedcount - count);
        return;
    }
    // testing boundaries
    for (i = 1; i < expectedcount; i ++) {
        int j = expected[i - 1];
        if (!bi->isBoundary(j)) {
            printStringBreaks(ustr, expected, expectedcount);
            test->errln("isBoundary() failed.  Expected boundary at position %d", j);
            return;
        }
        // No position strictly between two expected boundaries may be a boundary.
        for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
            if (bi->isBoundary(j)) {
                printStringBreaks(ustr, expected, expectedcount);
                test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
                return;
            }
        }
    }

    // Walk backwards; count runs back down through the boundaries recorded above.
    for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
        count --;
        if (count < 0) {
            // previous() produced more boundaries than the forward pass did.
            // Stop before indexing forward[] with a negative value; the
            // count != 0 check below reports the failure.
            break;
        }
        if (forward[count] != i) {
            test->errln("happy break test previous() failed: expected %d but got %d",
                        forward[count], i);
            break;
        }
    }
    if (count != 0) {
        printStringBreaks(ustr, expected, expectedcount);
        test->errln("break test previous() failed: missed a match");
        return;
    }

    // testing preceding
    for (i = 0; i < expectedcount - 1; i ++) {
        // Start one code point (not one code unit) past the boundary.
        int j = ustr.moveIndex32(expected[i], 1);
        for (; j <= expected[i + 1]; j ++) {
            if (bi->preceding(j) != expected[i]) {
                printStringBreaks(ustr, expected, expectedcount);
                test->errln("preceding(): Not expecting boundary at position %d", j);
                return;
            }
        }
    }
}

void RBBITest::TestWordBreaks(void)
{
#if !UCONFIG_NO_REGULAR_EXPRESSIONS

    // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
    Locale        locale("en");
    UErrorCode    status = U_ZERO_ERROR;
    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
    UChar         str[300];
    static const char *strlist[] =
    {
    "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
    "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
    "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u179c\\u0027\\U000e0061\\u003a",
    "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
    "\\u90ca\\u3588\\u009c\\u0953\\u194b",
    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
    "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
    "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
    "\\u2027\\U000e0067\\u0a47\\u00b7",
    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
    "\\u0589\\U000e006e\\u0a42\\U000104a5",
    "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
    "\\u0027\\u11af\\U000e0057\\u0602",
    "\\U0001d7f2\\U000e007\\u0004\\u0589",
    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
    "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
    "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
    "\\u0233\\U000e0020\\u0a69\\u0d6a",
    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
    "\\u58f4\\U000e0049\\u20e7\\u2027",
    "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
    "\\ua183\\u102d\\u0bec\\u003a",
    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
    "\\u003a\\u0e57\\u0fad\\u002e",
    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
    "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
    "\\u003a\\u0664\\u00b7\\u1fba",
    "\\u003b\\u0027\\u00b7\\u47a3",
    "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
    "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
    "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
    };
    int loop;
    if (U_FAILURE(status)) {
        errln("Creation of break iterator failed %s", u_errorName(status));
        return;
    }
    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
        // printf("looping %d\n", loop);
        u_unescape(strlist[loop], str, 25);
        UnicodeString ustr(str);
        // RBBICharMonkey monkey;
        RBBIWordMonkey monkey;

        int expected[50];
        int expectedcount = 0;

        monkey.setText(ustr);
        int i;
        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
            expected[expectedcount ++] = i;
        }

        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
    }
    delete bi;
#endif
}

void RBBITest::TestWordBoundary(void)
{
    // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
    Locale        locale("en");
    UErrorCode    status = U_ZERO_ERROR;
    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
    UChar         str[50];
    static const char *strlist[] =
    {
    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
    "\\u2027\\U000e0067\\u0a47\\u00b7",
    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
    "\\u0589\\U000e006e\\u0a42\\U000104a5",
    "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
    "\\u0027\\u11af\\U000e0057\\u0602",
    "\\U0001d7f2\\U000e007\\u0004\\u0589",
    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
    "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
    "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
    "\\u0233\\U000e0020\\u0a69\\u0d6a",
    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
    "\\u58f4\\U000e0049\\u20e7\\u2027",
    "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
    "\\ua183\\u102d\\u0bec\\u003a",
    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
    "\\u003a\\u0e57\\u0fad\\u002e",
    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
    "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
    "\\u003a\\u0664\\u00b7\\u1fba",
    "\\u003b\\u0027\\u00b7\\u47a3",
    };
    int loop;
    if (U_FAILURE(status)) {
        errln("Creation of break iterator failed %s", u_errorName(status));
        return;
    }
    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
        // printf("looping %d\n", loop);
        u_unescape(strlist[loop], str, 20);
        UnicodeString ustr(str);
        int forward[50];
        int count = 0;

        bi->setText(ustr);
        int prev = 0;
        int i;
        for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
            forward[count ++] = i;
            if (i > prev) {
                int j;
                for (j = prev + 1; j < i; j ++) {
                    if (bi->isBoundary(j)) {
                        printStringBreaks(ustr, forward, count);
                        errln("happy boundary test failed: expected %d not a boundary",
                               j);
                        return;
                    }
                }
            }
            if (!bi->isBoundary(i)) {
                printStringBreaks(ustr, forward, count);
                errln("happy boundary test failed: expected %d a boundary",
                       i);
                return;
            }
            prev = i;
        }
    }
    delete bi;
}

// TestLineBreaks
//   Regression test for line breaking over a fixed list of escaped strings.
//   For each entry:
//     - the text is unescaped into a fixed-size UChar buffer (entries that do
//       not fit are reported and skipped),
//     - an RBBILineMonkey supplies the expected boundary positions via next(),
//     - those positions and the "en" line break iterator are handed to
//       testBreakBoundPreceding().
//   The body is compiled only when regular expressions are available.
void RBBITest::TestLineBreaks(void)
{
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
    Locale        locale("en");
    UErrorCode    status = U_ZERO_ERROR;
    BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
    const int32_t  STRSIZE = 50;
    UChar         str[STRSIZE];
    static const char *strlist[] =
    {
     "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
     "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
             "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
     "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
             "u2014\\U000e0105\\u118c\\u000a\\u07f8",
     "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
     "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
     "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
     "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
     "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
     "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
     "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
     "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
     "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
     "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
     "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
     "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
     "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
     "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
     "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
     "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
     "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
     "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
     "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
     "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
     "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
     "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
     "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
     "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
     "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
     "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
     "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
     "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
     "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
     "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
     "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
     "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
     "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
     "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
     "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
     "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
     "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
         "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
         "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
         "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
     "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
         "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
    };
    int loop;
    TEST_ASSERT_SUCCESS(status);
    if (U_FAILURE(status)) {
        delete bi;     // Deleting NULL is harmless; avoids a leak if a pointer was returned.
        return;
    }
    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
        // printf("looping %d\n", loop);
        int32_t t = u_unescape(strlist[loop], str, STRSIZE);
        if (t >= STRSIZE) {
            // The unescaped text (plus terminator) does not fit in str;
            //  report it and move on to the next entry.
            TEST_ASSERT(FALSE);
            continue;
        }


        UnicodeString ustr(str);
        RBBILineMonkey monkey;
        if (U_FAILURE(monkey.deferredStatus)) {
            continue;
        }

        const int EXPECTEDSIZE = 50;
        int expected[EXPECTEDSIZE];
        int expectedcount = 0;

        // Collect the boundary positions reported by the monkey for this text.
        monkey.setText(ustr);
        int i;
        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
            if (expectedcount >= EXPECTEDSIZE) {
                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
                delete bi;     // Do not leak the iterator when abandoning the test.
                return;
            }
            expected[expectedcount ++] = i;
        }

        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
    }
    delete bi;
#endif
}

void RBBITest::TestSentBreaks(void)
{
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
    Locale        locale("en");
    UErrorCode    status = U_ZERO_ERROR;
    BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
    UChar         str[200];
    static const char *strlist[] =
    {
     "Now\ris\nthe\r\ntime\n\rfor\r\r",
     "This\n",
     "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
     "\"Sentence ending with a quote.\" Bye.",
     "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
     "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
     "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
     "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
     "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
     "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
     "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
             "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
             "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
             "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
     "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
             "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
             "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
             "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
             "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
             "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
    };
    int loop;
    if (U_FAILURE(status)) {
        errln("Creation of break iterator failed %s", u_errorName(status));
        return;
    }
    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
        u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
        UnicodeString ustr(str);

        RBBISentMonkey monkey;
        if (U_FAILURE(monkey.deferredStatus)) {
            continue;
        }

        const int EXPECTEDSIZE = 50;
        int expected[EXPECTEDSIZE];
        int expectedcount = 0;

        monkey.setText(ustr);
        int i;
        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
            if (expectedcount >= EXPECTEDSIZE) {
                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
                return;
            }
            expected[expectedcount ++] = i;
        }

        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
    }
    delete bi;
#endif
}

void RBBITest::TestMonkey(char *params) {
#if !UCONFIG_NO_REGULAR_EXPRESSIONS

    UErrorCode     status    = U_ZERO_ERROR;
    int32_t        loopCount = 500;
    int32_t        seed      = 1;
    UnicodeString  breakType = "all";
    Locale         locale("en");
    UBool          useUText  = FALSE;

    if (quick == FALSE) {
        loopCount = 10000;
    }

    if (params) {
        UnicodeString p(params);
        loopCount = getIntParam("loop", p, loopCount);
        seed      = getIntParam("seed", p, seed);

        RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
        if (m.find()) {
            breakType = m.group(1, status);
            m.reset();
            p = m.replaceFirst("", status);
        }

        RegexMatcher u(" *utext", p, 0, status);
        if (u.find()) {
            useUText = TRUE;
            u.reset();
            p = u.replaceFirst("", status);
        }


        // m.reset(p);
        if (RegexMatcher("\\S", p, 0, status).find()) {
            // Each option is stripped out of the option string as it is processed.
            // All options have been checked.  The option string should have been completely emptied..
            char buf[100];
            p.extract(buf, sizeof(buf), NULL, status);
            buf[sizeof(buf)-1] = 0;
            errln("Unrecognized or extra parameter:  %s\n", buf);
            return;
        }

    }

    if (breakType == "char" || breakType == "all") {
        RBBICharMonkey  m;
        BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
        if (U_SUCCESS(status)) {
            RunMonkey(bi, m, "char", seed, loopCount, useUText);
            if (breakType == "all" && useUText==FALSE) {
                // Also run a quick test with UText when "all" is specified
                RunMonkey(bi, m, "char", seed, loopCount, TRUE);
            }
        }
        else {
            errln("Creation of character break iterator failed %s", u_errorName(status));
        }
        delete bi;
    }

    if (breakType == "word" || breakType == "all") {
        logln("Word Break Monkey Test");
        RBBIWordMonkey  m;
        BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
        if (U_SUCCESS(status)) {
            RunMonkey(bi, m, "word", seed, loopCount, useUText);
        }
        else {
            errln("Creation of word break iterator failed %s", u_errorName(status));
        }
        delete bi;
    }

    if (breakType == "line" || breakType == "all") {
        logln("Line Break Monkey Test");
        RBBILineMonkey  m;
        BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
        if (loopCount >= 10) {
            loopCount = loopCount / 5;   // Line break runs slower than the others.
        }
        if (U_SUCCESS(status)) {
            RunMonkey(bi, m, "line", seed, loopCount, useUText);
        }
        else {
            errln("Creation of line break iterator failed %s", u_errorName(status));
        }
        delete bi;
    }

    if (breakType == "sent" || breakType == "all"  ) {
        logln("Sentence Break Monkey Test");
        RBBISentMonkey  m;
        BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
        if (loopCount >= 10) {
            loopCount = loopCount / 10;   // Sentence runs slower than the other break types
        }
        if (U_SUCCESS(status)) {
            RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
        }
        else {
            errln("Creation of line break iterator failed %s", u_errorName(status));
        }
        delete bi;
    }

#endif
}

//
//  Run a RBBI monkey test.  Common routine, for all break iterator types.
//    Parameters:
//       bi      - the break iterator to use
//       mk      - MonkeyKind, abstraction for obtaining expected results
//       name    - Name of test (char, word, etc.) for use in error messages
//       seed    - Seed for starting random number generator (parameter from user)
//       numIterations - number of random strings to test; -1 loops forever
//       useUText - if TRUE, text is given to the break iterator through a UText
//                  rather than as a UnicodeString
//
//    Each iteration builds a random string from the MonkeyKind's character
//    classes, obtains the expected boundaries from the MonkeyKind, and compares
//    them with what the break iterator reports through next(), previous(),
//    isBoundary(), following() and preceding().  The first mismatch in a string
//    is reported together with a <data> snippet of the surrounding text.
//
void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
                         int32_t numIterations, UBool useUText) {

#if !UCONFIG_NO_REGULAR_EXPRESSIONS

    const int32_t    TESTSTRINGLEN = 500;
    UnicodeString    testText;
    int32_t          numCharClasses;
    UVector          *chClasses;
    int              expected[TESTSTRINGLEN*2 + 1];
    int              expectedCount = 0;
    // One flag per string index.  Sized for TESTSTRINGLEN code points of up to
    //  two UChars each, plus the end-of-text position.
    char             expectedBreaks[TESTSTRINGLEN*2 + 1];
    char             forwardBreaks[TESTSTRINGLEN*2 + 1];
    char             reverseBreaks[TESTSTRINGLEN*2+1];
    char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
    char             followingBreaks[TESTSTRINGLEN*2+1];
    char             precedingBreaks[TESTSTRINGLEN*2+1];
    int              i;
    int              loopCount = 0;

    m_seed = seed;

    // Check for errors that occured during the construction of the MonkeyKind object.
    //  Can't report them where they occured because errln() is a method coming from intlTest,
    //  and is not visible outside of RBBITest :-(
    // This is done before touching the character classes, which may not be
    //  usable if construction failed.
    if (U_FAILURE(mk.deferredStatus)) {
        errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
        return;
    }

    chClasses = mk.charClasses();
    if (chClasses == NULL || chClasses->size() == 0) {
        // Without at least one class the random selection below would divide by zero.
        errln("%s break monkey test: no character classes available.", name);
        return;
    }
    numCharClasses = chClasses->size();

    // Verify that the character classes all have at least one member.
    for (i=0; i<numCharClasses; i++) {
        UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
        if (s == NULL || s->size() == 0) {
            errln("Character Class #%d is null or of zero size.", i);
            return;
        }
    }

    while (loopCount < numIterations || numIterations == -1) {
        if (numIterations == -1 && loopCount % 10 == 0) {
            // If test is running in an infinite loop, display a periodic tic so
            //   we can tell that it is making progress.
            fprintf(stderr, ".");
        }
        // Save current random number seed, so that we can recreate the random numbers
        //   for this loop iteration in event of an error.
        seed = m_seed;

        // Populate a test string with data.
        testText.truncate(0);
        for (i=0; i<TESTSTRINGLEN; i++) {
            int32_t  aClassNum = m_rand() % numCharClasses;
            UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
            int32_t   charIdx = m_rand() % classSet->size();
            UChar32   c = classSet->charAt(charIdx);
            if (c < 0) {   // TODO:  deal with sets containing strings.
                errln("c < 0");
                break;
            }
            testText.append(c);
        }

        // Calculate the expected results for this test string.
        mk.setText(testText);
        memset(expectedBreaks, 0, sizeof(expectedBreaks));
        expectedBreaks[0] = 1;
        int32_t breakPos = 0;
        expectedCount = 0;
        for (;;) {
            breakPos = mk.next(breakPos);
            if (breakPos == -1) {
                break;
            }
            if (breakPos > testText.length()) {
                errln("breakPos > testText.length()");
                // Do not use an out-of-range position as an array index.
                break;
            }
            expectedBreaks[breakPos] = 1;
            U_ASSERT(expectedCount<testText.length());
            expected[expectedCount ++] = breakPos;
        }

        // Find the break positions using forward iteration
        memset(forwardBreaks, 0, sizeof(forwardBreaks));
        if (useUText) {
            UErrorCode status = U_ZERO_ERROR;
            UText *testUText = utext_openReplaceable(NULL, &testText, &status);
            // testUText = utext_openUnicodeString(testUText, &testText, &status);
            bi->setText(testUText, status);
            TEST_ASSERT_SUCCESS(status);
            utext_close(testUText);   // The break iterator does a shallow clone of the UText
                                      //  This UText can be closed immediately, so long as the
                                      //  testText string continues to exist.
        } else {
            bi->setText(testText);
        }

        for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
            if (i < 0 || i > testText.length()) {
                errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
                break;
            }
            forwardBreaks[i] = 1;
        }

        // Find the break positions using reverse iteration
        memset(reverseBreaks, 0, sizeof(reverseBreaks));
        for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
            if (i < 0 || i > testText.length()) {
                errln("%s break monkey test: Out of range value returned by breakIterator::previous()", name);
                break;
            }
            reverseBreaks[i] = 1;
        }

        // Find the break positions using isBoundary() tests.
        memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
        U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
        for (i=0; i<=testText.length(); i++) {
            isBoundaryBreaks[i] = bi->isBoundary(i);
        }


        // Find the break positions using the following() function.
        // printf(".");
        memset(followingBreaks, 0, sizeof(followingBreaks));
        int32_t   lastBreakPos = 0;
        followingBreaks[0] = 1;
        for (i=0; i<testText.length(); i++) {
            breakPos = bi->following(i);
            // The result must lie after i, within the text, and must not skip
            //  over or fall behind the break found for the previous index.
            if (breakPos <= i ||
                breakPos < lastBreakPos ||
                breakPos > testText.length() ||
                (breakPos > lastBreakPos && lastBreakPos > i) ) {
                errln("%s break monkey test: "
                    "Out of range value returned by BreakIterator::following().\n"
                        "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
                         name, seed, i, breakPos, lastBreakPos);
                break;
            }
            followingBreaks[breakPos] = 1;
            lastBreakPos = breakPos;
        }

        // Find the break positions using the preceding() function.
        memset(precedingBreaks, 0, sizeof(precedingBreaks));
        lastBreakPos = testText.length();
        precedingBreaks[testText.length()] = 1;
        for (i=testText.length(); i>0; i--) {
            breakPos = bi->preceding(i);
            if (breakPos >= i ||
                breakPos > lastBreakPos ||
                (breakPos < 0 && testText.getChar32Start(i)>0) ||
                (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
                errln("%s break monkey test: "
                    "Out of range value returned by BreakIterator::preceding().\n"
                    "index=%d;  prev returned %d; lastBreak=%d" ,
                    name,  i, breakPos, lastBreakPos);
                precedingBreaks[i] = 2;   // Forces an error.
            } else {
                // A negative result is tolerated by the check above when i lies
                //  inside the first code point; it must not be used as an index.
                if (breakPos >= 0) {
                    precedingBreaks[breakPos] = 1;
                }
                lastBreakPos = breakPos;
            }
        }

        // Compare the expected and actual results.
        for (i=0; i<=testText.length(); i++) {
            const char *errorType = NULL;
            if  (forwardBreaks[i] != expectedBreaks[i]) {
                errorType = "next()";
            } else if (reverseBreaks[i] != forwardBreaks[i]) {
                errorType = "previous()";
            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
                errorType = "isBoundary()";
            } else if (followingBreaks[i] != expectedBreaks[i]) {
                errorType = "following()";
            } else if (precedingBreaks[i] != expectedBreaks[i]) {
                errorType = "preceding()";
            }


            if (errorType != NULL) {
                // Format a range of the test text that includes the failure as
                //  a data item that can be included in the rbbi test data file.

                // Start of the range is the last point where expected and actual results
                //   both agreed that there was a break position.
                int startContext = i;
                int32_t count = 0;
                for (;;) {
                    if (startContext==0) { break; }
                    startContext --;
                    if (expectedBreaks[startContext] != 0) {
                        if (count == 2) break;
                        count ++;
                    }
                }

                // End of range is two expected breaks past the start position.
                int endContext = i + 1;
                int ci;
                for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
                    for (;;) {
                        if (endContext >= testText.length()) {break;}
                        if (expectedBreaks[endContext-1] != 0) {
                            if (count == 0) break;
                            count --;
                        }
                        endContext ++;
                    }
                }

                // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
                UnicodeString errorText = "<data>";
                /***if (strcmp(errorType, "next()") == 0) {
                    startContext = 0;
                    endContext = testText.length();

                    printStringBreaks(testText, expected, expectedCount);
                }***/

                for (ci=startContext; ci<endContext;) {
                    UnicodeString hexChars("0123456789abcdef");
                    UChar32  c;
                    int      bn;
                    c = testText.char32At(ci);
                    if (ci == i) {
                        // This is the location of the error.
                        errorText.append("<?>");
                    } else if (expectedBreaks[ci] != 0) {
                        // This a non-error expected break position.
                        errorText.append("\\");
                    }
                    if (c < 0x10000) {
                        errorText.append("\\u");
                        for (bn=12; bn>=0; bn-=4) {
                            errorText.append(hexChars.charAt((c>>bn)&0xf));
                        }
                    } else {
                        errorText.append("\\U");
                        for (bn=28; bn>=0; bn-=4) {
                            errorText.append(hexChars.charAt((c>>bn)&0xf));
                        }
                    }
                    ci = testText.moveIndex32(ci, 1);
                }
                errorText.append("\\");
                errorText.append("</data>\n");

                // Output the error
                char  charErrorTxt[500];
                UErrorCode status = U_ZERO_ERROR;
                errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
                charErrorTxt[sizeof(charErrorTxt)-1] = 0;
                errln("%s break monkey test error.  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
                    name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
                    errorType, seed, i, charErrorTxt);
                // Only the first mismatch in each test string is reported.
                break;
            }
        }

        loopCount++;
    }
#endif
}

//
//  TestDebug    -  Scratch test kept around for debugging sessions.
//                  Drop fragments of other tests in here to trace them
//                  in isolation.  The whole body is currently compiled
//                  out with #if 0, so the test does nothing.
//
void RBBITest::TestDebug(void) {
#if 0
    UErrorCode   errorCode = U_ZERO_ERROR;
    int ruleStatus = 0;

    // Alternative iterators to try:
    //   (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), errorCode);
    //   (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), errorCode);
    RuleBasedBreakIterator* iter =
       (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), errorCode);

    // Alternative text to try:  UnicodeString text("Aaa.  Bcd");
    UnicodeString text("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
    text = text.unescape();
    iter->setText(text);

    UBool atBoundary = iter->isBoundary(8);
    printf("%s", atBoundary?"true":"false");
    return;

    // Not reached while the return above is in place:
    //  dump every boundary, walking backwards from the end of the text.
    int pos = iter->last();
    for (;;) {
        // ruleStatus = iter->getRuleStatus();
        printf("%d\t%d\n", pos, ruleStatus);
        pos = iter->previous();
        if (pos == BreakIterator::DONE) {
            break;
        }
    }
#endif
}

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
-												ICU-161 More copyright fixes.

X-SVN-Rev: 935
											
										
										
											2000-03-10 00:42:27 +00:00
+								/********************************************************************
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								 * COPYRIGHT:
-												ICU-5410 Improve code coverage.

X-SVN-Rev: 21736
											
										
										
											2007-06-14 19:48:47 +00:00
+								 * Copyright (c) 1999-2007, International Business Machines Corporation and
-												ICU-161 More copyright fixes.

X-SVN-Rev: 935
											
										
										
											2000-03-10 00:42:27 +00:00
+								 * others. All Rights Reserved.
 								 ********************************************************************/
 								/************************************************************************
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
+								*   Date        Name        Description
 								*   12/15/99    Madhu        Creation.
 								*   01/12/2000  Madhu        Updated for changed API and added new tests
 								************************************************************************/
-												ICU-2248 modularize ICU

X-SVN-Rev: 9910
											
										
										
											2002-09-21 00:43:14 +00:00
+								#include "unicode/utypes.h"
 								#if !UCONFIG_NO_BREAK_ITERATION
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
+								#include "unicode/utypes.h"
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
+								#include "unicode/brkiter.h"
 								#include "unicode/rbbi.h"
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
+								#include "unicode/uchar.h"
 								#include "unicode/utf16.h"
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								#include "unicode/ucnv.h"
 								#include "unicode/schriter.h"
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								#include "unicode/uniset.h"
 								#include "unicode/regex.h"        // TODO: make conditional on regexp being built.
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								#include "unicode/ustring.h"
-												ICU-3944 Text Access, rbbi impl fixes and tests added.

X-SVN-Rev: 18172
											
										
										
											2005-07-08 01:57:58 +00:00
+								#include "unicode/utext.h"
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								#include "intltest.h"
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
+								#include "rbbitst.h"
 								#include <string.h>
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								#include "uvector.h"
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								#include "uvectr32.h"
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								#include "triedict.h"
-												ICU-2093 intltest rbbitest, remove dependency on regexp

X-SVN-Rev: 11990
											
										
										
											2003-05-17 02:07:52 +00:00
+								#include <string.h>
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								#include <stdio.h>
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								#include <stdlib.h>
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
-												ICU-4157 4.1 RBBI changes. Stub out TestLineBreaks, which is looping; real fix to come later.

X-SVN-Rev: 17106
											
										
										
											2005-01-11 00:49:22 +00:00
+								#define TEST_ASSERT(x) {if (!(x)) { \
 								    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
 								#define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
 								    errln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
-												ICU-5775 add test case for reported RBBI problem

X-SVN-Rev: 21934
											
										
										
											2007-07-10 18:11:43 +00:00
+								//---------------------------------------------
 								// runIndexedTest
 								//---------------------------------------------
 								void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
 								{
 								    if (exec) logln("TestSuite RuleBasedBreakIterator: ");
 								    switch (index) {
 								        case 0: name = "TestBug4153072";
 								            if(exec) TestBug4153072();                         break;
 								        case 1: name = "TestJapaneseLineBreak";
 								            if(exec) TestJapaneseLineBreak();                  break;
 								        case 2: name = "TestStatusReturn";
 								            if(exec) TestStatusReturn();                       break;
 								        case 3: name = "TestUnicodeFiles";
 								            if(exec) TestUnicodeFiles();                       break;
 								        case 4: name = "TestEmptyString";
 								            if(exec) TestEmptyString();                        break;
 								        case 5: name = "TestGetAvailableLocales";
 								            if(exec) TestGetAvailableLocales();                break;
 								        case 6: name = "TestGetDisplayName";
 								            if(exec) TestGetDisplayName();                     break;
 								        case 7: name = "TestEndBehaviour";
 								            if(exec) TestEndBehaviour();                       break;
 								        case 8: name = "TestMixedThaiLineBreak";
 								             if(exec) TestMixedThaiLineBreak();                break;
 								        case 9: name = "TestThaiLineBreak";
 								             if(exec) TestThaiLineBreak();                     break;
 								        case 10: name = "TestMaiyamok";
 								             if(exec) TestMaiyamok();                          break;
 								        case 11: name = "TestWordBreaks";
 								             if(exec) TestWordBreaks();                        break;
 								        case 12: name = "TestWordBoundary";
 								             if(exec) TestWordBoundary();                      break;
 								        case 13: name = "TestLineBreaks";
 								             if(exec) TestLineBreaks();                        break;
 								        case 14: name = "TestSentBreaks";
 								             if(exec) TestSentBreaks();                        break;
 								        case 15: name = "TestExtended";
 								             if(exec) TestExtended();                          break;
 								        case 16: name = "TestMonkey";
 								             if(exec) {
 								 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
 								               TestMonkey(params);
 								 #else
 								               logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
 								 #endif
 								             }
 								                                                               break;
 								        case 17: name = "TestBug3818";
 								            if(exec) TestBug3818();                            break;
 								        case 18: name = "TestJapaneseWordBreak";
 								            if(exec) TestJapaneseWordBreak();                  break;
 								        case 19: name = "TestDebug";
 								            if(exec) TestDebug();                              break;
 								        case 20: name = "TestTrieDict";
 								            if(exec) TestTrieDict();                           break;
-												ICU-5775 Remove parentheses on test name

X-SVN-Rev: 22095
											
										
										
											2007-07-22 20:56:42 +00:00
+								        case 21: name = "TestBug5775";
-												ICU-5775 add test case for reported RBBI problem

X-SVN-Rev: 21934
											
										
										
											2007-07-10 18:11:43 +00:00
+								            if (exec) TestBug5775();                        break;
 								        default: name = ""; break; //needed to end loop
 								    }
 								}
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								//---------------------------------------------------------------------------
 								//
 								//   class BITestData   Holds a set of Break iterator test data and results
 								//                      Includes
 								//                         - the string data to be broken
 								//                         - a vector of the expected break positions.
 								//                         - a vector of source line numbers for the data,
 								//                               (to help see where errors occured.)
 								//                         - The expected break tag values.
 								//                         - Vectors of actual break positions and tag values.
 								//                         - Functions for comparing actual with expected and
 								//                            reporting errors.
 								//
 								//----------------------------------------------------------------------------
 								class BITestData {
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
+								public:
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								    UnicodeString    fDataToBreak;
 								    UVector          fExpectedBreakPositions;
 								    UVector          fExpectedTags;
 								    UVector          fLineNum;
 								    UVector          fActualBreakPositions;   // Test Results.
 								    UVector          fActualTags;
 								    BITestData(UErrorCode &status);
-												ICU-900 Fix some compiler warnings.

X-SVN-Rev: 9307
											
										
										
											2002-07-24 14:16:31 +00:00
+								    void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
-												ICU-900 Fixed some compiler warnings.

X-SVN-Rev: 9342
											
										
										
											2002-07-25 18:32:04 +00:00
+								    void             checkResults(const char *heading, RBBITest *test);
 								    void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								    void             clearResults();
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
+								};
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								//
 								// Constructor.
 								//
 								BITestData::BITestData(UErrorCode &status)
 								: fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
 								  fActualTags(status)
 								{
-												ICU-4288 Mostly fixes for --enable-strict for gcc 3.4 (Fedora Core 3)

X-SVN-Rev: 17040
											
										
										
											2004-12-30 07:25:51 +00:00
+								}
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								//
 								// addDataChunk.   Add a section (non-breaking) piece if data to the test data.
 								//                 The macro form collects the line number, which is helpful
 								//                 when tracking down failures.
 								//
 								//                 A null data item is inserted at the start of each test's data
 								//                  to put the starting zero into the data list.  The position saved for
 								//                  each non-null item is its ending position.
 								//
 								#define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
-												ICU-900 Fix some compiler warnings.

X-SVN-Rev: 9307
											
										
										
											2002-07-24 14:16:31 +00:00
+								void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								    if (U_FAILURE(status)) {return;}
 								    if (data != NULL) {
 								        fDataToBreak.append(CharsToUnicodeString(data));
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								    }
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								    fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
 								    fExpectedTags.addElement(tag, status);
 								    fLineNum.addElement(lineNum, status);
-												ICU-4288 Mostly fixes for --enable-strict for gcc 3.4 (Fedora Core 3)

X-SVN-Rev: 17040
											
										
										
											2004-12-30 07:25:51 +00:00
+								}
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								//
 								//  checkResults.   Compare the actual and expected break positions, report any differences.
 								//
-												ICU-900 Fixed some compiler warnings.

X-SVN-Rev: 9342
											
										
										
											2002-07-25 18:32:04 +00:00
+								void BITestData::checkResults(const char *heading, RBBITest *test) {
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								    int32_t   expectedIndex = 0;
 								    int32_t   actualIndex = 0;
 								    for (;;) {
 								        // If we've run through both the expected and actual results vectors, we're done.
 								        //   break out of the loop.
 								        if (expectedIndex >= fExpectedBreakPositions.size() &&
 								            actualIndex   >= fActualBreakPositions.size()) {
 								            break;
 								        }
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
 								        if (expectedIndex >= fExpectedBreakPositions.size()) {
 								            err(heading, test, expectedIndex-1, actualIndex);
 								            actualIndex++;
 								            continue;
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
+								        }
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								        if (actualIndex >= fActualBreakPositions.size()) {
 								            err(heading, test, expectedIndex, actualIndex-1);
 								            expectedIndex++;
 								            continue;
 								        }
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								        if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
 								            err(heading, test, expectedIndex, actualIndex);
 								            // Try to resync the positions of the indices, to avoid a rash of spurious erros.
 								            if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
 								                actualIndex++;
 								            } else {
 								                expectedIndex++;
 								            }
 								            continue;
 								        }
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								        if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								            test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								                heading, fLineNum.elementAt(expectedIndex),
 								                fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
 								        }
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								        actualIndex++;
 								        expectedIndex++;
 								    }
 								}
 								//
 								//  err   -  An error was found.  Report it, along with information about where the
 								//                                incorrectly broken test data appeared in the source file.
 								//
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								{
 								    int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
 								    int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
 								    int32_t   o        = 0;
-												ICU-2231 RBBI  Sentence Break Rules and test updated to match draft of TR 29

X-SVN-Rev: 9823
											
										
										
											2002-08-30 21:37:59 +00:00
+								    int32_t   line     = fLineNum.elementAti(expectedIdx);
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								    if (expectedIdx > 0) {
 								        // The line numbers are off by one because a premature break occurs somewhere
 								        //    within the previous item, rather than at the start of the current (expected) item.
-												ICU-2231 RBBI  Sentence Break Rules and test updated to match draft of TR 29

X-SVN-Rev: 9823
											
										
										
											2002-08-30 21:37:59 +00:00
+								        //    We want to report the offset of the unexpected break from the start of
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								        //      this previous item.
 								        o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
 								    }
 								    if (actual < expected) {
 								        test->errln("%s unexpected break at offset %d in test item from line %d", heading, o, line);
 								    } else {
 								        test->errln("%s Failed to find break at end of item from line %d", heading, line);
 								    }
 								}
 								//
 								//  clearResults.   Discard the actual break positions and tags.
 								//                  The test data and the expected values are left in place.
 								//
 								void BITestData::clearResults()
 								{
 								    fActualTags.removeAllElements();
 								    fActualBreakPositions.removeAllElements();
 								}
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
+								//-----------------------------------------------------------------------------------
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								//
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
+								//    Cannned Test Characters
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								//
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
+								//-----------------------------------------------------------------------------------
 								static const UChar cannedTestArray[] = {
 x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
 x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
 x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
+x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
 x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
 x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
 x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
 								};
 								static UnicodeString* cannedTestChars = 0;
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
 								// Escaped Devanagari sequences used as building blocks for test strings.
 								//   half*  : a consonant (U+0928, U+0938, U+091A, U+0915) followed by
 								//            U+094D (virama) and U+200D (zero width joiner).
 								//   deadTA : the consonant U+0924 followed by U+094D only.
 								// The doubled backslashes keep each \u escape as literal text in the C string;
 								// presumably it is unescaped later, e.g. by CharsToUnicodeString() -- confirm at the use sites.
 								#define  halfNA     "\\u0928\\u094d\\u200d"
 								#define  halfSA     "\\u0938\\u094d\\u200d"
 								#define  halfCHA    "\\u091a\\u094d\\u200d"
 								#define  halfKA     "\\u0915\\u094d\\u200d"
 								#define  deadTA     "\\u0924\\u094d"
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
+								//--------------------------------------------------------------------------------------
 								//
 								//    RBBITest    constructor and destructor
 								//
 								//--------------------------------------------------------------------------------------
 								RBBITest::RBBITest() {
 								    UnicodeString temp(cannedTestArray);
 								    cannedTestChars = new UnicodeString();
 								    *cannedTestChars += (UChar)0x0000;
 								    *cannedTestChars += temp;
 								}
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
+								RBBITest::~RBBITest() {
 								    delete cannedTestChars;
 								}
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
-												ICU-865 Removed duplicate test run

X-SVN-Rev: 4048
											
										
										
											2001-03-13 03:39:45 +00:00
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
// Tag constants.
// NOTE(review): presumably the break rule status (tag) values expected by the
//               word break tests -- confirm against the places they are used.
static const int
    T_NUMBER = 100,
    T_LETTER = 200,
    T_H_OR_K = 300,
    T_IDEO   = 400;
-												ICU-2342 LineBreak rules, fix problem with Greek, Cyrillic

X-SVN-Rev: 9952
											
										
										
											2002-10-03 17:53:15 +00:00
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
-												ICU-2342 LineBreak rules, fix problem with Greek, Cyrillic

X-SVN-Rev: 9952
											
										
										
											2002-10-03 17:53:15 +00:00
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
+								//--------------------------------------------------------------------
 								//Testing the BreakIterator for devanagari script
 								//--------------------------------------------------------------------
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
 								// Escaped Devanagari sequences used as building blocks for test strings.
 								//   dead*   : a consonant followed by U+094D (virama).
 								//   visarga : the single character U+0903.
 								// The doubled backslashes keep each \u escape as literal text in the C string;
 								// presumably it is unescaped later, e.g. by CharsToUnicodeString() -- confirm at the use sites.
 								#define deadRA   "\\u0930\\u094d"         /*deadform RA = devanagari RA + virama*/
 								#define deadPHA  "\\u092b\\u094d"         /*deadform PHA = devanagari PHA + virama*/
 								#define deadTTHA "\\u0920\\u094d"
 								#define deadPA   "\\u092a\\u094d"
 								#define deadSA   "\\u0938\\u094d"
 								#define visarga  "\\u0903"                /*devanagari visarga looks like a english colon*/
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
-												ICU-1126 Add title break iterator

X-SVN-Rev: 7801
											
										
										
											2002-02-28 01:28:04 +00:00
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
-												ICU-1126 Add title break iterator

X-SVN-Rev: 7801
											
										
										
											2002-02-28 01:28:04 +00:00
-												ICU-1117 add getRuleStatus() to RBBI

X-SVN-Rev: 8956
											
										
										
											2002-06-27 01:50:22 +00:00
 								//-----------------------------------------------------------------------------------
 								//
 								//   Test for status {tag} return value from break rules.
 								//        TODO:  a more thorough test.
 								//
 								//-----------------------------------------------------------------------------------
 								void RBBITest::TestStatusReturn() {
 								     UnicodeString rulesString1 = "$Letters = [:L:];\n"
 								                                  "$Numbers = [:N:];\n"
 								                                  "$Letters+{1};\n"
 								                                  "$Numbers+{2};\n"
 								                                  "Help\\ {4}/me\\!;\n"
 								                                  "[^$Letters $Numbers];\n"
 								                                  "!.*;\n";
 								     UnicodeString testString1  = "abc123..abc Help me Help me!";
 								                                // 01234567890123456789012345678
 								     int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
 								     int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
 								     UErrorCode status=U_ZERO_ERROR;
 								     UParseError    parseError;
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
-												ICU-1117 add getRuleStatus() to RBBI

X-SVN-Rev: 8956
											
										
										
											2002-06-27 01:50:22 +00:00
+								     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
 								     if(U_FAILURE(status)) {
 								         errln("FAIL : in construction");
 								     } else {
 								         int32_t  pos;
 								         int32_t  i = 0;
 								         bi->setText(testString1);
 								         for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
 								             if (pos != bounds1[i]) {
 								                 errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
 								                 break;
 								             }
 								             int tag = bi->getRuleStatus();
 								             if (tag != brkStatus[i]) {
 								                 errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
 								                 break;
 								             }
 								             i++;
 								         }
 								     }
 								     delete bi;
 								}
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								static void printStringBreaks(UnicodeString ustr, int expected[],
 								                              int expectedcount)
 								{
 								    UErrorCode status = U_ZERO_ERROR;
 								    char name[100];
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								    printf("code    alpha extend alphanum type word sent line name\n");
-												ICU-2292 missing 'int i' on MSVC.  for (int i;...   is not portable

X-SVN-Rev: 13600
											
										
										
											2003-11-06 04:38:11 +00:00
+								    int j;
 								    for (j = 0; j < ustr.length(); j ++) {
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								        if (expectedcount > 0) {
-												ICU-2292 missing 'int i' on MSVC.  for (int i;...   is not portable

X-SVN-Rev: 13600
											
										
										
											2003-11-06 04:38:11 +00:00
+								            int k;
 								            for (k = 0; k < expectedcount; k ++) {
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								                if (j == expected[k]) {
 								                    printf("------------------------------------------------ %d\n",
 								                           j);
 								                }
 								            }
 								        }
 								        UChar32 c = ustr.char32At(j);
 								        if (c > 0xffff) {
 								            j ++;
 								        }
 								        u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								                           u_isUAlphabetic(c),
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								                           u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								                           u_isalnum(c),
 								                           u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
 								                                                  u_charType(c),
 								                                                  U_SHORT_PROPERTY_NAME),
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								                           u_getPropertyValueName(UCHAR_WORD_BREAK,
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								                                                  u_getIntPropertyValue(c,
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								                                                          UCHAR_WORD_BREAK),
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								                                                  U_SHORT_PROPERTY_NAME),
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								                           u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
 								                                   u_getIntPropertyValue(c,
 								                                           UCHAR_SENTENCE_BREAK),
 								                                   U_SHORT_PROPERTY_NAME),
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								                           u_getPropertyValueName(UCHAR_LINE_BREAK,
 								                                   u_getIntPropertyValue(c,
 								                                           UCHAR_LINE_BREAK),
 								                                   U_SHORT_PROPERTY_NAME),
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								                           name);
 								    }
 								}
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
+								void RBBITest::TestThaiLineBreak() {
 								    // Line-break test for Thai-only text.  The expected chunks are registered
 								    // with ADD_DATACHUNK, then generalIteratorTest() is run with a line-break
 								    // iterator created for Locale("th").
 								    UErrorCode status = U_ZERO_ERROR;
 								    BITestData thaiLineSelection(status);
 								    // \u0e2f-- the Thai paiyannoi character-- isn't a letter.  It's a symbol that
 								    // represents elided letters at the end of a long word.  It should be bound to
 								    // the end of the word and not treated as an independent punctuation mark.
 								    ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
 								//        ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
 								//        ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
 								    // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
 								    // the one time where the paiyannoi occurs somewhere other than at the end
 								    // of a word is in the Thai abbreviation for "etc.", which both begins and
 								    // ends with a paiyannoi
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
 								    // The factory returns a BreakIterator*; it is cast to RuleBasedBreakIterator*
 								    // because generalIteratorTest() is called with *e below.
 								    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								        Locale("th"), status);
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
 								    // Without an iterator nothing can be checked: report the failure and stop.
+								    if (U_FAILURE(status))
 								    {
 								        errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n");
 								        return;
 								    }
 								    // Hand the iterator and the expected chunks to the shared helper.
 								    generalIteratorTest(*e, thaiLineSelection);
-												ICU-2129 Intltest, fix memory leak in revised rbbitest.

X-SVN-Rev: 9811
											
										
										
											2002-08-28 22:10:32 +00:00
 								    // The iterator obtained from createLineInstance() is released here.
+								    delete e;
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
+								}
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								void RBBITest::TestMixedThaiLineBreak()
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
+								{
 								    // Line-break test for Thai text mixed with ASCII digits, Thai digits,
 								    // punctuation and Latin letters.  The expected chunks are registered with
 								    // ADD_DATACHUNK, then generalIteratorTest() is run with a line-break
 								    // iterator created for Locale("th").
 								    UErrorCode   status = U_ZERO_ERROR;
 								    BITestData   thaiLineSelection(status);
 								    ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-4216 updated Thai break dictionary from thai7.ucs.

X-SVN-Rev: 17201
											
										
										
											2005-02-15 19:44:09 +00:00
 								    // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								    // start
-												ICU-4216 updated Thai break dictionary from thai7.ucs.

X-SVN-Rev: 17201
											
										
										
											2005-02-15 19:44:09 +00:00
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								    ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
-												ICU-4216 updated Thai break dictionary from thai7.ucs.

X-SVN-Rev: 17201
											
										
										
											2005-02-15 19:44:09 +00:00
 								    // Several chunks below end with a space inside the string literal, so the
 								    // trailing space is part of the expected chunk.
+								    ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								    ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
-												ICU-4216 updated Thai break dictionary from thai7.ucs.

X-SVN-Rev: 17201
											
										
										
											2005-02-15 19:44:09 +00:00
+								    ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
-												ICU-4216 updated Thai break dictionary from thai7.ucs.

X-SVN-Rev: 17201
											
										
										
											2005-02-15 19:44:09 +00:00
+								    // @suwit - end of changes
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
 								    // The factory returns a BreakIterator*; it is cast to RuleBasedBreakIterator*
 								    // because generalIteratorTest() is called with *e below.
+								    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
 								    // Without an iterator nothing can be checked: report the failure and stop.
+								    if (U_FAILURE(status))
 								    {
 								        errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n");
 								        return;
 								    }
 								    // Hand the iterator and the expected chunks to the shared helper.
 								    generalIteratorTest(*e, thaiLineSelection);
-												ICU-2129 Intltest, fix memory leak in revised rbbitest.

X-SVN-Rev: 9811
											
										
										
											2002-08-28 22:10:32 +00:00
 								    // The iterator obtained from createLineInstance() is released here.
+								    delete e;
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
+								}
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								void RBBITest::TestMaiyamok()
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
+								{
 								    // Line-break test for Thai text containing the maiyamok repetition mark
 								    // (\u0e46).  The expected chunks are registered with ADD_DATACHUNK, then
 								    // generalIteratorTest() is run with a line-break iterator created for
 								    // Locale("th").
 								    UErrorCode status = U_ZERO_ERROR;
 								    BITestData   thaiLineSelection(status);
 								    ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
 								    // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
 								    // word".  Instead of appearing as a word unto itself, however, it's kept together
 								    // with the word before it
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
-												ICU-5117 Thai break should work in all locales

X-SVN-Rev: 19408
											
										
										
											2006-03-23 00:54:12 +00:00
+								    ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
+								    ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
-												ICU-5117 Thai break should work in all locales

X-SVN-Rev: 19408
											
										
										
											2006-03-23 00:54:12 +00:00
+								    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);
 								    ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
+								    ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
 								    // The factory returns a BreakIterator*; it is cast to RuleBasedBreakIterator*
 								    // because generalIteratorTest() is called with *e below.
 								    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								        Locale("th"), status);
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
 								    // Without an iterator nothing can be checked: report the failure and stop.
 								    if (U_FAILURE(status))
 								    {
 								        errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n");
 								        return;
 								    }
 								    // Hand the iterator and the expected chunks to the shared helper.
 								    generalIteratorTest(*e, thaiLineSelection);
 								    // The iterator obtained from createLineInstance() is released here.
 								    delete e;
 								}
-												ICU-3818 Fix for Thai Dictionary Break Iterator following(1) failure.

X-SVN-Rev: 15815
											
										
										
											2004-06-09 18:12:01 +00:00
+								void RBBITest::TestBug3818() {
 								    // Regression test for ICU-3818: with a Thai word iterator, following(1) and
 								    // following(0) must both report the boundary at offset 4, i.e. the start of
 								    // the second four-code-unit word in the test string.
 								    UErrorCode  status = U_ZERO_ERROR;
 								    // Four Thai words...
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
 								    // The same four-code-unit word repeated four times, NUL terminated.
+								    static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
 								                                           0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
-												ICU-3818 Fix for Thai Dictionary Break Iterator following(1) failure.

X-SVN-Rev: 15815
											
										
										
											2004-06-09 18:12:01 +00:00
+								    UnicodeString  thaiStr(thaiWordData);
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								    RuleBasedBreakIterator* bi =
-												ICU-3818 Fix for Thai Dictionary Break Iterator following(1) failure.

X-SVN-Rev: 15815
											
										
										
											2004-06-09 18:12:01 +00:00
+								        (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
 								    // Nothing can be checked without an iterator: report and stop.
 								    if (U_FAILURE(status) || bi == NULL) {
 								        errln("Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
 								        return;
 								    }
 								    bi->setText(thaiStr);
 								    // Starting inside the first word (offset 1), the next boundary must be 4.
 								    int32_t  startOfSecondWord = bi->following(1);
 								    if (startOfSecondWord != 4) {
 								        errln("Fail at file %s, line %d expected start of word at 4, got %d",
 								            __FILE__, __LINE__, startOfSecondWord);
 								    }
 								    // Starting at the very beginning (offset 0) must give the same boundary.
 								    startOfSecondWord = bi->following(0);
 								    if (startOfSecondWord != 4) {
 								        errln("Fail at file %s, line %d expected start of word at 4, got %d",
 								            __FILE__, __LINE__, startOfSecondWord);
 								    }
-												ICU-3862 Fix a memory leak

X-SVN-Rev: 15930
											
										
										
											2004-06-21 17:41:27 +00:00
 								    // The iterator obtained from createWordInstance() is released here.
+								    delete bi;
-												ICU-3818 Fix for Thai Dictionary Break Iterator following(1) failure.

X-SVN-Rev: 15815
											
										
										
											2004-06-09 18:12:01 +00:00
+								}
-												ICU-3561 Locale-based text boundaries

X-SVN-Rev: 16582
											
										
										
											2004-10-21 01:03:01 +00:00
 								void RBBITest::TestJapaneseWordBreak() {
 								    UErrorCode status = U_ZERO_ERROR;
 								    BITestData   japaneseWordSelection(status);
 								    ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status);           // Break at start of data
 								    ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
 								    ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
 								    ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
 								    ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
 								    ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
 								    ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
 								    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
 								        Locale("ja"), status);
 								    if (U_FAILURE(status))
 								    {
 								        errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
 								        return;
 								    }
 								    generalIteratorTest(*e, japaneseWordSelection);
 								    delete e;
 								}
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								void RBBITest::TestTrieDict() {
 								    UErrorCode      status  = U_ZERO_ERROR;
 								    //
 								    //  Open and read the test data file.
 								    //
 								    const char *testDataDirectory = IntlTest::getSourceTestData(status);
 								    char testFileName[1000];
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								    if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								        errln("Can't open test data.  Path too long.");
 								        return;
 								    }
 								    strcpy(testFileName, testDataDirectory);
 								    strcat(testFileName, "riwords.txt");
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								    // Items needing deleting at the end
 								    MutableTrieDictionary *mutableDict = NULL;
 								    CompactTrieDictionary *compactDict = NULL;
 								    UnicodeSet            *breaks      = NULL;
 								    UChar                 *testFile    = NULL;
-												ICU-5410 Improve code coverage.

X-SVN-Rev: 21736
											
										
										
											2007-06-14 19:48:47 +00:00
+								    StringEnumeration     *enumer1     = NULL;
 								    StringEnumeration     *enumer2     = NULL;
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								    MutableTrieDictionary *mutable2    = NULL;
 								    StringEnumeration     *cloneEnum   = NULL;
 								    CompactTrieDictionary *compact2    = NULL;
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								    const UnicodeString *originalWord = NULL;
 								    const UnicodeString *cloneWord    = NULL;
 								    UChar *current;
 								    UChar *word;
 								    UChar uc;
 								    int32_t wordLen;
 								    int32_t wordCount;
 								    int32_t testCount;
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    int    len;
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								    testFile = ReadAndConvertFile(testFileName, len, NULL, status);
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    if (U_FAILURE(status)) {
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								        goto cleanup; /* something went wrong, error already output */
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    }
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
 								    mutableDict = new MutableTrieDictionary(0x0E1C, status);
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    if (U_FAILURE(status)) {
 								        errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								        goto cleanup;
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								    breaks = new UnicodeSet;
 								    breaks->add(0x000A);     // Line Feed
 								    breaks->add(0x000D);     // Carriage Return
 								    breaks->add(0x2028);     // Line Separator
 								    breaks->add(0x2029);     // Paragraph Separator
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
 								    // Now add each non-comment line of the file as a word.
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								    current = testFile;
 								    word = current;
 								    uc = *current++;
 								    wordLen = 0;
 								    wordCount = 0;
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    while (uc) {
 								        if (uc == 0x0023) {     // #comment line, skip
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								            while (uc && !breaks->contains(uc)) {
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								                uc = *current++;
 								            }
 								        }
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								        else while (uc && !breaks->contains(uc)) {
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								            ++wordLen;
 								            uc = *current++;
 								        }
 								        if (wordLen > 0) {
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								            mutableDict->addWord(word, wordLen, status);
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								            if (U_FAILURE(status)) {
 								                errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								                goto cleanup;
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								            }
 								            wordCount += 1;
 								        }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								        // Find beginning of next line
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								        while (uc && breaks->contains(uc)) {
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								            uc = *current++;
 								        }
 								        word = current-1;
 								        wordLen = 0;
 								    }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    if (wordCount < 50) {
 								        errln("Word count (%d) unreasonably small\n", wordCount);
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								        goto cleanup;
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    }
-												ICU-5410 Improve code coverage.

X-SVN-Rev: 21736
											
										
										
											2007-06-14 19:48:47 +00:00
+								    enumer1 = mutableDict->openWords(status);
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    if (U_FAILURE(status)) {
 								        errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								        goto cleanup;
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    }
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
 								    testCount = 0;
-												ICU-5410 Improve code coverage.

X-SVN-Rev: 21736
											
										
										
											2007-06-14 19:48:47 +00:00
+								    if (wordCount != (testCount = enumer1->count(status))) {
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								        errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
 								            testCount, wordCount, u_errorName(status));
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								        goto cleanup;
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    // Now compact it
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								    compactDict = new CompactTrieDictionary(*mutableDict, status);
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    if (U_FAILURE(status)) {
 								        errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								        goto cleanup;
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5410 Improve code coverage.

X-SVN-Rev: 21736
											
										
										
											2007-06-14 19:48:47 +00:00
+								    enumer2 = compactDict->openWords(status);
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    if (U_FAILURE(status)) {
 								        errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								        goto cleanup;
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5410 Improve code coverage.

X-SVN-Rev: 21736
											
										
										
											2007-06-14 19:48:47 +00:00
+								    if (wordCount != (testCount = enumer2->count(status))) {
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								        errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
 								            testCount, wordCount, u_errorName(status));
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								        goto cleanup;
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5410 Improve code coverage.

X-SVN-Rev: 21736
											
										
										
											2007-06-14 19:48:47 +00:00
+								    if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) {
 								        errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same");
 								    }
 								    delete enumer1;
 								    enumer1 = NULL;
 								    delete enumer2;
 								    enumer2 = NULL;
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    // Now un-compact it
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								    mutable2 = compactDict->cloneMutable(status);
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    if (U_FAILURE(status)) {
 								        errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								        goto cleanup;
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								    cloneEnum = mutable2->openWords(status);
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    if (U_FAILURE(status)) {
 								        errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								        goto cleanup;
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    if (wordCount != (testCount = cloneEnum->count(status))) {
 								        errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
 								            testCount, wordCount, u_errorName(status));
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								        goto cleanup;
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    // Compact original dictionary to clone. Note that we can only compare the same kind of
 								    // dictionary as the order of the enumerators is not guaranteed to be the same between
 								    // different kinds
-												ICU-5410 Improve code coverage.

X-SVN-Rev: 21736
											
										
										
											2007-06-14 19:48:47 +00:00
+								    enumer1 = mutableDict->openWords(status);
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    if (U_FAILURE(status)) {
 								        errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								        goto cleanup;
 								     }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5410 Improve code coverage.

X-SVN-Rev: 21736
											
										
										
											2007-06-14 19:48:47 +00:00
+								    originalWord = enumer1->snext(status);
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								    cloneWord = cloneEnum->snext(status);
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
 								        if (*originalWord != *cloneWord) {
 								            errln("Original and cloned MutableTrieDictionary word mismatch\n");
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								            goto cleanup;
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								        }
-												ICU-5410 Improve code coverage.

X-SVN-Rev: 21736
											
										
										
											2007-06-14 19:48:47 +00:00
+								        originalWord = enumer1->snext(status);
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								        cloneWord = cloneEnum->snext(status);
 								    }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    if (U_FAILURE(status)) {
 								        errln("Enumeration failed: %s\n", u_errorName(status));
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								        goto cleanup;
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    if (originalWord != cloneWord) {
 								        errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								        goto cleanup;
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								    }
-												ICU-5163 Restore const void * constructor, improve test coverage

X-SVN-Rev: 19563
											
										
										
											2006-04-18 22:49:08 +00:00
+								    // Test the data copying constructor for CompactTrieDict, and the data access APIs.
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								    compact2 = new CompactTrieDictionary(compactDict->data(), status);
-												ICU-5163 Restore const void * constructor, improve test coverage

X-SVN-Rev: 19563
											
										
										
											2006-04-18 22:49:08 +00:00
+								    if (U_FAILURE(status)) {
 								        errln("CompactTrieDictionary(const void *,...) failed\n");
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								        goto cleanup;
-												ICU-5163 Restore const void * constructor, improve test coverage

X-SVN-Rev: 19563
											
										
										
											2006-04-18 22:49:08 +00:00
+								    }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								    if (compact2->dataSize() == 0) {
 								        errln("CompactTrieDictionary->dataSize() == 0\n");
 								        goto cleanup;
-												ICU-5163 Restore const void * constructor, improve test coverage

X-SVN-Rev: 19563
											
										
										
											2006-04-18 22:49:08 +00:00
+								    }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5163 Restore const void * constructor, improve test coverage

X-SVN-Rev: 19563
											
										
										
											2006-04-18 22:49:08 +00:00
+								    // Now count the words via the second dictionary
-												ICU-5410 Improve code coverage.

X-SVN-Rev: 21736
											
										
										
											2007-06-14 19:48:47 +00:00
+								    delete enumer1;
 								    enumer1 = compact2->openWords(status);
-												ICU-5163 Restore const void * constructor, improve test coverage

X-SVN-Rev: 19563
											
										
										
											2006-04-18 22:49:08 +00:00
+								    if (U_FAILURE(status)) {
 								        errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								        goto cleanup;
-												ICU-5163 Restore const void * constructor, improve test coverage

X-SVN-Rev: 19563
											
										
										
											2006-04-18 22:49:08 +00:00
+								    }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5410 Improve code coverage.

X-SVN-Rev: 21736
											
										
										
											2007-06-14 19:48:47 +00:00
+								    if (wordCount != (testCount = enumer1->count(status))) {
-												ICU-5163 Restore const void * constructor, improve test coverage

X-SVN-Rev: 19563
											
										
										
											2006-04-18 22:49:08 +00:00
+								        errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
 								            testCount, wordCount, u_errorName(status));
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								        goto cleanup;
-												ICU-5163 Restore const void * constructor, improve test coverage

X-SVN-Rev: 19563
											
										
										
											2006-04-18 22:49:08 +00:00
+								    }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								cleanup:
 								    delete compactDict;
 								    delete mutableDict;
 								    delete breaks;
 								    delete[] testFile;
-												ICU-5410 Improve code coverage.

X-SVN-Rev: 21736
											
										
										
											2007-06-14 19:48:47 +00:00
+								    delete enumer1;
-												ICU-5170 RBBI Dictionary test, moved stack objects to heap so Purify can better
check for out-of-bounds writes.  In search of an elusive error.

X-SVN-Rev: 19581
											
										
										
											2006-04-24 17:31:44 +00:00
+								    delete mutable2;
 								    delete cloneEnum;
 								    delete compact2;
-												ICU-5136 Add test of MutableTrieDictionary and CompactTrieDictionary

X-SVN-Rev: 19473
											
										
										
											2006-03-29 23:33:02 +00:00
+								}
-												ICU-865 Removed duplicate test run

X-SVN-Rev: 4048
											
										
										
											2001-03-13 03:39:45 +00:00
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								//----------------------------------------------------------------------------
 								//
 								// generalIteratorTest      Given a break iterator and a set of test data,
 								//                          Run the tests and report the results.
 								//
 								//----------------------------------------------------------------------------
 								void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
+								{
-												ICU-535 fixed some compiler warnings

X-SVN-Rev: 2226
											
										
										
											2000-08-14 21:42:36 +00:00
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								    bi.setText(td.fDataToBreak);
-												ICU-535 fixed some compiler warnings

X-SVN-Rev: 2226
											
										
										
											2000-08-14 21:42:36 +00:00
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								    testFirstAndNext(bi, td);
-												ICU-535 fixed some compiler warnings

X-SVN-Rev: 2226
											
										
										
											2000-08-14 21:42:36 +00:00
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								    testLastAndPrevious(bi, td);
-												ICU-535 fixed some compiler warnings

X-SVN-Rev: 2226
											
										
										
											2000-08-14 21:42:36 +00:00
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								    testFollowing(bi, td);
 								    testPreceding(bi, td);
 								    testIsBoundary(bi, td);
 								    doMultipleSelectionTest(bi, td);
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
+								}
-												ICU-535 fixed some compiler warnings

X-SVN-Rev: 2226
											
										
										
											2000-08-14 21:42:36 +00:00
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								//
 								//   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
 								//                       kind of loop.
 								//
 								void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
 								{
 								    UErrorCode  status = U_ZERO_ERROR;
 								    int32_t     p;
 								    int32_t     lastP = -1;
 								    int32_t     tag;
 								    logln("Test first and next");
 								    bi.setText(td.fDataToBreak);
 								    td.clearResults();
 								    for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
 								        td.fActualBreakPositions.addElement(p, status);  // Save result.
 								        tag = bi.getRuleStatus();
 								        td.fActualTags.addElement(tag, status);
 								        if (p <= lastP) {
 								            // If the iterator is not making forward progress, stop.
 								            //  No need to raise an error here, it'll be detected in the normal check of results.
 								            break;
-												ICU-535 fixed some compiler warnings

X-SVN-Rev: 2226
											
										
										
											2000-08-14 21:42:36 +00:00
+								        }
 								        lastP = p;
 								    }
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								    td.checkResults("testFirstAndNext", this);
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
+								}
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
 								//
 								//  TestLastAndPrevious.   Run the iterator backwards, starting with last().
 								//
 								void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
+								{
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								    UErrorCode  status = U_ZERO_ERROR;
 								    int32_t     p;
 								    int32_t     lastP  = 0x7ffffffe;
 								    int32_t     tag;
 								    logln("Test first and next");
 								    bi.setText(td.fDataToBreak);
 								    td.clearResults();
 								    for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
 								        // Save break position.  Insert it at start of vector of results, shoving
 								        //    already-saved results further towards the end.
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								        td.fActualBreakPositions.insertElementAt(p, 0, status);
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								        // bi.previous();   // TODO:  Why does this fix things up????
 								        // bi.next();
 								        tag = bi.getRuleStatus();
 								        td.fActualTags.insertElementAt(tag, 0, status);
 								        if (p >= lastP) {
 								            // If the iterator is not making progress, stop.
 								            //  No need to raise an error here, it'll be detected in the normal check of results.
 								            break;
-												ICU-535 fixed some compiler warnings

X-SVN-Rev: 2226
											
										
										
											2000-08-14 21:42:36 +00:00
+								        }
 								        lastP = p;
 								    }
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								    td.checkResults("testLastAndPrevious", this);
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
+								}
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
 								void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
+								{
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								    UErrorCode  status = U_ZERO_ERROR;
 								    int32_t     p;
 								    int32_t     tag;
-												ICU-2129 Intltest, fix problem with break iteration test of a zero lenght string.

X-SVN-Rev: 9794
											
										
										
											2002-08-27 20:28:05 +00:00
+								    int32_t     lastP  = -2;     // A value that will never be returned as a break position.
 								                                 //   cannot be -1; that is returned for DONE.
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								    int         i;
 								    logln("testFollowing():");
 								    bi.setText(td.fDataToBreak);
 								    td.clearResults();
 								    // Save the starting point, since we won't get that out of following.
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								    p = bi.first();
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								    td.fActualBreakPositions.addElement(p, status);  // Save result.
 								    tag = bi.getRuleStatus();
 								    td.fActualTags.addElement(tag, status);
 								    for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
 								        p = bi.following(i);
 								        if (p != lastP) {
 								            if (p == RuleBasedBreakIterator::DONE) {
 								                break;
-												ICU-535 fixed some compiler warnings

X-SVN-Rev: 2226
											
										
										
											2000-08-14 21:42:36 +00:00
+								            }
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								            // We've reached a new break position.  Save it.
 								            td.fActualBreakPositions.addElement(p, status);  // Save result.
 								            tag = bi.getRuleStatus();
 								            td.fActualTags.addElement(tag, status);
 								            lastP = p;
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
+								        }
-												ICU-535 fixed some compiler warnings

X-SVN-Rev: 2226
											
										
										
											2000-08-14 21:42:36 +00:00
+								    }
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
+								    // The loop normally exits by means of the break in the middle.
 								    // Make sure that the index was at the correct position for the break iterator to have
 								    //   returned DONE.
 								    if (i != td.fDataToBreak.length()) {
 								        errln("testFollowing():  iterator returned DONE prematurely.");
-												ICU-535 fixed some compiler warnings

X-SVN-Rev: 2226
											
										
										
											2000-08-14 21:42:36 +00:00
+								    }
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
 								    // Full check of all results.
 								    td.checkResults("testFollowing", this);
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
+								}
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
 								//
 								// testPreceding
 								//   Calls preceding() for every offset from the end of the text down to -1 and
 								//   records each distinct boundary (with its rule status tag) that is returned.
 								//   The collected positions and tags are then checked against the data in td.
 								//
 								void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
 								    UErrorCode  status       = U_ZERO_ERROR;
 								    int32_t     prevBoundary = 0x7ffffffe;   // Sentinel: no boundary seen yet.
 								    int32_t     boundary;
 								    int         offset;

 								    logln("testPreceding():");
 								    bi.setText(td.fDataToBreak);
 								    td.clearResults();

 								    // Seed the results with the last boundary; preceding() only reports
 								    //   boundaries strictly before the offset it is given.
 								    boundary = bi.last();
 								    td.fActualBreakPositions.addElement(boundary, status);
 								    td.fActualTags.addElement(bi.getRuleStatus(), status);

 								    for (offset = td.fDataToBreak.length(); offset >= -1; offset--) {
 								        boundary = bi.preceding(offset);
 								        if (boundary == prevBoundary) {
 								            // Same boundary as for the previous offset; nothing new to record.
 								            continue;
 								        }
 								        if (boundary == RuleBasedBreakIterator::DONE) {
 								            break;
 								        }

 								        // A new boundary.  Results are built back to front, so insert at index 0.
 								        td.fActualBreakPositions.insertElementAt(boundary, 0, status);
 								        prevBoundary = boundary;
 								        td.fActualTags.insertElementAt(bi.getRuleStatus(), 0, status);
 								    }

 								    // The loop is expected to end through the break above, with the offset at
 								    //   the position where the iterator should have returned DONE.
 								    if (offset != 0) {
 								        errln("testPreceding():  iterator returned DONE prematurely.");
 								    }

 								    // Full check of all results.
 								    td.checkResults("testPreceding", this);
 								}
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
 								//
 								// testIsBoundary
 								//   Asks isBoundary() about every offset from 0 through the length of the text,
 								//   records each offset reported as a boundary together with its rule status
 								//   tag, and checks the collected results against the data in td.
 								//
 								void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
 								    UErrorCode  status = U_ZERO_ERROR;
 								    int         offset;

 								    logln("testIsBoundary():");
 								    bi.setText(td.fDataToBreak);
 								    td.clearResults();

 								    const int32_t textLength = td.fDataToBreak.length();
 								    for (offset = 0; offset <= textLength; offset++) {
 								        if (!bi.isBoundary(offset)) {
 								            continue;
 								        }
 								        td.fActualBreakPositions.addElement(offset, status);  // Save result.
 								        td.fActualTags.addElement(bi.getRuleStatus(), status);
 								    }

 								    td.checkResults("testIsBoundary: ", this);
 								}
-												ICU-45 RBBI, getRuleStatus() works after previous().
More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
											
										
										
											2002-07-22 22:02:08 +00:00
 								//
 								// doMultipleSelectionTest
 								//   Steps "iterator" through the text one boundary at a time with next() /
 								//   previous() and verifies that a clone, repositioned with first()/last() and
 								//   then moved with next(n), lands on the same boundary at every step.
 								//   Also checks operator== and operator!= between the iterator and its clone.
 								//
 								//   The clone is owned by this function and is released on every exit path.
 								//
 								void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
 								{
 								    iterator.setText(td.fDataToBreak);

 								    RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
 								    if (testIterator == NULL) {
 								        errln("doMultipleSelectionTest: clone() returned NULL.");
 								        return;
 								    }
 								    int32_t offset = iterator.first();
 								    int32_t testOffset;
 								    int32_t count = 0;

 								    logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());

 								    if (*testIterator != iterator)
 								        errln("clone() or operator!= failed: two clones compared unequal");

 								    // Forward pass: "count" single steps of next() must match one next(count).
 								    do {
 								        testOffset = testIterator->first();
 								        testOffset = testIterator->next(count);
 								        if (offset != testOffset)
 								            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);

 								        if (offset != RuleBasedBreakIterator::DONE) {
 								            count++;
 								            offset = iterator.next();

 								            // The two iterators now sit on different boundaries and must not compare equal.
 								            if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
 								                errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
 								                if (count > 10000 || offset == -1) {
 								                    // Safety valve so that a mis-exported DONE constant cannot
 								                    //   turn this loop into an infinite one.
 								                    errln("operator== failed too many times. Stopping test.");
 								                    if (offset == -1) {
 								                        errln("Does (RuleBasedBreakIterator::DONE == -1)?");
 								                    }
 								                    delete testIterator;   // Do not leak the clone on the early exit.
 								                    return;
 								                }
 								            }
 								        }
 								    } while (offset != RuleBasedBreakIterator::DONE);

 								    // now do it backwards...
 								    offset = iterator.last();
 								    count = 0;

 								    do {
 								        testOffset = testIterator->last();
 								        testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
 								        if (offset != testOffset)
 								            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);

 								        if (offset != RuleBasedBreakIterator::DONE) {
 								            count--;
 								            offset = iterator.previous();
 								        }
 								    } while (offset != RuleBasedBreakIterator::DONE);

 								    delete testIterator;
 								}
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
+								//---------------------------------------------
 								//
 								//     other tests
 								//
 								//---------------------------------------------
 								void RBBITest::TestEmptyString()
 								{
 								    UnicodeString text = "";
 								    UErrorCode status = U_ZERO_ERROR;
 								    BITestData x(status);
 								    ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
 								    RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
 								    if (U_FAILURE(status))
 								    {
 								        errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n");
 								        return;
 								    }
 								    generalIteratorTest(*bi, x);
 								    delete bi;
 								}
 								void RBBITest::TestGetAvailableLocales()
 								{
 								    int32_t locCount = 0;
 								    const Locale* locList = BreakIterator::getAvailableLocales(locCount);
 								    if (locCount == 0)
 								        errln("getAvailableLocales() returned an empty list!");
 								    // Just make sure that it's returning good memory.
-												ICU-2292 missing 'int i' on MSVC.  for (int i;...   is not portable

X-SVN-Rev: 13600
											
										
										
											2003-11-06 04:38:11 +00:00
+								    int32_t i;
 								    for (i = 0; i < locCount; ++i) {
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
+								        logln(locList[i].getName());
 								    }
 								}
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								//Testing the BreakIterator::getDisplayName() function
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
+								void RBBITest::TestGetDisplayName()
 								{
 								    UnicodeString   result;
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
+								    BreakIterator::getDisplayName(Locale::getUS(), result);
 								    if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
 								        errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
 								                + result);
 								    BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
 								    if (result != "French (France)")
 								        errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
 								                + result);
 								}
 								/**
 								 * Test End Behaviour
 								 * Iterates a word break iterator over "boo." and checks the boundaries
 								 * reported at the start, before the period, and at the end of the string.
 								 * @bug 4068137
 								 */
 								void RBBITest::TestEndBehaviour()
 								{
 								    UErrorCode status = U_ZERO_ERROR;
 								    UnicodeString testString("boo.");

 								    BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
 								    if (U_FAILURE(status))
 								    {
 								        errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n");
 								        return;
 								    }
 								    wb->setText(testString);

 								    if (wb->first() != 0) {
 								        errln("Didn't get break at beginning of string.");
 								    }
 								    if (wb->next() != 3) {
 								        errln("Didn't get break before period in \"boo.\"");
 								    }
 								    // Only advance again when the iterator is not already at the end.
 								    if (wb->current() != 4) {
 								        if (wb->next() != 4) {
 								            errln("Didn't get break at end of string.");
 								        }
 								    }

 								    delete wb;
 								}
 								/*
 								 * @bug 4153072
 								 */
 								void RBBITest::TestBug4153072() {
 								    UErrorCode status = U_ZERO_ERROR;
 								    BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
 								    if (U_FAILURE(status))
 								    {
 								        errln("Failed to create the BreakIterator for default locale in TestBug4153072\n");
 								        return;
 								    }
 								    UnicodeString str("...Hello, World!...");
 								    int32_t begin = 3;
 								    int32_t end = str.length() - 3;
-												ICU-5170 move RBBI from CharacterIterator to UText

X-SVN-Rev: 19579
											
										
										
											2006-04-22 05:29:27 +00:00
+								    UBool onBoundary;
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
 								    StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
 								    iter->adoptText(textIterator);
-												ICU-2292 missing 'int i' on MSVC.  for (int i;...   is not portable

X-SVN-Rev: 13600
											
										
										
											2003-11-06 04:38:11 +00:00
+								    int index;
-												ICU-5170 move RBBI from CharacterIterator to UText

X-SVN-Rev: 19579
											
										
										
											2006-04-22 05:29:27 +00:00
+								    // Note: with the switch to UText, there is no way to restrict the
 								    //       iteration range to begin at an index other than zero.
 								    //       String character iterators created with a non-zero bound are
 								    //         treated by RBBI as being empty.
-												ICU-2292 missing 'int i' on MSVC.  for (int i;...   is not portable

X-SVN-Rev: 13600
											
										
										
											2003-11-06 04:38:11 +00:00
+								    for (index = -1; index < begin + 1; ++index) {
-												ICU-5170 move RBBI from CharacterIterator to UText

X-SVN-Rev: 19579
											
										
										
											2006-04-22 05:29:27 +00:00
+								        onBoundary = iter->isBoundary(index);
 								        if (index == 0?  !onBoundary : onBoundary) {
 								            errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
+								                            " and begin index = " + begin);
 								        }
 								    }
 								    delete iter;
 								}
-												ICU-5775 add test case for reported RBBI problem

X-SVN-Rev: 21934
											
										
										
											2007-07-10 18:11:43 +00:00
+								//
 								// Test for problem reported by Ashok Matoria on 9 July 2007
 								//    One.<kSoftHyphen><kSpace>Two.
 								//
 								//    Sentence break at start (0) and then on calling next() it breaks at
 								//   ‘T’ of “Two”. Now, at this point if I do next() and
 								//    then previous(), it breaks at <kSOftHyphen> instead of ‘T’ of Two.
 								//
 								void RBBITest::TestBug5775() {
 								    UErrorCode status = U_ZERO_ERROR;
 								    BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
 								    TEST_ASSERT_SUCCESS(status);
 								    TEST_ASSERT(bi != NULL);
 								    UnicodeString s("One.\\u00ad Two.");
 								    //               01234      56789
 								    s = s.unescape();
 								    bi->setText(s);
 								    int pos = bi->next();
 								    TEST_ASSERT(pos == 6);
 								    pos = bi->next();
 								    TEST_ASSERT(pos == 10);
 								    pos = bi->previous();
 								    TEST_ASSERT(pos == 6);
 								    delete bi;
 								}
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
+								/**
 								 * Test Japanese Line Break
 								 * @bug 4095322
 								 */
 								void RBBITest::TestJapaneseLineBreak()
 								{
 								#if 0
 								    // Test needs updating some more...   Dump it for now.
 								    // Everything up to the matching #endif is compiled out, so this test
 								    //   currently does nothing.

 								    // Change for Unicode TR 14:  Punctuation characters with categories Pi and Pf do not count
 								    //        as opening and closing punctuation for line breaking.
 								    //        Also, \u30fc and \u30fe are not counted as hyphens.   Remove these chars
 								    //        from these tests.    6-13-2002
 								    //
 								    UErrorCode status = U_ZERO_ERROR;

 								    // Three characters; the middle one is replaced by each character under test.
 								    UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");

 								    // Characters expected to allow a break before them but not after them.
 								    UnicodeString precedingChars = CharsToUnicodeString(
 								        //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
 								        "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");

 								    // Characters expected to allow a break after them but not before them.
 								    UnicodeString followingChars = CharsToUnicodeString(
 								        // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
 								        ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
 								        // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
 								        ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
 								        "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");

 								    BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);
 								    if (U_FAILURE(status))
 								    {
 								        errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
 								        return;
 								    }

 								    int32_t idx;
 								    int32_t pos;

 								    // Expected boundaries for each "preceding" character: 0, 1, 3.
 								    for (idx = 0; idx < precedingChars.length(); idx++) {
 								        const UChar ch = precedingChars[idx];
 								        testString.setCharAt(1, ch);
 								        iter->setText(testString);

 								        pos = iter->first();
 								        if (pos != 0) {
 								            errln("ja line break failure: failed to start at 0");
 								        }
 								        pos = iter->next();
 								        if (pos != 1) {
 								            errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(ch)
 								                        + "' (" + ((int)(ch)) + ")");
 								        }
 								        pos = iter->next();
 								        if (pos != 3) {
 								            errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(ch)
 								                        + "' (" + ((int)(ch)) + ")");
 								        }
 								    }

 								    // Expected boundaries for each "following" character: 0, 2, 3.
 								    for (idx = 0; idx < followingChars.length(); idx++) {
 								        const UChar ch = followingChars[idx];
 								        testString.setCharAt(1, ch);
 								        iter->setText(testString);

 								        pos = iter->first();
 								        if (pos != 0) {
 								            errln("ja line break failure: failed to start at 0");
 								        }
 								        pos = iter->next();
 								        if (pos != 2) {
 								            errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(ch)
 								                        + "' (" + ((int)(ch)) + ")");
 								        }
 								        pos = iter->next();
 								        if (pos != 3) {
 								            errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(ch)
 								                        + "' (" + ((int)(ch)) + ")");
 								        }
 								    }

 								    delete iter;
 								#endif
 								}
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								//------------------------------------------------------------------------------
 								//
 								//   RBBITest::Extended    Run  RBBI Tests from an external test data file
 								//
 								//------------------------------------------------------------------------------
 								// Parameters for one test case run by RBBITest::executeTest().
 								// The three vectors are indexed by position in dataToBreak.
 								struct TestParams {
 								    // Iterator under test.  executeTest() returns without doing anything when
 								    //   this is NULL, and casts it to RuleBasedBreakIterator to read the rule status.
 								    BreakIterator   *bi;
 								    // Text that is set on the iterator before iterating.
 								    UnicodeString    dataToBreak;
 								    // Per position: 0 means no break is expected there.  Any other value means a
 								    //   break is expected and gives the expected rule status tag, with -1 standing
 								    //   for an expected tag value of 0.
 								    UVector32       *expectedBreaks;
 								    // Per position: the line number reported as "File line" in failure messages.
 								    UVector32       *srcLine;
 								    // Per position: the column number reported as "col" in failure messages.
 								    UVector32       *srcCol;
 								};
 								void RBBITest::executeTest(TestParams *t) {
 								    int32_t    bp;
 								    int32_t    prevBP;
 								    int32_t    i;
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								    if (t->bi == NULL) {
 								        return;
 								    }
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								    t->bi->setText(t->dataToBreak);
 								    //
 								    //  Run the iterator forward
 								    //
 								    prevBP = -1;
 								    for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
 								        if (prevBP ==  bp) {
 								            // Fail for lack of forward progress.
 								            errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
 								                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
 								            break;
 								        }
 								        // Check that there were we didn't miss an expected break between the last one
 								        //  and this one.
 								        for (i=prevBP+1; i<bp; i++) {
 								            if (t->expectedBreaks->elementAti(i) != 0) {
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								                int expected[] = {0, i};
 								                printStringBreaks(t->dataToBreak, expected, 2);
 								                errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
 								            }
 								        }
 								        // Check that the break we did find was expected
 								        if (t->expectedBreaks->elementAti(bp) == 0) {
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								            int expected[] = {0, bp};
 								            printStringBreaks(t->dataToBreak, expected, 2);
 								            errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
 								        } else {
 								            // The break was expected.
 								            //   Check that the {nnn} tag value is correct.
 								            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
 								            if (expectedTagVal == -1) {
 								                expectedTagVal = 0;
 								            }
-												ICU-5170 move RBBI from CharacterIterator to UText

X-SVN-Rev: 19579
											
										
										
											2006-04-22 05:29:27 +00:00
+								            int32_t line = t->srcLine->elementAti(bp);
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
 								            if (rs != expectedTagVal) {
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								                errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								                      "          Actual, Expected status = %4d, %4d",
-												ICU-5170 move RBBI from CharacterIterator to UText

X-SVN-Rev: 19579
											
										
										
											2006-04-22 05:29:27 +00:00
+								                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								            }
 								        }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								        prevBP = bp;
 								    }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								    // Verify that there were no missed expected breaks after the last one found
 								    for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
 								        if (t->expectedBreaks->elementAti(i) != 0) {
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								            errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
 								        }
 								    }
 								    //
 								    //  Run the iterator backwards, verify that the same breaks are found.
 								    //
 								    prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
 								    for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
 								        if (prevBP ==  bp) {
 								            // Fail for lack of progress.
 								            errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
 								                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
 								            break;
 								        }
 								        // Check that there were we didn't miss an expected break between the last one
 								        //  and this one.  (UVector returns zeros for index out of bounds.)
 								        for (i=prevBP-1; i>bp; i--) {
 								            if (t->expectedBreaks->elementAti(i) != 0) {
 								                errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 								                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
 								            }
 								        }
 								        // Check that the break we did find was expected
 								        if (t->expectedBreaks->elementAti(bp) == 0) {
 								            errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
 								                   bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
 								        } else {
 								            // The break was expected.
 								            //   Check that the {nnn} tag value is correct.
 								            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
 								            if (expectedTagVal == -1) {
 								                expectedTagVal = 0;
 								            }
-												ICU-5170 move RBBI from CharacterIterator to UText

X-SVN-Rev: 19579
											
										
										
											2006-04-22 05:29:27 +00:00
+								            int line = t->srcLine->elementAti(bp);
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
 								            if (rs != expectedTagVal) {
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								                errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								                      "          Actual, Expected status = %4d, %4d",
-												ICU-5170 compiler warning cleanup

X-SVN-Rev: 19596
											
										
										
											2006-04-26 04:01:00 +00:00
+								                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								            }
 								        }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								        prevBP = bp;
 								    }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								    // Verify that there were no missed breaks prior to the last one found
 								    for (i=prevBP-1; i>=0; i--) {
 								        if (t->expectedBreaks->elementAti(i) != 0) {
 								            errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 								                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
 								        }
 								    }
 								}
 								void RBBITest::TestExtended() {
-												ICU-5282 Fix problems found by uconfigtest.

X-SVN-Rev: 19922
											
										
										
											2006-07-28 22:58:29 +00:00
+								#if !UCONFIG_NO_REGULAR_EXPRESSIONS
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								    UErrorCode      status  = U_ZERO_ERROR;
-												ICU-4325 Allow tests to work in the en_US_POSIX and th_TH locales

X-SVN-Rev: 17071
											
										
										
											2005-01-01 21:55:07 +00:00
+								    Locale          locale("");
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
 								    UnicodeString       rules;
 								    TestParams          tp;
 								    tp.bi             = NULL;
 								    tp.expectedBreaks = new UVector32(status);
 								    tp.srcLine        = new UVector32(status);
 								    tp.srcCol         = new UVector32(status);
-												ICU-3671 Add locale option for data driven tests.  Move Thai word test data from code to the data file

X-SVN-Rev: 18455
											
										
										
											2005-08-19 01:25:17 +00:00
+								    RegexMatcher      localeMatcher("<locale *([\\p{L}\\p{Nd}_]*) *>", 0, status);
 								    TEST_ASSERT_SUCCESS(status);
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
 								    //
 								    //  Open and read the test data file.
 								    //
-												ICU-4043 Use a real path to source data.

X-SVN-Rev: 16109
											
										
										
											2004-08-04 23:40:31 +00:00
+								    const char *testDataDirectory = IntlTest::getSourceTestData(status);
-												ICU-2093 intltest rbbitest, remove dependency on regexp

X-SVN-Rev: 11990
											
										
										
											2003-05-17 02:07:52 +00:00
+								    char testFileName[1000];
-												ICU-4043 Use a real path to source data.

X-SVN-Rev: 16109
											
										
										
											2004-08-04 23:40:31 +00:00
+								    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
-												ICU-2093 intltest rbbitest, remove dependency on regexp

X-SVN-Rev: 11990
											
										
										
											2003-05-17 02:07:52 +00:00
+								        errln("Can't open test data.  Path too long.");
 								        return;
 								    }
 								    strcpy(testFileName, testDataDirectory);
-												ICU-4043 Use a real path to source data.

X-SVN-Rev: 16109
											
										
										
											2004-08-04 23:40:31 +00:00
+								    strcat(testFileName, "rbbitst.txt");
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								    int    len;
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
-												ICU-3389 Don't crash if file can't be found.

X-SVN-Rev: 15751
											
										
										
											2004-06-07 05:29:50 +00:00
+								    if (U_FAILURE(status)) {
 								        return; /* something went wrong, error already output */
 								    }
-												ICU-2093 intltest rbbitest, remove dependency on regexp

X-SVN-Rev: 11990
											
										
										
											2003-05-17 02:07:52 +00:00
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								    //
 								    //  Put the test data into a UnicodeString
 								    //
 								    UnicodeString testString(FALSE, testFile, len);
 								    enum EParseState{
 								        PARSE_COMMENT,
 								        PARSE_TAG,
 								        PARSE_DATA,
 								        PARSE_NUM
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								    }
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								    parseState = PARSE_TAG;
 								    EParseState savedState = PARSE_TAG;
-												ICU-3222 Fix some compiler warnings

X-SVN-Rev: 13927
											
										
										
											2003-12-02 01:34:21 +00:00
+								    static const UChar CH_LF        = 0x0a;
 								    static const UChar CH_CR        = 0x0d;
 								    static const UChar CH_HASH      = 0x23;
 								    /*static const UChar CH_PERIOD    = 0x2e;*/
 								    static const UChar CH_LT        = 0x3c;
 								    static const UChar CH_GT        = 0x3e;
 								    static const UChar CH_BACKSLASH = 0x5c;
 								    static const UChar CH_BULLET    = 0x2022;
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								    int32_t    lineNum  = 1;
 								    int32_t    colStart = 0;
 								    int32_t    column   = 0;
 								    int32_t    charIdx  = 0;
 								    int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
 								    for (charIdx = 0; charIdx < len; ) {
-												ICU-3671 Add locale option for data driven tests.  Move Thai word test data from code to the data file

X-SVN-Rev: 18455
											
										
										
											2005-08-19 01:25:17 +00:00
+								        status = U_ZERO_ERROR;
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								        UChar  c = testString.charAt(charIdx);
 								        charIdx++;
 								        if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
 								            // treat CRLF as a unit
 								            c = CH_LF;
 								            charIdx++;
 								        }
 								        if (c == CH_LF || c == CH_CR) {
 								            lineNum++;
 								            colStart = charIdx;
 								        }
 								        column = charIdx - colStart + 1;
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								        switch (parseState) {
 								        case PARSE_COMMENT:
 								            if (c == 0x0a || c == 0x0d) {
 								                parseState = savedState;
 								            }
 								            break;
 								        case PARSE_TAG:
 								            {
 								            if (c == CH_HASH) {
 								                parseState = PARSE_COMMENT;
 								                savedState = PARSE_TAG;
 								                break;
 								            }
 								            if (u_isUWhiteSpace(c)) {
 								                break;
 								            }
 								            if (testString.compare(charIdx-1, 6, "<word>") == 0) {
 								                delete tp.bi;
 								                tp.bi = BreakIterator::createWordInstance(locale,  status);
 								                charIdx += 5;
 								                break;
 								            }
 								            if (testString.compare(charIdx-1, 6, "<char>") == 0) {
 								                delete tp.bi;
 								                tp.bi = BreakIterator::createCharacterInstance(locale,  status);
 								                charIdx += 5;
 								                break;
 								            }
 								            if (testString.compare(charIdx-1, 6, "<line>") == 0) {
 								                delete tp.bi;
 								                tp.bi = BreakIterator::createLineInstance(locale,  status);
 								                charIdx += 5;
 								                break;
 								            }
 								            if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
 								                delete tp.bi;
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								                tp.bi = NULL;
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								                tp.bi = BreakIterator::createSentenceInstance(locale,  status);
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								                charIdx += 5;
 								                break;
 								            }
 								            if (testString.compare(charIdx-1, 7, "<title>") == 0) {
 								                delete tp.bi;
 								                tp.bi = BreakIterator::createTitleInstance(locale,  status);
 								                charIdx += 6;
 								                break;
 								            }
-												ICU-5766 Extended Grapheme Clusters for ICU4C

X-SVN-Rev: 21933
											
										
										
											2007-07-10 01:25:26 +00:00
+								            if (testString.compare(charIdx-1, 5, "<xgc>") == 0) {
 								                delete tp.bi;
 								                tp.bi = BreakIterator::createXGraphemeClusterInstance(locale,  status);
 								                charIdx += 4;
 								                break;
 								            }
-												ICU-3671 Add locale option for data driven tests.  Move Thai word test data from code to the data file

X-SVN-Rev: 18455
											
										
										
											2005-08-19 01:25:17 +00:00
+								            // <locale  loc_name>
 								            localeMatcher.reset(testString);
 								            if (localeMatcher.lookingAt(charIdx-1, status)) {
 								                UnicodeString localeName = localeMatcher.group(1, status);
 								                char localeName8[100];
 								                localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
 								                locale = Locale::createFromName(localeName8);
 								                charIdx += localeMatcher.group(0, status).length();
 								                TEST_ASSERT_SUCCESS(status);
 								                break;
 								            }
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								            if (testString.compare(charIdx-1, 6, "<data>") == 0) {
 								                parseState = PARSE_DATA;
 								                charIdx += 5;
 								                tp.dataToBreak = "";
 								                tp.expectedBreaks->removeAllElements();
 								                tp.srcCol ->removeAllElements();
 								                tp.srcLine->removeAllElements();
 								                break;
 								            }
 								            errln("line %d: Tag expected in test file.", lineNum);
 								            parseState = PARSE_COMMENT;
 								            savedState = PARSE_DATA;
-												ICU-5445 Fix some compiler warnings.

X-SVN-Rev: 22127
											
										
										
											2007-07-24 21:21:49 +00:00
+								            goto end_test; // Stop the test.
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								            }
 								            break;
 								        case PARSE_DATA:
 								            if (c == CH_BULLET) {
 								                int32_t  breakIdx = tp.dataToBreak.length();
 								                tp.expectedBreaks->setSize(breakIdx+1);
 								                tp.expectedBreaks->setElementAt(-1, breakIdx);
 								                tp.srcLine->setSize(breakIdx+1);
 								                tp.srcLine->setElementAt(lineNum, breakIdx);
 								                tp.srcCol ->setSize(breakIdx+1);
 								                tp.srcCol ->setElementAt(column, breakIdx);
 								                break;
 								            }
 								            if (testString.compare(charIdx-1, 7, "</data>") == 0) {
 								                // Add final entry to mappings from break location to source file position.
 								                //  Need one extra because last break position returned is after the
 								                //    last char in the data, not at the last char.
 								                tp.srcLine->addElement(lineNum, status);
 								                tp.srcCol ->addElement(column, status);
 								                parseState = PARSE_TAG;
-												ICU-3671 Add locale option for data driven tests.  Move Thai word test data from code to the data file

X-SVN-Rev: 18455
											
										
										
											2005-08-19 01:25:17 +00:00
+								                charIdx += 6;
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
 								                // RUN THE TEST!
 								                executeTest(&tp);
 								                break;
 								            }
-												ICU-2093 Updated tests

X-SVN-Rev: 11999
											
										
										
											2003-05-19 03:16:45 +00:00
+								            if (testString.compare(charIdx-1, 3, "\\N{") == 0) {
 								                // Named character, e.g. \N{COMBINING GRAVE ACCENT}
 								                // Get the code point from the name and insert it into the test data.
 								                //   (Damn, no API takes names in Unicode  !!!
 								                //    we've got to take it back to char *)
 								                int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
 								                int32_t nameLength = nameEndIdx - (charIdx+2);
 								                char charNameBuf[200];
 								                UChar32 theChar = -1;
 								                if (nameEndIdx != -1) {
 								                    UErrorCode status = U_ZERO_ERROR;
 								                    testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
-												ICU-2093 Update LineBreak tests (work in progress).
Fix array index out of bounds in rbbitest

X-SVN-Rev: 12071
											
										
										
											2003-05-23 07:11:14 +00:00
+								                    charNameBuf[sizeof(charNameBuf)-1] = 0;
-												ICU-2093 Updated tests

X-SVN-Rev: 11999
											
										
										
											2003-05-19 03:16:45 +00:00
+								                    theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
 								                    if (U_FAILURE(status)) {
 								                        theChar = -1;
 								                    }
 								                }
 								                if (theChar == -1) {
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								                    errln("Error in named character in test file at line %d, col %d",
-												ICU-2093 Updated tests

X-SVN-Rev: 11999
											
										
										
											2003-05-19 03:16:45 +00:00
+								                        lineNum, column);
 								                } else {
 								                    // Named code point was recognized.  Insert it
 								                    //   into the test data.
 								                    tp.dataToBreak.append(theChar);
 								                    while (tp.dataToBreak.length() > tp.srcLine->size()) {
 								                        tp.srcLine->addElement(lineNum, status);
 								                        tp.srcCol ->addElement(column, status);
 								                    }
 								                }
 								                if (nameEndIdx > charIdx) {
 								                    charIdx = nameEndIdx+1;
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
-												ICU-2093 Updated tests

X-SVN-Rev: 11999
											
										
										
											2003-05-19 03:16:45 +00:00
+								                }
 								                break;
 								            }
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								            if (testString.compare(charIdx-1, 2, "<>") == 0) {
 								                charIdx++;
 								                int32_t  breakIdx = tp.dataToBreak.length();
 								                tp.expectedBreaks->setSize(breakIdx+1);
 								                tp.expectedBreaks->setElementAt(-1, breakIdx);
 								                tp.srcLine->setSize(breakIdx+1);
 								                tp.srcLine->setElementAt(lineNum, breakIdx);
 								                tp.srcCol ->setSize(breakIdx+1);
 								                tp.srcCol ->setElementAt(column, breakIdx);
 								                break;
 								            }
 								            if (c == CH_LT) {
 								                tagValue   = 0;
 								                parseState = PARSE_NUM;
 								                break;
 								            }
 								            if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
 								                parseState = PARSE_COMMENT;
 								                savedState = PARSE_DATA;
 								                break;
 								            }
 								            if (c == CH_BACKSLASH) {
 								                // Check for \ at end of line, a line continuation.
 								                //     Advance over (discard) the newline
 								                UChar32 cp = testString.char32At(charIdx);
 								                if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
 								                    // We have a CR LF
 								                    //  Need an extra increment of the input ptr to move over both of them
 								                    charIdx++;
 								                }
 								                if (cp == CH_LF || cp == CH_CR) {
 								                    lineNum++;
 								                    colStart = charIdx;
 								                    charIdx++;
 								                    break;
 								                }
 								                // Let unescape handle the back slash.
 								                cp = testString.unescapeAt(charIdx);
 								                if (cp != -1) {
 								                    // Escape sequence was recognized.  Insert the char
 								                    //   into the test data.
 								                    tp.dataToBreak.append(cp);
 								                    while (tp.dataToBreak.length() > tp.srcLine->size()) {
 								                        tp.srcLine->addElement(lineNum, status);
 								                        tp.srcCol ->addElement(column, status);
 								                    }
 								                    break;
 								                }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
 								                // Not a recognized backslash escape sequence.
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								                // Take the next char as a literal.
 								                //  TODO:  Should this be an error?
 								                c = testString.charAt(charIdx);
 								                charIdx = testString.moveIndex32(charIdx, 1);
 								            }
 								            // Normal, non-escaped data char.
 								            tp.dataToBreak.append(c);
 								            // Save the mapping from offset in the data to line/column numbers in
 								            //   the original input file.  Will be used for better error messages only.
 								            //   If there's an expected break before this char, the slot in the mapping
 								            //     vector will already be set for this char; don't overwrite it.
 								            if (tp.dataToBreak.length() > tp.srcLine->size()) {
 								                tp.srcLine->addElement(lineNum, status);
 								                tp.srcCol ->addElement(column, status);
 								            }
 								            break;
 								        case PARSE_NUM:
 								            // We are parsing an expected numeric tag value, like <1234>,
 								            //   within a chunk of data.
 								            if (u_isUWhiteSpace(c)) {
 								                break;
 								            }
 								            if (c == CH_GT) {
 								                // Finished the number.  Add the info to the expected break data,
 								                //   and switch parse state back to doing plain data.
 								                parseState = PARSE_DATA;
 								                if (tagValue == 0) {
 								                    tagValue = -1;
 								                }
 								                int32_t  breakIdx = tp.dataToBreak.length();
 								                tp.expectedBreaks->setSize(breakIdx+1);
 								                tp.expectedBreaks->setElementAt(tagValue, breakIdx);
 								                tp.srcLine->setSize(breakIdx+1);
 								                tp.srcLine->setElementAt(lineNum, breakIdx);
 								                tp.srcCol ->setSize(breakIdx+1);
 								                tp.srcCol ->setElementAt(column, breakIdx);
 								                break;
 								            }
 								            if (u_isdigit(c)) {
 								                tagValue = tagValue*10 + u_charDigitValue(c);
 								                break;
 								            }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								            errln("Syntax Error in test file at line %d, col %d",
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								                lineNum, column);
 								            parseState = PARSE_COMMENT;
-												ICU-5445 Fix some compiler warnings.

X-SVN-Rev: 22127
											
										
										
											2007-07-24 21:21:49 +00:00
+								            goto end_test; // Stop the test
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								            break;
 								        }
 								        if (U_FAILURE(status)) {
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								            errln("ICU Error %s while parsing test file at line %d.",
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								                u_errorName(status), lineNum);
 								            status = U_ZERO_ERROR;
-												ICU-5445 Fix some compiler warnings.

X-SVN-Rev: 22127
											
										
										
											2007-07-24 21:21:49 +00:00
+								            goto end_test; // Stop the test
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								        }
 								    }
-												ICU-2840 tests shouldn't crash if there is no data

X-SVN-Rev: 12340
											
										
										
											2003-06-06 04:54:34 +00:00
+								end_test:
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								    delete tp.bi;
 								    delete tp.expectedBreaks;
 								    delete tp.srcLine;
 								    delete tp.srcCol;
 								    delete [] testFile;
-												ICU-5282 Fix problems found by uconfigtest.

X-SVN-Rev: 19922
											
										
										
											2006-07-28 22:58:29 +00:00
+								#endif
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								}
 								//-------------------------------------------------------------------------------
 								//
 								//    ReadAndConvertFile   Read a text data file, convert it to UChars, and
 								//    return the data in one big UChar * buffer, which the caller must delete.
 								//
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								//    parameters:
 								//          fileName:   the path of the file to read.  It is passed to fopen() as is,
 								//                      so the caller supplies any directory part.
 								//          ulen        an out parameter, receives the actual length (in UChars) of the file data.
 								//          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
 								//                      specified here.  The BOM, if it exists, will be stripped from the returned data.
 								//                      Pass NULL for the system default encoding.
 								//          status
 								//    returns:
 								//                      The file data, converted to UChar.
 								//                      The caller must delete this when done with
 								//                           delete [] theBuffer;
 								//
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								//    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
 								//           Move this function to some common place.
 								//
 								//--------------------------------------------------------------------------------
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								    UChar       *retPtr  = NULL;
 								    char        *fileBuf = NULL;
 								    UConverter* conv     = NULL;
 								    FILE        *f       = NULL;
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								    ulen = 0;
 								    if (U_FAILURE(status)) {
 								        return retPtr;
 								    }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								    //
 								    //  Open the file.
 								    //
 								    f = fopen(fileName, "rb");
 								    if (f == 0) {
 								        errln("Error opening test data file %s\n", fileName);
-												ICU-3389 Don't crash if file can't be found.

X-SVN-Rev: 15751
											
										
										
											2004-06-07 05:29:50 +00:00
+								        status = U_FILE_ACCESS_ERROR;
 								        return NULL;
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								    }
 								    //
 								    //  Read it in
 								    //
 								    int   fileSize;
 								    int   amt_read;
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								    fseek( f, 0, SEEK_END);
 								    fileSize = ftell(f);
 								    fileBuf = new char[fileSize];
 								    fseek(f, 0, SEEK_SET);
 								    amt_read = fread(fileBuf, 1, fileSize, f);
 								    if (amt_read != fileSize || fileSize <= 0) {
 								        errln("Error reading test data file.");
 								        goto cleanUpAndReturn;
 								    }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								    //
 								    // Look for a Unicode Signature (BOM) on the data just read
 								    //
 								    int32_t        signatureLength;
 								    const char *   fileBufC;
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								    const char*    bomEncoding;
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								    fileBufC = fileBuf;
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								    bomEncoding = ucnv_detectUnicodeSignature(
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								        fileBuf, fileSize, &signatureLength, &status);
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								    if(bomEncoding!=NULL ){
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								        fileBufC  += signatureLength;
 								        fileSize  -= signatureLength;
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								        encoding = bomEncoding;
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								    }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								    //
 								    // Open a converter to take the rule file to UTF-16
 								    //
 								    conv = ucnv_open(encoding, &status);
 								    if (U_FAILURE(status)) {
 								        goto cleanUpAndReturn;
 								    }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								    //
 								    // Convert the rules to UChar.
 								    //  Preflight first to determine required buffer size.
 								    //
 								    ulen = ucnv_toUChars(conv,
 								        NULL,           //  dest,
 ,              //  destCapacity,
 								        fileBufC,
 								        fileSize,
 								        &status);
 								    if (status == U_BUFFER_OVERFLOW_ERROR) {
 								        // Buffer Overflow is expected from the preflight operation.
 								        status = U_ZERO_ERROR;
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								        retPtr = new UChar[ulen+1];
 								        ucnv_toUChars(conv,
 								            retPtr,       //  dest,
 								            ulen+1,
 								            fileBufC,
 								            fileSize,
 								            &status);
 								    }
 								cleanUpAndReturn:
 								    fclose(f);
-												ICU-5161 Properly delete memory, and make sure there is enough space to unescape a string.

X-SVN-Rev: 19555
											
										
										
											2006-04-16 17:28:00 +00:00
+								    delete []fileBuf;
-												ICU-2093 rbbi rules and tests updated

X-SVN-Rev: 11974
											
										
										
											2003-05-16 22:05:35 +00:00
+								    ucnv_close(conv);
 								    if (U_FAILURE(status)) {
 								        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
 								        delete retPtr;
 								        retPtr = 0;
 								        ulen   = 0;
 								    };
 								    return retPtr;
 								}
-												ICU-2129 Intltest, remove textbounds test, merge the non-redundant tests
into rbbitest.  Many tests were replicated in both places.

X-SVN-Rev: 9793
											
										
										
											2002-08-27 19:10:11 +00:00
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								//--------------------------------------------------------------------------------------------
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
+								//
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								//   Run tests from each of the boundary test data files distributed by the Unicode Consortium
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
+								//
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								//-------------------------------------------------------------------------------------------
 								void RBBITest::TestUnicodeFiles() {
 								    RuleBasedBreakIterator  *bi;
 								    UErrorCode               status = U_ZERO_ERROR;
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								    bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getDefault(), status);
 								    TEST_ASSERT_SUCCESS(status);
 								    if (U_SUCCESS(status)) {
 								        runUnicodeTestData("GraphemeBreakTest.txt", bi);
 								    }
 								    delete bi;
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								    bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getDefault(), status);
 								    TEST_ASSERT_SUCCESS(status);
 								    if (U_SUCCESS(status)) {
 								        runUnicodeTestData("WordBreakTest.txt", bi);
 								    }
 								    delete bi;
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								    bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
 								    TEST_ASSERT_SUCCESS(status);
 								    if (U_SUCCESS(status)) {
 								        runUnicodeTestData("SentenceBreakTest.txt", bi);
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
+								    }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								    delete bi;
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								    #if 0
 								    bi =  (RuleBasedBreakIterator *)BreakIterator::createCharInstance(Locale::getDefault(), status);
 								    TEST_ASSERT_SUCCESS(status);
 								    if (U_SUCCESS(status)) {
 								        runUnicodeTestData("LBTest.txt", bi);
 								    }
 								    delete bi;
 								    #endif
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								}
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								//--------------------------------------------------------------------------------------------
 								//
 								//   Run tests from one of the boundary test data files distributed by the Unicode Consortium
 								//
 								//-------------------------------------------------------------------------------------------
 								void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
 								#if !UCONFIG_NO_REGULAR_EXPRESSIONS
 								    UErrorCode  status = U_ZERO_ERROR;
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								    //
 								    //  Open and read the test data file, put it into a UnicodeString.
 								    //
 								    const char *testDataDirectory = IntlTest::getSourceTestData(status);
 								    char testFileName[1000];
 								    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
 								        errln("Can't open test data.  Path too long.");
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
+								        return;
 								    }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								    strcpy(testFileName, testDataDirectory);
 								    strcat(testFileName, fileName);
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								    int    len;
 								    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
 								    TEST_ASSERT_SUCCESS(status);
 								    TEST_ASSERT(testFile != NULL);
 								    if (U_FAILURE(status) || testFile == NULL) {
 								        return; /* something went wrong, error already output */
 								    }
 								    UnicodeString testFileAsString(TRUE, testFile, len);
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								    //
 								    //  Parse the test data file using a regular expression.
 								    //  Each kind of token is recognized in its own capture group; what type of item was scanned
 								    //     is identified by which group had a match.
 								    //
 								    //       Caputure Group #                  1          2            3            4           5
 								    //       Parses this item:               divide       x       hex digits   comment & nl   unrecognized
 								    //
-												ICU-5722 Temporarily comment out failing tests, fix compiler warning.

X-SVN-Rev: 21608
											
										
										
											2007-05-31 16:31:49 +00:00
+								    UnicodeString tokenExpr = "(?ms)\\s*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|(#.*?$.)|(.*?$.))";
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								    RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, 0, status);
 								    UnicodeString   testString;
 								    UVector32       breakPositions(status);
 								    int             lineNumber = 1;
 								    int             charIndex  = 0;
 								    TEST_ASSERT_SUCCESS(status);
 								    if (U_FAILURE(status)) {
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
+								        return;
 								    }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								    //
 								    //  Scan through each test case, building up the string to be broken in testString,
 								    //   and the positions that should be boundaries in the breakPositions vector.
 								    //
 								    while (tokenMatcher.lookingAt(charIndex, status)) {
 								        if (tokenMatcher.start(1, status) >= 0) {
 								            // Scanned a divide sign, indicating a break position in the test data.
 								            if (testString.length()>0) {
 								                breakPositions.addElement(testString.length(), status);
 								            }
 								        }
 								        else if (tokenMatcher.start(2, status) >= 0) {
 								            // Scanned an 'x', meaning no break at this position in the test data
 								            //   Nothing to be done here.
 								            }
 								        else if (tokenMatcher.start(3, status) >= 0) {
 								            // Scanned Hex digits.  Convert them to binary, append to the character data string.
 								            const UnicodeString &hexNumber = tokenMatcher.group(3, status);
 								            int length = hexNumber.length();
 								            if (length<=8) {
 								                char buf[10];
 								                hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
 								                UChar32 c = (UChar32)strtol(buf, NULL, 16);
 								                if (c<=0x10ffff) {
 								                    testString.append(c);
 								                } else {
 								                    errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
 								                       fileName, lineNumber);
 								                }
 								            } else {
 								                errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
 								                       fileName, lineNumber);
 								             }
 								        }
 								        else if (tokenMatcher.start(4, status) >= 0) {
 								            // Scanned to end of a line, possibly skipping over a comment in the process.
 								            //   If the line from the file contained test data, run the test now.
 								            //
 								            if (testString.length() > 0) {
 								                checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
+								            }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								            // Clear out this test case.
 								            //    The string and breakPositions vector will be refilled as the next
 								            //       test case is parsed.
 								            testString.remove();
 								            breakPositions.setSize(0);
 								            lineNumber++;
 								        } else {
 								            // Scanner catchall.  Something unrecognized appeared on the line.
 								            char token[16];
 								            UnicodeString uToken = tokenMatcher.group(0, status);
 								            uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
 								            token[sizeof(token)-1] = 0;
 								            errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
 								            // Clean up, in preparation for continuing with the next line.
 								            testString.remove();
 								            breakPositions.setSize(0);
 								            lineNumber++;
 								        }
 								        TEST_ASSERT_SUCCESS(status);
 								        if (U_FAILURE(status)) {
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
+								            break;
 								        }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								        charIndex = tokenMatcher.end(status);
 								    }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								    delete [] testFile;
 								 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
 								}
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								//--------------------------------------------------------------------------------------------
 								//
 								//   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
 								//                            test data files.  Do only a simple, forward-only check -
 								//                            this test is mostly to check that ICU and the Unicode
 								//                            data agree with each other.
 								//
 								//--------------------------------------------------------------------------------------------
 								void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
 								                         const UnicodeString &testString,   // Text data to be broken
 								                         UVector32 *breakPositions,         // Positions where breaks should be found.
 								                         RuleBasedBreakIterator *bi) {
 								    int32_t pos;                 // Break Position in the test string
 								    int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
 								    int32_t expectedPos;         // Expected break position (index into test string)
 								    bi->setText(testString);
 								    pos = bi->first();
 								    pos = bi->next();
 								    while (pos != BreakIterator::DONE) {
 								        if (expectedI >= breakPositions->size()) {
 								            errln("Test file \"%s\", line %d, unexpected break found at position %d",
 								                testFileName, lineNumber, pos);
 								            break;
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
+								        }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								        expectedPos = breakPositions->elementAti(expectedI);
 								        if (pos < expectedPos) {
 								            errln("Test file \"%s\", line %d, unexpected break found at position %d",
 								                testFileName, lineNumber, pos);
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
+								            break;
 								        }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								        if (pos > expectedPos) {
 								            errln("Test file \"%s\", line %d, failed to find break at position %d",
 								                testFileName, lineNumber, expectedPos);
 								            break;
 								        }
 								        pos = bi->next();
 								        expectedI++;
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
+								    }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
 								    if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
 								        errln("Test file \"%s\", line %d, failed to find break at position %d",
 								            testFileName, lineNumber, breakPositions->elementAti(expectedI));
 								    }
 								}
-												ICU-45 RBBI Bug:  fix handling of \uffff in data to be iterated.
Add test for same.
Add test for new line break test data file.

X-SVN-Rev: 9464
											
										
										
											2002-07-31 19:05:33 +00:00
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-45 RBBI tests

X-SVN-Rev: 610
											
										
										
											2000-01-17 20:59:08 +00:00
-												ICU-2896 build without regex (disable monkey test in that situation)

X-SVN-Rev: 12153
											
										
										
											2003-05-29 00:54:50 +00:00
+								#if !UCONFIG_NO_REGULAR_EXPRESSIONS
-												ICU-2093 Monkey test, fixed intermittent failure from uninitialized variable.

X-SVN-Rev: 12122
											
										
										
											2003-05-27 17:59:26 +00:00
+								//---------------------------------------------------------------------------------------
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								//
-												ICU-2093 Monkey test, fixed intermittent failure from uninitialized variable.

X-SVN-Rev: 12122
											
										
										
											2003-05-27 17:59:26 +00:00
+								//   class RBBIMonkeyKind
 								//
 								//      Monkey Test for Break Iteration
 								//      Abstract interface class.   Concrete derived classes independently
 								//      implement the break rules for different iterator types.
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								//
-												ICU-2093 Monkey test, fixed intermittent failure from uninitialized variable.

X-SVN-Rev: 12122
											
										
										
											2003-05-27 17:59:26 +00:00
+								//      The Monkey Test itself doesn't know which type of break iterator it is
 								//      testing, but works purely in terms of the interface defined here.
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								//
-												ICU-2093 Monkey test, fixed intermittent failure from uninitialized variable.

X-SVN-Rev: 12122
											
										
										
											2003-05-27 17:59:26 +00:00
+								//---------------------------------------------------------------------------------------
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								class RBBIMonkeyKind {
 								public:
 								    // Return a UVector of UnicodeSets, representing the character classes used
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								    //   for this type of iterator.
 								    virtual  UVector  *charClasses() = 0;
 								    // Set the test text on which subsequent calls to next() will operate
 								    virtual  void      setText(const UnicodeString &s) = 0;
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
 								    // Find the next break postion, starting from the prev break position, or from zero.
 								    // Return -1 after reaching end of string.
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								    virtual  int32_t   next(int32_t i) = 0;
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
-												ICU-2093 Monkey test, fixed intermittent failure from uninitialized variable.

X-SVN-Rev: 12122
											
										
										
											2003-05-27 17:59:26 +00:00
+								    virtual ~RBBIMonkeyKind();
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								    UErrorCode       deferredStatus;
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
 								protected:
-												ICU-2093 Monkey test, fixed intermittent failure from uninitialized variable.

X-SVN-Rev: 12122
											
										
										
											2003-05-27 17:59:26 +00:00
+								    RBBIMonkeyKind();
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
 								private:
 								};
-												ICU-2093 Monkey test, fixed intermittent failure from uninitialized variable.

X-SVN-Rev: 12122
											
										
										
											2003-05-27 17:59:26 +00:00
+								RBBIMonkeyKind::RBBIMonkeyKind() {
 								    deferredStatus = U_ZERO_ERROR;
 								}
 								// Nothing to release here.
 								RBBIMonkeyKind::~RBBIMonkeyKind()
 								{
 								}
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
 								//----------------------------------------------------------------------------------------
 								//
 								//   Random Numbers.  A stand-in for the standard library's rand() and srand().
 								//                    The library functions are avoided in order to
 								//                      1.  Produce the same results on all platforms.
 								//                      2.  Expose the current seed, making failures easier to reproduce.
 								//
 								//---------------------------------------------------------------------------------------
 								static uint32_t m_seed = 1;

 								static uint32_t m_rand()
 								{
 								    const uint32_t multiplier = 1103515245;
 								    const uint32_t increment  = 12345;

 								    // Advance the seed; the unsigned 32 bit arithmetic wraps around on overflow.
 								    m_seed = m_seed * multiplier + increment;

 								    // The result is the 15 bit field starting at bit 16 of the new seed,
 								    //   the same value as (m_seed/65536) % 32768.
 								    return (m_seed >> 16) & 0x7fff;
 								}
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								//------------------------------------------------------------------------------------------
 								//
-												ICU-2093 Monkey test, fixed intermittent failure from uninitialized variable.

X-SVN-Rev: 12122
											
										
										
											2003-05-27 17:59:26 +00:00
+								//   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
 								//                             of RBBIMonkeyKind.
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								//
 								//------------------------------------------------------------------------------------------
 								class RBBICharMonkey: public RBBIMonkeyKind {
 								public:
 								    RBBICharMonkey();
 								    virtual          ~RBBICharMonkey();
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								    virtual  UVector *charClasses();
 								    virtual  void     setText(const UnicodeString &s);
 								    virtual  int32_t  next(int32_t i);
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								private:
 								    UVector   *fSets;
 								    UnicodeSet  *fCRLFSet;
 								    UnicodeSet  *fControlSet;
 								    UnicodeSet  *fExtendSet;
 								    UnicodeSet  *fHangulSet;
 								    UnicodeSet  *fAnySet;
 								    RegexMatcher  *fMatcher;
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								    const UnicodeString *fText;
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								};
 								// Build the grapheme-cluster matcher and the character-class sets, then
 								// register the sets in fSets in the order
 								//     CRLF, Control, Extend, Hangul, Any.
 								// A single UErrorCode is threaded through every step; if anything failed
 								// the code is saved in deferredStatus for the caller to inspect.
 								RBBICharMonkey::RBBICharMonkey() {
 								    UErrorCode  status = U_ZERO_ERROR;

 								    fText = NULL;

 								    fMatcher = new RegexMatcher("\\X", 0, status);     // Pattern to match a grapheme cluster

 								    fCRLFSet    = new UnicodeSet("[\\r\\n]", status);
 								    fControlSet = new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]-\\p{Grapheme_Extend}]", status);
 								    fExtendSet  = new UnicodeSet("[\\p{Grapheme_Extend}]", status);

 								    // The Hangul class must cover all five syllable types: L, V, T, LV and LVT.
 								    // Leaving out V would keep vowel jamo out of this class entirely.
 								    fHangulSet  = new UnicodeSet(
 								        "[\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=V}\\p{Hangul_Syllable_Type=T}"
 								         "\\p{Hangul_Syllable_Type=LV}\\p{Hangul_Syllable_Type=LVT}]", status);
 								    fAnySet     = new UnicodeSet("[\\u0000-\\U0010ffff]", status);

 								    fSets       = new UVector(status);
 								    fSets->addElement(fCRLFSet,    status);
 								    fSets->addElement(fControlSet, status);
 								    fSets->addElement(fExtendSet,  status);
 								    fSets->addElement(fHangulSet,  status);
 								    fSets->addElement(fAnySet,     status);

 								    if (U_FAILURE(status)) {
 								        deferredStatus = status;
 								    }
 								}
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								void RBBICharMonkey::setText(const UnicodeString &s) {
 								    // Re-target the matcher at the new text and remember the string's
 								    // address.  Only a pointer is kept, so no copy of s is made here.
 								    fMatcher->reset(s);
 								    fText = &s;
 								}
 								// Search for a "\X" match starting at index i and return the index just
 								// past it.  Returns -1 when no match is found or when the matcher reports
 								// an error.
 								int32_t RBBICharMonkey::next(int32_t i) {
 								    UErrorCode ec = U_ZERO_ERROR;

 								    if (!fMatcher->find(i, ec)) {
 								        return -1;
 								    }

 								    const int32_t matchEnd = fMatcher->end(ec);
 								    return U_FAILURE(ec) ? -1 : matchEnd;
 								}
 								// Hand back the vector of character-class sets built by the constructor.
 								// The vector is the object's own member, not a copy.
 								UVector *RBBICharMonkey::charClasses()
 								{
 								    return this->fSets;
 								}
 								// Release everything the constructor allocated: the matcher, the vector
 								// of sets, and each individual set.  fText is not deleted; it only points
 								// at the string handed to setText().
 								RBBICharMonkey::~RBBICharMonkey()
 								{
 								    delete fMatcher;

 								    delete fSets;

 								    delete fCRLFSet;
 								    delete fControlSet;
 								    delete fExtendSet;
 								    delete fHangulSet;
 								    delete fAnySet;
 								}
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								//------------------------------------------------------------------------------------------
 								//
 								//   class RBBIWordMonkey      Word Break specific implementation
 								//                             of RBBIMonkeyKind.
 								//
 								//------------------------------------------------------------------------------------------
 								class RBBIWordMonkey: public RBBIMonkeyKind {
 								public:
 								    RBBIWordMonkey();
 								    virtual          ~RBBIWordMonkey();
 								    virtual  UVector *charClasses();
 								    virtual  void     setText(const UnicodeString &s);
 								    virtual int32_t   next(int32_t i);
 								private:
 								    UVector      *fSets;
 								    UnicodeSet  *fKatakanaSet;
 								    UnicodeSet  *fALetterSet;
 								    UnicodeSet  *fMidLetterSet;
 								    UnicodeSet  *fMidNumSet;
 								    UnicodeSet  *fNumericSet;
 								    UnicodeSet  *fFormatSet;
 								    UnicodeSet  *fOtherSet;
 								    UnicodeSet  *fExtendSet;
-												ICU-4157 4.1 RBBI changes. Stub out TestLineBreaks, which is looping; real fix to come later.

X-SVN-Rev: 17106
											
										
										
											2005-01-11 00:49:22 +00:00
+								    UnicodeSet  *fExtendNumLetSet;
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
 								    RegexMatcher  *fMatcher;
 								    const UnicodeString  *fText;
 								};
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								RBBIWordMonkey::RBBIWordMonkey()
 								{
 								    // Builds the word-break character classes and registers them in fSets.
 								    // One UErrorCode is threaded through every step; a failure is recorded in
 								    // deferredStatus rather than reported immediately.
 								    UErrorCode  status = U_ZERO_ERROR;

 								    // Give both pointers a defined value.  fText is only assigned by setText()
 								    // and fMatcher is not assigned anywhere else in this constructor, so without
 								    // this they would hold indeterminate values.
 								    fText    = NULL;
 								    fMatcher = NULL;

 								    fSets            = new UVector(status);

 								    fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
 								                         "[\\p{Line_Break = Complex_Context}"
 								                         "-\\p{Grapheme_Cluster_Break = Extend}"
 								                         "-\\p{Grapheme_Cluster_Break = Control}]]",      status);
 								    //fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}]",      status);
 								    fKatakanaSet     = new UnicodeSet("[\\p{Word_Break = Katakana}-[\\uff9e\\uff9f]]",     status);
 								    fMidLetterSet    = new UnicodeSet("[\\p{Word_Break = MidLetter}]",    status);
 								    fMidNumSet       = new UnicodeSet("[\\p{Word_Break = MidNum}]",       status);
 								    fNumericSet      = new UnicodeSet("[\\p{Word_Break = Numeric}]",      status);
 								    fFormatSet       = new UnicodeSet("[\\p{Word_Break = Format}]",       status);
 								    fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]", status);
 								    //fExtendSet       = new UnicodeSet("[\\p{Word_Break = Extend}]", status);
 								    fExtendSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}\\uff9e\\uff9f]", status);

 								    fOtherSet        = new UnicodeSet();
 								    if(U_FAILURE(status)) {
 								      // A set failed to build: record the error and stop before any set is
 								      // used.  fSets stays empty in this case.
 								      deferredStatus = status;
 								      return;
 								    }

 								    // "Other" is every code point that is in none of the classes above.
 								    fOtherSet->complement();
 								    fOtherSet->removeAll(*fKatakanaSet);
 								    fOtherSet->removeAll(*fALetterSet);
 								    fOtherSet->removeAll(*fMidLetterSet);
 								    fOtherSet->removeAll(*fMidNumSet);
 								    fOtherSet->removeAll(*fNumericSet);
 								    fOtherSet->removeAll(*fExtendNumLetSet);
 								    fOtherSet->removeAll(*fFormatSet);
 								    fOtherSet->removeAll(*fExtendSet);

 								    // Registration order determines each set's index in the vector.
 								    fSets->addElement(fALetterSet,   status);
 								    fSets->addElement(fKatakanaSet,  status);
 								    fSets->addElement(fMidLetterSet, status);
 								    fSets->addElement(fMidNumSet,    status);
 								    fSets->addElement(fNumericSet,   status);
 								    fSets->addElement(fFormatSet,    status);
 								    fSets->addElement(fExtendSet,    status);
 								    fSets->addElement(fOtherSet,     status);
 								    fSets->addElement(fExtendNumLetSet, status);

 								    if (U_FAILURE(status)) {
 								        deferredStatus = status;
 								    }
 								}
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
 								void RBBIWordMonkey::setText(const UnicodeString &s) {
 								    fText       = &s;
 								}
 								// Return the index of the first word boundary after prevPos in the text
 								// given to setText(), or -1 when prevPos is already at or beyond the end
 								// of that text.
 								//
 								// The scan keeps a window of four "significant" code points (ch0..ch3 at
 								// pos0..pos3); Format and Extend characters are stepped over and attach
 								// to the character before them.  The candidate boundary is immediately
 								// before pos2.  Each rule below that matches means "no boundary here" and
 								// moves the window on; if no rule matches, pos2 is the boundary.
 								int32_t RBBIWordMonkey::next(int32_t prevPos) {
 								    // Previous break was at the end of the string: nothing left to find.
 								    if (prevPos >= fText->length()) {
 								        return -1;
 								    }

 								    int32_t  pos0 = prevPos;
 								    int32_t  pos1 = prevPos;
 								    int32_t  pos2 = prevPos;
 								    int32_t  pos3 = prevPos;
 								    UChar32  ch0  = 0;
 								    UChar32  ch1  = 0;
 								    UChar32  ch2  = 0;
 								    UChar32  ch3  = fText->char32At(prevPos);

 								    // One iteration per significant character position in the input text.
 								    for (;;) {
 								        // Slide the window one significant position forward.
 								        pos0 = pos1;  ch0 = ch1;
 								        pos1 = pos2;  ch1 = ch2;
 								        pos2 = pos3;  ch2 = ch3;

 								        // Rule 4:  X (Extend | Format)*
 								        //    Advance pos3 to the next code point that is neither Format nor Extend.
 								        for (;;) {
 								            pos3 = fText->moveIndex32(pos3, 1);
 								            ch3  = fText->char32At(pos3);
 								            if (!fFormatSet->contains(ch3) && !fExtendSet->contains(ch3)) {
 								                break;
 								            }
 								        }

 								        if (pos1 == pos2) {
 								            // The window is not filled yet; keep priming it.
 								            continue;
 								        }

 								        if (pos2 == fText->length()) {
 								            // End of text is always a boundary.
 								            break;
 								        }

 								        // Rule 3:  CR x LF
 								        //    Only when the LF immediately follows the CR, i.e. no Extend or
 								        //    Format characters were stepped over in between.
 								        if (ch1 == 0x0D && ch2 == 0x0A && pos1 + 1 == pos2) {
 								            continue;
 								        }

 								        // Classify the code points on either side of the candidate boundary
 								        // once, so that the rules below read as plain boolean tests.
 								        const UBool letterBefore = fALetterSet->contains(ch1);
 								        const UBool letterAfter  = fALetterSet->contains(ch2);
 								        const UBool numberBefore = fNumericSet->contains(ch1);
 								        const UBool numberAfter  = fNumericSet->contains(ch2);
 								        const UBool kataBefore   = fKatakanaSet->contains(ch1);
 								        const UBool kataAfter    = fKatakanaSet->contains(ch2);
 								        const UBool numLetBefore = fExtendNumLetSet->contains(ch1);
 								        const UBool numLetAfter  = fExtendNumLetSet->contains(ch2);

 								        // Rule 5:  ALetter x ALetter
 								        if (letterBefore && letterAfter) {
 								            continue;
 								        }

 								        // Rule 6:  ALetter x MidLetter ALetter
 								        if (letterBefore &&
 								            fMidLetterSet->contains(ch2) &&
 								            fALetterSet->contains(ch3)) {
 								            continue;
 								        }

 								        // Rule 7:  ALetter MidLetter x ALetter
 								        if (fALetterSet->contains(ch0) &&
 								            fMidLetterSet->contains(ch1) &&
 								            letterAfter) {
 								            continue;
 								        }

 								        // Rule 8:  Numeric x Numeric
 								        if (numberBefore && numberAfter) {
 								            continue;
 								        }

 								        // Rule 9:  ALetter x Numeric
 								        if (letterBefore && numberAfter) {
 								            continue;
 								        }

 								        // Rule 10:  Numeric x ALetter
 								        if (numberBefore && letterAfter) {
 								            continue;
 								        }

 								        // Rule 11:  Numeric MidNum x Numeric
 								        if (fNumericSet->contains(ch0) &&
 								            fMidNumSet->contains(ch1) &&
 								            numberAfter) {
 								            continue;
 								        }

 								        // Rule 12:  Numeric x MidNum Numeric
 								        if (numberBefore &&
 								            fMidNumSet->contains(ch2) &&
 								            fNumericSet->contains(ch3)) {
 								            continue;
 								        }

 								        // Rule 13:  Katakana x Katakana
 								        if (kataBefore && kataAfter) {
 								            continue;
 								        }

 								        // Rule 13a:  (ALetter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
 								        if ((letterBefore || numberBefore || kataBefore || numLetBefore) &&
 								            numLetAfter) {
 								            continue;
 								        }

 								        // Rule 13b:  ExtendNumLet x (ALetter | Numeric | Katakana)
 								        if (numLetBefore &&
 								            (letterAfter || numberAfter || kataAfter)) {
 								            continue;
 								        }

 								        // Rule 14:  nothing joined the two sides, so this is a boundary.
 								        break;
 								    }

 								    return pos2;
 								}
 								// Hand back the vector of character-class sets built by the constructor.
 								// The vector is the object's own member, not a copy.
 								UVector *RBBIWordMonkey::charClasses()
 								{
 								    return this->fSets;
 								}
 								// Release the vector of sets and every set the constructor allocated.
 								// fText is not deleted; it only points at the string handed to setText().
 								RBBIWordMonkey::~RBBIWordMonkey()
 								{
 								    delete fSets;

 								    delete fALetterSet;
 								    delete fKatakanaSet;
 								    delete fMidLetterSet;
 								    delete fMidNumSet;
 								    delete fNumericSet;
 								    delete fFormatSet;
 								    delete fExtendSet;
 								    delete fExtendNumLetSet;
 								    delete fOtherSet;
 								}
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								//------------------------------------------------------------------------------------------
 								//
 								//   class RBBISentMonkey      Sentence Break specific implementation
 								//                             of RBBIMonkeyKind.
 								//
 								//------------------------------------------------------------------------------------------
 								// Sentence-break implementation of RBBIMonkeyKind.  The UnicodeSet members
 								// hold the character classes; most are named after the Sentence_Break
 								// property value they are built from.
 								class RBBISentMonkey: public RBBIMonkeyKind {
 								public:
 								    RBBISentMonkey();
 								    virtual ~RBBISentMonkey();

 								    // RBBIMonkeyKind overrides.
 								    virtual UVector *charClasses();
 								    virtual void     setText(const UnicodeString &s);
 								    virtual int32_t  next(int32_t i);

 								private:
 								    // Position helpers for next().
 								    // NOTE(review): presumably these step over / read fText one code point
 								    // at a time -- confirm against their definitions.
 								    int                  moveBack(int posFrom);
 								    int                  moveForward(int posFrom);
 								    UChar32              cAt(int pos);

 								    UVector             *fSets;

 								    UnicodeSet          *fSepSet;
 								    UnicodeSet          *fFormatSet;
 								    UnicodeSet          *fSpSet;
 								    UnicodeSet          *fLowerSet;
 								    UnicodeSet          *fUpperSet;
 								    UnicodeSet          *fOLetterSet;
 								    UnicodeSet          *fNumericSet;
 								    UnicodeSet          *fATermSet;
 								    UnicodeSet          *fSTermSet;
 								    UnicodeSet          *fCloseSet;
 								    UnicodeSet          *fOtherSet;
 								    UnicodeSet          *fExtendSet;

 								    const UnicodeString *fText;
 								};
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								RBBISentMonkey::RBBISentMonkey()
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								{
 								    // Builds one UnicodeSet per character class and registers each of them in fSets.
 								    // Any failure is reported through deferredStatus rather than a return value.
 								    UErrorCode  status = U_ZERO_ERROR;
 								    fSets            = new UVector(status);
 								    fSepSet          = new UnicodeSet("[\\p{Sentence_Break = Sep}]",     status);
 								    fFormatSet       = new UnicodeSet("[\\p{Sentence_Break = Format}]",  status);
 								    fSpSet           = new UnicodeSet("[\\p{Sentence_Break = Sp}]",      status);
 								    fLowerSet        = new UnicodeSet("[\\p{Sentence_Break = Lower}]",   status);
 								    fUpperSet        = new UnicodeSet("[\\p{Sentence_Break = Upper}]",   status);
-												ICU-5276 Sentence Break Rule updates for Japanese Voicing Marks change.

X-SVN-Rev: 20042
											
										
										
											2006-08-11 21:01:07 +00:00
+								    fOLetterSet      = new UnicodeSet("[\\p{Sentence_Break = OLetter}-[\\uff9e\\uff9f]]", status);
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								    fNumericSet      = new UnicodeSet("[\\p{Sentence_Break = Numeric}]", status);
 								    fATermSet        = new UnicodeSet("[\\p{Sentence_Break = ATerm}]",   status);
 								    fSTermSet        = new UnicodeSet("[\\p{Sentence_Break = STerm}]",   status);
 								    fCloseSet        = new UnicodeSet("[\\p{Sentence_Break = Close}]",   status);
-												ICU-5276 Sentence Break Rule updates for Japanese Voicing Marks change.

X-SVN-Rev: 20042
											
										
										
											2006-08-11 21:01:07 +00:00
+								    fExtendSet       = new UnicodeSet("[\\p{Grapheme_Extend}\\uff9e\\uff9f]", status);
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								    fOtherSet        = new UnicodeSet();
 								    // Stop here if any pattern failed to build; fSets is left empty in that case.
 								    if(U_FAILURE(status)) {
 								      deferredStatus = status;
 								      return;
 								    }
 								    // fOtherSet starts out empty; complementing it and then removing every named
 								    // class leaves exactly the code points that belong to none of them.
 								    fOtherSet->complement();
 								    fOtherSet->removeAll(*fSepSet);
 								    fOtherSet->removeAll(*fFormatSet);
 								    fOtherSet->removeAll(*fSpSet);
 								    fOtherSet->removeAll(*fLowerSet);
 								    fOtherSet->removeAll(*fUpperSet);
 								    fOtherSet->removeAll(*fOLetterSet);
 								    fOtherSet->removeAll(*fNumericSet);
 								    fOtherSet->removeAll(*fATermSet);
 								    fOtherSet->removeAll(*fSTermSet);
 								    fOtherSet->removeAll(*fCloseSet);
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								    fOtherSet->removeAll(*fExtendSet);
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
 								    // Register every class in the vector handed out by charClasses().
 								    fSets->addElement(fSepSet,     status);
 								    fSets->addElement(fFormatSet,  status);
 								    fSets->addElement(fSpSet,      status);
 								    fSets->addElement(fLowerSet,   status);
 								    fSets->addElement(fUpperSet,   status);
 								    fSets->addElement(fOLetterSet, status);
 								    fSets->addElement(fNumericSet, status);
 								    fSets->addElement(fATermSet,   status);
 								    fSets->addElement(fSTermSet,   status);
 								    fSets->addElement(fCloseSet,   status);
 								    fSets->addElement(fOtherSet,   status);
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								    fSets->addElement(fExtendSet,  status);
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
 								    if (U_FAILURE(status)) {
 								        deferredStatus = status;
 								    }
 								}
 								void RBBISentMonkey::setText(const UnicodeString &s) {
 								    fText       = &s;
 								}
 								// Hands back the vector of character class sets held in fSets.
 								UVector  *RBBISentMonkey::charClasses() {
 								    UVector *classSets = this->fSets;
 								    return classSets;
 								}
 								//  moveBack()   Find the "significant" code point preceding the index i.
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								//               Skips over ($Extend | $Format)* .
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								//
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								int RBBISentMonkey::moveBack(int i) {
 								    // Nothing precedes the start of the text; -1 is the "no position" result.
 								    if (i <= 0) {
 								        return -1;
 								    }
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								    UChar32   c;
 								    int32_t   j = i;
 								    // Step back one code point at a time until a character that is neither
 								    // Format nor Extend is found, or index 0 is reached.
 								    do {
 								        j = fText->moveIndex32(j, -1);
 								        c = fText->char32At(j);
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								    }
-												ICU-5242 sentence break monkey test updates

X-SVN-Rev: 19751
											
										
										
											2006-06-23 01:10:22 +00:00
+								    while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								    return j;
 								 }
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
 								// moveForward()  Advance from index i past one code point and then past any run of
 								//                Format or Extend characters that follows it.  Returns the text
 								//                length when i is already at or beyond the end.
 								int RBBISentMonkey::moveForward(int i) {
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								    if (i>=fText->length()) {
 								        return fText->length();
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								    }
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								    UChar32   c;
 								    int32_t   j = i;
 								    // cAt() yields -1 once j runs off the end; neither set contains -1, so the loop stops there.
 								    do {
 								        j = fText->moveIndex32(j, 1);
 								        c = cAt(j);
 								    }
 								    while (fFormatSet->contains(c) || fExtendSet->contains(c));
 								    return j;
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								}
 								// Bounds-checked read of the text: the code point at pos, or -1 when pos is
 								// negative or at/after the end of the text.
 								UChar32 RBBISentMonkey::cAt(int pos) {
 								    if (pos >= 0 && pos < fText->length()) {
 								        return fText->char32At(pos);
 								    }
 								    return -1;
 								}
 								// next()   Scan forward from prevPos, applying the sentence break rules below in
 								//          order, and return the index of the first position at which a rule
 								//          requires a break.  Returns -1 when prevPos is at or past the end of the text.
 								int32_t RBBISentMonkey::next(int32_t prevPos) {
 								    int    p0, p1, p2, p3;    // Indices of the significant code points around the
 								                              //   break position being tested.  The candidate break
 								                              //   location is before p2.
 								    int     breakPos = -1;
 								    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
 								    UChar32 c;
 								    // Prev break at end of string.  return DONE.
 								    if (prevPos >= fText->length()) {
 								        return -1;
 								    }
 								    p0 = p1 = p2 = p3 = prevPos;
 								    c3 =  fText->char32At(prevPos);
 								    c0 = c1 = c2 = 0;
 								    // Loop runs once per "significant" character position in the input text.
 								    // "continue" means no break before p2; "break" leaves the loop with p2 as the answer.
 								    for (;;) {
 								        // Move all of the positions forward in the input string.
 								        p0 = p1;  c0 = c1;
 								        p1 = p2;  c1 = c2;
 								        p2 = p3;  c2 = c3;
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								        // Advance p3 by    X(Extend | Format)*   Rule 4
-												ICU-4269 Add a sentence break monkey test

X-SVN-Rev: 18588
											
										
										
											2005-09-26 23:58:54 +00:00
+								        p3 = moveForward(p3);
 								        c3 = cAt(p3);
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								        // Rule (3)  CR x LF
 								        //           The p2==(p1+1) test requires the LF to follow the CR immediately.
 								        if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
 								            continue;
 								        }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								        // Rule (4).   Sep  <break>
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								        if (fSepSet->contains(c1)) {
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								            p2 = p1+1;   // Separators don't combine with Extend or Format.
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								            break;
 								        }
-												ICU-5242 sentence break monkey test updates

X-SVN-Rev: 19751
											
										
										
											2006-06-23 01:10:22 +00:00
+								        if (p2 >= fText->length()) {
 								            // Reached end of string.  Always a break position.
 								            break;
 								        }
 								        if (p2 == prevPos) {
 								            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
 								            continue;
 								        }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								        // Rule (6).   ATerm x Numeric
 								        if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
 								            continue;
 								        }
 								        // Rule (7).  Upper ATerm  x  Upper
 								        if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
 								            continue;
 								        }
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								        // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
 								        //           Note:  STerm | ATerm are added to the negated part of the expression by a
 								        //                  note to the Unicode 5.0 documents.
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								        int p8 = p1;
 								        // Walk backward from p1 over Sp* and then Close* to see whether an ATerm precedes them.
 								        while (fSpSet->contains(cAt(p8))) {
 								            p8 = moveBack(p8);
 								        }
 								        while (fCloseSet->contains(cAt(p8))) {
 								            p8 = moveBack(p8);
 								        }
 								        if (fATermSet->contains(cAt(p8))) {
 								            // Walk forward from p2 to the first character of one of the listed classes
 								            // (or the end of the text); no break if that character is Lower.
 								            p8=p2;
 								            for (;;) {
 								                c = cAt(p8);
 								                if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
-												ICU-4269 Add a sentence break monkey test

X-SVN-Rev: 18588
											
										
										
											2005-09-26 23:58:54 +00:00
+								                    fLowerSet->contains(c) || fSepSet->contains(c) ||
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								                    fATermSet->contains(c) || fSTermSet->contains(c))  {
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								                    break;
 								                }
 								                p8 = moveForward(p8);
 								            }
 								            if (fLowerSet->contains(cAt(p8))) {
 								                continue;
 								            }
 								        }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								        // Rule 8a   (STerm | ATerm) Close* Sp* x (STerm | ATerm);
 								        if (fSTermSet->contains(c2) || fATermSet->contains(c2)) {
 								            p8 = p1;
 								            while (fSpSet->contains(cAt(p8))) {
 								                p8 = moveBack(p8);
 								            }
 								            while (fCloseSet->contains(cAt(p8))) {
 								                p8 = moveBack(p8);
 								            }
 								            c = cAt(p8);
 								            if (fSTermSet->contains(c) || fATermSet->contains(c)) {
 								                continue;
 								            }
 								        }
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
 								        // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep)
 								        int p9 = p1;
 								        while (fCloseSet->contains(cAt(p9))) {
 								            p9 = moveBack(p9);
 								        }
 								        c = cAt(p9);
 								        if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
 								            if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
 								                continue;
 								            }
 								        }
-												ICU-4269 Add a sentence break monkey test

X-SVN-Rev: 18588
											
										
										
											2005-09-26 23:58:54 +00:00
+								        // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep)
 								        int p10 = p1;
 								        while (fSpSet->contains(cAt(p10))) {
 								            p10 = moveBack(p10);
 								        }
 								        while (fCloseSet->contains(cAt(p10))) {
 								            p10 = moveBack(p10);
 								        }
 								        if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
 								            if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
 								                continue;
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								            }
 								        }
 								        // Rule (11)  (STerm | ATerm) Close* Sp*   <break>
 								        int p11 = p1;
 								        while (fSpSet->contains(cAt(p11))) {
 								            p11 = moveBack(p11);
 								        }
 								        while (fCloseSet->contains(cAt(p11))) {
 								            p11 = moveBack(p11);
 								        }
 								        if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
 								            break;
 								        }
 								        //  Rule (12)  Any x Any
 								        continue;
 								    }
 								    // The loop is left only via "break", with p2 holding the break position.
 								    breakPos = p2;
 								    return breakPos;
 								}
 								// Releases the vector and every UnicodeSet allocated by the constructor.
 								// fText is not deleted; this class only ever stores an address in it (see setText()).
 								// NOTE(review): the sets are deleted individually after fSets, which presumably
 								// means the UVector does not own its elements - confirm.
 								RBBISentMonkey::~RBBISentMonkey() {
 								    delete fSets;
 								    delete fSepSet;
 								    delete fFormatSet;
 								    delete fSpSet;
 								    delete fLowerSet;
 								    delete fUpperSet;
 								    delete fOLetterSet;
 								    delete fNumericSet;
 								    delete fATermSet;
 								    delete fSTermSet;
 								    delete fCloseSet;
 								    delete fOtherSet;
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								    delete fExtendSet;
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								}
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								//-------------------------------------------------------------------------------------------
 								//
 								//  RBBILineMonkey
 								//
 								//-------------------------------------------------------------------------------------------
 								// Line-break flavor of RBBIMonkeyKind.  Each two-letter UnicodeSet member is named
 								// after the Line_Break property value its pattern selects in the constructor
 								// (fSG is the surrogate range U+D800..U+DFFF).
 								class RBBILineMonkey: public RBBIMonkeyKind {
 								public:
 								    RBBILineMonkey();
 								    virtual          ~RBBILineMonkey();
 								    virtual  UVector *charClasses();
 								    // Stores the address of s and hands s to the character break iterator and number matcher.
 								    virtual  void     setText(const UnicodeString &s);
-												ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13385
											
										
										
											2003-10-10 00:53:18 +00:00
+								    virtual  int32_t  next(int32_t i);
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								    virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								private:
 								    UVector      *fSets;          // Vector of the character class sets below.
 								    UnicodeSet  *fBK;
 								    UnicodeSet  *fCR;
 								    UnicodeSet  *fLF;
 								    UnicodeSet  *fCM;
 								    UnicodeSet  *fNL;
 								    UnicodeSet  *fSG;
 								    UnicodeSet  *fWJ;
 								    UnicodeSet  *fZW;
 								    UnicodeSet  *fGL;
 								    UnicodeSet  *fCB;
 								    UnicodeSet  *fSP;
 								    UnicodeSet  *fB2;
 								    UnicodeSet  *fBA;
 								    UnicodeSet  *fBB;
 								    UnicodeSet  *fHY;
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								    UnicodeSet  *fH2;
 								    UnicodeSet  *fH3;
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								    UnicodeSet  *fCL;
 								    UnicodeSet  *fEX;
 								    UnicodeSet  *fIN;
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								    UnicodeSet  *fJL;
 								    UnicodeSet  *fJV;
 								    UnicodeSet  *fJT;
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								    UnicodeSet  *fNS;
 								    UnicodeSet  *fOP;
 								    UnicodeSet  *fQU;
 								    UnicodeSet  *fIS;
 								    UnicodeSet  *fNU;
 								    UnicodeSet  *fPO;
 								    UnicodeSet  *fPR;
 								    UnicodeSet  *fSY;
 								    UnicodeSet  *fAI;
 								    UnicodeSet  *fAL;            // The constructor also folds XX, AI, SA and SG into this set.
 								    UnicodeSet  *fID;
 								    UnicodeSet  *fSA;
 								    UnicodeSet  *fXX;
 								    BreakIterator  *fCharBI;       // Character break iterator created by the constructor.
 								    const UnicodeString  *fText;   // Text being scanned; assigned by setText().
 								    int32_t              *fOrigPositions;
 								    RegexMatcher         *fNumberMatcher;   // Matches the PR/PO/OP/HY/NU/IS/SY/CL number pattern built by the constructor.
-												ICU-2924 Line break update - fix more monkey failures, getting closer.

X-SVN-Rev: 13397
											
										
										
											2003-10-13 06:01:21 +00:00
+								    RegexMatcher         *fLB11Matcher;
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								};
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								RBBILineMonkey::RBBILineMonkey()
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								{
 								    // Builds one UnicodeSet per Line_Break class, registers them in fSets, then creates
 								    // the number matcher and the character break iterator.  Failures are reported
 								    // through deferredStatus rather than a return value.
 								    UErrorCode  status = U_ZERO_ERROR;
 								    // These members are never assigned anywhere else in this constructor; clear them
 								    // so they do not hold indeterminate values.
 								    fText          = NULL;
 								    fOrigPositions = NULL;
 								    fLB11Matcher   = NULL;
-												ICU-4354 make intltest pass with no data

X-SVN-Rev: 18140
											
										
										
											2005-07-05 18:39:00 +00:00
+								    fSets  = new UVector(status);
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
 								    fBK    = new UnicodeSet("[\\p{Line_Break=BK}]", status);
 								    fCR    = new UnicodeSet("[\\p{Line_break=CR}]", status);
 								    fLF    = new UnicodeSet("[\\p{Line_break=LF}]", status);
 								    fCM    = new UnicodeSet("[\\p{Line_break=CM}]", status);
 								    fNL    = new UnicodeSet("[\\p{Line_break=NL}]", status);
 								    fWJ    = new UnicodeSet("[\\p{Line_break=WJ}]", status);
 								    fZW    = new UnicodeSet("[\\p{Line_break=ZW}]", status);
 								    fGL    = new UnicodeSet("[\\p{Line_break=GL}]", status);
 								    fCB    = new UnicodeSet("[\\p{Line_break=CB}]", status);
 								    fSP    = new UnicodeSet("[\\p{Line_break=SP}]", status);
 								    fB2    = new UnicodeSet("[\\p{Line_break=B2}]", status);
 								    fBA    = new UnicodeSet("[\\p{Line_break=BA}]", status);
 								    fBB    = new UnicodeSet("[\\p{Line_break=BB}]", status);
 								    fHY    = new UnicodeSet("[\\p{Line_break=HY}]", status);
-												ICU-4157 RBBI Rule updates for Unicode 4.1

X-SVN-Rev: 17118
											
										
										
											2005-01-13 23:42:12 +00:00
+								    fH2    = new UnicodeSet("[\\p{Line_break=H2}]", status);
 								    fH3    = new UnicodeSet("[\\p{Line_break=H3}]", status);
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								    fCL    = new UnicodeSet("[\\p{Line_break=CL}]", status);
 								    fEX    = new UnicodeSet("[\\p{Line_break=EX}]", status);
 								    fIN    = new UnicodeSet("[\\p{Line_break=IN}]", status);
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								    fJL    = new UnicodeSet("[\\p{Line_break=JL}]", status);
-												ICU-4157 RBBI Rule updates for Unicode 4.1

X-SVN-Rev: 17118
											
										
										
											2005-01-13 23:42:12 +00:00
+								    fJV    = new UnicodeSet("[\\p{Line_break=JV}]", status);
 								    fJT    = new UnicodeSet("[\\p{Line_break=JT}]", status);
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								    fNS    = new UnicodeSet("[\\p{Line_break=NS}]", status);
 								    fOP    = new UnicodeSet("[\\p{Line_break=OP}]", status);
 								    fQU    = new UnicodeSet("[\\p{Line_break=QU}]", status);
 								    fIS    = new UnicodeSet("[\\p{Line_break=IS}]", status);
 								    fNU    = new UnicodeSet("[\\p{Line_break=NU}]", status);
 								    fPO    = new UnicodeSet("[\\p{Line_break=PO}]", status);
 								    fPR    = new UnicodeSet("[\\p{Line_break=PR}]", status);
 								    fSY    = new UnicodeSet("[\\p{Line_break=SY}]", status);
 								    fAI    = new UnicodeSet("[\\p{Line_break=AI}]", status);
 								    fAL    = new UnicodeSet("[\\p{Line_break=AL}]", status);
 								    fID    = new UnicodeSet("[\\p{Line_break=ID}]", status);
 								    fSA    = new UnicodeSet("[\\p{Line_break=SA}]", status);
-												ICU-4855 rbbi Line Break, handle unpaired surrogates as AL

X-SVN-Rev: 18596
											
										
										
											2005-09-28 04:57:25 +00:00
+								    fSG    = new UnicodeSet("[\\ud800-\\udfff]", status);
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								    fXX    = new UnicodeSet("[\\p{Line_break=XX}]", status);
-												ICU-4354 make intltest pass with no data

X-SVN-Rev: 18140
											
										
										
											2005-07-05 18:39:00 +00:00
+								    if (U_FAILURE(status)) {
 								        deferredStatus = status;
 								        fCharBI = NULL;
 								        fNumberMatcher = NULL;
 								        return;
 								    }
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								    fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
 								    fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
 								    fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
-												ICU-4855 rbbi Line Break, handle unpaired surrogates as AL

X-SVN-Rev: 18596
											
										
										
											2005-09-28 04:57:25 +00:00
+								    fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
 								    // Register the classes in fSets.
 								    // NOTE(review): fWJ is added twice (here and again after fID) and fXX is never
 								    // added; left unchanged because the contents of fSets may be order-sensitive
 								    // for callers - confirm whether this is intentional.
 								    fSets->addElement(fBK, status);
 								    fSets->addElement(fCR, status);
 								    fSets->addElement(fLF, status);
 								    fSets->addElement(fCM, status);
 								    fSets->addElement(fNL, status);
 								    fSets->addElement(fWJ, status);
 								    fSets->addElement(fZW, status);
 								    fSets->addElement(fGL, status);
 								    fSets->addElement(fCB, status);
 								    fSets->addElement(fSP, status);
 								    fSets->addElement(fB2, status);
 								    fSets->addElement(fBA, status);
 								    fSets->addElement(fBB, status);
 								    fSets->addElement(fHY, status);
-												ICU-4157 RBBI Rule updates for Unicode 4.1

X-SVN-Rev: 17118
											
										
										
											2005-01-13 23:42:12 +00:00
+								    fSets->addElement(fH2, status);
 								    fSets->addElement(fH3, status);
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								    fSets->addElement(fCL, status);
 								    fSets->addElement(fEX, status);
 								    fSets->addElement(fIN, status);
-												ICU-4157 RBBI Rule updates for Unicode 4.1

X-SVN-Rev: 17118
											
										
										
											2005-01-13 23:42:12 +00:00
+								    fSets->addElement(fJL, status);
 								    fSets->addElement(fJT, status);
 								    fSets->addElement(fJV, status);
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								    fSets->addElement(fNS, status);
 								    fSets->addElement(fOP, status);
 								    fSets->addElement(fQU, status);
 								    fSets->addElement(fIS, status);
 								    fSets->addElement(fNU, status);
 								    fSets->addElement(fPO, status);
 								    fSets->addElement(fPR, status);
 								    fSets->addElement(fSY, status);
 								    fSets->addElement(fAI, status);
 								    fSets->addElement(fAL, status);
 								    fSets->addElement(fID, status);
-												ICU-2292 line break rules updated, 15 mins testmonkey passes

X-SVN-Rev: 13663
											
										
										
											2003-11-11 21:24:09 +00:00
+								    fSets->addElement(fWJ, status);
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								    fSets->addElement(fSA, status);
-												ICU-4855 rbbi Line Break, handle unpaired surrogates as AL

X-SVN-Rev: 18596
											
										
										
											2005-09-28 04:57:25 +00:00
+								    fSets->addElement(fSG, status);
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
 								    // Number pattern: optional (PR|PO) prefix, optional (OP|HY), NU, then any run of
 								    // (NU|IS|SY), optional CL, optional (PR|PO) suffix - each element may carry CM*.
 								    fNumberMatcher = new RegexMatcher(
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								        "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
 								        "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
-												ICU-3700 updated rules for Unicode 4.0.1.

X-SVN-Rev: 15286
											
										
										
											2004-05-12 23:29:24 +00:00
+								        "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								        "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?",
-												ICU-2924 Line break update - fix more monkey failures, getting closer.

X-SVN-Rev: 13397
											
										
										
											2003-10-13 06:01:21 +00:00
+								        0, status);
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								    fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
 								    // A failure from addElement(), the matcher or the iterator is only recorded here.
 								    if (U_FAILURE(status)) {
 								        deferredStatus = status;
 								    }
-												ICU-4288 Mostly fixes for --enable-strict for gcc 3.4 (Fedora Core 3)

X-SVN-Rev: 17040
											
										
										
											2004-12-30 07:25:51 +00:00
+								}
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
 								// Point this monkey at a new piece of test text.
 								//   Only the address of s is stored in fText, so the caller's string has to
 								//   outlive every later use of this object with that text.
 								void RBBILineMonkey::setText(const UnicodeString &s) {
 								    // Hand the new text to the helper objects, then remember it ourselves.
 								    fNumberMatcher->reset(s);
 								    fCharBI->setText(s);
 								    fText = &s;
 								}
-												ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13391
											
										
										
											2003-10-10 18:57:42 +00:00
+								//
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								//  rule9Adjust
 								//     Line Break TR rules 9 and 10 implementation.
-												ICU-4157 RBBI Rule updates for Unicode 4.1

X-SVN-Rev: 17118
											
										
										
											2005-01-13 23:42:12 +00:00
+								//     This deals with combining marks and other sequences that
-												ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13391
											
										
										
											2003-10-10 18:57:42 +00:00
+								//     that must be treated as if they were something other than what they actually are.
 								//
 								//     This is factored out into a separate function because it must be applied twice for
 								//     each potential break, once to the chars before the position being checked, then
 								//     again to the text following the possible break.
 								//
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
-												ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13391
											
										
										
											2003-10-10 18:57:42 +00:00
+								    if (pos == -1) {
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								        // Invalid initial position.  Happens during the warmup iteration of the
-												ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13391
											
										
										
											2003-10-10 18:57:42 +00:00
+								        //   main loop in next().
 								        return;
 								    }
-												ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13385
											
										
										
											2003-10-10 00:53:18 +00:00
+								    int32_t  nPos = *nextPos;
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								    // LB 9  Keep combining sequences together.
-												ICU-4157 RBBI Rule updates for Unicode 4.1

X-SVN-Rev: 17118
											
										
										
											2005-01-13 23:42:12 +00:00
+								    //  advance over any CM class chars.  Note that Line Break CM is different
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								    //  from the normal Grapheme Extend property.
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								    if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
 								          *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
-												ICU-2924 RBBI, line break rules,  monkey test, a few more fixes

X-SVN-Rev: 13402
											
										
										
											2003-10-13 22:01:53 +00:00
+								        for (;;) {
 								            *nextChar = fText->char32At(nPos);
 								            if (!fCM->contains(*nextChar)) {
 								                break;
 								            }
 								            nPos = fText->moveIndex32(nPos, 1);
-												ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13385
											
										
										
											2003-10-10 00:53:18 +00:00
+								        }
 								    }
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								    // LB 9 Treat X CM* as if it were x.
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								    //       No explicit action required.
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								    // LB 10  Treat any remaining combining mark as AL
-												ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13385
											
										
										
											2003-10-10 00:53:18 +00:00
+								    if (fCM->contains(*posChar)) {
 								        *posChar = 0x41;   // thisChar = 'A';
 								    }
-												ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13391
											
										
										
											2003-10-10 18:57:42 +00:00
 								    // Push the updated nextPos and nextChar back to our caller.
-												ICU-4157 RBBI Rule updates for Unicode 4.1

X-SVN-Rev: 17118
											
										
										
											2005-01-13 23:42:12 +00:00
+								    // This only makes a difference if posChar got bigger by consuming a
 								    // combining sequence.
-												ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13391
											
										
										
											2003-10-10 18:57:42 +00:00
+								    *nextPos  = nPos;
 								    *nextChar = fText->char32At(nPos);
-												ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13385
											
										
										
											2003-10-10 00:53:18 +00:00
+								}
 								int32_t RBBILineMonkey::next(int32_t startPos) {
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								    UErrorCode status = U_ZERO_ERROR;
 								    int32_t    pos;       //  Index of the char following a potential break position
-												ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13385
											
										
										
											2003-10-10 00:53:18 +00:00
+								    UChar32    thisChar;  //  Character at above position "pos"
 								    int32_t    prevPos;   //  Index of the char preceding a potential break position
 								    UChar32    prevChar;  //  Character at above position.  Note that prevChar
 								                          //   and thisChar may not be adjacent because combining
 								                          //   characters between them will be ignored.
 								    int32_t    nextPos;   //  Index of the next character following pos.
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								                          //     Usually skips over combining marks.
 								    int32_t    nextCPPos; //  Index of the code point following "pos."
-												ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13385
											
										
										
											2003-10-10 00:53:18 +00:00
+								                          //     May point to a combining mark.
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								    int32_t    tPos;      //  temp value.
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								    UChar32    c;
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
-												ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13385
											
										
										
											2003-10-10 00:53:18 +00:00
+								    if (startPos >= fText->length()) {
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								        return -1;
 								    }
-												ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13385
											
										
										
											2003-10-10 00:53:18 +00:00
 								    // Initial values for loop.  Loop will run the first time without finding breaks,
 								    //                           while the invalid values shift out and the "this" and
 								    //                           "prev" positions are filled in with good values.
 								    pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
 								    thisChar = prevChar  = 0;
 								    nextPos  = nextCPPos = startPos;
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12701
											
										
										
											2003-07-28 06:40:25 +00:00
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								    // Loop runs once per position in the test text, until a break position
 								    //  is found.
 								    for (;;) {
-												ICU-2924 RBBI line break rules and monkey test, work in progress

X-SVN-Rev: 12685
											
										
										
											2003-07-25 01:15:04 +00:00
+								        prevPos   = pos;
-												ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13385
											
										
										
											2003-10-10 00:53:18 +00:00
+								        prevChar  = thisChar;
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12701
											
										
										
											2003-07-28 06:40:25 +00:00
-												ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13385
											
										
										
											2003-10-10 00:53:18 +00:00
+								        pos       = nextPos;
 								        thisChar  = fText->char32At(pos);
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12701
											
										
										
											2003-07-28 06:40:25 +00:00
+								        nextCPPos = fText->moveIndex32(pos, 1);
 								        nextPos   = nextCPPos;
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								        // Rule LB2 - Break at end of text.
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								        if (pos >= fText->length()) {
 								            break;
 								        }
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // Rule LB 9 - adjust for combining sequences.
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								        //             We do this one out-of-order because the adjustment does not change anything
 								        //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
 								        //             be applied.
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
 								        c = fText->char32At(nextPos);
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        rule9Adjust(pos,     &thisChar, &nextPos, &c);
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
 								        // If the loop is still warming up - if we haven't shifted the initial
 								        //   -1 positions out of prevPos yet - loop back to advance the
 								        //    position in the input without any further looking for breaks.
 								        if (prevPos == -1) {
 								            continue;
 								        }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 4  Always break after hard line breaks,
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								        if (fBK->contains(prevChar)) {
 								            break;
 								        }
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 5  Break after CR, LF, NL, but not inside CR LF
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								        if (prevChar == 0x0d && thisChar == 0x0a) {
 								            continue;
 								        }
 								        if (prevChar == 0x0d ||
 								            prevChar == 0x0a ||
 								            prevChar == 0x85)  {
 								            break;
 								        }
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 6  Don't break before hard line breaks
-												ICU-2924 RBBI line break rules and monkey test, work in progress

X-SVN-Rev: 12685
											
										
										
											2003-07-25 01:15:04 +00:00
+								        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
 								            fBK->contains(thisChar)) {
 								                continue;
 								        }
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
-												ICU-2924 Line break update - fix more monkey failures, getting closer.

X-SVN-Rev: 13397
											
										
										
											2003-10-13 06:01:21 +00:00
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 7  Don't break before spaces or zero-width space.
-												ICU-2924 RBBI line break rules and monkey test, work in progress

X-SVN-Rev: 12685
											
										
										
											2003-07-25 01:15:04 +00:00
+								        if (fSP->contains(thisChar)) {
 								            continue;
 								        }
-												ICU-2292 line breaks passing on default option

X-SVN-Rev: 13636
											
										
										
											2003-11-07 22:49:38 +00:00
-												ICU-2924 RBBI line break rules and monkey test, work in progress

X-SVN-Rev: 12685
											
										
										
											2003-07-25 01:15:04 +00:00
+								        if (fZW->contains(thisChar)) {
 								            continue;
 								        }
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 8  Break after zero width space
-												ICU-2924 Line break update - fix some test failures.

X-SVN-Rev: 13370
											
										
										
											2003-10-09 05:39:58 +00:00
+								        if (fZW->contains(prevChar)) {
 								            break;
 								        }
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 9, 10  Already done, at top of loop.
 								        //
 								        // LB 11  Do not break before or after WORD JOINER and related characters.
 								        //    x  WJ
 								        //    WJ  x
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								        //
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
 								            continue;
 								        }
-												ICU-2924 Line break update - fix more monkey failures, getting closer.

X-SVN-Rev: 13397
											
										
										
											2003-10-13 06:01:21 +00:00
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 12
 								        //    (!SP) x  GL
 								        //    GL  x
 								        if ((!fSP->contains(prevChar)) && fGL->contains(thisChar) ||
 								             fGL->contains(prevChar)) {
 								            continue;
 								        }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-2924 Line break update - fix more monkey failures, getting closer.

X-SVN-Rev: 13397
											
										
										
											2003-10-13 06:01:21 +00:00
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 13  Don't break before closings.
 								        //        NU x CL  and NU x IS are not matched here so that they will
 								        //        fall into LB 17 and the more general number regular expression.
-												ICU-2924 RBBI, line break rules,  monkey test, better conformance to spec

X-SVN-Rev: 13394
											
										
										
											2003-10-11 00:44:36 +00:00
+								        //
 								        if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||
 								                                        fEX->contains(thisChar) ||
 								            !fNU->contains(prevChar) && fIS->contains(thisChar) ||
-												ICU-3700 updated rules for Unicode 4.0.1.

X-SVN-Rev: 15286
											
										
										
											2004-05-12 23:29:24 +00:00
+								            !fNU->contains(prevChar) && fSY->contains(thisChar))    {
-												ICU-2924 RBBI line break rules and monkey test, work in progress

X-SVN-Rev: 12685
											
										
										
											2003-07-25 01:15:04 +00:00
+								            continue;
 								        }
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 14 Don't break after OP SP*
-												ICU-3170 Line Break Test fixes for Unicode 4.01

X-SVN-Rev: 14886
											
										
										
											2004-04-06 20:59:49 +00:00
+								        //       Scan backwards, checking for this sequence.
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        //       The OP char could include combining marks, so we actually check for
-												ICU-3170 Line Break Test fixes for Unicode 4.01

X-SVN-Rev: 14886
											
										
										
											2004-04-06 20:59:49 +00:00
+								        //           OP CM* SP*
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								        //       Another Twist: The Rule 67 fixes may have changed a SP CM
-												ICU-3170 Line Break Test fixes for Unicode 4.01

X-SVN-Rev: 14886
											
										
										
											2004-04-06 20:59:49 +00:00
+								        //       sequence into a ID char, so before scanning back through spaces,
 								        //       verify that prevChar is indeed a space.  The prevChar variable
 								        //       may differ from fText[prevPos]
 								        tPos = prevPos;
 								        if (fSP->contains(prevChar)) {
 								            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
-												ICU-2292 line break rules updated, 15 mins testmonkey passes

X-SVN-Rev: 13663
											
										
										
											2003-11-11 21:24:09 +00:00
+								                tPos=fText->moveIndex32(tPos, -1);
 								            }
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								        }
-												ICU-3170 Line Break Test fixes for Unicode 4.01

X-SVN-Rev: 14886
											
										
										
											2004-04-06 20:59:49 +00:00
+								        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
 								            tPos=fText->moveIndex32(tPos, -1);
 								        }
 								        if (fOP->contains(fText->char32At(tPos))) {
 								            continue;
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								        }
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 15    QU SP* x OP
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17387
											
										
										
											2005-03-25 00:56:00 +00:00
+								        if (fOP->contains(thisChar)) {
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
 								            int tPos = prevPos;
 								            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
 								                tPos = fText->moveIndex32(tPos, -1);
 								            }
 								            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
 								                tPos = fText->moveIndex32(tPos, -1);
 								            }
 								            if (fQU->contains(fText->char32At(tPos))) {
 								                continue;
 								            }
 								        }
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 16   CL SP* x NS
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								        //    Scan backwards for SP* CM* CL
 								        if (fNS->contains(thisChar)) {
 								            int tPos = prevPos;
 								            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
 								                tPos = fText->moveIndex32(tPos, -1);
 								            }
 								            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
 								                tPos = fText->moveIndex32(tPos, -1);
 								            }
 								            if (fCL->contains(fText->char32At(tPos))) {
 								                continue;
 								            }
 								        }
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 17        B2 SP* x B2
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								        if (fB2->contains(thisChar)) {
 								            //  Scan backwards, checking for the B2 CM* SP* sequence.
 								            tPos = prevPos;
 								            if (fSP->contains(prevChar)) {
 								                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
 								                    tPos=fText->moveIndex32(tPos, -1);
 								                }
 								            }
 								            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
 								                tPos=fText->moveIndex32(tPos, -1);
 								            }
 								            if (fB2->contains(fText->char32At(tPos))) {
 								                continue;
 								            }
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								        }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 18    break after space
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								        if (fSP->contains(prevChar)) {
 								            break;
 								        }
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 19
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								        //    x   QU
 								        //    QU  x
 								        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
 								            continue;
 								        }
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 20  Break around a CB
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
 								            break;
 								        }
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 21
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								        if (fBA->contains(thisChar) ||
 								            fHY->contains(thisChar) ||
 								            fNS->contains(thisChar) ||
-												ICU-2924 RBBI line break rules and monkey test, work in progress

X-SVN-Rev: 12685
											
										
										
											2003-07-25 01:15:04 +00:00
+								            fBB->contains(prevChar) )   {
 								            continue;
 								        }
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 22
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								        if (fAL->contains(prevChar) && fIN->contains(thisChar) ||
 								            fID->contains(prevChar) && fIN->contains(thisChar) ||
 								            fIN->contains(prevChar) && fIN->contains(thisChar) ||
-												ICU-2924 RBBI line break rules and monkey test, work in progress

X-SVN-Rev: 12685
											
										
										
											2003-07-25 01:15:04 +00:00
+								            fNU->contains(prevChar) && fIN->contains(thisChar) )   {
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								            continue;
-												ICU-2924 RBBI line break rules and monkey test, work in progress

X-SVN-Rev: 12685
											
										
										
											2003-07-25 01:15:04 +00:00
+								        }
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 23    ID x PO
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12701
											
										
										
											2003-07-28 06:40:25 +00:00
+								        //          AL x NU
 								        //          NU x AL
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								        if (fID->contains(prevChar) && fPO->contains(thisChar) ||
 								            fAL->contains(prevChar) && fNU->contains(thisChar) ||
-												ICU-2924 RBBI line break rules and monkey test, work in progress

X-SVN-Rev: 12685
											
										
										
											2003-07-25 01:15:04 +00:00
+								            fNU->contains(prevChar) && fAL->contains(thisChar) )   {
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								            continue;
-												ICU-2924 RBBI line break rules and monkey test, work in progress

X-SVN-Rev: 12685
											
										
										
											2003-07-25 01:15:04 +00:00
+								        }
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 24  Do not break between prefix and letters or ideographs.
 								        //        PR x ID
 								        //        PR x AL
 								        //        PO x AL
 								        if (fPR->contains(prevChar) && fID->contains(thisChar) ||
 								            fPR->contains(prevChar) && fAL->contains(thisChar) ||
 								            fPO->contains(prevChar) && fAL->contains(thisChar) )   {
 								            continue;
 								        }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 25    Numbers
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								        if (fNumberMatcher->lookingAt(prevPos, status)) {
 								            if (U_FAILURE(status)) {
 								                break;
 								            }
-												ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13391
											
										
										
											2003-10-10 18:57:42 +00:00
+								            // Matched a number.  But could have been just a single digit, which would
 								            //    not represent a "no break here" between prevChar and thisChar
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
-												ICU-2924 RBBI line break rules and monkey test, work in progress

X-SVN-Rev: 12685
											
										
										
											2003-07-25 01:15:04 +00:00
+								            if (numEndIdx > pos) {
-												ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13391
											
										
										
											2003-10-10 18:57:42 +00:00
+								                // Number match includes at least our two chars being checked
 								                if (numEndIdx > nextPos) {
 								                    // Number match includes additional chars.  Update pos and nextPos
 								                    //   so that next loop iteration will continue at the end of the number,
 								                    //   checking for breaks between last char in number & whatever follows.
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								                    pos = nextPos = numEndIdx;
 								                    do {
 								                        pos = fText->moveIndex32(pos, -1);
-												ICU-2292 line break rules updated, 15 mins testmonkey passes

X-SVN-Rev: 13663
											
										
										
											2003-11-11 21:24:09 +00:00
+								                        thisChar = fText->char32At(pos);
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								                    } while (fCM->contains(thisChar));
-												ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13391
											
										
										
											2003-10-10 18:57:42 +00:00
+								                }
-												ICU-2924 RBBI line break rules and monkey test, work in progress

X-SVN-Rev: 12685
											
										
										
											2003-07-25 01:15:04 +00:00
+								                continue;
 								            }
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								        }
-												ICU-3700 updated rules for Unicode 4.0.1.

X-SVN-Rev: 15286
											
										
										
											2004-05-12 23:29:24 +00:00
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 26 Do not break a Korean syllable.
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
 								                                        fJV->contains(thisChar) ||
 								                                        fH2->contains(thisChar) ||
 								                                        fH3->contains(thisChar))) {
 								                                            continue;
 								                                        }
-												ICU-4157 RBBI Rule updates for Unicode 4.1

X-SVN-Rev: 17118
											
										
										
											2005-01-13 23:42:12 +00:00
 								        if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
 								                continue;
-												ICU-4157 RBBI Rule updates for Unicode 4.1

X-SVN-Rev: 17118
											
										
										
											2005-01-13 23:42:12 +00:00
+								        }
 								        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
 								            fJT->contains(thisChar)) {
 								                continue;
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								        }
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 27 Treat a Korean Syllable Block the same as ID.
-												ICU-4157 RBBI Rule updates for Unicode 4.1

X-SVN-Rev: 17118
											
										
										
											2005-01-13 23:42:12 +00:00
+								        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
 								            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
 								            fIN->contains(thisChar)) {
 								                continue;
 								            }
 								        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
 								            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
 								            fPO->contains(thisChar)) {
 								                continue;
 								            }
 								        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
 								            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
 								                continue;
 								            }
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 28  Do not break between alphabetics (“at”).
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								        if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
 								            continue;
 								        }
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
-												ICU-3700 updated rules for Unicode 4.0.1.

X-SVN-Rev: 15286
											
										
										
											2004-05-12 23:29:24 +00:00
+								        if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
 								            continue;
 								        }
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								        //LB 30 Do not break between letters, numbers or ordinary symbols and opening or closing punctuation
 								        //      (AL | NU) x OP
 								        //       CL x (AL | NU)
 								        if ((fAL->contains(prevChar) || fNU->contains(prevChar)) &&
 								              fOP->contains(thisChar)) {
 								            continue;
 								        }
 								        if (fCL->contains(prevChar) &&
 								            (fAL->contains(thisChar) || fNU->contains(thisChar))) {
 								            continue;
 								        }
 								        // LB 31    Break everywhere else
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								        break;
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								    }
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								    return pos;
 								}
 								// Accessor: hands back the fSets vector pointer held by this object.
 								// No copy is made; the pointer is returned exactly as stored.
 								UVector *RBBILineMonkey::charClasses()
 								{
 								    return fSets;
 								}
 								// Destructor.  Deletes the fSets vector, then each individually held member
 								// that next() queries with contains(), and finally the character break
 								// iterator and the number matcher.  The deletion order is unchanged.
 								RBBILineMonkey::~RBBILineMonkey()
 								{
 								    delete fSets;
 								    // Individually held members, in their original order.
 								    delete fBK;  delete fCR;  delete fLF;  delete fCM;
 								    delete fNL;  delete fWJ;  delete fZW;  delete fGL;
 								    delete fCB;  delete fSP;  delete fB2;  delete fBA;
 								    delete fBB;  delete fHY;  delete fH2;  delete fH3;
 								    delete fCL;  delete fEX;  delete fIN;  delete fJL;
 								    delete fJV;  delete fJT;  delete fNS;  delete fOP;
 								    delete fQU;  delete fIS;  delete fNU;  delete fPO;
 								    delete fPR;  delete fSY;  delete fAI;  delete fAL;
 								    delete fID;  delete fSA;  delete fSG;  delete fXX;
 								    // Helper objects used by next().
 								    delete fCharBI;
 								    delete fNumberMatcher;
 								}
-												ICU-2093 Monkey test, fixed intermittent failure from uninitialized variable.

X-SVN-Rev: 12122
											
										
										
											2003-05-27 17:59:26 +00:00
+								//-------------------------------------------------------------------------------------------
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								//
 								//   TestMonkey
 								//
 								//     params
 								//       seed=nnnnn        Random number starting seed.
 								//                         Setting the seed allows errors to be reproduced.
 								//       loop=nnn          Looping count.  Controls running time.
 								//                         -1:  run forever.
 								//                          0 or greater:  run length.
 								//
-												ICU-2128 fix inf loop bug in RBBI Line Monkey; simplify RBBI word monkey

X-SVN-Rev: 13272
											
										
										
											2003-10-02 00:18:13 +00:00
+								//       type = char | word | line | sent | title
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								//
-												ICU-2093 Monkey test, fixed intermittent failure from uninitialized variable.

X-SVN-Rev: 12122
											
										
										
											2003-05-27 17:59:26 +00:00
+								//-------------------------------------------------------------------------------------------
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
 								// Look for an integer parameter of the form "<name> = <digits>" in params.
 								//   name        parameter name; the regular expression " *= *(-?\d+)" is appended to it.
 								//   params      string to search.  When the parameter is found, the first match
 								//               of the whole expression is removed from this string.
 								//   defaultVal  value returned when the parameter is not present.
 								// Returns the value parsed (base 10) from the captured digits, or defaultVal.
 								static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
 								    UErrorCode status = U_ZERO_ERROR;
 								    name.append(" *= *(-?\\d+)");
 								    RegexMatcher m(name, params, 0, status);
 								    int32_t val = defaultVal;
 								    if (m.find()) {
 								        // Found it.  Copy the captured digits into a char buffer for strtol().
 								        char valString[100];
 								        const int32_t maxDigits  = (int32_t)(sizeof(valString)-2);
 								        const int32_t valueStart = m.start(1, status);
 								        int32_t paramLength = m.end(1, status) - valueStart;
 								        if (paramLength > maxDigits) {
 								            // Clamp over-long values so they fit in valString.
 								            paramLength = maxDigits;
 								        }
 								        params.extract(valueStart, paramLength, valString, sizeof(valString));
 								        val = strtol(valString,  NULL, 10);
 								        // Strip the consumed parameter out of the params string.
 								        m.reset();
 								        params = m.replaceFirst("", status);
 								    }
 								    U_ASSERT(U_SUCCESS(status));
 								    return val;
 								}
-												ICU-2896 build without regex (disable monkey test in that situation)

X-SVN-Rev: 12153
											
										
										
											2003-05-29 00:54:50 +00:00
+								#endif
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								                                    BreakIterator *bi,
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								                                    int expected[],
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								                                    int expectedcount)
 								{
 								    int count = 0;
 								    int i = 0;
-												ICU-2292 line break rules updated, 15 mins testmonkey passes

X-SVN-Rev: 13663
											
										
										
											2003-11-11 21:24:09 +00:00
+								    int forward[50];
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								    bi->setText(ustr);
 								    for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
 								        forward[count] = i;
 								        if (count < expectedcount && expected[count] != i) {
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								            test->errln("break forward test failed: expected %d but got %d",
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								                        expected[count], i);
 								            break;
 								        }
 								        count ++;
 								    }
 								    if (count != expectedcount) {
 								        printStringBreaks(ustr, expected, expectedcount);
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								        test->errln("break forward test failed: missed %d match",
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								                    expectedcount - count);
 								        return;
 								    }
 								    // testing boundaries
 								    for (i = 1; i < expectedcount; i ++) {
 								        int j = expected[i - 1];
 								        if (!bi->isBoundary(j)) {
 								            printStringBreaks(ustr, expected, expectedcount);
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								            test->errln("isBoundary() failed.  Expected boundary at position %d", j);
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								            return;
 								        }
 								        for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
 								            if (bi->isBoundary(j)) {
 								                printStringBreaks(ustr, expected, expectedcount);
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								                test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								                return;
 								            }
 								        }
 								    }
 								    for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
 								        count --;
 								        if (forward[count] != i) {
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								            test->errln("happy break test previous() failed: expected %d but got %d",
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								                        forward[count], i);
 								            break;
 								        }
 								    }
 								    if (count != 0) {
 								        printStringBreaks(ustr, expected, expectedcount);
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								        test->errln("break test previous() failed: missed a match");
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								        return;
 								    }
 								    // testing preceding
 								    for (i = 0; i < expectedcount - 1; i ++) {
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								        // int j = expected[i] + 1;
 								        int j = ustr.moveIndex32(expected[i], 1);
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								        for (; j <= expected[i + 1]; j ++) {
 								            if (bi->preceding(j) != expected[i]) {
 								                printStringBreaks(ustr, expected, expectedcount);
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								                test->errln("preceding(): Not expecting boundary at position %d", j);
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								                return;
 								            }
 								        }
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								    }
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								}
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								void RBBITest::TestWordBreaks(void)
 								{
-												ICU-3831 uconfig fixes

X-SVN-Rev: 15842
											
										
										
											2004-06-10 23:51:33 +00:00
+								#if !UCONFIG_NO_REGULAR_EXPRESSIONS
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								    // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
 								    Locale        locale("en");
 								    UErrorCode    status = U_ZERO_ERROR;
 								    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
 								    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								    UChar         str[300];
 								    static const char *strlist[] =
-												ICU-2292 word breaks fixed and passing (i think)

X-SVN-Rev: 13605
											
										
										
											2003-11-06 20:00:46 +00:00
+								    {
-												ICU-2292 line break rules updated, 15 mins testmonkey passes

X-SVN-Rev: 13663
											
										
										
											2003-11-11 21:24:09 +00:00
+								    "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
 								    "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
 								    "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u179c\\u0027\\U000e0061\\u003a",
 								    "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
 								    "\\u90ca\\u3588\\u009c\\u0953\\u194b",
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
-												ICU-2292 word breaks fixed and passing (i think)

X-SVN-Rev: 13605
											
										
										
											2003-11-06 20:00:46 +00:00
+								    "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
 								    "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
 								    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
 								    "\\u2027\\U000e0067\\u0a47\\u00b7",
 								    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
 								    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
 								    "\\u0589\\U000e006e\\u0a42\\U000104a5",
 								    "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
 								    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
 								    "\\u0027\\u11af\\U000e0057\\u0602",
 								    "\\U0001d7f2\\U000e007\\u0004\\u0589",
 								    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
 								    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
 								    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
 								    "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
 								    "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
 								    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
 								    "\\u0233\\U000e0020\\u0a69\\u0d6a",
 								    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
 								    "\\u58f4\\U000e0049\\u20e7\\u2027",
 								    "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
 								    "\\ua183\\u102d\\u0bec\\u003a",
 								    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
 								    "\\u003a\\u0e57\\u0fad\\u002e",
 								    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
 								    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
 								    "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
 								    "\\u003a\\u0664\\u00b7\\u1fba",
 								    "\\u003b\\u0027\\u00b7\\u47a3",
-												ICU-2292 word breaks fixed and passing (i think)

X-SVN-Rev: 13605
											
										
										
											2003-11-06 20:00:46 +00:00
+								    "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
 								    "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								    "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
 								    };
-												ICU-2292 missing 'int i' on MSVC.  for (int i;...   is not portable

X-SVN-Rev: 13600
											
										
										
											2003-11-06 04:38:11 +00:00
+								    int loop;
-												ICU-4098 Prevent missing break iterators from crashing the build.

X-SVN-Rev: 16310
											
										
										
											2004-09-13 15:39:02 +00:00
+								    if (U_FAILURE(status)) {
 								        errln("Creation of break iterator failed %s", u_errorName(status));
 								        return;
 								    }
-												ICU-3222 Fix some compiler warnings

X-SVN-Rev: 13927
											
										
										
											2003-12-02 01:34:21 +00:00
+								    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								        // printf("looping %d\n", loop);
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								        u_unescape(strlist[loop], str, 25);
 								        UnicodeString ustr(str);
 								        // RBBICharMonkey monkey;
 								        RBBIWordMonkey monkey;
-												ICU-2292 line break rules updated, 15 mins testmonkey passes

X-SVN-Rev: 13663
											
										
										
											2003-11-11 21:24:09 +00:00
+								        int expected[50];
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								        int expectedcount = 0;
 								        monkey.setText(ustr);
-												ICU-2292 missing 'int i' on MSVC.  for (int i;...   is not portable

X-SVN-Rev: 13600
											
										
										
											2003-11-06 04:38:11 +00:00
+								        int i;
 								        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								            expected[expectedcount ++] = i;
 								        }
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								    }
-												ICU-3259 Fix a memory leak

X-SVN-Rev: 13886
											
										
										
											2003-11-25 21:28:23 +00:00
+								    delete bi;
-												ICU-3831 uconfig fixes

X-SVN-Rev: 15842
											
										
										
											2004-06-10 23:51:33 +00:00
+								#endif
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								}
 								void RBBITest::TestWordBoundary(void)
 								{
 								    // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
 								    Locale        locale("en");
 								    UErrorCode    status = U_ZERO_ERROR;
 								    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
 								    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								    UChar         str[50];
 								    static const char *strlist[] =
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								    {
 								    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
 								    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
 								    "\\u2027\\U000e0067\\u0a47\\u00b7",
 								    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
 								    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
 								    "\\u0589\\U000e006e\\u0a42\\U000104a5",
 								    "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
 								    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
 								    "\\u0027\\u11af\\U000e0057\\u0602",
 								    "\\U0001d7f2\\U000e007\\u0004\\u0589",
 								    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
 								    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
 								    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
 								    "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
 								    "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
 								    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
 								    "\\u0233\\U000e0020\\u0a69\\u0d6a",
 								    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
 								    "\\u58f4\\U000e0049\\u20e7\\u2027",
 								    "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
 								    "\\ua183\\u102d\\u0bec\\u003a",
 								    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
 								    "\\u003a\\u0e57\\u0fad\\u002e",
 								    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
 								    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
 								    "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
 								    "\\u003a\\u0664\\u00b7\\u1fba",
 								    "\\u003b\\u0027\\u00b7\\u47a3",
 								    };
-												ICU-2292 missing 'int i' on MSVC.  for (int i;...   is not portable

X-SVN-Rev: 13600
											
										
										
											2003-11-06 04:38:11 +00:00
+								    int loop;
-												ICU-4098 Prevent missing break iterators from crashing the build.

X-SVN-Rev: 16310
											
										
										
											2004-09-13 15:39:02 +00:00
+								    if (U_FAILURE(status)) {
 								        errln("Creation of break iterator failed %s", u_errorName(status));
 								        return;
 								    }
-												ICU-3222 Fix some compiler warnings

X-SVN-Rev: 13927
											
										
										
											2003-12-02 01:34:21 +00:00
+								    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								        // printf("looping %d\n", loop);
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								        u_unescape(strlist[loop], str, 20);
 								        UnicodeString ustr(str);
-												ICU-2292 line break rules updated, 15 mins testmonkey passes

X-SVN-Rev: 13663
											
										
										
											2003-11-11 21:24:09 +00:00
+								        int forward[50];
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								        int count = 0;
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								        bi->setText(ustr);
 								        int prev = 0;
-												ICU-2292 missing 'int i' on MSVC.  for (int i;...   is not portable

X-SVN-Rev: 13600
											
										
										
											2003-11-06 04:38:11 +00:00
+								        int i;
 								        for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								            forward[count ++] = i;
 								            if (i > prev) {
-												ICU-2292 missing 'int i' on MSVC.  for (int i;...   is not portable

X-SVN-Rev: 13600
											
										
										
											2003-11-06 04:38:11 +00:00
+								                int j;
 								                for (j = prev + 1; j < i; j ++) {
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								                    if (bi->isBoundary(j)) {
 								                        printStringBreaks(ustr, forward, count);
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								                        errln("happy boundary test failed: expected %d not a boundary",
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								                               j);
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								                        return;
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								                    }
 								                }
 								            }
 								            if (!bi->isBoundary(i)) {
 								                printStringBreaks(ustr, forward, count);
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								                errln("happy boundary test failed: expected %d a boundary",
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								                       i);
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								                return;
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								            }
 								            prev = i;
 								        }
 								    }
-												ICU-3259 Fix a memory leak

X-SVN-Rev: 13886
											
										
										
											2003-11-25 21:28:23 +00:00
+								    delete bi;
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								}
 								void RBBITest::TestLineBreaks(void)
 								{
-												ICU-3831 uconfig fixes

X-SVN-Rev: 15842
											
										
										
											2004-06-10 23:51:33 +00:00
+								#if !UCONFIG_NO_REGULAR_EXPRESSIONS
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								    Locale        locale("en");
 								    UErrorCode    status = U_ZERO_ERROR;
 								    BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
-												ICU-4656 fix RBBI test failure

X-SVN-Rev: 18248
											
										
										
											2005-07-14 20:56:31 +00:00
+								    const int32_t  STRSIZE = 50;
 								    UChar         str[STRSIZE];
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								    static const char *strlist[] =
-												ICU-2292 line breaks passing on default option

X-SVN-Rev: 13636
											
										
										
											2003-11-07 22:49:38 +00:00
+								    {
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								     "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
 								     "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
 								             "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
 								     "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
 								             "u2014\\U000e0105\\u118c\\u000a\\u07f8",
-												ICU-3700 updated rules for Unicode 4.0.1.

X-SVN-Rev: 15286
											
										
										
											2004-05-12 23:29:24 +00:00
+								     "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
-												ICU-2292 line break rules updated, 15 mins testmonkey passes

X-SVN-Rev: 13663
											
										
										
											2003-11-11 21:24:09 +00:00
+								     "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
 								     "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
 								     "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
 								     "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
 								     "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
 								     "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
 								     "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
 								     "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
 								     "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
 								     "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
 								     "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
 								     "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								     "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
 								     "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
 								     "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
-												ICU-2292 line breaks passing on default option

X-SVN-Rev: 13636
											
										
										
											2003-11-07 22:49:38 +00:00
+								     "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
 								     "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
 								     "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
 								     "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
 								     "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
 								     "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								     "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
 								     "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
 								     "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
 								     "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
 								     "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
 								     "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
 								     "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
 								     "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
 								     "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
 								     "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
 								     "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
 								     "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
 								     "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
 								     "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
 								     "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								         "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
 								         "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
 								         "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
 								     "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
 								         "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								    };
-												ICU-2292 missing 'int i' on MSVC.  for (int i;...   is not portable

X-SVN-Rev: 13600
											
										
										
											2003-11-06 04:38:11 +00:00
+								    int loop;
-												ICU-4157 4.1 RBBI changes. Stub out TestLineBreaks, which is looping; real fix to come later.

X-SVN-Rev: 17106
											
										
										
											2005-01-11 00:49:22 +00:00
+								    TEST_ASSERT_SUCCESS(status);
-												ICU-4098 Prevent missing break iterators from crashing the build.

X-SVN-Rev: 16310
											
										
										
											2004-09-13 15:39:02 +00:00
+								    if (U_FAILURE(status)) {
 								        return;
 								    }
-												ICU-3222 Fix some compiler warnings

X-SVN-Rev: 13927
											
										
										
											2003-12-02 01:34:21 +00:00
+								    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
-												ICU-2292 line breaks passing on default option

X-SVN-Rev: 13636
											
										
										
											2003-11-07 22:49:38 +00:00
+								        // printf("looping %d\n", loop);
-												ICU-4656 fix RBBI test failure

X-SVN-Rev: 18248
											
										
										
											2005-07-14 20:56:31 +00:00
+								        int32_t t = u_unescape(strlist[loop], str, STRSIZE);
 								        if (t >= STRSIZE) {
 								            TEST_ASSERT(FALSE);
 								            continue;
 								        }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								        UnicodeString ustr(str);
 								        RBBILineMonkey monkey;
-												ICU-4354 make intltest pass with no data

X-SVN-Rev: 18140
											
										
										
											2005-07-05 18:39:00 +00:00
+								        if (U_FAILURE(monkey.deferredStatus)) {
 								            continue;
 								        }
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
-												ICU-4157 4.1 RBBI changes. Stub out TestLineBreaks, which is looping; real fix to come later.

X-SVN-Rev: 17106
											
										
										
											2005-01-11 00:49:22 +00:00
+								        const int EXPECTEDSIZE = 50;
 								        int expected[EXPECTEDSIZE];
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								        int expectedcount = 0;
 								        monkey.setText(ustr);
-												ICU-2292 missing 'int i' on MSVC.  for (int i;...   is not portable

X-SVN-Rev: 13600
											
										
										
											2003-11-06 04:38:11 +00:00
+								        int i;
 								        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
-												ICU-4157 4.1 RBBI changes. Stub out TestLineBreaks, which is looping; real fix to come later.

X-SVN-Rev: 17106
											
										
										
											2005-01-11 00:49:22 +00:00
+								            if (expectedcount >= EXPECTEDSIZE) {
 								                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
 								                return;
 								            }
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								            expected[expectedcount ++] = i;
 								        }
-												ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
											
										
										
											2003-11-09 06:52:44 +00:00
+								        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								    }
-												ICU-3259 Fix a memory leak

X-SVN-Rev: 13886
											
										
										
											2003-11-25 21:28:23 +00:00
+								    delete bi;
-												ICU-3831 uconfig fixes

X-SVN-Rev: 15842
											
										
										
											2004-06-10 23:51:33 +00:00
+								#endif
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								}
 								void RBBITest::TestSentBreaks(void)
 								{
-												ICU-5282 Fix problems found by uconfigtest.

X-SVN-Rev: 19922
											
										
										
											2006-07-28 22:58:29 +00:00
+								#if !UCONFIG_NO_REGULAR_EXPRESSIONS
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								    Locale        locale("en");
 								    UErrorCode    status = U_ZERO_ERROR;
 								    BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
-												ICU-5161 Properly delete memory, and make sure there is enough space to unescape a string.

X-SVN-Rev: 19555
											
										
										
											2006-04-16 17:28:00 +00:00
+								    UChar         str[200];
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								    static const char *strlist[] =
-												ICU-2292 sentence break rules updated

X-SVN-Rev: 13649
											
										
										
											2003-11-09 20:32:00 +00:00
+								    {
 								     "Now\ris\nthe\r\ntime\n\rfor\r\r",
 								     "This\n",
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								     "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
 								     "\"Sentence ending with a quote.\" Bye.",
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								     "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								     "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
 								     "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
 								     "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
 								     "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
 								     "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								     "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
 								             "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
 								             "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
 								             "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								     "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								             "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								             "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								             "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
 								             "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								             "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								    };
-												ICU-2292 missing 'int i' on MSVC.  for (int i;...   is not portable

X-SVN-Rev: 13600
											
										
										
											2003-11-06 04:38:11 +00:00
+								    int loop;
-												ICU-4098 Prevent missing break iterators from crashing the build.

X-SVN-Rev: 16310
											
										
										
											2004-09-13 15:39:02 +00:00
+								    if (U_FAILURE(status)) {
 								        errln("Creation of break iterator failed %s", u_errorName(status));
 								        return;
 								    }
 								    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
-												ICU-5161 Properly delete memory, and make sure there is enough space to unescape a string.

X-SVN-Rev: 19555
											
										
										
											2006-04-16 17:28:00 +00:00
+								        u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								        UnicodeString ustr(str);
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								        RBBISentMonkey monkey;
 								        if (U_FAILURE(monkey.deferredStatus)) {
 								            continue;
 								        }
 								        const int EXPECTEDSIZE = 50;
 								        int expected[EXPECTEDSIZE];
 								        int expectedcount = 0;
 								        monkey.setText(ustr);
-												ICU-2292 missing 'int i' on MSVC.  for (int i;...   is not portable

X-SVN-Rev: 13600
											
										
										
											2003-11-06 04:38:11 +00:00
+								        int i;
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
 								            if (expectedcount >= EXPECTEDSIZE) {
 								                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
 								                return;
 								            }
 								            expected[expectedcount ++] = i;
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								        }
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
 								        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								    }
-												ICU-3259 Fix a memory leak

X-SVN-Rev: 13886
											
										
										
											2003-11-25 21:28:23 +00:00
+								    delete bi;
-												ICU-5282 Fix problems found by uconfigtest.

X-SVN-Rev: 19922
											
										
										
											2006-07-28 22:58:29 +00:00
+								#endif
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								}
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								void RBBITest::TestMonkey(char *params) {
-												ICU-2896 build without regex (disable monkey test in that situation)

X-SVN-Rev: 12153
											
										
										
											2003-05-29 00:54:50 +00:00
+								#if !UCONFIG_NO_REGULAR_EXPRESSIONS
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								    UErrorCode     status    = U_ZERO_ERROR;
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								    int32_t        loopCount = 500;
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								    int32_t        seed      = 1;
 								    UnicodeString  breakType = "all";
 								    Locale         locale("en");
-												ICU-3944 Text Access, rbbi impl fixes and tests added.

X-SVN-Rev: 18172
											
										
										
											2005-07-08 01:57:58 +00:00
+								    UBool          useUText  = FALSE;
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
-												ICU-2093 Monkey test, fixed intermittent failure from uninitialized variable.

X-SVN-Rev: 12122
											
										
										
											2003-05-27 17:59:26 +00:00
+								    if (quick == FALSE) {
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								        loopCount = 10000;
-												ICU-2093 Monkey test, fixed intermittent failure from uninitialized variable.

X-SVN-Rev: 12122
											
										
										
											2003-05-27 17:59:26 +00:00
+								    }
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
 								    if (params) {
 								        UnicodeString p(params);
-												ICU-2093 Monkey test, fixed intermittent failure from uninitialized variable.

X-SVN-Rev: 12122
											
										
										
											2003-05-27 17:59:26 +00:00
+								        loopCount = getIntParam("loop", p, loopCount);
 								        seed      = getIntParam("seed", p, seed);
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								        RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								        if (m.find()) {
 								            breakType = m.group(1, status);
 								            m.reset();
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								            p = m.replaceFirst("", status);
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								        }
-												ICU-3944 Text Access, rbbi impl fixes and tests added.

X-SVN-Rev: 18172
											
										
										
											2005-07-08 01:57:58 +00:00
+								        RegexMatcher u(" *utext", p, 0, status);
 								        if (u.find()) {
 								            useUText = TRUE;
 								            u.reset();
 								            p = u.replaceFirst("", status);
 								        }
 								        // m.reset(p);
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								        if (RegexMatcher("\\S", p, 0, status).find()) {
 								            // Each option is stripped out of the option string as it is processed.
 								            // All options have been checked.  The option string should have been completely emptied.
 								            char buf[100];
 								            p.extract(buf, sizeof(buf), NULL, status);
 								            buf[sizeof(buf)-1] = 0;
 								            errln("Unrecognized or extra parameter:  %s\n", buf);
 								            return;
 								        }
 								    }
 								    if (breakType == "char" || breakType == "all") {
 								        RBBICharMonkey  m;
 								        BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
-												ICU-4098 Prevent missing break iterators from crashing the build.

X-SVN-Rev: 16310
											
										
										
											2004-09-13 15:39:02 +00:00
+								        if (U_SUCCESS(status)) {
-												ICU-3944 Text Access, rbbi impl fixes and tests added.

X-SVN-Rev: 18172
											
										
										
											2005-07-08 01:57:58 +00:00
+								            RunMonkey(bi, m, "char", seed, loopCount, useUText);
 								            if (breakType == "all" && useUText==FALSE) {
 								                // Also run a quick test with UText when "all" is specified
-												ICU-4269 Tweak sentence break rules; fix rbbi UText problem.

X-SVN-Rev: 18591
											
										
										
											2005-09-27 18:21:18 +00:00
+								                RunMonkey(bi, m, "char", seed, loopCount, TRUE);
-												ICU-3944 Text Access, rbbi impl fixes and tests added.

X-SVN-Rev: 18172
											
										
										
											2005-07-08 01:57:58 +00:00
+								            }
-												ICU-4098 Prevent missing break iterators from crashing the build.

X-SVN-Rev: 16310
											
										
										
											2004-09-13 15:39:02 +00:00
+								        }
 								        else {
 								            errln("Creation of character break iterator failed %s", u_errorName(status));
 								        }
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								        delete bi;
 								    }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								    if (breakType == "word" || breakType == "all") {
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12706
											
										
										
											2003-07-29 06:35:54 +00:00
+								        logln("Word Break Monkey Test");
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								        RBBIWordMonkey  m;
 								        BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
-												ICU-4098 Prevent missing break iterators from crashing the build.

X-SVN-Rev: 16310
											
										
										
											2004-09-13 15:39:02 +00:00
+								        if (U_SUCCESS(status)) {
-												ICU-3944 Text Access, rbbi impl fixes and tests added.

X-SVN-Rev: 18172
											
										
										
											2005-07-08 01:57:58 +00:00
+								            RunMonkey(bi, m, "word", seed, loopCount, useUText);
-												ICU-4098 Prevent missing break iterators from crashing the build.

X-SVN-Rev: 16310
											
										
										
											2004-09-13 15:39:02 +00:00
+								        }
 								        else {
 								            errln("Creation of word break iterator failed %s", u_errorName(status));
 								        }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								        delete bi;
 								    }
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								    if (breakType == "line" || breakType == "all") {
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12706
											
										
										
											2003-07-29 06:35:54 +00:00
+								        logln("Line Break Monkey Test");
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								        RBBILineMonkey  m;
 								        BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
-												ICU-4269 Add a sentence break monkey test

X-SVN-Rev: 18588
											
										
										
											2005-09-26 23:58:54 +00:00
+								        if (loopCount >= 10) {
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								            loopCount = loopCount / 5;   // Line break runs slower than the others.
-												ICU-2128 fix inf loop bug in RBBI Line Monkey; simplify RBBI word monkey

X-SVN-Rev: 13272
											
										
										
											2003-10-02 00:18:13 +00:00
+								        }
-												ICU-4098 Prevent missing break iterators from crashing the build.

X-SVN-Rev: 16310
											
										
										
											2004-09-13 15:39:02 +00:00
+								        if (U_SUCCESS(status)) {
-												ICU-3944 Text Access, rbbi impl fixes and tests added.

X-SVN-Rev: 18172
											
										
										
											2005-07-08 01:57:58 +00:00
+								            RunMonkey(bi, m, "line", seed, loopCount, useUText);
-												ICU-4098 Prevent missing break iterators from crashing the build.

X-SVN-Rev: 16310
											
										
										
											2004-09-13 15:39:02 +00:00
+								        }
 								        else {
 								            errln("Creation of line break iterator failed %s", u_errorName(status));
 								        }
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								        delete bi;
 								    }
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								    if (breakType == "sent" || breakType == "all"  ) {
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								        logln("Sentence Break Monkey Test");
 								        RBBISentMonkey  m;
 								        BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
-												ICU-4269 Add a sentence break monkey test

X-SVN-Rev: 18588
											
										
										
											2005-09-26 23:58:54 +00:00
+								        if (loopCount >= 10) {
 								            loopCount = loopCount / 10;   // Sentence runs slower than the other break types
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								        }
 								        if (U_SUCCESS(status)) {
 								            RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
 								        }
 								        else {
 								            errln("Creation of line break iterator failed %s", u_errorName(status));
 								        }
 								        delete bi;
 								    }
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
-												ICU-2896 build without regex (disable monkey test in that situation)

X-SVN-Rev: 12153
											
										
										
											2003-05-29 00:54:50 +00:00
+								#endif
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								}
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12706
											
										
										
											2003-07-29 06:35:54 +00:00
+								//
 								//  Run a RBBI monkey test.  Common routine, for all break iterator types.
 								//    Parameters:
 								//       bi      - the break iterator to use
 								//       mk      - MonkeyKind, abstraction for obtaining expected results
 								//       name    - Name of test (char, word, etc.) for use in error messages
 								//       seed    - Seed for starting random number generator (parameter from user)
 								//       numIterations
 								//
-												ICU-5722 add ICU tests for break test data from the Unicode web site.

X-SVN-Rev: 21591
											
										
										
											2007-05-29 22:56:20 +00:00
+								void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
-												ICU-3944 Text Access, rbbi impl fixes and tests added.

X-SVN-Rev: 18172
											
										
										
											2005-07-08 01:57:58 +00:00
+								                         int32_t numIterations, UBool useUText) {
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
-												ICU-2896 build without regex (disable monkey test in that situation)

X-SVN-Rev: 12153
											
										
										
											2003-05-29 00:54:50 +00:00
+								#if !UCONFIG_NO_REGULAR_EXPRESSIONS
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								    const int32_t    TESTSTRINGLEN = 500;
 								    UnicodeString    testText;
 								    int32_t          numCharClasses;
 								    UVector          *chClasses;
-												ICU-2292 line break rules updated, 15 mins testmonkey passes

X-SVN-Rev: 13663
											
										
										
											2003-11-11 21:24:09 +00:00
+								    int              expected[TESTSTRINGLEN*2 + 1];
 								    int              expectedCount = 0;
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								    char             expectedBreaks[TESTSTRINGLEN*2 + 1];
 								    char             forwardBreaks[TESTSTRINGLEN*2 + 1];
 								    char             reverseBreaks[TESTSTRINGLEN*2+1];
-												ICU-2128 add isBoundary() check to RBBI MonkeyTest

X-SVN-Rev: 13304
											
										
										
											2003-10-03 05:05:13 +00:00
+								    char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
-												ICU-3728 extend rbbi monkey test to cover following(), previous() funcs

X-SVN-Rev: 15347
											
										
										
											2004-05-17 23:16:00 +00:00
+								    char             followingBreaks[TESTSTRINGLEN*2+1];
 								    char             precedingBreaks[TESTSTRINGLEN*2+1];
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								    int              i;
 								    int              loopCount = 0;
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								    m_seed = seed;
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
 								    numCharClasses = mk.charClasses()->size();
 								    chClasses      = mk.charClasses();
 								    // Check for errors that occurred during the construction of the MonkeyKind object.
 								    //  Can't report them where they occurred because errln() is a method coming from intlTest,
 								    //  and is not visible outside of RBBITest :-(
 								    if (U_FAILURE(mk.deferredStatus)) {
 								        errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
 								        return;
 								    }
 								    // Verify that the character classes all have at least one member.
 								    for (i=0; i<numCharClasses; i++) {
-												ICU-2093 RBBI Monkey Test, crash on 64 bit platforms fixed.

X-SVN-Rev: 12183
											
										
										
											2003-05-29 23:39:54 +00:00
+								        UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								        if (s == NULL || s->size() == 0) {
 								            errln("Character Class #%d is null or of zero size.", i);
 								            return;
 								        }
 								    }
-												ICU-2924 RBBI, line break rules,  monkey test, a few more fixes

X-SVN-Rev: 13402
											
										
										
											2003-10-13 22:01:53 +00:00
+								    while (loopCount < numIterations || numIterations == -1) {
 								        if (numIterations == -1 && loopCount % 10 == 0) {
-												ICU-2128 RBBI monkey test, add periodic  tic output when running with an infinite loop count

X-SVN-Rev: 13314
											
										
										
											2003-10-03 22:27:35 +00:00
+								            // If test is running in an infinite loop, display a periodic tic so
 								            //   we can tell that it is making progress.
 								            fprintf(stderr, ".");
 								        }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								        // Save current random number seed, so that we can recreate the random numbers
 								        //   for this loop iteration in event of an error.
 								        seed = m_seed;
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								        // Populate a test string with data.
 								        testText.truncate(0);
 								        for (i=0; i<TESTSTRINGLEN; i++) {
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								            int32_t  aClassNum = m_rand() % numCharClasses;
-												ICU-2093 RBBI Monkey Test, crash on 64 bit platforms fixed.

X-SVN-Rev: 12183
											
										
										
											2003-05-29 23:39:54 +00:00
+								            UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								            int32_t   charIdx = m_rand() % classSet->size();
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								            UChar32   c = classSet->charAt(charIdx);
-												ICU-3178 Don't use assert.  It's very inconvenient while testing on other platforms.

X-SVN-Rev: 13453
											
										
										
											2003-10-17 16:45:43 +00:00
+								            if (c < 0) {   // TODO:  deal with sets containing strings.
 								                errln("c < 0");
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								                break;
-												ICU-3178 Don't use assert.  It's very inconvenient while testing on other platforms.

X-SVN-Rev: 13453
											
										
										
											2003-10-17 16:45:43 +00:00
+								            }
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								            testText.append(c);
 								        }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								        // Calculate the expected results for this test string.
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								        mk.setText(testText);
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								        memset(expectedBreaks, 0, sizeof(expectedBreaks));
 								        expectedBreaks[0] = 1;
 								        int32_t breakPos = 0;
-												ICU-2292 line break rules updated, 15 mins testmonkey passes

X-SVN-Rev: 13663
											
										
										
											2003-11-11 21:24:09 +00:00
+								        expectedCount = 0;
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								        for (;;) {
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								            breakPos = mk.next(breakPos);
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								            if (breakPos == -1) {
 								                break;
 								            }
-												ICU-3178 Don't use assert.  It's very inconvenient while testing on other platforms.

X-SVN-Rev: 13453
											
										
										
											2003-10-17 16:45:43 +00:00
+								            if (breakPos > testText.length()) {
 								                errln("breakPos > testText.length()");
 								            }
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								            expectedBreaks[breakPos] = 1;
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
+								            U_ASSERT(expectedCount<testText.length());
-												ICU-2292 line break rules updated, 15 mins testmonkey passes

X-SVN-Rev: 13663
											
										
										
											2003-11-11 21:24:09 +00:00
+								            expected[expectedCount ++] = breakPos;
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								        }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								        // Find the break positions using forward iteration
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								        memset(forwardBreaks, 0, sizeof(forwardBreaks));
-												ICU-3944 Text Access, rbbi impl fixes and tests added.

X-SVN-Rev: 18172
											
										
										
											2005-07-08 01:57:58 +00:00
+								        if (useUText) {
 								            UErrorCode status = U_ZERO_ERROR;
 								            UText *testUText = utext_openReplaceable(NULL, &testText, &status);
 								            // testUText = utext_openUnicodeString(testUText, &testText, &status);
 								            bi->setText(testUText, status);
 								            TEST_ASSERT_SUCCESS(status);
 								            utext_close(testUText);   // The break iterator does a shallow clone of the UText
 								                                      //  This UText can be closed immediately, so long as the
 								                                      //  testText string continues to exist.
 								        } else {
 								            bi->setText(testText);
 								        }
-												ICU-4269 rbbi sentence break monkey test & rule updates.  Work in progress, sentence breaks not in good shape now.

X-SVN-Rev: 18534
											
										
										
											2005-09-15 23:23:24 +00:00
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								        for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
 								            if (i < 0 || i > testText.length()) {
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12706
											
										
										
											2003-07-29 06:35:54 +00:00
+								                errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								                break;
 								            }
 								            forwardBreaks[i] = 1;
 								        }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								        // Find the break positions using reverse iteration
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								        memset(reverseBreaks, 0, sizeof(reverseBreaks));
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								        for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
 								            if (i < 0 || i > testText.length()) {
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12706
											
										
										
											2003-07-29 06:35:54 +00:00
+								                errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								                break;
 								            }
 								            reverseBreaks[i] = 1;
 								        }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-2128 add isBoundary() check to RBBI MonkeyTest

X-SVN-Rev: 13304
											
										
										
											2003-10-03 05:05:13 +00:00
+								        // Find the break positions using isBoundary() tests.
 								        memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
-												ICU-4707 Fix some compiler warnings.

X-SVN-Rev: 20028
											
										
										
											2006-08-11 04:05:04 +00:00
+								        U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
-												ICU-2128 add isBoundary() check to RBBI MonkeyTest

X-SVN-Rev: 13304
											
										
										
											2003-10-03 05:05:13 +00:00
+								        for (i=0; i<=testText.length(); i++) {
 								            isBoundaryBreaks[i] = bi->isBoundary(i);
 								        }
-												ICU-3728 extend rbbi monkey test to cover following(), previous() funcs

X-SVN-Rev: 15347
											
										
										
											2004-05-17 23:16:00 +00:00
+								        // Find the break positions using the following() function.
 								        // printf(".");
 								        memset(followingBreaks, 0, sizeof(followingBreaks));
 								        int32_t   lastBreakPos = 0;
 								        followingBreaks[0] = 1;
 								        for (i=0; i<testText.length(); i++) {
 								            breakPos = bi->following(i);
 								            if (breakPos <= i ||
 								                breakPos < lastBreakPos ||
 								                breakPos > testText.length() ||
 								                breakPos > lastBreakPos && lastBreakPos > i ) {
 								                errln("%s break monkey test: "
 								                    "Out of range value returned by BreakIterator::following().\n"
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								                        "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
 								                         name, seed, i, breakPos, lastBreakPos);
-												ICU-3728 extend rbbi monkey test to cover following(), previous() funcs

X-SVN-Rev: 15347
											
										
										
											2004-05-17 23:16:00 +00:00
+								                break;
 								            }
 								            followingBreaks[breakPos] = 1;
 								            lastBreakPos = breakPos;
 								        }
 								        // Find the break positions using the preceding() function.
 								        memset(precedingBreaks, 0, sizeof(followingBreaks));
 								        lastBreakPos = testText.length();
 								        precedingBreaks[testText.length()] = 1;
 								        for (i=testText.length(); i>0; i--) {
 								            breakPos = bi->preceding(i);
 								            if (breakPos >= i ||
 								                breakPos > lastBreakPos ||
-												ICU-5242 break rule updates for Unicode 5.0

X-SVN-Rev: 19747
											
										
										
											2006-06-22 01:10:54 +00:00
+								                breakPos < 0 && testText.getChar32Start(i)>0 ||
 								                breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i) ) {
-												ICU-3728 extend rbbi monkey test to cover following(), previous() funcs

X-SVN-Rev: 15347
											
										
										
											2004-05-17 23:16:00 +00:00
+								                errln("%s break monkey test: "
 								                    "Out of range value returned by BreakIterator::preceding().\n"
 								                    "index=%d;  prev returned %d; lastBreak=%d" ,
 								                    name,  i, breakPos, lastBreakPos);
 								                precedingBreaks[i] = 2;   // Forces an error.
 								            } else {
 								                precedingBreaks[breakPos] = 1;
 								                lastBreakPos = breakPos;
 								            }
 								        }
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								        // Compare the expected and actual results.
 								        for (i=0; i<=testText.length(); i++) {
-												ICU-3222 Fix some compiler warnings.

X-SVN-Rev: 13338
											
										
										
											2003-10-07 16:12:46 +00:00
+								            const char *errorType = NULL;
-												ICU-2128 add isBoundary() check to RBBI MonkeyTest

X-SVN-Rev: 13304
											
										
										
											2003-10-03 05:05:13 +00:00
+								            if  (forwardBreaks[i] != expectedBreaks[i]) {
 								                errorType = "next()";
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								            } else if (reverseBreaks[i] != forwardBreaks[i]) {
-												ICU-2128 add isBoundary() check to RBBI MonkeyTest

X-SVN-Rev: 13304
											
										
										
											2003-10-03 05:05:13 +00:00
+								                errorType = "previous()";
 								            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
 								                errorType = "isBoundary()";
-												ICU-3728 extend rbbi monkey test to cover following(), previous() funcs

X-SVN-Rev: 15347
											
										
										
											2004-05-17 23:16:00 +00:00
+								            } else if (followingBreaks[i] != expectedBreaks[i]) {
 								                errorType = "following()";
 								            } else if (precedingBreaks[i] != expectedBreaks[i]) {
 								                errorType = "preceding()";
-												ICU-2128 add isBoundary() check to RBBI MonkeyTest

X-SVN-Rev: 13304
											
										
										
											2003-10-03 05:05:13 +00:00
+								            }
 								            if (errorType != NULL) {
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								                // Format a range of the test text that includes the failure as
 								                //  a data item that can be included in the rbbi test data file.
 								                // Start of the range is the last point where expected and actual results
 								                //   both agreed that there was a break position.
 								                int startContext = i;
-												ICU-2292 word breaks fixed and passing (i think)

X-SVN-Rev: 13605
											
										
										
											2003-11-06 20:00:46 +00:00
+								                int32_t count = 0;
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								                for (;;) {
 								                    if (startContext==0) { break; }
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								                    startContext --;
-												ICU-2292 word breaks fixed and passing (i think)

X-SVN-Rev: 13605
											
										
										
											2003-11-06 20:00:46 +00:00
+								                    if (expectedBreaks[startContext] != 0) {
 								                        if (count == 2) break;
 								                        count ++;
 								                    }
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								                }
 								                // End of range is two expected breaks past the start position.
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								                int endContext = i + 1;
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								                int ci;
 								                for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
 								                    for (;;) {
 								                        if (endContext >= testText.length()) {break;}
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								                        if (expectedBreaks[endContext-1] != 0) {
-												ICU-2292 word breaks fixed and passing (i think)

X-SVN-Rev: 13605
											
										
										
											2003-11-06 20:00:46 +00:00
+								                            if (count == 0) break;
 								                            count --;
 								                        }
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								                        endContext ++;
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								                    }
 								                }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								                // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								                UnicodeString errorText = "<data>";
-												ICU-2292 line break rules updated, 15 mins testmonkey passes

X-SVN-Rev: 13663
											
										
										
											2003-11-11 21:24:09 +00:00
+								                /***if (strcmp(errorType, "next()") == 0) {
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								                    startContext = 0;
-												ICU-2292 line break rules updated, 15 mins testmonkey passes

X-SVN-Rev: 13663
											
										
										
											2003-11-11 21:24:09 +00:00
+								                    endContext = testText.length();
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
-												ICU-2292 line break rules updated, 15 mins testmonkey passes

X-SVN-Rev: 13663
											
										
										
											2003-11-11 21:24:09 +00:00
+								                    printStringBreaks(testText, expected, expectedCount);
-												ICU-2292 word breaks fixed and passing (i think)

X-SVN-Rev: 13605
											
										
										
											2003-11-06 20:00:46 +00:00
+								                }***/
-												ICU-2292 line break rules updated, 15 mins testmonkey passes

X-SVN-Rev: 13663
											
										
										
											2003-11-11 21:24:09 +00:00
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								                for (ci=startContext; ci<endContext;) {
 								                    UnicodeString hexChars("0123456789abcdef");
 								                    UChar32  c;
 								                    int      bn;
 								                    c = testText.char32At(ci);
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12643
											
										
										
											2003-07-21 05:37:08 +00:00
+								                    if (ci == i) {
 								                        // This is the location of the error.
 								                        errorText.append("<?>");
 								                    } else if (expectedBreaks[ci] != 0) {
 								                        // This is a non-error expected break position.
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								                        errorText.append("\\");
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								                    }
 								                    if (c < 0x10000) {
 								                        errorText.append("\\u");
 								                        for (bn=12; bn>=0; bn-=4) {
 								                            errorText.append(hexChars.charAt((c>>bn)&0xf));
 								                        }
 								                    } else {
 								                        errorText.append("\\U");
 								                        for (bn=28; bn>=0; bn-=4) {
 								                            errorText.append(hexChars.charAt((c>>bn)&0xf));
 								                        }
 								                    }
 								                    ci = testText.moveIndex32(ci, 1);
 								                }
-												ICU-5242 unicode 5.0 rbbi rules update

X-SVN-Rev: 19758
											
										
										
											2006-06-26 04:54:00 +00:00
+								                errorText.append("\\");
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								                errorText.append("</data>\n");
 								                // Output the error
-												ICU-2292 first cut of performance improvements, test failures commented out.

X-SVN-Rev: 13596
											
										
										
											2003-11-05 23:50:39 +00:00
+								                char  charErrorTxt[500];
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								                UErrorCode status = U_ZERO_ERROR;
 								                errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								                charErrorTxt[sizeof(charErrorTxt)-1] = 0;
-												ICU-2128 add isBoundary() check to RBBI MonkeyTest

X-SVN-Rev: 13304
											
										
										
											2003-10-03 05:05:13 +00:00
+								                errln("%s break monkey test error.  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
-												ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12706
											
										
										
											2003-07-29 06:35:54 +00:00
+								                    name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
-												ICU-2128 add isBoundary() check to RBBI MonkeyTest

X-SVN-Rev: 13304
											
										
										
											2003-10-03 05:05:13 +00:00
+								                    errorType, seed, i, charErrorTxt);
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								                break;
 								            }
 								        }
 								        loopCount++;
 								    }
-												ICU-2093 Word Breaks, monkey test and rule fixes.

X-SVN-Rev: 12171
											
										
										
											2003-05-29 21:15:14 +00:00
+								#endif
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
+								}
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								//
 								//  TestDebug    -  A place-holder test for debugging purposes.
 								//                  Put fragments of other tests here so that they can be invoked
 								//                  for tracing without a lot of unwanted extra work happening.
 								//
 								void RBBITest::TestDebug(void) {
 								    // NOTE: everything between the #if 0 / #endif pair below is compiled out, so
 								    //       as written this test does nothing when it runs.  Change the #if 0 to
 								    //       #if 1 locally to use the fragment for tracing.
 								#if 0
 								    UErrorCode   status = U_ZERO_ERROR;
-												ICU-4269 Add a sentence break monkey test

X-SVN-Rev: 18588
											
										
										
											2005-09-26 23:58:54 +00:00
+								    int pos = 0;
 								    // ruleStatus keeps its initial value of 0: the getRuleStatus() call in the
 								    // loop further down is commented out.
 								    int ruleStatus = 0;
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
 								    // Only one of the factory calls below is live; the others are kept as
 								    // commented-out alternatives for switching the iterator type being debugged.
 								    // NOTE(review): status is never checked and bi is never deleted in this fragment.
 								    RuleBasedBreakIterator* bi =
 								       // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
-												ICU-4269 Add a sentence break monkey test

X-SVN-Rev: 18588
											
										
										
											2005-09-26 23:58:54 +00:00
+								       // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
 								       (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
 								    // Test text is written with escape sequences and converted by unescape() below.
 								    UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
 								    // UnicodeString s("Aaa.  Bcd");
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								    s = s.unescape();
 								    bi->setText(s);
-												ICU-4269 Add a sentence break monkey test

X-SVN-Rev: 18588
											
										
										
											2005-09-26 23:58:54 +00:00
+								    UBool r = bi->isBoundary(8);    // probe a single offset (8) in the test text
 								    printf("%s", r?"true":"false");
 								    // Early return: the backwards walk below is unreachable until this
 								    // return is removed.
 								    return;
 								    // Walk the boundaries from the end of the text toward the start,
 								    // printing each position together with ruleStatus.
 								    pos = bi->last();
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								    do {
-												ICU-4269 Add a sentence break monkey test

X-SVN-Rev: 18588
											
										
										
											2005-09-26 23:58:54 +00:00
+								        // ruleStatus = bi->getRuleStatus();
 								        printf("%d\t%d\n", pos, ruleStatus);
 								        pos = bi->previous();
-												ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
											
										
										
											2005-03-23 02:13:53 +00:00
+								    } while (pos != BreakIterator::DONE);
 								#endif
 								}
-												ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
											
										
										
											2003-05-27 16:29:25 +00:00
-												ICU-2248 modularize ICU

X-SVN-Rev: 9910
											
										
										
											2002-09-21 00:43:14 +00:00
+								#endif /* #if !UCONFIG_NO_BREAK_ITERATION */