scuffed-code/icu4c/source/test/thaitest/thaitest.cpp

/*
 ******************************************************************************
 * Copyright (C) 1998-2001, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                               *
 ******************************************************************************
 */

#include <errno.h>
#include <stdio.h>
#include <string.h>

#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/uchriter.h"
#include "unicode/brkiter.h"
#include "unicode/locid.h"
#include "unicode/unistr.h"

/*
 * This program takes a Unicode text file containing Thai text with
 * spaces inserted where the word breaks are. It computes a copy of
 * the text without spaces and uses a word instance of a Thai BreakIterator
 * to compute the word breaks. The program reports any differences in the
 * breaks.
 *
 * NOTE: by its very nature, Thai word breaking is not exact, so it is
 * expected that this program will always report some differences.
 */

/*
 * This class is a break iterator that counts words and spaces.
 *
 * It walks text that contains spaces with an underlying word BreakIterator
 * and reports each break with the number of spaces seen so far subtracted
 * from it (see next()).
 */
class SpaceBreakIterator
{
public:
    // The constructor:
    // text  - pointer to an array of UChars to iterate over
    // count - the number of UChars in text
    SpaceBreakIterator(const UChar *text, int32_t count);

    // the destructor: deletes the underlying BreakIterator
    ~SpaceBreakIterator();

    // return next break position, or BreakIterator::DONE at the end
    int32_t next();

    // return current word count
    int32_t getWordCount();

    // return current space count
    int32_t getSpaceCount();

private:
    // No arg constructor: private so clients can't call it.
    SpaceBreakIterator();

    // Copying is disabled: this class owns fBreakIter and deletes it in
    // the destructor, so a member-wise copy would delete it twice.
    // Declared private and intentionally never defined.
    SpaceBreakIterator(const SpaceBreakIterator &other);
    SpaceBreakIterator &operator=(const SpaceBreakIterator &other);

    // The underlying BreakIterator (owned by this object)
    BreakIterator *fBreakIter;

    // address of the UChar array (not owned)
    const UChar *fText;

    // number of UChars in fText
    int32_t fTextCount;

    // current word count
    int32_t fWordCount;

    // current space count
    int32_t fSpaceCount;

    // true when fBreakIter has returned DONE
    UBool fDone;
};

/*
 * This is the main class. It compares word breaks and reports the differences.
 *
 * The comparison runs from the main constructor; afterwards the counts can
 * be read back through the getters. readFile and crunchSpaces are static
 * helpers that produce the two UChar arrays the constructor needs.
 */
class ThaiWordbreakTest
{
public:
    // The main constructor (runs the comparison immediately):
    // spaces       - pointer to a UChar array for the text with spaces
    // spaceCount   - the number of characters in the spaces array
    // noSpaces     - pointer to a UChar array for the text without spaces
    // noSpaceCount - the number of characters in the noSpaces array
    // verbose      - report all breaks if true, otherwise just report differences
    ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose);
    ~ThaiWordbreakTest();

    // returns the number of breaks that are in the spaces array
    // but aren't found in the noSpaces array
    int32_t getBreaksNotFound();

    // returns the number of breaks which are found in the noSpaces
    // array but aren't in the spaces array
    int32_t getInvalidBreaks();

    // returns the number of words found in the spaces array
    int32_t getWordCount();

    // reads the input Unicode text file:
    // fileName  - the path name of the file
    // charCount - set to the number of UChars read from the file
    // returns   - the address of the UChar array containing the characters,
    //             or 0 if the file could not be read
    static const UChar *readFile(char *fileName, int32_t &charCount);

    // removes spaces from the input UChar array:
    // spaces        - pointer to the input UChar array
    // count         - number of UChars in the spaces array
    // nonSpaceCount - set to the number of UChars in the result array
    // returns       - the address of the UChar array with spaces removed,
    //                 or 0 if it could not be allocated
    static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount);

private:
    // The no arg constructor - private so clients can't call it
    ThaiWordbreakTest();

    // This does the actual comparison:
    // spaces - the address of the UChar array for the text with spaces
    // spaceCount - the number of UChars in the spaces array
    // noSpaces   - the address of the UChar array for the text without spaces
    // noSpaceCount - the number of UChars in the noSpaces array
    // returns      - true if all breaks match, false otherwise
    UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,
                            const UChar *noSpaces, int32_t noSpaceCount);

    // helper method to report a break in the spaces
    // array that's not found in the noSpaces array;
    // also bumps fBreaksNotFound
    void breakNotFound(int32_t br);

    // helper method to report a break that's found in
    // the noSpaces array that's not in the spaces array;
    // also bumps fInvalidBreaks
    void foundInvalidBreak(int32_t br);

    // count of breaks in the spaces array that
    // aren't found in the noSpaces array
    int32_t fBreaksNotFound;

    // count of breaks found in the noSpaces array
    // that aren't in the spaces array
    int32_t fInvalidBreaks;

    // number of words found in the spaces array
    int32_t fWordCount;

    // report all breaks if true, otherwise just report differences
    UBool fVerbose;
};

/*
 * The main constructor: zero the counters, remember the verbose flag and
 * then run the comparison right away. Differences are reported while
 * compareWordBreaks runs; its boolean result is not used here.
 */
ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,
                                     const UChar *noSpaces, int32_t noSpaceCount, UBool verbose)
    : fBreaksNotFound(0),
      fInvalidBreaks(0),
      fWordCount(0),
      fVerbose(verbose)
{
    this->compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
}

/*
 * The no arg constructor. It is private and runs no comparison; it only
 * puts every member into a defined state (zero counts, not verbose) so
 * that the getters never read indeterminate values.
 */
ThaiWordbreakTest::ThaiWordbreakTest()
: fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(false)
{
    // nothing else to do
}

/*
 * The destructor. The members are plain counters and a flag,
 * so there is nothing to release here.
 */
ThaiWordbreakTest::~ThaiWordbreakTest()
{
}

/*
 * Accessor for the count of breaks that are present in the
 * spaces array but were not found in the noSpaces array.
 */
inline int32_t ThaiWordbreakTest::getBreaksNotFound()
{
    return this->fBreaksNotFound;
}

/*
 * Accessor for the count of breaks that were found in the
 * noSpaces array but are not present in the spaces array.
 */
inline int32_t ThaiWordbreakTest::getInvalidBreaks()
{
    return this->fInvalidBreaks;
}

/*
 * Accessor for the number of words counted in the spaces array.
 */
inline int32_t ThaiWordbreakTest::getWordCount()
{
    return this->fWordCount;
}

/*
 * This method does the actual break comparison and reports the results.
 * It uses a SpaceBreakIterator to iterate over the text with spaces,
 * and a word instance of a Thai BreakIterator to iterate over the text
 * without spaces.
 *
 * spaces / spaceCount     - the text with spaces and its length in UChars
 * noSpaces / noSpaceCount - the text without spaces and its length in UChars
 *
 * Returns true if every break matched. Returns false if any difference was
 * reported, or if the Thai break iterator could not be created (in which
 * case no comparison is done and fWordCount is left unchanged).
 */
UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
                                           const UChar *noSpaces, int32_t noSpaceCount)
{
    UBool result = true;
    Locale thai("th");
    UErrorCode status = U_ZERO_ERROR;

    BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);

    // createWordInstance reports failure through status; don't touch the
    // iterator unless it was really created.
    if (U_FAILURE(status) || breakIter == 0) {
        fprintf(stderr, "Couldn't create a Thai word break iterator: %s\n", u_errorName(status));
        delete breakIter;
        return false;
    }

    // The break iterator adopts (takes ownership of) the character
    // iterator, so it is not deleted separately below.
    breakIter->adoptText(new UCharCharacterIterator(noSpaces, noSpaceCount));

    SpaceBreakIterator spaceIter(spaces, spaceCount);

    int32_t nextBreak = 0;
    int32_t nextSpaceBreak = 0;

    while (true) {
        nextSpaceBreak = spaceIter.next();
        nextBreak = breakIter->next();

        if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {
            // One side is finished; complain if the other one isn't.
            if (nextBreak != BreakIterator::DONE) {
                fprintf(stderr, "break iterator didn't end.\n");
            } else if (nextSpaceBreak != BreakIterator::DONE) {
                fprintf(stderr, "premature break iterator end.\n");
            }

            break;
        }

        // Advance whichever side is behind until the two positions agree
        // again (or one side runs out), reporting each skipped break.
        while (nextSpaceBreak != nextBreak &&
               nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
            if (nextSpaceBreak < nextBreak) {
                breakNotFound(nextSpaceBreak);
                result = false;
                nextSpaceBreak = spaceIter.next();
            } else if (nextSpaceBreak > nextBreak) {
                foundInvalidBreak(nextBreak);
                result = false;
                nextBreak = breakIter->next();
            }
        }

        if (fVerbose) {
            printf("%d   %d\n", nextSpaceBreak, nextBreak);
        }
    }

    fWordCount = spaceIter.getWordCount();

    delete breakIter;

    return result;
}

/*
 * Count and report a break that exists in the text with spaces but
 * was not produced for the text without spaces. In verbose mode the
 * report goes to stdout as a table row, otherwise to stderr.
 */
void ThaiWordbreakTest::breakNotFound(int32_t br)
{
    ++fBreaksNotFound;

    if (!fVerbose) {
        fprintf(stderr, "break not found: %d\n", br);
        return;
    }

    printf("%d   ****\n", br);
}

/*
 * Count and report a break that was produced for the text without
 * spaces but does not exist in the text with spaces. In verbose mode
 * the report goes to stdout as a table row, otherwise to stderr.
 */
void ThaiWordbreakTest::foundInvalidBreak(int32_t br)
{
    ++fInvalidBreaks;

    if (!fVerbose) {
        fprintf(stderr, "found invalid break: %d\n", br);
        return;
    }

    printf("****   %d\n", br);
}

/*
 * Read the text from a file. The bytes are converted from UTF-8; if the
 * text starts with a Unicode Byte Order Mark (U+FEFF) the mark is skipped
 * and not returned.
 *
 * fileName  - the path name of the file
 * charCount - set to the number of UChars returned (0 on failure)
 * returns   - a new[]-allocated, 0-terminated UChar array holding the text,
 *             or 0 if the file could not be opened, sized or read completely.
 *             The terminator is not included in charCount.
 */
const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount)
{
    FILE *f;
    long fileSize;
    size_t bytesRead;

    UChar *buffer;
    char *bufferChars;

    charCount = 0;

    f = fopen(fileName, "rb");

    if( f == NULL ) {
        fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
        return 0;
    }

    // Find the file size by seeking to the end; ftell returns -1 on error.
    if( fseek(f, 0, SEEK_END) != 0 || (fileSize = ftell(f)) < 0 || fseek(f, 0, SEEK_SET) != 0 ) {
        fprintf(stderr,"Couldn't get the size of %s reason: %s \n", fileName, strerror(errno));
        fclose(f);
        return 0;
    }

    // The size is handed to UnicodeString as an int32_t below.
    if( fileSize > 0x7FFFFFFFL ) {
        fprintf(stderr,"Couldn't read %s reason: the file is too large \n", fileName);
        fclose(f);
        return 0;
    }

    bufferChars = new char[fileSize];

    // Only meaningful where operator new reports failure by returning 0.
    if(bufferChars == 0) {
        fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
        fclose(f);
        return 0;
    }

    bytesRead = fread(bufferChars, sizeof(char), fileSize, f);
    if( ferror(f) ) {
        fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
        fclose(f);
        delete[] bufferChars;
        return 0;
    }
    if( bytesRead != static_cast<size_t>(fileSize) ) {
        // A short read without an error flag would otherwise convert
        // uninitialized bytes at the end of bufferChars.
        fprintf(stderr,"Couldn't read all of %s \n", fileName);
        fclose(f);
        delete[] bufferChars;
        return 0;
    }
    fclose(f);

    UnicodeString myText(bufferChars, static_cast<int32_t>(fileSize), "UTF-8");

    delete[] bufferChars;

    // Skip the BOM only when there is one; an empty or BOM-less file
    // must not lose a character or produce a negative count.
    int32_t start = (myText.length() > 0 && myText.charAt(0) == 0xFEFF) ? 1 : 0;
    int32_t textCount = myText.length() - start;

    buffer = new UChar[textCount + 1];    // + 1 for the terminator
    if(buffer == 0) {
        fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
        return 0;
    }

    myText.extract(start, textCount, buffer);
    buffer[textCount] = 0;    // NULL terminate for easier reading in the debugger

    charCount = textCount;

    return buffer;
}

/*
 * Build a copy of the input UChar array with every U+0020 removed.
 *
 * Only the code value 0x0020 is treated as a space; a general
 * "is space" test is deliberately not used because it would also
 * match CR, LF, etc.
 *
 * spaces        - the input UChar array
 * count         - number of UChars in the input
 * nonSpaceCount - set to the number of UChars in the result
 * returns       - the new[]-allocated result array, or 0 if the
 *                 allocation yielded a null pointer
 */
const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount)
{
    const UChar kSpace = 0x0020;

    // First pass: find out how many spaces will be dropped.
    int32_t blanks = 0;
    int32_t pos = 0;

    while (pos < count) {
        if (spaces[pos] == kSpace) {
            ++blanks;
        }

        ++pos;
    }

    nonSpaceCount = count - blanks;

    UChar *crunched = new UChar[nonSpaceCount];

    if (crunched == 0) {
        fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
        return 0;
    }

    // Second pass: copy everything that isn't a space.
    UChar *dest = crunched;

    for (pos = 0; pos < count; ++pos) {
        const UChar ch = spaces[pos];

        if (ch == kSpace) {
            continue;
        }

        *dest++ = ch;
    }

    return crunched;
}

/*
 * The main routine. Read the command line arguments, read the text file,
 * remove the spaces, do the comparison and report the final results
 */
int main(int argc, char **argv)
{
    char *fileName = "space.txt";
    int arg = 1;
    UBool verbose = false;

    if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
        verbose = true;
        arg += 1;
    }

    if (arg == argc - 1) {
        fileName = argv[arg++];
    }

    if (arg != argc) {
        fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
        return 1;
    }

    int32_t spaceCount, nonSpaceCount;
    const UChar *spaces, *noSpaces;

    spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);

    if (spaces == 0) {
        return 1;
    }

    noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);

    if (noSpaces == 0) {
        return 1;
    }

    ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);

    printf("word count: %d\n", test.getWordCount());
    printf("breaks not found: %d\n", test.getBreaksNotFound());
    printf("invalid breaks found: %d\n", test.getInvalidBreaks());

    return 0;
}

/*
 * The main constructor. Clear all the counts and construct a
 * word instance of a BreakIterator over the given text.
 *
 * text  - the UChar array to iterate over (not copied, not owned)
 * count - the number of UChars in text
 *
 * If the BreakIterator can't be created, an error is printed and the
 * object is marked as done, so next() returns BreakIterator::DONE
 * instead of dereferencing a null iterator.
 */
SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
  : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(false)
{
    UErrorCode status = U_ZERO_ERROR;
    // NOTE(review): "us" is passed as the locale ID; presumably "en_US"
    // was intended - left unchanged, confirm before altering.
    Locale us("us");

    fBreakIter = BreakIterator::createWordInstance(us, status);

    if (U_FAILURE(status) || fBreakIter == 0) {
        fprintf(stderr, "Couldn't create a word break iterator: %s\n", u_errorName(status));
        delete fBreakIter;
        fBreakIter = 0;
        fDone = true;
        return;
    }

    // The break iterator adopts (takes ownership of) the character iterator.
    fBreakIter->adoptText(new UCharCharacterIterator(text, count));
}

/*
 * The no arg constructor. It is private; it only puts the object into a
 * defined, already-finished state: no iterator, no text, zero counts.
 * fDone is true so next() returns DONE without touching fBreakIter, and
 * the destructor deletes a null pointer rather than an indeterminate one.
 */
SpaceBreakIterator::SpaceBreakIterator()
  : fBreakIter(0), fText(0), fTextCount(0), fWordCount(0), fSpaceCount(0), fDone(true)
{
    // nothing else to do
}

/*
 * The destructor: release the underlying BreakIterator,
 * which this object owns.
 */
SpaceBreakIterator::~SpaceBreakIterator()
{
    delete this->fBreakIter;
}

/*
 * Return the next break, counting words and spaces.
 *
 * The value returned is the underlying iterator's break position minus
 * the number of space UChars counted so far. When the break lands on a
 * U+0020, the underlying iterator is advanced once more and the length
 * of that extra segment is added to the space count. Once the
 * underlying iterator has returned DONE, every later call returns
 * BreakIterator::DONE.
 */
int32_t SpaceBreakIterator::next()
{
    if (!fDone) {
        const int32_t rawBreak = fBreakIter->next();

        if (rawBreak != BreakIterator::DONE) {
            // Computed before the space count is updated for this break.
            const int32_t adjusted = rawBreak - fSpaceCount;

            if (rawBreak < fTextCount && fText[rawBreak] == 0x0020) {
                const int32_t afterSpace = fBreakIter->next();

                fSpaceCount += afterSpace - rawBreak;
            }

            ++fWordCount;

            return adjusted;
        }

        fDone = true;
    }

    return BreakIterator::DONE;
}

/*
 * Accessor for the number of spaces counted so far.
 */
int32_t SpaceBreakIterator::getSpaceCount()
{
    return this->fSpaceCount;
}

/*
 * Accessor for the number of words counted so far.
 */
int32_t SpaceBreakIterator::getWordCount()
{
    return this->fWordCount;
}
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00			`/*`
			`******************************************************************************`
			`* Copyright (C) 1998-2001, International Business Machines Corporation and *`
			`* others. All Rights Reserved. *`
			`******************************************************************************`
			`*/`

			`#include <errno.h>`
			`#include <stdio.h>`
			`#include <string.h>`

			`#include "unicode/utypes.h"`
			`#include "unicode/uchar.h"`
			`#include "unicode/uchriter.h"`
			`#include "unicode/brkiter.h"`
			`#include "unicode/locid.h"`
ICU-999 Use a UTF-8 file instead of a UTF-16 file. X-SVN-Rev: 5401 2001-07-31 18:46:51 +00:00			`#include "unicode/unistr.h"`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00
			`/*`
			`* This program takes a Unicode text file containing Thai text with`
			`* spaces inserted where the word breaks are. It computes a copy of`
			`* the text without spaces and uses a word instance of a Thai BreakIterator`
			`* to compute the word breaks. The program reports any differences in the`
			`* breaks.`
			`*`
			`* NOTE: by it's very nature, Thai word breaking is not exact, so it is`
			`* exptected that this program will always report some differences.`
			`*/`

			`/*`
			`* This class is a break iterator that counts words and spaces.`
			`*/`
			`class SpaceBreakIterator`
			`{`
			`public:`
			`// The constructor:`
			`// text - pointer to an array of UChars to iterate over`
			`// count - the number of UChars in text`
			`SpaceBreakIterator(const UChar *text, int32_t count);`

			`// the destructor`
			`~SpaceBreakIterator();`

			`// return next break position`
			`int32_t next();`

			`// return current word count`
			`int32_t getWordCount();`

			`// return current space count`
			`int32_t getSpaceCount();`

			`private:`
			`// No arg constructor: private so clients can't call it.`
			`SpaceBreakIterator();`

			`// The underlying BreakIterator`
			`BreakIterator *fBreakIter;`

			`// address of the UChar array`
			`const UChar *fText;`

			`// number of UChars in fText`
			`int32_t fTextCount;`

			`// current word count`
			`int32_t fWordCount;`

			`// current space count`
			`int32_t fSpaceCount;`

			`// true when fBreakIter has returned DONE`
			`UBool fDone;`
			`};`

			`/*`
			`* This is the main class. It compares word breaks and reports the differences.`
			`*/`
			`class ThaiWordbreakTest`
			`{`
			`public:`
			`// The main constructor:`
			`// spaces - pointer to a UChar array for the text with spaces`
			`// spaceCount - the number of characters in the spaces array`
			`// noSpaces - pointer to a UChar array for the text without spaces`
			`// noSpaceCount - the number of characters in the noSpaces array`
			`// verbose - report all breaks if true, otherwise just report differences`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose);`
			`~ThaiWordbreakTest();`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00
			`// returns the number of breaks that are in the spaces array`
			`// but aren't found in the noSpaces array`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`int32_t getBreaksNotFound();`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00
			`// returns the number of breaks which are found in the noSpaces`
			`// array but aren't in the spaces array`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`int32_t getInvalidBreaks();`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00
			`// returns the number of words found in the spaces array`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`int32_t getWordCount();`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00
			`// reads the input Unicode text file:`
			`// fileName - the path name of the file`
			`// charCount - set to the number of UChars read from the file`
			`// returns - the address of the UChar array containing the characters`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`static const UChar *readFile(char *fileName, int32_t &charCount);`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00
			`// removes spaces form the input UChar array:`
			`// spaces - pointer to the input UChar array`
			`// count - number of UChars in the spaces array`
			`// nonSpaceCount - the number of UChars in the result array`
			`// returns - the address of the UChar array with spaces removed`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount);`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00
			`private:`
			`// The no arg constructor - private so clients can't call it`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`ThaiWordbreakTest();`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00
			`// This does the actual comparison:`
			`// spaces - the address of the UChar array for the text with spaces`
			`// spaceCount - the number of UChars in the spaces array`
			`// noSpaces - the address of the UChar array for the text without spaces`
			`// noSpaceCount - the number of UChars in the noSpaces array`
			`// returns - true if all breaks match, false otherwise`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00			`const UChar *noSpaces, int32_t noSpaceCount);`

			`// helper method to report a break in the spaces`
			`// array that's not found in the noSpaces array`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`void breakNotFound(int32_t br);`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00
			`// helper method to report a break that's found in`
			`// the noSpaces array that's not in the spaces array`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`void foundInvalidBreak(int32_t br);`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00
			`// count of breaks in the spaces array that`
			`// aren't found in the noSpaces array`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`int32_t fBreaksNotFound;`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00
			`// count of breaks found in the noSpaces array`
			`// that aren't in the spaces array`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`int32_t fInvalidBreaks;`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00
			`// number of words found in the spaces array`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`int32_t fWordCount;`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00
			`// report all breaks if true, otherwise just report differences`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`UBool fVerbose;`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00			`};`

			`/*`
			`* The main constructor: it calls compareWordBreaks and reports any differences`
			`*/`
			`ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,`
			`const UChar *noSpaces, int32_t noSpaceCount, UBool verbose)`
			`: fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)`
			`{`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00			`}`

			`/*`
			`* The no arg constructor`
			`*/`
			`ThaiWordbreakTest::ThaiWordbreakTest()`
			`{`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`// nothing`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00			`}`

			`/*`
			`* The destructor`
			`*/`
			`ThaiWordbreakTest::~ThaiWordbreakTest()`
			`{`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`// nothing?`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00			`}`

			`/*`
			`* returns the number of breaks in the spaces array`
			`* that aren't found in the noSpaces array`
			`*/`
			`inline int32_t ThaiWordbreakTest::getBreaksNotFound()`
			`{`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`return fBreaksNotFound;`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00			`}`

			`/*`
			`* Returns the number of breaks found in the noSpaces`
			`* array that aren't in the spaces array`
			`*/`
			`inline int32_t ThaiWordbreakTest::getInvalidBreaks()`
			`{`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`return fInvalidBreaks;`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00			`}`

			`/*`
			`* Returns the number of words found in the spaces array`
			`*/`
			`inline int32_t ThaiWordbreakTest::getWordCount()`
			`{`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`return fWordCount;`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00			`}`

			`/*`
			`* This method does the acutal break comparison and reports the results.`
			`* It uses a SpaceBreakIterator to iterate over the text with spaces,`
			`* and a word instance of a Thai BreakIterator to iterate over the text`
			`* without spaces.`
			`*/`
			`UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,`
			`const UChar *noSpaces, int32_t noSpaceCount)`
			`{`
			`UBool result = true;`
			`Locale thai("th");`
			`UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);`
ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`UErrorCode status = U_ZERO_ERROR;`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00
			`BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);`
			`breakIter->adoptText(noSpaceIter);`

			`SpaceBreakIterator spaceIter(spaces, spaceCount);`

			`int32_t nextBreak = 0;`
			`int32_t nextSpaceBreak = 0;`
			`int32_t iterCount = 0;`

			`while (true) {`
			`nextSpaceBreak = spaceIter.next();`
			`nextBreak = breakIter->next();`

			`if (nextSpaceBreak == BreakIterator::DONE \|\| nextBreak == BreakIterator::DONE) {`
			`if (nextBreak != BreakIterator::DONE) {`
			`fprintf(stderr, "break iterator didn't end.\n");`
			`} else if (nextSpaceBreak != BreakIterator::DONE) {`
			`fprintf(stderr, "premature break iterator end.\n");`
			`}`

			`break;`
			`}`

			`while (nextSpaceBreak != nextBreak &&`
			`nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {`
			`if (nextSpaceBreak < nextBreak) {`
			`breakNotFound(nextSpaceBreak);`
			`result = false;`
			`nextSpaceBreak = spaceIter.next();`
			`} else if (nextSpaceBreak > nextBreak) {`
			`foundInvalidBreak(nextBreak);`
			`result = false;`
			`nextBreak = breakIter->next();`
			`}`
			`}`

			`if (fVerbose) {`
			`printf("%d %d\n", nextSpaceBreak, nextBreak);`
			`}`
			`}`


			`fWordCount = spaceIter.getWordCount();`

			`delete breakIter;`

			`return result;`
			`}`

			`/*`
			`* Report a break that's in the text with spaces but`
			`* not found in the text without spaces.`
			`*/`
			`void ThaiWordbreakTest::breakNotFound(int32_t br)`
			`{`
ICU-999 Use a UTF-8 file instead of a UTF-16 file. X-SVN-Rev: 5401 2001-07-31 18:46:51 +00:00			`if (fVerbose) {`
			`printf("%d ****\n", br);`
			`} else {`
			`fprintf(stderr, "break not found: %d\n", br);`
			`}`

			`fBreaksNotFound += 1;`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00			`}`

			`/*`
			`* Report a break that's found in the text without spaces`
			`* that isn't in the text with spaces.`
			`*/`
			`void ThaiWordbreakTest::foundInvalidBreak(int32_t br)`
			`{`
ICU-999 Use a UTF-8 file instead of a UTF-16 file. X-SVN-Rev: 5401 2001-07-31 18:46:51 +00:00			`if (fVerbose) {`
			`printf("**** %d\n", br);`
			`} else {`
			`fprintf(stderr, "found invalid break: %d\n", br);`
			`}`

			`fInvalidBreaks += 1;`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00			`}`

			`/*`
			`* Read the text from a file. The text must start with a Unicode Byte`
			`* Order Mark (BOM) so that we know what order to read the bytes in.`
			`*/`
			`const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount)`
			`{`
ICU-999 Use a UTF-8 file instead of a UTF-16 file. X-SVN-Rev: 5401 2001-07-31 18:46:51 +00:00			`FILE *f;`
			`int32_t fileSize;`

			`UChar *buffer;`
			`char *bufferChars;`

			`f = fopen(fileName, "rb");`

			`if( f == NULL ) {`
			`fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));`
			`return 0;`
			`}`

			`fseek(f, 0, SEEK_END);`
			`fileSize = ftell(f);`

			`fseek(f, 0, SEEK_SET);`
			`bufferChars = new char[fileSize];`

			`if(bufferChars == 0) {`
			`fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));`
			`fclose(f);`
			`return 0;`
			`}`

			`fread(bufferChars, sizeof(char), fileSize, f);`
			`if( ferror(f) ) {`
			`fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));`
			`fclose(f);`
			`delete[] bufferChars;`
			`return 0;`
			`}`
			`fclose(f);`

			`UnicodeString myText(bufferChars, fileSize, "UTF-8");`
ICU-1470 fixed a memory leak in readFile() X-SVN-Rev: 6686 2001-11-08 00:29:20 +00:00
			`delete[] bufferChars;`
ICU-999 Use a UTF-8 file instead of a UTF-16 file. X-SVN-Rev: 5401 2001-07-31 18:46:51 +00:00
			`charCount = myText.length();`
			`buffer = new UChar[charCount];`
			`if(buffer == 0) {`
			`fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));`
			`return 0;`
			`}`

			`myText.extract(1, myText.length(), buffer);`
			`charCount--; // skip the BOM`
			`buffer[charCount] = 0; // NULL terminate for easier reading in the debugger`

			`return buffer;`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00			`}`

			`/*`
			`* Remove spaces from the input UChar array.`
			`*`
			`* We check explicitly for a Unicode code value of 0x0020`
			`* because Unicode::isSpaceChar returns true for CR, LF, etc.`
			`*`
			`*/`
			`const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount)`
			`{`
			`int32_t i, out, spaceCount;`

			`spaceCount = 0;`
			`for (i = 0; i < count; i += 1) {`
			`if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) {`
			`spaceCount += 1;`
			`}`
			`}`

ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`nonSpaceCount = count - spaceCount;`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00			`UChar *noSpaces = new UChar[nonSpaceCount];`

ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`if (noSpaces == 0) {`
			`fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");`
			`return 0;`
			`}`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00
			`for (out = 0, i = 0; i < count; i += 1) {`
			`if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) {`
			`noSpaces[out++] = spaces[i];`
			`}`
			`}`

ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`return noSpaces;`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00			`}`

			`/*`
			`* The main routine. Read the command line arguments, read the text file,`
			`* remove the spaces, do the comparison and report the final results`
			`*/`
			`int main(int argc, char **argv)`
			`{`
			`char *fileName = "space.txt";`
			`int arg = 1;`
			`UBool verbose = false;`

			`if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {`
			`verbose = true;`
			`arg += 1;`
			`}`

			`if (arg == argc - 1) {`
			`fileName = argv[arg++];`
			`}`

			`if (arg != argc) {`
			`fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);`
			`return 1;`
			`}`

ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`int32_t spaceCount, nonSpaceCount;`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00			`const UChar *spaces, *noSpaces;`

			`spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);`

			`if (spaces == 0) {`
			`return 1;`
			`}`

			`noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);`

ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`if (noSpaces == 0) {`
			`return 1;`
			`}`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00
			`ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);`

ICU-1828 Remove tabs X-SVN-Rev: 8331 2002-04-02 02:55:31 +00:00			`printf("word count: %d\n", test.getWordCount());`
			`printf("breaks not found: %d\n", test.getBreaksNotFound());`
			`printf("invalid breaks found: %d\n", test.getInvalidBreaks());`
ICU-603 initial checkin of full Thai word break test. X-SVN-Rev: 4625 2001-05-08 17:30:04 +00:00
			`return 0;`
			`}`

			`/*`
			`* The main constructor. Clear all the counts and construct a default`
			`* word instance of a BreakIterator.`
			`*/`
			`SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)`
			`: fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(false)`
			`{`
			`UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);`
			`UErrorCode status = U_ZERO_ERROR;`
			`Locale us("us");`

			`fBreakIter = BreakIterator::createWordInstance(us, status);`
			`fBreakIter->adoptText(iter);`
			`}`

			`SpaceBreakIterator::SpaceBreakIterator()`
			`{`
			`// nothing`
			`}`

			`/*`
			`* The destructor. delete the underlying BreakIterator`
			`*/`
			`SpaceBreakIterator::~SpaceBreakIterator()`
			`{`
			`delete fBreakIter;`
			`}`

			`/*`
			`* Return the next break, counting words and spaces.`
			`*/`
			`int32_t SpaceBreakIterator::next()`
			`{`
			`if (fDone) {`
			`return BreakIterator::DONE;`
			`}`

			`int32_t nextBreak = fBreakIter->next();`

			`if (nextBreak == BreakIterator::DONE) {`
			`fDone = true;`
			`return BreakIterator::DONE;`
			`}`

			`int32_t result = nextBreak - fSpaceCount;`

			`if (nextBreak < fTextCount) {`
			`if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {`
			`fSpaceCount += fBreakIter->next() - nextBreak;`
			`}`
			`}`

			`fWordCount += 1;`

			`return result;`
			`}`

			`/*`
			`* Returns the current space count`
			`*/`
			`int32_t SpaceBreakIterator::getSpaceCount()`
			`{`
			`return fSpaceCount;`
			`}`

			`/*`
			`* Returns the current word count`
			`*/`
			`int32_t SpaceBreakIterator::getWordCount()`
			`{`
			`return fWordCount;`
			`}`