scuffed-code/icu4c/source/test/thaitest/thaitest.cpp
2001-05-08 17:30:04 +00:00

526 lines
14 KiB
C++

/*
******************************************************************************
* Copyright (C) 1998-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
******************************************************************************
*/
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include "unicode/utypes.h"
#include "unicode/unicode.h"
#include "unicode/uchar.h"
#include "unicode/uchriter.h"
#include "unicode/brkiter.h"
#include "unicode/locid.h"
/*
* This program takes a Unicode text file containing Thai text with
* spaces inserted where the word breaks are. It computes a copy of
* the text without spaces and uses a word instance of a Thai BreakIterator
* to compute the word breaks. The program reports any differences in the
* breaks.
*
* NOTE: by its very nature, Thai word breaking is not exact, so it is
* expected that this program will always report some differences.
*/
/*
 * A break iterator over text that contains spaces. It wraps a word
 * BreakIterator, counts the words and spaces it passes, and reports each
 * break position with the spaces seen so far subtracted out.
 */
class SpaceBreakIterator
{
public:
    // The constructor:
    //   text  - pointer to an array of UChars to iterate over
    //   count - the number of UChars in text
    SpaceBreakIterator(const UChar *text, int32_t count);

    // The destructor: deletes the underlying BreakIterator.
    ~SpaceBreakIterator();

    // Returns the next break position, or BreakIterator::DONE at the end.
    int32_t next();

    // Returns the current word count.
    int32_t getWordCount();

    // Returns the current space count.
    int32_t getSpaceCount();

private:
    // No arg constructor: private so clients can't call it.
    SpaceBreakIterator();

    // Copying is disabled: the destructor deletes fBreakIter, so a
    // member-wise copy would make two objects delete the same iterator.
    // These are declared but intentionally never defined.
    SpaceBreakIterator(const SpaceBreakIterator &other);
    SpaceBreakIterator &operator=(const SpaceBreakIterator &other);

    // The underlying BreakIterator (owned; deleted by the destructor)
    BreakIterator *fBreakIter;

    // address of the UChar array
    const UChar *fText;

    // number of UChars in fText
    int32_t fTextCount;

    // current word count
    int32_t fWordCount;

    // current space count
    int32_t fSpaceCount;

    // true when fBreakIter has returned DONE
    UBool fDone;
};
/*
 * The main class of the program: it compares the word breaks of the two
 * texts, reports the differences and keeps the resulting counts.
 */
class ThaiWordbreakTest
{
public:
    // Runs the comparison as part of construction.
    //   spaces       - UChar array holding the text with spaces
    //   spaceCount   - number of UChars in spaces
    //   noSpaces     - UChar array holding the text without spaces
    //   noSpaceCount - number of UChars in noSpaces
    //   verbose      - if true report every break, otherwise only the differences
    ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,
                      const UChar *noSpaces, int32_t noSpaceCount,
                      UBool verbose);

    ~ThaiWordbreakTest();

    // Number of breaks present in the spaces array that
    // were not found in the noSpaces array.
    int32_t getBreaksNotFound();

    // Number of breaks found in the noSpaces array that
    // are not present in the spaces array.
    int32_t getInvalidBreaks();

    // Number of words found in the spaces array.
    int32_t getWordCount();

    // Reads the input Unicode text file.
    //   fileName  - the path name of the file
    //   charCount - receives the number of UChars read from the file
    //   returns   - the address of the UChar array holding the characters
    static const UChar *readFile(char *fileName, int32_t &charCount);

    // Removes spaces from the input UChar array.
    //   spaces        - pointer to the input UChar array
    //   count         - number of UChars in the spaces array
    //   nonSpaceCount - receives the number of UChars in the result array
    //   returns       - the address of the UChar array with spaces removed
    static const UChar *crunchSpaces(const UChar *spaces, int32_t count,
                                     int32_t &nonSpaceCount);

private:
    // The no arg constructor - private so clients can't call it.
    ThaiWordbreakTest();

    // Performs the actual comparison.
    //   spaces       - the UChar array for the text with spaces
    //   spaceCount   - the number of UChars in the spaces array
    //   noSpaces     - the UChar array for the text without spaces
    //   noSpaceCount - the number of UChars in the noSpaces array
    //   returns      - true if all breaks match, false otherwise
    UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,
                            const UChar *noSpaces, int32_t noSpaceCount);

    // Reports a break in the spaces array that
    // is not found in the noSpaces array.
    void breakNotFound(int32_t br);

    // Reports a break found in the noSpaces array
    // that is not in the spaces array.
    void foundInvalidBreak(int32_t br);

    // Count of breaks in the spaces array that
    // are not found in the noSpaces array.
    int32_t fBreaksNotFound;

    // Count of breaks found in the noSpaces array
    // that are not in the spaces array.
    int32_t fInvalidBreaks;

    // Number of words found in the spaces array.
    int32_t fWordCount;

    // If true report every break, otherwise only the differences.
    UBool fVerbose;
};
/*
 * The main constructor: zero the counters, remember the verbosity setting
 * and run compareWordBreaks, which reports any differences it finds.
 */
ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,
                                     const UChar *noSpaces, int32_t noSpaceCount,
                                     UBool verbose)
    : fBreaksNotFound(0),
      fInvalidBreaks(0),
      fWordCount(0),
      fVerbose(verbose)
{
    // The UBool result is not kept; the outcome is available through
    // the counters that compareWordBreaks updates.
    compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
}
/*
 * The no arg constructor. It is private and performs no comparison; the
 * members are still initialized so that an object built this way never
 * exposes indeterminate counts.
 */
ThaiWordbreakTest::ThaiWordbreakTest()
    : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(false)
{
    // nothing
}
/*
 * The destructor. Its body is empty: nothing is released here.
 */
ThaiWordbreakTest::~ThaiWordbreakTest()
{
}
/*
* returns the number of breaks in the spaces array
* that aren't found in the noSpaces array
*/
inline int32_t ThaiWordbreakTest::getBreaksNotFound()
{
return fBreaksNotFound;
}
/*
* Returns the number of breaks found in the noSpaces
* array that aren't in the spaces array
*/
inline int32_t ThaiWordbreakTest::getInvalidBreaks()
{
return fInvalidBreaks;
}
/*
* Returns the number of words found in the spaces array
*/
inline int32_t ThaiWordbreakTest::getWordCount()
{
return fWordCount;
}
/*
 * This method does the actual break comparison and reports the results.
 * It uses a SpaceBreakIterator to iterate over the text with spaces,
 * and a word instance of a Thai BreakIterator to iterate over the text
 * without spaces.
 *
 *   spaces       - the UChar array for the text with spaces
 *   spaceCount   - the number of UChars in the spaces array
 *   noSpaces     - the UChar array for the text without spaces
 *   noSpaceCount - the number of UChars in the noSpaces array
 *   returns      - true if all breaks match; false if any break differs or
 *                  if the Thai break iterator could not be created
 *
 * Side effects: updates fBreaksNotFound and fInvalidBreaks through the
 * reporting helpers and stores the final word count in fWordCount.
 */
UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
                                           const UChar *noSpaces, int32_t noSpaceCount)
{
    UBool result = true;
    Locale thai("th");
    UErrorCode status = U_ZERO_ERROR;
    BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);

    // Creation can fail; bail out instead of dereferencing a null iterator.
    if (U_FAILURE(status) || breakIter == 0) {
        fprintf(stderr, "Couldn't create a Thai word break iterator (status %d).\n", (int) status);
        delete breakIter;
        return false;
    }

    // The character iterator is only allocated once there is a break
    // iterator to hand it to. adoptText is expected to take ownership of it
    // (the original code never deleted it either).
    breakIter->adoptText(new UCharCharacterIterator(noSpaces, noSpaceCount));

    SpaceBreakIterator spaceIter(spaces, spaceCount);
    int32_t nextBreak = 0;
    int32_t nextSpaceBreak = 0;

    while (true) {
        nextSpaceBreak = spaceIter.next();
        nextBreak = breakIter->next();

        if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {
            // Both iterators should finish together; complain if only one did.
            if (nextBreak != BreakIterator::DONE) {
                fprintf(stderr, "break iterator didn't end.\n");
            } else if (nextSpaceBreak != BreakIterator::DONE) {
                fprintf(stderr, "premature break iterator end.\n");
            }

            break;
        }

        // Resynchronize: advance whichever iterator is behind, reporting
        // each break it passes as a difference, until both agree or one ends.
        while (nextSpaceBreak != nextBreak &&
               nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
            if (nextSpaceBreak < nextBreak) {
                breakNotFound(nextSpaceBreak);
                result = false;
                nextSpaceBreak = spaceIter.next();
            } else {
                foundInvalidBreak(nextBreak);
                result = false;
                nextBreak = breakIter->next();
            }
        }

        if (fVerbose) {
            printf("%d %d\n", nextSpaceBreak, nextBreak);
        }
    }

    fWordCount = spaceIter.getWordCount();

    delete breakIter;

    return result;
}
/*
 * Count and report a break that is present in the text with spaces
 * but was not found in the text without spaces. In verbose mode the
 * report goes to stdout, otherwise to stderr.
 */
void ThaiWordbreakTest::breakNotFound(int32_t br)
{
    ++fBreaksNotFound;

    if (!fVerbose) {
        fprintf(stderr, "break not found: %d\n", br);
        return;
    }

    printf("%d ****\n", br);
}
/*
 * Count and report a break that was found in the text without spaces
 * but is not present in the text with spaces. In verbose mode the
 * report goes to stdout, otherwise to stderr.
 */
void ThaiWordbreakTest::foundInvalidBreak(int32_t br)
{
    ++fInvalidBreaks;

    if (!fVerbose) {
        fprintf(stderr, "found invalid break: %d\n", br);
        return;
    }

    printf("**** %d\n", br);
}
/*
 * Read the text from a file. The text must start with a Unicode Byte
 * Order Mark (BOM) so that we know what order to read the bytes in.
 *
 *   fileName  - the path name of the file
 *   charCount - set to the number of UChars read (0 on failure)
 *   returns   - a new[]-allocated array holding the characters after the
 *               BOM, or 0 on failure (a message is written to stderr)
 *
 * The file is treated as a sequence of 2-byte code units. If the payload
 * has an odd number of bytes the trailing byte is ignored with a warning.
 */
const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount)
{
    FILE *f;
    long fileLength = -1;
    int32_t count;
    size_t byteCount;
    size_t bytesRead;
    UChar *buffer;
    char *byteBuffer;
    UChar bom = 0;

    charCount = 0;

    f = fopen(fileName, "rb");

    if( f == NULL ) {
        fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
        return 0;
    }

    // Determine the file size, checking every step that can fail.
    if (fseek(f, 0, SEEK_END) == 0) {
        fileLength = ftell(f);
    }

    if (fileLength < 0 || fseek(f, 0, SEEK_SET) != 0) {
        fprintf(stderr,"Couldn't determine the size of %s reason: %s \n", fileName, strerror(errno));
        fclose(f);
        return 0;
    }

    // read the BOM...
    if (fileLength < 2 || fread(&bom, 1, 2, f) != 2) {
        fprintf(stderr, "File %s is too short to contain a Byte Order Mark.\n", fileName);
        fclose(f);
        return 0;
    }

    // Reject a bad BOM before allocating or reading anything else.
    if (bom != 0xFEFF && bom != 0xFFFE) {
        fprintf(stderr, "File %s does not start with a Byte Order Mark: 0x%4.4X\n", fileName, bom);
        fclose(f);
        return 0;
    }

    // - 2 for BOM...
    if ((fileLength - 2) / 2 > 0x7FFFFFFFL) {
        fprintf(stderr, "File %s is too large to read.\n", fileName);
        fclose(f);
        return 0;
    }

    if ((fileLength - 2) % 2 != 0) {
        fprintf(stderr, "Warning: %s has an odd number of bytes; ignoring the last byte.\n", fileName);
    }

    // Round down to whole code units so that neither the read nor the
    // byte swap below can run past the end of the buffer.
    count = static_cast<int32_t>((fileLength - 2) / 2);
    byteCount = static_cast<size_t>(count) * 2;

    buffer = new UChar[count];

    // Only reachable where operator new reports failure by returning null.
    if(buffer == 0) {
        fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
        fclose(f);
        return 0;
    }

    // Offsets are counted in bytes, so index through a char pointer;
    // adding a byte count to a UChar pointer would skip too far.
    byteBuffer = reinterpret_cast<char *>(buffer);
    bytesRead = 0;

    while (bytesRead < byteCount) {
        size_t got = fread(byteBuffer + bytesRead, 1, byteCount - bytesRead, f);

        bytesRead += got;

        if( ferror(f) ) {
            fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
            fclose(f);
            delete[] buffer;
            return 0;
        }

        if (got == 0) {
            // end of file reached before the expected number of bytes
            break;
        }
    }

    fclose(f);

    if (bytesRead < byteCount) {
        fprintf(stderr, "Unexpected end of file while reading %s\n", fileName);
        delete[] buffer;
        return 0;
    }

    // Swap bytes if the BOM is byte-swapped
    if (bom == 0xFFFE) {
        size_t i;

        for (i = 0; i < byteCount; i += 2) {
            char temp = byteBuffer[i];

            byteBuffer[i] = byteBuffer[i + 1];
            byteBuffer[i + 1] = temp;
        }
    }

    charCount = count;

    return buffer;
}
/*
 * Build a copy of the input UChar array with the spaces left out.
 *
 * Only the code value 0x0020 is treated as a space. Per the original
 * author's note, Unicode::isSpaceChar is deliberately not used because
 * it also returns true for CR, LF, etc.
 *
 *   spaces        - pointer to the input UChar array
 *   count         - number of UChars in the spaces array
 *   nonSpaceCount - set to the number of UChars in the result array
 *   returns       - a new[]-allocated array with the spaces removed,
 *                   or 0 if the allocation yields a null pointer
 */
const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount)
{
    const UChar kSpace = 0x0020;
    int32_t in;
    int32_t blanks = 0;

    // First pass: find out how many spaces will be dropped.
    for (in = 0; in < count; ++in) {
        if (spaces[in] == kSpace) {
            ++blanks;
        }
    }

    nonSpaceCount = count - blanks;

    UChar *packed = new UChar[nonSpaceCount];

    if (packed == 0) {
        fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
        return 0;
    }

    // Second pass: copy everything that is not a space.
    int32_t out = 0;

    for (in = 0; in < count; ++in) {
        const UChar ch = spaces[in];

        if (ch == kSpace) {
            continue;
        }

        packed[out++] = ch;
    }

    return packed;
}
/*
* The main routine. Read the command line arguments, read the text file,
* remove the spaces, do the comparison and report the final results
*/
int main(int argc, char **argv)
{
char *fileName = "space.txt";
int arg = 1;
UBool verbose = false;
if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
verbose = true;
arg += 1;
}
if (arg == argc - 1) {
fileName = argv[arg++];
}
if (arg != argc) {
fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
return 1;
}
int32_t spaceCount, nonSpaceCount;
const UChar *spaces, *noSpaces;
spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);
if (spaces == 0) {
return 1;
}
noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);
if (noSpaces == 0) {
return 1;
}
ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);
printf("word count: %d\n", test.getWordCount());
printf("breaks not found: %d\n", test.getBreaksNotFound());
printf("invalid breaks found: %d\n", test.getInvalidBreaks());
return 0;
}
/*
 * The main constructor. Clear all the counts and construct a word
 * instance of a BreakIterator over the given text.
 *
 *   text  - pointer to an array of UChars to iterate over
 *   count - the number of UChars in text
 *
 * If the BreakIterator cannot be created, a message is written to stderr
 * and the object is marked as done, so next() returns BreakIterator::DONE
 * without touching the missing iterator.
 */
SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
    : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(false)
{
    UErrorCode status = U_ZERO_ERROR;
    // NOTE(review): "us" is passed as the language here; presumably an
    // English/US locale was intended - confirm before changing.
    Locale us("us");

    fBreakIter = BreakIterator::createWordInstance(us, status);

    if (U_FAILURE(status) || fBreakIter == 0) {
        fprintf(stderr, "Couldn't create a word break iterator (status %d).\n", (int) status);
        delete fBreakIter;
        fBreakIter = 0;
        fDone = true;
        return;
    }

    // The character iterator is only allocated once there is a break
    // iterator to hand it to. adoptText is expected to take ownership of it
    // (the original code never deleted it either).
    fBreakIter->adoptText(new UCharCharacterIterator(text, count));
}
/*
 * The no arg constructor. It is private and creates no BreakIterator.
 * The members are still initialized: the destructor deletes fBreakIter,
 * so it must never hold an indeterminate pointer, and fDone is set so
 * that next() returns BreakIterator::DONE immediately.
 */
SpaceBreakIterator::SpaceBreakIterator()
    : fBreakIter(0), fText(0), fTextCount(0), fWordCount(0), fSpaceCount(0), fDone(true)
{
    // nothing
}
/*
 * The destructor: releases the BreakIterator held in fBreakIter.
 */
SpaceBreakIterator::~SpaceBreakIterator()
{
    delete this->fBreakIter;
}
/*
 * Return the next break, counting words and spaces.
 *
 * The value returned is the underlying iterator's break position minus
 * the number of spaces counted so far. Once the underlying iterator has
 * returned DONE, every later call returns BreakIterator::DONE.
 */
int32_t SpaceBreakIterator::next()
{
    if (!fDone) {
        const int32_t rawBreak = fBreakIter->next();

        if (rawBreak != BreakIterator::DONE) {
            // Computed before fSpaceCount is updated for the spaces
            // that follow this break.
            const int32_t adjusted = rawBreak - fSpaceCount;

            if (rawBreak < fTextCount &&
                fText[rawBreak] == 0x0020 /*Unicode::isSpaceChar(fText[rawBreak])*/) {
                // The break is followed by a space: advance the underlying
                // iterator once more and add the distance it moved to the
                // space count.
                const int32_t afterSpaces = fBreakIter->next();

                fSpaceCount += afterSpaces - rawBreak;
            }

            fWordCount += 1;

            return adjusted;
        }

        fDone = true;
    }

    return BreakIterator::DONE;
}
/*
* Returns the current space count
*/
int32_t SpaceBreakIterator::getSpaceCount()
{
return fSpaceCount;
}
/*
* Returns the current word count
*/
int32_t SpaceBreakIterator::getWordCount()
{
return fWordCount;
}