141 lines
4.7 KiB
C++
141 lines
4.7 KiB
C++
|
/*
|
||
|
**********************************************************************
|
||
|
* Copyright (C) 2011-2011, International Business Machines Corporation
|
||
|
* and others. All Rights Reserved.
|
||
|
**********************************************************************
|
||
|
************************************************************************
|
||
|
* Date Name Description
|
||
|
* 05/14/2011 grhoten Creation.
|
||
|
************************************************************************/
|
||
|
|
||
|
#include "unicode/utypes.h"
|
||
|
|
||
|
#if !UCONFIG_NO_BREAK_ITERATION
|
||
|
|
||
|
#include "dicttest.h"
|
||
|
#include "textfile.h"
|
||
|
#include "uvector.h"
|
||
|
#include "unicode/rbbi.h"
|
||
|
|
||
|
void DictionaryWordTest::TestThaiBreaks() {
|
||
|
UErrorCode status=U_ZERO_ERROR;
|
||
|
BreakIterator* b;
|
||
|
Locale locale = Locale("th");
|
||
|
int32_t p, index;
|
||
|
UChar c[]= {
|
||
|
0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
|
||
|
0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
|
||
|
0x0E16, 0x0E49, 0x0E33, 0x0000
|
||
|
};
|
||
|
int32_t expectedWordResult[] = {
|
||
|
2, 3, 6, 10, 11, 15, 17, 20, 22
|
||
|
};
|
||
|
int32_t expectedLineResult[] = {
|
||
|
3, 6, 11, 15, 17, 20, 22
|
||
|
};
|
||
|
|
||
|
int32_t size = u_strlen(c);
|
||
|
UnicodeString text=UnicodeString(c);
|
||
|
|
||
|
b = BreakIterator::createWordInstance(locale, status);
|
||
|
if (U_FAILURE(status)) {
|
||
|
errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
|
||
|
return;
|
||
|
}
|
||
|
b->setText(text);
|
||
|
p = index = 0;
|
||
|
while ((p=b->next())!=BreakIterator::DONE && p < size) {
|
||
|
if (p != expectedWordResult[index++]) {
|
||
|
errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult[index-1], p);
|
||
|
}
|
||
|
}
|
||
|
delete b;
|
||
|
|
||
|
b = BreakIterator::createLineInstance(locale, status);
|
||
|
if (U_FAILURE(status)) {
|
||
|
printf("Unable to create thai line break iterator.\n");
|
||
|
return;
|
||
|
}
|
||
|
b->setText(text);
|
||
|
p = index = 0;
|
||
|
while ((p=b->next())!=BreakIterator::DONE && p < size) {
|
||
|
if (p != expectedLineResult[index++]) {
|
||
|
errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult[index-1], p);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
delete b;
|
||
|
}
|
||
|
|
||
|
#define DICTIONARY_TEST_FILE "wordsegments.txt"
|
||
|
|
||
|
void DictionaryWordTest::TestWordBoundaries() {
|
||
|
UErrorCode status = U_ZERO_ERROR;
|
||
|
|
||
|
TextFile phrases(DICTIONARY_TEST_FILE, "UTF8", status);
|
||
|
if (U_FAILURE(status)) {
|
||
|
dataerrln("Can't open "DICTIONARY_TEST_FILE": %s; skipping test",
|
||
|
u_errorName(status));
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
// Due to how the word break iterator works,
|
||
|
// scripts for languages that use no spaces should use the correct dictionary by default.
|
||
|
BreakIterator *wb = BreakIterator::createWordInstance("en", status);
|
||
|
if (U_FAILURE(status)) {
|
||
|
dataerrln("Word break iterator can not be opened: %s; skipping test",
|
||
|
u_errorName(status));
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
int32_t pos, pIdx;
|
||
|
int32_t testLines = 0;
|
||
|
UnicodeString phrase;
|
||
|
while (phrases.readLineSkippingComments(phrase, status, FALSE) && U_SUCCESS(status)) {
|
||
|
UVector breaks(status);
|
||
|
|
||
|
for (pIdx = 0; pIdx < phrase.length(); pIdx++) {
|
||
|
if (phrase.charAt(pIdx) == 0x007C /* | */) {
|
||
|
breaks.addElement(pIdx, status);
|
||
|
phrase.remove(pIdx, 1);
|
||
|
}
|
||
|
}
|
||
|
breaks.addElement(pIdx, status);
|
||
|
|
||
|
wb->setText(phrase);
|
||
|
int32_t brkArrPos = 0;
|
||
|
while ((pos=wb->next())!=BreakIterator::DONE) {
|
||
|
int32_t expectedPos = breaks.elementAti(brkArrPos);
|
||
|
if (expectedPos != pos) {
|
||
|
errln("Incorrect forward word break on line %d. Expected: %d Got: %d",
|
||
|
phrases.getLineNumber(), breaks.elementAt(brkArrPos), pos);
|
||
|
}
|
||
|
brkArrPos++;
|
||
|
}
|
||
|
brkArrPos = breaks.size() - 1;
|
||
|
while ((pos=wb->previous())!=BreakIterator::DONE) {
|
||
|
brkArrPos--;
|
||
|
int32_t expectedPos = breaks.elementAti(brkArrPos);
|
||
|
if (expectedPos != pos) {
|
||
|
errln("Incorrect backward word break on line %d. Expected: %d Got: %d",
|
||
|
phrases.getLineNumber(), breaks.elementAt(brkArrPos), pos);
|
||
|
}
|
||
|
}
|
||
|
testLines++;
|
||
|
}
|
||
|
delete wb;
|
||
|
logln("%d tests were run.", testLines);
|
||
|
}
|
||
|
|
||
|
/**
 * Test dispatcher for DictionaryWordTest.
 *
 * Logs the suite banner when exec is true, then hands over to the
 * TESTCASE_AUTO* macros, which are defined outside this file.
 * NOTE(review): presumably they use index/exec/name to select, name and
 * optionally run one of the listed tests -- confirm against the macro
 * definitions. The parameter names must stay as they are in case the macros
 * refer to them.
 *
 * @param index  test index supplied by the caller
 * @param exec   when true, the suite banner is logged
 * @param name   in/out test name reference
 */
void DictionaryWordTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par */)
{
    if (exec) {
        logln("TestSuite DictionaryWordTest: ");
    }

    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(TestThaiBreaks);
    TESTCASE_AUTO(TestWordBoundaries);
    TESTCASE_AUTO_END;
}
|
||
|
|
||
|
|
||
|
#endif
|