ICU-8576 Dictionary break test updates from George Rhoten

X-SVN-Rev: 30327
This commit is contained in:
Peter Edberg 2011-07-13 06:49:42 +00:00
parent 2ed9168bf9
commit 62d26cc5fe
9 changed files with 227 additions and 79 deletions

View File

@ -51,7 +51,7 @@ tfsmalls.o tmsgfmt.o trcoll.o tscoll.o tsdate.o tsdcfmsy.o tsdtfmsy.o \
tsmthred.o tsnmfmt.o tsputil.o tstnrapi.o tstnorm.o tzbdtest.o \
tzregts.o tztest.o ucdtest.o usettest.o ustrtest.o strcase.o transtst.o strtest.o thcoll.o \
bytestrietest.o ucharstrietest.o \
itrbbi.o rbbiapts.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \
itrbbi.o rbbiapts.o dicttest.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \
testutil.o transrt.o trnserr.o normconf.o sfwdchit.o \
jamotest.o srchtest.o reptest.o regextst.o \
itrbnf.o itrbnfrt.o itrbnfp.o ucaconf.o icusvtst.o \

View File

@ -0,0 +1,140 @@
/*
**********************************************************************
* Copyright (C) 2011-2011, International Business Machines Corporation
* and others. All Rights Reserved.
**********************************************************************
************************************************************************
* Date Name Description
* 05/14/2011 grhoten Creation.
************************************************************************/
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "dicttest.h"
#include "textfile.h"
#include "uvector.h"
#include "unicode/rbbi.h"
void DictionaryWordTest::TestThaiBreaks() {
UErrorCode status=U_ZERO_ERROR;
BreakIterator* b;
Locale locale = Locale("th");
int32_t p, index;
UChar c[]= {
0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
0x0E16, 0x0E49, 0x0E33, 0x0000
};
int32_t expectedWordResult[] = {
2, 3, 6, 10, 11, 15, 17, 20, 22
};
int32_t expectedLineResult[] = {
3, 6, 11, 15, 17, 20, 22
};
int32_t size = u_strlen(c);
UnicodeString text=UnicodeString(c);
b = BreakIterator::createWordInstance(locale, status);
if (U_FAILURE(status)) {
errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
return;
}
b->setText(text);
p = index = 0;
while ((p=b->next())!=BreakIterator::DONE && p < size) {
if (p != expectedWordResult[index++]) {
errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult[index-1], p);
}
}
delete b;
b = BreakIterator::createLineInstance(locale, status);
if (U_FAILURE(status)) {
printf("Unable to create thai line break iterator.\n");
return;
}
b->setText(text);
p = index = 0;
while ((p=b->next())!=BreakIterator::DONE && p < size) {
if (p != expectedLineResult[index++]) {
errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult[index-1], p);
}
}
delete b;
}
#define DICTIONARY_TEST_FILE "wordsegments.txt"

/**
 * Data-driven word boundary test.
 *
 * Each line read from DICTIONARY_TEST_FILE (comment lines are skipped by the
 * reader) is a phrase in which '|' marks an expected word boundary. The
 * markers are stripped from the phrase, their offsets in the stripped phrase
 * are recorded together with the end of the phrase, and a word break iterator
 * is walked forward and then backward over the phrase. Every boundary the
 * iterator reports is compared with the recorded offsets; mismatches are
 * reported with the data file line number.
 */
void DictionaryWordTest::TestWordBoundaries() {
    UErrorCode status = U_ZERO_ERROR;

    TextFile phrases(DICTIONARY_TEST_FILE, "UTF8", status);
    if (U_FAILURE(status)) {
        // Keep the whitespace around the macro: from C++11 on, a string literal
        // immediately followed by an identifier is parsed as a user-defined
        // literal suffix instead of being macro-expanded and concatenated.
        dataerrln("Can't open " DICTIONARY_TEST_FILE ": %s; skipping test",
                  u_errorName(status));
        return;
    }

    // Due to how the word break iterator works,
    // scripts for languages that use no spaces should use the correct dictionary by default.
    BreakIterator *wb = BreakIterator::createWordInstance("en", status);
    if (U_FAILURE(status)) {
        dataerrln("Word break iterator can not be opened: %s; skipping test",
                  u_errorName(status));
        return;
    }

    int32_t pos, pIdx;
    int32_t testLines = 0;
    UnicodeString phrase;
    while (phrases.readLineSkippingComments(phrase, status, FALSE) && U_SUCCESS(status)) {
        // Collect the expected boundaries while removing the '|' markers.
        // Note: after a removal the character that slides into pIdx is not
        // re-examined, so the data must not contain two consecutive markers.
        UVector breaks(status);
        for (pIdx = 0; pIdx < phrase.length(); pIdx++) {
            if (phrase.charAt(pIdx) == 0x007C /* | */) {
                breaks.addElement(pIdx, status);
                phrase.remove(pIdx, 1);
            }
        }
        // The end of the stripped phrase is always the last expected boundary.
        breaks.addElement(pIdx, status);

        wb->setText(phrase);

        // Forward pass: the n-th boundary from next() must equal breaks[n].
        int32_t brkArrPos = 0;
        while ((pos = wb->next()) != BreakIterator::DONE) {
            int32_t expectedPos = breaks.elementAti(brkArrPos);
            if (expectedPos != pos) {
                // Pass the int32_t value, not the raw vector element, to match %d.
                errln("Incorrect forward word break on line %d. Expected: %d Got: %d",
                      phrases.getLineNumber(), expectedPos, pos);
            }
            brkArrPos++;
        }

        // Backward pass: the iterator is now at the end of the text, which is
        // the last entry of breaks, so previous() starts one entry earlier.
        brkArrPos = breaks.size() - 1;
        while ((pos = wb->previous()) != BreakIterator::DONE) {
            brkArrPos--;
            // The start of the text (offset 0) normally has no entry in breaks;
            // expect 0 explicitly instead of reading the vector at index -1.
            int32_t expectedPos = (brkArrPos >= 0) ? breaks.elementAti(brkArrPos) : 0;
            if (expectedPos != pos) {
                errln("Incorrect backward word break on line %d. Expected: %d Got: %d",
                      phrases.getLineNumber(), expectedPos, pos);
            }
        }
        testLines++;
    }
    delete wb;
    logln("%d tests were run.", testLines);
}
/**
 * Test dispatch entry point for DictionaryWordTest.
 *
 * Logs the suite banner when exec is true, then lists the suite's test cases
 * (TestThaiBreaks, TestWordBoundaries) through the TESTCASE_AUTO* macros.
 *
 * @param index selects the test case
 * @param exec  when true the banner is logged; presumably also controls
 *              whether the selected test is run - confirm against the
 *              TESTCASE_AUTO macro definitions, which are not in this file
 * @param name  output; presumably set by the macros to the test case name
 * @param par   unused here (parameter name is commented out)
 */
void DictionaryWordTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par */)
{
if (exec) logln("TestSuite DictionaryWordTest: ");
// The order of the TESTCASE_AUTO lines presumably determines each test's index.
TESTCASE_AUTO_BEGIN;
TESTCASE_AUTO(TestThaiBreaks);
TESTCASE_AUTO(TestWordBoundaries);
TESTCASE_AUTO_END;
}
#endif

View File

@ -0,0 +1,31 @@
/*
**********************************************************************
* Copyright (C) 2011-2011, International Business Machines Corporation
* and others. All Rights Reserved.
**********************************************************************
************************************************************************
* Date Name Description
* 05/14/2011 grhoten Creation.
************************************************************************/
#ifndef DICTTEST_H
#define DICTTEST_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "intltest.h"
/**
 * Test suite for dictionary-based break iteration (Thai breaks and
 * data-driven word boundaries). Derives from IntlTest.
 */
class DictionaryWordTest: public IntlTest {
public:
// Dispatches to the suite's test cases by index; par defaults to NULL.
void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
// Compares word break iterator boundaries, forward and backward, against
// the '|'-marked phrases in the wordsegments.txt data file.
void TestWordBoundaries();
// Compares Thai word and line break iterator boundaries against fixed
// expected offsets for a hard-coded sample string.
void TestThaiBreaks();
};
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif

View File

@ -224,6 +224,7 @@
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="bytestrietest.cpp" />
<ClCompile Include="dicttest.cpp" />
<ClCompile Include="ucharstrietest.cpp" />
<ClCompile Include="itrbbi.cpp" />
<ClCompile Include="rbbiapts.cpp" />
@ -389,6 +390,7 @@
<ClCompile Include="bidiconf.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="dicttest.h" />
<ClInclude Include="itrbbi.h" />
<ClInclude Include="rbbiapts.h" />
<ClInclude Include="rbbitst.h" />
@ -533,4 +535,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

View File

@ -444,6 +444,9 @@
<ClCompile Include="alphaindextst.cpp">
<Filter>collation</Filter>
</ClCompile>
<ClCompile Include="dicttest.cpp">
<Filter>break iteration</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="itrbbi.h">
@ -812,5 +815,8 @@
<ClInclude Include="alphaindextst.h">
<Filter>collation</Filter>
</ClInclude>
<ClInclude Include="dicttest.h">
<Filter>break iteration</Filter>
</ClInclude>
</ItemGroup>
</Project>
</Project>

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1998-2001, International Business Machines Corporation
* Copyright (C) 1998-2011, International Business Machines Corporation
* and others. All Rights Reserved.
**********************************************************************
*/
@ -19,28 +19,27 @@
#include "itrbbi.h"
#include "rbbiapts.h"
#include "rbbitst.h"
#include "dicttest.h"
/*
 * TESTCLASS(n, classname) expands to one `case n:` arm of the switch in
 * runIndexedTest: it sets `name` to the stringized class name and, when
 * `exec` is true, logs "<classname>---" followed by an empty line, constructs
 * a local `classname` instance and passes it with `par` to callTest().
 * The expansion ends in `break` without a semicolon, so the use site supplies
 * it. It relies on `name`, `exec` and `par` being in scope at the use site.
 */
#define TESTCLASS(n,classname) \
case n: \
name = #classname; \
if (exec) { \
logln(#classname "---"); \
logln(""); \
classname t; \
callTest(t, par); \
} \
break
/**
 * Top-level dispatcher for the break iterator test classes.
 *
 * Logs the suite banner when exec is true, then selects a test class by
 * index: it sets `name` to the class name and, when exec is true, constructs
 * the class and hands it to callTest() together with `par`. An index with no
 * matching case sets `name` to the empty string.
 */
void IntlTestRBBI::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par )
{
if (exec) logln("TestSuite RuleBasedBreakIterator: ");
switch (index) {
// NOTE(review): cases 0 and 1 appear twice below - once in expanded form and
// once through TESTCLASS(0, ...) / TESTCLASS(1, ...). This looks like removed
// and added diff lines shown together; only one form can be kept, since
// duplicate case labels do not compile. Confirm which side is current.
case 0:
name = "RBBIAPITest";
if (exec) {
logln("RBBIAPITest--"); logln("");
RBBIAPITest test;
callTest( test, par );
}
break;
case 1:
name = "RBBITest";
if (exec) {
logln("RBBITest---"); logln("");
RBBITest test;
callTest( test, par );
}
break;
// Macro form: one line per test class, indexed 0..2.
TESTCLASS(0, RBBIAPITest);
TESTCLASS(1, RBBITest);
TESTCLASS(2, DictionaryWordTest);
// An empty name is returned for any other index.
default: name=""; break;
}
}

View File

@ -134,17 +134,15 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
#if !UCONFIG_NO_FILE_IO
case 21: name = "TestBug5775";
if (exec) TestBug5775(); break;
case 22: name = "TestThaiBreaks";
if (exec) TestThaiBreaks(); break;
case 23: name = "TestTailoredBreaks";
case 22: name = "TestTailoredBreaks";
if (exec) TestTailoredBreaks(); break;
#else
case 21: case 22: case 23: name = "skip";
case 21: case 22: name = "skip";
break;
#endif
case 24: name = "TestDictRules";
case 23: name = "TestDictRules";
if (exec) TestDictRules(); break;
case 25: name = "TestBug5532";
case 24: name = "TestBug5532";
if (exec) TestBug5532(); break;
default: name = ""; break; //needed to end loop
}
@ -1810,56 +1808,6 @@ end_test:
#endif
}
void RBBITest::TestThaiBreaks() {
UErrorCode status=U_ZERO_ERROR;
BreakIterator* b;
Locale locale = Locale("th");
int32_t p, index;
UChar c[]= {
0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
0x0E16, 0x0E49, 0x0E33, 0x0000
};
int32_t expectedWordResult[] = {
2, 3, 6, 10, 11, 15, 17, 20, 22
};
int32_t expectedLineResult[] = {
3, 6, 11, 15, 17, 20, 22
};
int32_t size = u_strlen(c);
UnicodeString text=UnicodeString(c);
b = BreakIterator::createWordInstance(locale, status);
if (U_FAILURE(status)) {
errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
return;
}
b->setText(text);
p = index = 0;
while ((p=b->next())!=BreakIterator::DONE && p < size) {
if (p != expectedWordResult[index++]) {
errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult[index-1], p);
}
}
delete b;
b = BreakIterator::createLineInstance(locale, status);
if (U_FAILURE(status)) {
printf("Unable to create thai line break iterator.\n");
return;
}
b->setText(text);
p = index = 0;
while ((p=b->next())!=BreakIterator::DONE && p < size) {
if (p != expectedLineResult[index++]) {
errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult[index-1], p);
}
}
delete b;
}
// UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
// Words don't include colon or period (cldrbug #1969).
static const char posxWordText[] = "Can't have breaks in xx:yy or struct.field for CS-types.";

View File

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 1999-2010, International Business Machines
* Copyright (c) 1999-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*************************************************************************
* Date Name Description
@ -68,7 +68,6 @@ public:
void TestTrieDict();
void TestUnicodeFiles();
void TestBug5775();
void TestThaiBreaks();
void TestTailoredBreaks();
void TestDictRules();
void TestBug5532();

View File

@ -0,0 +1,23 @@
# Copyright (C) 2011-2011, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file name: wordsegments.txt
# encoding: UTF-8
#
# created on: 2011may14
# created by: George Rhoten
# created by: Nathan Wells
#
# Word boundary test data for languages that contain no spaces.
# Boundaries are delimited with the | character so that it's easier to debug.
#
# If you have test data with zero width spaces to delimit the words, use the following example command.
# Be sure to copy the zero width space in the sed command.
# echo 'សូម​ចំណាយពេល​បន្តិច​ដើម្បី​អធិស្ឋាន​អរ​ព្រះគុណ​ដល់​ព្រះអង្គ' | sed 's//\|/g'
#
# Thai
กู| |กิน|กุ้ง| |ปิ้่|งอ|ยู่|ใน|ถ้ำ
# Khmer
សូម|ចំណាយពេល|បន្តិច|ដើម្បី|អធិស្ឋាន|អរ|ព្រះគុណ|ដល់|ព្រះអង្គ