ICU-8576 Dictionary break test updates from George Rhoten

X-SVN-Rev: 30327
2011-07-13 06:49:42 +00:00 · 2011-07-13 06:49:42 +00:00 · 62d26cc5fe
commit 62d26cc5fe
parent 2ed9168bf9
9 changed files with 227 additions and 79 deletions
--- a/icu4c/source/test/intltest/Makefile.in
+++ b/icu4c/source/test/intltest/Makefile.in
@ -51,7 +51,7 @@ tfsmalls.o tmsgfmt.o trcoll.o tscoll.o tsdate.o tsdcfmsy.o tsdtfmsy.o	\
 tsmthred.o tsnmfmt.o tsputil.o tstnrapi.o tstnorm.o tzbdtest.o		\
 tzregts.o tztest.o ucdtest.o usettest.o ustrtest.o strcase.o transtst.o strtest.o thcoll.o \
 bytestrietest.o ucharstrietest.o \
-itrbbi.o rbbiapts.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \
+itrbbi.o rbbiapts.o dicttest.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \
 testutil.o transrt.o trnserr.o normconf.o sfwdchit.o \
 jamotest.o srchtest.o reptest.o regextst.o \
 itrbnf.o itrbnfrt.o itrbnfp.o ucaconf.o icusvtst.o \
--- a/icu4c/source/test/intltest/dicttest.cpp
+++ b/icu4c/source/test/intltest/dicttest.cpp
@ -0,0 +1,140 @@
+/*
+**********************************************************************
+* Copyright (C) 2011-2011, International Business Machines Corporation 
+* and others.  All Rights Reserved.
+**********************************************************************
+************************************************************************
+*   Date          Name        Description
+*   05/14/2011    grhoten     Creation.
+************************************************************************/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+#include "dicttest.h"
+#include "textfile.h"
+#include "uvector.h"
+#include "unicode/rbbi.h"
+
+/**
+ * Checks the Thai ("th") word and line break iterators against hard-coded
+ * boundary offsets for a short Thai sample string.
+ *
+ * Only boundaries strictly before the end of the text are compared; the
+ * boundary at the very end of the string is not checked. Every mismatch is
+ * reported through errln(). A boundary that has no entry left in the
+ * expectation table is reported as an error instead of indexing past the
+ * end of the table.
+ */
+void DictionaryWordTest::TestThaiBreaks() {
+    UErrorCode status=U_ZERO_ERROR;
+    BreakIterator* b;
+    Locale locale = Locale("th");
+    int32_t p, index;
+    UChar c[]= { 
+            0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B, 
+            0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19, 
+            0x0E16, 0x0E49, 0x0E33, 0x0000
+    };
+    int32_t expectedWordResult[] = {
+            2, 3, 6, 10, 11, 15, 17, 20, 22
+    };
+    int32_t expectedLineResult[] = {
+            3, 6, 11, 15, 17, 20, 22
+    };
+    // Table sizes, used to keep the comparisons below inside the arrays.
+    const int32_t expectedWordCount =
+        static_cast<int32_t>(sizeof(expectedWordResult) / sizeof(expectedWordResult[0]));
+    const int32_t expectedLineCount =
+        static_cast<int32_t>(sizeof(expectedLineResult) / sizeof(expectedLineResult[0]));
+
+    int32_t size = u_strlen(c);
+    UnicodeString text=UnicodeString(c);
+    
+    b = BreakIterator::createWordInstance(locale, status);
+    if (U_FAILURE(status)) {
+        errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
+        return;
+    }
+    b->setText(text);
+    index = 0;
+    while ((p=b->next())!=BreakIterator::DONE && p < size) {
+        if (index >= expectedWordCount) {
+            // More boundaries than expected: report it rather than read past the table.
+            errln("Unexpected extra break given by thai word break iterator. Got: %d", p);
+        } else if (p != expectedWordResult[index]) {
+            errln("Incorrect break given by thai word break iterator. Expected: %d  Got: %d", expectedWordResult[index], p);
+        }
+        index++;
+    }
+    delete b;
+    
+    b = BreakIterator::createLineInstance(locale, status);
+    if (U_FAILURE(status)) {
+        // Report through the test framework (as for the word iterator above)
+        // so the failure is counted instead of only being printed to stdout.
+        errcheckln(status, "Unable to create thai line break iterator. - %s", u_errorName(status));
+        return;
+    }
+    b->setText(text);
+    index = 0;
+    while ((p=b->next())!=BreakIterator::DONE && p < size) {
+        if (index >= expectedLineCount) {
+            // More boundaries than expected: report it rather than read past the table.
+            errln("Unexpected extra break given by thai line break iterator. Got: %d", p);
+        } else if (p != expectedLineResult[index]) {
+            errln("Incorrect break given by thai line break iterator. Expected: %d  Got: %d", expectedLineResult[index], p);
+        }
+        index++;
+    }
+
+    delete b;
+}
+
+#define DICTIONARY_TEST_FILE "wordsegments.txt"
+
+/**
+ * Data-driven word boundary test for text written without spaces.
+ *
+ * Each non-comment line read from DICTIONARY_TEST_FILE is a phrase in which
+ * the expected word boundaries are marked with '|'. The markers are removed,
+ * their offsets in the stripped phrase are recorded (followed by the end of
+ * the phrase), and the word break iterator is walked forward and then
+ * backward over the phrase, comparing each reported boundary with the
+ * recorded offsets. Mismatches are reported with errln(); a missing data
+ * file or break iterator is reported with dataerrln() and skips the test.
+ */
+void DictionaryWordTest::TestWordBoundaries() {
+    UErrorCode      status  = U_ZERO_ERROR;
+
+    TextFile phrases(DICTIONARY_TEST_FILE, "UTF8", status);
+    if (U_FAILURE(status)) {
+        // Whitespace around the macro is required: C++11 and later parse
+        // "literal"IDENTIFIER as a user-defined literal suffix.
+        dataerrln("Can't open " DICTIONARY_TEST_FILE ": %s; skipping test",
+              u_errorName(status));
+        return;
+    }
+
+    // Due to how the word break iterator works,
+    // scripts for languages that use no spaces should use the correct dictionary by default.
+    BreakIterator *wb = BreakIterator::createWordInstance("en", status);
+    if (U_FAILURE(status)) {
+        dataerrln("Word break iterator can not be opened: %s; skipping test",
+              u_errorName(status));
+        return;
+    }
+
+    int32_t pos, pIdx;
+    int32_t testLines = 0;
+    UnicodeString phrase;
+    while (phrases.readLineSkippingComments(phrase, status, FALSE) && U_SUCCESS(status)) {
+        UVector breaks(status);
+
+        // Strip the '|' markers, recording where each one stood.
+        pIdx = 0;
+        while (pIdx < phrase.length()) {
+            if (phrase.charAt(pIdx) == 0x007C /* | */) {
+                breaks.addElement(pIdx, status);
+                phrase.remove(pIdx, 1);
+                // Do not advance: the following character now sits at pIdx
+                // and still has to be examined.
+            } else {
+                pIdx++;
+            }
+        }
+        // pIdx == phrase.length() here: the end of the phrase is the last boundary.
+        breaks.addElement(pIdx, status);
+
+        wb->setText(phrase);
+        int32_t brkArrPos = 0;
+        while ((pos=wb->next())!=BreakIterator::DONE) {
+            int32_t expectedPos = breaks.elementAti(brkArrPos);
+            if (expectedPos != pos) {
+                // Print the int32_t value; elementAt() yields a void* that
+                // must not be passed for a %d conversion.
+                errln("Incorrect forward word break on line %d. Expected: %d  Got: %d",
+                    phrases.getLineNumber(), expectedPos, pos);
+            }
+            brkArrPos++;
+        }
+        brkArrPos = breaks.size() - 1;
+        while ((pos=wb->previous())!=BreakIterator::DONE) {
+            brkArrPos--;
+            // The start of the text (offset 0) is not stored in breaks, so
+            // once the recorded offsets are used up the expected value is 0.
+            int32_t expectedPos = (brkArrPos >= 0) ? breaks.elementAti(brkArrPos) : 0;
+            if (expectedPos != pos) {
+                errln("Incorrect backward word break on line %d. Expected: %d  Got: %d",
+                    phrases.getLineNumber(), expectedPos, pos);
+            }
+        }
+        testLines++;
+    }
+    delete wb;
+    if (U_FAILURE(status)) {
+        // The loop above also stops on a failing status; make that visible
+        // instead of ending the test silently.
+        errln("Error while processing " DICTIONARY_TEST_FILE ": %s", u_errorName(status));
+    }
+    logln("%d tests were run.", testLines);
+}
+
+void DictionaryWordTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par */) // test dispatcher; the par argument is unused here
+{
+    if (exec) logln("TestSuite DictionaryWordTest: "); // suite banner, logged only when exec is set
+    TESTCASE_AUTO_BEGIN;
+    TESTCASE_AUTO(TestThaiBreaks); // Thai word/line break offsets for a fixed sample string
+    TESTCASE_AUTO(TestWordBoundaries); // data-driven word boundaries read from DICTIONARY_TEST_FILE
+    TESTCASE_AUTO_END;
+}
+
+
+#endif
--- a/icu4c/source/test/intltest/dicttest.h
+++ b/icu4c/source/test/intltest/dicttest.h
@ -0,0 +1,31 @@
+/*
+**********************************************************************
+* Copyright (C) 2011-2011, International Business Machines Corporation 
+* and others.  All Rights Reserved.
+**********************************************************************
+************************************************************************
+*   Date          Name        Description
+*   05/14/2011    grhoten     Creation.
+************************************************************************/
+
+#ifndef DICTTEST_H
+#define DICTTEST_H
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+#include "intltest.h"
+
+
+class DictionaryWordTest: public IntlTest { // word/line break tests, run as a sub-suite of IntlTestRBBI
+public:
+    void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL ); // registers TestThaiBreaks and TestWordBoundaries
+    void TestWordBoundaries(); // compares word breaks with '|'-marked phrases from wordsegments.txt
+    void TestThaiBreaks(); // compares Thai word and line breaks with hard-coded offsets
+};
+
+#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
+
+#endif
+
--- a/icu4c/source/test/intltest/intltest.vcxproj
+++ b/icu4c/source/test/intltest/intltest.vcxproj
@ -224,6 +224,7 @@
  </ItemDefinitionGroup>
  <ItemGroup>
    <ClCompile Include="bytestrietest.cpp" />
+    <ClCompile Include="dicttest.cpp" />
    <ClCompile Include="ucharstrietest.cpp" />
    <ClCompile Include="itrbbi.cpp" />
    <ClCompile Include="rbbiapts.cpp" />
@ -389,6 +390,7 @@
    <ClCompile Include="bidiconf.cpp" />
  </ItemGroup>
  <ItemGroup>
+    <ClInclude Include="dicttest.h" />
    <ClInclude Include="itrbbi.h" />
    <ClInclude Include="rbbiapts.h" />
    <ClInclude Include="rbbitst.h" />
@ -533,4 +535,4 @@
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
-</Project>
+</Project>
--- a/icu4c/source/test/intltest/intltest.vcxproj.filters
+++ b/icu4c/source/test/intltest/intltest.vcxproj.filters
@ -444,6 +444,9 @@
    <ClCompile Include="alphaindextst.cpp">
      <Filter>collation</Filter>
    </ClCompile>
+    <ClCompile Include="dicttest.cpp">
+      <Filter>break iteration</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="itrbbi.h">
@ -812,5 +815,8 @@
    <ClInclude Include="alphaindextst.h">
      <Filter>collation</Filter>
    </ClInclude>
+    <ClInclude Include="dicttest.h">
+      <Filter>break iteration</Filter>
+    </ClInclude>
  </ItemGroup>
-</Project>
+</Project>
--- a/icu4c/source/test/intltest/itrbbi.cpp
+++ b/icu4c/source/test/intltest/itrbbi.cpp
@ -1,6 +1,6 @@
 /*
 **********************************************************************
-* Copyright (C) 1998-2001, International Business Machines Corporation 
+* Copyright (C) 1998-2011, International Business Machines Corporation 
 * and others.  All Rights Reserved.
 **********************************************************************
 */
@ -19,28 +19,27 @@
 #include "itrbbi.h"
 #include "rbbiapts.h"
 #include "rbbitst.h"
+#include "dicttest.h"
+
+#define TESTCLASS(n,classname) /* switch case n: sets name to the class name and, if exec, logs it and runs a classname instance through callTest */ \
+    case n:                           \
+        name = #classname;            \
+        if (exec) {                   \
+            logln(#classname "---");  \
+            logln("");                \
+            classname t;              \
+            callTest(t, par);         \
+        }                             \
+        break
+

 void IntlTestRBBI::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par )
 {
    if (exec) logln("TestSuite RuleBasedBreakIterator: ");
    switch (index) {
-        case 0:
-            name = "RBBIAPITest"; 
-            if (exec) {
-                logln("RBBIAPITest--"); logln("");
-                RBBIAPITest test;
-                callTest( test, par );
-            }
-            break;
-
-        case 1:
-           name = "RBBITest"; 
-            if (exec) {
-                logln("RBBITest---"); logln("");
-                RBBITest test;
-                callTest( test, par );
-            }
-            break;
+        TESTCLASS(0, RBBIAPITest);
+        TESTCLASS(1, RBBITest);
+        TESTCLASS(2, DictionaryWordTest);
        default: name=""; break;
    }
 }
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -134,17 +134,15 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
 #if !UCONFIG_NO_FILE_IO
        case 21: name = "TestBug5775";
            if (exec) TestBug5775();                           break;
-        case 22: name = "TestThaiBreaks";
-            if (exec) TestThaiBreaks();                        break;
-        case 23: name = "TestTailoredBreaks";
+        case 22: name = "TestTailoredBreaks";
            if (exec) TestTailoredBreaks();                    break;
 #else
-        case 21: case 22: case 23: name = "skip";
+        case 21: case 22: name = "skip";
            break;
 #endif
-        case 24: name = "TestDictRules";
+        case 23: name = "TestDictRules";
            if (exec) TestDictRules();                         break;
-        case 25: name = "TestBug5532";
+        case 24: name = "TestBug5532";
            if (exec) TestBug5532();                           break;
        default: name = ""; break; //needed to end loop
    }
@ -1810,56 +1808,6 @@ end_test:
 #endif
 }

-void RBBITest::TestThaiBreaks() {
-    UErrorCode status=U_ZERO_ERROR;
-    BreakIterator* b;
-    Locale locale = Locale("th");
-    int32_t p, index;
-    UChar c[]= { 
-            0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B, 
-            0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19, 
-            0x0E16, 0x0E49, 0x0E33, 0x0000
-    };
-    int32_t expectedWordResult[] = {
-            2, 3, 6, 10, 11, 15, 17, 20, 22
-    };
-    int32_t expectedLineResult[] = {
-            3, 6, 11, 15, 17, 20, 22
-    };
-
-    int32_t size = u_strlen(c);
-    UnicodeString text=UnicodeString(c);
-    
-    b = BreakIterator::createWordInstance(locale, status);
-    if (U_FAILURE(status)) {
-        errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
-        return;
-    }
-    b->setText(text);
-    p = index = 0;
-    while ((p=b->next())!=BreakIterator::DONE && p < size) {
-        if (p != expectedWordResult[index++]) {
-            errln("Incorrect break given by thai word break iterator. Expected: %d  Got: %d", expectedWordResult[index-1], p);
-        }
-    }
-    delete b;
-    
-    b = BreakIterator::createLineInstance(locale, status);
-    if (U_FAILURE(status)) {
-        printf("Unable to create thai line break iterator.\n");
-        return;
-    }
-    b->setText(text);
-    p = index = 0;
-    while ((p=b->next())!=BreakIterator::DONE && p < size) {
-        if (p != expectedLineResult[index++]) {
-            errln("Incorrect break given by thai line break iterator. Expected: %d  Got: %d", expectedLineResult[index-1], p);
-        }
-    }
-
-    delete b;
-}
-
 // UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
 // Words don't include colon or period (cldrbug #1969).
 static const char    posxWordText[]     = "Can't have breaks in xx:yy or struct.field for CS-types.";
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 1999-2010, International Business Machines
+ * Copyright (c) 1999-2011, International Business Machines
 * Corporation and others. All Rights Reserved.
 *************************************************************************
 *   Date        Name        Description
@ -68,7 +68,6 @@ public:
    void TestTrieDict();
    void TestUnicodeFiles();
    void TestBug5775();
-    void TestThaiBreaks();
    void TestTailoredBreaks();
    void TestDictRules();
    void TestBug5532();
--- a/icu4c/source/test/testdata/wordsegments.txt
+++ b/icu4c/source/test/testdata/wordsegments.txt
@ -0,0 +1,23 @@
+# Copyright (C) 2011-2011, International Business Machines Corporation
+# and others. All Rights Reserved.
+#
+#   file name:  wordsegments.txt
+#   encoding:   UTF-8
+#
+#   created on: 2011may14
+#   created by: George Rhoten
+#   created by: Nathan Wells
+#
+# Word boundary test data for languages that contain no spaces.
+# Boundaries are delimited with the | character so that it's easier to debug.
+#
+# If you have test data with zero width spaces to delimit the words, use the following example command.
+# Be sure to copy the zero width space in the sed command.
+# echo 'សូមចំណាយពេលបន្តិចដើម្បីអធិស្ឋានអរព្រះគុណដល់ព្រះអង្គ' | sed 's//\|/g'
+#
+
+# Thai
+กู| |กิน|กุ้ง| |ปิ้่|งอ|ยู่|ใน|ถ้ำ
+
+# Khmer
+សូម|ចំណាយពេល|បន្តិច|ដើម្បី|អធិស្ឋាន|អរ|ព្រះគុណ|ដល់|ព្រះអង្គ