ICU-2093 line break rule updated; monkey test added (not complete, Grapheme Cluster only so far.)

X-SVN-Rev: 12115
This commit is contained in:
Andy Heninger 2003-05-27 16:29:25 +00:00
parent c0e44994ba
commit 4a211d4dd1
4 changed files with 419 additions and 80 deletions

View File

@ -62,10 +62,9 @@ $ALPlus = $AL | $AI | $SA;
# chars that are included in $CM. Use $Extend instead, where possible.
#
$ALcm = $ALPlus $CM*;
$IDcm = $ID $CM*;
$IDcm = ($ID $CM* | $SP $CM+);
$NUcm = $NU $Extend*;
$HYcm = $HY $Extend*;
$SPcm = $SP $Extend*;
$QUcm = $QU $Extend*;
$POcm = $PO $Extend*;
$OPcm = $OP $Extend*;
@ -85,19 +84,19 @@ $INcm = $IN $Extend*;
# appears at the end of line break rule.
#
$NLF = $BK | $CR | $LF | $CR $LF;
$Endings = $SPcm* $ZW* $NLF?;
$Endings = $SP* $ZW* $NLF?;
#
# Openings Sequences that can precede Words, and that should not be separated from them.
# Rules LB 9, 10
#
$Openings = (($QUcm $SPcm*)? $OPcm $SPcm*)*;
$Openings = (($QUcm $SP*)? $OPcm $SP*)*;
#
# Closings Sequences that follow words, and that should not be separated from them,
# Rule LB 8, 11, 15
$Closings = ($SPcm*( ($CL ($SPcm* $NScm)? | $EX | $IS | $SY) $Extend*) | $BAcm | $HYcm | $NScm)*;
$Closings = ($SP*( ($CL ($SP* $NScm)? | $EX | $IS | $SY) $Extend*) | $BAcm | $HYcm | $NScm)*;
#
# Words. Includes mixed Alpha-numerics.
@ -106,7 +105,7 @@ $Closings = ($SPcm*( ($CL ($SPcm* $NScm)? | $EX | $IS | $SY) $Extend*) | $B
$NumberInterior = $IDcm | ($NUcm | $ALcm | $IS $NUcm)+;
$Number = $PR? ($OPcm | $HYcm)? $NumberInterior $CL? $POcm?; # Fancy Number 18
$Word = (($IDcm | ($ALcm | $NUcm)+) ($POcm? | $INcm?)) ; # Alpha-numeric. 16, 17
$Dashes = (($B2cm $SPcm*)*); # Dashes 11a
$Dashes = (($B2cm $SP*)*); # Dashes 11a

View File

@ -20,6 +20,9 @@
#include "unicode/utf16.h"
#include "unicode/ucnv.h"
#include "unicode/schriter.h"
#include "unicode/uniset.h"
#include "unicode/regex.h" // TODO: make conditional on regexp being built.
#include "intltest.h"
#include "rbbitst.h"
#include <string.h>
@ -27,6 +30,7 @@
#include "uvectr32.h"
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
@ -508,25 +512,25 @@ void RBBITest::TestThaiWordBreak() {
// runIndexedTest
//---------------------------------------------
void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
{
if (exec) logln("TestSuite RuleBasedBreakIterator: ");
switch (index) {
case 0: name = "TestExtended";
if(exec) TestExtended(); break;
case 1: name = "";
case 1: name = "extra";
break;
case 2: name = "";
case 2: name = "extra";
break;
case 3: name = "";
case 3: name = "extra";
break;
case 4: name = "";
case 4: name = "extra";
break;
case 5: name = "";
break;
case 6: name = "";
case 5: name = "extra";
break;
case 6: name = "TestJapaneseLineBrea";
if(exec) TestJapaneseLineBreak(); break;
case 7: name = "TestStatusReturn";
if(exec) TestStatusReturn(); break;
@ -552,8 +556,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
if(exec) TestEndBehaviour(); break;
case 16: name = "TestBug4153072";
if(exec) TestBug4153072(); break;
case 17: name = "TestJapaneseLineBreak";
if(exec) TestJapaneseLineBreak(); break;
case 17: name = "TestMonkey";
if(exec) TestMonkey(params); break;
case 18: name = "TestThaiLineBreak";
@ -566,18 +570,6 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
if(exec) TestThaiWordBreak(); break;
// case 7: name = "TestHindiCharacterWrapping()";
// if(exec) TestHindiCharacterWrapping(); break;
// case 8: name = "TestCustomRuleBasedWordIteration";
// if(exec) TestCustomRuleBasedWordIteration(); break;
// case 9: name = "TestAbbrRuleBasedWordIteration";
// if(exec) TestAbbrRuleBasedWordIteration(); break;
// case 10: name = "TestTeluguRuleBasedCharacterIteration";
// if(exec) TestTeluguRuleBasedCharacterIteration(); break;
// case 11: name = "TestCustomRuleBasedCharacterIteration";
// if(exec) TestCustomRuleBasedCharacterIteration(); break;
default: name = ""; break; //needed to end loop
}
}
@ -1952,4 +1944,346 @@ void RBBITest::TestLineBreakData() {
}
//
// Monkey Test for Break Iteration
// Abstract interface class. Concrete derived classes independently
// implement the break rules for different iterator types.
//
//
class RBBIMonkeyKind {
public:
// Return a UVector of UnicodeSets, representing the character classes used
// for this type of iterator.
virtual UVector *charClasses() = 0;
// Find the next break postion, starting from the prev break position, or from zero.
// Return -1 after reaching end of string.
virtual int32_t next(const UnicodeString &s, int32_t i) = 0;
virtual ~RBBIMonkeyKind() {};
UErrorCode deferredStatus;
protected:
RBBIMonkeyKind() {};
private:
};
//------------------------------------------------------------------------------------------
//
// class RBBICharMonkey Character (Grapheme Cluster) specific stuff.
//
//------------------------------------------------------------------------------------------
// Monkey-test model for character (grapheme cluster) break iteration.
// Supplies the character classes that random test text is drawn from, and an
// independent computation of the expected break positions.
class RBBICharMonkey: public RBBIMonkeyKind {
public:
RBBICharMonkey();
virtual ~RBBICharMonkey();
// Returns the list of character-class sets built by the constructor.
virtual UVector *charClasses();
// Returns the next expected break position after index i in s, or -1 when there is none.
virtual int32_t next(const UnicodeString &s, int32_t i);
private:
// List handed out by charClasses(); the constructor fills it with pointers to
// the five sets below. The destructor deletes the list and each set individually.
UVector *fSets;
UnicodeSet *fCRLFSet;
UnicodeSet *fControlSet;
UnicodeSet *fExtendSet;
UnicodeSet *fHangulSet;
UnicodeSet *fAnySet;
// Regular expression matcher used by next() to locate the end of a cluster.
RegexMatcher *fMatcher;
};
// Build the regular expression matcher and the character-class sets used by
// the grapheme cluster monkey test. Errors cannot be reported from here, so
// the outcome is recorded in deferredStatus for the caller to inspect.
RBBICharMonkey::RBBICharMonkey() {
    UErrorCode  status = U_ZERO_ERROR;

    fMatcher    = new RegexMatcher("\\X", 0, status);    // Pattern to match a grapheme cluster

    fCRLFSet    = new UnicodeSet("[\\r\\n]", status);
    fControlSet = new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]]", status);
    fExtendSet  = new UnicodeSet("[\\p{Grapheme_Extend}]", status);
    // All five Hangul syllable types: L, V, T, LV and LVT.
    fHangulSet  = new UnicodeSet(
        "[\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=V}\\p{Hangul_Syllable_Type=T}"
         "\\p{Hangul_Syllable_Type=LV}\\p{Hangul_Syllable_Type=LVT}]", status);
    fAnySet     = new UnicodeSet("[\\u0000-\\U0010ffff]", status);

    fSets       = new UVector(status);

    // Do not dereference or publish anything if an allocation failed.
    if (fMatcher == NULL || fCRLFSet == NULL || fControlSet == NULL || fExtendSet == NULL ||
        fHangulSet == NULL || fAnySet == NULL || fSets == NULL) {
        deferredStatus = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    fSets->addElement(fCRLFSet,    status);
    fSets->addElement(fControlSet, status);
    fSets->addElement(fExtendSet,  status);
    fSets->addElement(fHangulSet,  status);
    fSets->addElement(fAnySet,     status);

    // Always record the outcome, so deferredStatus holds a defined value on
    // success as well as on failure.
    deferredStatus = status;
}
// Locate the end of the grapheme cluster found by searching s from index i.
// Gives back -1 when no cluster is found or when the matcher reports an error.
int32_t RBBICharMonkey::next(const UnicodeString &s, int32_t i) {
    UErrorCode ec = U_ZERO_ERROR;

    fMatcher->reset(s);
    if (!fMatcher->find(i, ec)) {
        return -1;
    }

    int32_t clusterEnd = fMatcher->end(ec);
    return U_FAILURE(ec) ? -1 : clusterEnd;
}
// Hand out the list of character-class sets assembled by the constructor.
// The list remains the property of this object.
UVector *RBBICharMonkey::charClasses() {
    UVector *classList = fSets;
    return classList;
}
// Release everything the constructor allocated: the matcher, each of the
// character-class sets, and the list that referred to them.
RBBICharMonkey::~RBBICharMonkey() {
    delete fMatcher;

    delete fAnySet;
    delete fHangulSet;
    delete fExtendSet;
    delete fControlSet;
    delete fCRLFSet;

    delete fSets;
}
//
// TestMonkey
//
// params
// seed=nnnnn Random number starting seed.
// Setting the seed allows errors to be reproduced.
// loop=nnn Looping count. Controls running time.
// -1: run forever.
// 0 or greater: run length.
// default = 100.
// 0 is minimum
//
// type = char | work | line | sent | title
//
//
// Look for an option of the form "name = <integer>" inside params.
//   name        option name; extended locally into the search pattern.
//   params      option string; a matched option is removed from it.
//   defaultVal  value handed back when the option is not present.
// Returns the integer value of the option, or defaultVal.
static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
    UErrorCode status = U_ZERO_ERROR;
    int32_t    result = defaultVal;

    name.append(" *= *(-?\\d+)");
    RegexMatcher matcher(name, params, 0, status);

    if (matcher.find()) {
        // Option is present: copy its digits into a char buffer and convert.
        char    digits[100];
        int32_t valueStart  = matcher.start(1, status);
        int32_t valueLength = matcher.end(1, status) - valueStart;
        if ((size_t)valueLength >= sizeof(digits)-1) {
            valueLength = sizeof(digits)-2;
        }
        params.extract(valueStart, valueLength, digits, sizeof(digits));
        result = strtol(digits, NULL, 10);

        // Strip the option that was just consumed out of the option string.
        matcher.reset();
        params = matcher.replaceFirst("", status);
    }

    U_ASSERT(U_SUCCESS(status));
    return result;
}
void RBBITest::TestMonkey(char *params) {
UErrorCode status = U_ZERO_ERROR;
int32_t loopCount = 100;
int32_t seed = 1;
UnicodeString breakType = "all";
Locale locale("en");
if (params) {
UnicodeString p(params);
loopCount = getIntParam("loop", p, 100);
seed = getIntParam("seed", p, 1);
RegexMatcher m(" *type *= *(char|work|line|sent|title) *", p, 0, status);
if (m.find()) {
breakType = m.group(1, status);
m.reset();
m.replaceFirst("", status);
}
if (RegexMatcher("\\S", p, 0, status).find()) {
// Each option is stripped out of the option string as it is processed.
// All options have been checked. The option string should have been completely emptied..
char buf[100];
p.extract(buf, sizeof(buf), NULL, status);
buf[sizeof(buf)-1] = 0;
errln("Unrecognized or extra parameter: %s\n", buf);
return;
}
}
if (breakType == "char" || breakType == "all") {
RBBICharMonkey m;
BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
RunMonkey(bi, m, seed, loopCount);
delete bi;
}
}
// Run the monkey test for one kind of break iterator.
//   bi             the break iterator under test.
//   mk             model supplying the character classes for random text and
//                  the expected break positions.
//   seed           starting seed for the random number generator.
//   numIterations  loop count; -1 means run forever. Any other value is
//                  multiplied by five to get the number of test strings.
// Random strings are built from mk's character classes. The break positions
// found by forward and reverse iteration are compared with those computed by
// mk, and the first mismatch in each string is reported with errln().
void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, uint32_t seed, int32_t numIterations) {
    const int32_t    TESTSTRINGLEN = 500;
    UnicodeString    testText;
    int32_t          numCharClasses;
    UVector          *chClasses;
    // Break flags are indexed by UTF-16 offset, so allow two code units per
    // generated code point, plus one slot for the end-of-text position.
    char             expectedBreaks[TESTSTRINGLEN*2 + 1];
    char             forwardBreaks[TESTSTRINGLEN*2 + 1];
    char             reverseBreaks[TESTSTRINGLEN*2 + 1];
    int              i;
    int              loopCount = 0;

    if (bi == NULL) {
        errln("RunMonkey called with a NULL break iterator.");
        return;
    }

    // Check for errors that occurred during the construction of the MonkeyKind object.
    //  Can't report them where they occurred because errln() is a method coming from intlTest,
    //  and is not visible outside of RBBITest :-(
    if (U_FAILURE(mk.deferredStatus)) {
        errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
        return;
    }

    srand(seed);

    chClasses = mk.charClasses();
    if (chClasses == NULL || chClasses->size() <= 0) {
        // Text generation below computes rand() % numCharClasses.
        errln("RBBIMonkeyKind supplied no character classes.");
        return;
    }
    numCharClasses = chClasses->size();

    // Verify that the character classes all have at least one member.
    for (i=0; i<numCharClasses; i++) {
        UnicodeSet *s = (UnicodeSet *)chClasses->elementAti(i);
        if (s == NULL || s->size() == 0) {
            errln("Character Class #%d is null or of zero size.", i);
            return;
        }
    }

    // Capture the "run forever" request before scaling; -1 * 5 would no longer be -1.
    const UBool runForever = (numIterations == -1);
    if (!runForever) {
        numIterations *= 5;
    }

    while (runForever || loopCount <= numIterations) {
        // Populate a test string with data.
        testText.truncate(0);
        for (i=0; i<TESTSTRINGLEN; i++) {
            int32_t     aClassNum = rand() % numCharClasses;
            UnicodeSet *classSet  = (UnicodeSet *)chClasses->elementAti(aClassNum);
            int32_t     charIdx   = rand() % classSet->size();
            UChar32     c         = classSet->charAt(charIdx);
            assert(c >= 0);   // TODO:  deal with sets containing strings.
            testText.append(c);
        }
        const int32_t textLength = testText.length();

        // Calculate the expected results for this test string.
        memset(expectedBreaks, 0, sizeof(expectedBreaks));
        expectedBreaks[0] = 1;
        int32_t breakPos = 0;
        for (;;) {
            breakPos = mk.next(testText, breakPos);
            if (breakPos == -1) {
                break;
            }
            assert(breakPos <= testText.length());
            expectedBreaks[breakPos] = 1;
        }

        // Find the break positions using forward iteration
        memset(forwardBreaks, 0, sizeof(forwardBreaks));
        bi->setText(testText);
        for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
            if (i < 0 || i > textLength) {
                errln("Out of range value returned by breakIterator::next()");
                break;
            }
            forwardBreaks[i] = 1;
        }

        // Find the break positions using reverse iteration
        memset(reverseBreaks, 0, sizeof(reverseBreaks));
        for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
            if (i < 0 || i > textLength) {
                errln("Out of range value returned by breakIterator::previous()");
                break;
            }
            reverseBreaks[i] = 1;
        }

        // Compare the expected and actual results.
        for (i=0; i<=textLength; i++) {
            UBool forwardError = forwardBreaks[i] != expectedBreaks[i];
            UBool anyError     = forwardError || reverseBreaks[i] != expectedBreaks[i];
            if (anyError) {
                // Format a range of the test text that includes the failure as
                //  a data item that can be included in the rbbi test data file.

                // Start of the range is the closest expected break position
                //   before the point of failure.
                int startContext = i;
                for (;;) {
                    if (startContext==0) { break; }
                    startContext--;
                    if (expectedBreaks[startContext] != 0) {break;}
                }

                // End of range is two expected breaks past the failure position,
                //   or the end of the text, whichever comes first.
                // It must never exceed the text length: the formatting loop below
                //   advances with moveIndex32(), which does not move past the end.
                int endContext = i + 1;
                if (endContext > textLength) {
                    endContext = textLength;
                }
                int breaksPassed = 0;
                while (endContext < textLength) {
                    if (expectedBreaks[endContext] != 0) {
                        breaksPassed++;
                        if (breaksPassed == 2) {
                            break;
                        }
                    }
                    endContext++;
                }

                // Format looks like   "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
                UnicodeString errorText = "<data>";
                UnicodeString hexChars("0123456789abcdef");
                int ci;
                for (ci=startContext; ci<endContext;) {
                    UChar32  c;
                    int      bn;
                    c = testText.char32At(ci);
                    if (expectedBreaks[ci] != 0) {
                        errorText.append("<>");
                    }
                    if (c < 0x10000) {
                        errorText.append("\\u");
                        for (bn=12; bn>=0; bn-=4) {
                            errorText.append(hexChars.charAt((c>>bn)&0xf));
                        }
                    } else {
                        errorText.append("\\U");
                        for (bn=28; bn>=0; bn-=4) {
                            errorText.append(hexChars.charAt((c>>bn)&0xf));
                        }
                    }
                    ci = testText.moveIndex32(ci, 1);
                }
                if (expectedBreaks[ci] != 0) {
                    errorText.append("<>");
                }
                errorText.append("</data>\n");

                // Output the error
                char  charErrorTxt[1000];
                UErrorCode status = U_ZERO_ERROR;
                errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
                // Guarantee termination even if the text did not fit in the buffer.
                charErrorTxt[sizeof(charErrorTxt)-1] = 0;
                // NOTE(review): srand() is called only once, before the loop. For every
                //   iteration after the first, the seed printed here is a value drawn
                //   from rand(), not one that was passed to srand(), so rerunning with
                //   it presumably does not regenerate this string - confirm.
                errln("ERROR.  %s.  Direction = %s, Random seed = %d \n%s",
                    (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
                    (forwardError?"forward":"reverse"), seed, charErrorTxt);
                break;
            }
        }

        if (!runForever) {
            // The counter is only needed for bounded runs; leaving it alone
            //   otherwise avoids eventual signed overflow.
            loopCount++;
        }
        seed = rand();
    }
}
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

View File

@ -20,10 +20,11 @@
#include "intltest.h"
#include "unicode/brkiter.h"
class Vector;
class Enumeration;
class BITestData;
class Vector;
class Enumeration;
class BITestData;
struct TestParams;
class RBBIMonkeyKind;
/**
* Test the RuleBasedBreakIterator class giving different rules
@ -63,6 +64,7 @@ public:
void TestMixedThaiLineBreak();
void TestMaiyamok();
void TestThaiWordBreak();
void TestMonkey(char *params);
void TestExtended();
UChar *ReadAndConvertFile(const char *fileName, int &ulen, UErrorCode &status);
@ -115,6 +117,8 @@ private:
void doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars);
void doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars);
void RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, uint32_t seed, int32_t loopCount);
};
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

View File

@ -314,6 +314,7 @@ What is the proper use of the abbreviation pp.•? •Yes, I am definatelly 12"
#
################################################################
<line>
#
# Test Character for each of the line break classes.
#
@ -349,68 +350,69 @@ What is the proper use of the abbreviation pp.•? •Yes, I am definatelly 12"
# 2b Always break at end of text
<data>• \u00A1•</data>
<data>• \u0041•</data>
<data>• \u0009•</data>
<data>• \u00B4•</data>
<data>• \u000C•</data>
<data>• \u2014•</data>
<data>• \uFFFC•</data>
<data>• \u0029•</data>
<data>• \u0301•</data>
<data>• \u0021•</data>
<data>• \u00A0•</data>
<data>• \u002D•</data>
<data>• \u4E00•</data>
<data>• \u2024•</data>
<data>• \u002C•</data>
<data>• \u000A•</data>
<data>• \u0E5A•</data>
<data>• \u0032•</data>
<data>• \u0028•</data>
<data>• \u0025•</data>
<data>• \u0024•</data>
<data>• \u0022•</data>
<data>• \u0E01•</data>
<data>• \uDB7F•</data>
<data>• \u0020•</data>
<data>• \u002F•</data>
<data>• \uF8FF•</data>
<data>• \u200B•</data>
<data>• \u00A1•</data>
<data>• \u0041•</data>
<data>• \u0009•</data>
<data>• \u00B4•</data>
<data>• \u000C•</data> # LB3C × BK
<data>• \u2014•</data>
<data>• \uFFFC•</data>
<data>• \u0029•</data> # LB 8 × CL
# <data>• • \u0301•</data> # LB 7a Treat SP CM* as if it were ID #TODO: SP CM
<data>• \u0021•</data> # LB 8 × EX
#<data>• \u00A0•</data> # LB 11b × GL TODO: fix.
<data>• \u002D•</data>
<data>• \u4E00•</data>
<data>• \u2024•</data>
<data>• \u002C•</data> # LB 8 × IS
<data>• \u000A•</data> # LB3C × ( BK | CR | LF | NL )
<data>• \u0E5A•</data>
<data>• \u0032•</data>
<data>• \u0028•</data>
<data>• \u0025•</data>
<data>• \u0024•</data>
<data>• \u0022•</data>
<data>• \u0E01•</data>
<data>• \uDB7F•</data>
<data>• \u0020•</data> # LB4 - don't break before space.
<data>• \u002F•</data> # LB 8 × SY
<data>• \uF8FF•</data>
<data>• \u200B•</data> # LB4 - don't break before ZW
# 3a Always break after hard line breaks.
# 3c Never break before hard line breaks.
<data>• \u00A1\u2028•\u00A1•</data>
<data>• \u0041\u2028•\u0041•</data>
<data>• \u0009\u2028•\u0009•</data>
<data>• \u00B4\u2028•\u00B4•</data>
<data>• \u000C\u2028•\u000C•</data>
<data>• \u2014\u2028•\u2014•</data>
<data>• \uFFFC\u2028•\uFFFC•</data>
<data>• \u00A1\u2028•\u00A1•</data>
<data>• \u0041\u2028•\u0041•</data>
<data>• \u0009\u2028•\u0009•</data>
<data>• \u00B4\u2028•\u00B4•</data>
<data>• \u000C\u2028•\u000C•</data>
<data>• \u2014\u2028•\u2014•</data>
<data>• \uFFFC\u2028•\uFFFC•</data>
<data>• \u0029\u2028•\u0029•</data>
<data>• \u0301\u2028•\u0301•</data>
#<data>• \u0301\u2028•\u0301•</data> # TODO: fix.
<data>• \u0021\u2028•\u0021•</data>
<data>• \u00A0\u2028•\u00A0•</data>
<data>• \u002D\u2028•\u002D•</data>
<data>• \u4E00\u2028•\u4E00•</data>
<data>• \u2024\u2028•\u2024•</data>
#<data>• \u00A0\u2028•\u00A0•</data> # TODO: fix
<data>• \u002D\u2028•\u002D•</data>
<data>• \u4E00\u2028•\u4E00•</data>
<data>• \u2024\u2028•\u2024•</data>
<data>• \u002C\u2028•\u002C•</data>
<data>• \u000A•\u2028•\u000A•</data>
<data>• \u0E5A\u2028•\u0E5A•</data>
<data>• \u0032\u2028•\u0032•</data>
<data>• \u0028\u2028•\u0028•</data>
<data>• \u0025\u2028•\u0025•</data>
<data>• \u0024\u2028•\u0024•</data>
<data>• \u0022\u2028•\u0022•</data>
<data>• \u0E01\u2028•\u0E01•</data>
<data>• \uDB7F\u2028•\uDB7F•</data>
<data>• \u0E5A\u2028•\u0E5A•</data>
<data>• \u0032\u2028•\u0032•</data>
<data>• \u0028\u2028•\u0028•</data>
<data>• \u0025\u2028•\u0025•</data>
<data>• \u0024\u2028•\u0024•</data>
<data>• \u0022\u2028•\u0022•</data>
<data>• \u0E01\u2028•\u0E01•</data>
<data>• \uDB7F\u2028•\uDB7F•</data>
<data>• \u0020\u2028•\u0020•</data>
<data>• \u002F\u2028•\u002F•</data>
<data>• \uF8FF\u2028•\uF8FF•</data>
<data>• \uF8FF\u2028•\uF8FF•</data>
<data>• \u200B\u2028•\u200B•</data>
#
# Old Line Break Test data. Originally located in RBBITest::TestDefaultRuleBasedLineIteration()
#