231 lines
9.7 KiB
C++
231 lines
9.7 KiB
C++
// © 2016 and later: Unicode, Inc. and others.
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
/*************************************************************************
|
|
* Copyright (c) 2016, International Business Machines
|
|
* Corporation and others. All Rights Reserved.
|
|
*************************************************************************
|
|
*/
|
|
#ifndef RBBIMONKEYTEST_H
|
|
#define RBBIMONKEYTEST_H
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
|
|
|
|
#include "intltest.h"
|
|
|
|
#include "unicode/rbbi.h"
|
|
#include "unicode/regex.h"
|
|
#include "unicode/uniset.h"
|
|
#include "unicode/unistr.h"
|
|
#include "unicode/uobject.h"
|
|
|
|
#include "simplethread.h"
|
|
#include "ucbuf.h"
|
|
#include "uhash.h"
|
|
#include "uvector.h"
|
|
|
|
//  RBBI Monkey Test. Run break iterators against randomly generated strings, compare results with
//                    an independent reference implementation.
//
//         The monkey test can be run with parameters, e.g.
//              intltest rbbi/RBBIMonkeyTest@loop=-1,rules=word.txt
//         will run word break testing in an infinite loop.
//
//         Summary of options
//               rules=name             Test against the named reference rule file.
//                                      Files are found in source/test/testdata/break_rules
//               loop=nnn               Loop nnn times. -1 for no limit. A loop of 1 is useful for debugging.
//               seed=nnnn              Random number generator seed. Allows recreation of a failure.
//                                      Error messages include the necessary seed value.
//               verbose                Display details of a failure. Useful for debugging. Use with loop=1.
//               expansions             Debug option, show expansions of rules and sets.
//
//  TODO:
//     Develop a tailoring format.
//     Hook to old tests that use monkey impl to get expected data.
//     Remove old tests.
|
|
|
|
// Forward declarations; both classes are defined further down in this header.
class BreakRules;
class RBBIMonkeyImpl;
|
|
|
|
/**
 * Monkey test for the RuleBasedBreakIterator class, run against different
 * sets of reference break rules.
 */
|
|
class RBBIMonkeyTest: public IntlTest {
|
|
public:
|
|
RBBIMonkeyTest();
|
|
virtual ~RBBIMonkeyTest();
|
|
|
|
void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
|
|
void testMonkey();
|
|
|
|
|
|
private:
|
|
const char *fParams; // Copy of user parameters passed in from IntlTest.
|
|
|
|
|
|
void testRules(const char *ruleFile);
|
|
static UBool getIntParam(UnicodeString name, UnicodeString ¶ms, int64_t &val, UErrorCode &status);
|
|
static UBool getStringParam(UnicodeString name, UnicodeString ¶ms, CharString &dest, UErrorCode &status);
|
|
static UBool getBoolParam(UnicodeString name, UnicodeString ¶ms, UBool &dest, UErrorCode &status);
|
|
|
|
};
|
|
|
|
// The following classes are internal to the RBBI Monkey Test implementation.
|
|
|
|
|
|
|
|
// class CharClass    Represents a single character class from the source break rules.
//                    Inherits from UObject because instances are adopted by UHashtable, which ultimately
//                    deletes them using the hash table's object deleter function.
|
|
|
|
class CharClass: public UObject {
|
|
public:
|
|
UnicodeString fName;
|
|
UnicodeString fOriginalDef; // set definition as it appeared in user supplied rules.
|
|
UnicodeString fExpandedDef; // set definition with any embedded named sets replaced by their defs, recursively.
|
|
LocalPointer<const UnicodeSet> fSet;
|
|
CharClass(const UnicodeString &name, const UnicodeString &originalDef, const UnicodeString &expandedDef, const UnicodeSet *set) :
|
|
fName(name), fOriginalDef(originalDef), fExpandedDef(expandedDef), fSet(set) {}
|
|
};
|
|
|
|
|
|
// class BreakRule    Represents a single rule from a set of break rules.
//                    Each rule has the set definitions expanded, and
//                    is compiled to a regular expression.
|
|
|
|
class BreakRule: public UObject {
|
|
public:
|
|
BreakRule();
|
|
~BreakRule();
|
|
UnicodeString fName; // Name of the rule.
|
|
UnicodeString fRule; // Rule expression, excluding the name, as written in user source.
|
|
UnicodeString fExpandedRule; // Rule expression after expanding the set definitions.
|
|
LocalPointer<RegexMatcher> fRuleMatcher; // Regular expression that matches the rule.
|
|
bool fInitialMatchOnly = false; // True if rule begins with '^', meaning no chaining.
|
|
};
|
|
|
|
|
|
// class BreakRules    Represents a complete set of break rules, possibly tailored,
//                     compiled from testdata break rules.
|
|
|
|
class BreakRules: public UObject {
|
|
public:
|
|
BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status);
|
|
~BreakRules();
|
|
|
|
void compileRules(UCHARBUF *rules, UErrorCode &status);
|
|
|
|
const CharClass *getClassForChar(UChar32 c, int32_t *iter=NULL) const;
|
|
|
|
|
|
RBBIMonkeyImpl *fMonkeyImpl; // Pointer back to the owning MonkeyImpl instance.
|
|
icu::UVector fBreakRules; // Contents are of type (BreakRule *).
|
|
|
|
LocalUHashtablePointer fCharClasses; // Key is set name (UnicodeString).
|
|
// Value is (CharClass *)
|
|
LocalPointer<UVector> fCharClassList; // Char Classes, same contents as fCharClasses values,
|
|
// but in a vector so they can be accessed by index.
|
|
UnicodeSet fDictionarySet; // Dictionary set, empty if none is defined.
|
|
Locale fLocale;
|
|
UBreakIteratorType fType;
|
|
|
|
CharClass *addCharClass(const UnicodeString &name, const UnicodeString &def, UErrorCode &status);
|
|
void addRule(const UnicodeString &name, const UnicodeString &def, UErrorCode &status);
|
|
bool setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status);
|
|
RuleBasedBreakIterator *createICUBreakIterator(UErrorCode &status);
|
|
|
|
LocalPointer<RegexMatcher> fSetRefsMatcher;
|
|
LocalPointer<RegexMatcher> fCommentsMatcher;
|
|
LocalPointer<RegexMatcher> fClassDefMatcher;
|
|
LocalPointer<RegexMatcher> fRuleDefMatcher;
|
|
};
|
|
|
|
|
|
// class MonkeyTestData    Represents a randomly synthesized test data string together
//                         with the expected break positions obtained by applying
//                         the test break rules.
|
|
|
|
class MonkeyTestData: public UObject {
|
|
public:
|
|
MonkeyTestData() {};
|
|
~MonkeyTestData() {};
|
|
void set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status);
|
|
void clearActualBreaks();
|
|
void dump(int32_t around = -1) const;
|
|
|
|
uint32_t fRandomSeed; // The initial seed value from the random number genererator.
|
|
const BreakRules *fBkRules; // The break rules used to generate this data.
|
|
UnicodeString fString; // The text.
|
|
UnicodeString fExpectedBreaks; // Breaks as found by the reference rules.
|
|
// Parallel to fString. Non-zero if break preceding.
|
|
UnicodeString fActualBreaks; // Breaks as found by ICU break iterator.
|
|
UnicodeString fRuleForPosition; // Index into BreakRules.fBreakRules of rule that applied at each position.
|
|
// Also parallel to fString.
|
|
UnicodeString f2ndRuleForPos; // As above. A 2nd rule applies when the preceding rule
|
|
// didn't cause a break, and a subsequent rule match starts
|
|
// on the last code point of the preceding match.
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// class RBBIMonkeyImpl     Holds (some indirectly) everything associated with running a monkey
//                          test for one set of break rules.
//
//                          When running RBBIMonkeyTest with multiple threads, there is a 1:1 correspondence
//                          between instances of RBBIMonkeyImpl and threads.
//
|
|
class RBBIMonkeyImpl: public UObject {
|
|
public:
|
|
RBBIMonkeyImpl(UErrorCode &status);
|
|
~RBBIMonkeyImpl();
|
|
|
|
void setup(const char *ruleFileName, UErrorCode &status);
|
|
|
|
void startTest();
|
|
void runTest();
|
|
void join();
|
|
|
|
LocalUCHARBUFPointer fRuleCharBuffer; // source file contents of the reference rules.
|
|
LocalPointer<BreakRules> fRuleSet;
|
|
LocalPointer<RuleBasedBreakIterator> fBI;
|
|
LocalPointer<MonkeyTestData> fTestData;
|
|
IntlTest::icu_rand fRandomGenerator;
|
|
const char *fRuleFileName;
|
|
UBool fVerbose; // True to do long dump of failing data.
|
|
int32_t fLoopCount;
|
|
|
|
UBool fDumpExpansions; // Debug flag to output epananded form of rules and sets.
|
|
|
|
enum CheckDirection {
|
|
FORWARD = 1,
|
|
REVERSE = 2
|
|
};
|
|
void clearActualBreaks();
|
|
void testForwards(UErrorCode &status);
|
|
void testPrevious(UErrorCode &status);
|
|
void testFollowing(UErrorCode &status);
|
|
void testPreceding(UErrorCode &status);
|
|
void testIsBoundary(UErrorCode &status);
|
|
void testIsBoundaryRandom(UErrorCode &status);
|
|
void checkResults(const char *msg, CheckDirection dir, UErrorCode &status);
|
|
|
|
class RBBIMonkeyThread: public SimpleThread {
|
|
private:
|
|
RBBIMonkeyImpl *fMonkeyImpl;
|
|
public:
|
|
RBBIMonkeyThread(RBBIMonkeyImpl *impl) : fMonkeyImpl(impl) {};
|
|
void run() U_OVERRIDE { fMonkeyImpl->runTest(); };
|
|
};
|
|
private:
|
|
void openBreakRules(const char *fileName, UErrorCode &status);
|
|
RBBIMonkeyThread fThread;
|
|
|
|
};
|
|
|
|
#endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */
|
|
|
|
#endif // RBBIMONKEYTEST_H
|