52513beddd
X-SVN-Rev: 40302
993 lines
39 KiB
C++
993 lines
39 KiB
C++
// © 2016 and later: Unicode, Inc. and others.
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
/********************************************************************
|
|
* Copyright (c) 2016, International Business Machines Corporation and
|
|
* others. All Rights Reserved.
|
|
********************************************************************/
|
|
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
|
|
|
|
#include "rbbimonkeytest.h"
|
|
#include "unicode/utypes.h"
|
|
#include "unicode/brkiter.h"
|
|
#include "unicode/utf16.h"
|
|
#include "unicode/uniset.h"
|
|
#include "unicode/unistr.h"
|
|
|
|
#include "charstr.h"
|
|
#include "cmemory.h"
|
|
#include "cstr.h"
|
|
#include "uelement.h"
|
|
#include "uhash.h"
|
|
|
|
#include <iostream>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string>
|
|
|
|
using namespace icu;
|
|
|
|
|
|
// Test framework dispatch: the TESTCASE_AUTO macros map the test index to a
// test name and, when exec is set, run the corresponding test function.
void RBBIMonkeyTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* params) {
    // TESTCASE_AUTO cannot hand params to the test function itself,
    // so keep them in a member where testMonkey() reads them.
    fParams = params;

    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(testMonkey);
    TESTCASE_AUTO_END;
}
|
|
|
|
//---------------------------------------------------------------------------------------
|
|
//
|
|
// class BreakRule implementation.
|
|
//
|
|
//---------------------------------------------------------------------------------------
|
|
|
|
// Every field relies on its own default initialization; there is nothing to do here.
BreakRule::BreakRule() {}
|
|
|
|
// No explicit cleanup is performed here.
BreakRule::~BreakRule() {
}
|
|
|
|
|
|
//---------------------------------------------------------------------------------------
|
|
//
|
|
// class BreakRules implementation.
|
|
//
|
|
//---------------------------------------------------------------------------------------
|
|
// Construct an empty rule set belonging to monkeyImpl.
// Opens the hash table of character classes, installs the deleters used by the
// hash table and by the rule vector, creates the (initially empty) list of
// character classes, and compiles the regular expressions used to parse a rule file.
// The break type starts out as UBRK_COUNT. Failures are reported through status.
BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status)
        : fMonkeyImpl(monkeyImpl), fBreakRules(status), fType(UBRK_COUNT) {
    // Character classes are keyed by name; no value comparator is supplied.
    UHashtable *classTable = uhash_open(uhash_hashUnicodeString,
                                        uhash_compareUnicodeString,
                                        NULL,
                                        &status);
    fCharClasses.adoptInstead(classTable);
    if (U_FAILURE(status)) {
        return;
    }
    uhash_setKeyDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
    uhash_setValueDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
    fBreakRules.setDeleter(uprv_deleteUObject);

    fCharClassList.adoptInstead(new UVector(status));

    // Matches an identifier that may be a reference to a character class.
    // NOTE(review): the original comment on the first fragment called it a
    //   "negative look behind" for '{' or '=' or '[:' (to skip Unicode property
    //   names and values), but "(?!" is look-ahead syntax. Confirm the intent
    //   before changing the pattern.
    const UnicodeString setRefsPattern(
        "(?!(?:\\{|=|\\[:)[ \\t]{0,4})"
        "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)");   // The char class name
    fSetRefsMatcher.adoptInstead(new RegexMatcher(setRefsPattern, 0, status));

    // Matches comments and blank lines. Matches get replaced with "",
    // which strips the comments from the rules.
    const UnicodeString commentsPattern(
        "(^|(?<=;))"      // Start either at start of line, or just after a ';' (look-behind for ';')
        "[ \\t]*+"        // Match white space.
        "(#.*)?+"         // Optional # plus whatever follows
        "\\R$"            // new-line at end of line.
        );
    fCommentsMatcher.adoptInstead(new RegexMatcher(commentsPattern, 0, status));

    // Initial parse of a character class definition line:  name = set-expression ;
    const UnicodeString classDefPattern(
        "[ \\t]*"                                   // leading white space
        "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"      // The char class name
        "[ \\t]*=[ \\t]*"                           // =
        "(?<ClassDef>.*?)"                          // The char class UnicodeSet expression
        "[ \\t]*;$");                               // ; <end of line>
    fClassDefMatcher.adoptInstead(new RegexMatcher(classDefPattern, 0, status));

    // Initial parse of a break rule line:  name : rule-definition ;
    const UnicodeString ruleDefPattern(
        "[ \\t]*"                                   // leading white space
        "(?<RuleName>[A-Za-z_][A-Za-z0-9_.]*)"      // The rule name
        "[ \\t]*:[ \\t]*"                           // :
        "(?<RuleDef>.*?)"                           // The rule definition
        "[ \\t]*;$");                               // ; <end of line>
    fRuleDefMatcher.adoptInstead(new RegexMatcher(ruleDefPattern, 0, status));
}
|
|
|
|
|
|
// No explicit cleanup is performed here.
BreakRules::~BreakRules() {
}
|
|
|
|
|
|
// Define (or redefine) a named character class and enter it into fCharClasses.
//   name        the class name.
//   definition  the set expression for the class; identifiers in it that name
//               an already defined class are replaced by that class's expansion.
//   status      ICU error code, set if the expanded definition is not a valid
//               UnicodeSet pattern or if the class could not be stored.
// Returns the new class, which is owned by the fCharClasses hash table,
// or NULL on failure.
CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {

    // Create the expanded definition for this char class,
    // replacing any set references with the corresponding definition.
    UnicodeString expandedDef;
    UnicodeString emptyString;
    fSetRefsMatcher->reset(definition);
    while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
        const UnicodeString refName =
            fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
        const CharClass *refClass = static_cast<const CharClass *>(uhash_get(fCharClasses.getAlias(), &refName));
        // An identifier that is not a known class name is carried over unchanged.
        const UnicodeString &expansionForName = refClass ? refClass->fExpandedDef : refName;

        // Copy the text preceding the match, drop the matched name, then append its expansion.
        fSetRefsMatcher->appendReplacement(expandedDef, emptyString, status);
        expandedDef.append(expansionForName);
    }
    fSetRefsMatcher->appendTail(expandedDef);

    if (fMonkeyImpl->fDumpExpansions) {
        printf("epandedDef: %s\n", CStr(expandedDef)());
    }

    // Verify that the expanded set definition is valid.
    // The set is held by a LocalPointer so that it is released on every failure path.
    LocalPointer<UnicodeSet> s(new UnicodeSet(expandedDef, USET_IGNORE_SPACE, NULL, status));
    if (s.isNull() && U_SUCCESS(status)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    if (U_FAILURE(status)) {
        IntlTest::gTest->errln("%s:%d: error %s creating UnicodeSet %s", __FILE__, __LINE__,
                               u_errorName(status), CStr(name)());
        return NULL;
    }

    LocalPointer<CharClass> newClass(new CharClass(name, definition, expandedDef, s.getAlias()));
    LocalPointer<UnicodeString> key(new UnicodeString(name));
    if (newClass.isNull() || key.isNull()) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    s.orphan();     // The set now belongs to the new CharClass.

    // Duplicate class definitions are legitimate, they are adjustments of an existing class.
    // The check is made before the put: with a value deleter installed on the table,
    // uhash_put() disposes of the replaced value itself and does not return it.
    // TODO: will need to keep the old around when we handle tailorings.
    const UBool isRedefinition = uhash_get(fCharClasses.getAlias(), &name) != NULL;

    CharClass *cclass = newClass.orphan();
    uhash_put(fCharClasses.getAlias(),
              key.orphan(),     // Key, owned by hash table.
              cclass,           // Value, owned by hash table.
              &status);
    if (U_FAILURE(status)) {
        // On failure the hash table has already disposed of the key and value
        // through its deleters, so cclass must not be handed back to the caller.
        return NULL;
    }
    if (isRedefinition) {
        IntlTest::gTest->logln("Redefinition of character class %s\n", CStr(name)());
    }
    return cclass;
}
|
|
|
|
|
|
void BreakRules::addRule(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
|
|
LocalPointer<BreakRule> thisRule(new BreakRule);
|
|
thisRule->fName = name;
|
|
thisRule->fRule = definition;
|
|
|
|
// If the rule name contains embedded digits, pad the first numeric field to a fixed length with leading zeroes,
|
|
// This gives a numeric sort order that matches Unicode UAX rule numbering conventions.
|
|
UnicodeString emptyString;
|
|
|
|
// Expand the char class definitions within the rule.
|
|
fSetRefsMatcher->reset(definition);
|
|
while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
|
|
const UnicodeString name =
|
|
fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
|
|
CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name));
|
|
if (!nameClass) {
|
|
IntlTest::gTest->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"",
|
|
__FILE__, __LINE__, CStr(name)(), CStr(definition)());
|
|
}
|
|
const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
|
|
|
|
fSetRefsMatcher->appendReplacement(thisRule->fExpandedRule, emptyString, status);
|
|
thisRule->fExpandedRule.append(expansionForName);
|
|
}
|
|
fSetRefsMatcher->appendTail(thisRule->fExpandedRule);
|
|
|
|
// Replace the divide sign (\u00f7) with a regular expression named capture.
|
|
// When running the rules, a match that includes this group means we found a break position.
|
|
|
|
int32_t dividePos = thisRule->fExpandedRule.indexOf((UChar)0x00f7);
|
|
if (dividePos >= 0) {
|
|
thisRule->fExpandedRule.replace(dividePos, 1, UnicodeString("(?<BreakPosition>)"));
|
|
}
|
|
if (thisRule->fExpandedRule.indexOf((UChar)0x00f7) != -1) {
|
|
status = U_ILLEGAL_ARGUMENT_ERROR; // TODO: produce a good error message.
|
|
}
|
|
|
|
// UAX break rule set definitions can be empty, just [].
|
|
// Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which
|
|
// also matches nothing.
|
|
|
|
static const UChar emptySet[] = {(UChar)0x5b, (UChar)0x5d, 0};
|
|
int32_t where = 0;
|
|
while ((where = thisRule->fExpandedRule.indexOf(emptySet, 2, 0)) >= 0) {
|
|
thisRule->fExpandedRule.replace(where, 2, UnicodeString("[^\\u0000-\\U0010ffff]"));
|
|
}
|
|
if (fMonkeyImpl->fDumpExpansions) {
|
|
printf("fExpandedRule: %s\n", CStr(thisRule->fExpandedRule)());
|
|
}
|
|
|
|
// Compile a regular expression for this rule.
|
|
thisRule->fRuleMatcher.adoptInstead(new RegexMatcher(thisRule->fExpandedRule, UREGEX_COMMENTS | UREGEX_DOTALL, status));
|
|
if (U_FAILURE(status)) {
|
|
IntlTest::gTest->errln("%s:%d Error creating regular expression for %s",
|
|
__FILE__, __LINE__, CStr(thisRule->fExpandedRule)());
|
|
return;
|
|
}
|
|
|
|
// Put this new rule into the vector of all Rules.
|
|
fBreakRules.addElement(thisRule.orphan(), status);
|
|
}
|
|
|
|
|
|
bool BreakRules::setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status) {
|
|
if (keyword == UnicodeString("locale")) {
|
|
CharString localeName;
|
|
localeName.append(CStr(value)(), -1, status);
|
|
fLocale = Locale::createFromName(localeName.data());
|
|
return true;
|
|
}
|
|
if (keyword == UnicodeString("type")) {
|
|
if (value == UnicodeString("grapheme")) {
|
|
fType = UBRK_CHARACTER;
|
|
} else if (value == UnicodeString("word")) {
|
|
fType = UBRK_WORD;
|
|
} else if (value == UnicodeString("line")) {
|
|
fType = UBRK_LINE;
|
|
} else if (value == UnicodeString("sentence")) {
|
|
fType = UBRK_SENTENCE;
|
|
} else {
|
|
IntlTest::gTest->errln("%s:%d Unrecognized break type %s", __FILE__, __LINE__, CStr(value)());
|
|
}
|
|
return true;
|
|
}
|
|
// TODO: add tailoring base setting here.
|
|
return false;
|
|
}
|
|
|
|
// Create the ICU break iterator that corresponds to this rule set's type (fType)
// and locale (fLocale).
// Returns the iterator, owned by the caller, or NULL if status indicated failure on
// entry, the type is not one of character / word / line / sentence (status is then
// set to U_ILLEGAL_ARGUMENT_ERROR), the factory failed, or the object it produced
// is not a RuleBasedBreakIterator.
RuleBasedBreakIterator *BreakRules::createICUBreakIterator(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }
    // Hold the factory result in a LocalPointer so that it is released
    // should the down-cast below fail.
    LocalPointer<BreakIterator> bi;
    switch(fType) {
        case UBRK_CHARACTER:
            bi.adoptInstead(BreakIterator::createCharacterInstance(fLocale, status));
            break;
        case UBRK_WORD:
            bi.adoptInstead(BreakIterator::createWordInstance(fLocale, status));
            break;
        case UBRK_LINE:
            bi.adoptInstead(BreakIterator::createLineInstance(fLocale, status));
            break;
        case UBRK_SENTENCE:
            bi.adoptInstead(BreakIterator::createSentenceInstance(fLocale, status));
            break;
        default:
            IntlTest::gTest->errln("%s:%d Bad break iterator type of %d", __FILE__, __LINE__, fType);
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return NULL;
    }
    RuleBasedBreakIterator *rbbi = dynamic_cast<RuleBasedBreakIterator *>(bi.getAlias());
    if (rbbi != NULL) {
        bi.orphan();    // Ownership passes to the caller through rbbi.
    }
    return rbbi;
}
|
|
|
|
|
|
// Read and process a file of reference break rules.
//   rules   the open rule file, read one line at a time.
//   status  ICU error code; nothing is done if it indicates failure on entry.
// Each line, after comments are stripped, is either a "name = value;" line (a keyword
// setting or a character class definition) or a "name : rule;" line (a break rule).
// Anything else is reported as a test error. After all lines are read, fCharClassList
// and fDictionarySet are filled in, and a class "__Others" is added for any code
// points not covered by the defined classes.
void BreakRules::compileRules(UCHARBUF *rules, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }

    UnicodeString emptyString;
    for (;;) {     // Loop once per input line.
        if (U_FAILURE(status)) {
            return;
        }
        int32_t lineLength = 0;
        const UChar *lineBuf = ucbuf_readline(rules, &lineLength, &status);
        if (lineBuf == NULL) {
            break;
        }
        UnicodeString line(lineBuf, lineLength);

        // Strip comment lines.
        fCommentsMatcher->reset(line);
        line = fCommentsMatcher->replaceFirst(emptyString, status);
        if (line.isEmpty()) {
            continue;
        }

        // Recognize character class definition and keyword lines
        fClassDefMatcher->reset(line);
        if (fClassDefMatcher->matches(status)) {
            UnicodeString className = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassName", status), status);
            UnicodeString classDef = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassDef", status), status);
            if (fMonkeyImpl->fDumpExpansions) {
                printf("scanned class: %s = %s\n", CStr(className)(), CStr(classDef)());
            }
            if (setKeywordParameter(className, classDef, status)) {
                // The scanned item was "type = ..." or "locale = ...", etc.
                // which are not actual character classes.
                continue;
            }
            addCharClass(className, classDef, status);
            continue;
        }

        // Recognize rule lines.
        fRuleDefMatcher->reset(line);
        if (fRuleDefMatcher->matches(status)) {
            UnicodeString ruleName = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleName", status), status);
            UnicodeString ruleDef = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleDef", status), status);
            if (fMonkeyImpl->fDumpExpansions) {
                printf("scanned rule: %s : %s\n", CStr(ruleName)(), CStr(ruleDef)());
            }
            addRule(ruleName, ruleDef, status);
            continue;
        }

        IntlTest::gTest->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n",
            __FILE__, __LINE__, fMonkeyImpl->fRuleFileName, CStr(line)());
    }

    // The read loop also ends when reading fails; don't go on to build the
    // class list from a partially read rule file.
    if (U_FAILURE(status)) {
        return;
    }

    // Build the vector of char classes, omitting the dictionary class if there is one.
    // This will be used when constructing the random text to be tested.

    // Also compute the "other" set, consisting of any characters not included in
    // one or more of the user defined sets.

    UnicodeSet otherSet((UChar32)0, 0x10ffff);
    int32_t pos = UHASH_FIRST;
    const UHashElement *el = NULL;
    while ((el = uhash_nextElement(fCharClasses.getAlias(), &pos)) != NULL) {
        const UnicodeString *ccName = static_cast<const UnicodeString *>(el->key.pointer);
        CharClass *cclass = static_cast<CharClass *>(el->value.pointer);
        if (*ccName != cclass->fName) {
            IntlTest::gTest->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n",
                    __FILE__, __LINE__, CStr(*ccName)(), CStr(cclass->fName)());
        }
        const UnicodeSet *set = cclass->fSet.getAlias();
        otherSet.removeAll(*set);
        if (*ccName == UnicodeString("dictionary")) {
            fDictionarySet = *set;
        } else {
            fCharClassList->addElement(cclass, status);
        }
    }

    if (!otherSet.isEmpty()) {
        UnicodeString pattern;
        CharClass *cclass = addCharClass(UnicodeString("__Others"), otherSet.toPattern(pattern), status);
        // addCharClass() returns NULL on failure; a NULL entry in the list
        // would be dereferenced by the users of fCharClassList.
        if (cclass != NULL) {
            fCharClassList->addElement(cclass, status);
        }
    }
}
|
|
|
|
|
|
// Find a character class in fCharClassList whose set contains c.
//   c     the code point to look up.
//   iter  optional in/out position in fCharClassList. The search starts at *iter
//         and leaves *iter just past the class that is returned, so that a repeated
//         call continues with the following classes. With a NULL iter the search
//         starts at the beginning of the list.
// Returns the first matching class at or after the starting position, or NULL if none.
const CharClass *BreakRules::getClassForChar(UChar32 c, int32_t *iter) const {
    int32_t scratchPosition = 0;
    int32_t *position = (iter != NULL) ? iter : &scratchPosition;

    for (;;) {
        if (*position >= fCharClassList->size()) {
            return NULL;
        }
        const CharClass *candidate = static_cast<const CharClass *>(fCharClassList->elementAt(*position));
        ++(*position);
        if (candidate->fSet->contains(c)) {
            return candidate;
        }
    }
}
|
|
|
|
//---------------------------------------------------------------------------------------
|
|
//
|
|
// class MonkeyTestData implementation.
|
|
//
|
|
//---------------------------------------------------------------------------------------
|
|
|
|
// Generate a new string of random test data and compute the expected break
// positions for it by applying the reference rules.
//   rules   the reference rules: character classes to draw characters from, and
//           the break rules to apply to the resulting text.
//   rand    random number source. Its seed on entry is saved in fRandomSeed.
//   status  ICU error code, set if the reference rules cannot be applied to the data.
void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status) {
    const int32_t dataLength = 1000;

    // Save the initial seed for use in error messages,
    // allowing recreation of failing data.
    fRandomSeed = rand.getSeed();
    fBkRules = rules;
    fString.remove();

    // Fill the test string with random characters:
    // first randomly pick a char class, then randomly pick a character from that class.
    // Characters from the dictionary set are excluded.
    int32_t charsAppended = 0;
    while (charsAppended < dataLength) {
        int charClassIndex = rand() % rules->fCharClassList->size();
        const CharClass *cclass = static_cast<CharClass *>(rules->fCharClassList->elementAt(charClassIndex));
        if (cclass->fSet->size() == 0) {
            // Some rules or tailorings do end up with empty char classes.
            continue;
        }
        int32_t charIndex = rand() % cclass->fSet->size();
        UChar32 c = cclass->fSet->charAt(charIndex);

        // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
        // Don't let random unpaired surrogates combine in the test data because they might
        // produce an unwanted dictionary character.
        const UBool endsWithLead = fString.length() > 0 && U16_IS_LEAD(fString.charAt(fString.length()-1));
        if (endsWithLead && U16_IS_TRAIL(c)) {
            continue;
        }

        if (rules->fDictionarySet.contains(c)) {
            continue;
        }
        fString.append(c);
        ++charsAppended;
    }

    // Reset each rule matcher regex with this new string.
    //    (Although we are always using the same string object, ICU regular expressions
    //    don't like the underlying string data changing without doing a reset).
    for (int32_t r=0; r<rules->fBreakRules.size(); r++) {
        BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(r));
        rule->fRuleMatcher->reset(fString);
    }

    // Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays).
    // Expected and Actual breaks are one longer than the input string; a non-zero value
    // will indicate a boundary preceding that position.
    clearActualBreaks();
    fExpectedBreaks  = fActualBreaks;
    fRuleForPosition = fActualBreaks;
    f2ndRuleForPos   = fActualBreaks;

    // Apply reference rules to find the expected breaks.

    // Force an expected break before the start of the text.
    // ICU always reports a break there.
    // The reference rules do not have a means to do so.
    fExpectedBreaks.setCharAt(0, (UChar)1);

    int32_t strIdx = 0;
    while (strIdx < fString.length()) {
        BreakRule *matchingRule = NULL;
        UBool      hasBreak = FALSE;
        int32_t    ruleNum = 0;
        int32_t    matchStart = 0;
        int32_t    matchEnd = 0;
        int32_t    breakGroup = 0;

        // Find the first rule that matches at strIdx and that is acceptable.
        for (ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
            BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
            RegexMatcher *matcher = rule->fRuleMatcher.getAlias();
            matcher->reset();
            if (!matcher->lookingAt(strIdx, status)) {
                continue;
            }
            // A candidate rule match, check further to see if we take it or continue to check other rules.
            // Matches of zero or one codepoint count only if they also specify a break.
            matchStart = matcher->start(status);
            matchEnd = matcher->end(status);
            breakGroup = matcher->pattern().groupNumberFromName("BreakPosition", status);
            hasBreak = U_SUCCESS(status);
            if (status == U_REGEX_INVALID_CAPTURE_GROUP_NAME) {
                // The rule simply has no break position group; that is not an error.
                status = U_ZERO_ERROR;
            }
            if (hasBreak || fString.moveIndex32(matchStart, 1) < matchEnd) {
                matchingRule = rule;
                break;
            }
        }

        if (matchingRule == NULL) {
            // No reference rule matched. This is an error in the rules that should never happen.
            IntlTest::gTest->errln("%s:%d Trouble with monkey test reference rules at position %d. ",
                 __FILE__, __LINE__, strIdx);
            dump(strIdx);
            status = U_INVALID_FORMAT_ERROR;
            return;
        }
        if (matchingRule->fRuleMatcher->group(status).length() == 0) {
            // Zero length rule match. This is also an error in the rule expressions.
            IntlTest::gTest->errln("%s:%d Zero length rule match.",
                __FILE__, __LINE__);
            status =  U_INVALID_FORMAT_ERROR;
            return;
        }

        // Record which rule matched over the length of the match.
        // The first rule to cover a position goes into fRuleForPosition,
        // any later one into f2ndRuleForPos.
        for (int i = matchStart; i < matchEnd; i++) {
            if (fRuleForPosition.charAt(i) == 0) {
                fRuleForPosition.setCharAt(i, (UChar)ruleNum);
            } else {
                f2ndRuleForPos.setCharAt(i, (UChar)ruleNum);
            }
        }

        // Break positions appear in rules as a matching named capture of zero length at the break position,
        //   the adjusted pattern contains (?<BreakPosition>)
        if (hasBreak) {
            int32_t breakPos = matchingRule->fRuleMatcher->start(breakGroup, status);
            if (U_FAILURE(status) || breakPos < 0) {
                // Rule specified a break, but that break wasn't part of the match, even
                // though the rule as a whole matched.
                // Can't happen with regular expressions derived from (equivalent to) ICU break rules.
                // Shouldn't get here.
                IntlTest::gTest->errln("%s:%d Internal Rule Error.", __FILE__, __LINE__);
                status =  U_INVALID_FORMAT_ERROR;
                break;
            }
            fExpectedBreaks.setCharAt(breakPos, (UChar)1);
            // For the next iteration, pick up applying rules immediately after the break,
            // which may differ from end of the match. The matching rule may have included
            // context following the boundary that needs to be looked at again.
            strIdx = matchingRule->fRuleMatcher->end(breakGroup, status);
        } else {
            // Original rule didn't specify a break.
            // Continue applying rules starting on the last code point of this match.
            strIdx = fString.moveIndex32(matchEnd, -1);
            if (strIdx == matchStart) {
                // Match was only one code point, no progress if we continue.
                // Shouldn't get here, case is filtered out at top of loop.
                CharString ruleName;
                ruleName.appendInvariantChars(matchingRule->fName, status);
                IntlTest::gTest->errln("%s:%d Rule %s internal error",
                        __FILE__, __LINE__, ruleName.data());
                status = U_INVALID_FORMAT_ERROR;
                break;
            }
        }
        if (U_FAILURE(status)) {
            IntlTest::gTest->errln("%s:%d status = %s. Unexpected failure, perhaps problem internal to test.",
                __FILE__, __LINE__, u_errorName(status));
            break;
        }
    }
}
|
|
|
|
void MonkeyTestData::clearActualBreaks() {
|
|
fActualBreaks.remove();
|
|
// Actual Breaks length is one longer than the data string length, allowing
|
|
// for breaks before the first and after the last character in the data.
|
|
for (int32_t i=0; i<=fString.length(); i++) {
|
|
fActualBreaks.append((UChar)0);
|
|
}
|
|
}
|
|
|
|
// Print a table describing the test data, one line per code point: position, code
// point, character class, expected ('R' column) and actual ('I' column) break
// markers, the names of the rules recorded for the position, and the character name.
//   around  -1 to dump all of the data; otherwise only the 30 code points
//           on either side of this index are shown.
void MonkeyTestData::dump(int32_t around) const {
    printf("\n"
           " char break Rule Character\n"
           " pos code class R I name name\n"
           "---------------------------------------------------------------------------------------------\n");

    int32_t start;
    int32_t end;

    if (around == -1) {
        start = 0;
        end = fString.length();
    } else {
        // Display context around a failure.
        start = fString.moveIndex32(around, -30);
        end = fString.moveIndex32(around, +30);
    }

    for (int charIdx = start; charIdx < end; charIdx=fString.moveIndex32(charIdx, 1)) {
        UErrorCode status = U_ZERO_ERROR;
        UChar32 c = fString.char32At(charIdx);

        // This function runs while reporting a failure, so it tolerates a missing
        // class or rule (the name is then left blank) instead of dereferencing NULL.
        const CharClass *cc = fBkRules->getClassForChar(c);
        CharString ccName;
        if (cc != NULL) {
            ccName.appendInvariantChars(cc->fName, status);
        }

        CharString ruleName, secondRuleName;
        const BreakRule *rule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(fRuleForPosition.charAt(charIdx)));
        if (rule != NULL) {
            ruleName.appendInvariantChars(rule->fName, status);
        }
        if (f2ndRuleForPos.charAt(charIdx) > 0) {
            const BreakRule *secondRule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(f2ndRuleForPos.charAt(charIdx)));
            if (secondRule != NULL) {
                secondRuleName.appendInvariantChars(secondRule->fName, status);
            }
        }

        // Start with an empty name, so that the buffer holds a valid string
        // even if u_charName() does not write to it.
        char cName[200] = "";
        u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);

        printf(" %4.1d %6.4x %-20s %c %c %-10s %-10s %s\n",
            charIdx, c, ccName.data(),
            fExpectedBreaks.charAt(charIdx) ? '*' : '.',
            fActualBreaks.charAt(charIdx) ? '*' : '.',
            ruleName.data(), secondRuleName.data(), cName
        );
    }
}
|
|
|
|
|
|
//---------------------------------------------------------------------------------------
|
|
//
|
|
// class RBBIMonkeyImpl
|
|
//
|
|
//---------------------------------------------------------------------------------------
|
|
|
|
// Construct an idle test object: expansion dumping is off, and the worker thread
// object is bound to this instance. The status parameter is not used; it is left
// unnamed to keep compilers from warning about an unused parameter.
RBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode & /* status */)
        : fDumpExpansions(FALSE), fThread(this) {
}
|
|
|
|
|
|
// RBBIMonkeyImpl setup does all of the setup for a single rule set - compiling the
|
|
// reference rules and creating the icu breakiterator to test,
|
|
// with its type and locale coming from the reference rules.
|
|
|
|
void RBBIMonkeyImpl::setup(const char *ruleFile, UErrorCode &status) {
|
|
fRuleFileName = ruleFile;
|
|
openBreakRules(ruleFile, status);
|
|
if (U_FAILURE(status)) {
|
|
IntlTest::gTest->errln("%s:%d Error %s opening file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
|
|
return;
|
|
}
|
|
fRuleSet.adoptInstead(new BreakRules(this, status));
|
|
fRuleSet->compileRules(fRuleCharBuffer.getAlias(), status);
|
|
if (U_FAILURE(status)) {
|
|
IntlTest::gTest->errln("%s:%d Error %s processing file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
|
|
return;
|
|
}
|
|
fBI.adoptInstead(fRuleSet->createICUBreakIterator(status));
|
|
fTestData.adoptInstead(new MonkeyTestData());
|
|
}
|
|
|
|
|
|
// No explicit cleanup is performed here.
RBBIMonkeyImpl::~RBBIMonkeyImpl() {}
|
|
|
|
|
|
void RBBIMonkeyImpl::openBreakRules(const char *fileName, UErrorCode &status) {
|
|
CharString path;
|
|
path.append(IntlTest::getSourceTestData(status), status);
|
|
path.append("break_rules" U_FILE_SEP_STRING, status);
|
|
path.appendPathPart(fileName, status);
|
|
const char *codePage = "UTF-8";
|
|
fRuleCharBuffer.adoptInstead(ucbuf_open(path.data(), &codePage, TRUE, FALSE, &status));
|
|
}
|
|
|
|
|
|
// Begin testing: starting the thread invokes runTest() in a separate thread.
void RBBIMonkeyImpl::startTest() {
    fThread.start();
}
|
|
|
|
// Join the thread that was started by startTest().
void RBBIMonkeyImpl::join() {
    fThread.join();
}
|
|
|
|
|
|
// Report a test failure detected at a position in the test data.
//   msg    description of the failure.
//   index  the position in the test data at which it was detected.
// Logs the error together with the parameters needed to reproduce it, dumps the test
// data around index when running verbose, and sets the enclosing function's local
// `status` to U_INVALID_STATE_ERROR. For use inside RBBIMonkeyImpl member functions.
// Wrapped in do { } while (0) so that an invocation followed by ';' acts as a
// single statement, including in an unbraced if / else.
#define MONKEY_ERROR(msg, index) do { \
    IntlTest::gTest->errln("%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", \
                           __FILE__, __LINE__, (msg), (index), fRuleFileName, fTestData->fRandomSeed); \
    if (fVerbose) { fTestData->dump(index); } \
    status = U_INVALID_STATE_ERROR; \
} while (0)
|
|
|
|
void RBBIMonkeyImpl::runTest() {
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
int32_t errorCount = 0;
|
|
for (int64_t loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) {
|
|
status = U_ZERO_ERROR;
|
|
fTestData->set(fRuleSet.getAlias(), fRandomGenerator, status);
|
|
if (fBI.isNull()) {
|
|
IntlTest::gTest->dataerrln("Unable to run test because fBI is null.");
|
|
return;
|
|
}
|
|
// fTestData->dump();
|
|
testForwards(status);
|
|
testPrevious(status);
|
|
testFollowing(status);
|
|
testPreceding(status);
|
|
testIsBoundary(status);
|
|
|
|
if (fLoopCount < 0 && loopCount % 100 == 0) {
|
|
fprintf(stderr, ".");
|
|
}
|
|
if (U_FAILURE(status)) {
|
|
if (++errorCount > 10) {
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Test forward iteration with first() / next().
// Every boundary reported by the break iterator is recorded in the test data's
// actual breaks, which are then compared with the expected breaks.
//   status  ICU error code. Nothing is done if it indicates failure on entry;
//           it is set if the test fails.
void RBBIMonkeyImpl::testForwards(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    fTestData->clearActualBreaks();
    fBI->setText(fTestData->fString);
    int32_t previousBreak = -2;
    for (int32_t bk=fBI->first(); bk != BreakIterator::DONE; bk=fBI->next()) {
        if (bk <= previousBreak) {
            MONKEY_ERROR("Break Iterator Stall", bk);
            return;
        }
        if (bk < 0 || bk > fTestData->fString.length()) {
            MONKEY_ERROR("Boundary out of bounds", bk);
            return;
        }
        fTestData->fActualBreaks.setCharAt(bk, 1);
        // Remember this boundary; the stall check above compares against it
        // and can never trigger unless it is kept up to date.
        previousBreak = bk;
    }
    checkResults("testForwards", FORWARD, status);
}
|
|
|
|
// Test following(), calling it for index -1 and for every index in the test data.
// The boundaries found are recorded in the test data's actual breaks, which are
// then compared with the expected breaks.
//   status  ICU error code. Nothing is done if it indicates failure on entry;
//           it is set if the test fails.
void RBBIMonkeyImpl::testFollowing(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    fTestData->clearActualBreaks();
    fBI->setText(fTestData->fString);

    // The boundary most recently found; the next new boundary is expected
    // when following() is asked about exactly this index.
    int32_t latestBreak = -1;
    for (int32_t i=-1 ; i<fTestData->fString.length(); ++i) {
        const int32_t bk = fBI->following(i);
        if (bk == BreakIterator::DONE && i == fTestData->fString.length()) {
            // NOTE(review): with the loop bound above i never equals the string
            //   length, so this case looks unreachable - confirm whether the
            //   loop was meant to include the end of the string.
            continue;
        } else if (bk == latestBreak && bk > i) {
            // i is in the gap between two breaks.
            continue;
        } else if (i == latestBreak && bk > latestBreak) {
            // Stepped past the previous boundary and found the next one.
            fTestData->fActualBreaks.setCharAt(bk, 1);
            latestBreak = bk;
            continue;
        }
        MONKEY_ERROR("following(i)", i);
        return;
    }
    checkResults("testFollowing", FORWARD, status);
}
|
|
|
|
|
|
|
|
// Test reverse iteration with last() / previous().
// Every boundary reported by the break iterator is recorded in the test data's
// actual breaks, which are then compared with the expected breaks.
//   status  ICU error code. Nothing is done if it indicates failure on entry;
//           it is set if the test fails.
void RBBIMonkeyImpl::testPrevious(UErrorCode &status) {
    if (U_FAILURE(status)) {return;}

    fTestData->clearActualBreaks();
    fBI->setText(fTestData->fString);
    int32_t previousBreak = INT32_MAX;
    for (int32_t bk=fBI->last(); bk != BreakIterator::DONE; bk=fBI->previous()) {
        if (bk >= previousBreak) {
            MONKEY_ERROR("Break Iterator Stall", bk);
            return;
        }
        if (bk < 0 || bk > fTestData->fString.length()) {
            MONKEY_ERROR("Boundary out of bounds", bk);
            return;
        }
        fTestData->fActualBreaks.setCharAt(bk, 1);
        // Remember this boundary; the stall check above compares against it
        // and can never trigger unless it is kept up to date.
        previousBreak = bk;
    }
    checkResults("testPrevious", REVERSE, status);
}
|
|
|
|
|
|
// Test preceding(), calling it for every index from one past the end of the test
// data down to zero. The boundaries found are recorded in the test data's actual
// breaks, which are then compared with the expected breaks.
//   status  ICU error code. Nothing is done if it indicates failure on entry;
//           it is set if the test fails.
void RBBIMonkeyImpl::testPreceding(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    fTestData->clearActualBreaks();
    fBI->setText(fTestData->fString);

    // The boundary most recently found; the next new boundary is expected
    // when preceding() is asked about exactly this index.
    int32_t latestBreak = fTestData->fString.length()+1;
    for (int32_t i=fTestData->fString.length()+1 ; i>=0; --i) {
        const int32_t bk = fBI->preceding(i);
        if (bk == BreakIterator::DONE && i == 0) {
            // Nothing precedes the start of the text.
            continue;
        }
        if (bk == latestBreak && bk < i) {
            // i is in the gap between two breaks.
            continue;
        }
        if (i<fTestData->fString.length() && fTestData->fString.getChar32Start(i) < i) {
            // i indexes to a trailing surrogate.
            // Break Iterators treat an index to either half as referring to the supplemental code point,
            // with preceding going to some preceding code point.
            if (fBI->preceding(i) != fBI->preceding(fTestData->fString.getChar32Start(i))) {
                MONKEY_ERROR("preceding of trailing surrogate error", i);
            }
            continue;
        }
        if (i == latestBreak && bk < latestBreak) {
            // Stepped back past the previous boundary and found the next one.
            fTestData->fActualBreaks.setCharAt(bk, 1);
            latestBreak = bk;
            continue;
        }
        MONKEY_ERROR("preceding(i)", i);
        return;
    }
    checkResults("testPreceding", REVERSE, status);
}
|
|
|
|
|
|
// Test isBoundary(), calling it for every index of the test data from the end
// down to zero. Each index reported as a boundary is recorded in the test data's
// actual breaks, which are then compared with the expected breaks.
//   status  ICU error code. Nothing is done if it indicates failure on entry;
//           it is set if the test fails.
void RBBIMonkeyImpl::testIsBoundary(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    fTestData->clearActualBreaks();
    fBI->setText(fTestData->fString);
    for (int i=fTestData->fString.length(); i>=0; --i) {
        if (fBI->isBoundary(i)) {
            fTestData->fActualBreaks.setCharAt(i, 1);
        }
    }
    // Report failures under this test's own name, so that they can be
    // told apart from failures found by testForwards().
    checkResults("testIsBoundary", FORWARD, status);
}
|
|
|
|
void RBBIMonkeyImpl::checkResults(const char *msg, CheckDirection direction, UErrorCode &status) {
|
|
if (U_FAILURE(status)) {
|
|
return;
|
|
}
|
|
if (direction == FORWARD) {
|
|
for (int i=0; i<=fTestData->fString.length(); ++i) {
|
|
if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
|
|
IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
|
|
__FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed);
|
|
if (fVerbose) {
|
|
fTestData->dump(i);
|
|
}
|
|
status = U_INVALID_STATE_ERROR; // Prevent the test from continuing, which would likely
|
|
break; // produce many redundant errors.
|
|
}
|
|
}
|
|
} else {
|
|
for (int i=fTestData->fString.length(); i>=0; i--) {
|
|
if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
|
|
IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
|
|
__FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed);
|
|
if (fVerbose) {
|
|
fTestData->dump(i);
|
|
}
|
|
status = U_INVALID_STATE_ERROR;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
//---------------------------------------------------------------------------------------
|
|
//
|
|
// class RBBIMonkeyTest implementation.
|
|
//
|
|
//---------------------------------------------------------------------------------------
|
|
// No explicit initialization is performed here.
RBBIMonkeyTest::RBBIMonkeyTest() {}
|
|
|
|
// No explicit cleanup is performed here.
RBBIMonkeyTest::~RBBIMonkeyTest() {}
|
|
|
|
|
|
// params, taken from this->fParams.
|
|
// rules=file_name Name of file containing the reference rules.
|
|
// seed=nnnnn Random number starting seed.
|
|
// Setting the seed allows errors to be reproduced.
|
|
// loop=nnn Looping count. Controls running time.
|
|
// -1: run forever.
|
|
// 0 or greater: run length.
|
|
// expansions debug option, show expansions of rules and sets.
|
|
// verbose Display details of the failure.
|
|
//
|
|
// Parameters on the intltest command line follow the test name, and are preceded by '@'.
|
|
// For example,
|
|
// intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1
|
|
//
|
|
void RBBIMonkeyTest::testMonkey() {
|
|
// printf("Test parameters: %s\n", fParams);
|
|
UnicodeString params(fParams);
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
const char *tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt",
|
|
"line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt",
|
|
NULL };
|
|
CharString testNameFromParams;
|
|
if (getStringParam("rules", params, testNameFromParams, status)) {
|
|
tests[0] = testNameFromParams.data();
|
|
tests[1] = NULL;
|
|
}
|
|
|
|
int64_t loopCount = quick? 100 : 5000;
|
|
getIntParam("loop", params, loopCount, status);
|
|
|
|
UBool dumpExpansions = FALSE;
|
|
getBoolParam("expansions", params, dumpExpansions, status);
|
|
|
|
UBool verbose = FALSE;
|
|
getBoolParam("verbose", params, verbose, status);
|
|
|
|
int64_t seed = 0;
|
|
getIntParam("seed", params, seed, status);
|
|
|
|
if (params.length() != 0) {
|
|
// Options processing did not consume all of the parameters. Something unrecognized was present.
|
|
CharString unrecognizedParameters;
|
|
unrecognizedParameters.append(CStr(params)(), -1, status);
|
|
errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__, __LINE__, unrecognizedParameters.data());
|
|
return;
|
|
}
|
|
|
|
UVector startedTests(status);
|
|
if (U_FAILURE(status)) {
|
|
errln("%s:%d: error %s while setting up test.", __FILE__, __LINE__, u_errorName(status));
|
|
return;
|
|
}
|
|
|
|
// Monkey testing is multi-threaded.
|
|
// Each set of break rules to be tested is run in a separate thread.
|
|
// Each thread/set of rules gets a separate RBBIMonkeyImpl object.
|
|
int32_t i;
|
|
for (i=0; tests[i] != NULL; ++i) {
|
|
logln("beginning testing of %s", tests[i]);
|
|
RBBIMonkeyImpl *test = new RBBIMonkeyImpl(status);
|
|
if (U_FAILURE(status)) {
|
|
errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
|
|
break;
|
|
}
|
|
test->fDumpExpansions = dumpExpansions;
|
|
test->fVerbose = verbose;
|
|
test->fRandomGenerator.seed((uint32_t)seed);
|
|
test->fLoopCount = loopCount;
|
|
test->setup(tests[i], status);
|
|
if (U_FAILURE(status)) {
|
|
errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
|
|
break;
|
|
}
|
|
test->startTest();
|
|
startedTests.addElement(test, status);
|
|
if (U_FAILURE(status)) {
|
|
errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
|
|
break;
|
|
}
|
|
}
|
|
|
|
for (i=0; i<startedTests.size(); ++i) {
|
|
RBBIMonkeyImpl *test = static_cast<RBBIMonkeyImpl *>(startedTests.elementAt(i));
|
|
test->join();
|
|
delete test;
|
|
}
|
|
}
|
|
|
|
|
|
// Look for an integer option of the form "name = nnn" (optional leading '-', optional
// spaces around '=', optional trailing comma) in the parameter string.
//
//   name    The option name. Passed by value because the pattern is built by appending to it.
//   params  In/out. The parameter string. When the option is found, its text is removed.
//   val     Out. Receives the parsed value when the option is found; otherwise left unchanged.
//   status  ICU error code, passed through to the regular expression and string functions.
//
// Returns TRUE if the option was found, FALSE otherwise.
UBool RBBIMonkeyTest::getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status) {
    name.append(" *= *(-?\\d+) *,? *");
    RegexMatcher m(name, params, 0, status);
    if (!m.find()) {
        return FALSE;
    }

    // The param exists. Convert the string to an int.
    // strtoll rather than strtol: the destination is 64 bits wide, and long is only
    // 32 bits on some platforms, which would clip large values such as seeds
    // above 2^31-1.
    CharString str;
    str.append(CStr(m.group(1, status))(), -1, status);
    val = strtoll(str.data(), NULL, 10);

    // Delete this parameter from the params string.
    m.reset();
    params = m.replaceFirst(UnicodeString(), status);
    return TRUE;
}
|
|
|
|
// Look for a string option of the form "name = value" in the parameter string.
// The value is the run of characters following the '=' up to, but not including,
// the next space or comma.
//
//   name    The option name. Passed by value because the pattern is built by appending to it.
//   params  In/out. The parameter string. When the option is found, its text is removed.
//   dest    Out. The value is appended to this string when the option is found.
//   status  ICU error code, passed through to the regular expression and string functions.
//
// Returns TRUE if the option was found, FALSE otherwise.
UBool RBBIMonkeyTest::getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status) {
    name.append(" *= *([^ ,]*) *,? *");
    RegexMatcher m(name, params, 0, status);
    if (!m.find()) {
        return FALSE;
    }

    // The param exists.
    dest.append(CStr(m.group(1, status))(), -1, status);

    // Delete this parameter from the params string.
    m.reset();
    params = m.replaceFirst(UnicodeString(), status);
    return TRUE;
}
|
|
|
|
// Look for a boolean option in the parameter string. The option may appear as the
// bare name, which means true, or as "name = true" / "name = false". Matching is
// case insensitive.
//
//   name    The option name. Passed by value because the pattern is built by appending to it.
//   params  In/out. The parameter string. When the option is found, its text is removed.
//   dest    Out. Receives the option value when the option is found; otherwise left unchanged.
//   status  ICU error code, passed through to the regular expression and string functions.
//
// Returns TRUE if the option was found, FALSE otherwise.
UBool RBBIMonkeyTest::getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status) {
    name.append("(?: *= *(true|false))? *,? *");
    RegexMatcher m(name, params, UREGEX_CASE_INSENSITIVE, status);
    if (!m.find()) {
        return FALSE;
    }

    if (m.start(1, status) > 0) {
        // user option included a value.
        dest = m.group(1, status).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT) == 0;
    } else {
        // No explicit user value, implies true.
        dest = TRUE;
    }

    // Delete this parameter from the params string.
    m.reset();
    params = m.replaceFirst(UnicodeString(), status);
    return TRUE;
}
|
|
|
|
#endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */
|