2008-05-23 04:22:28 +00:00
|
|
|
/*
|
|
|
|
**********************************************************************
|
|
|
|
* Copyright (C) 2005-2008, International Business Machines
|
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
**********************************************************************
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#include "unicode/unistr.h"
|
|
|
|
#include "unicode/putil.h"
|
|
|
|
#include "unicode/usearch.h"
|
|
|
|
|
|
|
|
#include "cmemory.h"
|
|
|
|
#include "unicode/coll.h"
|
|
|
|
#include "unicode/tblcoll.h"
|
|
|
|
#include "unicode/coleitr.h"
|
|
|
|
#include "unicode/ucoleitr.h"
|
|
|
|
|
|
|
|
#include "unicode/regex.h" // TODO: make conditional on regexp being built.
|
|
|
|
|
|
|
|
#include "unicode/uniset.h"
|
|
|
|
#include "unicode/uset.h"
|
|
|
|
#include "unicode/ustring.h"
|
|
|
|
#include "hash.h"
|
|
|
|
#include "uhash.h"
|
|
|
|
#include "ucol_imp.h"
|
|
|
|
|
|
|
|
#include "intltest.h"
|
|
|
|
#include "ssearch.h"
|
|
|
|
|
|
|
|
#include "xmlparser.h"
|
|
|
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
|
|
|
|
// Identifier of the test case currently being run. searchTest() fills it in
// from the XML "id" attribute and the assertion macros print it on failure.
char testId[100];

// Reports a failure (file, line and current test ID) when x is false.
// Execution continues after the report.
// NOTE: these macros expand to a brace block rather than do { } while (0)
// because existing call sites invoke them without a trailing semicolon.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}

// Reports a failure carrying message m when x is false, then returns from
// the enclosing (void) function.
#define TEST_ASSERT_M(x, m) {if (!(x)) { \
    errln("Failure in file %s, line %d. \"%s\"", __FILE__, __LINE__, m);return;}}

// Reports a failure (including the ICU error name) when errcode is a failure
// code. Execution continues after the report.
#define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
    errln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
          __FILE__, __LINE__, testId, u_errorName(errcode));}}

// Number of elements in a true array (not a pointer). The argument is
// parenthesised so that expressions can be passed safely.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
|
|
|
|
|
|
|
|
//---------------------------------------------------------------------------
|
|
|
|
//
|
|
|
|
// Test class boilerplate
|
|
|
|
//
|
|
|
|
//---------------------------------------------------------------------------
|
|
|
|
// Default constructor; performs no work.
SSearchTest::SSearchTest() {}
|
|
|
|
|
|
|
|
// Destructor; performs no work.
SSearchTest::~SSearchTest() {}
|
|
|
|
|
|
|
|
// Test dispatcher: maps a test index to its name and, when exec is true,
// runs that test.
//   index  - which test is being asked about
//   exec   - TRUE to actually run the test, FALSE to only report its name
//   name   - receives the test name; set to "" for an unknown index
//   params - forwarded to monkeyTest()
void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params )
{
    if (exec) {
        logln("TestSuite SSearchTest: ");
    }

    if (index == 0) {
        name = "searchTest";
        if (exec) {
            searchTest();
        }
    } else if (index == 1) {
        name = "offsetTest";
        if (exec) {
            offsetTest();
        }
    } else if (index == 2) {
        name = "monkeyTest";
        if (exec) {
            monkeyTest(params);
        }
    } else {
        // An empty name is needed to end the caller's loop over the tests.
        name = "";
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#define PATH_BUFFER_SIZE 2048
|
|
|
|
const char *SSearchTest::getPath(char buffer[2048], const char *filename) {
|
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
const char *testDataDirectory = IntlTest::getSourceTestData(status);
|
|
|
|
|
|
|
|
if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) {
|
|
|
|
errln("ERROR: getPath() failed - %s", u_errorName(status));
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
strcpy(buffer, testDataDirectory);
|
|
|
|
strcat(buffer, filename);
|
|
|
|
return buffer;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void SSearchTest::searchTest()
|
|
|
|
{
|
|
|
|
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
char path[PATH_BUFFER_SIZE];
|
|
|
|
const char *testFilePath = getPath(path, "ssearch.xml");
|
|
|
|
|
|
|
|
if (testFilePath == NULL) {
|
|
|
|
return; /* Couldn't get path: error message already output. */
|
|
|
|
}
|
|
|
|
|
|
|
|
UXMLParser *parser = UXMLParser::createParser(status);
|
|
|
|
TEST_ASSERT_SUCCESS(status);
|
|
|
|
UXMLElement *root = parser->parseFile(testFilePath, status);
|
|
|
|
TEST_ASSERT_SUCCESS(status);
|
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
const UnicodeString *debugTestCase = root->getAttribute("debug");
|
|
|
|
if (debugTestCase != NULL) {
|
|
|
|
// setenv("USEARCH_DEBUG", "1", 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
const UXMLElement *testCase;
|
|
|
|
int32_t tc = 0;
|
|
|
|
|
|
|
|
while((testCase = root->nextChildElement(tc)) != NULL) {
|
|
|
|
|
|
|
|
if (testCase->getTagName().compare("test-case") != 0) {
|
|
|
|
errln("ssearch, unrecognized XML Element in test file");
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
const UnicodeString *id = testCase->getAttribute("id");
|
|
|
|
*testId = 0;
|
|
|
|
if (id != NULL) {
|
|
|
|
id->extract(0, id->length(), testId, sizeof(testId), US_INV);
|
|
|
|
}
|
|
|
|
|
|
|
|
// If debugging test case has been specified and this is not it, skip to next.
|
|
|
|
if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
//
|
|
|
|
// Get the requested collation strength.
|
|
|
|
// Default is tertiary if the XML attribute is missing from the test case.
|
|
|
|
//
|
|
|
|
const UnicodeString *strength = testCase->getAttribute("strength");
|
|
|
|
UColAttributeValue collatorStrength;
|
|
|
|
if (strength==NULL) { collatorStrength = UCOL_TERTIARY;}
|
|
|
|
else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;}
|
|
|
|
else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;}
|
|
|
|
else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;}
|
|
|
|
else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
|
|
|
|
else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;}
|
|
|
|
else {
|
|
|
|
// Bogus value supplied for strength. Shouldn't happen, even from
|
|
|
|
// typos, if the XML source has been validated.
|
|
|
|
// This assert is a little deceiving in that strength can be
|
|
|
|
// any of the allowed values, not just TERTIARY, but it will
|
|
|
|
// do the job of getting the error output.
|
|
|
|
TEST_ASSERT(*strength=="TERTIARY")
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// Get the collator normalization flag. Default is UCOL_OFF.
|
|
|
|
//
|
|
|
|
UColAttributeValue normalize = UCOL_OFF;
|
|
|
|
const UnicodeString *norm = testCase->getAttribute("norm");
|
|
|
|
TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
|
|
|
|
if (norm!=NULL && *norm=="ON") {
|
|
|
|
normalize = UCOL_ON;
|
|
|
|
}
|
|
|
|
|
|
|
|
const UnicodeString defLocale("en");
|
|
|
|
char clocale[100];
|
|
|
|
const UnicodeString *locale = testCase->getAttribute("locale");
|
|
|
|
if (locale == NULL || locale->length()==0) {
|
|
|
|
locale = &defLocale;
|
|
|
|
};
|
|
|
|
locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
|
|
|
|
|
|
|
|
|
|
|
|
UnicodeString text;
|
|
|
|
UnicodeString target;
|
|
|
|
UnicodeString pattern;
|
|
|
|
int32_t expectedMatchStart = -1;
|
|
|
|
int32_t expectedMatchLimit = -1;
|
|
|
|
const UXMLElement *n;
|
|
|
|
int nodeCount = 0;
|
|
|
|
|
|
|
|
n = testCase->getChildElement("pattern");
|
|
|
|
TEST_ASSERT(n != NULL);
|
|
|
|
if (n==NULL) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
text = n->getText(FALSE);
|
|
|
|
text = text.unescape();
|
|
|
|
pattern.append(text);
|
|
|
|
nodeCount++;
|
|
|
|
|
|
|
|
n = testCase->getChildElement("pre");
|
|
|
|
if (n!=NULL) {
|
|
|
|
text = n->getText(FALSE);
|
|
|
|
text = text.unescape();
|
|
|
|
target.append(text);
|
|
|
|
nodeCount++;
|
|
|
|
}
|
|
|
|
|
|
|
|
n = testCase->getChildElement("m");
|
|
|
|
if (n!=NULL) {
|
|
|
|
expectedMatchStart = target.length();
|
|
|
|
text = n->getText(FALSE);
|
|
|
|
text = text.unescape();
|
|
|
|
target.append(text);
|
|
|
|
expectedMatchLimit = target.length();
|
|
|
|
nodeCount++;
|
|
|
|
}
|
|
|
|
|
|
|
|
n = testCase->getChildElement("post");
|
|
|
|
if (n!=NULL) {
|
|
|
|
text = n->getText(FALSE);
|
|
|
|
text = text.unescape();
|
|
|
|
target.append(text);
|
|
|
|
nodeCount++;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check that there weren't extra things in the XML
|
|
|
|
TEST_ASSERT(nodeCount == testCase->countChildren());
|
|
|
|
|
|
|
|
// Open a collotor and StringSearch based on the parameters
|
|
|
|
// obtained from the XML.
|
|
|
|
//
|
|
|
|
status = U_ZERO_ERROR;
|
|
|
|
UCollator *collator = ucol_open(clocale, &status);
|
|
|
|
ucol_setStrength(collator, collatorStrength);
|
|
|
|
ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status);
|
|
|
|
UStringSearch *uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
|
|
|
|
target.getBuffer(), target.length(),
|
|
|
|
collator,
|
|
|
|
NULL, // the break iterator
|
|
|
|
&status);
|
|
|
|
|
|
|
|
TEST_ASSERT_SUCCESS(status);
|
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
usearch_close(uss);
|
|
|
|
ucol_close(collator);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
int32_t foundStart = 0;
|
|
|
|
int32_t foundLimit = 0;
|
|
|
|
UBool foundMatch;
|
|
|
|
|
|
|
|
//
|
|
|
|
// Do the search, check the match result against the expected results.
|
|
|
|
//
|
|
|
|
foundMatch= usearch_search(uss, 0, &foundStart, &foundLimit, &status);
|
|
|
|
TEST_ASSERT_SUCCESS(status);
|
|
|
|
if (foundMatch && expectedMatchStart<0 ||
|
|
|
|
foundStart != expectedMatchStart ||
|
|
|
|
foundLimit != expectedMatchLimit) {
|
|
|
|
TEST_ASSERT(FALSE); // ouput generic error position
|
|
|
|
infoln("Found, expected match start = %d, %d \n"
|
|
|
|
"Found, expected match limit = %d, %d",
|
|
|
|
foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
|
|
|
|
}
|
|
|
|
|
|
|
|
// In case there are other matches...
|
|
|
|
// (should we only do this if the test case passed?)
|
|
|
|
while (foundMatch) {
|
|
|
|
expectedMatchStart = foundStart;
|
|
|
|
expectedMatchLimit = foundLimit;
|
|
|
|
|
|
|
|
foundMatch = usearch_search(uss, foundLimit, &foundStart, &foundLimit, &status);
|
|
|
|
}
|
|
|
|
|
|
|
|
usearch_close(uss);
|
2008-06-06 00:44:35 +00:00
|
|
|
|
|
|
|
uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
|
2008-05-23 04:22:28 +00:00
|
|
|
target.getBuffer(), target.length(),
|
|
|
|
collator,
|
|
|
|
NULL,
|
|
|
|
&status);
|
|
|
|
|
|
|
|
//
|
|
|
|
// Do the backwards search, check the match result against the expected results.
|
|
|
|
//
|
|
|
|
foundMatch= usearch_searchBackwards(uss, target.length(), &foundStart, &foundLimit, &status);
|
|
|
|
TEST_ASSERT_SUCCESS(status);
|
|
|
|
if (foundMatch && expectedMatchStart<0 ||
|
|
|
|
foundStart != expectedMatchStart ||
|
|
|
|
foundLimit != expectedMatchLimit) {
|
|
|
|
TEST_ASSERT(FALSE); // ouput generic error position
|
|
|
|
infoln("Found, expected backwards match start = %d, %d \n"
|
|
|
|
"Found, expected backwards match limit = %d, %d",
|
|
|
|
foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
|
|
|
|
}
|
|
|
|
|
|
|
|
usearch_close(uss);
|
|
|
|
ucol_close(collator);
|
|
|
|
}
|
|
|
|
|
|
|
|
delete root;
|
|
|
|
delete parser;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
// One collation element together with the iterator offsets recorded
// around it.
struct Order
{
    int32_t order;       // collation element value
    int32_t lowOffset;   // offset recorded at the low end of the element
    int32_t highOffset;  // offset recorded at the high end of the element
};
|
|
|
|
|
|
|
|
class OrderList
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
OrderList();
|
|
|
|
OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0);
|
|
|
|
~OrderList();
|
|
|
|
|
|
|
|
int32_t size(void) const;
|
|
|
|
void add(int32_t order, int32_t low, int32_t high);
|
|
|
|
const Order *get(int32_t index) const;
|
|
|
|
int32_t getLowOffset(int32_t index) const;
|
|
|
|
int32_t getHighOffset(int32_t index) const;
|
|
|
|
int32_t getOrder(int32_t index) const;
|
|
|
|
void reverse(void);
|
|
|
|
UBool compare(const OrderList &other) const;
|
|
|
|
UBool matchesAt(int32_t offset, const OrderList &other) const;
|
|
|
|
|
|
|
|
private:
|
|
|
|
Order *list;
|
|
|
|
int32_t listMax;
|
|
|
|
int32_t listSize;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Creates an empty list with room for 16 entries.
// The initializers are listed in member declaration order (list, listMax,
// listSize), matching the other constructor.
OrderList::OrderList()
  : list(NULL), listMax(16), listSize(0)
{
    list = new Order[listMax];
}
|
|
|
|
|
|
|
|
// Builds the list from the collation elements of string.
//   coll         - collator that supplies the elements and the strength
//   string       - text to iterate over
//   stringOffset - offset at which iteration starts
// Each element is masked down to the weights that matter for the collator's
// strength; elements that become UCOL_IGNORABLE are dropped. The terminating
// UCOL_NULLORDER is stored as the last entry.
// If the element iterator cannot be opened or positioned, the list is left
// empty instead of using an invalid iterator.
OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset)
  : list(NULL), listMax(16), listSize(0)
{
    UErrorCode status = U_ZERO_ERROR;
    UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
    uint32_t strengthMask = 0;
    int32_t order, low, high;

    // Allocate up front so the object is usable even on the early returns.
    list = new Order[listMax];

    if (U_FAILURE(status) || elems == NULL) {
        if (elems != NULL) {
            ucol_closeElements(elems);
        }
        return;
    }

    // The cases fall through on purpose, so each strength also picks up
    // the masks of the weaker strengths below it.
    switch (ucol_getStrength(coll))
    {
    default:
        strengthMask |= UCOL_TERTIARYORDERMASK;
        /* fall through */

    case UCOL_SECONDARY:
        strengthMask |= UCOL_SECONDARYORDERMASK;
        /* fall through */

    case UCOL_PRIMARY:
        strengthMask |= UCOL_PRIMARYORDERMASK;
    }

    ucol_setOffset(elems, stringOffset, &status);

    if (U_FAILURE(status)) {
        ucol_closeElements(elems);
        return;
    }

    do {
        low   = ucol_getOffset(elems);
        order = ucol_next(elems, &status);
        high  = ucol_getOffset(elems);

        // UCOL_NULLORDER is a sentinel and must not be masked.
        if (order != UCOL_NULLORDER) {
            order &= strengthMask;
        }

        if (order != UCOL_IGNORABLE) {
            add(order, low, high);
        }
    } while (order != UCOL_NULLORDER);

    ucol_closeElements(elems);
}
|
|
|
|
|
|
|
|
// Frees the entry array.
OrderList::~OrderList() { delete[] list; }
|
|
|
|
|
|
|
|
// Appends one entry. When the array is full its capacity is doubled and
// the existing entries are copied into the new array.
void OrderList::add(int32_t order, int32_t low, int32_t high)
{
    const UBool full = listSize >= listMax;

    if (full) {
        listMax *= 2;

        Order *grown = new Order[listMax];

        uprv_memcpy(grown, list, listSize * sizeof(Order));
        delete[] list;
        list = grown;
    }

    Order &slot = list[listSize];

    slot.order      = order;
    slot.lowOffset  = low;
    slot.highOffset = high;

    listSize += 1;
}
|
|
|
|
|
|
|
|
// Returns the entry at index, or NULL when index is out of range.
// Negative indices are rejected as well, so callers can never read in
// front of the array.
const Order *OrderList::get(int32_t index) const
{
    if (index < 0 || index >= listSize) {
        return NULL;
    }

    return &list[index];
}
|
|
|
|
|
|
|
|
// Low offset of the entry at index, or -1 when get() finds no such entry.
int32_t OrderList::getLowOffset(int32_t index) const
{
    const Order *entry = get(index);

    return (entry == NULL) ? -1 : entry->lowOffset;
}
|
|
|
|
|
|
|
|
// High offset of the entry at index, or -1 when get() finds no such entry.
int32_t OrderList::getHighOffset(int32_t index) const
{
    const Order *entry = get(index);

    return (entry == NULL) ? -1 : entry->highOffset;
}
|
|
|
|
|
|
|
|
// Collation element of the entry at index, or UCOL_NULLORDER when get()
// finds no such entry.
int32_t OrderList::getOrder(int32_t index) const
{
    const Order *entry = get(index);

    return (entry == NULL) ? UCOL_NULLORDER : entry->order;
}
|
|
|
|
|
|
|
|
// Number of entries currently stored.
int32_t OrderList::size() const { return listSize; }
|
|
|
|
|
|
|
|
void OrderList::reverse()
|
|
|
|
{
|
|
|
|
for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) {
|
|
|
|
Order swap = list[b];
|
|
|
|
|
|
|
|
list[b] = list[f];
|
|
|
|
list[f] = swap;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns TRUE when other has the same number of entries and every entry
// matches in order value, low offset and high offset.
UBool OrderList::compare(const OrderList &other) const
{
    if (listSize != other.listSize) {
        return FALSE;
    }

    for (int32_t idx = 0; idx < listSize; idx += 1) {
        const Order &mine   = list[idx];
        const Order &theirs = other.list[idx];

        const UBool same = mine.order == theirs.order &&
                           mine.lowOffset == theirs.lowOffset &&
                           mine.highOffset == theirs.highOffset;

        if (!same) {
            return FALSE;
        }
    }

    return TRUE;
}
|
|
|
|
|
|
|
|
// Returns TRUE when the order values of other occur in this list starting
// at offset. Only order values are compared, not offsets.
// A negative offset never matches.
UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const
{
    // NOTE: sizes include the NULLORDER, which we don't want to compare.
    int32_t otherSize = other.size() - 1;

    if (offset < 0 || listSize - 1 - offset < otherSize) {
        return FALSE;
    }

    for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
        if (getOrder(i) != other.getOrder(j)) {
            return FALSE;
        }
    }

    return TRUE;
}
|
|
|
|
|
|
|
|
static char *printOffsets(char *buffer, OrderList &list)
|
|
|
|
{
|
|
|
|
int32_t size = list.size();
|
|
|
|
char *s = buffer;
|
|
|
|
|
|
|
|
for(int32_t i = 0; i < size; i += 1) {
|
|
|
|
const Order *order = list.get(i);
|
|
|
|
|
|
|
|
if (i != 0) {
|
|
|
|
s += sprintf(s, ", ");
|
|
|
|
}
|
|
|
|
|
|
|
|
s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset);
|
|
|
|
}
|
|
|
|
|
|
|
|
return buffer;
|
|
|
|
}
|
|
|
|
|
|
|
|
static char *printOrders(char *buffer, OrderList &list)
|
|
|
|
{
|
|
|
|
int32_t size = list.size();
|
|
|
|
char *s = buffer;
|
|
|
|
|
|
|
|
for(int32_t i = 0; i < size; i += 1) {
|
|
|
|
const Order *order = list.get(i);
|
|
|
|
|
|
|
|
if (i != 0) {
|
|
|
|
s += sprintf(s, ", ");
|
|
|
|
}
|
|
|
|
|
|
|
|
s += sprintf(s, "%8.8X", order->order);
|
|
|
|
}
|
|
|
|
|
|
|
|
return buffer;
|
|
|
|
}
|
|
|
|
|
|
|
|
void SSearchTest::offsetTest()
|
|
|
|
{
|
|
|
|
UnicodeString test[] = {
|
|
|
|
"\\ua191\\u16ef\\u2036\\u017a",
|
|
|
|
|
|
|
|
#if 0
|
|
|
|
// This results in a complex interaction between contraction,
|
|
|
|
// expansion and normalization that confuses the backwards offset fixups.
|
|
|
|
"\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
|
|
|
|
#endif
|
|
|
|
|
|
|
|
"\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
|
|
|
|
"\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3",
|
|
|
|
|
|
|
|
"\\u02FE\\u02FF"
|
|
|
|
"\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F"
|
|
|
|
"\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
|
|
|
|
"\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
|
|
|
|
"\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
|
|
|
|
"\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E",
|
|
|
|
|
|
|
|
"\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318",
|
|
|
|
"abc\\u0E41\\u0301\\u0316",
|
|
|
|
"abc\\u0E41\\u0316\\u0301",
|
|
|
|
"\\u0E41\\u0301\\u0316",
|
|
|
|
"\\u0E41\\u0316\\u0301",
|
|
|
|
"a\\u0301\\u0316",
|
|
|
|
"a\\u0316\\u0301",
|
|
|
|
"\\uAC52\\uAC53",
|
|
|
|
"\\u34CA\\u34CB",
|
|
|
|
"\\u11ED\\u11EE",
|
|
|
|
"\\u30C3\\u30D0",
|
|
|
|
"p\\u00E9ch\\u00E9",
|
|
|
|
"a\\u0301\\u0325",
|
|
|
|
"a\\u0300\\u0325",
|
|
|
|
"a\\u0325\\u0300",
|
|
|
|
"A\\u0323\\u0300B",
|
|
|
|
"A\\u0300\\u0323B",
|
|
|
|
"A\\u0301\\u0323B",
|
|
|
|
"A\\u0302\\u0301\\u0323B",
|
|
|
|
"abc",
|
|
|
|
"ab\\u0300c",
|
|
|
|
"ab\\u0300\\u0323c",
|
|
|
|
" \\uD800\\uDC00\\uDC00",
|
|
|
|
"a\\uD800\\uDC00\\uDC00",
|
|
|
|
"A\\u0301\\u0301",
|
|
|
|
"A\\u0301\\u0323",
|
|
|
|
"A\\u0301\\u0323B",
|
|
|
|
"B\\u0301\\u0323C",
|
|
|
|
"A\\u0300\\u0323B",
|
|
|
|
"\\u0301A\\u0301\\u0301",
|
|
|
|
"abcd\\r\\u0301",
|
|
|
|
"p\\u00EAche",
|
|
|
|
"pe\\u0302che",
|
|
|
|
};
|
|
|
|
|
|
|
|
int32_t testCount = ARRAY_SIZE(test);
|
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status);
|
|
|
|
char buffer[4096]; // A bit of a hack... just happens to be long enough for all the test cases...
|
|
|
|
// We could allocate one that's the right size by (CE_count * 10) + 2
|
|
|
|
// 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]"
|
|
|
|
|
|
|
|
col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
|
|
|
|
|
|
|
|
for(int32_t i = 0; i < testCount; i += 1) {
|
|
|
|
UnicodeString ts = test[i].unescape();
|
|
|
|
CollationElementIterator *iter = col->createCollationElementIterator(ts);
|
|
|
|
OrderList forwardList;
|
|
|
|
OrderList backwardList;
|
|
|
|
int32_t order, low, high;
|
|
|
|
|
|
|
|
do {
|
|
|
|
low = iter->getOffset();
|
|
|
|
order = iter->next(status);
|
|
|
|
high = iter->getOffset();
|
|
|
|
|
|
|
|
forwardList.add(order, low, high);
|
|
|
|
} while (order != CollationElementIterator::NULLORDER);
|
|
|
|
|
|
|
|
iter->reset();
|
|
|
|
iter->setOffset(ts.length(), status);
|
|
|
|
|
|
|
|
backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset());
|
|
|
|
|
|
|
|
do {
|
|
|
|
high = iter->getOffset();
|
|
|
|
order = iter->previous(status);
|
|
|
|
low = iter->getOffset();
|
|
|
|
|
|
|
|
if (order == CollationElementIterator::NULLORDER) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
backwardList.add(order, low, high);
|
|
|
|
} while (TRUE);
|
|
|
|
|
|
|
|
backwardList.reverse();
|
|
|
|
|
|
|
|
if (forwardList.compare(backwardList)) {
|
2008-06-09 21:18:46 +00:00
|
|
|
logln("Works with \"%s\"", test[i].getTerminatedBuffer());
|
2008-05-23 04:22:28 +00:00
|
|
|
logln("Forward offsets: [%s]", printOffsets(buffer, forwardList));
|
|
|
|
// logln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
|
|
|
|
|
|
|
|
logln("Forward CEs: [%s]", printOrders(buffer, forwardList));
|
|
|
|
// logln("Backward CEs: [%s]", printOrders(buffer, backwardList));
|
|
|
|
|
|
|
|
logln();
|
|
|
|
} else {
|
|
|
|
errln("Fails with \"%S\"", test[i].getTerminatedBuffer());
|
|
|
|
infoln("Forward offsets: [%s]", printOffsets(buffer, forwardList));
|
|
|
|
infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
|
|
|
|
|
|
|
|
infoln("Forward CEs: [%s]", printOrders(buffer, forwardList));
|
|
|
|
infoln("Backward CEs: [%s]", printOrders(buffer, backwardList));
|
|
|
|
|
|
|
|
infoln();
|
|
|
|
}
|
2008-06-09 21:18:46 +00:00
|
|
|
delete iter;
|
2008-05-23 04:22:28 +00:00
|
|
|
}
|
2008-06-09 21:18:46 +00:00
|
|
|
delete col;
|
2008-05-23 04:22:28 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
class CEList
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
CEList(UCollator *coll, const UnicodeString &string);
|
|
|
|
~CEList();
|
|
|
|
|
|
|
|
int32_t size() const;
|
|
|
|
int32_t get(int32_t index) const;
|
|
|
|
UBool matchesAt(int32_t offset, const CEList *other) const;
|
|
|
|
|
|
|
|
private:
|
|
|
|
void add(int32_t ce);
|
|
|
|
|
|
|
|
int32_t *ces;
|
|
|
|
int32_t listMax;
|
|
|
|
int32_t listSize;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Collects the collation elements of string.
//   coll   - collator that supplies the elements
//   string - text to iterate over
// Every element is reduced to its primary weight bits; elements that become
// UCOL_IGNORABLE after masking are dropped.
// If the element iterator cannot be opened, the list is left empty instead
// of using an invalid iterator.
CEList::CEList(UCollator *coll, const UnicodeString &string)
    : ces(NULL), listMax(8), listSize(0)
{
    UErrorCode status = U_ZERO_ERROR;
    UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
    uint32_t strengthMask = 0;
    int32_t order;

    // Allocate up front so the object is usable even on the early return.
    ces = new int32_t[listMax];

    if (U_FAILURE(status) || elems == NULL) {
        if (elems != NULL) {
            ucol_closeElements(elems);
        }
        return;
    }

#if 0
    switch (ucol_getStrength(coll))
    {
    default:
        strengthMask |= UCOL_TERTIARYORDERMASK;
        /* fall through */

    case UCOL_SECONDARY:
        strengthMask |= UCOL_SECONDARYORDERMASK;
        /* fall through */

    case UCOL_PRIMARY:
        strengthMask |= UCOL_PRIMARYORDERMASK;
    }
#else
    // Strength-dependent masking is disabled; only primary weights are kept.
    strengthMask = UCOL_PRIMARYORDERMASK;
#endif

    while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
        order &= strengthMask;

        if (order == UCOL_IGNORABLE) {
            continue;
        }

        add(order);
    }

    ucol_closeElements(elems);
}
|
|
|
|
|
|
|
|
// Frees the element array.
CEList::~CEList() { delete[] ces; }
|
|
|
|
|
|
|
|
// Appends one collation element. When the array is full its capacity is
// doubled and the existing elements are copied into the new array.
void CEList::add(int32_t ce)
{
    const UBool full = listSize >= listMax;

    if (full) {
        listMax *= 2;

        int32_t *grown = new int32_t[listMax];

        uprv_memcpy(grown, ces, listSize * sizeof(int32_t));
        delete[] ces;
        ces = grown;
    }

    ces[listSize] = ce;
    listSize += 1;
}
|
|
|
|
|
|
|
|
// Returns the element at index, or -1 when index is outside [0, size()).
int32_t CEList::get(int32_t index) const
{
    if (index < 0 || index >= listSize) {
        return -1;
    }

    return ces[index];
}
|
|
|
|
|
|
|
|
// Returns TRUE when every element of other occurs in this list starting at
// offset. A NULL other or a negative offset never matches, so neither the
// pointer nor the array is accessed out of bounds.
UBool CEList::matchesAt(int32_t offset, const CEList *other) const
{
    if (other == NULL || offset < 0) {
        return FALSE;
    }

    if (listSize - offset < other->size()) {
        return FALSE;
    }

    for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
        if (ces[i] != other->get(j)) {
            return FALSE;
        }
    }

    return TRUE;
}
|
|
|
|
|
|
|
|
// Number of elements stored.
int32_t CEList::size() const { return listSize; }
|
|
|
|
|
|
|
|
class StringList
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
StringList();
|
|
|
|
~StringList();
|
|
|
|
|
|
|
|
void add(const UnicodeString *string);
|
|
|
|
void add(const UChar *chars, int32_t count);
|
|
|
|
const UnicodeString *get(int32_t index) const;
|
|
|
|
int32_t size() const;
|
|
|
|
|
|
|
|
private:
|
|
|
|
UnicodeString *strings;
|
|
|
|
int32_t listMax;
|
|
|
|
int32_t listSize;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Creates an empty list with room for 16 strings.
StringList::StringList()
    : strings(NULL), listMax(16), listSize(0)
{
    UnicodeString *storage = new UnicodeString [listMax];

    strings = storage;
}
|
|
|
|
|
|
|
|
// Frees the string array.
StringList::~StringList() { delete[] strings; }
|
|
|
|
|
|
|
|
void StringList::add(const UnicodeString *string)
|
|
|
|
{
|
|
|
|
if (listSize >= listMax) {
|
|
|
|
listMax *= 2;
|
|
|
|
|
|
|
|
UnicodeString *newStrings = new UnicodeString[listMax];
|
|
|
|
|
|
|
|
uprv_memcpy(newStrings, strings, listSize * sizeof(UnicodeString));
|
|
|
|
|
|
|
|
delete[] strings;
|
|
|
|
strings = newStrings;
|
|
|
|
}
|
|
|
|
|
|
|
|
// The ctor initialized all the strings in
|
|
|
|
// the array to empty strings, so this
|
|
|
|
// is the same as copying the source string.
|
|
|
|
strings[listSize++].append(*string);
|
|
|
|
}
|
|
|
|
|
|
|
|
void StringList::add(const UChar *chars, int32_t count)
|
|
|
|
{
|
|
|
|
const UnicodeString string(chars, count);
|
|
|
|
|
|
|
|
add(&string);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the string at index, or NULL when index is outside [0, size()).
const UnicodeString *StringList::get(int32_t index) const
{
    if (index < 0 || index >= listSize) {
        return NULL;
    }

    return &strings[index];
}
|
|
|
|
|
|
|
|
// Number of strings stored.
int32_t StringList::size() const { return listSize; }
|
|
|
|
|
|
|
|
class CEToStringsMap
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
|
|
|
|
CEToStringsMap();
|
|
|
|
~CEToStringsMap();
|
|
|
|
|
|
|
|
void put(int32_t ce, UnicodeString *string);
|
|
|
|
StringList *getStringList(int32_t ce) const;
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
|
|
|
static void deleteStringList(void *obj);
|
|
|
|
void putStringList(int32_t ce, StringList *stringList);
|
|
|
|
UHashtable *map;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Opens the hash table (integer keys) and installs deleteStringList as its
// value deleter.
// NOTE(review): the status from uhash_open is not examined, and the value
// comparator is a UnicodeString comparator although the values stored are
// StringList objects - confirm both are intended.
CEToStringsMap::CEToStringsMap()
{
    UErrorCode openStatus = U_ZERO_ERROR;

    map = uhash_open(uhash_hashLong, uhash_compareLong,
                     uhash_compareCaselessUnicodeString,
                     &openStatus);

    uhash_setValueDeleter(map, deleteStringList);
}
|
|
|
|
|
|
|
|
// Closes the hash table.
CEToStringsMap::~CEToStringsMap() { uhash_close(map); }
|
|
|
|
|
|
|
|
// Adds string to the list registered under ce. The first string for a
// given ce creates the list and stores it in the map.
void CEToStringsMap::put(int32_t ce, UnicodeString *string)
{
    StringList *bucket = getStringList(ce);

    if (bucket == NULL) {
        bucket = new StringList();
        putStringList(ce, bucket);
    }

    bucket->add(string);
}
|
|
|
|
|
|
|
|
// Looks ce up in the hash table and returns the stored value as a
// StringList pointer.
StringList *CEToStringsMap::getStringList(int32_t ce) const
{
    void *found = uhash_iget(map, ce);

    return (StringList *) found;
}
|
|
|
|
|
|
|
|
// Stores stringList in the hash table under ce. The resulting status is
// not reported to the caller.
void CEToStringsMap::putStringList(int32_t ce, StringList *stringList)
{
    UErrorCode putStatus = U_ZERO_ERROR;

    uhash_iput(map, ce, (void *) stringList, &putStatus);
}
|
|
|
|
|
|
|
|
void CEToStringsMap::deleteStringList(void *obj)
|
|
|
|
{
|
|
|
|
StringList *strings = (StringList *) obj;
|
|
|
|
|
|
|
|
delete strings;
|
|
|
|
}
|
|
|
|
|
|
|
|
class StringToCEsMap
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
StringToCEsMap();
|
|
|
|
~StringToCEsMap();
|
|
|
|
|
|
|
|
void put(const UnicodeString *string, const CEList *ces);
|
|
|
|
const CEList *get(const UnicodeString *string);
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
|
|
|
static void deleteCEList(void *obj);
|
2008-06-09 21:18:46 +00:00
|
|
|
static void deleteUnicodeStringKey(void *obj);
|
2008-05-23 04:22:28 +00:00
|
|
|
|
|
|
|
UHashtable *map;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Opens the hash table (UnicodeString keys) and installs the key and value
// deleters.
// Keys are hashed and compared exactly. With the caseless functions used
// previously, strings that differ only by case shared one entry, so a later
// put() replaced (and deleted) the CEList of an earlier, different string.
StringToCEsMap::StringToCEsMap()
{
    UErrorCode status = U_ZERO_ERROR;

    map = uhash_open(uhash_hashUnicodeString,
                     uhash_compareUnicodeString,
                     uhash_compareLong,
                     &status);

    // Only touch the table when it was actually created.
    if (U_SUCCESS(status) && map != NULL) {
        uhash_setValueDeleter(map, deleteCEList);
        uhash_setKeyDeleter(map, deleteUnicodeStringKey);
    }
}
|
|
|
|
|
|
|
|
// Closes the hash table.
StringToCEsMap::~StringToCEsMap() { uhash_close(map); }
|
|
|
|
|
|
|
|
void StringToCEsMap::put(const UnicodeString *string, const CEList *ces)
|
|
|
|
{
|
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
|
|
|
|
uhash_put(map, (void *) string, (void *) ces, &status);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Looks string up in the hash table and returns the stored value as a
// CEList pointer.
const CEList *StringToCEsMap::get(const UnicodeString *string)
{
    void *found = uhash_get(map, string);

    return (const CEList *) found;
}
|
|
|
|
|
|
|
|
void StringToCEsMap::deleteCEList(void *obj)
|
|
|
|
{
|
|
|
|
CEList *list = (CEList *) obj;
|
|
|
|
|
|
|
|
delete list;
|
|
|
|
}
|
|
|
|
|
2008-06-09 21:18:46 +00:00
|
|
|
void StringToCEsMap::deleteUnicodeStringKey(void *obj)
|
|
|
|
{
|
|
|
|
UnicodeString *key = (UnicodeString *) obj;
|
|
|
|
|
|
|
|
delete key;
|
|
|
|
}
|
|
|
|
|
2008-05-23 04:22:28 +00:00
|
|
|
// Fills both lookup tables from the items of charsToTest.
//   coll                  - collator used to compute each item's CEList
//   charsToTest           - set whose code points and strings are processed
//   charsToCEList         - receives item -> CEList
//   ceToCharsStartingWith - receives first CE of the item -> item
// Every code point of a range item and every string item gets its own
// entry. Items that cannot be fetched (for example a string longer than the
// local buffer) are skipped.
static void buildData(UCollator *coll, USet *charsToTest, StringToCEsMap *charsToCEList, CEToStringsMap *ceToCharsStartingWith)
{
    const int32_t BUFFER_CAPACITY = 16;
    int32_t itemCount = uset_getItemCount(charsToTest);

    for(int32_t item = 0; item < itemCount; item += 1) {
        // Fresh status per item: a failure on one item must not poison the
        // calls for the items that follow it.
        UErrorCode status = U_ZERO_ERROR;
        UChar32 start = 0, end = 0;
        UChar buffer[BUFFER_CAPACITY];
        int32_t len = uset_getItem(charsToTest, item, &start, &end,
                                   buffer, BUFFER_CAPACITY, &status);

        if (U_FAILURE(status) || len > BUFFER_CAPACITY) {
            // The buffer does not hold the whole item; reading len units
            // from it would run past its end.
            continue;
        }

        if (len == 0) {
            // A range item: one entry per code point.
            for (UChar32 ch = start; ch <= end; ch += 1) {
                UnicodeString *st = new UnicodeString(ch);
                CEList *ceList = new CEList(coll, *st);

                charsToCEList->put(st, ceList);
                ceToCharsStartingWith->put(ceList->get(0), st);
            }
        } else if (len > 0) {
            // A string item.
            UnicodeString *st = new UnicodeString(buffer, len);
            CEList *ceList = new CEList(coll, *st);

            charsToCEList->put(st, ceList);
            ceToCharsStartingWith->put(ceList->get(0), st);
        } else {
            // shouldn't happen...
        }
    }
}
|
|
|
|
|
|
|
|
// Appends a printable rendering of string to buffer and returns buffer.
// Code points U+0020..U+007F are appended as they are, except that a
// backslash is doubled. Everything else is written as \uXXXX (up to U+FFFF)
// or \UXXXXXXXX (above U+FFFF).
static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
{
    int32_t pos = 0;

    while (pos < string.length()) {
        const UChar32 cp = string.char32At(pos);
        const UBool inPlainRange = (cp >= 0x0020 && cp <= 0x007F);

        if (inPlainRange && cp == 0x005C) {
            buffer.append("\\\\");
        } else if (inPlainRange) {
            buffer.append(cp);
        } else {
            char cbuffer[12];

            if (cp <= 0xFFFFL) {
                sprintf(cbuffer, "\\u%4.4X", cp);
            } else {
                sprintf(cbuffer, "\\U%8.8X", cp);
            }

            buffer.append(cbuffer);
        }

        // A code point above U+FFFF takes up two code units in the string.
        pos += (cp >= 0x10000L) ? 2 : 1;
    }

    return buffer;
}
|
|
|
|
|
|
|
|
// Estimates how many UChars are needed to produce the collation elements of
// ceList from offset onwards.
//   ceList                - the CE sequence to cover
//   offset                - index of the first CE to cover
//   charsToCEList         - string -> CEList lookup
//   ceToCharsStartingWith - first CE -> candidate strings lookup
//   debug                 - receives the chosen strings, escaped and
//                           separated by "/", followed by a U+0000
// Returns the summed length of the chosen strings.
// At each position the candidate with the largest (CE count - string length)
// is taken. A CE for which no candidate matches contributes nothing and is
// skipped, which guarantees that the loop always advances.
static int32_t minLengthInChars(const CEList *ceList, int32_t offset, StringToCEsMap *charsToCEList, CEToStringsMap *ceToCharsStartingWith,
                                UnicodeString &debug)
{
    // find out shortest string for the longest sequence of ces.
    // needs to be refined to use dynamic programming, but will be roughly right
    int32_t totalStringLength = 0;

    while (offset < ceList->size()) {
        int32_t ce = ceList->get(offset);
        int32_t bestLength = INT32_MIN;
        const UnicodeString *bestString = NULL;
        int32_t bestCeLength = 0;
        const StringList *strings = ceToCharsStartingWith->getStringList(ce);
        // No list is registered for a CE that no known string starts with.
        int32_t stringCount = (strings != NULL) ? strings->size() : 0;

        for (int32_t s = 0; s < stringCount; s += 1) {
            const UnicodeString *string = strings->get(s);
            const CEList *ceList2 = charsToCEList->get(string);

            if (ceList2 != NULL && ceList->matchesAt(offset, ceList2)) {
                int32_t length = ceList2->size() - string->length();

                if (bestLength < length) {
                    bestLength = length;
                    bestCeLength = ceList2->size();
                    bestString = string;
                }
            }
        }

        if (bestString == NULL || bestCeLength <= 0) {
            // Nothing usable covers this CE: step over it rather than
            // dereferencing NULL or looping forever at the same offset.
            offset += 1;
            continue;
        }

        totalStringLength += bestString->length();
        escape(*bestString, debug).append("/");
        offset += bestCeLength;
    }

    debug.append((UChar)0x0000);
    return totalStringLength;
}
|
|
|
|
|
|
|
|
//
// Exercise minLengthInChars() on a few fixed example strings, once for every
// starting offset from 0 up to (but not including) the example's length.
// The results are currently only computed, not checked or printed; the
// commented-out infoln() calls show the intended diagnostic output.
//
static void minLengthTest(UCollator *coll, StringToCEsMap *charsToCEList, CEToStringsMap *ceToCharsStartingWith)
{
    UnicodeString examples[] = {"fuss", "fiss", "affliss", "VII"};
    int32_t exampleCount = sizeof(examples) / sizeof(examples[0]);
    UnicodeString debug;

    for (int32_t e = 0; e < exampleCount; e += 1) {
        UnicodeString &example = examples[e];
        CEList *exampleCEs = new CEList(coll, example);
        int32_t startOffset = 0;

        //infoln("%S:", examples[s].getTerminatedBuffer());

        while (startOffset < example.length()) {
            debug.remove();

            int32_t minLength = minLengthInChars(exampleCEs, startOffset, charsToCEList, ceToCharsStartingWith, debug);
            //infoln("\t%d\t%S", minLength, debug.getTerminatedBuffer());

            startOffset += 1;
        }

        //infoln();
        delete exampleCEs;
    }
}
|
|
|
|
|
|
|
|
//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Works like the standard library's rand() and srand().
//                    The library versions are deliberately not used, so that
//                      1.  the same results are produced on all platforms, and
//                      2.  the current seed is accessible, which makes it
//                          easier to reproduce failures.
//
//---------------------------------------------------------------------------------------

// Current generator state. Assign to it to reseed.
static uint32_t m_seed = 1;

// Advance the generator one step and return a value in the range 0..32767.
static uint32_t m_rand()
{
    const uint32_t multiplier = 1103515245;
    const uint32_t increment  = 12345;

    // Unsigned arithmetic: the multiply-add wraps modulo 2^32.
    m_seed = multiplier * m_seed + increment;

    // Discard the low 16 bits of the state and keep the next 15.
    return (m_seed >> 16) & 0x7FFF;
}
|
|
|
|
|
|
|
|
class Monkey
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
virtual void append(UnicodeString &test, UnicodeString &alternate) = 0;
|
|
|
|
|
|
|
|
protected:
|
|
|
|
Monkey();
|
|
|
|
virtual ~Monkey();
|
|
|
|
};
|
|
|
|
|
|
|
|
Monkey::Monkey()
|
|
|
|
{
|
|
|
|
// ook?
|
|
|
|
}
|
|
|
|
|
|
|
|
Monkey::~Monkey()
|
|
|
|
{
|
|
|
|
// ook?
|
|
|
|
}
|
|
|
|
|
|
|
|
class SetMonkey : public Monkey
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
SetMonkey(const USet *theSet);
|
|
|
|
~SetMonkey();
|
|
|
|
|
|
|
|
virtual void append(UnicodeString &test, UnicodeString &alternate);
|
|
|
|
|
|
|
|
private:
|
|
|
|
const USet *set;
|
|
|
|
};
|
|
|
|
|
|
|
|
SetMonkey::SetMonkey(const USet *theSet)
|
|
|
|
: Monkey(), set(theSet)
|
|
|
|
{
|
|
|
|
// ook?
|
|
|
|
}
|
|
|
|
|
|
|
|
SetMonkey::~SetMonkey()
|
|
|
|
{
|
|
|
|
//ook...
|
|
|
|
}
|
|
|
|
|
|
|
|
void SetMonkey::append(UnicodeString &test, UnicodeString &alternate)
|
|
|
|
{
|
|
|
|
int32_t size = uset_size(set);
|
|
|
|
int32_t index = m_rand() % size;
|
|
|
|
UChar32 ch = uset_charAt(set, index);
|
|
|
|
UnicodeString str(ch);
|
|
|
|
|
|
|
|
test.append(str);
|
|
|
|
alternate.append(str); // flip case, or some junk?
|
|
|
|
}
|
|
|
|
|
|
|
|
class StringSetMonkey : public Monkey
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
StringSetMonkey(const USet *theSet, UCollator *theCollator, StringToCEsMap *theCharsToCEList, CEToStringsMap *theCeToCharsStartingWith);
|
|
|
|
~StringSetMonkey();
|
|
|
|
|
|
|
|
void append(UnicodeString &testCase, UnicodeString &alternate);
|
|
|
|
|
|
|
|
private:
|
|
|
|
UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate);
|
|
|
|
|
|
|
|
const USet *set;
|
|
|
|
UCollator *coll;
|
|
|
|
StringToCEsMap *charsToCEList;
|
|
|
|
CEToStringsMap *ceToCharsStartingWith;
|
|
|
|
};
|
|
|
|
|
|
|
|
StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, StringToCEsMap *theCharsToCEList, CEToStringsMap *theCeToCharsStartingWith)
|
|
|
|
: Monkey(), set(theSet), coll(theCollator), charsToCEList(theCharsToCEList), ceToCharsStartingWith(theCeToCharsStartingWith)
|
|
|
|
{
|
|
|
|
// ook.
|
|
|
|
}
|
|
|
|
|
|
|
|
StringSetMonkey::~StringSetMonkey()
|
|
|
|
{
|
|
|
|
// ook?
|
|
|
|
}
|
|
|
|
|
|
|
|
//
// Pick a random item from the set, append it to 'testCase', and append an
// alternate for it (see generateAlternative()) to 'alternate'.
//
// A set item is either a range of code points, in which case one code point
// is picked at random from the range, or a string, which is used whole.
//
// Nothing is appended when the set has no items, when uset_getItem() reports
// an error, or when the item is a string that does not fit in the local buffer.
//
void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate)
{
    const int32_t bufferCapacity = 16;
    UChar buffer[bufferCapacity];
    int32_t itemCount = uset_getItemCount(set);

    if (itemCount <= 0) {
        // Nothing to choose from, and "m_rand() % 0" would be undefined.
        return;
    }

    int32_t index = m_rand() % itemCount;
    UChar32 rangeStart = 0, rangeEnd = 0;
    UErrorCode err = U_ZERO_ERROR;
    int32_t len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, bufferCapacity, &err);

    if (U_FAILURE(err) || len < 0 || len > bufferCapacity) {
        // On overflow uset_getItem() returns the full string length, which is
        // larger than the buffer; using it would read past the end of 'buffer'.
        return;
    }

    if (len == 0) {
        // The item is a range: choose one code point from it.
        int32_t offset = m_rand() % (rangeEnd - rangeStart + 1);
        UChar32 ch = rangeStart + offset;
        UnicodeString str(ch);

        testCase.append(str);
        generateAlternative(str, alternate);
    } else {
        // The item is a string of 'len' UChars, now held in 'buffer'.
        UnicodeString str(buffer, len);

        testCase.append(str);
        generateAlternative(str, alternate);
    }
}
|
|
|
|
|
|
|
|
//
// Try to build a different string that has the same collation elements as
// 'testCase', and append it to 'alternate'.
//
// The CE list of 'testCase' is walked from the start. At each position a
// string is drawn at random from those recorded for the CE at that position,
// until one is found whose CE list matches there; that string is added to the
// candidate and the walk skips the CEs it covers.
//
// 'testCase' itself is appended instead when it has no CEs, when no strings
// are recorded for a CE, when the random draws run out (more than
// stringCount + 1 draws at one position), or when the finished candidate's
// CE list does not match.
//
// Returns 'alternate'.
//
UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate)
{
    // find out shortest string for the longest sequence of ces.
    // needs to be refined to use dynamic programming, but will be roughly right
    CEList testCEs(coll, testCase);
    UnicodeString candidate;

    if (testCEs.size() == 0) {
        return alternate.append(testCase);
    }

    for (int32_t ceIndex = 0; ceIndex < testCEs.size(); ) {
        const int32_t ce = testCEs.get(ceIndex);
        const StringList *choices = ceToCharsStartingWith->getStringList(ce);

        if (choices == NULL) {
            return alternate.append(testCase);
        }

        const int32_t choiceCount = choices->size();
        const UnicodeString *chosen;
        const CEList *chosenCEs;
        int32_t attempts = 0;

        // find random string that generates the same CEList
        for (;;) {
            // The random number is drawn before the attempt limit is tested,
            // so a draw is consumed even on the iteration that gives up.
            const int32_t pick = m_rand() % choiceCount;

            if (attempts > choiceCount) {
                return alternate.append(testCase);
            }

            attempts += 1;
            chosen    = choices->get(pick);
            chosenCEs = charsToCEList->get(chosen);

            if (testCEs.matchesAt(ceIndex, chosenCEs)) {
                break;
            }
        }

        candidate.append(*chosen);
        ceIndex += chosenCEs->size();
    }

    // Only use the candidate if it really collates like the original.
    const CEList candidateCEs(coll, candidate);

    if (! testCEs.matchesAt(0, &candidateCEs)) {
        return alternate.append(testCase);
    }

    return alternate.append(candidate);
}
|
|
|
|
|
|
|
|
//
// Fill 'testCase' and 'alternate' with random text from the given monkeys.
//
// Each attempt clears both strings, appends one piece from monkeys[0], then
// appends 'pieces' more from randomly chosen monkeys, where 'pieces' is a
// number from 1 to 4 chosen once per call. Attempts repeat until the CE list
// of 'testCase' matches the CE list of 'alternate'.
//
static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate)
{
    const int32_t pieces = (m_rand() % 4) + 1;
    UBool matched = FALSE;

    while (! matched) {
        testCase.remove();
        alternate.remove();
        monkeys[0]->append(testCase, alternate);

        int32_t remaining = pieces;

        while (remaining > 0) {
            const int32_t which = m_rand() % monkeyCount;

            monkeys[which]->append(testCase, alternate);
            remaining -= 1;
        }

        const CEList ceTest(coll, testCase);
        const CEList ceAlt(coll, alternate);

        matched = ceTest.matchesAt(0, &ceAlt);
    }
}
|
|
|
|
|
|
|
|
//
// Open a USet by calling uset_open() with a start (1) that is greater than
// the end (0). NOTE(review): this relies on uset_open() producing an empty
// set for an inverted range - confirm against the uset.h documentation.
// The caller closes the result with uset_close().
//
static inline USet *uset_openEmpty()
{
    const UChar32 start = 1;
    const UChar32 end   = 0;

    return uset_open(start, end);
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// Find the next acceptable boundary following the specified starting index
|
|
|
|
// in the target text being searched.
|
|
|
|
// TODO: refine what is an acceptable boundary. For the moment,
|
|
|
|
// choose the next position not within a combining sequence.
|
|
|
|
//
|
|
|
|
static int32_t nextBoundaryAfter(const UnicodeString &string, int32_t startIndex) {
|
|
|
|
const UChar *text = string.getBuffer();
|
|
|
|
int32_t textLen = string.length();
|
|
|
|
|
|
|
|
if (startIndex >= textLen) {
|
|
|
|
return startIndex;
|
|
|
|
}
|
|
|
|
|
|
|
|
UChar32 c;
|
|
|
|
int32_t i = startIndex;
|
|
|
|
|
|
|
|
U16_NEXT(text, i, textLen, c);
|
|
|
|
|
|
|
|
// If we are on a control character, stop without looking for combining marks.
|
|
|
|
// Control characters do not combine.
|
|
|
|
int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
|
|
|
|
if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
|
|
|
|
return i;
|
|
|
|
}
|
|
|
|
|
|
|
|
// The initial character was not a control, and can thus accept trailing
|
|
|
|
// combining characters. Advance over however many of them there are.
|
|
|
|
int32_t indexOfLastCharChecked;
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
indexOfLastCharChecked = i;
|
|
|
|
|
|
|
|
if (i>=textLen) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
U16_NEXT(text, i, textLen, c);
|
|
|
|
gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
|
|
|
|
|
|
|
|
if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return indexOfLastCharChecked;
|
|
|
|
}
|
|
|
|
|
|
|
|
//
// Report whether 'index' falls inside a combining sequence of 'string':
// the code point at 'index' must be an Extend or SpacingMark character, and
// the code point before it must not be a Control, CR or LF.
//
// Always FALSE for index <= 0 and for index at or beyond the end of the text.
//
static UBool isInCombiningSequence(const UnicodeString &string, int32_t index) {
    const UChar *text     = string.getBuffer();
    const int32_t textLen = string.length();

    if (index <= 0 || index >= textLen) {
        return FALSE;
    }

    // If the character at the current index is not a GRAPHEME_EXTEND
    // then we can not be within a combining sequence.
    UChar32 c;
    U16_GET(text, 0, index, textLen, c);

    int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
    const UBool isMark = (gcProperty == U_GCB_EXTEND || gcProperty == U_GCB_SPACING_MARK);

    if (! isMark) {
        return FALSE;
    }

    // We are at a combining mark.  If the preceding character is anything
    //   except a CONTROL, CR or LF, we are in a combining sequence.
    int32_t before = index;   // U16_PREV moves this back onto the preceding code point

    U16_PREV(text, 0, before, c);
    gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);

    switch (gcProperty) {
    case U_GCB_CONTROL:
    case U_GCB_LF:
    case U_GCB_CR:
        return FALSE;

    default:
        return TRUE;
    }
}
|
|
|
|
|
|
|
|
//
// Reference implementation of collation-based searching, used to compute the
// result the real search is expected to produce.
//
// The target (from 'offset') and the pattern are both converted to lists of
// collation orders, and the pattern's list is tried at each position of the
// target's list. A candidate is accepted only if it starts and ends on a
// character (grapheme) boundary and neither starts nor ends in the middle of
// an expansion.
//
//   matchStart, matchEnd   receive the offsets of the first accepted match.
//                          Both are set to -1 if nothing matches, and to 0
//                          if the pattern has no collation orders.
//
// Returns TRUE if a match was found, FALSE otherwise (including when the
// pattern is empty or the break iterator could not be created).
//
static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
{
    UErrorCode status = U_ZERO_ERROR;
    OrderList targetOrders(coll, target, offset);
    OrderList patternOrders(coll, pattern);

    // NOTE(review): the "- 1" presumably excludes a terminating entry that
    // OrderList stores after the real orders (the loop below compares the
    // order just past the match against UCOL_NULLORDER) - confirm.
    int32_t targetSize  = targetOrders.size() - 1;
    int32_t patternSize = patternOrders.size() - 1;

    // Handle the empty pattern before opening the break iterator, so that
    // this early return has nothing to clean up.
    if (patternSize == 0) {
        matchStart = matchEnd = 0;
        return FALSE;
    }

    matchStart = matchEnd = -1;

    UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocale(coll, ULOC_VALID_LOCALE, &status),
                                                  target.getBuffer(), target.length(), &status);

    if (U_FAILURE(status)) {
        // Without a break iterator the boundary checks below cannot be made.
        ubrk_close(charBreakIterator);
        return FALSE;
    }

    UBool found = FALSE;

    for(int32_t i = 0; i < targetSize; i += 1) {
        if (! targetOrders.matchesAt(i, patternOrders)) {
            continue;
        }

        int32_t start    = targetOrders.getLowOffset(i);
        int32_t maxLimit = targetOrders.getLowOffset(i + patternSize);
        int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1);

        // if the low and high offsets of the first CE in
        // the match are the same, it means that the match
        // starts in the middle of an expansion - all but
        // the first CE of the expansion will have the offset
        // of the following character.
        if (start == targetOrders.getHighOffset(i)) {
            continue;
        }

        // Make sure match starts on a grapheme boundary
        if (! ubrk_isBoundary(charBreakIterator, start)) {
            continue;
        }

        // If the low and high offsets of the CE after the match
        // are the same, it means that the match ends in the middle
        // of an expansion sequence.
        if (maxLimit == targetOrders.getHighOffset(i + patternSize) &&
            targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) {
            continue;
        }

        int32_t mend = maxLimit;

        // Find the first grapheme break after the character index
        // of the last CE in the match. If it's after character index
        // that's after the last CE in the match, use that index
        // as the end of the match.
        if (minLimit < maxLimit) {
            int32_t nba = ubrk_following(charBreakIterator, minLimit);

            if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
                mend = nba;
            }
        }

        if (mend > maxLimit) {
            continue;
        }

        if (! ubrk_isBoundary(charBreakIterator, mend)) {
            continue;
        }

        matchStart = start;
        matchEnd   = mend;
        found      = TRUE;
        break;
    }

    ubrk_close(charBreakIterator);

    return found;
}
|
|
|
|
|
|
|
|
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
|
|
|
static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {
|
|
|
|
int32_t val = defaultVal;
|
|
|
|
|
|
|
|
name.append(" *= *(-?\\d+)");
|
|
|
|
|
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
RegexMatcher m(name, params, 0, status);
|
|
|
|
|
|
|
|
if (m.find()) {
|
|
|
|
// The param exists. Convert the string to an int.
|
|
|
|
char valString[100];
|
|
|
|
int32_t paramLength = m.end(1, status) - m.start(1, status);
|
|
|
|
|
|
|
|
if (paramLength >= (int32_t)(sizeof(valString)-1)) {
|
|
|
|
paramLength = (int32_t)(sizeof(valString)-2);
|
|
|
|
}
|
|
|
|
|
|
|
|
params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
|
|
|
|
val = strtol(valString, NULL, 10);
|
|
|
|
|
|
|
|
// Delete this parameter from the params string.
|
|
|
|
m.reset();
|
|
|
|
params = m.replaceFirst("", status);
|
|
|
|
}
|
|
|
|
|
|
|
|
//U_ASSERT(U_SUCCESS(status));
|
|
|
|
if (! U_SUCCESS(status)) {
|
|
|
|
val = defaultVal;
|
|
|
|
}
|
|
|
|
|
|
|
|
return val;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
//
// Search 'testCase' first for 'pattern' and then for 'altPattern', comparing
// the first match found by usearch with the one computed by simpleSearch().
// Any difference in the match start or end is reported with errln().
//
//   name      description of the test case, used in the failure message.
//   strength  name of the collation strength, used in the failure message.
//   seed      random seed of the test case, used in the failure message.
//
// Returns how many of the two searches (0, 1 or 2) found nothing with both
// implementations.
//
int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
                                    const char *name, const char *strength, uint32_t seed)
{
    UErrorCode status = U_ZERO_ERROR;
    int32_t foundStart = -1, foundEnd = -1;
    //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
    int32_t wantStart = -1, wantEnd = -1;
    int32_t misses = 0;
    UStringSearch *uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
                                                  testCase.getBuffer(), testCase.length(),
                                                  coll,
                                                  NULL,     // the break iterator
                                                  &status);

    // ---- First pass: the pattern itself ----

    // **** TODO: find *all* matches, not just first one ****
    simpleSearch(coll, testCase, 0, pattern, wantStart, wantEnd);

#if 0
    usearch_search(uss, 0, &actualStart, &actualEnd, &status);
#else
    foundStart = usearch_next(uss, &status);
    foundEnd   = foundStart + usearch_getMatchedLength(uss);
#endif

    if (!(foundStart == wantStart && foundEnd == wantEnd)) {
        errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
              " strength=%s seed=%d",
              name, wantStart, wantEnd, foundStart, foundEnd, strength, seed);
    }

    if (wantStart == -1 && foundStart == -1) {
        misses += 1;
    }

    // ---- Second pass: the alternate pattern ----

    // **** TODO: find *all* matches, not just first one ****
    simpleSearch(coll, testCase, 0, altPattern, wantStart, wantEnd);

    usearch_setPattern(uss, altPattern.getBuffer(), altPattern.length(), &status);

#if 0
    usearch_search(uss, 0, &actualStart, &actualEnd, &status);
#else
    // Restart the search at the beginning of the text for the new pattern.
    usearch_reset(uss);
    foundStart = usearch_next(uss, &status);
    foundEnd   = foundStart + usearch_getMatchedLength(uss);
#endif

    if (!(foundStart == wantStart && foundEnd == wantEnd)) {
        errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
              " strength=%s seed=%d",
              name, wantStart, wantEnd, foundStart, foundEnd, strength, seed);
    }

    if (wantStart == -1 && foundStart == -1) {
        misses += 1;
    }

    usearch_close(uss);

    return misses;
}
|
|
|
|
|
|
|
|
void SSearchTest::monkeyTest(char *params)
|
|
|
|
{
|
|
|
|
// ook!
|
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
U_STRING_DECL(test_pattern, "[[:assigned:]-[:ideographic:]-[:hangul:]-[:c:]]", 47);
|
|
|
|
U_STRING_INIT(test_pattern, "[[:assigned:]-[:ideographic:]-[:hangul:]-[:c:]]", 47);
|
|
|
|
UCollator *coll = ucol_open(NULL, &status);
|
|
|
|
USet *charsToTest = uset_openPattern(test_pattern, 47, &status);
|
|
|
|
USet *expansions = uset_openEmpty();
|
|
|
|
USet *contractions = uset_openEmpty();
|
|
|
|
StringToCEsMap *charsToCEList = new StringToCEsMap();
|
|
|
|
CEToStringsMap *ceToCharsStartingWith = new CEToStringsMap();
|
|
|
|
|
|
|
|
ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
|
|
|
|
|
|
|
|
uset_addAll(charsToTest, contractions);
|
|
|
|
uset_addAll(charsToTest, expansions);
|
|
|
|
|
|
|
|
// TODO: set strength to UCOL_PRIMARY, change CEList to use strength?
|
|
|
|
buildData(coll, charsToTest, charsToCEList, ceToCharsStartingWith);
|
|
|
|
|
|
|
|
U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
|
|
|
|
U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
|
|
|
|
USet *letters = uset_openPattern(letter_pattern, 39, &status);
|
|
|
|
SetMonkey letterMonkey(letters);
|
|
|
|
StringSetMonkey contractionMonkey(contractions, coll, charsToCEList, ceToCharsStartingWith);
|
|
|
|
StringSetMonkey expansionMonkey(expansions, coll, charsToCEList, ceToCharsStartingWith);
|
|
|
|
UnicodeString testCase;
|
|
|
|
UnicodeString alternate;
|
|
|
|
UnicodeString pattern, altPattern;
|
|
|
|
UnicodeString prefix, altPrefix;
|
|
|
|
UnicodeString suffix, altSuffix;
|
|
|
|
|
|
|
|
Monkey *monkeys[] = {
|
|
|
|
&letterMonkey,
|
|
|
|
&contractionMonkey,
|
|
|
|
&expansionMonkey,
|
|
|
|
&contractionMonkey,
|
|
|
|
&expansionMonkey,
|
|
|
|
&contractionMonkey,
|
|
|
|
&expansionMonkey,
|
|
|
|
&contractionMonkey,
|
|
|
|
&expansionMonkey};
|
|
|
|
int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
|
|
|
|
int32_t nonMatchCount = 0;
|
|
|
|
|
|
|
|
UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
|
|
|
|
const char *strengthNames[] = {"primary", "secondary", "tertiary"};
|
|
|
|
int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
|
|
|
|
int32_t loopCount = quick? 1000 : 10000;
|
|
|
|
int32_t firstStrength = 0;
|
|
|
|
int32_t lastStrength = strengthCount - 1;
|
|
|
|
|
|
|
|
if (params != NULL) {
|
|
|
|
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
|
|
|
UnicodeString p(params);
|
|
|
|
|
|
|
|
loopCount = getIntParam("loop", p, loopCount);
|
|
|
|
m_seed = getIntParam("seed", p, m_seed);
|
|
|
|
|
|
|
|
RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
|
|
|
|
if (m.find()) {
|
|
|
|
UnicodeString breakType = m.group(1, status);
|
|
|
|
|
|
|
|
for (int32_t s = 0; s < strengthCount; s += 1) {
|
|
|
|
if (breakType == strengthNames[s]) {
|
|
|
|
firstStrength = lastStrength = s;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
m.reset();
|
|
|
|
p = m.replaceFirst("", status);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (RegexMatcher("\\S", p, 0, status).find()) {
|
|
|
|
// Each option is stripped out of the option string as it is processed.
|
|
|
|
// All options have been checked. The option string should have been completely emptied..
|
|
|
|
char buf[100];
|
|
|
|
p.extract(buf, sizeof(buf), NULL, status);
|
|
|
|
buf[sizeof(buf)-1] = 0;
|
|
|
|
errln("Unrecognized or extra parameter: %s\n", buf);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
|
|
|
|
int32_t notFoundCount = 0;
|
|
|
|
|
|
|
|
ucol_setStrength(coll, strengths[s]);
|
|
|
|
|
|
|
|
// TODO: try alternate prefix and suffix too?
|
|
|
|
// TODO: alterntaes are only equal at primary strength. Is this OK?
|
|
|
|
for(int32_t t = 0; t < 10000; t += 1) {
|
|
|
|
uint32_t seed = m_seed;
|
|
|
|
int32_t nmc = 0;
|
|
|
|
|
|
|
|
generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
|
|
|
|
generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix);
|
|
|
|
generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix);
|
|
|
|
|
|
|
|
// pattern
|
|
|
|
notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed);
|
|
|
|
|
|
|
|
testCase.remove();
|
|
|
|
testCase.append(prefix);
|
|
|
|
testCase.append(/*alt*/pattern);
|
|
|
|
|
|
|
|
// prefix + pattern
|
|
|
|
notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed);
|
|
|
|
|
|
|
|
testCase.append(suffix);
|
|
|
|
|
|
|
|
// prefix + pattern + suffix
|
|
|
|
notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed);
|
|
|
|
|
|
|
|
testCase.remove();
|
|
|
|
testCase.append(pattern);
|
|
|
|
testCase.append(suffix);
|
|
|
|
|
|
|
|
// pattern + suffix
|
|
|
|
notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
|
|
|
|
}
|
|
|
|
|
|
|
|
logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
|
|
|
|
}
|
|
|
|
|
|
|
|
delete ceToCharsStartingWith;
|
|
|
|
delete charsToCEList;
|
|
|
|
|
|
|
|
uset_close(contractions);
|
|
|
|
uset_close(expansions);
|
|
|
|
uset_close(charsToTest);
|
2008-06-09 21:18:46 +00:00
|
|
|
uset_close(letters);
|
|
|
|
|
2008-05-23 04:22:28 +00:00
|
|
|
ucol_close(coll);
|
|
|
|
}
|
|
|
|
|
|
|
|
|