ICU-6659 Merge changes from branches/eric/boyer-moore

X-SVN-Rev: 25282
This commit is contained in:
Eric Mader 2009-01-22 00:24:48 +00:00
parent d9737d2f4a
commit 5f73103b5a
21 changed files with 4476 additions and 531 deletions

View File

@ -1,6 +1,6 @@
#******************************************************************************
#
# Copyright (C) 1998-2008, International Business Machines
# Copyright (C) 1998-2009, International Business Machines
# Corporation and others. All Rights Reserved.
#
#******************************************************************************
@ -81,7 +81,7 @@ ulocdata.o measfmt.o currfmt.o curramt.o currunit.o measure.o utmscale.o \
csdetect.o csmatch.o csr2022.o csrecog.o csrmbcs.o csrsbcs.o csrucode.o csrutf8.o inputext.o \
wintzimpl.o windtfmt.o winnmfmt.o basictz.o dtrule.o rbtz.o tzrule.o tztrans.o vtzone.o \
zonemeta.o zstrfmt.o plurrule.o plurfmt.o dtitvfmt.o dtitvinf.o \
tmunit.o tmutamt.o tmutfmt.o
tmunit.o tmutamt.o tmutfmt.o colldata.o bmsearch.o bms.o
## Header files to install
HEADERS = $(srcdir)/unicode/*.h

145
icu4c/source/i18n/bms.cpp Normal file
View File

@ -0,0 +1,145 @@
/*
* Copyright (C) 2008-2009, International Business Machines Corporation and Others.
* All rights reserved.
*/
#include "unicode/utypes.h"
#include "cmemory.h"
#include "unicode/bms.h"
#include "unicode/unistr.h"
#include "unicode/colldata.h"
#include "unicode/bmsearch.h"
// Define USE_SAFE_CASTS to make the cast helpers expand to the C++ named casts;
// otherwise they expand to C-style casts.
//#define USE_SAFE_CASTS
#ifdef USE_SAFE_CASTS
#define STATIC_CAST(type,value) static_cast<type>(value)
#define CONST_CAST(type,value) const_cast<type>(value)
#else
// The whole expansion is parenthesised so that postfix operators applied to
// the macro result (e.g. STATIC_CAST(T *, p)->member) bind to the cast value
// rather than to the un-cast operand.
#define STATIC_CAST(type,value) ((type) (value))
#define CONST_CAST(type,value) ((type) (value))
#endif
/**
 * C entry point: obtains collation data for a collator through
 * CollData::open() and returns it as an opaque UCD handle.
 *
 * @param coll   the collator handed to CollData::open()
 * @param status in/out error code, passed to CollData::open() by reference
 * @return whatever CollData::open() produced, cast to UCD *
 */
U_CAPI UCD * U_EXPORT2
ucd_open(UCollator *coll, UErrorCode *status)
{
    CollData *opened = CollData::open(coll, *status);

    return STATIC_CAST(UCD *, opened);
}
/**
 * C entry point: hands an opaque UCD handle back to CollData::close().
 *
 * @param ucd handle previously returned by ucd_open()
 */
U_CAPI void U_EXPORT2
ucd_close(UCD *ucd)
{
    CollData::close(STATIC_CAST(CollData *, ucd));
}
/**
 * C entry point: returns the collator reported by the CollData object
 * behind the opaque handle.
 *
 * @param ucd handle previously returned by ucd_open()
 * @return the result of CollData::getCollator()
 */
U_CAPI UCollator * U_EXPORT2
ucd_getCollator(UCD *ucd)
{
    CollData *collData = STATIC_CAST(CollData *, ucd);
    UCollator *collator = collData->getCollator();

    return collator;
}
/**
 * C entry point: forwards to CollData::freeCollDataCache().
 */
U_CAPI void U_EXPORT2
ucd_freeCache()
{
CollData::freeCollDataCache();
}
/**
 * C entry point: forwards to CollData::flushCollDataCache().
 */
U_CAPI void U_EXPORT2
ucd_flushCache()
{
CollData::flushCollDataCache();
}
/**
 * State behind the opaque BMS handle of the C API.
 * Both members are owned by the handle and released in bms_close().
 */
struct BMS
{
// The C++ search object that does the actual work.
BoyerMooreSearch *bms;
// Private copy of the caller's target text (NULL when no target is set);
// the search object is given a pointer to this copy.
const UnicodeString *targetString;
};
/**
 * Creates a Boyer-Moore search handle for a pattern and an optional target.
 *
 * @param ucd            collation data handle (a CollData behind the scenes)
 * @param pattern        pattern text
 * @param patternLength  length of pattern, as accepted by UnicodeString
 * @param target         target text, or NULL to set it later with
 *                       bms_setTargetString()
 * @param targetLength   length of target, as accepted by UnicodeString
 * @param status         in/out error code; if it is NULL or already indicates
 *                       a failure nothing is created
 * @return a new handle, or NULL if status was NULL / already failed or the
 *         handle itself could not be allocated. When a later allocation
 *         fails the handle is still returned (with status set) so that the
 *         caller can release it with bms_close().
 */
U_CAPI BMS * U_EXPORT2
bms_open(UCD *ucd,
         const UChar *pattern, int32_t patternLength,
         const UChar *target, int32_t targetLength,
         UErrorCode *status)
{
    // Do not build anything on top of an error the caller already has.
    if (status == NULL || U_FAILURE(*status)) {
        return NULL;
    }

    BMS *bms = STATIC_CAST(BMS *, uprv_malloc(sizeof(BMS)));

    if (bms == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    // uprv_malloc() does not initialise the memory: put the handle into a
    // defined state right away so bms_close() is safe on every return path.
    bms->bms = NULL;
    bms->targetString = NULL;

    CollData *data = STATIC_CAST(CollData *, ucd);
    UnicodeString patternString(pattern, patternLength);

    if (target != NULL) {
        bms->targetString = new UnicodeString(target, targetLength);

        if (bms->targetString == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return bms;
        }
    }

    bms->bms = new BoyerMooreSearch(data, patternString, bms->targetString, *status);

    if (bms->bms == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
    }

    return bms;
}
/**
 * Releases a handle created by bms_open(), including the search object and
 * the private copy of the target text.
 *
 * @param bms handle to release; NULL is accepted and ignored, because
 *            bms_open() returns NULL when the handle cannot be allocated
 */
U_CAPI void U_EXPORT2
bms_close(BMS *bms)
{
    if (bms == NULL) {
        return;
    }

    delete bms->bms;
    delete bms->targetString;
    uprv_free(bms);
}
/**
 * C entry point for BoyerMooreSearch::empty().
 *
 * @param bms search handle
 * @return the value reported by the wrapped search object
 */
U_CAPI UBool U_EXPORT2
bms_empty(BMS *bms)
{
    BoyerMooreSearch *searcher = bms->bms;

    return searcher->empty();
}
/**
 * C entry point for BoyerMooreSearch::getData().
 *
 * @param bms search handle
 * @return the collation data used by the search object, as an opaque UCD *
 */
U_CAPI UCD * U_EXPORT2
bms_getData(BMS *bms)
{
    CollData *collData = bms->bms->getData();

    return STATIC_CAST(UCD *, collData);
}
/**
 * C entry point for BoyerMooreSearch::search().
 *
 * @param bms    search handle
 * @param offset offset in the target at which to start searching
 * @param start  receives the start of the match (written through by reference)
 * @param end    receives the end of the match (written through by reference)
 * @return the result of the wrapped search() call
 */
U_CAPI UBool U_EXPORT2
bms_search(BMS *bms, int32_t offset, int32_t *start, int32_t *end)
{
    BoyerMooreSearch *searcher = bms->bms;

    return searcher->search(offset, *start, *end);
}
/**
 * Replaces the target text of a search handle.
 *
 * A private copy of the text is made. The copy is allocated before the
 * previous one is released, so an allocation failure leaves the handle
 * unchanged and is reported as U_MEMORY_ALLOCATION_ERROR.
 *
 * @param bms          search handle
 * @param target       new target text, or NULL for "no target"
 * @param targetLength length of target, as accepted by UnicodeString
 * @param status       in/out error code; nothing happens if it already
 *                     indicates a failure
 */
U_CAPI void U_EXPORT2
bms_setTargetString(BMS *bms, const UChar *target, int32_t targetLength, UErrorCode *status)
{
    if (U_FAILURE(*status)) {
        return;
    }

    const UnicodeString *replacement = NULL;

    if (target != NULL) {
        replacement = new UnicodeString(target, targetLength);

        if (replacement == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
    }

    // Point the search object at the new text first, so that it never holds
    // a pointer to a string that has already been deleted.
    bms->bms->setTargetString(replacement, *status);

    delete bms->targetString;
    bms->targetString = replacement;
}

View File

@ -0,0 +1,864 @@
/*
******************************************************************************
* Copyright (C) 1996-2009, International Business Machines *
* Corporation and others. All Rights Reserved. *
******************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_COLLATION
#include "unicode/unistr.h"
#include "unicode/putil.h"
#include "unicode/usearch.h"
#include "cmemory.h"
#include "unicode/coll.h"
#include "unicode/tblcoll.h"
#include "unicode/coleitr.h"
#include "unicode/ucoleitr.h"
#include "unicode/regex.h" // TODO: make conditional on regexp being built.
#include "unicode/uniset.h"
#include "unicode/uset.h"
#include "unicode/ustring.h"
#include "hash.h"
#include "uhash.h"
#include "ucol_imp.h"
#include "unormimp.h"
#include "unicode/colldata.h"
#include "unicode/bmsearch.h"
U_NAMESPACE_BEGIN
// Number of elements in a true array (not a pointer).
#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
// Allocates uninitialised storage for count objects of type with uprv_malloc();
// no constructors are run and the result may be NULL.
#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
// Releases storage obtained with NEW_ARRAY.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
/**
 * One buffered collation element together with the collation element
 * iterator offsets observed around the call that fetched it.
 */
struct CEI
{
// Collation element after strength masking (UCOL_NULLORDER at the end of the text).
uint32_t order;
// Smaller of the two iterator offsets recorded around the fetch.
int32_t lowOffset;
// Larger of the two iterator offsets recorded around the fetch.
int32_t highOffset;
};
/**
 * Wraps the text being searched: buffers collation elements fetched from it
 * and answers boundary questions about it.
 */
class Target : public UMemory
{
public:
// Sizes the CE buffer from patternLength and, if target is not NULL, attaches it.
Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status);
~Target();
// Attaches a new target string (not copied; may be NULL).
void setTargetString(const UnicodeString *target);
// Return buffer slot `offset`, fetching one more CE forward / backward when
// `offset` is the next unfilled slot; NULL when the slot cannot be provided.
const CEI *nextCE(int32_t offset);
const CEI *prevCE(int32_t offset);
int32_t stringLength();
UChar charAt(int32_t offset);
UBool isBreakBoundary(int32_t offset);
int32_t nextBreakBoundary(int32_t offset);
int32_t nextSafeBoundary(int32_t offset);
UBool isIdentical(UnicodeString &pattern, int32_t start, int32_t end);
// Empties the CE buffer and repositions the collation element iterator.
void setOffset(int32_t offset);
// Like setOffset(), but pre-loads slot 0 with a UCOL_NULLORDER entry at `last`.
void setLast(int32_t last);
int32_t getOffset();
private:
// CE buffer of bufferSize entries; slots [bufferMin, bufferMax) are valid.
CEI *ceb;
int32_t bufferSize;
int32_t bufferMin;
int32_t bufferMax;
// Mask applied to every fetched CE, derived from the collator strength.
uint32_t strengthMask;
UCollationStrength strength;
uint32_t variableTop;
// TRUE when the collator uses UCOL_SHIFTED alternate handling.
UBool toShift;
UCollator *coll;
// Target text and its raw buffer / length (not owned).
const UnicodeString *targetString;
const UChar *targetBuffer;
int32_t targetLength;
// Iterators over the target text, opened in setTargetString().
UCollationElements *elements;
UBreakIterator *charBreakIterator;
};
/**
 * Captures the collator settings that influence CE processing, allocates the
 * CE buffer and, when a target is supplied, attaches it.
 *
 * @param theCollator   collator used to produce collation elements
 * @param target        text to search, or NULL to attach one later
 * @param patternLength number of CEs in the pattern; used to size the buffer
 * @param status        receives U_MEMORY_ALLOCATION_ERROR if the buffer
 *                      cannot be allocated
 */
Target::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status)
    : bufferSize(0), bufferMin(0), bufferMax(0),
      strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator),
      targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL)
{
    strength    = ucol_getStrength(coll);
    toShift     = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
    variableTop = ucol_getVariableTop(coll, &status);

    // Scan the zero-terminated expansion size table for its largest entry.
    uint8_t widest = 0;
    const uint8_t *sizes = coll->expansionCESize;

    while (*sizes != 0) {
        if (widest < *sizes) {
            widest = *sizes;
        }

        sizes += 1;
    }

    // Pattern length, one maximal expansion on either side, and 4 spare slots.
    bufferSize = patternLength + (2 * widest) + 4;
    ceb = NEW_ARRAY(CEI, bufferSize);

    if (ceb == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    if (target != NULL) {
        setTargetString(target);
    }

    // Primary weights always take part; secondary weights for everything
    // except primary strength; tertiary weights for everything above secondary.
    strengthMask |= UCOL_PRIMARYORDERMASK;

    if (strength != UCOL_PRIMARY) {
        strengthMask |= UCOL_SECONDARYORDERMASK;

        if (strength != UCOL_SECONDARY) {
            strengthMask |= UCOL_TERTIARYORDERMASK;
        }
    }
}
/**
 * Closes the iterators opened for the target text and frees the CE buffer.
 */
Target::~Target()
{
    ucol_closeElements(elements);
    ubrk_close(charBreakIterator);

    DELETE_ARRAY(ceb);
}
/**
 * Attaches a new target string, replacing any previous one.
 *
 * The iterators opened for the previous target are closed and their pointers
 * cleared, so that a following NULL target (or the destructor) cannot close
 * them a second time. Each iterator is released on its own, so one is not
 * leaked when the other failed to open.
 *
 * @param target the text to search (not copied), or NULL for "no target"
 */
void Target::setTargetString(const UnicodeString *target)
{
    if (charBreakIterator != NULL) {
        ubrk_close(charBreakIterator);
        charBreakIterator = NULL;
    }

    if (elements != NULL) {
        ucol_closeElements(elements);
        elements = NULL;
    }

    targetString = target;

    if (targetString != NULL) {
        // Errors from opening the iterators are not propagated; this method
        // has no way to report them.
        UErrorCode status = U_ZERO_ERROR;

        targetBuffer = targetString->getBuffer();
        targetLength = targetString->length();

        elements = ucol_openElements(coll, targetBuffer, targetLength, &status);
        ucol_forceHanImplicit(elements, &status);

        charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocale(coll, ULOC_VALID_LOCALE, &status),
                                      targetBuffer, targetLength, &status);
    } else {
        targetBuffer = NULL;
        targetLength = 0;
    }
}
/**
 * Returns buffer slot `offset`, reading forward in the target if needed.
 *
 * A slot that is already buffered is returned as is. Otherwise `offset` must
 * be the next unfilled slot and the buffer must have room; the next CE that
 * is not ignorable after masking is fetched and stored there.
 *
 * @param offset index into the CE buffer (not a text offset)
 * @return the slot, or NULL if it cannot be provided
 */
const CEI *Target::nextCE(int32_t offset)
{
    if (bufferMin <= offset && offset < bufferMax) {
        return &ceb[offset];
    }

    if (offset != bufferMax || bufferMax >= bufferSize) {
        return NULL;
    }

    UErrorCode status = U_ZERO_ERROR;
    int32_t low = -1, high = -1;
    uint32_t order;
    UBool cont = FALSE;

    for (;;) {
        low   = ucol_getOffset(elements);
        order = ucol_next(elements, &status);
        high  = ucol_getOffset(elements);

        if (order == UCOL_NULLORDER) {
            // End of the text: stored as is, with the offsets just read.
            break;
        }

        cont = isContinuation(order);
        order &= strengthMask;

        if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
            // Shifted (variable) element: keep only the primary weight at
            // quaternary strength or above, otherwise treat it as ignorable.
            if (strength >= UCOL_QUATERNARY) {
                order &= UCOL_PRIMARYORDERMASK;
            } else {
                order = UCOL_IGNORABLE;
            }
        }

        if (order != UCOL_IGNORABLE) {
            break;
        }
    }

    if (cont) {
        order |= UCOL_CONTINUATION_MARKER;
    }

    CEI *slot = &ceb[offset];

    slot->order      = order;
    slot->lowOffset  = low;
    slot->highOffset = high;

    bufferMax += 1;

    return slot;
}
/**
 * Returns buffer slot `offset`, reading backward in the target if needed.
 *
 * Mirror image of nextCE(): the iterator is stepped with ucol_previous(), so
 * the offset read before the step is the high one and the offset read after
 * it is the low one.
 *
 * @param offset index into the CE buffer (not a text offset)
 * @return the slot, or NULL if it cannot be provided
 */
const CEI *Target::prevCE(int32_t offset)
{
    if (bufferMin <= offset && offset < bufferMax) {
        return &ceb[offset];
    }

    if (offset != bufferMax || bufferMax >= bufferSize) {
        return NULL;
    }

    UErrorCode status = U_ZERO_ERROR;
    int32_t low = -1, high = -1;
    uint32_t order;
    UBool cont = FALSE;

    for (;;) {
        high  = ucol_getOffset(elements);
        order = ucol_previous(elements, &status);
        low   = ucol_getOffset(elements);

        if (order == UCOL_NULLORDER) {
            // Start of the text reached.
            break;
        }

        cont = isContinuation(order);
        order &= strengthMask;

        if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
            // Shifted (variable) element: keep only the primary weight at
            // quaternary strength or above, otherwise treat it as ignorable.
            if (strength >= UCOL_QUATERNARY) {
                order &= UCOL_PRIMARYORDERMASK;
            } else {
                order = UCOL_IGNORABLE;
            }
        }

        if (order != UCOL_IGNORABLE) {
            break;
        }
    }

    bufferMax += 1;

    if (cont) {
        order |= UCOL_CONTINUATION_MARKER;
    }

    CEI *slot = &ceb[offset];

    slot->order      = order;
    slot->lowOffset  = low;
    slot->highOffset = high;

    return slot;
}
/**
 * @return the length of the attached target string, or 0 when none is attached
 */
int32_t Target::stringLength()
{
    return (targetString == NULL) ? 0 : targetLength;
}
/**
 * Reads one code unit of the target text. No bounds check is performed.
 *
 * @param offset index into the target buffer
 * @return the code unit at offset, or 0x0000 when no target is attached
 */
UChar Target::charAt(int32_t offset)
{
    if (targetString == NULL) {
        return 0x0000;
    }

    return targetBuffer[offset];
}
/**
 * Discards all buffered CEs and moves the collation element iterator.
 *
 * @param offset text offset handed to ucol_setOffset(); its error code is ignored
 */
void Target::setOffset(int32_t offset)
{
    bufferMin = bufferMax = 0;

    UErrorCode ignored = U_ZERO_ERROR;

    ucol_setOffset(elements, offset, &ignored);
}
/**
 * Resets the buffer to a single UCOL_NULLORDER entry located at `last` and
 * moves the collation element iterator there.
 *
 * @param last text offset; stored as both low and high offset of slot 0
 */
void Target::setLast(int32_t last)
{
    CEI &sentinel = ceb[0];

    sentinel.order      = UCOL_NULLORDER;
    sentinel.lowOffset  = last;
    sentinel.highOffset = last;

    bufferMin = 0;
    bufferMax = 1;

    UErrorCode ignored = U_ZERO_ERROR;

    ucol_setOffset(elements, last, &ignored);
}
/**
 * @return the current offset of the collation element iterator
 */
int32_t Target::getOffset()
{
    const int32_t current = ucol_getOffset(elements);

    return current;
}
/**
 * @param offset text offset to test
 * @return what ubrk_isBoundary() reports for the character break iterator
 */
UBool Target::isBreakBoundary(int32_t offset)
{
    const UBool atBoundary = ubrk_isBoundary(charBreakIterator, offset);

    return atBoundary;
}
/**
 * @param offset text offset to start from
 * @return what ubrk_following() reports for the character break iterator
 */
int32_t Target::nextBreakBoundary(int32_t offset)
{
    const int32_t boundary = ubrk_following(charBreakIterator, offset);

    return boundary;
}
/**
 * Finds the first position at or after offset whose code unit is a lead
 * surrogate or is not reported as unsafe by ucol_unsafeCP().
 *
 * @param offset text offset to start from
 * @return that position, or the target length if there is none
 */
int32_t Target::nextSafeBoundary(int32_t offset)
{
    for (int32_t index = offset; index < targetLength; index += 1) {
        const UChar ch = targetBuffer[index];

        if (U_IS_LEAD(ch) || ! ucol_unsafeCP(ch, coll)) {
            return index;
        }
    }

    return targetLength;
}
/**
 * Identical-strength check for a candidate match.
 *
 * Below UCOL_IDENTICAL strength every candidate is accepted. Otherwise the
 * target range [start, end) and the whole pattern are both decomposed with
 * unorm_decompose() and the results compared code unit by code unit.
 *
 * @param pattern the pattern text
 * @param start   start of the candidate match in the target
 * @param end     limit of the candidate match in the target
 * @return TRUE if the candidate is acceptable, FALSE if the decompositions
 *         differ or decomposition / allocation failed
 */
UBool Target::isIdentical(UnicodeString &pattern, int32_t start, int32_t end)
{
if (strength < UCOL_IDENTICAL) {
return TRUE;
}
// Stack buffers for the common case of short decompositions.
UChar t2[32], p2[32];
const UChar *pBuffer = pattern.getBuffer();
int32_t pLength = pattern.length();
int32_t length = end - start;
UErrorCode status = U_ZERO_ERROR, status2 = U_ZERO_ERROR;
int32_t decomplength = unorm_decompose(t2, ARRAY_SIZE(t2),
targetBuffer + start, length,
FALSE, 0, &status);
// use separate status2 in case of buffer overflow
if (decomplength != unorm_decompose(p2, ARRAY_SIZE(p2),
pBuffer, pLength,
FALSE, 0, &status2)) {
return FALSE; // lengths are different
}
// compare contents
UChar *text, *pat;
if(U_SUCCESS(status)) {
text = t2;
pat = p2;
} else if(status == U_BUFFER_OVERFLOW_ERROR) {
// The stack buffers were too small: redo both decompositions on the heap.
status = U_ZERO_ERROR;
// allocate one buffer for both decompositions
text = NEW_ARRAY(UChar, decomplength * 2);
// Check for allocation failure.
if (text == NULL) {
return FALSE;
}
pat = text + decomplength;
unorm_decompose(text, decomplength, targetBuffer + start,
length, FALSE, 0, &status);
unorm_decompose(pat, decomplength, pBuffer,
pLength, FALSE, 0, &status);
} else {
// NFD failed, make sure that u_memcmp() does not overrun t2 & p2
// and that we don't uprv_free() an undefined text pointer
text = pat = t2;
decomplength = 0;
}
UBool result = (UBool)(u_memcmp(pat, text, decomplength) == 0);
// Only the heap buffer needs to be released.
if(text != t2) {
DELETE_ARRAY(text);
}
// return FALSE if NFD failed
return U_SUCCESS(status) && result;
}
// Number of buckets in the bad-character skip table; CEs are mapped to a
// bucket by BadCharacterTable::hash().
#define HASH_TABLE_SIZE 257
/**
 * Boyer-Moore "bad character" skip table for a pattern expressed as
 * collation elements, plus a per-position cache of minimum lengths in chars.
 */
class BadCharacterTable : public UMemory
{
public:
BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status);
~BadCharacterTable();
// Skip distance stored in the bucket that ce hashes to.
int32_t operator[](uint32_t ce) const;
// Value computed for the whole pattern (position 0); also the default skip.
int32_t getMaxSkip() const;
// Cached value for the pattern suffix starting at CE index `index`.
int32_t minLengthInChars(int32_t index);
private:
static int32_t hash(uint32_t ce);
int32_t maxSkip;
int32_t badCharacterTable[HASH_TABLE_SIZE];
// plen + 1 entries, allocated in the constructor; NULL if it returned early.
int32_t *minLengthCache;
};
/**
 * Builds the bad-character skip table for a pattern.
 *
 * maxSkip and every table bucket are initialised to 0 before anything can
 * fail, so that getMaxSkip() and operator[] return defined values even when
 * the constructor returns early (failed status, empty pattern, or allocation
 * failure).
 *
 * @param patternCEs collation elements of the pattern
 * @param data       collation data used to compute minimum lengths in chars
 * @param status     in/out error code; set to U_MEMORY_ALLOCATION_ERROR when
 *                   a working buffer cannot be allocated
 */
BadCharacterTable::BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status)
    : maxSkip(0), minLengthCache(NULL)
{
    for (int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {
        badCharacterTable[j] = 0;
    }

    int32_t plen = patternCEs.size();

    // **** need a better way to deal with this ****
    if (U_FAILURE(status) || plen == 0) {
        return;
    }

    // Scratch array handed to CollData::minLengthInChars(); -1 marks "not yet computed".
    int32_t *history = NEW_ARRAY(int32_t, plen);

    if (history == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    for (int32_t i = 0; i < plen; i += 1) {
        history[i] = -1;
    }

    minLengthCache = NEW_ARRAY(int32_t, plen + 1);

    if (minLengthCache == NULL) {
        DELETE_ARRAY(history);
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    maxSkip = minLengthCache[0] = data->minLengthInChars(&patternCEs, 0, history);

    // A CE that does not occur in the pattern allows the maximum skip.
    for (int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {
        badCharacterTable[j] = maxSkip;
    }

    for (int32_t p = 1; p < plen; p += 1) {
        minLengthCache[p] = data->minLengthInChars(&patternCEs, p, history);

        // Make sure this entry is not bigger than the previous one.
        // Otherwise, we might skip too far in some cases.
        if (minLengthCache[p] < 0 || minLengthCache[p] > minLengthCache[p - 1]) {
            minLengthCache[p] = minLengthCache[p - 1];
        }
    }

    minLengthCache[plen] = 0;

    // Later occurrences of a CE overwrite earlier ones; the last pattern CE
    // is deliberately left out.
    for (int32_t p = 0; p < plen - 1; p += 1) {
        badCharacterTable[hash(patternCEs[p])] = minLengthCache[p + 1];
    }

    DELETE_ARRAY(history);
}
/**
 * Frees the per-position minimum length cache.
 */
BadCharacterTable::~BadCharacterTable()
{
    uprv_free((void *) (minLengthCache));
}
/**
 * @param ce a collation element
 * @return the skip distance stored in the bucket that ce hashes to
 */
int32_t BadCharacterTable::operator[](uint32_t ce) const
{
    const int32_t bucket = hash(ce);

    return badCharacterTable[bucket];
}
/**
 * @return the skip distance used for CEs that do not occur in the pattern
 */
int32_t BadCharacterTable::getMaxSkip() const
{
    return this->maxSkip;
}
/**
 * Looks up the cached minimum length. No bounds or NULL check is performed.
 *
 * @param index CE index into the pattern (0 .. pattern CE count)
 * @return the cached value for that index
 */
int32_t BadCharacterTable::minLengthInChars(int32_t index)
{
    const int32_t *cache = minLengthCache;

    return cache[index];
}
/**
 * Maps a collation element to a table bucket using only its primary weight.
 *
 * @param ce a collation element
 * @return UCOL_PRIMARYORDER(ce) modulo HASH_TABLE_SIZE
 */
int32_t BadCharacterTable::hash(uint32_t ce)
{
    return (UCOL_PRIMARYORDER(ce)) % (HASH_TABLE_SIZE);
}
/**
 * Boyer-Moore "good suffix" shift table for a pattern expressed as
 * collation elements.
 */
class GoodSuffixTable : public UMemory
{
public:
GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status);
~GoodSuffixTable();
// Shift stored for pattern CE index `offset`; no bounds check.
int32_t operator[](int32_t offset) const;
private:
// One entry per pattern CE; NULL if the constructor returned early.
int32_t *goodSuffixTable;
};
/**
 * Builds the good-suffix shift table for a pattern.
 *
 * First computes suff[i], the length of the longest run of CEs ending at i
 * that matches a suffix of the pattern, then converts those lengths into
 * shifts, expressed through badCharacterTable.minLengthInChars().
 * Leaves the table NULL when status already failed, the pattern is empty,
 * or an allocation fails (status is then U_MEMORY_ALLOCATION_ERROR).
 *
 * @param patternCEs        collation elements of the pattern
 * @param badCharacterTable source of maxSkip and of the minimum lengths
 * @param status            in/out error code
 */
GoodSuffixTable::GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status)
: goodSuffixTable(NULL)
{
int32_t patlen = patternCEs.size();
// **** need a better way to deal with this ****
if (U_FAILURE(status) || patlen <= 0) {
return;
}
int32_t *suff = NEW_ARRAY(int32_t, patlen);
int32_t start = patlen - 1, end = - 1;
int32_t maxSkip = badCharacterTable.getMaxSkip();
if (suff == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
// initialize suff
suff[patlen - 1] = patlen;
for (int32_t i = patlen - 2; i >= 0; i -= 1) {
// (i > start) means we're inside the last suffix match we found
// ((patlen - 1) - end) is how far the end of that match is from end of pattern
// (i - start) is how far we are from start of that match
// (i + (patlen - 1) - end) is index of same character at end of pattern
// so if any suffix match at that character doesn't extend beyond the last match,
// it's the suffix for this character as well
if (i > start && suff[i + patlen - 1 - end] < i - start) {
suff[i] = suff[i + patlen - 1 - end];
} else {
start = end = i;
int32_t s = patlen;
// Extend the match leftwards from i while it keeps agreeing with the pattern's tail.
while (start >= 0 && patternCEs[start] == patternCEs[--s]) {
start -= 1;
}
suff[i] = end - start;
}
}
// now build goodSuffixTable
goodSuffixTable = NEW_ARRAY(int32_t, patlen);
if (goodSuffixTable == NULL) {
DELETE_ARRAY(suff);
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
// initialize entries to minLengthInChars of the pattern
for (int32_t i = 0; i < patlen; i += 1) {
goodSuffixTable[i] = maxSkip;
}
int32_t prefix = 0;
for (int32_t i = patlen - /*1*/ 2; i >= 0; i -= 1) {
if (suff[i] == i + 1) {
// this matching suffix is a prefix of the pattern
int32_t prefixSkip = badCharacterTable.minLengthInChars(i + 1);
// for any mis-match before this suffix, we should skip
// so that the front of the pattern (i.e. the prefix)
// lines up with the front of the suffix.
// (patlen - 1 - i) is the start of the suffix
while (prefix < patlen - 1 - i) {
// value of maxSkip means never set...
if (goodSuffixTable[prefix] == maxSkip) {
goodSuffixTable[prefix] = prefixSkip;
}
prefix += 1;
}
}
}
// Entries for interior suffix matches overwrite the prefix-based defaults above.
for (int32_t i = 0; i < patlen - 1; i += 1) {
goodSuffixTable[patlen - 1 - suff[i]] = badCharacterTable.minLengthInChars(i + 1);
}
DELETE_ARRAY(suff);
}
/**
 * Frees the shift table.
 */
GoodSuffixTable::~GoodSuffixTable()
{
    uprv_free((void *) (goodSuffixTable));
}
/**
 * Looks up a shift. No bounds or NULL check is performed.
 *
 * @param offset CE index into the pattern
 * @return the shift stored for that index
 */
int32_t GoodSuffixTable::operator[](int32_t offset) const
{
    const int32_t *table = goodSuffixTable;

    return table[offset];
}
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BoyerMooreSearch)
/**
 * Tells whether the pattern has no collation elements.
 *
 * The constructor leaves patCEs NULL when it is entered with a failed status
 * or the CE list cannot be allocated; such an object is reported as empty
 * instead of dereferencing the NULL pointer.
 *
 * @return TRUE if there is no pattern CE list or it has no elements
 */
UBool BoyerMooreSearch::empty()
{
    return patCEs == NULL || patCEs->size() <= 0;
}
/**
 * @return the collation data this search object was constructed with
 */
CollData *BoyerMooreSearch::getData()
{
    return this->data;
}
/**
 * @return the pattern's CE list (still owned by this object); may be NULL
 *         if construction failed
 */
CEList *BoyerMooreSearch::getPatternCEs()
{
    return this->patCEs;
}
/**
 * @return the bad-character table (still owned by this object); may be NULL
 *         if construction failed
 */
BadCharacterTable *BoyerMooreSearch::getBadCharacterTable()
{
    return this->badCharacterTable;
}
/**
 * @return the good-suffix table (still owned by this object); may be NULL
 *         if construction failed
 */
GoodSuffixTable *BoyerMooreSearch::getGoodSuffixTable()
{
    return this->goodSuffixTable;
}
/**
 * Prepares a search for patternString: builds the pattern's CE list and both
 * Boyer-Moore tables and, when a target is supplied, its Target wrapper.
 *
 * Every allocation failure is reported as U_MEMORY_ALLOCATION_ERROR, so a
 * caller can tell a half-built object from a usable one by checking status.
 *
 * @param theData       collation data; also supplies the collator
 * @param patternString the pattern to search for
 * @param targetString  the text to search (not copied), or NULL to set it
 *                      later with setTargetString()
 * @param status        in/out error code; nothing is built if it already
 *                      indicates a failure
 */
BoyerMooreSearch::BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString,
                                   UErrorCode &status)
    : data(theData), patCEs(NULL), badCharacterTable(NULL), goodSuffixTable(NULL), pattern(patternString), target(NULL)
{
    if (U_FAILURE(status)) {
        return;
    }

    UCollator *collator = data->getCollator();

    patCEs = new CEList(collator, patternString, status);

    if (patCEs == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    if (U_FAILURE(status)) {
        return;
    }

    badCharacterTable = new BadCharacterTable(*patCEs, data, status);

    if (badCharacterTable == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    if (U_FAILURE(status)) {
        return;
    }

    goodSuffixTable = new GoodSuffixTable(*patCEs, *badCharacterTable, status);

    if (goodSuffixTable == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    if (targetString != NULL) {
        target = new Target(collator, targetString, patCEs->size(), status);

        if (target == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }
}
/**
 * Releases everything the constructor or setTargetString() allocated,
 * in reverse order of creation. The collation data is not owned.
 */
BoyerMooreSearch::~BoyerMooreSearch()
{
    delete target;            // Target wrapper
    delete goodSuffixTable;   // built from patCEs and badCharacterTable
    delete badCharacterTable; // built from patCEs
    delete patCEs;            // pattern collation elements
}
void BoyerMooreSearch::setTargetString(const UnicodeString *targetString, UErrorCode &status)
{
if (U_FAILURE(status)) {
return;
}
if (target == NULL) {
target = new Target(data->getCollator(), targetString, patCEs->size(), status);
} else {
target->setTargetString(targetString);
}
}
// **** main flow of this code from Laura Werner's "Unicode Text Searching in Java" paper. ****
/*
* TODO:
* * deal with trailing (and leading?) ignorables.
* * Adding BoyerMooreSearch object slowed it down. How can we speed it up?
*/
/**
 * Searches the target for the pattern, starting at a text offset.
 *
 * The pattern is compared against the target's collation elements from the
 * end of the pattern backwards; on a mismatch the candidate position is
 * advanced by the larger of the bad-character and good-suffix shifts.
 * A CE-level match is only reported if it starts and ends on character
 * break boundaries and passes the identical-strength check.
 *
 * If the object is not fully constructed or no target has been set, the
 * search fails instead of dereferencing a NULL member.
 *
 * @param offset text offset at which to start searching
 * @param start  receives the start offset of the match, or -1
 * @param end    receives the limit offset of the match, or -1
 * @return TRUE if a match was found
 */
UBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end)
{
    if (patCEs == NULL || badCharacterTable == NULL || goodSuffixTable == NULL || target == NULL) {
        start = end = -1;
        return FALSE;
    }

    int32_t plen = patCEs->size();

    if (plen <= 0) {
        // Searching for a zero length pattern always fails.
        start = end = -1;
        return FALSE;
    }

    int32_t tlen = target->stringLength();
    int32_t maxSkip = badCharacterTable->getMaxSkip();
    // tOffset is the text offset where the candidate match is expected to end.
    int32_t tOffset = offset + maxSkip;

    while (tOffset <= tlen) {
        int32_t pIndex = plen - 1;
        int32_t tIndex = 0;
        int32_t lIndex = 0;

        if (tOffset < tlen) {
            // **** we really want to skip ahead enough to  ****
            // **** be sure we get at least 1 non-ignorable ****
            // **** CE after the end of the pattern.        ****
            int32_t next = target->nextSafeBoundary(tOffset + 1);

            target->setOffset(next);

            // Walk backwards from the safe boundary to the CE that covers tOffset.
            // NOTE(review): prevCE() returns NULL once the CE buffer is full;
            // that case is not handled here.
            for (lIndex = 0; ; lIndex += 1) {
                const CEI *cei = target->prevCE(lIndex);
                int32_t low = cei->lowOffset;
                int32_t high = cei->highOffset;

                if (high == 0 || (low < high && low <= tOffset)) {
                    if (low < tOffset) {
                        while (lIndex >= 0 && target->prevCE(lIndex)->highOffset == high) {
                            lIndex -= 1;
                        }

                        if (high > tOffset) {
                            tOffset = high;
                        }
                    }

                    break;
                }
            }
        } else {
            target->setLast(tOffset);
            lIndex = 0;
        }

        tIndex = ++lIndex;

        // Iterate backward until we hit the beginning of the pattern
        while (pIndex >= 0) {
            uint32_t pce = (*patCEs)[pIndex];
            const CEI *tcei = target->prevCE(tIndex++);

            if (tcei->order != pce) {
                // There is a mismatch at this position.  Decide how far
                // over to shift the pattern, then try again.
                int32_t gsOffset = tOffset + (*goodSuffixTable)[pIndex];
#ifdef EXTRA_CAUTIOUS
                int32_t old = tOffset;
#endif

                tOffset += (*badCharacterTable)[tcei->order] - badCharacterTable->minLengthInChars(pIndex + 1);

                if (gsOffset > tOffset) {
                    tOffset = gsOffset;
                }

#ifdef EXTRA_CAUTIOUS
                // Make sure we don't skip backwards...
                if (tOffset <= old) {
                    tOffset = old + 1;
                }
#endif

                break;
            }

            pIndex -= 1;
        }

        if (pIndex < 0) {
            // We made it back to the beginning of the pattern,
            // which means we matched it all.  Return the location.
            const CEI firstCEI = *target->prevCE(tIndex - 1);
            const CEI lastCEI = *target->prevCE(lIndex);
            int32_t mStart = firstCEI.lowOffset;
            int32_t minLimit = lastCEI.lowOffset;
            int32_t maxLimit = lastCEI.highOffset;
            int32_t mLimit;
            UBool found = TRUE;

            // Look at the CE that follows the match to decide where it may end.
            target->setOffset(/*tOffset*/maxLimit);

            const CEI nextCEI = *target->nextCE(0);

            if (nextCEI.lowOffset > maxLimit) {
                maxLimit = nextCEI.lowOffset;
            }

            if (nextCEI.lowOffset == nextCEI.highOffset && nextCEI.order != UCOL_NULLORDER) {
                found = FALSE;
            }

            if (! target->isBreakBoundary(mStart)) {
                found = FALSE;
            }

            if (firstCEI.lowOffset == firstCEI.highOffset) {
                found = FALSE;
            }

            mLimit = maxLimit;

            if (minLimit < maxLimit) {
                int32_t nbb = target->nextBreakBoundary(minLimit);

                if (nbb >= lastCEI.highOffset) {
                    mLimit = nbb;
                }
            }

            if (mLimit > maxLimit) {
                found = FALSE;
            }

            if (! target->isBreakBoundary(mLimit)) {
                found = FALSE;
            }

            if (! target->isIdentical(pattern, mStart, mLimit)) {
                found = FALSE;
            }

            if (found) {
                start = mStart;
                end = mLimit;

                return TRUE;
            }

            tOffset += (*goodSuffixTable)[0]; // really? Maybe += 1 or += maxSkip?
        }

        // Otherwise, we're here because of a mismatch, so keep going....
    }

    // no match
    start = -1;
    end = -1;

    return FALSE;
}
U_NAMESPACE_END
#endif // #if !UCONFIG_NO_COLLATION

File diff suppressed because it is too large Load Diff

View File

@ -408,6 +408,40 @@
<Filter
Name="collation"
>
<File
RelativePath=".\bms.cpp"
>
</File>
<File
RelativePath=".\unicode\bms.h"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode"
Outputs="..\..\include\unicode\$(InputFileName)"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\bmsearch.cpp"
>
</File>
<File
RelativePath=".\unicode\bmsearch.h"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode"
Outputs="..\..\include\unicode\$(InputFileName)"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\bocsu.c"
>
@ -504,6 +538,23 @@
/>
</FileConfiguration>
</File>
<File
RelativePath=".\colldata.cpp"
>
</File>
<File
RelativePath=".\unicode\colldata.h"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode"
Outputs="..\..\include\unicode\$(InputFileName)"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\search.cpp"
>

View File

@ -1,7 +1,7 @@
/*
******************************************************************************
* *
* Copyright (C) 2001-2008, International Business Machines *
* Copyright (C) 2001-2009, International Business Machines *
* Corporation and others. All Rights Reserved. *
* *
******************************************************************************
@ -45,6 +45,7 @@ typedef enum ECleanupI18NType {
UCLN_I18N_UCOL_RES,
UCLN_I18N_UCOL_BLD,
UCLN_I18N_CSDET,
UCLN_I18N_COLL_DATA,
UCLN_I18N_COUNT /* This must be last */
} ECleanupI18NType;

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 1996-2008, International Business Machines
* Copyright (C) 1996-2009, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: ucol.cpp
@ -123,7 +123,6 @@ uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
IInit_collIterate(collator, sourceString, sourceLen, s);
}
/**
* Backup the state of the collIterate struct data
* @param data collIterate to backup
@ -1499,10 +1498,30 @@ inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou
}
else
{
order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
// Always use UCA for Han, Hangul
// (Han extension A is before main Han block)
// **** Han compatibility chars ?? ****
if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
(ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
// between the two target ranges; do normal lookup
// **** this range is YI, Modifier tone letters, ****
// **** Latin-D, Syloti Nagari, Phagas-pa. ****
// **** Latin-D might be tailored, so we need to ****
// **** do the normal lookup for these guys. ****
order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
} else {
// in one of the target ranges; use UCA
order = UCOL_NOT_FOUND;
}
} else {
order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
}
if(order > UCOL_NOT_FOUND) { /* if a CE is special */
order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */
}
if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */
/* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
@ -1939,7 +1958,23 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
result = coll->latinOneMapping[ch];
}
else {
result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
// Always use UCA for [3400..9FFF], [AC00..D7AF]
// **** [FA0E..FA2F] ?? ****
if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
(ch >= 0x3400 && ch <= 0xD7AF)) {
if (ch > 0x9FFF && ch < 0xAC00) {
// between the two target ranges; do normal lookup
// **** this range is YI, Modifier tone letters, ****
// **** Latin-D, Syloti Nagari, Phagas-pa. ****
// **** Latin-D might be tailored, so we need to ****
// **** do the normal lookup for these guys. ****
result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
} else {
result = UCOL_NOT_FOUND;
}
} else {
result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
}
}
if (result > UCOL_NOT_FOUND) {
result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
@ -3545,38 +3580,12 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
int32_t offsetBias;
#if 0
if (source->offsetReturn != NULL) {
source->offsetStore = source->offsetReturn - noChars;
}
// **** doesn't work if using iterator ****
if (source->flags & UCOL_ITER_INNORMBUF) {
if (source->fcdPosition == NULL) {
offsetBias = 0;
} else {
offsetBias = (int32_t)(source->fcdPosition - source->string);
}
} else {
offsetBias = (int32_t)(source->pos - source->string);
}
#else
// **** doesn't work if using iterator ****
if (source->flags & UCOL_ITER_INNORMBUF) {
#if 1
offsetBias = -1;
#else
if (source->fcdPosition == NULL) {
offsetBias = 0;
} else {
offsetBias = (int32_t)(source->fcdPosition - source->string);
}
#endif
} else {
offsetBias = (int32_t)(source->pos - source->string);
}
#endif
/* a new collIterate is used to simplify things, since using the current
collIterate will mean that the forward and backwards iteration will
@ -3584,9 +3593,9 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
collIterate temp;
int32_t rawOffset;
//IInit_collIterate(coll, UCharOffset, -1, &temp);
IInit_collIterate(coll, UCharOffset, noChars, &temp);
temp.flags &= ~UCOL_ITER_NORM;
temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
rawOffset = temp.pos - temp.string; // should always be zero?
CE = ucol_IGetNextCE(coll, &temp, status);
@ -3679,7 +3688,12 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
}
}
rawOffset = temp.pos - temp.string;
if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
rawOffset = temp.fcdPosition - temp.string;
} else {
rawOffset = temp.pos - temp.string;
}
CE = ucol_IGetNextCE(coll, &temp, status);
}
@ -4136,29 +4150,6 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
}
case IMPLICIT_TAG: /* everything that is not defined otherwise */
#if 0
if (source->offsetBuffer == NULL) {
source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
source->offsetStore = source->offsetBuffer;
}
// **** doesn't work if using iterator ****
if (source->flags & UCOL_ITER_INNORMBUF) {
source->offsetRepeatCount = 1;
} else {
int32_t firstOffset = (int32_t)(source->pos - source->string);
*(source->offsetStore++) = firstOffset;
*(source->offsetStore++) = firstOffset + 1;
source->offsetReturn = source->offsetStore - 1;
if (source->offsetReturn == source->offsetBuffer) {
source->offsetStore = source->offsetBuffer;
}
}
#endif
return getPrevImplicit(ch, source);
// TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1998-2008, International Business Machines
* Copyright (C) 1998-2009, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -260,6 +260,8 @@ minimum number for special Jamo
/* by index */
#define UCOL_USE_ITERATOR 64
#define UCOL_FORCE_HAN_IMPLICIT 128
#define NFC_ZERO_CC_BLOCK_LIMIT_ 0x300
typedef struct collIterate {
@ -390,6 +392,29 @@ uprv_init_pce(const struct UCollationElements *elems);
(((uint32_t)(ch) - 0x1161) <= (0x1175 - 0x1161)) || \
(((uint32_t)(ch) - 0x11A8) <= (0x11C2 - 0x11A8)))
/* Han character ranges */
#define UCOL_FIRST_HAN 0x4E00
#define UCOL_LAST_HAN 0x9FFF
#define UCOL_FIRST_HAN_A 0x3400
#define UCOL_LAST_HAN_A 0x4DBF
/* Han compatibility range [FA0E..FA2F]; the first value must not exceed the last. */
#define UCOL_FIRST_HAN_COMPAT 0xFA0E
#define UCOL_LAST_HAN_COMPAT 0xFA2F
/* Han extension B is in plane 2 */
#define UCOL_FIRST_HAN_B_LEAD 0xD840
#define UCOL_FIRST_HAN_B_TRAIL 0xDC00
#define UCOL_LAST_HAN_B_LEAD 0xD869
#define UCOL_LAST_HAN_B_TRAIL 0xDEDF
/* Hangul range */
#define UCOL_FIRST_HANGUL 0xAC00
#define UCOL_LAST_HANGUL 0xD7AF
/* Jamo ranges */
#define UCOL_FIRST_L_JAMO 0x1100
#define UCOL_FIRST_V_JAMO 0x1161
#define UCOL_FIRST_T_JAMO 0x11A8
#define UCOL_LAST_T_JAMO 0x11F9
#if 0

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2004-2008, International Business Machines
* Copyright (C) 2004-2009, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: ucol_sit.cpp
@ -578,15 +578,15 @@ ucol_getShortDefinitionString(const UCollator *coll,
if(elementSize) {
// we should probably canonicalize here...
elementSize = uloc_getLanguage(locBuff, tempbuff, internalBufferSize, status);
appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, languageArg);
appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, languageArg);
elementSize = uloc_getCountry(locBuff, tempbuff, internalBufferSize, status);
appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, regionArg);
appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, regionArg);
elementSize = uloc_getScript(locBuff, tempbuff, internalBufferSize, status);
appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, scriptArg);
appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, scriptArg);
elementSize = uloc_getVariant(locBuff, tempbuff, internalBufferSize, status);
appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, variantArg);
appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, variantArg);
elementSize = uloc_getKeywordValue(locBuff, "collation", tempbuff, internalBufferSize, status);
appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, keywordArg);
appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, keywordArg);
}
int32_t i = 0;
@ -597,7 +597,7 @@ ucol_getShortDefinitionString(const UCollator *coll,
if(attribute != UCOL_DEFAULT) {
char letter = ucol_sit_attributeValueToLetter(attribute, status);
appendShortStringElement(&letter, 1,
buffer, &resultSize, capacity, options[i].optionStart);
buffer, &resultSize, /*capacity*/internalBufferSize, options[i].optionStart);
}
}
}

View File

@ -1,6 +1,6 @@
/*
******************************************************************************
* Copyright (C) 2001-2008, International Business Machines
* Copyright (C) 2001-2009, International Business Machines
* Corporation and others. All Rights Reserved.
******************************************************************************
*
@ -263,7 +263,14 @@ inline uint64_t processCE(UCollationElements *elems, uint32_t ce)
primary = ucol_primaryOrder(ce);
}
// Continuation?
// **** This should probably handle continuations too. ****
// **** That means that we need 24 bits for the primary ****
// **** instead of the 16 that we're currently using. ****
// **** So we can lay out the 64 bits as: 24.12.12.16. ****
// **** Another complication with continuations is that ****
// **** the *second* CE is marked as a continuation, so ****
// **** we always have to peek ahead to know how long ****
// **** the primary is... ****
if (elems->pce->toShift && (elems->pce->variableTop > ce && primary != 0)
|| (elems->pce->isShifted && primary == 0)) {
@ -285,7 +292,6 @@ inline uint64_t processCE(UCollationElements *elems, uint32_t ce)
elems->pce->isShifted = FALSE;
}
return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
}
@ -332,6 +338,7 @@ ucol_openElements(const UCollator *coll,
return result;
}
U_CAPI void U_EXPORT2
ucol_closeElements(UCollationElements *elems)
{
@ -375,7 +382,7 @@ ucol_reset(UCollationElements *elems)
ci->endp = ci->string + u_strlen(ci->string);
}
ci->CEpos = ci->toReturn = ci->CEs;
ci->flags = UCOL_ITER_HASLEN;
ci->flags = (ci->flags & UCOL_FORCE_HAN_IMPLICIT) | UCOL_ITER_HASLEN;
if (ci->coll->normalizationMode == UCOL_ON) {
ci->flags |= UCOL_ITER_NORM;
}
@ -391,6 +398,21 @@ ucol_reset(UCollationElements *elems)
ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
}
/*
 * Set the UCOL_FORCE_HAN_IMPLICIT bit in the flags of the iterator's
 * internal data.
 *
 * Nothing is done if *status already holds a failure code. If elems is
 * NULL, *status is set to U_ILLEGAL_ARGUMENT_ERROR.
 */
U_CAPI void U_EXPORT2
ucol_forceHanImplicit(UCollationElements *elems, UErrorCode *status)
{
    /* An earlier failure wins: leave both the iterator and status untouched. */
    if (U_FAILURE(*status)) {
        return;
    }

    if (elems != NULL) {
        elems->iteratordata_.flags |= UCOL_FORCE_HAN_IMPLICIT;
    } else {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
    }
}
U_CAPI int32_t U_EXPORT2
ucol_next(UCollationElements *elems,
UErrorCode *status)

View File

@ -0,0 +1,265 @@
/*
* Copyright (C) 1996-2009, International Business Machines Corporation and Others.
* All rights reserved.
*/
/**
* \file
* \brief C API: Boyer-Moore StringSearch prototype.
* \internal
*/
#ifndef _BMS_H
#define _BMS_H
#include "unicode/utypes.h"
#include "unicode/ucol.h"
/**
* A <code>UCD</code> object holds the Collator-specific data needed to
* compute the length of the shortest string that can
* generate a particular list of CEs.
*
* <code>UCD</code> objects are quite expensive to compute. Because
* of this, they are cached. When you call <code>ucd_open</code> it
* returns a reference counted cached object. When you call <code>ucd_close</code>
* the reference count on the object is decremented but the object is not deleted.
*
* If you do not need to reuse any unreferenced objects in the cache, you can call
* <code>ucd_flushCache</code>. If you no longer need any <code>UCD</code>
* objects, you can call <code>ucd_freeCache</code>.
*/
typedef void UCD;
/**
* Open a <code>UCD</code> object.
*
* @param coll - the collator
* @param status - will be set if any errors occur.
*
* @return the <code>UCD</code> object. You must call
* <code>ucd_close</code> when you are done using the object.
*
* Note: if on return status is set to an error, the only safe
* thing to do with the returned object is to call <code>ucd_close</code>.
*
* @internal ICU 4.0.1 technology preview
*/
U_CAPI UCD * U_EXPORT2
ucd_open(UCollator *coll, UErrorCode *status);
/**
* Release a <code>UCD</code> object.
*
* @param ucd - the object
*
* @internal ICU 4.0.1 technology preview
*/
U_CAPI void U_EXPORT2
ucd_close(UCD *ucd);
/**
* Get the <code>UCollator</code> object used to create a <code>UCD</code> object.
* The <code>UCollator</code> object returned may not be the exact
* object that was used to create this object, but it will have the
* same behavior.
*
* @param ucd - the <code>UCD</code> object
*
* @return the <code>UCollator</code> used to create the given
* <code>UCD</code> object.
*
* @internal ICU 4.0.1 technology preview
*/
U_CAPI UCollator * U_EXPORT2
ucd_getCollator(UCD *ucd);
/**
* <code>UCD</code> objects are expensive to compute, and so
* may be cached. This routine will free the cached objects and delete
* the cache.
*
* WARNING: Don't call this until you have called <code>ucd_close</code>
* for each <code>UCD</code> object that you have used. Also,
* DO NOT call this if another thread may be calling <code>ucd_flushCache</code>
* at the same time.
*
* @internal ICU 4.0.1 technology preview
*/
U_CAPI void U_EXPORT2
ucd_freeCache();
/**
* <code>UCD</code> objects are expensive to compute, and so
* may be cached. This routine will remove any unused <code>UCD</code>
* objects from the cache.
*
* @internal ICU 4.0.1 technology preview
*/
U_CAPI void U_EXPORT2
ucd_flushCache();
/**
* BMS
*
* This object holds the information needed to do a Collation sensitive Boyer-Moore search. It encapsulates
* the pattern, the "bad character" and "good suffix" tables, the Collator-based data needed to compute them,
* and a reference to the text being searched.
*
* To do a search, you first need to get a <code>UCD</code> object by calling <code>ucd_open</code>.
* Then you construct a <code>BMS</code> object from the <code>UCD</code> object, the pattern
* string and the target string. Then you call the <code>bms_search</code> function. Here's a code sample:
*
* <pre>
* void boyerMooreExample(UCollator *collator, UChar *pattern, int32_t patternLength, UChar *target, int32_t targetLength)
* {
* UErrorCode status = U_ZERO_ERROR;
* int32_t offset = 0, start = -1, end = -1;
* UCD *ucd = NULL;
* BMS *bms = NULL;
*
* ucd = ucd_open(collator, &status);
* if (U_FAILURE(status)) {
* // could not create a UCD object
* return;
* }
*
* bms = bms_open(ucd, pattern, patternLength, target, targetLength, &status);
* if (U_FAILURE(status)) {
* // could not create a BMS object
* ucd_close(ucd);
* return;
* }
*
*
* // Find all matches
* while (bms_search(bms, offset, &start, &end)) {
* // process the match between start and end
* ...
*
* // advance past the match
* offset = end;
* }
*
* // at this point, if offset == 0, there were no matches
* if (offset == 0) {
* // handle the case of no matches
* }
*
* bms_close(bms);
* ucd_close(ucd);
*
* // UCD objects are cached, so the call to
* // ucd_close doesn't delete the object.
* // Call this if you don't need the object any more.
* ucd_flushCache();
* }
* </pre>
*
* NOTE: This is a technology preview. The final version of this API may not bear any resemblance to this API.
*
* Known limitations:
* 1) Backwards searching has not been implemented.
*
* 2) For Han and Hangul characters, this code ignores any Collation tailorings. In general,
* this isn't a problem, but in Korean locales, at strength 1, Hangul characters are tailored
* to be equal to Han characters with the same pronunciation. Because this code ignores
* tailorings, searching for a Hangul character will not find a Han character and vice versa.
*
* 3) In some cases, searching for a pattern that needs to be normalized and ends
* in a discontiguous contraction may fail. The only known cases of this are with
* the Tibetan script. For example searching for the pattern
* "\u0F7F\u0F80\u0F81\u0F82\u0F83\u0F84\u0F85" will fail. (This case is artificial. We've
* been unable to find a practical, real-world example of this failure.)
*
* NOTE: This is a technology preview. The final version of this API may not bear any resemblance to this API.
*
* @internal ICU 4.0.1 technology preview
*/
struct BMS;
typedef struct BMS BMS;
/**
* Construct a <code>BMS</code> object.
*
* @param ucd - A <code>UCD</code> object holding the Collator-sensitive data
* @param pattern - the string for which to search
* @param patternLength - the length of the string for which to search
* @param target - the string in which to search
* @param targetLength - the length of the string in which to search
* @param status - will be set if any errors occur.
*
* @return the <code>BMS</code> object.
*
* Note: if on return status is set to an error, the only safe
* thing to do with the returned object is to call
* <code>bms_close</code>.
*
* @internal ICU 4.0.1 technology preview
*/
U_CAPI BMS * U_EXPORT2
bms_open(UCD *ucd,
const UChar *pattern, int32_t patternLength,
const UChar *target, int32_t targetLength,
UErrorCode *status);
/**
* Close a <code>BMS</code> object and release all the
* storage associated with it.
*
* @param bms - the <code>BMS</code> object to close.
*
* @internal ICU 4.0.1 technology preview
*/
U_CAPI void U_EXPORT2
bms_close(BMS *bms);
/**
* Test the pattern to see if it generates any CEs.
*
* @param bms - the <code>BMS</code> object
*
* @return <code>TRUE</code> if the pattern string did not generate any CEs
*
* @internal ICU 4.0.1 technology preview
*/
U_CAPI UBool U_EXPORT2
bms_empty(BMS *bms);
/**
* Get the <code>UCD</code> object used to create
* a given <code>BMS</code> object.
*
* @param bms - the <code>BMS</code> object
*
* @return - the <code>UCD</code> object used to create
* the given <code>BMS</code> object.
*
* @internal ICU 4.0.1 technology preview
*/
U_CAPI UCD * U_EXPORT2
bms_getData(BMS *bms);
/**
* Search for the pattern string in the target string.
*
* @param bms - the <code>BMS</code> object
* @param offset - the offset in the target string at which to begin the search
* @param start - will be set to the starting offset of the match, or -1 if there's no match
* @param end - will be set to the ending offset of the match, or -1 if there's no match
*
* @return <code>TRUE</code> if the match succeeds, <code>FALSE</code> otherwise.
*
* @internal ICU 4.0.1 technology preview
*/
U_CAPI UBool U_EXPORT2
bms_search(BMS *bms, int32_t offset, int32_t *start, int32_t *end);
/**
* Set the target string for the match.
*
* @param bms - the <code>BMS</code> object
* @param target - the new target string
* @param targetLength - the length of the new target string
* @param status - will be set if any errors occur.
*
* @internal ICU 4.0.1 technology preview
*/
U_CAPI void U_EXPORT2
bms_setTargetString(BMS *bms, const UChar *target, int32_t targetLength, UErrorCode *status);
#endif /* _BMS_H */

View File

@ -0,0 +1,221 @@
/*
******************************************************************************
* Copyright (C) 1996-2009, International Business Machines *
* Corporation and others. All Rights Reserved. *
******************************************************************************
*/
/**
* \file
* \brief C++ API: Boyer-Moore StringSearch technology preview
* \internal ICU 4.0.1 technology preview
*/
#ifndef B_M_SEARCH_H
#define B_M_SEARCH_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_COLLATION
#include "unicode/uobject.h"
#include "unicode/ucol.h"
#include "unicode/colldata.h"
U_NAMESPACE_BEGIN
class BadCharacterTable;
class GoodSuffixTable;
class Target;
/**
* BoyerMooreSearch
*
* This object holds the information needed to do a Collation sensitive Boyer-Moore search. It encapsulates
* the pattern, the "bad character" and "good suffix" tables, the Collator-based data needed to compute them,
* and a reference to the text being searched.
*
* To do a search, you first need to get a <code>CollData</code> object by calling <code>CollData::open</code>.
* Then you construct a <code>BoyerMooreSearch</code> object from the <code>CollData</code> object, the pattern
* string and the target string. Then you call the <code>search</code> method. Here's a code sample:
*
* <pre>
* void boyerMooreExample(UCollator *collator, UnicodeString *pattern, UnicodeString *target)
* {
* UErrorCode status = U_ZERO_ERROR;
* CollData *collData = CollData::open(collator, status);
*
* if (U_FAILURE(status)) {
* // could not create a CollData object
* return;
* }
*
* BoyerMooreSearch *search = new BoyerMooreSearch(collData, *pattern, target, status);
*
* if (U_FAILURE(status)) {
* // could not create a BoyerMooreSearch object
* delete search;
* CollData::close(collData);
* return;
* }
*
* int32_t offset = 0, start = -1, end = -1;
*
* // Find all matches
* while (search->search(offset, start, end)) {
* // process the match between start and end
* ...
* // advance past the match
* offset = end;
* }
*
* // at this point, if offset == 0, there were no matches
* if (offset == 0) {
* // handle the case of no matches
* }
*
* delete search;
* CollData::close(collData);
*
* // CollData objects are cached, so the call to
* // CollData::close doesn't delete the object.
* // Call this if you don't need the object any more.
* CollData::flushCollDataCache();
* }
* </pre>
*
* NOTE: This is a technology preview. The final version of this API may not bear any resemblance to this API.
*
* Known limitations:
* 1) Backwards searching has not been implemented.
*
* 2) For Han and Hangul characters, this code ignores any Collation tailorings. In general,
* this isn't a problem, but in Korean locales, at strength 1, Hangul characters are tailored
* to be equal to Han characters with the same pronunciation. Because this code ignores
* tailorings, searching for a Hangul character will not find a Han character and vice versa.
*
* 3) In some cases, searching for a pattern that needs to be normalized and ends
* in a discontiguous contraction may fail. The only known cases of this are with
* the Tibetan script. For example searching for the pattern
* "\u0F7F\u0F80\u0F81\u0F82\u0F83\u0F84\u0F85" will fail. (This case is artificial. We've
* been unable to find a practical, real-world example of this failure.)
*
* @internal ICU 4.0.1 technology preview
*
* @see CollData
*/
class U_I18N_API BoyerMooreSearch : public UObject
{
public:
/**
* Construct a <code>BoyerMooreSearch</code> object.
*
* @param theData - A <code>CollData</code> object holding the Collator-sensitive data
* @param patternString - the string for which to search
* @param targetString - the string in which to search or <code>NULL</code> if you will
* set it later by calling <code>setTargetString</code>.
* @param status - will be set if any errors occur.
*
* Note: if on return, status is set to an error code,
* the only safe thing to do with this object is to call
* the destructor.
*
* @internal ICU 4.0.1 technology preview
*/
BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString, UErrorCode &status);
/**
* The destructor.
*
* @internal ICU 4.0.1 technology preview
*/
~BoyerMooreSearch();
/**
* Test the pattern to see if it generates any CEs.
*
* @return <code>TRUE</code> if the pattern string did not generate any CEs
*
* @internal ICU 4.0.1 technology preview
*/
UBool empty();
/**
* Search for the pattern string in the target string.
*
* @param offset - the offset in the target string at which to begin the search
* @param start - will be set to the starting offset of the match, or -1 if there's no match
* @param end - will be set to the ending offset of the match, or -1 if there's no match
*
* @return <code>TRUE</code> if the match succeeds, <code>FALSE</code> otherwise.
*
* @internal ICU 4.0.1 technology preview
*/
UBool search(int32_t offset, int32_t &start, int32_t &end);
/**
* Set the target string for the match.
*
* @param targetString - the new target string
* @param status - will be set if any errors occur.
*
* @internal ICU 4.0.1 technology preview
*/
void setTargetString(const UnicodeString *targetString, UErrorCode &status);
// **** no longer need these? ****
/**
* Return the <code>CollData</code> object used for searching
*
* @return the <code>CollData</code> object used for searching
*
* @internal ICU 4.0.1 technology preview
*/
CollData *getData();
/**
* Return the CEs generated by the pattern string.
*
* @return a <code>CEList</code> object holding the CEs generated by the pattern string.
*
* @internal ICU 4.0.1 technology preview
*/
CEList *getPatternCEs();
/**
* Return the <code>BadCharacterTable</code> object computed for the pattern string.
*
* @return the <code>BadCharacterTable</code> object.
*
* @internal ICU 4.0.1 technology preview
*/
BadCharacterTable *getBadCharacterTable();
/**
* Return the <code>GoodSuffixTable</code> object computed for the pattern string.
*
* @return the <code>GoodSuffixTable</code> object computed for the pattern string.
*
* @internal ICU 4.0.1 technology preview
*/
GoodSuffixTable *getGoodSuffixTable();
/*
* UObject glue...
*/
virtual UClassID getDynamicClassID() const;
static UClassID getStaticClassID();
private:
// Internal search state.
// NOTE(review): the implementation is not visible from this header. Judging by
// the accessors above, these presumably hold the collation data, the pattern's
// CEs, the two Boyer-Moore tables, a copy of the pattern and the current target
// text -- confirm against bmsearch.cpp.
CollData *data;
CEList *patCEs;
BadCharacterTable *badCharacterTable;
GoodSuffixTable *goodSuffixTable;
UnicodeString pattern;
Target *target;
};
U_NAMESPACE_END
#endif // #if !UCONFIG_NO_COLLATION
#endif // #ifndef B_M_SEARCH_H

View File

@ -0,0 +1,430 @@
/*
******************************************************************************
* Copyright (C) 1996-2009, International Business Machines *
* Corporation and others. All Rights Reserved. *
******************************************************************************
*/
/**
* \file
* \brief C++ API: Collation data used to compute minLengthInChars.
* \internal
*/
#ifndef COLL_DATA_H
#define COLL_DATA_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_COLLATION
#include "unicode/uobject.h"
#include "unicode/ucol.h"
U_NAMESPACE_BEGIN
/*
* The size of the internal buffer for the Collator's short description string.
*/
#define KEY_BUFFER_SIZE 64
/*
* The size of the internal CE buffer in a <code>CEList</code> object
*/
#define CELIST_BUFFER_SIZE 4
/*
* Define this to enable the <code>CEList</code> objects to collect
* statistics.
*/
//#define INSTRUMENT_CELIST
/*
* The size of the initial list in a <code>StringList</code> object.
*/
#define STRING_LIST_BUFFER_SIZE 16
/*
* Define this to enable the <code>StringList</code> objects to
* collect statistics.
*/
//#define INSTRUMENT_STRING_LIST
/**
* CEList
*
* This object holds a list of CEs generated from a particular
* <code>UnicodeString</code>
*
* @internal ICU 4.0.1 technology preview
*/
class U_I18N_API CEList : public UObject
{
public:
/**
* Construct a <code>CEList</code> object.
*
* @param coll - the Collator used to collect the CEs.
* @param string - the string for which to collect the CEs.
* @param status - will be set if any errors occur.
*
* Note: if on return, status is set to an error code,
* the only safe thing to do with this object is to call
* the destructor.
*
* @internal ICU 4.0.1 technology preview
*/
CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status);
/**
* The destructor.
*
* @internal ICU 4.0.1 technology preview
*/
~CEList();
/**
* Return the number of CEs in the list.
*
* @return the number of CEs in the list.
*
* @internal ICU 4.0.1 technology preview
*/
int32_t size() const;
/**
* Get a particular CE from the list.
*
* @param index - the index of the CE to return
*
* @return the CE, or <code>0</code> if <code>index</code> is out of range
*
* @internal ICU 4.0.1 technology preview
*/
uint32_t get(int32_t index) const;
/**
* Check if the CEs in another <code>CEList</code> match the
* suffix of this list starting at a given offset.
*
* @param offset - the offset of the suffix
* @param other - the other <code>CEList</code>
*
* @return <code>TRUE</code> if the CEs match, <code>FALSE</code> otherwise.
*
* @internal ICU 4.0.1 technology preview
*/
UBool matchesAt(int32_t offset, const CEList *other) const;
/**
* The index operator.
*
* NOTE(review): this is a const member that returns a non-const reference,
* so a CE can be modified through a const <code>CEList</code> -- confirm
* that this is intended.
*
* @param index - the index
*
* @return a reference to the given CE in the list
*
* @internal ICU 4.0.1 technology preview
*/
uint32_t &operator[](int32_t index) const;
/*
* UObject glue...
*/
virtual UClassID getDynamicClassID() const;
static UClassID getStaticClassID();
private:
// Add a CE to the list; status will be set if any errors occur.
void add(uint32_t ce, UErrorCode &status);
// The fixed-size internal CE buffer (see CELIST_BUFFER_SIZE).
uint32_t ceBuffer[CELIST_BUFFER_SIZE];
// NOTE(review): presumably ces points at the CE storage currently in use,
// listMax is its capacity and listSize the number of CEs stored --
// confirm against colldata.cpp.
uint32_t *ces;
int32_t listMax;
int32_t listSize;
// Statistics, compiled in only when INSTRUMENT_CELIST is defined.
#ifdef INSTRUMENT_CELIST
static int32_t _active;
static int32_t _histogram[10];
#endif
};
/**
* StringList
*
* This object holds a list of <code>UnicodeString</code> objects.
*
* @internal ICU 4.0.1 technology preview
*/
class U_I18N_API StringList : public UObject
{
public:
/**
* Construct an empty <code>StringList</code>
*
* @param status - will be set if any errors occur.
*
* Note: if on return, status is set to an error code,
* the only safe thing to do with this object is to call
* the destructor.
*
* @internal ICU 4.0.1 technology preview
*/
StringList(UErrorCode &status);
/**
* The destructor.
*
* @internal ICU 4.0.1 technology preview
*/
~StringList();
/**
* Add a string to the list.
*
* @param string - the string to add
* @param status - will be set if any errors occur.
*
* @internal ICU 4.0.1 technology preview
*/
void add(const UnicodeString *string, UErrorCode &status);
/**
* Add an array of Unicode code points to the list.
*
* @param chars - the address of the array of code points
* @param count - the number of code points in the array
* @param status - will be set if any errors occur.
*
* @internal ICU 4.0.1 technology preview
*/
void add(const UChar *chars, int32_t count, UErrorCode &status);
/**
* Get a particular string from the list.
*
* @param index - the index of the string
*
* @return a pointer to the <code>UnicodeString</code> or <code>NULL</code>
* if <code>index</code> is out of bounds.
*
* @internal ICU 4.0.1 technology preview
*/
const UnicodeString *get(int32_t index) const;
/**
* Get the number of strings in the list.
*
* @return the number of strings in the list.
*
* @internal ICU 4.0.1 technology preview
*/
int32_t size() const;
/*
* the UObject glue...
*/
virtual UClassID getDynamicClassID() const;
static UClassID getStaticClassID();
private:
// NOTE(review): presumably strings is the array holding the list entries,
// listMax its capacity (initially STRING_LIST_BUFFER_SIZE) and listSize the
// number of strings stored -- confirm against colldata.cpp.
UnicodeString *strings;
int32_t listMax;
int32_t listSize;
// Statistics, compiled in only when INSTRUMENT_STRING_LIST is defined.
#ifdef INSTRUMENT_STRING_LIST
static int32_t _lists;
static int32_t _strings;
static int32_t _histogram[101];
#endif
};
/*
* Forward references to internal classes.
*/
class StringToCEsMap;
class CEToStringsMap;
class CollDataCache;
/**
* CollData
*
* This class holds the Collator-specific data needed to
* compute the length of the shortest string that can
* generate a particular list of CEs.
*
* <code>CollData</code> objects are quite expensive to compute. Because
* of this, they are cached. When you call <code>CollData::open</code> it
* returns a reference counted cached object. When you call <code>CollData::close</code>
* the reference count on the object is decremented but the object is not deleted.
*
* If you do not need to reuse any unreferenced objects in the cache, you can call
* <code>CollData::flushCollDataCache</code>. If you no longer need any <code>CollData</code>
* objects, you can call <code>CollData::freeCollDataCache</code>.
*
* @internal ICU 4.0.1 technology preview
*/
class U_I18N_API CollData : public UObject
{
public:
/**
* Construct a <code>CollData</code> object.
*
* @param collator - the collator
* @param status - will be set if any errors occur.
*
* @return the <code>CollData</code> object. You must call
* <code>close</code> when you are done using the object.
*
* Note: if on return, status is set to an error code,
* the only safe thing to do with this object is to call
* <code>CollData::close</code>.
*
* @internal ICU 4.0.1 technology preview
*/
static CollData *open(UCollator *collator, UErrorCode &status);
/**
* Release a <code>CollData</code> object.
*
* @param collData - the object
*
* @internal ICU 4.0.1 technology preview
*/
static void close(CollData *collData);
/**
* Get the <code>UCollator</code> object used to create this object.
* The object returned may not be the exact object that was used to
* create this object, but it will have the same behavior.
*
* @return the <code>UCollator</code> object.
*
* @internal ICU 4.0.1 technology preview
*/
UCollator *getCollator() const;
/**
* Get a list of all the strings which generate a list
* of CEs starting with a given CE.
*
* NOTE(review): <code>ce</code> is declared as <code>int32_t</code> here,
* while <code>CEList</code> stores and returns CEs as <code>uint32_t</code>
* -- confirm the intended signedness.
*
* @param ce - the CE
*
* @return a <code>StringList</code> object containing all
* the strings, or <code>NULL</code> if there are
* no such strings.
*
* @internal ICU 4.0.1 technology preview.
*/
const StringList *getStringList(int32_t ce) const;
/**
* Get a list of the CEs generated by a particular string.
*
* @param string - the string
*
* @return a <code>CEList</code> object containing the CEs. You
* must call <code>freeCEList</code> when you are finished
* using the <code>CEList</code>.
*
* @internal ICU 4.0.1 technology preview.
*/
const CEList *getCEList(const UnicodeString *string) const;
/**
* Release a <code>CEList</code> returned by <code>getCEList</code>.
*
* @param list - the <code>CEList</code> to free.
*
* @internal ICU 4.0.1 technology preview
*/
void freeCEList(const CEList *list);
/**
* Return the length of the shortest string that will generate
* the given list of CEs.
*
* @param ces - the CEs
* @param offset - the offset of the first CE in the list to use.
*
* @return the length of the shortest string.
*
* @internal ICU 4.0.1 technology preview
*/
int32_t minLengthInChars(const CEList *ces, int32_t offset) const;
/**
* Return the length of the shortest string that will generate
* the given list of CEs.
*
* Note: the algorithm used to do this computation is recursive. To
* limit the amount of recursion, a "history" list is used to record
* the best answer starting at a particular offset in the list of CEs.
* If the same offset is visited again during the recursion, the answer
* in the history list is used.
*
* @param ces - the CEs
* @param offset - the offset of the first CE in the list to use.
* @param history - the history list. Must be at least as long as
* the number of CEs in the <code>CEList</code>
*
* @return the length of the shortest string.
*
* @internal ICU 4.0.1 technology preview
*/
int32_t minLengthInChars(const CEList *ces, int32_t offset, int32_t *history) const;
/*
* UObject glue...
*/
virtual UClassID getDynamicClassID() const;
static UClassID getStaticClassID();
/**
* <code>CollData</code> objects are expensive to compute, and so
* may be cached. This routine will free the cached objects and delete
* the cache.
*
* WARNING: Don't call this until you have called <code>close</code>
* for each <code>CollData</code> object that you have used. Also,
* DO NOT call this if another thread may be calling <code>flushCollDataCache</code>
* at the same time.
*
* @internal ICU 4.0.1 technology preview
*/
static void freeCollDataCache();
/**
* <code>CollData</code> objects are expensive to compute, and so
* may be cached. This routine will remove any unused <code>CollData</code>
* objects from the cache.
*
* @internal ICU 4.0.1 technology preview
*/
static void flushCollDataCache();
private:
// The constructors and the destructor are private: clients obtain objects
// through open() and release them through close(). The cache classes are
// friends so that they can reach these private members.
friend class CollDataCache;
friend class CollDataCacheEntry;
CollData(UCollator *collator, char *cacheKey, int32_t cachekeyLength, UErrorCode &status);
~CollData();
CollData();
// NOTE(review): presumably builds the cache key for a collator from its
// short description string (see KEY_BUFFER_SIZE) -- confirm in colldata.cpp.
static char *getCollatorKey(UCollator *collator, char *buffer, int32_t bufferLength);
static CollDataCache *getCollDataCache();
UCollator *coll;
StringToCEsMap *charsToCEList;
CEToStringsMap *ceToCharsStartingWith;
// Internal buffer for the Collator's short description string.
char keyBuffer[KEY_BUFFER_SIZE];
char *key;
static CollDataCache *collDataCache;
// NOTE(review): presumably CE range limits used for the special handling of
// Han and Hangul (Jamo) characters -- confirm in colldata.cpp.
uint32_t minHan;
uint32_t maxHan;
uint32_t jamoLimits[4];
};
U_NAMESPACE_END
#endif // #if !UCONFIG_NO_COLLATION
#endif // #ifndef COLL_DATA_H

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2001-2008, International Business Machines
* Copyright (C) 2001-2009, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*
@ -121,6 +121,7 @@ ucol_openElements(const UCollator *coll,
int32_t textLength,
UErrorCode *status);
/**
* get a hash code for a key... Not very useful!
* @param key the given key.
@ -152,6 +153,20 @@ ucol_closeElements(UCollationElements *elems);
U_STABLE void U_EXPORT2
ucol_reset(UCollationElements *elems);
/**
* Set the collation elements to use implicit ordering for Han
* even if they've been tailored. This will also force Hangul
* syllables to be ordered by decomposing them to their component
* Jamo.
*
* @param elems The UCollationElements containing the text.
* @param status A pointer to a UErrorCode to receive any errors.
*
* @internal
*/
U_INTERNAL void U_EXPORT2
ucol_forceHanImplicit(UCollationElements *elems, UErrorCode *status);
/**
* Get the ordering priority of the next collation element in the text.
* A single character may contain more than one collation element.

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2001-2008 IBM and others. All rights reserved.
* Copyright (C) 2001-2009 IBM and others. All rights reserved.
**********************************************************************
* Date Name Description
* 07/02/2001 synwee Creation.
@ -3785,7 +3785,7 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
found = TRUE;
// Inner loop checks for a match beginning at each
// position from the outer loop.
for (patIx=0; patIx<strsrch->pattern.CELength; patIx++) {
for (patIx=0; patIx<strsrch->pattern.PCELength; patIx++) {
int64_t patCE = strsrch->pattern.PCE[patIx];
targetCEI = ceb.get(targetIx+patIx);
// Compare CE from target string with CE from the pattern.
@ -3814,11 +3814,9 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
// an acceptable character range.
//
const CEI *firstCEI = ceb.get(targetIx);
const CEI *lastCEI = ceb.get(targetIx + strsrch->pattern.CELength - 1);
const CEI *nextCEI = ceb.get(targetIx + strsrch->pattern.CELength);
const CEI *lastCEI = ceb.get(targetIx + strsrch->pattern.PCELength - 1);
const CEI *nextCEI = ceb.get(targetIx + strsrch->pattern.PCELength);
// targetCEI = ceb.get(targetIx+strsrch->pattern.CELength);
// maxLimit = targetCEI->lowIndex;
mStart = firstCEI->lowIndex;
minLimit = lastCEI->lowIndex;
maxLimit = nextCEI->lowIndex;
@ -3883,7 +3881,7 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
found = FALSE;
}
if (!checkIdentical(strsrch, mStart, mLimit)) {
if (! checkIdentical(strsrch, mStart, mLimit)) {
found = FALSE;
}
@ -4006,10 +4004,10 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
found = TRUE;
// Inner loop checks for a match beginning at each
// position from the outer loop.
for (patIx = strsrch->pattern.CELength - 1; patIx >= 0; patIx -= 1) {
for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) {
int64_t patCE = strsrch->pattern.PCE[patIx];
targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.CELength - 1 - patIx);
targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx);
// Compare CE from target string with CE from the pattern.
// Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
// which will fail the compare, below.
@ -4035,7 +4033,7 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
// There still is a chance of match failure if the CE range not correspond to
// an acceptable character range.
//
const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.CELength - 1);
const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1);
const CEI *lastCEI = ceb.getPrevious(targetIx);
const CEI *nextCEI = targetIx > 0? ceb.getPrevious(targetIx - 1) : NULL;
@ -4102,6 +4100,10 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
found = FALSE;
}
if (! checkIdentical(strsrch, mStart, mLimit)) {
found = FALSE;
}
if (found) {
break;
}

View File

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2008, International Business Machines Corporation and
* Copyright (c) 1997-2009, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/*******************************************************************************
@ -515,7 +515,7 @@ backAndForth(UCollationElements *iter)
}
if (o != orders[index].order) {
log_err("Mismatched order at index %d: 0x%0:8X vs. 0x%0:8X\n", index,
log_err("Mismatched order at index %d: 0x%8.8X vs. 0x%8.8X\n", index,
orders[index].order, o);
goto bail;
}

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2005-2008, International Business Machines
* Copyright (C) 2005-2009, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -11,6 +11,7 @@
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "unicode/ucol.h"
#include "unicode/bmsearch.h"
#include "intltest.h"
@ -34,10 +35,24 @@ public:
virtual void offsetTest();
virtual void monkeyTest(char *params);
virtual void bmMonkeyTest(char *params);
virtual void boyerMooreTest();
virtual void goodSuffixTest();
virtual void searchTime();
virtual void bmsTest();
virtual void bmSearchTest();
virtual void udhrTest();
private:
virtual const char *getPath(char buffer[2048], const char *filename);
virtual int32_t monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
const char *name, const char *strength, uint32_t seed);
virtual int32_t bmMonkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
BoyerMooreSearch *bms, BoyerMooreSearch *abms,
const char *name, const char *strength, uint32_t seed);
#endif
};

View File

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (C) 2008 IBM, Inc. All Rights Reserved.
* Copyright (C) 2008-2009 IBM, Inc. All Rights Reserved.
*
********************************************************************/
/**
@ -14,7 +14,13 @@
StringSearchPerformanceTest::StringSearchPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status)
:UPerfTest(argc,argv,status){
int32_t start, end;
#ifdef TEST_BOYER_MOORE_SEARCH
bms = NULL;
#else
srch = NULL;
#endif
pttrn = NULL;
if(status== U_ILLEGAL_ARGUMENT_ERROR || line_mode){
fprintf(stderr,gUsageString, "strsrchperf");
@ -22,7 +28,8 @@ StringSearchPerformanceTest::StringSearchPerformanceTest(int32_t argc, const cha
}
/* Get the Text */
src = getBuffer(srcLen, status);
#if 0
/* Get a word to find. Do this by selecting a random word with a word breakiterator. */
UBreakIterator* brk = ubrk_open(UBRK_WORD, locale, src, srcLen, &status);
if(U_FAILURE(status)){
@ -38,9 +45,38 @@ StringSearchPerformanceTest::StringSearchPerformanceTest(int32_t argc, const cha
}
pttrn = temp; /* store word in pttrn */
ubrk_close(brk);
#else
/* The first line of the file contains the pattern */
start = 0;
for(end = start; ; end += 1) {
UChar ch = src[end];
if (ch == 0x000A || ch == 0x000D || ch == 0x2028) {
break;
}
}
pttrnLen = end - start;
UChar* temp = (UChar*)malloc(sizeof(UChar)*(pttrnLen));
for (int i = 0; i < pttrnLen; i++) {
temp[i] = src[start++];
}
pttrn = temp; /* store word in pttrn */
#endif
#ifdef TEST_BOYER_MOORE_SEARCH
UnicodeString patternString(pttrn, pttrnLen);
UCollator *coll = ucol_open(locale, &status);
CollData *data = CollData::open(coll, status);
targetString = new UnicodeString(src, srcLen);
bms = new BoyerMooreSearch(data, patternString, targetString, status);
#else
/* Create the StringSearch object to be used in the performance test. */
srch = usearch_open(pttrn, pttrnLen, src, srcLen, locale, NULL, &status);
#endif
if(U_FAILURE(status)){
fprintf(stderr, "FAILED to create UPerfTest object. Error: %s\n", u_errorName(status));
return;
@ -49,12 +85,23 @@ StringSearchPerformanceTest::StringSearchPerformanceTest(int32_t argc, const cha
}
/*
 * Releases the resources acquired by the constructor.
 *
 * The Boyer-Moore members (bms, targetString) only exist when
 * TEST_BOYER_MOORE_SEARCH is defined, so their cleanup is compiled only in
 * that configuration; the UStringSearch handle is cleaned up in the other.
 * The pattern buffer is released in both configurations.
 */
StringSearchPerformanceTest::~StringSearchPerformanceTest() {
#ifdef TEST_BOYER_MOORE_SEARCH
    // The constructor sets bms to NULL first and assigns targetString only
    // immediately before creating bms. If bms is still NULL the searcher was
    // never built, so there is nothing valid to query or delete here.
    if (bms != NULL) {
        // Fetch the collation data and collator before the searcher is destroyed.
        CollData *data = bms->getData();
        UCollator *coll = (data != NULL) ? data->getCollator() : NULL;

        delete bms;
        delete targetString;

        if (data != NULL) {
            CollData::close(data);
        }

        if (coll != NULL) {
            ucol_close(coll);
        }
    }
#endif

    if (pttrn != NULL) {
        free(pttrn);
    }

#ifndef TEST_BOYER_MOORE_SEARCH
    if (srch != NULL) {
        usearch_close(srch);
    }
#endif
}
UPerfFunction* StringSearchPerformanceTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char *par) {
@ -70,12 +117,20 @@ UPerfFunction* StringSearchPerformanceTest::runIndexedTest(int32_t index, UBool
}
/*
 * Builds the timed function object for the forward search test. The search
 * handle passed along is the Boyer-Moore searcher or the UStringSearch,
 * depending on TEST_BOYER_MOORE_SEARCH. The caller receives a newly
 * allocated object.
 */
UPerfFunction* StringSearchPerformanceTest::Test_ICU_Forward_Search(){
#ifdef TEST_BOYER_MOORE_SEARCH
    return new StringSearchPerfFunction(ICUForwardSearch, bms, src, srcLen, pttrn, pttrnLen);
#else
    return new StringSearchPerfFunction(ICUForwardSearch, srch, src, srcLen, pttrn, pttrnLen);
#endif
}
/*
 * Builds the timed function object for the backward search test. The search
 * handle passed along is the Boyer-Moore searcher or the UStringSearch,
 * depending on TEST_BOYER_MOORE_SEARCH. The caller receives a newly
 * allocated object.
 */
UPerfFunction* StringSearchPerformanceTest::Test_ICU_Backward_Search(){
#ifdef TEST_BOYER_MOORE_SEARCH
    return new StringSearchPerfFunction(ICUBackwardSearch, bms, src, srcLen, pttrn, pttrnLen);
#else
    return new StringSearchPerfFunction(ICUBackwardSearch, srch, src, srcLen, pttrn, pttrnLen);
#endif
}

View File

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (C) 2008 IBM, Inc. All Rights Reserved.
* Copyright (C) 2008-2009 IBM, Inc. All Rights Reserved.
*
********************************************************************/
#ifndef _STRSRCHPERF_H
@ -8,11 +8,19 @@
#include "unicode/ubrk.h"
#include "unicode/usearch.h"
#include "unicode/colldata.h"
#include "unicode/bmsearch.h"
#include "unicode/uperf.h"
#include <stdlib.h>
#include <stdio.h>
#define TEST_BOYER_MOORE_SEARCH
#ifdef TEST_BOYER_MOORE_SEARCH
typedef void (*StrSrchFn) (BoyerMooreSearch * bms, const UChar *src, int32_t srcLen, const UChar *pttrn, int32_t pttrnLen, UErrorCode *status);
#else
typedef void (*StrSrchFn)(UStringSearch* srch, const UChar* src,int32_t srcLen, const UChar* pttrn, int32_t pttrnLen, UErrorCode* status);
#endif
class StringSearchPerfFunction : public UPerfFunction {
private:
@ -21,17 +29,39 @@ private:
int32_t srcLen;
const UChar* pttrn;
int32_t pttrnLen;
#ifdef TEST_BOYER_MOORE_SEARCH
BoyerMooreSearch *bms;
#else
UStringSearch* srch;
#endif
public:
/* Runs the stored search function once, handing it the stored search
   handle, text, pattern and the caller's status pointer. */
virtual void call(UErrorCode* status) {
#ifdef TEST_BOYER_MOORE_SEARCH
    fn(bms, src, srcLen, pttrn, pttrnLen, status);
#else
    fn(srch, src, srcLen, pttrn, pttrnLen, status);
#endif
}
/* Reports the operation count for one call() as the length of the source
   text. The disabled branch keeps the alternative of dividing the source
   length by the pattern length. */
virtual long getOperationsPerIteration() {
#if 0
    return static_cast<long>(srcLen/pttrnLen);
#else
    return static_cast<long>(srcLen);
#endif
}
#ifdef TEST_BOYER_MOORE_SEARCH
/* Boyer-Moore variant: stores the search function, the searcher, and the
   text and pattern buffers with their lengths. The pointers are stored
   as-is; nothing is copied. */
StringSearchPerfFunction(StrSrchFn func, BoyerMooreSearch *search, const UChar *source, int32_t sourceLen, const UChar *pattern, int32_t patternLen) {
    this->fn       = func;
    this->bms      = search;
    this->src      = source;
    this->srcLen   = sourceLen;
    this->pttrn    = pattern;
    this->pttrnLen = patternLen;
}
#else
StringSearchPerfFunction(StrSrchFn func, UStringSearch* search, const UChar* source,int32_t sourceLen, const UChar* pattern, int32_t patternLen) {
fn = func;
src = source;
@ -40,6 +70,7 @@ public:
pttrnLen = patternLen;
srch = search;
}
#endif
};
class StringSearchPerformanceTest : public UPerfTest {
@ -48,7 +79,12 @@ private:
int32_t srcLen;
UChar* pttrn;
int32_t pttrnLen;
#ifdef TEST_BOYER_MOORE_SEARCH
UnicodeString *targetString;
BoyerMooreSearch *bms;
#else
UStringSearch* srch;
#endif
public:
StringSearchPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status);
@ -56,9 +92,29 @@ public:
virtual UPerfFunction* runIndexedTest(int32_t index, UBool exec, const char *&name, char *par = NULL);
UPerfFunction* Test_ICU_Forward_Search();
UPerfFunction* Test_ICU_Backward_Search();
};
#ifdef TEST_BOYER_MOORE_SEARCH
/*
 * Boyer-Moore forward search driver: searches repeatedly, restarting each
 * search at the end offset reported for the previous match, until search()
 * reports no further match. The text and pattern arguments are unused here;
 * the searcher passed in is used as-is.
 */
void ICUForwardSearch(BoyerMooreSearch *bms, const UChar *source, int32_t sourceLen, const UChar *pattern, int32_t patternLen, UErrorCode * /*status*/) {
    int32_t matchStart = -1;
    int32_t matchEnd = -1;

    for (int32_t from = 0; bms->search(from, matchStart, matchEnd); from = matchEnd) {
        // Nothing to do per match; only the search itself is being timed.
    }
}
/*
 * Boyer-Moore "backward" search driver.
 *
 * NOTE: No Boyer-Moore backward search yet, so this performs the same
 * forward scan as ICUForwardSearch: it searches repeatedly, restarting at
 * the end offset of the previous match, until no further match is reported.
 */
void ICUBackwardSearch(BoyerMooreSearch *bms, const UChar *source, int32_t sourceLen, const UChar *pattern, int32_t patternLen, UErrorCode * /*status*/) {
    int32_t matchStart = -1;
    int32_t matchEnd = -1;

    for (int32_t from = 0; bms->search(from, matchStart, matchEnd); from = matchEnd) {
        // Nothing to do per match; only the search itself is being timed.
    }
}
#else
void ICUForwardSearch(UStringSearch *srch, const UChar* source, int32_t sourceLen, const UChar* pattern, int32_t patternLen, UErrorCode* status) {
int32_t match;
@ -76,5 +132,6 @@ void ICUBackwardSearch(UStringSearch *srch, const UChar* source, int32_t sourceL
match = usearch_previous(srch, status);
}
}
#endif
#endif /* _STRSRCHPERF_H */

View File

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Copyright (c) 2007-2008 IBM Corporation and others. All rights reserved -->
<!-- Copyright (c) 2007-2009 IBM Corporation and others. All rights reserved -->
<!-- Test data file for string search -->
<!DOCTYPE stringsearch-tests [
@ -12,6 +12,7 @@
locale CDATA "en"
strength (PRIMARY | SECONDARY | TERTIARY | QUATERNARY | IDENTICAL) "TERTIARY"
norm (ON | OFF) "OFF"
alternate_handling (NON_IGNORABLE | SHIFTED) "NON_IGNORABLE"
>
<!ELEMENT pattern (#PCDATA)>
@ -20,7 +21,7 @@
<!ELEMENT post (#PCDATA)>
]>
<stringsearch-tests debug="test32">
<stringsearch-tests>
<!-- debug="test11" (for copying into the above element) -->
<!-- Very simple match -->
@ -174,8 +175,15 @@
<pattern>A\u0300</pattern>
<pre>At IDENTICAL, shoud this match? </pre><m>\u00c0</m><post></post>
</test-case>
<test-case id="test25" strength="SECONDARY" locale="en">
<test-case id="test24b" strength="IDENTICAL" alternate_handling="SHIFTED" locale="en">
<pattern>A\u0300</pattern>
<pre>At IDENTICAL, shoud this match? </pre>
<m>\u00c0</m>
<post></post>
</test-case>
<test-case id="test25" strength="SECONDARY" locale="en">
<pattern>Ű</pattern>
<pre>12</pre><m>ű</m><post> Ű</post>
</test-case>
@ -285,11 +293,13 @@
<!-- Long combining sequences -->
<!-- Backwards search fails because the pattern ends w/ ignorables
<test-case id="test60" strength="PRIMARY">
<pattern>A\u0301\u0301\u0301\u0301</pattern>
<m>A\u0301\u0301\u0301\u0301\u0301</m>
</test-case>
-->
<test-case id="test61" strength="TERTIARY">
<pattern>A\u0301\u0301\u0301\u0301</pattern>
<pre>A\u0301\u0301\u0301\u0301\u0301</pre>
@ -409,5 +419,27 @@
<pattern>VII</pattern>
<m>\u2166</m>
</test-case>
<test-case id="test83" strength="IDENTICAL" alternate_handling="SHIFTED" locale="en">
<pattern>Universal Declaration of Human Rights</pattern>
<pre>Proclaims this </pre><m>Universal Declaration of Human Rights</m><post> as a common standard of achievement for all peoples and all nations</post>
</test-case>
<test-case id="test83b" strength="TERTIARY" alternate_handling="SHIFTED" locale="en">
<pattern>Universal Declaration of Human Rights</pattern>
<pre>Proclaims this </pre>
<m>Universal-Declaration-of-Human-Rights</m>
<post> as a common standard of achievement for all peoples and all nations</post>
</test-case>
<test-case id="test84" strength="TERTIARY" locale="en">
<pattern>\u05E9\u0591\u05E9</pattern>
<m>\u05E9\u0592\u05E9</m>
</test-case>
<test-case id="test84b" strength="IDENTICAL" locale="en">
<pattern>\u05E9\u0591\u05E9</pattern>
<pre>\u05E9\u0592\u05E9</pre>
</test-case>
</stringsearch-tests>