scuffed-code/icu4c/source/i18n/bmsearch.cpp

818 lines
21 KiB
C++

/*
******************************************************************************
* Copyright (C) 1996-2010, International Business Machines *
* Corporation and others. All Rights Reserved. *
******************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
#include "unicode/unistr.h"
#include "unicode/putil.h"
#include "unicode/usearch.h"
#include "cmemory.h"
#include "unicode/coll.h"
#include "unicode/tblcoll.h"
#include "unicode/coleitr.h"
#include "unicode/ucoleitr.h"
#include "unicode/regex.h" // TODO: make conditional on regexp being built.
#include "unicode/uniset.h"
#include "unicode/uset.h"
#include "unicode/ustring.h"
#include "hash.h"
#include "uhash.h"
#include "ucol_imp.h"
#include "normalizer2impl.h"
#include "unicode/colldata.h"
#include "unicode/bmsearch.h"
U_NAMESPACE_BEGIN
// Number of elements in a statically sized array (not valid for pointers).
#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
// Allocates raw storage for `count` objects of `type` via uprv_malloc(); no constructors run.
// Callers in this file compare the result against NULL before using it.
#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
// Releases storage obtained from NEW_ARRAY via uprv_free(); no destructors run.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
// One buffered collation element plus the collation-iterator offsets that
// bracket it, as recorded by Target::nextCE() / Target::prevCE().
struct CEI
{
uint32_t order; // masked collation element value (or UCOL_NULLORDER at the end of iteration)
int32_t lowOffset; // iterator offset on the low side of the element
int32_t highOffset; // iterator offset on the high side of the element
};
// Wraps the text being searched: owns a collation element iterator and a
// character break iterator over the target string, plus a small buffer (ceb)
// of the collation elements fetched since the last setOffset()/setLast().
class Target : public UMemory
{
public:
// Caches collator settings, sizes and allocates the CE buffer, and (if target
// is non-NULL) opens the iterators over it. Reports allocation failure of the
// buffer through status.
Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status);
~Target();
// Switches to a new target string (may be NULL) and reopens the iterators.
void setTargetString(const UnicodeString *target);
// Return buffer entry `offset`, fetching one more CE going forward / backward
// when offset is exactly the next unfilled slot; NULL if that is not possible.
const CEI *nextCE(int32_t offset);
const CEI *prevCE(int32_t offset);
// Length of the target string, or 0 if there is none.
int32_t stringLength();
// Code unit at `offset` (not range-checked), or 0 if there is no target string.
UChar charAt(int32_t offset);
UBool isBreakBoundary(int32_t offset);
int32_t nextBreakBoundary(int32_t offset);
int32_t nextSafeBoundary(int32_t offset);
// At identical strength, compares NFD forms of pattern and target[start, end);
// at weaker strengths always returns TRUE.
UBool isIdentical(UnicodeString &pattern, int32_t start, int32_t end);
// Empty the CE buffer and reposition the collation iterator.
void setOffset(int32_t offset);
// Like setOffset(), but pre-loads buffer slot 0 with a UCOL_NULLORDER entry at `last`.
void setLast(int32_t last);
int32_t getOffset();
private:
CEI *ceb; // CE buffer, bufferSize entries, allocated in the constructor
int32_t bufferSize; // capacity of ceb
int32_t bufferMin; // first valid index in ceb
int32_t bufferMax; // one past the last valid index in ceb
uint32_t strengthMask; // mask applied to every CE, derived from strength
UCollationStrength strength; // collator strength, cached in the constructor
uint32_t variableTop; // collator variable top, cached in the constructor
UBool toShift; // TRUE when alternate handling is UCOL_SHIFTED
UCollator *coll; // not owned (never closed by this class)
const Normalizer2 &nfd; // NFD normalizer used by isIdentical()
const UnicodeString *targetString; // current target; not owned
const UChar *targetBuffer; // targetString->getBuffer(), or NULL
int32_t targetLength; // targetString->length(), or 0
UCollationElements *elements; // owned; collation element iterator over the target
UBreakIterator *charBreakIterator; // owned; character break iterator over the target
};
/**
 * Sets up a search target for the given collator.
 *
 * Caches the collator's strength, alternate-handling mode and variable top,
 * allocates a CE buffer sized from the pattern length and the collator's
 * largest expansion, opens the iterators over `target` (when non-NULL) and
 * finally derives the strength mask.
 *
 * @param theCollator   collator used for iteration; not owned
 * @param target        text to search, or NULL to defer to setTargetString()
 * @param patternLength number of CEs in the pattern (used to size the buffer)
 * @param status        set to U_MEMORY_ALLOCATION_ERROR if the buffer cannot be
 *                      allocated; in that case the target string and strength
 *                      mask are left unset
 */
Target::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status)
    : bufferSize(0), bufferMin(0), bufferMax(0),
      strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator),
      nfd(*Normalizer2Factory::getNFDInstance(status)),
      targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL)
{
    // Snapshot the collator settings that nextCE()/prevCE() consult per element.
    strength = ucol_getStrength(coll);
    const UColAttributeValue alternateMode = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status);
    toShift = (alternateMode == UCOL_SHIFTED);
    variableTop = ucol_getVariableTop(coll, &status);

    // Scan the zero-terminated expansion size list for its largest entry.
    uint8_t widest = 0;
    const uint8_t *sizes = coll->expansionCESize;
    while (*sizes != 0) {
        if (widest < *sizes) {
            widest = *sizes;
        }
        ++sizes;
    }

    // Pattern length, one maximal expansion on each end, plus 4 spare slots.
    bufferSize = patternLength + (2 * widest) + 4;
    ceb = NEW_ARRAY(CEI, bufferSize);
    if (ceb == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    if (target != NULL) {
        setTargetString(target);
    }

    // Primary weights always take part; each stronger level adds its own bits.
    strengthMask |= UCOL_PRIMARYORDERMASK;
    if (strength != UCOL_PRIMARY) {
        strengthMask |= UCOL_SECONDARYORDERMASK;
        if (strength != UCOL_SECONDARY) {
            strengthMask |= UCOL_TERTIARYORDERMASK;
        }
    }
}
// Closes the break iterator and collation element iterator and frees the CE
// buffer. Both close calls are made unconditionally, including when the
// iterators were never opened (pointers still NULL from the constructor).
Target::~Target()
{
ubrk_close(charBreakIterator);
ucol_closeElements(elements);
DELETE_ARRAY(ceb);
}
/**
 * Points this Target at a new string and reopens the iterators over it.
 *
 * Any iterators opened for a previous target are closed first and their
 * pointers reset to NULL, so that a NULL target (or a failed open) never
 * leaves a stale handle behind for the destructor or a later call to close
 * a second time. Each handle is released independently of the other.
 *
 * @param target the text to search; not owned, must outlive its use here.
 *               May be NULL, which leaves the Target without text.
 */
void Target::setTargetString(const UnicodeString *target)
{
    if (charBreakIterator != NULL) {
        ubrk_close(charBreakIterator);
        charBreakIterator = NULL;
    }

    if (elements != NULL) {
        ucol_closeElements(elements);
        elements = NULL;
    }

    targetString = target;

    if (targetString != NULL) {
        // Errors from the open calls are not reported to the caller; this
        // method has no status parameter.
        UErrorCode status = U_ZERO_ERROR;

        targetBuffer = targetString->getBuffer();
        targetLength = targetString->length();

        elements = ucol_openElements(coll, targetBuffer, targetLength, &status);
        ucol_forceHanImplicit(elements, &status);

        charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
                                      targetBuffer, targetLength, &status);
    } else {
        targetBuffer = NULL;
        targetLength = 0;
    }
}
/**
 * Returns buffer entry `offset`, reading forward from the collation iterator
 * when that entry has not been filled yet.
 *
 * An already-buffered entry is returned as is. Otherwise `offset` must be the
 * next free slot and the buffer must have room; if not, NULL is returned.
 * Elements that reduce to UCOL_IGNORABLE after masking/shifting are skipped.
 * At the end of iteration the stored order is UCOL_NULLORDER.
 */
const CEI *Target::nextCE(int32_t offset)
{
    if (offset >= bufferMin && offset < bufferMax) {
        return &ceb[offset];
    }

    // Only the slot directly after the filled region can be appended.
    if (!(bufferMax < bufferSize && offset == bufferMax)) {
        return NULL;
    }

    UErrorCode status = U_ZERO_ERROR;
    int32_t startPos = -1;
    int32_t endPos = -1;
    uint32_t ce;
    UBool continuation = FALSE;

    for (;;) {
        startPos = ucol_getOffset(elements);
        ce = ucol_next(elements, &status);
        endPos = ucol_getOffset(elements);

        if (ce == (uint32_t)UCOL_NULLORDER) {
            // End of text; `continuation` keeps whatever the previous
            // (skipped) element set it to.
            break;
        }

        continuation = isContinuation(ce);
        ce &= strengthMask;

        const UBool variable = toShift && variableTop > ce && (ce & UCOL_PRIMARYORDERMASK) != 0;

        if (variable) {
            // Shifted element: keep only the primary weight at quaternary
            // strength or above, otherwise treat it as ignorable.
            ce = (strength >= UCOL_QUATERNARY) ? (ce & UCOL_PRIMARYORDERMASK) : UCOL_IGNORABLE;
        }

        if (ce != UCOL_IGNORABLE) {
            break;
        }
    }

    if (continuation) {
        ce |= UCOL_CONTINUATION_MARKER;
    }

    CEI *slot = &ceb[offset];

    slot->order = ce;
    slot->lowOffset = startPos;
    slot->highOffset = endPos;

    bufferMax += 1;

    return slot;
}
/**
 * Backward counterpart of nextCE(): returns buffer entry `offset`, reading
 * one more element with ucol_previous() when the entry is not buffered yet.
 *
 * Returns NULL if `offset` is neither buffered nor the next free slot, or if
 * the buffer is already full. Because iteration runs backward, the iterator
 * offset sampled before the read becomes highOffset and the one sampled
 * after it becomes lowOffset.
 */
const CEI *Target::prevCE(int32_t offset)
{
    const UBool buffered = offset >= bufferMin && offset < bufferMax;

    if (buffered) {
        return &ceb[offset];
    }

    if (bufferMax >= bufferSize) {
        return NULL;
    }

    if (offset != bufferMax) {
        return NULL;
    }

    UErrorCode status = U_ZERO_ERROR;
    int32_t lowPos = -1;
    int32_t highPos = -1;
    uint32_t ce;
    UBool continuation = FALSE;
    UBool keepReading = TRUE;

    while (keepReading) {
        highPos = ucol_getOffset(elements);
        ce = ucol_previous(elements, &status);
        lowPos = ucol_getOffset(elements);

        if (ce == (uint32_t)UCOL_NULLORDER) {
            // Ran off the start of the text; stop with the NULLORDER value.
            break;
        }

        continuation = isContinuation(ce);
        ce &= strengthMask;

        if (toShift && variableTop > ce && (ce & UCOL_PRIMARYORDERMASK) != 0) {
            if (strength < UCOL_QUATERNARY) {
                ce = UCOL_IGNORABLE;
            } else {
                ce &= UCOL_PRIMARYORDERMASK;
            }
        }

        // Ignorable elements are skipped; anything else ends the loop.
        keepReading = (ce == UCOL_IGNORABLE);
    }

    if (continuation) {
        ce |= UCOL_CONTINUATION_MARKER;
    }

    CEI &slot = ceb[offset];

    slot.order = ce;
    slot.lowOffset = lowPos;
    slot.highOffset = highPos;

    bufferMax += 1;

    return &slot;
}
/** Length of the current target string in code units, or 0 when no target is set. */
int32_t Target::stringLength()
{
    return (targetString != NULL) ? targetLength : 0;
}
/**
 * Code unit at `offset` in the target buffer, or 0x0000 when no target is set.
 * `offset` is not range-checked.
 */
UChar Target::charAt(int32_t offset)
{
    if (targetString == NULL) {
        return 0x0000;
    }

    return targetBuffer[offset];
}
/**
 * Empties the CE buffer and moves the collation iterator to `offset`.
 * Any error from ucol_setOffset() is discarded.
 */
void Target::setOffset(int32_t offset)
{
    UErrorCode ignoredStatus = U_ZERO_ERROR;

    // An empty buffer is the half-open range [0, 0).
    bufferMax = bufferMin = 0;

    ucol_setOffset(elements, offset, &ignoredStatus);
}
/**
 * Resets the CE buffer so that slot 0 holds a UCOL_NULLORDER entry whose low
 * and high offsets are both `last`, then moves the collation iterator to
 * `last`. Any error from ucol_setOffset() is discarded.
 */
void Target::setLast(int32_t last)
{
    UErrorCode ignoredStatus = U_ZERO_ERROR;

    // Exactly one buffered entry: the end marker in slot 0.
    bufferMin = 0;
    bufferMax = 1;

    CEI *endMarker = &ceb[0];

    endMarker->order = UCOL_NULLORDER;
    endMarker->lowOffset = last;
    endMarker->highOffset = last;

    ucol_setOffset(elements, last, &ignoredStatus);
}
// Current offset of the collation element iterator, as reported by ucol_getOffset().
int32_t Target::getOffset()
{
return ucol_getOffset(elements);
}
// Asks the character break iterator whether `offset` is a boundary (ubrk_isBoundary()).
UBool Target::isBreakBoundary(int32_t offset)
{
return ubrk_isBoundary(charBreakIterator, offset);
}
// Returns the character break iterator's result for ubrk_following(offset).
int32_t Target::nextBreakBoundary(int32_t offset)
{
return ubrk_following(charBreakIterator, offset);
}
/**
 * Scans forward from `offset` for the first code unit that is either a lead
 * surrogate or not reported as unsafe by ucol_unsafeCP() for this collator,
 * and returns its index. Returns the target length if no such code unit
 * exists (including when `offset` is already at or past the end).
 */
int32_t Target::nextSafeBoundary(int32_t offset)
{
    for (int32_t pos = offset; pos < targetLength; pos += 1) {
        const UChar unit = targetBuffer[pos];

        if (U_IS_LEAD(unit)) {
            return pos;
        }

        if (! ucol_unsafeCP(unit, coll)) {
            return pos;
        }
    }

    return targetLength;
}
/**
 * Identical-level check for a candidate match.
 *
 * Below UCOL_IDENTICAL strength every candidate passes. At identical strength
 * the target range [start, end) and the pattern are both normalized to NFD
 * and compared; a normalization failure counts as "not identical".
 *
 * @param pattern the pattern text
 * @param start   start offset of the candidate match in the target
 * @param end     limit offset of the candidate match in the target
 */
UBool Target::isIdentical(UnicodeString &pattern, int32_t start, int32_t end)
{
    if (strength < UCOL_IDENTICAL) {
        return TRUE;
    }

    // Normalizing both short strings directly; Normalizer::compare() or
    // similar would be an alternative.
    UErrorCode status = U_ZERO_ERROR;
    UnicodeString normalizedTarget;
    UnicodeString normalizedPattern;

    // Read-only alias over the matched range; no copy of the target text.
    const UnicodeString matched(FALSE, targetBuffer + start, end - start);

    nfd.normalize(matched, normalizedTarget, status);
    nfd.normalize(pattern, normalizedPattern, status);

    if (U_FAILURE(status)) {
        return FALSE;
    }

    return normalizedTarget == normalizedPattern;
}
// Number of buckets in the bad-character table; CEs are bucketed by primary weight.
#define HASH_TABLE_SIZE 257
// Boyer-Moore "bad character" shift table keyed by a hash of a collation
// element, together with a per-pattern-index cache of minimum match lengths.
class BadCharacterTable : public UMemory
{
public:
BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status);
~BadCharacterTable();
// Shift stored in the bucket that `ce` hashes to.
int32_t operator[](uint32_t ce) const;
// Value of data->minLengthInChars() for the whole pattern (index 0).
int32_t getMaxSkip() const;
// Cached minimum length for the pattern suffix starting at `index` (0..pattern size).
int32_t minLengthInChars(int32_t index);
private:
static int32_t hash(uint32_t ce);
int32_t maxSkip;
int32_t badCharacterTable[HASH_TABLE_SIZE];
int32_t *minLengthCache; // plen + 1 entries, owned; NULL if construction bailed out early
};
/**
 * Builds the bad-character shift table for a pattern.
 *
 * maxSkip and every table bucket are first set to 0 so that getMaxSkip() and
 * operator[] return a defined value even when construction bails out early
 * (failed status on entry, empty pattern, or allocation failure).
 *
 * @param patternCEs collation elements of the pattern
 * @param data       supplies minLengthInChars() for suffixes of the pattern
 * @param status     set to U_MEMORY_ALLOCATION_ERROR if a work array cannot
 *                   be allocated; nothing is built if it is already a failure
 */
BadCharacterTable::BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status)
    : maxSkip(0), minLengthCache(NULL)
{
    for (int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {
        badCharacterTable[j] = 0;
    }

    int32_t plen = patternCEs.size();

    // **** need a better way to deal with this ****
    if (U_FAILURE(status) || plen == 0) {
        return;
    }

    // Scratch array handed to data->minLengthInChars(); starts out all -1.
    int32_t *history = NEW_ARRAY(int32_t, plen);

    if (history == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    for (int32_t i = 0; i < plen; i += 1) {
        history[i] = -1;
    }

    minLengthCache = NEW_ARRAY(int32_t, plen + 1);

    if (minLengthCache == NULL) {
        DELETE_ARRAY(history);
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    maxSkip = minLengthCache[0] = data->minLengthInChars(&patternCEs, 0, history);

    // Default shift for any CE that is not in the pattern: the whole pattern.
    for (int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {
        badCharacterTable[j] = maxSkip;
    }

    for (int32_t p = 1; p < plen; p += 1) {
        minLengthCache[p] = data->minLengthInChars(&patternCEs, p, history);

        // Make sure this entry is not bigger than the previous one.
        // Otherwise, we might skip too far in some cases.
        if (minLengthCache[p] < 0 || minLengthCache[p] > minLengthCache[p - 1]) {
            minLengthCache[p] = minLengthCache[p - 1];
        }
    }

    minLengthCache[plen] = 0;

    // Every CE except the last one gets the minimum length of what follows it;
    // later occurrences overwrite earlier ones that share a bucket.
    for (int32_t p = 0; p < plen - 1; p += 1) {
        badCharacterTable[hash(patternCEs[p])] = minLengthCache[p + 1];
    }

    DELETE_ARRAY(history);
}
// Frees the minimum-length cache (may be NULL if construction returned early).
BadCharacterTable::~BadCharacterTable()
{
DELETE_ARRAY(minLengthCache);
}
// Looks up the shift for `ce` in the bucket selected by hash(ce).
// Distinct CEs that hash to the same bucket share one entry.
int32_t BadCharacterTable::operator[](uint32_t ce) const
{
return badCharacterTable[hash(ce)];
}
// Returns maxSkip, which the constructor sets to the minimum length of the whole pattern.
int32_t BadCharacterTable::getMaxSkip() const
{
return maxSkip;
}
// Returns the cached minimum length for pattern index `index`.
// `index` is not range-checked, and the cache is NULL if the constructor
// returned early, so only call this on a successfully built table.
int32_t BadCharacterTable::minLengthInChars(int32_t index)
{
return minLengthCache[index];
}
// Maps a collation element to a bucket index: its primary order modulo HASH_TABLE_SIZE.
int32_t BadCharacterTable::hash(uint32_t ce)
{
return UCOL_PRIMARYORDER(ce) % HASH_TABLE_SIZE;
}
// Boyer-Moore "good suffix" shift table: one shift per pattern CE index.
class GoodSuffixTable : public UMemory
{
public:
GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status);
~GoodSuffixTable();
// Shift for a mismatch at pattern index `offset` (not range-checked).
int32_t operator[](int32_t offset) const;
private:
int32_t *goodSuffixTable; // one entry per pattern CE, owned; NULL if construction bailed out early
};
// Builds the good-suffix shift table for the pattern.
//
// Works in two phases: first compute suff[i], the length of the longest run
// ending at index i that matches the end of the pattern, then convert those
// lengths into shifts. Shifts are taken from
// badCharacterTable.minLengthInChars() rather than being raw CE counts.
//
// On a failed status or an empty pattern nothing is built and the table
// pointer stays NULL. Allocation failure sets U_MEMORY_ALLOCATION_ERROR.
GoodSuffixTable::GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status)
: goodSuffixTable(NULL)
{
int32_t patlen = patternCEs.size();
// **** need a better way to deal with this ****
if (U_FAILURE(status) || patlen <= 0) {
return;
}
int32_t *suff = NEW_ARRAY(int32_t, patlen);
int32_t start = patlen - 1, end = - 1;
int32_t maxSkip = badCharacterTable.getMaxSkip();
if (suff == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
// initialize suff
// (the whole pattern trivially matches itself at the last index)
suff[patlen - 1] = patlen;
for (int32_t i = patlen - 2; i >= 0; i -= 1) {
// (i > start) means we're inside the last suffix match we found
// ((patlen - 1) - end) is how far the end of that match is from end of pattern
// (i - start) is how far we are from start of that match
// (i + (patlen - 1) - end) is index of same character at end of pattern
// so if any suffix match at that character doesn't extend beyond the last match,
// it's the suffix for this character as well
if (i > start && suff[i + patlen - 1 - end] < i - start) {
suff[i] = suff[i + patlen - 1 - end];
} else {
// Otherwise compare directly: walk `start` left from i while the CEs
// keep matching the CEs walking left from the end of the pattern.
start = end = i;
int32_t s = patlen;
while (start >= 0 && patternCEs[start] == patternCEs[--s]) {
start -= 1;
}
suff[i] = end - start;
}
}
// now build goodSuffixTable
goodSuffixTable = NEW_ARRAY(int32_t, patlen);
if (goodSuffixTable == NULL) {
DELETE_ARRAY(suff);
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
// initialize entries to minLengthInChars of the pattern
for (int32_t i = 0; i < patlen; i += 1) {
goodSuffixTable[i] = maxSkip;
}
int32_t prefix = 0;
// The loop starts at patlen - 2, so the last index (where suff == patlen) is not visited.
for (int32_t i = patlen - /*1*/ 2; i >= 0; i -= 1) {
if (suff[i] == i + 1) {
// this matching suffix is a prefix of the pattern
int32_t prefixSkip = badCharacterTable.minLengthInChars(i + 1);
// for any mis-match before this suffix, we should skip
// so that the front of the pattern (i.e. the prefix)
// lines up with the front of the suffix.
// (patlen - 1 - i) is the start of the suffix
while (prefix < patlen - 1 - i) {
// value of maxSkip means never set...
if (goodSuffixTable[prefix] == maxSkip) {
goodSuffixTable[prefix] = prefixSkip;
}
prefix += 1;
}
}
}
// For each index i, store a shift at the position just before a matched
// suffix of length suff[i]; larger i overwrite entries written by smaller i.
for (int32_t i = 0; i < patlen - 1; i += 1) {
goodSuffixTable[patlen - 1 - suff[i]] = badCharacterTable.minLengthInChars(i + 1);
}
DELETE_ARRAY(suff);
}
// Frees the shift table (may be NULL if construction returned early).
GoodSuffixTable::~GoodSuffixTable()
{
DELETE_ARRAY(goodSuffixTable);
}
// Returns the shift stored for pattern index `offset`.
// `offset` is not range-checked, and the table is NULL if the constructor
// returned early, so only call this on a successfully built table.
int32_t GoodSuffixTable::operator[](int32_t offset) const
{
return goodSuffixTable[offset];
}
// ICU macro invocation for BoyerMooreSearch; presumably supplies the class-ID
// (RTTI) method definitions declared in the header — confirm against uobject.h.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BoyerMooreSearch)
// TRUE when the pattern produced no collation elements.
// NOTE(review): patCEs is dereferenced unchecked; it stays NULL if the
// constructor returned before allocating it.
UBool BoyerMooreSearch::empty()
{
return patCEs->size() <= 0;
}
// Returns the CollData pointer passed to the constructor.
CollData *BoyerMooreSearch::getData()
{
return data;
}
// Returns the pattern's CE list; still owned by this object (deleted in the
// destructor). NULL if construction returned before creating it.
CEList *BoyerMooreSearch::getPatternCEs()
{
return patCEs;
}
// Returns the bad-character table; still owned by this object. NULL if
// construction returned before creating it.
BadCharacterTable *BoyerMooreSearch::getBadCharacterTable()
{
return badCharacterTable;
}
// Returns the good-suffix table; still owned by this object. NULL if
// construction returned before creating it.
GoodSuffixTable *BoyerMooreSearch::getGoodSuffixTable()
{
return goodSuffixTable;
}
/**
 * Prepares a collation-aware Boyer-Moore search for `patternString`.
 *
 * Builds, in order, the pattern's CE list, the bad-character table, the
 * good-suffix table and (when `targetString` is non-NULL) the Target.
 * Construction stops at the first step that fails; pieces already built
 * stay owned by the object and are released by the destructor.
 *
 * @param theData       collation data providing the collator; not owned
 * @param patternString the pattern to search for (copied)
 * @param targetString  text to search, or NULL to set it later with
 *                      setTargetString()
 * @param status        nothing is done if already a failure. Set to
 *                      U_ILLEGAL_ARGUMENT_ERROR when theData is NULL and to
 *                      U_MEMORY_ALLOCATION_ERROR when any allocation returns
 *                      NULL; failures from the sub-objects are passed through.
 */
BoyerMooreSearch::BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString,
                                   UErrorCode &status)
    : data(theData), patCEs(NULL), badCharacterTable(NULL), goodSuffixTable(NULL), pattern(patternString), target(NULL)
{
    if (U_FAILURE(status)) {
        return;
    }

    if (data == NULL) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    UCollator *collator = data->getCollator();

    patCEs = new CEList(collator, patternString, status);

    if (patCEs == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    if (U_FAILURE(status)) {
        return;
    }

    badCharacterTable = new BadCharacterTable(*patCEs, data, status);

    if (badCharacterTable == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    if (U_FAILURE(status)) {
        return;
    }

    goodSuffixTable = new GoodSuffixTable(*patCEs, *badCharacterTable, status);

    if (goodSuffixTable == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    // Do not go on to build a Target with a failed status: Target's
    // initializer list dereferences the result of getNFDInstance(status).
    if (U_FAILURE(status)) {
        return;
    }

    if (targetString != NULL) {
        target = new Target(collator, targetString, patCEs->size(), status);

        if (target == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }
}
// Deletes the owned Target, shift tables and pattern CE list. The CollData
// passed to the constructor is not deleted here.
BoyerMooreSearch::~BoyerMooreSearch()
{
delete target;
delete goodSuffixTable;
delete badCharacterTable;
delete patCEs;
}
void BoyerMooreSearch::setTargetString(const UnicodeString *targetString, UErrorCode &status)
{
if (U_FAILURE(status)) {
return;
}
if (target == NULL) {
target = new Target(data->getCollator(), targetString, patCEs->size(), status);
} else {
target->setTargetString(targetString);
}
}
// **** main flow of this code from Laura Werner's "Unicode Text Searching in Java" paper. ****
/*
* TODO:
* * deal with trailing (and leading?) ignorables.
* * Adding BoyerMooreSearch object slowed it down. How can we speed it up?
*/
/**
 * Searches the target text for the pattern, starting at `offset`.
 *
 * The pattern's CEs are compared right to left against CEs read backward
 * from the target. On a mismatch the candidate position advances by the
 * larger of the bad-character and good-suffix shifts. A full CE match is
 * accepted only if both ends fall on character break boundaries and, at
 * identical strength, the NFD forms of the matched text and pattern agree.
 *
 * @param offset index in the target at which to start searching
 * @param start  receives the start of the match, or -1 if none
 * @param end    receives the limit of the match, or -1 if none
 * @return TRUE if a match was found. FALSE if there is no match, the pattern
 *         is empty, no target has been set, or construction did not complete.
 */
UBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end)
{
    // Nothing can be searched without a target or if construction stopped early.
    if (patCEs == NULL || badCharacterTable == NULL || goodSuffixTable == NULL || target == NULL) {
        start = end = -1;
        return FALSE;
    }

    int32_t plen = patCEs->size();

    // Checked before the tables are consulted: for an empty pattern the
    // tables were never populated.
    if (plen <= 0) {
        // Searching for a zero length pattern always fails.
        start = end = -1;
        return FALSE;
    }

    int32_t tlen = target->stringLength();
    int32_t maxSkip = badCharacterTable->getMaxSkip();
    int32_t tOffset = offset + maxSkip;

    // NOTE(review): prevCE()/nextCE() return NULL when the CE buffer is full
    // or the index is not the next slot; the results below are dereferenced
    // without a check.
    while (tOffset <= tlen) {
        int32_t pIndex = plen - 1;
        int32_t tIndex = 0;
        int32_t lIndex = 0;

        if (tOffset < tlen) {
            // **** we really want to skip ahead enough to  ****
            // **** be sure we get at least 1 non-ignorable ****
            // **** CE after the end of the pattern.        ****
            int32_t next = target->nextSafeBoundary(tOffset + 1);

            target->setOffset(next);

            // Read CEs backward from the safe boundary until one reaches tOffset.
            for (lIndex = 0; ; lIndex += 1) {
                const CEI *cei = target->prevCE(lIndex);
                int32_t low = cei->lowOffset;
                int32_t high = cei->highOffset;

                if (high == 0 || (low < high && low <= tOffset)) {
                    if (low < tOffset) {
                        // This CE straddles tOffset: step back over every CE
                        // ending at the same offset and move tOffset up to it.
                        while (lIndex >= 0 && target->prevCE(lIndex)->highOffset == high) {
                            lIndex -= 1;
                        }

                        if (high > tOffset) {
                            tOffset = high;
                        }
                    }

                    break;
                }
            }
        } else {
            // At the very end of the target: seed the buffer with an end marker.
            target->setLast(tOffset);
            lIndex = 0;
        }

        tIndex = ++lIndex;

        // Iterate backward until we hit the beginning of the pattern
        while (pIndex >= 0) {
            uint32_t pce = (*patCEs)[pIndex];
            const CEI *tcei = target->prevCE(tIndex++);

            if (tcei->order != pce) {
                // There is a mismatch at this position.  Decide how far
                // over to shift the pattern, then try again.
                int32_t gsOffset = tOffset + (*goodSuffixTable)[pIndex];
#ifdef EXTRA_CAUTIOUS
                int32_t old = tOffset;
#endif

                tOffset += (*badCharacterTable)[tcei->order] - badCharacterTable->minLengthInChars(pIndex + 1);

                if (gsOffset > tOffset) {
                    tOffset = gsOffset;
                }

#ifdef EXTRA_CAUTIOUS
                // Make sure we don't skip backwards...
                if (tOffset <= old) {
                    tOffset = old + 1;
                }
#endif

                break;
            }

            pIndex -= 1;
        }

        if (pIndex < 0) {
            // We made it back to the beginning of the pattern,
            // which means we matched it all.  Return the location.
            const CEI firstCEI = *target->prevCE(tIndex - 1);
            const CEI lastCEI  = *target->prevCE(lIndex);
            int32_t mStart   = firstCEI.lowOffset;
            int32_t minLimit = lastCEI.lowOffset;
            int32_t maxLimit = lastCEI.highOffset;
            int32_t mLimit;
            UBool found = TRUE;

            // Look at the first CE after the match to see how far the match may extend.
            target->setOffset(/*tOffset*/maxLimit);

            const CEI nextCEI = *target->nextCE(0);

            if (nextCEI.lowOffset > maxLimit) {
                maxLimit = nextCEI.lowOffset;
            }

            // Reject if the following CE has equal low and high offsets and
            // is not the end marker.
            if (nextCEI.lowOffset == nextCEI.highOffset && nextCEI.order != (uint32_t)UCOL_NULLORDER) {
                found = FALSE;
            }

            if (! target->isBreakBoundary(mStart)) {
                found = FALSE;
            }

            if (firstCEI.lowOffset == firstCEI.highOffset) {
                found = FALSE;
            }

            mLimit = maxLimit;
            if (minLimit < maxLimit) {
                int32_t nbb = target->nextBreakBoundary(minLimit);

                if (nbb >= lastCEI.highOffset) {
                    mLimit = nbb;
                }
            }

            if (mLimit > maxLimit) {
                found = FALSE;
            }

            if (! target->isBreakBoundary(mLimit)) {
                found = FALSE;
            }

            if (! target->isIdentical(pattern, mStart, mLimit)) {
                found = FALSE;
            }

            if (found) {
                start = mStart;
                end   = mLimit;

                return TRUE;
            }

            tOffset += (*goodSuffixTable)[0]; // really? Maybe += 1 or += maxSkip?
        }
        // Otherwise, we're here because of a mismatch, so keep going....
    }

    // no match
    start = -1;
    end = -1;
    return FALSE;
}
U_NAMESPACE_END
#endif // #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION