c6385f6209
X-SVN-Rev: 26466
865 lines
22 KiB
C++
865 lines
22 KiB
C++
/*
|
|
******************************************************************************
|
|
* Copyright (C) 1996-2009, International Business Machines *
|
|
* Corporation and others. All Rights Reserved. *
|
|
******************************************************************************
|
|
*/
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
|
|
|
|
#include "unicode/unistr.h"
|
|
#include "unicode/putil.h"
|
|
#include "unicode/usearch.h"
|
|
|
|
#include "cmemory.h"
|
|
#include "unicode/coll.h"
|
|
#include "unicode/tblcoll.h"
|
|
#include "unicode/coleitr.h"
|
|
#include "unicode/ucoleitr.h"
|
|
|
|
#include "unicode/regex.h" // TODO: make conditional on regexp being built.
|
|
|
|
#include "unicode/uniset.h"
|
|
#include "unicode/uset.h"
|
|
#include "unicode/ustring.h"
|
|
#include "hash.h"
|
|
#include "uhash.h"
|
|
#include "ucol_imp.h"
|
|
#include "unormimp.h"
|
|
|
|
#include "unicode/colldata.h"
|
|
#include "unicode/bmsearch.h"
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
// Number of elements in a true array (must not be used on a pointer).
// The argument is parenthesized so that any array-valued expression works.
#define ARRAY_SIZE(array) (sizeof(array)/sizeof((array)[0]))

// Allocates raw, uninitialized storage for `count` objects of `type` through
// uprv_malloc(). No constructors run; callers in this file compare the result
// against NULL before using it.
#define NEW_ARRAY(type, count) ((type *) uprv_malloc((count) * sizeof(type)))

// Releases storage obtained with NEW_ARRAY through uprv_free().
#define DELETE_ARRAY(array) uprv_free((void *) (array))
|
|
|
|
|
|
// One buffered collation element together with the collation-iterator
// offsets that were sampled on either side of fetching it
// (see Target::nextCE() and Target::prevCE()).
struct CEI
{
    uint32_t order;       // CE value after strength masking / shifting
    int32_t  lowOffset;   // iterator offset on the low (leading) side of the CE
    int32_t  highOffset;  // iterator offset on the high (trailing) side of the CE
};
|
|
|
|
class Target : public UMemory
|
|
{
|
|
public:
|
|
Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status);
|
|
~Target();
|
|
|
|
void setTargetString(const UnicodeString *target);
|
|
|
|
const CEI *nextCE(int32_t offset);
|
|
const CEI *prevCE(int32_t offset);
|
|
|
|
int32_t stringLength();
|
|
UChar charAt(int32_t offset);
|
|
|
|
UBool isBreakBoundary(int32_t offset);
|
|
int32_t nextBreakBoundary(int32_t offset);
|
|
int32_t nextSafeBoundary(int32_t offset);
|
|
|
|
UBool isIdentical(UnicodeString &pattern, int32_t start, int32_t end);
|
|
|
|
void setOffset(int32_t offset);
|
|
void setLast(int32_t last);
|
|
int32_t getOffset();
|
|
|
|
private:
|
|
CEI *ceb;
|
|
int32_t bufferSize;
|
|
int32_t bufferMin;
|
|
int32_t bufferMax;
|
|
|
|
uint32_t strengthMask;
|
|
UCollationStrength strength;
|
|
uint32_t variableTop;
|
|
UBool toShift;
|
|
UCollator *coll;
|
|
|
|
const UnicodeString *targetString;
|
|
const UChar *targetBuffer;
|
|
int32_t targetLength;
|
|
|
|
UCollationElements *elements;
|
|
UBreakIterator *charBreakIterator;
|
|
};
|
|
|
|
// Builds a Target for `theCollator`, sizing the CE buffer from the pattern
// length and the collator's largest expansion, then attaches `target` if given.
// On allocation failure `status` is set to U_MEMORY_ALLOCATION_ERROR and the
// constructor returns before the target string or strength mask are set up.
Target::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status)
    : bufferSize(0), bufferMin(0), bufferMax(0),
      strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator),
      targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL)
{
    // Cache the collator settings that nextCE()/prevCE() consult for every CE.
    strength    = ucol_getStrength(coll);
    toShift     = (ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED);
    variableTop = ucol_getVariableTop(coll, &status);

    // Scan the zero-terminated expansion size table for its largest entry.
    uint8_t widest = 0;
    const uint8_t *sizes = coll->expansionCESize;

    while (*sizes != 0) {
        if (widest < *sizes) {
            widest = *sizes;
        }

        sizes += 1;
    }

    // Pattern CEs, plus room for one expanding character at each end,
    // plus 4 spare slots for safety.
    bufferSize = patternLength + (2 * widest) + 4;
    ceb        = NEW_ARRAY(CEI, bufferSize);

    if (ceb == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    if (target != NULL) {
        setTargetString(target);
    }

    // The primary weight always takes part in comparisons; secondary and
    // tertiary weights are added as the strength increases.
    strengthMask |= UCOL_PRIMARYORDERMASK;

    if (strength != UCOL_PRIMARY) {
        strengthMask |= UCOL_SECONDARYORDERMASK;

        if (strength != UCOL_SECONDARY) {
            strengthMask |= UCOL_TERTIARYORDERMASK;
        }
    }
}
|
|
|
|
// Releases everything the Target owns: both iterators and the CE buffer.
// The collator and the target string belong to the caller and are left alone.
Target::~Target()
{
    ubrk_close(charBreakIterator);
    ucol_closeElements(elements);

    DELETE_ARRAY(ceb);
}
|
|
|
|
void Target::setTargetString(const UnicodeString *target)
|
|
{
|
|
if (charBreakIterator != NULL) {
|
|
ubrk_close(charBreakIterator);
|
|
ucol_closeElements(elements);
|
|
}
|
|
|
|
targetString = target;
|
|
|
|
if (targetString != NULL) {
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
targetBuffer = targetString->getBuffer();
|
|
targetLength = targetString->length();
|
|
|
|
elements = ucol_openElements(coll, target->getBuffer(), target->length(), &status);
|
|
ucol_forceHanImplicit(elements, &status);
|
|
|
|
charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
|
|
targetBuffer, targetLength, &status);
|
|
} else {
|
|
targetBuffer = NULL;
|
|
targetLength = 0;
|
|
}
|
|
}
|
|
|
|
// Returns the CE stored at buffer index `offset`, reading forward.
//
// If `offset` is already buffered, the stored entry is returned. If `offset`
// is exactly the next free slot and the buffer is not full, the next
// non-ignorable CE is pulled from the collation iterator, stored and returned.
// In every other case the result is NULL.
const CEI *Target::nextCE(int32_t offset)
{
    if (offset >= bufferMin && offset < bufferMax) {
        return &ceb[offset];
    }

    if (bufferMax >= bufferSize || offset != bufferMax) {
        return NULL;
    }

    UErrorCode status = U_ZERO_ERROR;
    int32_t before = -1, after = -1;
    uint32_t ce;
    UBool continuation = FALSE;

    for (;;) {
        before = ucol_getOffset(elements);
        ce     = ucol_next(elements, &status);
        after  = ucol_getOffset(elements);

        if (ce == (uint32_t)UCOL_NULLORDER) {
            // end of text: store the marker as-is
            break;
        }

        continuation = isContinuation(ce);
        ce &= strengthMask;

        // With shifted alternate handling, CEs below variableTop keep only
        // their primary weight at quaternary strength or above and are
        // otherwise treated as ignorable.
        if (toShift && variableTop > ce && (ce & UCOL_PRIMARYORDERMASK) != 0) {
            if (strength >= UCOL_QUATERNARY) {
                ce &= UCOL_PRIMARYORDERMASK;
            } else {
                ce = UCOL_IGNORABLE;
            }
        }

        if (ce != UCOL_IGNORABLE) {
            break;
        }
    }

    if (continuation) {
        ce |= UCOL_CONTINUATION_MARKER;
    }

    CEI *slot = &ceb[offset];

    slot->order      = ce;
    slot->lowOffset  = before;
    slot->highOffset = after;

    bufferMax += 1;

    return slot;
}
|
|
|
|
// Returns the CE stored at buffer index `offset`, reading backward.
//
// Mirrors nextCE(): a buffered entry is returned directly; when `offset` is
// the next free slot and the buffer is not full, the previous non-ignorable
// CE is pulled from the collation iterator, stored and returned. Otherwise
// the result is NULL.
const CEI *Target::prevCE(int32_t offset)
{
    if (offset >= bufferMin && offset < bufferMax) {
        return &ceb[offset];
    }

    if (bufferMax >= bufferSize || offset != bufferMax) {
        return NULL;
    }

    UErrorCode status = U_ZERO_ERROR;
    int32_t lower = -1, upper = -1;
    uint32_t ce;
    UBool continuation = FALSE;

    for (;;) {
        // Moving backward, the offset before the fetch is the high side.
        upper = ucol_getOffset(elements);
        ce    = ucol_previous(elements, &status);
        lower = ucol_getOffset(elements);

        if (ce == (uint32_t)UCOL_NULLORDER) {
            // start of text: store the marker as-is
            break;
        }

        continuation = isContinuation(ce);
        ce &= strengthMask;

        // Same variable-CE treatment as in nextCE().
        if (toShift && variableTop > ce && (ce & UCOL_PRIMARYORDERMASK) != 0) {
            if (strength >= UCOL_QUATERNARY) {
                ce &= UCOL_PRIMARYORDERMASK;
            } else {
                ce = UCOL_IGNORABLE;
            }
        }

        if (ce != UCOL_IGNORABLE) {
            break;
        }
    }

    bufferMax += 1;

    if (continuation) {
        ce |= UCOL_CONTINUATION_MARKER;
    }

    CEI *slot = &ceb[offset];

    slot->order      = ce;
    slot->lowOffset  = lower;
    slot->highOffset = upper;

    return slot;
}
|
|
|
|
// Length of the current target string in UChars; 0 when no target is set.
int32_t Target::stringLength()
{
    return (targetString != NULL) ? targetLength : 0;
}
|
|
|
|
// UChar at index `offset` of the target buffer; 0x0000 when no target is set.
// `offset` is not range-checked here.
UChar Target::charAt(int32_t offset)
{
    if (targetString == NULL) {
        return 0x0000;
    }

    return targetBuffer[offset];
}
|
|
|
|
// Discards all buffered CEs and moves the collation iterator to `offset`.
// Any error reported by ucol_setOffset() is ignored.
void Target::setOffset(int32_t offset)
{
    bufferMin = 0;
    bufferMax = 0;

    UErrorCode ignored = U_ZERO_ERROR;

    ucol_setOffset(elements, offset, &ignored);
}
|
|
|
|
// Resets the CE buffer so that slot 0 holds a UCOL_NULLORDER marker located
// at `last`, then moves the collation iterator to `last`.
// Any error reported by ucol_setOffset() is ignored.
void Target::setLast(int32_t last)
{
    CEI *marker = &ceb[0];

    marker->order      = UCOL_NULLORDER;
    marker->lowOffset  = last;
    marker->highOffset = last;

    // exactly one valid entry: the marker
    bufferMin = 0;
    bufferMax = 1;

    UErrorCode ignored = U_ZERO_ERROR;

    ucol_setOffset(elements, last, &ignored);
}
|
|
|
|
// Current position of the collation element iterator in the target text.
int32_t Target::getOffset()
{
    const int32_t position = ucol_getOffset(elements);

    return position;
}
|
|
|
|
// Asks the character break iterator whether `offset` is a boundary.
UBool Target::isBreakBoundary(int32_t offset)
{
    const UBool onBoundary = ubrk_isBoundary(charBreakIterator, offset);

    return onBoundary;
}
|
|
|
|
// Returns what ubrk_following() reports for `offset` on the character
// break iterator, i.e. the boundary following that position.
int32_t Target::nextBreakBoundary(int32_t offset)
{
    const int32_t boundary = ubrk_following(charBreakIterator, offset);

    return boundary;
}
|
|
|
|
// Starting at `offset`, returns the first index whose UChar is either a lead
// surrogate or not flagged by ucol_unsafeCP() for this collator. If no such
// index exists before the end of the target, the target length is returned.
int32_t Target::nextSafeBoundary(int32_t offset)
{
    for (; offset < targetLength; offset += 1) {
        const UChar unit = targetBuffer[offset];

        // keep scanning only over non-lead units that the collator marks unsafe
        if (!U_IS_LEAD(unit) && ucol_unsafeCP(unit, coll)) {
            continue;
        }

        return offset;
    }

    return targetLength;
}
|
|
|
|
// Identical-strength check for a candidate match.
//
// pattern:    the pattern string.
// start, end: UChar range [start, end) of the candidate match in the target.
//
// Below UCOL_IDENTICAL strength this always returns TRUE. Otherwise both the
// target range and the whole pattern are decomposed with unorm_decompose()
// and the results must have the same length and contents. Returns FALSE if
// decomposition or the fallback allocation fails.
UBool Target::isIdentical(UnicodeString &pattern, int32_t start, int32_t end)
{
    if (strength < UCOL_IDENTICAL) {
        return TRUE;
    }

    UChar targetStack[32], patternStack[32];
    const UChar *patChars    = pattern.getBuffer();
    const int32_t patCount   = pattern.length();
    const UChar *rangeChars  = targetBuffer + start;
    const int32_t rangeCount = end - start;

    UErrorCode status = U_ZERO_ERROR, patStatus = U_ZERO_ERROR;

    // First attempt: decompose into the stack buffers.
    int32_t nfdLength = unorm_decompose(targetStack, ARRAY_SIZE(targetStack),
                                        rangeChars, rangeCount,
                                        FALSE, 0, &status);

    // The pattern gets its own status so an overflow here does not mask
    // the outcome of the target decomposition.
    const int32_t patNfdLength = unorm_decompose(patternStack, ARRAY_SIZE(patternStack),
                                                 patChars, patCount,
                                                 FALSE, 0, &patStatus);

    if (nfdLength != patNfdLength) {
        return FALSE; // different decomposed lengths can never be identical
    }

    UChar *heap = NULL;       // set only when the stack buffers were too small
    UChar *textNfd, *patNfd;

    if (U_SUCCESS(status)) {
        textNfd = targetStack;
        patNfd  = patternStack;
    } else if (status == U_BUFFER_OVERFLOW_ERROR) {
        status = U_ZERO_ERROR;

        // A single allocation holds both decompositions back to back.
        heap = NEW_ARRAY(UChar, nfdLength * 2);

        if (heap == NULL) {
            return FALSE;
        }

        textNfd = heap;
        patNfd  = heap + nfdLength;

        unorm_decompose(textNfd, nfdLength, rangeChars,
                        rangeCount, FALSE, 0, &status);

        unorm_decompose(patNfd, nfdLength, patChars,
                        patCount, FALSE, 0, &status);
    } else {
        // Decomposition failed outright: compare nothing, so u_memcmp()
        // cannot read past the stack buffers; the failed status below
        // turns the answer into FALSE.
        textNfd = patNfd = targetStack;
        nfdLength = 0;
    }

    const UBool sameContents = (UBool)(u_memcmp(patNfd, textNfd, nfdLength) == 0);

    if (heap != NULL) {
        DELETE_ARRAY(heap);
    }

    return U_SUCCESS(status) && sameContents;
}
|
|
|
|
// Number of slots in BadCharacterTable's skip table; BadCharacterTable::hash()
// reduces a CE's primary weight modulo this value.
#define HASH_TABLE_SIZE 257
|
|
|
|
class BadCharacterTable : public UMemory
|
|
{
|
|
public:
|
|
BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status);
|
|
~BadCharacterTable();
|
|
|
|
int32_t operator[](uint32_t ce) const;
|
|
int32_t getMaxSkip() const;
|
|
int32_t minLengthInChars(int32_t index);
|
|
|
|
private:
|
|
static int32_t hash(uint32_t ce);
|
|
|
|
int32_t maxSkip;
|
|
int32_t badCharacterTable[HASH_TABLE_SIZE];
|
|
|
|
int32_t *minLengthCache;
|
|
};
|
|
|
|
// Builds the bad-character table for `patternCEs`.
//
// patternCEs: the pattern's collation elements.
// data:       provides minLengthInChars() for each suffix of the pattern.
// status:     in/out error code; set to U_MEMORY_ALLOCATION_ERROR when a
//             working array cannot be allocated.
//
// On every early exit (incoming failure, empty pattern, allocation failure)
// the object is still left in a defined state: maxSkip is 0, every table slot
// is 0 and minLengthCache is NULL.
BadCharacterTable::BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status)
    : maxSkip(0), minLengthCache(NULL)
{
    // Define the table contents before any early return, so getMaxSkip()
    // and operator[] never read indeterminate values.
    for (int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {
        badCharacterTable[j] = 0;
    }

    int32_t plen = patternCEs.size();

    // **** need a better way to deal with this ****
    if (U_FAILURE(status) || plen <= 0) {
        return;
    }

    // scratch array handed to CollData::minLengthInChars()
    int32_t *history = NEW_ARRAY(int32_t, plen);

    if (history == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    for (int32_t i = 0; i < plen; i += 1) {
        history[i] = -1;
    }

    // one extra entry so that index plen (the empty suffix) is valid
    minLengthCache = NEW_ARRAY(int32_t, plen + 1);

    if (minLengthCache == NULL) {
        DELETE_ARRAY(history);
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    maxSkip = minLengthCache[0] = data->minLengthInChars(&patternCEs, 0, history);

    // CEs that never occur in the pattern allow the largest shift.
    for (int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {
        badCharacterTable[j] = maxSkip;
    }

    for (int32_t p = 1; p < plen; p += 1) {
        minLengthCache[p] = data->minLengthInChars(&patternCEs, p, history);

        // Make sure this entry is not bigger than the previous one.
        // Otherwise, we might skip too far in some cases.
        if (minLengthCache[p] < 0 || minLengthCache[p] > minLengthCache[p - 1]) {
            minLengthCache[p] = minLengthCache[p - 1];
        }
    }

    minLengthCache[plen] = 0;

    // The last pattern CE is excluded; later occurrences of a CE overwrite
    // earlier ones that hash to the same slot.
    for (int32_t p = 0; p < plen - 1; p += 1) {
        badCharacterTable[hash(patternCEs[p])] = minLengthCache[p + 1];
    }

    DELETE_ARRAY(history);
}
|
|
|
|
// Frees the cached minimum lengths (the pointer may be NULL when the table
// was never fully built).
BadCharacterTable::~BadCharacterTable()
{
    DELETE_ARRAY(minLengthCache);
}
|
|
|
|
// Looks up the shift stored in the slot that `ce` hashes to.
int32_t BadCharacterTable::operator[](uint32_t ce) const
{
    const int32_t slot = hash(ce);

    return badCharacterTable[slot];
}
|
|
|
|
// Largest shift in the table: the value every slot starts out with.
int32_t BadCharacterTable::getMaxSkip() const
{
    return this->maxSkip;
}
|
|
|
|
// Cached minimum length for the pattern suffix starting at CE `index`.
// `index` is not range-checked and the cache must have been built.
int32_t BadCharacterTable::minLengthInChars(int32_t index)
{
    const int32_t *cache = minLengthCache;

    return cache[index];
}
|
|
|
|
// Maps a CE to a table slot using only its primary weight.
int32_t BadCharacterTable::hash(uint32_t ce)
{
    const uint32_t primary = UCOL_PRIMARYORDER(ce);

    return primary % HASH_TABLE_SIZE;
}
|
|
|
|
class GoodSuffixTable : public UMemory
|
|
{
|
|
public:
|
|
GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status);
|
|
~GoodSuffixTable();
|
|
|
|
int32_t operator[](int32_t offset) const;
|
|
|
|
private:
|
|
int32_t *goodSuffixTable;
|
|
};
|
|
|
|
// Builds the good-suffix table for `patternCEs`.
//
// patternCEs:        the pattern's collation elements.
// badCharacterTable: source of the maximum skip and of the cached minimum
//                    character lengths of pattern suffixes.
// status:            in/out error code; set to U_MEMORY_ALLOCATION_ERROR when
//                    a working array cannot be allocated.
//
// If `status` already signals failure or the pattern is empty, nothing is
// built and the table pointer stays NULL.
GoodSuffixTable::GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status)
    : goodSuffixTable(NULL)
{
    const int32_t patlen = patternCEs.size();

    // **** need a better way to deal with this ****
    if (U_FAILURE(status) || patlen <= 0) {
        return;
    }

    // suffixLen[i] = length of the longest run of CEs ending at i that
    // matches a suffix of the pattern
    int32_t *suffixLen = NEW_ARRAY(int32_t, patlen);

    if (suffixLen == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    const int32_t last    = patlen - 1;
    const int32_t maxSkip = badCharacterTable.getMaxSkip();
    int32_t matchStart = last, matchEnd = -1;

    // initialize suffixLen
    suffixLen[last] = patlen;

    for (int32_t i = patlen - 2; i >= 0; i -= 1) {
        // (i > matchStart) means we're inside the last suffix match we found
        // (last - matchEnd) is how far the end of that match is from end of pattern
        // (i - matchStart) is how far we are from start of that match
        // (i + last - matchEnd) is index of same character at end of pattern
        // so if any suffix match at that character doesn't extend beyond the last match,
        // it's the suffix for this character as well
        if (i > matchStart && suffixLen[i + last - matchEnd] < i - matchStart) {
            suffixLen[i] = suffixLen[i + last - matchEnd];
        } else {
            matchStart = matchEnd = i;

            // walk backward from i and from the end of the pattern in step
            int32_t tail = last;

            while (matchStart >= 0 && patternCEs[matchStart] == patternCEs[tail]) {
                matchStart -= 1;
                tail       -= 1;
            }

            suffixLen[i] = matchEnd - matchStart;
        }
    }

    // now build goodSuffixTable
    goodSuffixTable = NEW_ARRAY(int32_t, patlen);

    if (goodSuffixTable == NULL) {
        DELETE_ARRAY(suffixLen);
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    // initialize entries to minLengthInChars of the pattern
    for (int32_t k = 0; k < patlen; k += 1) {
        goodSuffixTable[k] = maxSkip;
    }

    int32_t filled = 0;

    for (int32_t i = patlen - 2; i >= 0; i -= 1) {
        if (suffixLen[i] != i + 1) {
            continue;
        }

        // this matching suffix is a prefix of the pattern
        const int32_t prefixSkip = badCharacterTable.minLengthInChars(i + 1);

        // for any mis-match before this suffix, we should skip
        // so that the front of the pattern (i.e. the prefix)
        // lines up with the front of the suffix.
        // (last - i) is the start of the suffix
        for (; filled < last - i; filled += 1) {
            // value of maxSkip means never set...
            if (goodSuffixTable[filled] == maxSkip) {
                goodSuffixTable[filled] = prefixSkip;
            }
        }
    }

    for (int32_t i = 0; i < last; i += 1) {
        goodSuffixTable[last - suffixLen[i]] = badCharacterTable.minLengthInChars(i + 1);
    }

    DELETE_ARRAY(suffixLen);
}
|
|
|
|
// Frees the shift array (the pointer may be NULL when nothing was built).
GoodSuffixTable::~GoodSuffixTable()
{
    DELETE_ARRAY(goodSuffixTable);
}
|
|
|
|
// Shift recorded for pattern CE index `offset`; the index is not range-checked.
int32_t GoodSuffixTable::operator[](int32_t offset) const
{
    const int32_t *shifts = goodSuffixTable;

    return shifts[offset];
}
|
|
|
|
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BoyerMooreSearch)
|
|
|
|
|
|
// TRUE when the pattern produced no collation elements. A searcher whose
// construction failed before the CE list was created (patCEs is NULL) is
// reported as empty as well, instead of dereferencing a NULL pointer.
UBool BoyerMooreSearch::empty()
{
    return patCEs == NULL || patCEs->size() <= 0;
}
|
|
|
|
// The CollData object this searcher was constructed with.
CollData *BoyerMooreSearch::getData()
{
    return this->data;
}
|
|
|
|
// The pattern's collation element list; NULL if construction did not get
// far enough to create it.
CEList *BoyerMooreSearch::getPatternCEs()
{
    return this->patCEs;
}
|
|
|
|
// The bad-character shift table built for the pattern (may be NULL).
BadCharacterTable *BoyerMooreSearch::getBadCharacterTable()
{
    return this->badCharacterTable;
}
|
|
|
|
// The good-suffix shift table built for the pattern (may be NULL).
GoodSuffixTable *BoyerMooreSearch::getGoodSuffixTable()
{
    return this->goodSuffixTable;
}
|
|
|
|
// Prepares a search for `patternString`.
//
// theData:       collation data; also supplies the collator. Must not be NULL.
// patternString: the pattern to look for (copied into `pattern`).
// targetString:  text to search, or NULL to supply it later through
//                setTargetString().
// status:        in/out error code. Nothing is built when it already signals
//                failure. Set to U_ILLEGAL_ARGUMENT_ERROR when theData is
//                NULL and to U_MEMORY_ALLOCATION_ERROR when any helper object
//                cannot be allocated.
//
// Construction stops at the first failure; members not yet created stay NULL.
BoyerMooreSearch::BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString,
                                   UErrorCode &status)
    : data(theData), patCEs(NULL), badCharacterTable(NULL), goodSuffixTable(NULL), pattern(patternString), target(NULL)
{
    if (U_FAILURE(status)) {
        return;
    }

    if (data == NULL) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    UCollator *collator = data->getCollator();

    patCEs = new CEList(collator, patternString, status);

    // A NULL result means the allocation itself failed; report it rather
    // than returning with a successful status and a half-built object.
    if (patCEs == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }

    if (U_FAILURE(status)) {
        return;
    }

    badCharacterTable = new BadCharacterTable(*patCEs, data, status);

    if (badCharacterTable == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }

    if (U_FAILURE(status)) {
        return;
    }

    goodSuffixTable = new GoodSuffixTable(*patCEs, *badCharacterTable, status);

    if (goodSuffixTable == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }

    if (U_FAILURE(status)) {
        return;
    }

    if (targetString != NULL) {
        target = new Target(collator, targetString, patCEs->size(), status);

        if (target == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }
}
|
|
|
|
// Destroys the helper objects created by the constructor and by
// setTargetString(). The CollData object is not deleted here.
BoyerMooreSearch::~BoyerMooreSearch()
{
    delete target;

    // shift tables, then the CE list they were built from
    delete goodSuffixTable;
    delete badCharacterTable;
    delete patCEs;
}
|
|
|
|
void BoyerMooreSearch::setTargetString(const UnicodeString *targetString, UErrorCode &status)
|
|
{
|
|
if (U_FAILURE(status)) {
|
|
return;
|
|
}
|
|
|
|
if (target == NULL) {
|
|
target = new Target(data->getCollator(), targetString, patCEs->size(), status);
|
|
} else {
|
|
target->setTargetString(targetString);
|
|
}
|
|
}
|
|
|
|
// **** main flow of this code from Laura Werner's "Unicode Text Searching in Java" paper. ****
|
|
/*
|
|
* TODO:
|
|
* * deal with trailing (and leading?) ignorables.
|
|
* * Adding BoyerMooreSearch object slowed it down. How can we speed it up?
|
|
*/
|
|
// Searches the target for the pattern, starting at `offset`.
//
// offset: UChar index in the target at which to start looking.
// start:  receives the start index of the match, or -1 when there is none.
// end:    receives the limit index of the match, or -1 when there is none.
//
// Returns TRUE when a match was found. Returns FALSE (with start == end == -1)
// when there is no match, when the pattern has no collation elements, or when
// the searcher is incomplete (no target set, or construction failed).
UBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end)
{
    // An incomplete searcher cannot match anything; bail out before any
    // of the helper objects is dereferenced.
    if (patCEs == NULL || badCharacterTable == NULL || goodSuffixTable == NULL || target == NULL) {
        start = end = -1;
        return FALSE;
    }

    int32_t plen = patCEs->size();

    // This check comes before the shift tables are consulted: they are not
    // built for an empty pattern.
    if (plen <= 0) {
        // Searching for a zero length pattern always fails.
        start = end = -1;
        return FALSE;
    }

    int32_t tlen = target->stringLength();
    int32_t maxSkip = badCharacterTable->getMaxSkip();
    int32_t tOffset = offset + maxSkip;

    while (tOffset <= tlen) {
        int32_t pIndex = plen - 1;
        int32_t tIndex = 0;
        int32_t lIndex = 0;

        if (tOffset < tlen) {
            // **** we really want to skip ahead enough to  ****
            // **** be sure we get at least 1 non-ignorable ****
            // **** CE after the end of the pattern.        ****
            int32_t next = target->nextSafeBoundary(tOffset + 1);

            target->setOffset(next);

            // Read CEs backward from the safe boundary until reaching the
            // one that covers tOffset.
            for (lIndex = 0; ; lIndex += 1) {
                const CEI *cei = target->prevCE(lIndex);
                int32_t low = cei->lowOffset;
                int32_t high = cei->highOffset;

                if (high == 0 || (low < high && low <= tOffset)) {
                    if (low < tOffset) {
                        // tOffset falls inside this CE's source span: step
                        // back over every CE sharing the same high offset.
                        while (lIndex >= 0 && target->prevCE(lIndex)->highOffset == high) {
                            lIndex -= 1;
                        }

                        if (high > tOffset) {
                            tOffset = high;
                        }
                    }

                    break;
                }
            }
        } else {
            target->setLast(tOffset);
            lIndex = 0;
        }

        tIndex = ++lIndex;

        // Iterate backward until we hit the beginning of the pattern
        while (pIndex >= 0) {
            uint32_t pce = (*patCEs)[pIndex];
            const CEI *tcei = target->prevCE(tIndex++);

            if (tcei->order != pce) {
                // There is a mismatch at this position.  Decide how far
                // over to shift the pattern, then try again.

                int32_t gsOffset = tOffset + (*goodSuffixTable)[pIndex];
#ifdef EXTRA_CAUTIOUS
                int32_t old = tOffset;
#endif

                tOffset += (*badCharacterTable)[tcei->order] - badCharacterTable->minLengthInChars(pIndex + 1);

                // take the larger of the two proposed shifts
                if (gsOffset > tOffset) {
                    tOffset = gsOffset;
                }

#ifdef EXTRA_CAUTIOUS
                // Make sure we don't skip backwards...
                if (tOffset <= old) {
                    tOffset = old + 1;
                }
#endif

                break;
            }

            pIndex -= 1;
        }

        if (pIndex < 0) {
            // We made it back to the beginning of the pattern,
            // which means we matched it all. Return the location.
            const CEI firstCEI = *target->prevCE(tIndex - 1);
            const CEI lastCEI  = *target->prevCE(lIndex);
            int32_t mStart   = firstCEI.lowOffset;
            int32_t minLimit = lastCEI.lowOffset;
            int32_t maxLimit = lastCEI.highOffset;
            int32_t mLimit;
            UBool found = TRUE;

            target->setOffset(/*tOffset*/maxLimit);

            const CEI nextCEI = *target->nextCE(0);

            if (nextCEI.lowOffset > maxLimit) {
                maxLimit = nextCEI.lowOffset;
            }

            // The CE after the match has an empty source span but is not
            // the end-of-text marker: reject the candidate.
            if (nextCEI.lowOffset == nextCEI.highOffset && nextCEI.order != (uint32_t)UCOL_NULLORDER) {
                found = FALSE;
            }

            if (! target->isBreakBoundary(mStart)) {
                found = FALSE;
            }

            if (firstCEI.lowOffset == firstCEI.highOffset) {
                found = FALSE;
            }

            mLimit = maxLimit;
            if (minLimit < maxLimit) {
                int32_t nbb = target->nextBreakBoundary(minLimit);

                if (nbb >= lastCEI.highOffset) {
                    mLimit = nbb;
                }
            }

            if (mLimit > maxLimit) {
                found = FALSE;
            }

            if (! target->isBreakBoundary(mLimit)) {
                found = FALSE;
            }

            if (! target->isIdentical(pattern, mStart, mLimit)) {
                found = FALSE;
            }

            if (found) {
                start = mStart;
                end   = mLimit;

                return TRUE;
            }

            tOffset += (*goodSuffixTable)[0]; // really? Maybe += 1 or += maxSkip?
        }
        // Otherwise, we're here because of a mismatch, so keep going....
    }

    // no match
    start = -1;
    end = -1;
    return FALSE;
}
|
|
|
|
U_NAMESPACE_END
|
|
|
|
#endif // #if !UCONFIG_NO_COLLATION
|