/* ****************************************************************************** * Copyright (C) 1996-2012, International Business Machines * * Corporation and others. All Rights Reserved. * ****************************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION #include "unicode/unistr.h" #include "unicode/putil.h" #include "unicode/usearch.h" #include "cmemory.h" #include "unicode/coll.h" #include "unicode/tblcoll.h" #include "unicode/coleitr.h" #include "unicode/ucoleitr.h" #include "unicode/regex.h" // TODO: make conditional on regexp being built. #include "unicode/uniset.h" #include "unicode/uset.h" #include "unicode/ustring.h" #include "hash.h" #include "uhash.h" #include "ucol_imp.h" #include "normalizer2impl.h" #include "unicode/colldata.h" #include "unicode/bmsearch.h" U_NAMESPACE_BEGIN #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0])) #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) #define DELETE_ARRAY(array) uprv_free((void *) (array)) struct CEI { uint32_t order; int32_t lowOffset; int32_t highOffset; }; class Target : public UMemory { public: Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status); ~Target(); void setTargetString(const UnicodeString *target); const CEI *nextCE(int32_t offset); const CEI *prevCE(int32_t offset); int32_t stringLength(); UChar charAt(int32_t offset); UBool isBreakBoundary(int32_t offset); int32_t nextBreakBoundary(int32_t offset); int32_t nextSafeBoundary(int32_t offset); UBool isIdentical(UnicodeString &pattern, int32_t start, int32_t end); void setOffset(int32_t offset); void setLast(int32_t last); int32_t getOffset(); private: CEI *ceb; int32_t bufferSize; int32_t bufferMin; int32_t bufferMax; uint32_t strengthMask; UCollationStrength strength; uint32_t variableTop; UBool toShift; UCollator *coll; const Normalizer2 &nfd; const UnicodeString *targetString; const UChar *targetBuffer; int32_t targetLength; UCollationElements *elements; UBreakIterator *charBreakIterator; }; Target::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status) : bufferSize(0), bufferMin(0), bufferMax(0), strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator), nfd(*Normalizer2Factory::getNFDInstance(status)), targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL) { strength = ucol_getStrength(coll); toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED; variableTop = ucol_getVariableTop(coll, &status); // find the largest expansion uint8_t maxExpansion = 0; for (const uint8_t *expansion = coll->expansionCESize; *expansion != 0; expansion += 1) { if (*expansion > maxExpansion) { maxExpansion = *expansion; } } // room for an extra character on each end, plus 4 for safety bufferSize = patternLength + (2 * maxExpansion) + 4; ceb = NEW_ARRAY(CEI, bufferSize); if (ceb == NULL) { status = U_MEMORY_ALLOCATION_ERROR; return; } if (target != NULL) { setTargetString(target); } switch (strength) { default: strengthMask |= UCOL_TERTIARYORDERMASK; /* fall through */ case UCOL_SECONDARY: strengthMask |= UCOL_SECONDARYORDERMASK; /* fall through */ case UCOL_PRIMARY: strengthMask |= UCOL_PRIMARYORDERMASK; } } Target::~Target() { ubrk_close(charBreakIterator); ucol_closeElements(elements); DELETE_ARRAY(ceb); } void Target::setTargetString(const UnicodeString *target) { if (charBreakIterator != NULL) { ubrk_close(charBreakIterator); ucol_closeElements(elements); } targetString = target; if (targetString != NULL) { UErrorCode status = U_ZERO_ERROR; targetBuffer = targetString->getBuffer(); targetLength = targetString->length(); elements = ucol_openElements(coll, target->getBuffer(), target->length(), &status); ucol_forceHanImplicit(elements, &status); charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status), targetBuffer, targetLength, &status); } else { targetBuffer = NULL; targetLength = 0; } } const CEI *Target::nextCE(int32_t offset) { UErrorCode status = U_ZERO_ERROR; int32_t low = -1, high = -1; uint32_t order; UBool cont = FALSE; if (offset >= bufferMin && offset < bufferMax) { return &ceb[offset]; } if (bufferMax >= bufferSize || offset != bufferMax) { return NULL; } do { low = ucol_getOffset(elements); order = ucol_next(elements, &status); high = ucol_getOffset(elements); if (order == (uint32_t)UCOL_NULLORDER) { //high = low = -1; break; } cont = isContinuation(order); order &= strengthMask; if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) { if (strength >= UCOL_QUATERNARY) { order &= UCOL_PRIMARYORDERMASK; } else { order = UCOL_IGNORABLE; } } } while (order == UCOL_IGNORABLE); if (cont) { order |= UCOL_CONTINUATION_MARKER; } ceb[offset].order = order; ceb[offset].lowOffset = low; ceb[offset].highOffset = high; bufferMax += 1; return &ceb[offset]; } const CEI *Target::prevCE(int32_t offset) { UErrorCode status = U_ZERO_ERROR; int32_t low = -1, high = -1; uint32_t order; UBool cont = FALSE; if (offset >= bufferMin && offset < bufferMax) { return &ceb[offset]; } if (bufferMax >= bufferSize || offset != bufferMax) { return NULL; } do { high = ucol_getOffset(elements); order = ucol_previous(elements, &status); low = ucol_getOffset(elements); if (order == (uint32_t)UCOL_NULLORDER) { break; } cont = isContinuation(order); order &= strengthMask; if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) { if (strength >= UCOL_QUATERNARY) { order &= UCOL_PRIMARYORDERMASK; } else { order = UCOL_IGNORABLE; } } } while (order == UCOL_IGNORABLE); bufferMax += 1; if (cont) { order |= UCOL_CONTINUATION_MARKER; } ceb[offset].order = order; ceb[offset].lowOffset = low; ceb[offset].highOffset = high; return &ceb[offset]; } int32_t Target::stringLength() { if (targetString != NULL) { return targetLength; } return 0; } UChar Target::charAt(int32_t offset) { if (targetString != NULL) { return targetBuffer[offset]; } return 0x0000; } void Target::setOffset(int32_t offset) { UErrorCode status = U_ZERO_ERROR; bufferMin = 0; bufferMax = 0; ucol_setOffset(elements, offset, &status); } void Target::setLast(int32_t last) { UErrorCode status = U_ZERO_ERROR; bufferMin = 0; bufferMax = 1; ceb[0].order = (uint32_t)UCOL_NULLORDER; ceb[0].lowOffset = last; ceb[0].highOffset = last; ucol_setOffset(elements, last, &status); } int32_t Target::getOffset() { return ucol_getOffset(elements); } UBool Target::isBreakBoundary(int32_t offset) { return ubrk_isBoundary(charBreakIterator, offset); } int32_t Target::nextBreakBoundary(int32_t offset) { return ubrk_following(charBreakIterator, offset); } int32_t Target::nextSafeBoundary(int32_t offset) { while (offset < targetLength) { //UChar ch = charAt(offset); UChar ch = targetBuffer[offset]; if (U_IS_LEAD(ch) || ! ucol_unsafeCP(ch, coll)) { return offset; } offset += 1; } return targetLength; } UBool Target::isIdentical(UnicodeString &pattern, int32_t start, int32_t end) { if (strength < UCOL_IDENTICAL) { return TRUE; } // Note: We could use Normalizer::compare() or similar, but for short strings // which may not be in FCD it might be faster to just NFD them. UErrorCode status = U_ZERO_ERROR; UnicodeString t2, p2; nfd.normalize(UnicodeString(FALSE, targetBuffer + start, end - start), t2, status); nfd.normalize(pattern, p2, status); // return FALSE if NFD failed return U_SUCCESS(status) && t2 == p2; } #define HASH_TABLE_SIZE 257 class BadCharacterTable : public UMemory { public: BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status); ~BadCharacterTable(); int32_t operator[](uint32_t ce) const; int32_t getMaxSkip() const; int32_t minLengthInChars(int32_t index); private: static int32_t hash(uint32_t ce); int32_t maxSkip; int32_t badCharacterTable[HASH_TABLE_SIZE]; int32_t *minLengthCache; }; BadCharacterTable::BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status) : minLengthCache(NULL) { int32_t plen = patternCEs.size(); // **** need a better way to deal with this **** if (U_FAILURE(status) || plen == 0) { return; } int32_t *history = NEW_ARRAY(int32_t, plen); if (history == NULL) { status = U_MEMORY_ALLOCATION_ERROR; return; } for (int32_t i = 0; i < plen; i += 1) { history[i] = -1; } minLengthCache = NEW_ARRAY(int32_t, plen + 1); if (minLengthCache == NULL) { DELETE_ARRAY(history); status = U_MEMORY_ALLOCATION_ERROR; return; } maxSkip = minLengthCache[0] = data->minLengthInChars(&patternCEs, 0, history); for(int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) { badCharacterTable[j] = maxSkip; } for(int32_t p = 1; p < plen; p += 1) { minLengthCache[p] = data->minLengthInChars(&patternCEs, p, history); // Make sure this entry is not bigger than the previous one. // Otherwise, we might skip too far in some cases. if (minLengthCache[p] < 0 || minLengthCache[p] > minLengthCache[p - 1]) { minLengthCache[p] = minLengthCache[p - 1]; } } minLengthCache[plen] = 0; for(int32_t p = 0; p < plen - 1; p += 1) { badCharacterTable[hash(patternCEs[p])] = minLengthCache[p + 1]; } DELETE_ARRAY(history); } BadCharacterTable::~BadCharacterTable() { DELETE_ARRAY(minLengthCache); } int32_t BadCharacterTable::operator[](uint32_t ce) const { return badCharacterTable[hash(ce)]; } int32_t BadCharacterTable::getMaxSkip() const { return maxSkip; } int32_t BadCharacterTable::minLengthInChars(int32_t index) { return minLengthCache[index]; } int32_t BadCharacterTable::hash(uint32_t ce) { return UCOL_PRIMARYORDER(ce) % HASH_TABLE_SIZE; } class GoodSuffixTable : public UMemory { public: GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status); ~GoodSuffixTable(); int32_t operator[](int32_t offset) const; private: int32_t *goodSuffixTable; }; GoodSuffixTable::GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status) : goodSuffixTable(NULL) { int32_t patlen = patternCEs.size(); // **** need a better way to deal with this **** if (U_FAILURE(status) || patlen <= 0) { return; } int32_t *suff = NEW_ARRAY(int32_t, patlen); int32_t start = patlen - 1, end = - 1; int32_t maxSkip = badCharacterTable.getMaxSkip(); if (suff == NULL) { status = U_MEMORY_ALLOCATION_ERROR; return; } // initialze suff suff[patlen - 1] = patlen; for (int32_t i = patlen - 2; i >= 0; i -= 1) { // (i > start) means we're inside the last suffix match we found // ((patlen - 1) - end) is how far the end of that match is from end of pattern // (i - start) is how far we are from start of that match // (i + (patlen - 1) - end) is index of same character at end of pattern // so if any suffix match at that character doesn't extend beyond the last match, // it's the suffix for this character as well if (i > start && suff[i + patlen - 1 - end] < i - start) { suff[i] = suff[i + patlen - 1 - end]; } else { start = end = i; int32_t s = patlen; while (start >= 0 && patternCEs[start] == patternCEs[--s]) { start -= 1; } suff[i] = end - start; } } // now build goodSuffixTable goodSuffixTable = NEW_ARRAY(int32_t, patlen); if (goodSuffixTable == NULL) { DELETE_ARRAY(suff); status = U_MEMORY_ALLOCATION_ERROR; return; } // initialize entries to minLengthInChars of the pattern for (int32_t i = 0; i < patlen; i += 1) { goodSuffixTable[i] = maxSkip; } int32_t prefix = 0; for (int32_t i = patlen - /*1*/ 2; i >= 0; i -= 1) { if (suff[i] == i + 1) { // this matching suffix is a prefix of the pattern int32_t prefixSkip = badCharacterTable.minLengthInChars(i + 1); // for any mis-match before this suffix, we should skip // so that the front of the pattern (i.e. the prefix) // lines up with the front of the suffix. // (patlen - 1 - i) is the start of the suffix while (prefix < patlen - 1 - i) { // value of maxSkip means never set... if (goodSuffixTable[prefix] == maxSkip) { goodSuffixTable[prefix] = prefixSkip; } prefix += 1; } } } for (int32_t i = 0; i < patlen - 1; i += 1) { goodSuffixTable[patlen - 1 - suff[i]] = badCharacterTable.minLengthInChars(i + 1); } DELETE_ARRAY(suff); } GoodSuffixTable::~GoodSuffixTable() { DELETE_ARRAY(goodSuffixTable); } int32_t GoodSuffixTable::operator[](int32_t offset) const { return goodSuffixTable[offset]; } UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BoyerMooreSearch) UBool BoyerMooreSearch::empty() { return patCEs->size() <= 0; } CollData *BoyerMooreSearch::getData() { return data; } CEList *BoyerMooreSearch::getPatternCEs() { return patCEs; } BadCharacterTable *BoyerMooreSearch::getBadCharacterTable() { return badCharacterTable; } GoodSuffixTable *BoyerMooreSearch::getGoodSuffixTable() { return goodSuffixTable; } BoyerMooreSearch::BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString, UErrorCode &status) : data(theData), patCEs(NULL), badCharacterTable(NULL), goodSuffixTable(NULL), pattern(patternString), target(NULL) { if (U_FAILURE(status)) { return; } UCollator *collator = data->getCollator(); patCEs = new CEList(collator, patternString, status); if (patCEs == NULL || U_FAILURE(status)) { return; } badCharacterTable = new BadCharacterTable(*patCEs, data, status); if (badCharacterTable == NULL || U_FAILURE(status)) { return; } goodSuffixTable = new GoodSuffixTable(*patCEs, *badCharacterTable, status); if (targetString != NULL) { target = new Target(collator, targetString, patCEs->size(), status); } } BoyerMooreSearch::~BoyerMooreSearch() { delete target; delete goodSuffixTable; delete badCharacterTable; delete patCEs; } void BoyerMooreSearch::setTargetString(const UnicodeString *targetString, UErrorCode &status) { if (U_FAILURE(status)) { return; } if (target == NULL) { target = new Target(data->getCollator(), targetString, patCEs->size(), status); } else { target->setTargetString(targetString); } } // **** main flow of this code from Laura Werner's "Unicode Text Searching in Java" paper. **** /* * TODO: * * deal with trailing (and leading?) ignorables. * * Adding BoyerMooreSearch object slowed it down. How can we speed it up? */ UBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end) { /*UCollator *coll =*/ data->getCollator(); int32_t plen = patCEs->size(); int32_t tlen = target->stringLength(); int32_t maxSkip = badCharacterTable->getMaxSkip(); int32_t tOffset = offset + maxSkip; if (plen <= 0) { // Searching for a zero length pattern always fails. start = end = -1; return FALSE; } while (tOffset <= tlen) { int32_t pIndex = plen - 1; int32_t tIndex = 0; int32_t lIndex = 0; if (tOffset < tlen) { // **** we really want to skip ahead enough to **** // **** be sure we get at least 1 non-ignorable **** // **** CE after the end of the pattern. **** int32_t next = target->nextSafeBoundary(tOffset + 1); target->setOffset(next); for (lIndex = 0; ; lIndex += 1) { const CEI *cei = target->prevCE(lIndex); int32_t low = cei->lowOffset; int32_t high = cei->highOffset; if (high == 0 || (low < high && low <= tOffset)) { if (low < tOffset) { while (lIndex >= 0 && target->prevCE(lIndex)->highOffset == high) { lIndex -= 1; } if (high > tOffset) { tOffset = high; } } break; } } } else { target->setLast(tOffset); lIndex = 0; } tIndex = ++lIndex; // Iterate backward until we hit the beginning of the pattern while (pIndex >= 0) { uint32_t pce = (*patCEs)[pIndex]; const CEI *tcei = target->prevCE(tIndex++); if (tcei->order != pce) { // There is a mismatch at this position. Decide how far // over to shift the pattern, then try again. int32_t gsOffset = tOffset + (*goodSuffixTable)[pIndex]; #ifdef EXTRA_CAUTIOUS int32_t old = tOffset; #endif tOffset += (*badCharacterTable)[tcei->order] - badCharacterTable->minLengthInChars(pIndex + 1); if (gsOffset > tOffset) { tOffset = gsOffset; } #ifdef EXTRA_CAUTIOUS // Make sure we don't skip backwards... if (tOffset <= old) { tOffset = old + 1; } #endif break; } pIndex -= 1; } if (pIndex < 0) { // We made it back to the beginning of the pattern, // which means we matched it all. Return the location. const CEI firstCEI = *target->prevCE(tIndex - 1); const CEI lastCEI = *target->prevCE(lIndex); int32_t mStart = firstCEI.lowOffset; int32_t minLimit = lastCEI.lowOffset; int32_t maxLimit = lastCEI.highOffset; int32_t mLimit; UBool found = TRUE; target->setOffset(/*tOffset*/maxLimit); const CEI nextCEI = *target->nextCE(0); if (nextCEI.lowOffset > maxLimit) { maxLimit = nextCEI.lowOffset; } if (nextCEI.lowOffset == nextCEI.highOffset && nextCEI.order != (uint32_t)UCOL_NULLORDER) { found = FALSE; } if (! target->isBreakBoundary(mStart)) { found = FALSE; } if (firstCEI.lowOffset == firstCEI.highOffset) { found = FALSE; } mLimit = maxLimit; if (minLimit < maxLimit) { // When the last CE's low index is same with its high index, the CE is likely // a part of expansion. In this case, the index is located just after the // character corresponding to the CEs compared above. If the index is right // at the break boundary, move the position to the next boundary will result // incorrect match length when there are ignorable characters exist between // the position and the next character produces CE(s). See ticket#8482. if (minLimit == lastCEI.highOffset && target->isBreakBoundary(minLimit)) { mLimit = minLimit; } else { int32_t nbb = target->nextBreakBoundary(minLimit); if (nbb >= lastCEI.highOffset) { mLimit = nbb; } } } if (mLimit > maxLimit) { found = FALSE; } if (! target->isBreakBoundary(mLimit)) { found = FALSE; } if (! target->isIdentical(pattern, mStart, mLimit)) { found = FALSE; } if (found) { start = mStart; end = mLimit; return TRUE; } tOffset += (*goodSuffixTable)[0]; // really? Maybe += 1 or += maxSkip? } // Otherwise, we're here because of a mismatch, so keep going.... } // no match start = -1; end = -1; return FALSE; } U_NAMESPACE_END #endif // #if !UCONFIG_NO_COLLATION