/* ********************************************************************** * Copyright (C) 1999-2000 IBM and others. All rights reserved. ********************************************************************** * Date Name Description * 03/22/2000 helena Creation. ********************************************************************** */ #include #include "unicode/coleitr.h" #include "unicode/schriter.h" #include "strsrch.h" /** * StringSearch is a SearchIterator that provides * language-sensitive text searching based on the comparison rules defined * in a {@link RuleBasedCollator} object. * Instances of StringSearch function as iterators * maintain a current position and scan over text returning the index of * characters where the pattern occurs and the length of each match. *

* StringSearch uses a version of the fast Boyer-Moore search * algorithm that has been adapted to work with the large character set of * Unicode. See "Efficient Text Searching in Java", to be published in * Java Report in February, 1999, for further information on the algorithm. *

* Consult the SearchIterator documentation for information on * and examples of how to use instances of this class to implement text * searching. SearchIterator provides all of the necessary * API; this class only provides constructors and internal implementation * methods. * * @see SearchIterator * @see RuleBasedCollator * * @author Laura Werner * @version 1.0 */ char StringSearch::fgClassID = 0; // Value is irrelevant // class id /* to be removed */ void StringSearch::dumpTables() { int i; for (i = 0; i < 256; i++) { if (shiftTable[i] != minLen) { // debug("shift[" + Integer.toString(i,16) + "] = " + shiftTable[i]); } } for (i = 0; i < 256; i++) { if (backShiftTable[i] != minLen) { // debug("backShift[" + Integer.toString(i,16) + "] = " + backShiftTable[i]); } } } StringSearch::StringSearch(const UnicodeString& pat, CharacterIterator* target, RuleBasedCollator* coll, BreakIterator* breaker, UErrorCode& status) : SearchIterator(target, breaker), strength(coll->getStrength()), valueList(NULL), valueListLen(0), pattern(pat), normLen(0), // num. of collation elements in pattern. minLen(0), // Min of composed, decomposed versions maxLen(0), // Max it(NULL) { if (U_FAILURE(status)) return; collator = (RuleBasedCollator*)(coll->clone()); iter = collator->createCollationElementIterator(*target); it = collator->createCollationElementIterator(pat); initialize(status); // Initialize the Boyer-Moore tables } /** * Construct a StringSearch object using a specific collator. *

* @param pattern The text for which this object will search. * * @param target The text in which to search for the pattern. * * @param collator A RuleBasedCollator object which defines the * language-sensitive comparison rules used to determine * whether text in the pattern and target matches. */ StringSearch::StringSearch(const UnicodeString& pat, CharacterIterator* target, RuleBasedCollator* collator, UErrorCode& status) : SearchIterator(), strength(collator->getStrength()), valueList(NULL), valueListLen(0), pattern(pat), normLen(0), // num. of collation elements in pattern. minLen(0), // Min of composed, decomposed versions maxLen(0), // Max it(NULL) { if (U_FAILURE(status)) return; this->adoptTarget(target); this->collator = (RuleBasedCollator*)(collator->clone()); this->iter = collator->createCollationElementIterator(*target); this->it = collator->createCollationElementIterator(pat); initialize(status); } /** * Construct a StringSearch object using the collator and * character boundary detection rules for a given locale *

* @param pattern The text for which this object will search. * * @param target The text in which to search for the pattern. * * @param loc The locale whose collation and break-detection rules * should be used. * * @exception ClassCastException thrown if the collator for the specified * locale is not a RuleBasedCollator. */ StringSearch::StringSearch(const StringSearch& that) : SearchIterator(that), iter(NULL), collator(that.collator), strength(that.strength), valueList(NULL), valueListLen(that.valueListLen), normLen(that.normLen), // num. of collation elements in pattern. minLen(that.minLen), // Min of composed, decomposed versions maxLen(that.maxLen), it(NULL) { valueList = new int32_t[valueListLen]; memcpy(valueList, that.valueList, valueListLen*sizeof(int32_t)); iter = that.collator->createCollationElementIterator(that.getTarget()); it = that.collator->createCollationElementIterator(that.pattern); } StringSearch::StringSearch(const UnicodeString& pat, CharacterIterator* target, const Locale& loc, UErrorCode& status) : SearchIterator(), valueList(NULL), valueListLen(0), pattern(pat), normLen(0), // num. of collation elements in pattern. minLen(0), // Min of composed, decomposed versions maxLen(0) // Max { if (U_FAILURE(status)) return; this->adoptTarget(target); collator = (RuleBasedCollator*)Collator::createInstance(loc, status); iter = collator->createCollationElementIterator(*target); it = collator->createCollationElementIterator(pat); strength = collator->getStrength(); initialize(status); } UBool StringSearch::operator==(const SearchIterator& that) const { if (that.getDynamicClassID() != getDynamicClassID()) return FALSE; if (!SearchIterator::operator==(that)) return FALSE; const StringSearch& that2 = (const StringSearch&)that; if (*that2.iter != *iter) return FALSE; else if (*that2.collator != *collator) return FALSE; else if (that2.strength != strength) return FALSE; else if (that2.valueListLen != valueListLen) return FALSE; else if (memcmp(that2.valueList, valueList, valueListLen*sizeof(int32_t)) != 0) return FALSE; else if (that2.pattern != pattern) return FALSE; else if (that2.normLen != normLen) return FALSE; else if (that2.minLen != minLen) return FALSE; else if (that2.maxLen != maxLen) return FALSE; else return TRUE; } SearchIterator* StringSearch::clone(void) const { return new StringSearch(*this); } /** * Construct a StringSearch object using the collator for the default * locale *

* @param pattern The text for which this object will search. * * @param target The text in which to search for the pattern. * * @param collator A RuleBasedCollator object which defines the * language-sensitive comparison rules used to determine * whether text in the pattern and target matches. */ StringSearch::StringSearch(const UnicodeString& pat, const UnicodeString& newText, UErrorCode& status) : SearchIterator(), valueList(NULL), valueListLen(0), pattern(pat), normLen(0), // num. of collation elements in pattern. minLen(0), // Min of composed, decomposed versions maxLen(0) // Max { StringCharacterIterator *s = new StringCharacterIterator(newText); collator = (RuleBasedCollator*)Collator::createInstance(Locale::getDefault(), status); strength = collator->getStrength(); iter = collator->createCollationElementIterator(newText); it = collator->createCollationElementIterator(pat); this->adoptTarget(s); initialize(status); } StringSearch::~StringSearch(void) { if (valueList != NULL) { delete [] valueList; valueList = 0; } if (iter != NULL) { delete iter; iter = 0; } if (collator != NULL) { delete collator; collator = 0; } if (it != NULL) { delete it; it = 0; } } //------------------------------------------------------------------- // Getters and Setters //------------------------------------------------------------------- /** * Sets this object's strength property. The strength determines the * minimum level of difference considered significant during a * search. Generally, {@link Collator#TERTIARY} and * {@link Collator#IDENTICAL} indicate that all differences are * considered significant, {@link Collator#SECONDARY} indicates * that upper/lower case distinctions should be ignored, and * {@link Collator#PRIMARY} indicates that both case and accents * should be ignored. However, the exact meanings of these constants * are determined by individual Collator objects. *

* @see Collator#PRIMARY * @see Collator#SECONDARY * @see Collator#TERTIARY * @see Collator#IDENTICAL */ void StringSearch::setStrength(Collator::ECollationStrength newStrength, UErrorCode& status) { if (U_FAILURE(status)) { return; } strength = newStrength; // Due to a bug (?) in CollationElementIterator, we must set the // collator's strength as well, since the iterator is going to // mask out the portions of the collation element that are not // relevant for the collator's current strength setting // Note that this makes it impossible to share a Collator among // multiple StringSearch objects if you adjust Strength settings. collator->setStrength(strength); initialize(status); } /** * Returns this object's strength property, which indicates what level * of differences are considered significant during a search. *

* @see #setStrength */ Collator::ECollationStrength StringSearch::getStrength() const { return strength; } /** * Set the collator to be used for this string search. Also changes * the search strength to match that of the new collator. *

* This method causes internal data such as Boyer-Moore shift tables * to be recalculated, but the iterator's position is unchanged. *

* @see #getCollator */ void StringSearch::setCollator(const RuleBasedCollator *coll, UErrorCode& status) { delete iter; delete collator; collator = (RuleBasedCollator*)coll->clone(); strength = collator->getStrength(); // Also need to recompute the pattern and get a new target iterator iter = collator->createCollationElementIterator(getTarget()); initialize(status); } /** * Return the RuleBasedCollator being used for this string search. */ const RuleBasedCollator& StringSearch::getCollator(void) const { return *collator; } /** * Set the pattern for which to search. * This method causes internal data such as Boyer-Moore shift tables * to be recalculated, but the iterator's position is unchanged. */ void StringSearch::setPattern(const UnicodeString& pat, UErrorCode& status) { pattern = pat; initialize(status); } /** * Returns the pattern for which this object is searching. */ const UnicodeString& StringSearch::getPattern() const { return pattern; } /** * Set the target text which should be searched and resets the * iterator's position to point before the start of the new text. * This method is useful if you want to re-use an iterator to * search for the same pattern within a different body of text. */ void StringSearch::adoptTarget(CharacterIterator* target) { UErrorCode status = U_ZERO_ERROR; SearchIterator::adoptTarget(target); // fix me: Skipped the error code // Since we're caching a CollationElementIterator, recreate it iter->setText(*target, status); } void StringSearch::setTarget(const UnicodeString& newText) { UErrorCode status = U_ZERO_ERROR; SearchIterator::setTarget(newText); // Since we're caching a CollationElementIterator, recreate it iter->setText(newText, status); } void StringSearch::reset(void) { SearchIterator::reset(); iter->reset(); }//------------------------------------------------------------------- // Privates //------------------------------------------------------------------- /** * Search forward for matching text, starting at a given location. * Clients should not call this method directly; instead they should call * {@link SearchIterator#next}. *

* If a match is found, this method returns the index at which the match * starts and calls {@link SearchIterator#setMatchLength} * with the number of characters in the target * text that make up the match. If no match is found, the method returns * DONE and does not call setMatchLength. *

* @param start The index in the target text at which the search starts. * * @return The index at which the matched text in the target starts, or DONE * if no match was found. *

* @see SearchIterator#next * @see SearchIterator#DONE */ int32_t StringSearch::handleNext(int32_t start, UErrorCode& status) { if (U_FAILURE(status)) { return SearchIterator::DONE; } const CharacterIterator& target = getTarget(); int mask = getMask(strength); #if 0 int done = CollationElementIterator::NULLORDER & mask; if (DEBUG) { debug("-------------------------handleNext-----------------------------------"); debug(""); debug("strength=" + strength + ", mask=" + Integer.toString(mask,16) + ", done=" + Integer.toString(done,16)); debug("decomp=" + collator.getDecomposition()); debug("target.begin=" + getTarget().getBeginIndex()); debug("target.end=" + getTarget().getEndIndex()); debug("start = " + start); } #endif int32_t index = start + minLen; int32_t matchEnd = 0; while (index <= target.endIndex()) { int32_t patIndex = normLen; int32_t tval = 0, pval = 0; UBool getP = TRUE; iter->setOffset(index, status); matchEnd = index; //if (DEBUG) debug(" outer loop: patIndex=" + patIndex + ", index=" + index); while ((patIndex > 0 || getP == false) && iter->getOffset() > start) { #if 0 if (DEBUG) { debug(" inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset()); debug(" getP=" + getP); } #endif // Get the previous character in both the pattern and the target tval = iter->previous(status) & mask; if (U_FAILURE(status)) { return SearchIterator::DONE; } if (getP) pval = valueList[--patIndex]; getP = TRUE; // (DEBUG) debug(" pval=" + Integer.toString(pval,16) + ", tval=" + Integer.toString(tval,16)); if (tval == 0) { // skip tval, use same pval // (DEBUG) debug(" tval is ignorable"); getP = FALSE; } else if (pval != tval) { // Mismatch, skip ahead // (DEBUG) debug(" mismatch: skippping " + getShift(tval, patIndex)); index += getShift(tval, patIndex); break; } else if (patIndex == 0) { // The values matched, and we're at the beginning of the pattern, // which means we matched the whole thing. start = iter->getOffset(); setMatchLength(matchEnd - start); // if (DEBUG) debug("Found match at index "+ start ); return start; } } #if 0 if (DEBUG) debug(" end of inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset()); if (DEBUG) debug(" getP=" + getP); #endif if (iter->getOffset() <= start) { // We hit the beginning of the text being searched, which is // possible if it contains lots of ignorable characters. // Advance one character and try again. // if (DEBUG) debug("hit beginning of target; advance by one"); index++; } } // if (DEBUG) debug("Fell off end of outer loop; returning DONE"); return SearchIterator::DONE; } /** * Search backward for matching text ,starting at a given location. * Clients should not call this method directly; instead they should call * SearchIterator.previous(), which this method overrides. *

* If a match is found, this method returns the index at which the match * starts and calls {@link SearchIterator#setMatchLength} * with the number of characters in the target * text that make up the match. If no match is found, the method returns * DONE and does not call setMatchLength. *

* @param start The index in the target text at which the search starts. * * @return The index at which the matched text in the target starts, or DONE * if no match was found. *

* @see SearchIterator#previous * @see SearchIterator#DONE */ int32_t StringSearch::handlePrev(int32_t start, UErrorCode& status) { if (U_FAILURE(status)) { return SearchIterator::DONE; } int patLen = normLen; int index = start - minLen; int mask = getMask(strength); int done = CollationElementIterator::NULLORDER & mask; #if 0 if (DEBUG) { debug("-------------------------handlePrev-----------------------------------"); debug(""); debug("strength=" + strength + ", mask=" + Integer.toString(mask,16) + ", done=" + Integer.toString(done,16)); debug("decomp=" + collator.getDecomposition()); debug("target.begin=" + getTarget().getBeginIndex()); debug("target.end=" + getTarget().getEndIndex()); } #endif while (index >= 0) { int patIndex = 0; int tval = 0, pval = 0; UBool getP = TRUE; iter->setOffset(index, status); if (U_FAILURE(status)) { return SearchIterator::DONE; } // if (DEBUG) debug(" outer loop: patIndex=" + patIndex + ", index=" + index); while ((patIndex < patLen || !getP) && iter->getOffset() < start) { /* if (DEBUG) { debug(" inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset()); } */ tval = iter->next(status) & mask; if (U_FAILURE(status)) { return SearchIterator::DONE; } if (getP) pval = valueList[patIndex++]; getP = TRUE; //if (DEBUG) debug(" pval=" + Integer.toString(pval,16) + ", tval=" + Integer.toString(tval,16)); if (tval == done) { // if (DEBUG) debug(" end of target; no match"); return DONE; } else if (tval == 0) { // if (DEBUG) debug(" tval is ignorable"); getP = false; } else if (pval != tval) { // We didn't match this pattern. Skip ahead // if (DEBUG) debug(" mismatch: skippping " + getBackShift(tval, patIndex)); int shift = getBackShift(tval, patIndex); index -= shift; break; } else if (patIndex == patLen) { // The elements matched and we're at the end of the pattern, // which means we matched the whole thing. setMatchLength(iter->getOffset() - index); return index; } } if (iter->getOffset() >= start) { // We hit the end of the text being searched, which is // possible if it contains lots of ignorable characters. // Back up one character and try again. // if (DEBUG) debug("hit end of target; back by one"); index--; } } return SearchIterator::DONE; } /** * Return a bitmask that will select only the portions of a collation * element that are significant at the given strength level. */ int32_t StringSearch::getMask(Collator::ECollationStrength strength) { switch (strength) { case Collator::PRIMARY: return 0xFFFF0000; case Collator::SECONDARY: return 0xFFFFFF00; default: return 0xFFFFFFFF; } } void StringSearch::initialize(UErrorCode& status) { /* if (DEBUG) { debug("-------------------------initialize-----------------------------------"); debug("pattern=" + pattern); } */ it->setText(pattern, status); if (U_FAILURE(status)) { delete it; return; } int mask = getMask(strength); // See how many non-ignorable collation keys are in the text normLen = 0; int32_t elem; while ((elem = it->next(status)) != CollationElementIterator::NULLORDER) { if (U_FAILURE(status)) { return; } if ((elem & mask) != 0) { normLen++; } } if (valueList != NULL) { delete [] valueList; } // Save them all valueList = new int32_t[normLen]; int expandLen = 0; it->reset(); int32_t i; for (i = 0; i < normLen; i++) { elem = it->next(status); if (U_FAILURE(status)) { return; } if ((elem & mask) != 0) { valueList[i] = elem & mask; } // Keep track of whether there are any expanding-character // sequences that can result in one of the characters that's in // the pattern. If there are, we have to reduce the shift // distances calculated below to account for it. expandLen += it->getMaxExpansion(elem) - 1; } // // We need to remember the size of the composed and decomposed // versions of the string. Standard Boyer-Moore shift calculations // can be wrong by an amount up to that difference, since a small // small number of characters in the pattern can map to a larger // number in the text being searched, or vice-versa. // int uniLen = pattern.length(); maxLen = uprv_max(normLen, uniLen); minLen = uprv_min(normLen, uniLen) - expandLen; /* if (DEBUG) debug("normLen=" + normLen + ", expandLen=" + expandLen + ", maxLen=" + maxLen + ", minLen=" + minLen); */ // Now initialize the shift tables // // NOTE: This is the most conservative way to build them. If we had a way // of knowing that there were no expanding/contracting chars in the rules, // we could get rid of the "- 1" in the shiftTable calculations. // But all of the default collators have at least one expansion or // contraction, so it probably doesn't matter anyway. // for (i = 0; i < 256; i++) { shiftTable[i] = backShiftTable[i] = minLen; } for (i = 0; i < normLen-1; i++) { shiftTable[hash(valueList[i])] = uprv_max(minLen - i - 1, 1); } shiftTable[hash(valueList[normLen-1])] = 1; for (i = normLen - 1; i > 0; i--) { backShiftTable[hash(valueList[i])] = i; } backShiftTable[hash(valueList[0])] = 1; /* dumpTables(); */ } /** * Method used by StringSearch to determine how far to the right to * shift the pattern during a Boyer-Moore search. * * @param curValue The current value in the target text * @param curIndex The index in the pattern at which we failed to match * curValue in the target text. */ int32_t StringSearch::getShift( int32_t curValue, int32_t curIndex ) const { int32_t shiftAmt = shiftTable[hash(curValue)]; if (minLen != maxLen) { int adjust = normLen - curIndex; if (shiftAmt > adjust + 1) { // if (DEBUG) debug("getShift: adjusting by " + adjust); shiftAmt -= adjust; } } return shiftAmt; } /** * Method used by StringSearch to determine how far to the left to * shift the pattern during a reverse Boyer-Moore search. * * @param curValue The current value in the target text * @param curIndex The index in the pattern at which we failed to match * curValue in the target text. */ int32_t StringSearch::getBackShift( int32_t curValue, int32_t curIndex ) const { int shiftAmt = backShiftTable[hash(curValue)]; if (minLen != maxLen) { int adjust = normLen - (minLen - curIndex); if (shiftAmt > adjust + 1) { // if (DEBUG) debug("getBackShift: adjusting by " + adjust); shiftAmt -= adjust; } } return shiftAmt; } /** * Hash a collation element from its full size (32 bits) down into a * value that can be used as an index into the shift tables. Right * now we do a modulus by the size of the hash table. * * TODO: At some point I should experiment to see whether a slightly * more complicated hash function gives us a better distribution * on multilingual text. I doubt it will have much effect on * performance, though. */ int32_t StringSearch::hash(int32_t order) { return CollationElementIterator::primaryOrder(order) % 256; }