diff --git a/icu4c/source/i18n/Makefile.in b/icu4c/source/i18n/Makefile.in
index d3467b7259..f17971aa4e 100644
--- a/icu4c/source/i18n/Makefile.in
+++ b/icu4c/source/i18n/Makefile.in
@@ -1,6 +1,6 @@
 #******************************************************************************
 #
-# Copyright (C) 1998-2008, International Business Machines
+# Copyright (C) 1998-2009, International Business Machines
 # Corporation and others. All Rights Reserved.
 #
 #******************************************************************************
@@ -81,7 +81,7 @@ ulocdata.o measfmt.o currfmt.o curramt.o currunit.o measure.o utmscale.o \
 csdetect.o csmatch.o csr2022.o csrecog.o csrmbcs.o csrsbcs.o csrucode.o csrutf8.o inputext.o \
 wintzimpl.o windtfmt.o winnmfmt.o basictz.o dtrule.o rbtz.o tzrule.o tztrans.o vtzone.o \
 zonemeta.o zstrfmt.o plurrule.o plurfmt.o dtitvfmt.o dtitvinf.o \
-tmunit.o tmutamt.o tmutfmt.o
+tmunit.o tmutamt.o tmutfmt.o colldata.o bmsearch.o bms.o
 
 ## Header files to install
 HEADERS = $(srcdir)/unicode/*.h
diff --git a/icu4c/source/i18n/bms.cpp b/icu4c/source/i18n/bms.cpp
new file mode 100644
index 0000000000..cbcdb38368
--- /dev/null
+++ b/icu4c/source/i18n/bms.cpp
@@ -0,0 +1,211 @@
+/*
+ * Copyright (C) 2008-2009, International Business Machines Corporation and Others.
+ * All rights reserved.
+ */
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_COLLATION
+
+#include "cmemory.h"
+#include "unicode/bms.h"
+#include "unicode/unistr.h"
+#include "unicode/colldata.h"
+#include "unicode/bmsearch.h"
+
+// C-style casts are the default; define USE_SAFE_CASTS to use the C++ named casts instead.
+//#define USE_SAFE_CASTS
+#ifdef USE_SAFE_CASTS
+#define STATIC_CAST(type,value) static_cast<type>(value)
+#define CONST_CAST(type,value) const_cast<type>(value)
+#else
+#define STATIC_CAST(type,value) (type) (value)
+#define CONST_CAST(type,value) (type) (value)
+#endif
+
+// C wrapper for CollData::open(). Returns NULL, without calling it,
+// when status is NULL or already holds a failure code.
+U_CAPI UCD * U_EXPORT2
+ucd_open(UCollator *coll, UErrorCode *status)
+{
+    if (status == NULL || U_FAILURE(*status)) {
+        return NULL;
+    }
+
+    return STATIC_CAST(UCD *, CollData::open(coll, *status));
+}
+
+// C wrapper for CollData::close(). A NULL ucd is ignored.
+U_CAPI void U_EXPORT2
+ucd_close(UCD *ucd)
+{
+    if (ucd != NULL) {
+        CollData::close(STATIC_CAST(CollData *, ucd));
+    }
+}
+
+// Returns the collator held by ucd, or NULL when ucd is NULL.
+U_CAPI UCollator * U_EXPORT2
+ucd_getCollator(UCD *ucd)
+{
+    CollData *data = STATIC_CAST(CollData *, ucd);
+
+    return data != NULL ? data->getCollator() : NULL;
+}
+
+// C wrapper for CollData::freeCollDataCache().
+U_CAPI void U_EXPORT2
+ucd_freeCache()
+{
+    CollData::freeCollDataCache();
+}
+
+// C wrapper for CollData::flushCollDataCache().
+U_CAPI void U_EXPORT2
+ucd_flushCache()
+{
+    CollData::flushCollDataCache();
+}
+
+// State behind the opaque BMS handle.
+struct BMS
+{
+    BoyerMooreSearch *bms;             // the searcher; never NULL once bms_open() succeeds
+    const UnicodeString *targetString; // owned copy of the target text, NULL if none is set
+};
+
+// Creates a searcher for pattern. target may be NULL and supplied later
+// through bms_setTargetString().
+//
+// On failure *status is set, everything allocated here is released and NULL
+// is returned, so a caller never receives a half-built object.
+U_CAPI BMS * U_EXPORT2
+bms_open(UCD *ucd,
+         const UChar *pattern, int32_t patternLength,
+         const UChar *target, int32_t targetLength,
+         UErrorCode *status)
+{
+    if (status == NULL || U_FAILURE(*status)) {
+        return NULL;
+    }
+
+    if (ucd == NULL) {
+        // BoyerMooreSearch asks the CollData for its collator right away
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return NULL;
+    }
+
+    BMS *bms = STATIC_CAST(BMS *, uprv_malloc(sizeof(BMS)));
+
+    if (bms == NULL) {
+        *status = U_MEMORY_ALLOCATION_ERROR;
+        return NULL;
+    }
+
+    bms->bms = NULL;
+    bms->targetString = NULL;
+
+    CollData *data = STATIC_CAST(CollData *, ucd);
+    UnicodeString patternString(pattern, patternLength);
+
+    if (target != NULL) {
+        bms->targetString = new UnicodeString(target, targetLength);
+
+        if (bms->targetString == NULL) {
+            *status = U_MEMORY_ALLOCATION_ERROR;
+        }
+    }
+
+    if (U_SUCCESS(*status)) {
+        bms->bms = new BoyerMooreSearch(data, patternString, bms->targetString, *status);
+
+        if (bms->bms == NULL) {
+            *status = U_MEMORY_ALLOCATION_ERROR;
+        }
+    }
+
+    if (U_FAILURE(*status)) {
+        // the searcher points at the target string, so it goes first
+        delete bms->bms;
+        delete bms->targetString;
+        uprv_free(bms);
+
+        return NULL;
+    }
+
+    return bms;
+}
+
+// Destroys a BMS created by bms_open(). A NULL bms is ignored.
+U_CAPI void U_EXPORT2
+bms_close(BMS *bms)
+{
+    if (bms == NULL) {
+        return;
+    }
+
+    // the searcher points at the target string, so it goes first
+    delete bms->bms;
+    delete bms->targetString;
+
+    uprv_free(bms);
+}
+
+// Returns TRUE when the pattern's collation element list is empty.
+U_CAPI UBool U_EXPORT2
+bms_empty(BMS *bms)
+{
+    return bms->bms->empty();
+}
+
+// Returns the collation data the searcher was opened with.
+U_CAPI UCD * U_EXPORT2
+bms_getData(BMS *bms)
+{
+    return STATIC_CAST(UCD *, bms->bms->getData());
+}
+
+// Searches the current target starting at offset. Returns TRUE and stores the
+// match bounds in *start and *end; otherwise returns FALSE with both set to -1.
+U_CAPI UBool U_EXPORT2
+bms_search(BMS *bms, int32_t offset, int32_t *start, int32_t *end)
+{
+    // BoyerMooreSearch::search() uses its target unconditionally, so report
+    // "no match" here when no target string has been supplied.
+    if (bms->targetString == NULL) {
+        *start = *end = -1;
+        return FALSE;
+    }
+
+    return bms->bms->search(offset, *start, *end);
+}
+
+// Replaces the text to be searched with a copy of target (NULL clears it).
+// If the copy cannot be allocated, *status is set and the old target is kept.
+U_CAPI void U_EXPORT2
+bms_setTargetString(BMS *bms, const UChar *target, int32_t targetLength, UErrorCode *status)
+{
+    if (status == NULL || U_FAILURE(*status)) {
+        return;
+    }
+
+    const UnicodeString *newTarget = NULL;
+
+    if (target != NULL) {
+        newTarget = new UnicodeString(target, targetLength);
+
+        if (newTarget == NULL) {
+            *status = U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
+    }
+
+    // Re-point the searcher before freeing the old string so that it never
+    // holds a pointer to deleted text.
+    bms->bms->setTargetString(newTarget, *status);
+
+    delete bms->targetString;
+    bms->targetString = newTarget;
+}
+
+#endif // #if !UCONFIG_NO_COLLATION
diff --git a/icu4c/source/i18n/bmsearch.cpp b/icu4c/source/i18n/bmsearch.cpp
new file mode 100644
index 0000000000..1e5f90bbc3
--- /dev/null
+++ b/icu4c/source/i18n/bmsearch.cpp
@@ -0,0 +1,864 @@
+/*
+ ******************************************************************************
+ * Copyright (C) 1996-2009, International Business Machines *
+ * Corporation and others. All Rights Reserved. *
+ ******************************************************************************
+ */
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_COLLATION
+
+#include "unicode/unistr.h"
+#include "unicode/putil.h"
+#include "unicode/usearch.h"
+
+#include "cmemory.h"
+#include "unicode/coll.h"
+#include "unicode/tblcoll.h"
+#include "unicode/coleitr.h"
+#include "unicode/ucoleitr.h"
+
+#include "unicode/regex.h" // TODO: make conditional on regexp being built.
+ +#include "unicode/uniset.h" +#include "unicode/uset.h" +#include "unicode/ustring.h" +#include "hash.h" +#include "uhash.h" +#include "ucol_imp.h" +#include "unormimp.h" + +#include "unicode/colldata.h" +#include "unicode/bmsearch.h" + +U_NAMESPACE_BEGIN + +#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0])) +#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) +#define DELETE_ARRAY(array) uprv_free((void *) (array)) + + +struct CEI +{ + uint32_t order; + int32_t lowOffset; + int32_t highOffset; +}; + +class Target : public UMemory +{ +public: + Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status); + ~Target(); + + void setTargetString(const UnicodeString *target); + + const CEI *nextCE(int32_t offset); + const CEI *prevCE(int32_t offset); + + int32_t stringLength(); + UChar charAt(int32_t offset); + + UBool isBreakBoundary(int32_t offset); + int32_t nextBreakBoundary(int32_t offset); + int32_t nextSafeBoundary(int32_t offset); + + UBool isIdentical(UnicodeString &pattern, int32_t start, int32_t end); + + void setOffset(int32_t offset); + void setLast(int32_t last); + int32_t getOffset(); + +private: + CEI *ceb; + int32_t bufferSize; + int32_t bufferMin; + int32_t bufferMax; + + uint32_t strengthMask; + UCollationStrength strength; + uint32_t variableTop; + UBool toShift; + UCollator *coll; + + const UnicodeString *targetString; + const UChar *targetBuffer; + int32_t targetLength; + + UCollationElements *elements; + UBreakIterator *charBreakIterator; +}; + +Target::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status) + : bufferSize(0), bufferMin(0), bufferMax(0), + strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator), + targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL) +{ + strength = ucol_getStrength(coll); + toShift = ucol_getAttribute(coll, 
UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED; + variableTop = ucol_getVariableTop(coll, &status); + + // find the largest expansion + uint8_t maxExpansion = 0; + for (const uint8_t *expansion = coll->expansionCESize; *expansion != 0; expansion += 1) { + if (*expansion > maxExpansion) { + maxExpansion = *expansion; + } + } + + // room for an extra character on each end, plus 4 for safety + bufferSize = patternLength + (2 * maxExpansion) + 4; + + ceb = NEW_ARRAY(CEI, bufferSize); + + if (ceb == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + + if (target != NULL) { + setTargetString(target); + } + + switch (strength) + { + default: + strengthMask |= UCOL_TERTIARYORDERMASK; + /* fall through */ + + case UCOL_SECONDARY: + strengthMask |= UCOL_SECONDARYORDERMASK; + /* fall through */ + + case UCOL_PRIMARY: + strengthMask |= UCOL_PRIMARYORDERMASK; + } +} + +Target::~Target() +{ + ubrk_close(charBreakIterator); + ucol_closeElements(elements); + + DELETE_ARRAY(ceb); +} + +void Target::setTargetString(const UnicodeString *target) +{ + if (charBreakIterator != NULL) { + ubrk_close(charBreakIterator); + ucol_closeElements(elements); + } + + targetString = target; + + if (targetString != NULL) { + UErrorCode status = U_ZERO_ERROR; + + targetBuffer = targetString->getBuffer(); + targetLength = targetString->length(); + + elements = ucol_openElements(coll, target->getBuffer(), target->length(), &status); + ucol_forceHanImplicit(elements, &status); + + charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocale(coll, ULOC_VALID_LOCALE, &status), + targetBuffer, targetLength, &status); + } else { + targetBuffer = NULL; + targetLength = 0; + } +} + +const CEI *Target::nextCE(int32_t offset) +{ + UErrorCode status = U_ZERO_ERROR; + int32_t low = -1, high = -1; + uint32_t order; + UBool cont = FALSE; + + if (offset >= bufferMin && offset < bufferMax) { + return &ceb[offset]; + } + + if (bufferMax >= bufferSize || offset != bufferMax) { + return NULL; + } + + do { 
+ low = ucol_getOffset(elements); + order = ucol_next(elements, &status); + high = ucol_getOffset(elements); + + if (order == UCOL_NULLORDER) { + //high = low = -1; + break; + } + + cont = isContinuation(order); + order &= strengthMask; + + if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) { + if (strength >= UCOL_QUATERNARY) { + order &= UCOL_PRIMARYORDERMASK; + } else { + order = UCOL_IGNORABLE; + } + } + } while (order == UCOL_IGNORABLE); + + if (cont) { + order |= UCOL_CONTINUATION_MARKER; + } + + ceb[offset].order = order; + ceb[offset].lowOffset = low; + ceb[offset].highOffset = high; + + bufferMax += 1; + + return &ceb[offset]; +} + +const CEI *Target::prevCE(int32_t offset) +{ + UErrorCode status = U_ZERO_ERROR; + int32_t low = -1, high = -1; + uint32_t order; + UBool cont = FALSE; + + if (offset >= bufferMin && offset < bufferMax) { + return &ceb[offset]; + } + + if (bufferMax >= bufferSize || offset != bufferMax) { + return NULL; + } + + do { + high = ucol_getOffset(elements); + order = ucol_previous(elements, &status); + low = ucol_getOffset(elements); + + if (order == UCOL_NULLORDER) { + break; + } + + cont = isContinuation(order); + order &= strengthMask; + + if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) { + if (strength >= UCOL_QUATERNARY) { + order &= UCOL_PRIMARYORDERMASK; + } else { + order = UCOL_IGNORABLE; + } + } + } while (order == UCOL_IGNORABLE); + + bufferMax += 1; + + if (cont) { + order |= UCOL_CONTINUATION_MARKER; + } + + ceb[offset].order = order; + ceb[offset].lowOffset = low; + ceb[offset].highOffset = high; + + return &ceb[offset]; +} + +int32_t Target::stringLength() +{ + if (targetString != NULL) { + return targetLength; + } + + return 0; +} + +UChar Target::charAt(int32_t offset) +{ + if (targetString != NULL) { + return targetBuffer[offset]; + } + + return 0x0000; +} + +void Target::setOffset(int32_t offset) +{ + UErrorCode status = U_ZERO_ERROR; + + bufferMin = 0; + 
bufferMax = 0; + + ucol_setOffset(elements, offset, &status); +} + +void Target::setLast(int32_t last) +{ + UErrorCode status = U_ZERO_ERROR; + + bufferMin = 0; + bufferMax = 1; + + ceb[0].order = UCOL_NULLORDER; + ceb[0].lowOffset = last; + ceb[0].highOffset = last; + + ucol_setOffset(elements, last, &status); +} + +int32_t Target::getOffset() +{ + return ucol_getOffset(elements); +} + +UBool Target::isBreakBoundary(int32_t offset) +{ + return ubrk_isBoundary(charBreakIterator, offset); +} + +int32_t Target::nextBreakBoundary(int32_t offset) +{ + return ubrk_following(charBreakIterator, offset); +} + +int32_t Target::nextSafeBoundary(int32_t offset) +{ + while (offset < targetLength) { + //UChar ch = charAt(offset); + UChar ch = targetBuffer[offset]; + + if (U_IS_LEAD(ch) || ! ucol_unsafeCP(ch, coll)) { + return offset; + } + + offset += 1; + } + + return targetLength; +} + +UBool Target::isIdentical(UnicodeString &pattern, int32_t start, int32_t end) +{ + if (strength < UCOL_IDENTICAL) { + return TRUE; + } + + UChar t2[32], p2[32]; + const UChar *pBuffer = pattern.getBuffer(); + int32_t pLength = pattern.length(); + int32_t length = end - start; + + UErrorCode status = U_ZERO_ERROR, status2 = U_ZERO_ERROR; + + int32_t decomplength = unorm_decompose(t2, ARRAY_SIZE(t2), + targetBuffer + start, length, + FALSE, 0, &status); + + // use separate status2 in case of buffer overflow + if (decomplength != unorm_decompose(p2, ARRAY_SIZE(p2), + pBuffer, pLength, + FALSE, 0, &status2)) { + return FALSE; // lengths are different + } + + // compare contents + UChar *text, *pat; + + if(U_SUCCESS(status)) { + text = t2; + pat = p2; + } else if(status == U_BUFFER_OVERFLOW_ERROR) { + status = U_ZERO_ERROR; + + // allocate one buffer for both decompositions + text = NEW_ARRAY(UChar, decomplength * 2); + + // Check for allocation failure. 
+ if (text == NULL) { + return FALSE; + } + + pat = text + decomplength; + + unorm_decompose(text, decomplength, targetBuffer + start, + length, FALSE, 0, &status); + + unorm_decompose(pat, decomplength, pBuffer, + pLength, FALSE, 0, &status); + } else { + // NFD failed, make sure that u_memcmp() does not overrun t2 & p2 + // and that we don't uprv_free() an undefined text pointer + text = pat = t2; + decomplength = 0; + } + + UBool result = (UBool)(u_memcmp(pat, text, decomplength) == 0); + + if(text != t2) { + DELETE_ARRAY(text); + } + + // return FALSE if NFD failed + return U_SUCCESS(status) && result; +} + +#define HASH_TABLE_SIZE 257 + +class BadCharacterTable : public UMemory +{ +public: + BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status); + ~BadCharacterTable(); + + int32_t operator[](uint32_t ce) const; + int32_t getMaxSkip() const; + int32_t minLengthInChars(int32_t index); + +private: + static int32_t hash(uint32_t ce); + + int32_t maxSkip; + int32_t badCharacterTable[HASH_TABLE_SIZE]; + + int32_t *minLengthCache; +}; + +BadCharacterTable::BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status) + : minLengthCache(NULL) +{ + int32_t plen = patternCEs.size(); + + // **** need a better way to deal with this **** + if (U_FAILURE(status) || plen == 0) { + return; + } + + int32_t *history = NEW_ARRAY(int32_t, plen); + + if (history == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + + for (int32_t i = 0; i < plen; i += 1) { + history[i] = -1; + } + + minLengthCache = NEW_ARRAY(int32_t, plen + 1); + + if (minLengthCache == NULL) { + DELETE_ARRAY(history); + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + + maxSkip = minLengthCache[0] = data->minLengthInChars(&patternCEs, 0, history); + + for(int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) { + badCharacterTable[j] = maxSkip; + } + + for(int32_t p = 1; p < plen; p += 1) { + minLengthCache[p] = data->minLengthInChars(&patternCEs, p, history); + + // Make sure 
this entry is not bigger than the previous one. + // Otherwise, we might skip too far in some cases. + if (minLengthCache[p] < 0 || minLengthCache[p] > minLengthCache[p - 1]) { + minLengthCache[p] = minLengthCache[p - 1]; + } + } + + minLengthCache[plen] = 0; + + for(int32_t p = 0; p < plen - 1; p += 1) { + badCharacterTable[hash(patternCEs[p])] = minLengthCache[p + 1]; + } + + DELETE_ARRAY(history); +} + +BadCharacterTable::~BadCharacterTable() +{ + DELETE_ARRAY(minLengthCache); +} + +int32_t BadCharacterTable::operator[](uint32_t ce) const +{ + return badCharacterTable[hash(ce)]; +} + +int32_t BadCharacterTable::getMaxSkip() const +{ + return maxSkip; +} + +int32_t BadCharacterTable::minLengthInChars(int32_t index) +{ + return minLengthCache[index]; +} + +int32_t BadCharacterTable::hash(uint32_t ce) +{ + return UCOL_PRIMARYORDER(ce) % HASH_TABLE_SIZE; +} + +class GoodSuffixTable : public UMemory +{ +public: + GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status); + ~GoodSuffixTable(); + + int32_t operator[](int32_t offset) const; + +private: + int32_t *goodSuffixTable; +}; + +GoodSuffixTable::GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status) + : goodSuffixTable(NULL) +{ + int32_t patlen = patternCEs.size(); + + // **** need a better way to deal with this **** + if (U_FAILURE(status) || patlen <= 0) { + return; + } + + int32_t *suff = NEW_ARRAY(int32_t, patlen); + int32_t start = patlen - 1, end = - 1; + int32_t maxSkip = badCharacterTable.getMaxSkip(); + + if (suff == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + + // initialze suff + suff[patlen - 1] = patlen; + + for (int32_t i = patlen - 2; i >= 0; i -= 1) { + // (i > start) means we're inside the last suffix match we found + // ((patlen - 1) - end) is how far the end of that match is from end of pattern + // (i - start) is how far we are from start of that match + // (i + (patlen - 1) - end) is index of same 
character at end of pattern + // so if any suffix match at that character doesn't extend beyond the last match, + // it's the suffix for this character as well + if (i > start && suff[i + patlen - 1 - end] < i - start) { + suff[i] = suff[i + patlen - 1 - end]; + } else { + start = end = i; + + int32_t s = patlen; + + while (start >= 0 && patternCEs[start] == patternCEs[--s]) { + start -= 1; + } + + suff[i] = end - start; + } + } + + // now build goodSuffixTable + goodSuffixTable = NEW_ARRAY(int32_t, patlen); + + if (goodSuffixTable == NULL) { + DELETE_ARRAY(suff); + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + + + // initialize entries to minLengthInChars of the pattern + for (int32_t i = 0; i < patlen; i += 1) { + goodSuffixTable[i] = maxSkip; + } + + int32_t prefix = 0; + + for (int32_t i = patlen - /*1*/ 2; i >= 0; i -= 1) { + if (suff[i] == i + 1) { + // this matching suffix is a prefix of the pattern + int32_t prefixSkip = badCharacterTable.minLengthInChars(i + 1); + + // for any mis-match before this suffix, we should skip + // so that the front of the pattern (i.e. the prefix) + // lines up with the front of the suffix. + // (patlen - 1 - i) is the start of the suffix + while (prefix < patlen - 1 - i) { + // value of maxSkip means never set... 
+ if (goodSuffixTable[prefix] == maxSkip) { + goodSuffixTable[prefix] = prefixSkip; + } + + prefix += 1; + } + } + } + + for (int32_t i = 0; i < patlen - 1; i += 1) { + goodSuffixTable[patlen - 1 - suff[i]] = badCharacterTable.minLengthInChars(i + 1); + } + + DELETE_ARRAY(suff); +} + +GoodSuffixTable::~GoodSuffixTable() +{ + DELETE_ARRAY(goodSuffixTable); +} + +int32_t GoodSuffixTable::operator[](int32_t offset) const +{ + return goodSuffixTable[offset]; +} + +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BoyerMooreSearch) + + +UBool BoyerMooreSearch::empty() +{ + return patCEs->size() <= 0; +} + +CollData *BoyerMooreSearch::getData() +{ + return data; +} + +CEList *BoyerMooreSearch::getPatternCEs() +{ + return patCEs; +} + +BadCharacterTable *BoyerMooreSearch::getBadCharacterTable() +{ + return badCharacterTable; +} + +GoodSuffixTable *BoyerMooreSearch::getGoodSuffixTable() +{ + return goodSuffixTable; +} + +BoyerMooreSearch::BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString, + UErrorCode &status) + : data(theData), patCEs(NULL), badCharacterTable(NULL), goodSuffixTable(NULL), pattern(patternString), target(NULL) +{ + + if (U_FAILURE(status)) { + return; + } + + UCollator *collator = data->getCollator(); + + patCEs = new CEList(collator, patternString, status); + + if (patCEs == NULL || U_FAILURE(status)) { + return; + } + + badCharacterTable = new BadCharacterTable(*patCEs, data, status); + + if (badCharacterTable == NULL || U_FAILURE(status)) { + return; + } + + goodSuffixTable = new GoodSuffixTable(*patCEs, *badCharacterTable, status); + + if (targetString != NULL) { + target = new Target(collator, targetString, patCEs->size(), status); + } +} + +BoyerMooreSearch::~BoyerMooreSearch() +{ + delete target; + delete goodSuffixTable; + delete badCharacterTable; + delete patCEs; +} + +void BoyerMooreSearch::setTargetString(const UnicodeString *targetString, UErrorCode &status) +{ + if (U_FAILURE(status)) { + return; + } + 
+ if (target == NULL) { + target = new Target(data->getCollator(), targetString, patCEs->size(), status); + } else { + target->setTargetString(targetString); + } +} + +// **** main flow of this code from Laura Werner's "Unicode Text Searching in Java" paper. **** +/* + * TODO: + * * deal with trailing (and leading?) ignorables. + * * Adding BoyerMooreSearch object slowed it down. How can we speed it up? + */ +UBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end) +{ + UCollator *coll = data->getCollator(); + int32_t plen = patCEs->size(); + int32_t tlen = target->stringLength(); + int32_t maxSkip = badCharacterTable->getMaxSkip(); + int32_t tOffset = offset + maxSkip; + + if (plen <= 0) { + // Searching for a zero length pattern always fails. + start = end = -1; + return FALSE; + } + + while (tOffset <= tlen) { + int32_t pIndex = plen - 1; + int32_t tIndex = 0; + int32_t lIndex = 0; + + if (tOffset < tlen) { + // **** we really want to skip ahead enough to **** + // **** be sure we get at least 1 non-ignorable **** + // **** CE after the end of the pattern. **** + int32_t next = target->nextSafeBoundary(tOffset + 1); + + target->setOffset(next); + + for (lIndex = 0; ; lIndex += 1) { + const CEI *cei = target->prevCE(lIndex); + int32_t low = cei->lowOffset; + int32_t high = cei->highOffset; + + if (high == 0 || (low < high && low <= tOffset)) { + if (low < tOffset) { + while (lIndex >= 0 && target->prevCE(lIndex)->highOffset == high) { + lIndex -= 1; + } + + if (high > tOffset) { + tOffset = high; + } + } + + break; + } + } + } else { + target->setLast(tOffset); + lIndex = 0; + } + + tIndex = ++lIndex; + + // Iterate backward until we hit the beginning of the pattern + while (pIndex >= 0) { + uint32_t pce = (*patCEs)[pIndex]; + const CEI *tcei = target->prevCE(tIndex++); + + + if (tcei->order != pce) { + // There is a mismatch at this position. Decide how far + // over to shift the pattern, then try again. 
+ + int32_t gsOffset = tOffset + (*goodSuffixTable)[pIndex]; +#ifdef EXTRA_CAUTIOUS + int32_t old = tOffset; +#endif + + tOffset += (*badCharacterTable)[tcei->order] - badCharacterTable->minLengthInChars(pIndex + 1); + + if (gsOffset > tOffset) { + tOffset = gsOffset; + } + +#ifdef EXTRA_CAUTIOUS + // Make sure we don't skip backwards... + if (tOffset <= old) { + tOffset = old + 1; + } +#endif + + break; + } + + pIndex -= 1; + } + + if (pIndex < 0) { + // We made it back to the beginning of the pattern, + // which means we matched it all. Return the location. + const CEI firstCEI = *target->prevCE(tIndex - 1); + const CEI lastCEI = *target->prevCE(lIndex); + int32_t mStart = firstCEI.lowOffset; + int32_t minLimit = lastCEI.lowOffset; + int32_t maxLimit = lastCEI.highOffset; + int32_t mLimit; + UBool found = TRUE; + + target->setOffset(/*tOffset*/maxLimit); + + const CEI nextCEI = *target->nextCE(0); + + if (nextCEI.lowOffset > maxLimit) { + maxLimit = nextCEI.lowOffset; + } + + if (nextCEI.lowOffset == nextCEI.highOffset && nextCEI.order != UCOL_NULLORDER) { + found = FALSE; + } + + if (! target->isBreakBoundary(mStart)) { + found = FALSE; + } + + if (firstCEI.lowOffset == firstCEI.highOffset) { + found = FALSE; + } + + mLimit = maxLimit; + if (minLimit < maxLimit) { + int32_t nbb = target->nextBreakBoundary(minLimit); + + if (nbb >= lastCEI.highOffset) { + mLimit = nbb; + } + } + + if (mLimit > maxLimit) { + found = FALSE; + } + + if (! target->isBreakBoundary(mLimit)) { + found = FALSE; + } + + if (! target->isIdentical(pattern, mStart, mLimit)) { + found = FALSE; + } + + if (found) { + start = mStart; + end = mLimit; + + return TRUE; + } + + tOffset += (*goodSuffixTable)[0]; // really? Maybe += 1 or += maxSkip? + } + // Otherwise, we're here because of a mismatch, so keep going.... 
+ } + + // no match + start = -1; + end = -1; + return FALSE; +} + +U_NAMESPACE_END + +#endif // #if !UCONFIG_NO_COLLATION diff --git a/icu4c/source/i18n/colldata.cpp b/icu4c/source/i18n/colldata.cpp new file mode 100644 index 0000000000..9860b62b34 --- /dev/null +++ b/icu4c/source/i18n/colldata.cpp @@ -0,0 +1,1104 @@ +/* + ****************************************************************************** + * Copyright (C) 1996-2009, International Business Machines * + * Corporation and others. All Rights Reserved. * + ****************************************************************************** + */ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_COLLATION + +#include "unicode/unistr.h" +#include "unicode/putil.h" +#include "unicode/usearch.h" + +#include "cmemory.h" +#include "unicode/coll.h" +#include "unicode/tblcoll.h" +#include "unicode/coleitr.h" +#include "unicode/ucoleitr.h" + +#include "unicode/regex.h" // TODO: make conditional on regexp being built. + +#include "unicode/uniset.h" +#include "unicode/uset.h" +#include "unicode/ustring.h" +#include "hash.h" +#include "uhash.h" +#include "ucln_in.h" +#include "ucol_imp.h" +#include "umutex.h" + +#include "unicode/colldata.h" + +U_NAMESPACE_BEGIN + +#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0])) +#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) +#define DELETE_ARRAY(array) uprv_free((void *) (array)) +#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0]) + +static inline USet *uset_openEmpty() +{ + return uset_open(1, 0); +} + +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CEList) + +#ifdef INSTRUMENT_CELIST +int32_t CEList::_active = 0; +int32_t CEList::_histogram[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +#endif + +CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status) + : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0) +{ + UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), 
string.length(), &status); + UCollationStrength strength = ucol_getStrength(coll); + UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED; + uint32_t variableTop = ucol_getVariableTop(coll, &status); + uint32_t strengthMask = 0; + int32_t order; + + if (U_FAILURE(status)) { + return; + } + + // **** only set flag if string has Han(gul) **** + ucol_forceHanImplicit(elems, &status); + + switch (strength) + { + default: + strengthMask |= UCOL_TERTIARYORDERMASK; + /* fall through */ + + case UCOL_SECONDARY: + strengthMask |= UCOL_SECONDARYORDERMASK; + /* fall through */ + + case UCOL_PRIMARY: + strengthMask |= UCOL_PRIMARYORDERMASK; + } + +#ifdef INSTRUMENT_CELIST + _active += 1; + _histogram[0] += 1; +#endif + + ces = ceBuffer; + + while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) { + UBool cont = isContinuation(order); + + order &= strengthMask; + + if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) { + if (strength >= UCOL_QUATERNARY) { + order &= UCOL_PRIMARYORDERMASK; + } else { + order = UCOL_IGNORABLE; + } + } + + if (order == UCOL_IGNORABLE) { + continue; + } + + if (cont) { + order |= UCOL_CONTINUATION_MARKER; + } + + add(order, status); + } + + ucol_closeElements(elems); +} + +CEList::~CEList() +{ +#ifdef INSTRUMENT_CELIST + _active -= 1; +#endif + + if (ces != ceBuffer) { + DELETE_ARRAY(ces); + } +} + +void CEList::add(uint32_t ce, UErrorCode &status) +{ + if (U_FAILURE(status)) { + return; + } + + if (listSize >= listMax) { + int32_t newMax = listMax + CELIST_BUFFER_SIZE; + +#ifdef INSTRUMENT_CELIST + _histogram[listSize / CELIST_BUFFER_SIZE] += 1; +#endif + + uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax); + + if (newCEs == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + + uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t)); + + if (ces != ceBuffer) { + DELETE_ARRAY(ces); + } + + ces = newCEs; + listMax = newMax; + } + + ces[listSize++] = ce; +} + +uint32_t 
CEList::get(int32_t index) const +{ + if (index >= 0 && index < listSize) { + return ces[index]; + } + + return UCOL_NULLORDER; +} + +uint32_t &CEList::operator[](int32_t index) const +{ + return ces[index]; +} + +UBool CEList::matchesAt(int32_t offset, const CEList *other) const +{ + if (other == NULL || listSize - offset < other->size()) { + return FALSE; + } + + for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) { + if (ces[i] != (*other)[j]) { + return FALSE; + } + } + + return TRUE; +} + +int32_t CEList::size() const +{ + return listSize; +} + +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringList) + +#ifdef INSTRUMENT_STRING_LIST +int32_t StringList::_lists = 0; +int32_t StringList::_strings = 0; +int32_t StringList::_histogram[101] = {0}; +#endif + +StringList::StringList(UErrorCode &status) + : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0) +{ + if (U_FAILURE(status)) { + return; + } + + strings = new UnicodeString [listMax]; + + if (strings == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + +#ifdef INSTRUMENT_STRING_LIST + _lists += 1; + _histogram[0] += 1; +#endif +} + +StringList::~StringList() +{ + delete[] strings; +} + +void StringList::add(const UnicodeString *string, UErrorCode &status) +{ + if (U_FAILURE(status)) { + return; + } + +#ifdef INSTRUMENT_STRING_LIST + _strings += 1; +#endif + + if (listSize >= listMax) { + int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE; + + UnicodeString *newStrings = new UnicodeString[newMax]; + + uprv_memcpy(newStrings, strings, listSize * sizeof(UnicodeString)); + +#ifdef INSTRUMENT_STRING_LIST + int32_t _h = listSize / STRING_LIST_BUFFER_SIZE; + + if (_h > 100) { + _h = 100; + } + + _histogram[_h] += 1; +#endif + + delete[] strings; + strings = newStrings; + listMax = newMax; + } + + // The ctor initialized all the strings in + // the array to empty strings, so this + // is the same as copying the source string. 
+ strings[listSize++].append(*string); +} + +void StringList::add(const UChar *chars, int32_t count, UErrorCode &status) +{ + const UnicodeString string(chars, count); + + add(&string, status); +} + +const UnicodeString *StringList::get(int32_t index) const +{ + if (index >= 0 && index < listSize) { + return &strings[index]; + } + + return NULL; +} + +int32_t StringList::size() const +{ + return listSize; +} + + +U_CFUNC void deleteStringList(void *obj); + +class CEToStringsMap : public UMemory +{ +public: + + CEToStringsMap(UErrorCode &status); + ~CEToStringsMap(); + + void put(uint32_t ce, UnicodeString *string, UErrorCode &status); + StringList *getStringList(uint32_t ce) const; + +private: + + void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status); + UHashtable *map; +}; + +CEToStringsMap::CEToStringsMap(UErrorCode &status) + : map(NULL) +{ + if (U_FAILURE(status)) { + return; + } + + map = uhash_open(uhash_hashLong, uhash_compareLong, + uhash_compareCaselessUnicodeString, + &status); + + if (U_FAILURE(status)) { + return; + } + + uhash_setValueDeleter(map, deleteStringList); +} + +CEToStringsMap::~CEToStringsMap() +{ + uhash_close(map); +} + +void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status) +{ + StringList *strings = getStringList(ce); + + if (strings == NULL) { + strings = new StringList(status); + + if (strings == NULL || U_FAILURE(status)) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + + putStringList(ce, strings, status); + } + + strings->add(string, status); +} + +StringList *CEToStringsMap::getStringList(uint32_t ce) const +{ + return (StringList *) uhash_iget(map, ce); +} + +void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status) +{ + uhash_iput(map, ce, (void *) stringList, &status); +} + +U_CFUNC void deleteStringList(void *obj) +{ + StringList *strings = (StringList *) obj; + + delete strings; +} + +U_CFUNC void deleteCEList(void *obj); +U_CFUNC void 
deleteUnicodeStringKey(void *obj); + +class StringToCEsMap : public UMemory +{ +public: + StringToCEsMap(UErrorCode &status); + ~StringToCEsMap(); + + void put(const UnicodeString *string, const CEList *ces, UErrorCode &status); + const CEList *get(const UnicodeString *string); + void free(const CEList *list); + +private: + + + UHashtable *map; +}; + +StringToCEsMap::StringToCEsMap(UErrorCode &status) + : map(NULL) +{ + if (U_FAILURE(status)) { + return; + } + + map = uhash_open(uhash_hashUnicodeString, + uhash_compareUnicodeString, + uhash_compareLong, + &status); + + if (U_FAILURE(status)) { + return; + } + + uhash_setValueDeleter(map, deleteCEList); + uhash_setKeyDeleter(map, deleteUnicodeStringKey); +} + +StringToCEsMap::~StringToCEsMap() +{ + uhash_close(map); +} + +void StringToCEsMap::put(const UnicodeString *string, const CEList *ces, UErrorCode &status) +{ + uhash_put(map, (void *) string, (void *) ces, &status); +} + +const CEList *StringToCEsMap::get(const UnicodeString *string) +{ + return (const CEList *) uhash_get(map, string); +} + +U_CFUNC void deleteCEList(void *obj) +{ + CEList *list = (CEList *) obj; + + delete list; +} + +U_CFUNC void deleteUnicodeStringKey(void *obj) +{ + UnicodeString *key = (UnicodeString *) obj; + + delete key; +} + +class CollDataCacheEntry : public UMemory +{ +public: + CollDataCacheEntry(CollData *theData); + ~CollDataCacheEntry(); + + CollData *data; + int32_t refCount; +}; + +CollDataCacheEntry::CollDataCacheEntry(CollData *theData) + : data(theData), refCount(1) +{ + // nothing else to do +} + +CollDataCacheEntry::~CollDataCacheEntry() +{ + // check refCount? 
+ delete data; +} + +class CollDataCache : public UMemory +{ +public: + CollDataCache(UErrorCode &status); + ~CollDataCache(); + + CollData *get(UCollator *collator, UErrorCode &status); + void unref(CollData *collData); + + void flush(); + +private: + static char *getKey(UCollator *collator, char *keyBuffer, int32_t *charBufferLength); + static void deleteKey(char *key); + + UMTX lock; + UHashtable *cache; +}; + +U_CFUNC void deleteChars(void *obj) +{ + char *chars = (char *) obj; + + // All the key strings are owned by the + // CollData objects and don't need to + // be freed here. + //DELETE_ARRAY(chars); +} + +U_CFUNC void deleteCollDataCacheEntry(void *obj) +{ + CollDataCacheEntry *entry = (CollDataCacheEntry *) obj; + + delete entry; +} + +CollDataCache::CollDataCache(UErrorCode &status) + : lock(0), cache(NULL) +{ + if (U_FAILURE(status)) { + return; + } + + umtx_init(&lock); + + cache = uhash_open(uhash_hashChars, uhash_compareChars, uhash_compareLong, &status); + + if (U_FAILURE(status)) { + return; + } + + uhash_setValueDeleter(cache, deleteCollDataCacheEntry); + uhash_setKeyDeleter(cache, deleteChars); +} + +CollDataCache::~CollDataCache() +{ + umtx_lock(&lock); + uhash_close(cache); + cache = NULL; + umtx_unlock(&lock); + + umtx_destroy(&lock); +} + +CollData *CollDataCache::get(UCollator *collator, UErrorCode &status) +{ + char keyBuffer[KEY_BUFFER_SIZE]; + int32_t keyLength = KEY_BUFFER_SIZE; + char *key = getKey(collator, keyBuffer, &keyLength); + CollData *result = NULL, *newData = NULL; + CollDataCacheEntry *entry = NULL, *newEntry = NULL; + + umtx_lock(&lock); + entry = (CollDataCacheEntry *) uhash_get(cache, key); + + if (entry == NULL) { + umtx_unlock(&lock); + + newData = new CollData(collator, key, keyLength, status); + newEntry = new CollDataCacheEntry(newData); + + if (U_FAILURE(status) || newData == NULL || newEntry == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + + umtx_lock(&lock); + entry = (CollDataCacheEntry *) 
uhash_get(cache, key); + + if (entry == NULL) { + uhash_put(cache, newData->key, newEntry, &status); + umtx_unlock(&lock); + + if (U_FAILURE(status)) { + delete newEntry; + delete newData; + + return NULL; + } + + return newData; + } + } + + result = entry->data; + entry->refCount += 1; + umtx_unlock(&lock); + + if (key != keyBuffer) { + deleteKey(key); + } + + if (newEntry != NULL) { + delete newEntry; + delete newData; + } + + return result; +} + +void CollDataCache::unref(CollData *collData) +{ + CollDataCacheEntry *entry = NULL; + + umtx_lock(&lock); + entry = (CollDataCacheEntry *) uhash_get(cache, collData->key); + + if (entry != NULL) { + entry->refCount -= 1; + } + umtx_unlock(&lock); +} + +char *CollDataCache::getKey(UCollator *collator, char *keyBuffer, int32_t *keyBufferLength) +{ + UErrorCode status = U_ZERO_ERROR; + int32_t len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status); + + if (len >= *keyBufferLength) { + *keyBufferLength = (len + 2) & ~1; // round to even length, leaving room for terminating null + keyBuffer = NEW_ARRAY(char, *keyBufferLength); + status = U_ZERO_ERROR; + + len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status); + } + + keyBuffer[len] = '\0'; + + return keyBuffer; +} + +void CollDataCache::flush() +{ + const UHashElement *element; + int32_t pos = -1; + + umtx_lock(&lock); + while ((element = uhash_nextElement(cache, &pos)) != NULL) { + CollDataCacheEntry *entry = (CollDataCacheEntry *) element->value.pointer; + + if (entry->refCount <= 0) { + uhash_removeElement(cache, element); + } + } + umtx_unlock(&lock); +} + +void CollDataCache::deleteKey(char *key) +{ + DELETE_ARRAY(key); +} + +U_CDECL_BEGIN +static UBool coll_data_cleanup(void) { + CollData::freeCollDataCache(); + return TRUE; +} +U_CDECL_END + +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData) + +CollData::CollData() +{ + // nothing +} + +#define CLONE_COLLATOR + +//#define CACHE_CELISTS 
+CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength, UErrorCode &status) + : coll(NULL), charsToCEList(NULL), ceToCharsStartingWith(NULL), key(NULL) +{ + // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]] + // i.e. other, control, private use, format, surrogate + U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20); + U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20); + USet *charsToTest = uset_openPattern(test_pattern, 20, &status); + + // Han ext. A, Han, Jamo, Hangul, Han Ext. B + // i.e. all the characers we handle implicitly + U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70); + U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70); + USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status); + + if (U_FAILURE(status)) { + return; + } + + USet *expansions = uset_openEmpty(); + USet *contractions = uset_openEmpty(); + int32_t itemCount; + +#ifdef CACHE_CELISTS + charsToCEList = new StringToCEsMap(status); + + if (U_FAILURE(status)) { + goto bail; + } +#else + charsToCEList = NULL; +#endif + + ceToCharsStartingWith = new CEToStringsMap(status); + + if (U_FAILURE(status)) { + goto bail; + } + + if (cacheKeyLength > KEY_BUFFER_SIZE) { + key = NEW_ARRAY(char, cacheKeyLength); + + if (key == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + goto bail; + } + } else { + key = keyBuffer; + } + + ARRAY_COPY(key, cacheKey, cacheKeyLength); + +#ifdef CLONE_COLLATOR + coll = ucol_safeClone(collator, NULL, NULL, &status); + + if (U_FAILURE(status)) { + goto bail; + } +#else + coll = collator; +#endif + + ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); + + uset_addAll(charsToTest, contractions); + uset_addAll(charsToTest, expansions); + uset_removeAll(charsToTest, charsToRemove); + + itemCount = uset_getItemCount(charsToTest); + for(int32_t item = 0; item < itemCount; item += 
1) { + UChar32 start = 0, end = 0; + UChar buffer[16]; + int32_t len = uset_getItem(charsToTest, item, &start, &end, + buffer, 16, &status); + + if (len == 0) { + for (UChar32 ch = start; ch <= end; ch += 1) { + UnicodeString *st = new UnicodeString(ch); + + if (st == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + break; + } + + CEList *ceList = new CEList(coll, *st, status); + + ceToCharsStartingWith->put(ceList->get(0), st, status); + +#ifdef CACHE_CELISTS + charsToCEList->put(st, ceList, status); +#else + delete ceList; + delete st; +#endif + } + } else if (len > 0) { + UnicodeString *st = new UnicodeString(buffer, len); + + if (st == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + break; + } + + CEList *ceList = new CEList(coll, *st, status); + + ceToCharsStartingWith->put(ceList->get(0), st, status); + +#ifdef CACHE_CELISTS + charsToCEList->put(st, ceList, status); +#else + delete ceList; + delete st; +#endif + } else { + // shouldn't happen... + } + + if (U_FAILURE(status)) { + break; + } + } + +bail: + uset_close(contractions); + uset_close(expansions); + uset_close(charsToRemove); + uset_close(charsToTest); + + if (U_FAILURE(status)) { + return; + } + + UChar hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A, + UCOL_FIRST_HAN_B_LEAD, UCOL_FIRST_HAN_B_TRAIL, UCOL_LAST_HAN_B_LEAD, UCOL_LAST_HAN_B_TRAIL}; + UChar jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO}; + UnicodeString hanString(hanRanges, ARRAY_SIZE(hanRanges)); + UnicodeString jamoString(jamoRanges, ARRAY_SIZE(jamoRanges)); + CEList hanList(coll, hanString, status); + CEList jamoList(coll, jamoString, status); + int32_t j = 0; + + if (U_FAILURE(status)) { + return; + } + + for (int32_t c = 0; c < jamoList.size(); c += 1) { + uint32_t jce = jamoList[c]; + + if (! 
isContinuation(jce)) { + jamoLimits[j++] = jce; + } + } + + jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT); + + minHan = 0xFFFFFFFF; + maxHan = 0; + + for(int32_t h = 0; h < hanList.size(); h += 2) { + uint32_t han = (uint32_t) hanList[h]; + + if (han < minHan) { + minHan = han; + } + + if (han > maxHan) { + maxHan = han; + } + } + + maxHan += (1 << UCOL_PRIMARYORDERSHIFT); +} + +CollData::~CollData() +{ +#ifdef CLONE_COLLATOR + ucol_close(coll); +#endif + + if (key != keyBuffer) { + DELETE_ARRAY(key); + } + + delete ceToCharsStartingWith; + +#ifdef CACHE_CELISTS + delete charsToCEList; +#endif +} + +UCollator *CollData::getCollator() const +{ + return coll; +} + +const StringList *CollData::getStringList(int32_t ce) const +{ + return ceToCharsStartingWith->getStringList(ce); +} + +const CEList *CollData::getCEList(const UnicodeString *string) const +{ +#ifdef CACHE_CELISTS + return charsToCEList->get(string); +#else + UErrorCode status = U_ZERO_ERROR; + const CEList *list = new CEList(coll, *string, status); + + if (U_FAILURE(status)) { + delete list; + list = NULL; + } + + return list; +#endif +} + +void CollData::freeCEList(const CEList *list) +{ +#ifndef CACHE_CELISTS + delete list; +#endif +} + +int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const +{ + // find out shortest string for the longest sequence of ces. + // this can probably be folded with the minLengthCache... 
+ + if (history[offset] >= 0) { + return history[offset]; + } + + uint32_t ce = ceList->get(offset); + int32_t maxOffset = ceList->size(); + int32_t shortestLength = INT32_MAX; + const StringList *strings = ceToCharsStartingWith->getStringList(ce); + + if (strings != NULL) { + int32_t stringCount = strings->size(); + + for (int32_t s = 0; s < stringCount; s += 1) { + const UnicodeString *string = strings->get(s); +#ifdef CACHE_CELISTS + const CEList *ceList2 = charsToCEList->get(string); +#else + UErrorCode status = U_ZERO_ERROR; + const CEList *ceList2 = new CEList(coll, *string, status); + + if (U_FAILURE(status)) { + delete ceList2; + ceList2 = NULL; + } +#endif + + // A NULL list (lookup or construction failed) can never match. + if (ceList2 != NULL && ceList->matchesAt(offset, ceList2)) { + int32_t clength = ceList2->size(); + int32_t slength = string->length(); + int32_t roffset = offset + clength; + int32_t rlength = 0; + + if (roffset < maxOffset) { + rlength = minLengthInChars(ceList, roffset, history); + + if (rlength <= 0) { + // ignore any dead ends +#ifndef CACHE_CELISTS + // release the per-iteration list before skipping ahead + delete ceList2; +#endif + continue; + } + } + + if (shortestLength > slength + rlength) { + shortestLength = slength + rlength; + } + } + +#ifndef CACHE_CELISTS + delete ceList2; +#endif + } + } + + if (shortestLength == INT32_MAX) { + // No matching strings at this offset. See if + // the CE is in a range we can handle manually. + if (ce >= minHan && ce < maxHan) { + // all han have implicit orders which + // generate two CEs. 
+ int32_t roffset = offset + 2; + int32_t rlength = 0; + + //history[roffset++] = -1; + //history[roffset++] = 1; + + if (roffset < maxOffset) { + rlength = minLengthInChars(ceList, roffset, history); + } + + if (rlength < 0) { + return -1; + } + + shortestLength = 1 + rlength; + goto have_shortest; + } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) { + int32_t roffset = offset; + int32_t rlength = 0; + + // **** this loop may not handle archaic Hangul correctly **** + for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) { + uint32_t jce = ceList->get(roffset); + + // Some Jamo have 24-bit primary order; skip the + // 2nd CE. This should always be OK because if + // we're still in the loop all we've seen are + // a series of Jamo in LVT order. + if (isContinuation(jce)) { + continue; + } + + if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) { + break; + } + } + + if (roffset == offset) { + // we started with a non-L Jamo... + // just say it comes from a single character + roffset += 1; + + // See if the single Jamo has a 24-bit order. + if (roffset < maxOffset && isContinuation(ceList->get(roffset))) { + roffset += 1; + } + } + + if (roffset < maxOffset) { + rlength = minLengthInChars(ceList, roffset, history); + } + + if (rlength < 0) { + return -1; + } + + shortestLength = 1 + rlength; + goto have_shortest; + } + + // Can't handle it manually either. Just move on. 
+ return -1; + } + +have_shortest: + history[offset] = shortestLength; + + return shortestLength; +} + +int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const +{ + int32_t clength = ceList->size(); + int32_t *history = NEW_ARRAY(int32_t, clength); + + for (int32_t i = 0; i < clength; i += 1) { + history[i] = -1; + } + + int32_t minLength = minLengthInChars(ceList, offset, history); + + DELETE_ARRAY(history); + + return minLength; +} + +CollData *CollData::open(UCollator *collator, UErrorCode &status) +{ + if (U_FAILURE(status)) { + return NULL; + } + + CollDataCache *cache = getCollDataCache(); + + return cache->get(collator, status); +} + +void CollData::close(CollData *collData) +{ + CollDataCache *cache = getCollDataCache(); + + cache->unref(collData); +} + +CollDataCache *CollData::collDataCache = NULL; + +CollDataCache *CollData::getCollDataCache() +{ + UErrorCode status = U_ZERO_ERROR; + CollDataCache *cache = NULL; + + UMTX_CHECK(NULL, collDataCache, cache); + + if (cache == NULL) { + cache = new CollDataCache(status); + + if (U_FAILURE(status)) { + delete cache; + return NULL; + } + + umtx_lock(NULL); + if (collDataCache == NULL) { + collDataCache = cache; + + ucln_i18n_registerCleanup(UCLN_I18N_COLL_DATA, coll_data_cleanup); + } + umtx_unlock(NULL); + + if (collDataCache != cache) { + delete cache; + } + } + + return collDataCache; +} + +void CollData::freeCollDataCache() +{ + CollDataCache *cache = NULL; + + UMTX_CHECK(NULL, collDataCache, cache); + + if (cache != NULL) { + umtx_lock(NULL); + if (collDataCache != NULL) { + collDataCache = NULL; + } else { + cache = NULL; + } + umtx_unlock(NULL); + + delete cache; + } +} + +void CollData::flushCollDataCache() +{ + CollDataCache *cache = NULL; + + UMTX_CHECK(NULL, collDataCache, cache); + + // **** this will fail if the another **** + // **** thread deletes the cache here **** + if (cache != NULL) { + cache->flush(); + } +} + +U_NAMESPACE_END + +#endif // #if !UCONFIG_NO_COLLATION 
diff --git a/icu4c/source/i18n/i18n.vcproj b/icu4c/source/i18n/i18n.vcproj index ea6e0b6dd1..3d58425c76 100644 --- a/icu4c/source/i18n/i18n.vcproj +++ b/icu4c/source/i18n/i18n.vcproj @@ -408,6 +408,40 @@ + + + + + + + + + + + + + + @@ -504,6 +538,23 @@ /> + + + + + + + diff --git a/icu4c/source/i18n/ucln_in.h b/icu4c/source/i18n/ucln_in.h index d133db0b91..82de8389bc 100644 --- a/icu4c/source/i18n/ucln_in.h +++ b/icu4c/source/i18n/ucln_in.h @@ -1,7 +1,7 @@ /* ****************************************************************************** * * -* Copyright (C) 2001-2008, International Business Machines * +* Copyright (C) 2001-2009, International Business Machines * * Corporation and others. All Rights Reserved. * * * ****************************************************************************** @@ -45,6 +45,7 @@ typedef enum ECleanupI18NType { UCLN_I18N_UCOL_RES, UCLN_I18N_UCOL_BLD, UCLN_I18N_CSDET, + UCLN_I18N_COLL_DATA, UCLN_I18N_COUNT /* This must be last */ } ECleanupI18NType; diff --git a/icu4c/source/i18n/ucol.cpp b/icu4c/source/i18n/ucol.cpp index d22684dd1f..c9a751c36a 100644 --- a/icu4c/source/i18n/ucol.cpp +++ b/icu4c/source/i18n/ucol.cpp @@ -1,6 +1,6 @@ /* ******************************************************************************* -* Copyright (C) 1996-2008, International Business Machines +* Copyright (C) 1996-2009, International Business Machines * Corporation and others. All Rights Reserved. 
******************************************************************************* * file name: ucol.cpp @@ -123,7 +123,6 @@ uprv_init_collIterate(const UCollator *collator, const UChar *sourceString, IInit_collIterate(collator, sourceString, sourceLen, s); } - /** * Backup the state of the collIterate struct data * @param data collIterate to backup @@ -1499,10 +1498,30 @@ inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou } else { - order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); + // Always use UCA for Han, Hangul + // (Han extension A is before main Han block) + // **** Han compatibility chars ?? **** + if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 && + (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) { + if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) { + // between the two target ranges; do normal lookup + // **** this range is YI, Modifier tone letters, **** + // **** Latin-D, Syloti Nagari, Phagas-pa. **** + // **** Latin-D might be tailored, so we need to **** + // **** do the normal lookup for these guys. **** + order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); + } else { + // in one of the target ranges; use UCA + order = UCOL_NOT_FOUND; + } + } else { + order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); + } + if(order > UCOL_NOT_FOUND) { /* if a CE is special */ order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */ } + if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */ /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */ order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); @@ -1939,7 +1958,23 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data, result = coll->latinOneMapping[ch]; } else { - result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); + // Always use UCA for [3400..9FFF], [AC00..D7AF] + // **** [FA0E..FA2F] ?? 
**** + if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 && + (ch >= 0x3400 && ch <= 0xD7AF)) { + if (ch > 0x9FFF && ch < 0xAC00) { + // between the two target ranges; do normal lookup + // **** this range is YI, Modifier tone letters, **** + // **** Latin-D, Syloti Nagari, Phagas-pa. **** + // **** Latin-D might be tailored, so we need to **** + // **** do the normal lookup for these guys. **** + result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); + } else { + result = UCOL_NOT_FOUND; + } + } else { + result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); + } } if (result > UCOL_NOT_FOUND) { result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status); @@ -3545,38 +3580,12 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, int32_t offsetBias; -#if 0 - if (source->offsetReturn != NULL) { - source->offsetStore = source->offsetReturn - noChars; - } - // **** doesn't work if using iterator **** if (source->flags & UCOL_ITER_INNORMBUF) { - if (source->fcdPosition == NULL) { - offsetBias = 0; - } else { - offsetBias = (int32_t)(source->fcdPosition - source->string); - } - } else { - offsetBias = (int32_t)(source->pos - source->string); - } - -#else - // **** doesn't work if using iterator **** - if (source->flags & UCOL_ITER_INNORMBUF) { -#if 1 offsetBias = -1; -#else - if (source->fcdPosition == NULL) { - offsetBias = 0; - } else { - offsetBias = (int32_t)(source->fcdPosition - source->string); - } -#endif } else { offsetBias = (int32_t)(source->pos - source->string); } -#endif /* a new collIterate is used to simplify things, since using the current collIterate will mean that the forward and backwards iteration will @@ -3584,9 +3593,9 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate temp; int32_t rawOffset; - //IInit_collIterate(coll, UCharOffset, -1, &temp); IInit_collIterate(coll, UCharOffset, noChars, &temp); temp.flags &= ~UCOL_ITER_NORM; + temp.flags |= source->flags & 
UCOL_FORCE_HAN_IMPLICIT; rawOffset = temp.pos - temp.string; // should always be zero? CE = ucol_IGetNextCE(coll, &temp, status); @@ -3679,7 +3688,12 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, } } - rawOffset = temp.pos - temp.string; + if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) { + rawOffset = temp.fcdPosition - temp.string; + } else { + rawOffset = temp.pos - temp.string; + } + CE = ucol_IGetNextCE(coll, &temp, status); } @@ -4136,29 +4150,6 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, } case IMPLICIT_TAG: /* everything that is not defined otherwise */ -#if 0 - if (source->offsetBuffer == NULL) { - source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE; - source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE); - source->offsetStore = source->offsetBuffer; - } - - // **** doesn't work if using iterator **** - if (source->flags & UCOL_ITER_INNORMBUF) { - source->offsetRepeatCount = 1; - } else { - int32_t firstOffset = (int32_t)(source->pos - source->string); - - *(source->offsetStore++) = firstOffset; - *(source->offsetStore++) = firstOffset + 1; - - source->offsetReturn = source->offsetStore - 1; - if (source->offsetReturn == source->offsetBuffer) { - source->offsetStore = source->offsetBuffer; - } - } -#endif - return getPrevImplicit(ch, source); // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function diff --git a/icu4c/source/i18n/ucol_imp.h b/icu4c/source/i18n/ucol_imp.h index 9ed7f3f12f..0e1736f906 100644 --- a/icu4c/source/i18n/ucol_imp.h +++ b/icu4c/source/i18n/ucol_imp.h @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 1998-2008, International Business Machines +* Copyright (C) 1998-2009, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* @@ -260,6 +260,8 @@ minimum number for special Jamo /* by index */ #define UCOL_USE_ITERATOR 64 +#define UCOL_FORCE_HAN_IMPLICIT 128 + #define NFC_ZERO_CC_BLOCK_LIMIT_ 0x300 typedef struct collIterate { @@ -390,6 +392,29 @@ uprv_init_pce(const struct UCollationElements *elems); (((uint32_t)(ch) - 0x1161) <= (0x1175 - 0x1161)) || \ (((uint32_t)(ch) - 0x11A8) <= (0x11C2 - 0x11A8))) +/* Han character ranges (each FIRST/LAST pair is an inclusive code point range) */ +#define UCOL_FIRST_HAN 0x4E00 +#define UCOL_LAST_HAN 0x9FFF +#define UCOL_FIRST_HAN_A 0x3400 +#define UCOL_LAST_HAN_A 0x4DBF +#define UCOL_FIRST_HAN_COMPAT 0xFA0E +#define UCOL_LAST_HAN_COMPAT 0xFA2F + +/* Han extension B is in plane 2 */ +#define UCOL_FIRST_HAN_B_LEAD 0xD840 +#define UCOL_FIRST_HAN_B_TRAIL 0xDC00 +#define UCOL_LAST_HAN_B_LEAD 0xD869 +#define UCOL_LAST_HAN_B_TRAIL 0xDEDF + +/* Hangul range */ +#define UCOL_FIRST_HANGUL 0xAC00 +#define UCOL_LAST_HANGUL 0xD7AF + +/* Jamo ranges */ +#define UCOL_FIRST_L_JAMO 0x1100 +#define UCOL_FIRST_V_JAMO 0x1161 +#define UCOL_FIRST_T_JAMO 0x11A8 +#define UCOL_LAST_T_JAMO 0x11F9 #if 0 diff --git a/icu4c/source/i18n/ucol_sit.cpp b/icu4c/source/i18n/ucol_sit.cpp index 96fc7b8aef..9c6df7cd89 100644 --- a/icu4c/source/i18n/ucol_sit.cpp +++ b/icu4c/source/i18n/ucol_sit.cpp @@ -1,6 +1,6 @@ /* ******************************************************************************* -* Copyright (C) 2004-2008, International Business Machines +* Copyright (C) 2004-2009, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * file name: ucol_sit.cpp @@ -578,15 +578,15 @@ ucol_getShortDefinitionString(const UCollator *coll, if(elementSize) { // we should probably canonicalize here... 
elementSize = uloc_getLanguage(locBuff, tempbuff, internalBufferSize, status); - appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, languageArg); + appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, languageArg); elementSize = uloc_getCountry(locBuff, tempbuff, internalBufferSize, status); - appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, regionArg); + appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, regionArg); elementSize = uloc_getScript(locBuff, tempbuff, internalBufferSize, status); - appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, scriptArg); + appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, scriptArg); elementSize = uloc_getVariant(locBuff, tempbuff, internalBufferSize, status); - appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, variantArg); + appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, variantArg); elementSize = uloc_getKeywordValue(locBuff, "collation", tempbuff, internalBufferSize, status); - appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, keywordArg); + appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, keywordArg); } int32_t i = 0; @@ -597,7 +597,7 @@ ucol_getShortDefinitionString(const UCollator *coll, if(attribute != UCOL_DEFAULT) { char letter = ucol_sit_attributeValueToLetter(attribute, status); appendShortStringElement(&letter, 1, - buffer, &resultSize, capacity, options[i].optionStart); + buffer, &resultSize, /*capacity*/internalBufferSize, options[i].optionStart); } } } diff --git a/icu4c/source/i18n/ucoleitr.cpp b/icu4c/source/i18n/ucoleitr.cpp index 0b7751e489..ee1e751550 100644 --- a/icu4c/source/i18n/ucoleitr.cpp +++ 
b/icu4c/source/i18n/ucoleitr.cpp @@ -1,6 +1,6 @@ /* ****************************************************************************** -* Copyright (C) 2001-2008, International Business Machines +* Copyright (C) 2001-2009, International Business Machines * Corporation and others. All Rights Reserved. ****************************************************************************** * @@ -263,7 +263,14 @@ inline uint64_t processCE(UCollationElements *elems, uint32_t ce) primary = ucol_primaryOrder(ce); } - // Continuation? + // **** This should probably handle continuations too. **** + // **** That means that we need 24 bits for the primary **** + // **** instead of the 16 that we're currently using. **** + // **** So we can lay out the 64 bits as: 24.12.12.16. **** + // **** Another complication with continuations is that **** + // **** the *second* CE is marked as a continuation, so **** + // **** we always have to peek ahead to know how long **** + // **** the primary is... **** if (elems->pce->toShift && (elems->pce->variableTop > ce && primary != 0) || (elems->pce->isShifted && primary == 0)) { @@ -285,7 +292,6 @@ inline uint64_t processCE(UCollationElements *elems, uint32_t ce) elems->pce->isShifted = FALSE; } - return primary << 48 | secondary << 32 | tertiary << 16 | quaternary; } @@ -332,6 +338,7 @@ ucol_openElements(const UCollator *coll, return result; } + U_CAPI void U_EXPORT2 ucol_closeElements(UCollationElements *elems) { @@ -375,7 +382,7 @@ ucol_reset(UCollationElements *elems) ci->endp = ci->string + u_strlen(ci->string); } ci->CEpos = ci->toReturn = ci->CEs; - ci->flags = UCOL_ITER_HASLEN; + ci->flags = (ci->flags & UCOL_FORCE_HAN_IMPLICIT) | UCOL_ITER_HASLEN; if (ci->coll->normalizationMode == UCOL_ON) { ci->flags |= UCOL_ITER_NORM; } @@ -391,6 +398,21 @@ ucol_reset(UCollationElements *elems) ci->offsetRepeatCount = ci->offsetRepeatValue = 0; } +U_CAPI void U_EXPORT2 +ucol_forceHanImplicit(UCollationElements *elems, UErrorCode *status) +{ + if 
(U_FAILURE(*status)) { + return; + } + + if (elems == NULL) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + elems->iteratordata_.flags |= UCOL_FORCE_HAN_IMPLICIT; +} + U_CAPI int32_t U_EXPORT2 ucol_next(UCollationElements *elems, UErrorCode *status) diff --git a/icu4c/source/i18n/unicode/bms.h b/icu4c/source/i18n/unicode/bms.h new file mode 100644 index 0000000000..bcc138dc8a --- /dev/null +++ b/icu4c/source/i18n/unicode/bms.h @@ -0,0 +1,265 @@ +/* + * Copyright (C) 1996-2009, International Business Machines Corporation and Others. + * All rights reserved. + */ + +/** + * \file + * \brief C API: Boyer-Moore StringSearch prototype. + * \internal + */ + +#ifndef _BMS_H +#define _BMS_H + +#include "unicode/utypes.h" +#include "unicode/ucol.h" + +/** + * A UCD object holds the Collator-specific data needed to + * compute the length of the shortest string that can + * generate a particular list of CEs. + * + * UCD objects are quite expensive to compute. Because + * of this, they are cached. When you call ucd_open it + * returns a reference counted cached object. When you call ucd_close + * the reference count on the object is decremented but the object is not deleted. + * + * If you do not need to reuse any unreferenced objects in the cache, you can call + * ucd_flushCache. If you no longer need any UCD + * objects, you can call ucd_freeCache. + */ +typedef void UCD; + +/** + * Open a UCD object. + * + * @param coll - the collator + * @param status - will be set if any errors occur. + * + * @return the UCD object. You must call + * ucd_close when you are done using the object. + * + * Note: if on return status is set to an error, the only safe + * thing to do with the returned object is to call ucd_close. + * + * @internal ICU 4.0.1 technology preview + */ +U_CAPI UCD * U_EXPORT2 +ucd_open(UCollator *coll, UErrorCode *status); + +/** + * Release a UCD object. 
+ * + * @param ucd - the object + * + * @internal ICU 4.0.1 technology preview + */ +U_CAPI void U_EXPORT2 +ucd_close(UCD *ucd); + +/** + * Get the UCollator object used to create a UCD object. + * The UCollator object returned may not be the exact + * object that was used to create this object, but it will have the + * same behavior. + * + * @param ucd - the UCD object + * + * @return the UCollator used to create the given + * UCD object. + * + * @internal ICU 4.0.1 technology preview + */ +U_CAPI UCollator * U_EXPORT2 +ucd_getCollator(UCD *ucd); + +/** + * UCD objects are expensive to compute, and so + * may be cached. This routine will free the cached objects and delete + * the cache. + * + * WARNING: Don't call this until you have called ucd_close + * for each UCD object that you have used. Also, + * DO NOT call this if another thread may be calling ucd_flushCache + * at the same time. + * + * @internal ICU 4.0.1 technology preview + */ +U_CAPI void U_EXPORT2 +ucd_freeCache(); + +/** + * UCD objects are expensive to compute, and so + * may be cached. This routine will remove any unused UCD + * objects from the cache. + * + * @internal ICU 4.0.1 technology preview + */ +U_CAPI void U_EXPORT2 +ucd_flushCache(); + +/** + * BMS + * + * This object holds the information needed to do a Collation sensitive Boyer-Moore search. It encapsulates + * the pattern, the "bad character" and "good suffix" tables, the Collator-based data needed to compute them, + * and a reference to the text being searched. + * + * To do a search, you first need to get a UCD object by calling ucd_open. + * Then you construct a BMS object from the UCD object, the pattern + * string and the target string. Then you call the search method. Here's a code sample: + * + *
+ * void boyerMooreExample(UCollator *collator, UChar *pattern, int32_t patternLen, UChar *target, int32_t targetLength)
+ * {
+ *     UErrorCode status = U_ZERO_ERROR;
+ *     int32_t offset = 0, start = -1, end = -1;
+ *     UCD *ucd = NULL;
+ *     BMS *bms = NULL;
+ *
+ *     ucd = ucd_open(collator, &status);
+ *     if (U_FAILURE(status)) {
+ *         // could not create a UCD object
+ *         return;
+ *     }
+ *
+ *     bms = bms_open(ucd, pattern, patternLen, target, targetLength, &status);
+ *     if (U_FAILURE(status)) {
+ *         // could not create a BMS object
+ *         ucd_close(ucd);
+ *         return;
+ *     }
+ *
+ *
+ *     // Find all matches
+ *     while (bms_search(bms, offset, &start, &end)) {
+ *         // process the match between start and end
+ *         ...
+ *
+ *         // advance past the match
+ *         offset = end; 
+ *     }
+ *
+ *     // at this point, if offset == 0, there were no matches
+ *     if (offset == 0) {
+ *         // handle the case of no matches
+ *     }
+ *
+ *     bms_close(bms);
+ *     ucd_close(ucd);
+ *
+ *     // UCD objects are cached, so the call to
+ *     // ucd_close doesn't delete the object.
+ *     // Call this if you don't need the object any more.
+ *     ucd_flushCache();
+ * }
+ * 
+ * + * NOTE: This is a technology preview. The final version of this API may not bear any resemblance to this API. + * + * Known limitations: + * 1) Backwards searching has not been implemented. + * + * 2) For Han and Hangul characters, this code ignores any Collation tailorings. In general, + * this isn't a problem, but in Korean locales, at strength 1, Hangul characters are tailored + * to be equal to Han characters with the same pronunciation. Because this code ignores + * tailorings, searching for a Hangul character will not find a Han character and vice versa. + * + * 3) In some cases, searching for a pattern that needs to be normalized and ends + * in a discontiguous contraction may fail. The only known cases of this are with + * the Tibetan script. For example searching for the pattern + * "\u0F7F\u0F80\u0F81\u0F82\u0F83\u0F84\u0F85" will fail. (This case is artificial. We've + * been unable to find a practical, real-world example of this failure.) + * + * NOTE: This is a technology preview. The final version of this API may not bear any resemblance to this API. + * + * @internal ICU 4.0.1 technology preview + */ +struct BMS; +typedef struct BMS BMS; + +/** + * Construct a BMS object. + * + * @param ucd - A UCD object holding the Collator-sensitive data + * @param pattern - the string for which to search + * @param patternLength - the length of the string for which to search + * @param target - the string in which to search + * @param targetLength - the length of the string in which to search + * @param status - will be set if any errors occur. + * + * @return the BMS object. + * + * Note: if on return status is set to an error, the only safe + * thing to do with the returned object is to call + * bms_close.
+ * + * @internal ICU 4.0.1 technology preview + */ +U_CAPI BMS * U_EXPORT2 +bms_open(UCD *ucd, + const UChar *pattern, int32_t patternLength, + const UChar *target, int32_t targetLength, + UErrorCode *status); + +/** + * Close a BMS object and release all the + * storage associated with it. + * + * @param bms - the BMS object to close. + */ +U_CAPI void U_EXPORT2 +bms_close(BMS *bms); + +/** + * Test the pattern to see if it generates any CEs. + * + * @return TRUE if the pattern string did not generate any CEs + * + * @internal ICU 4.0.1 technology preview + */ +U_CAPI UBool U_EXPORT2 +bms_empty(BMS *bms); + +/** + * Get the UCD object used to create + * a given BMS object. + * + * @param bms - the BMS object + * + * @return - the UCD object used to create + * the given BMS object. + * + * @internal ICU 4.0.1 technology preview + */ +U_CAPI UCD * U_EXPORT2 +bms_getData(BMS *bms); + +/** + * Search for the pattern string in the target string. + * + * @param offset - the offset in the target string at which to begin the search + * @param start - will be set to the starting offset of the match, or -1 if there's no match + * @param end - will be set to the ending offset of the match, or -1 if there's no match + * + * @return TRUE if the match succeeds, FALSE otherwise. + * + * @internal ICU 4.0.1 technology preview + */ +U_CAPI UBool U_EXPORT2 +bms_search(BMS *bms, int32_t offset, int32_t *start, int32_t *end); + +/** + * Set the target string for the match. + * + * @param target - the new target string + * @param targetLength - the length of the new target string + * @param status - will be set if any errors occur. 
+ * + * @internal ICU 4.0.1 technology preview + */ +U_CAPI void U_EXPORT2 +bms_setTargetString(BMS *bms, const UChar *target, int32_t targetLength, UErrorCode *status); + +#endif /* _BMS_H */ diff --git a/icu4c/source/i18n/unicode/bmsearch.h b/icu4c/source/i18n/unicode/bmsearch.h new file mode 100644 index 0000000000..d02e289b6f --- /dev/null +++ b/icu4c/source/i18n/unicode/bmsearch.h @@ -0,0 +1,221 @@ +/* + ****************************************************************************** + * Copyright (C) 1996-2009, International Business Machines * + * Corporation and others. All Rights Reserved. * + ****************************************************************************** + */ + +/** + * \file + * \brief C++ API: Boyer-Moore StringSearch technology preview + * \internal ICU 4.0.1 technology preview + */ + +#ifndef B_M_SEARCH_H +#define B_M_SEARCH_H + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_COLLATION + +#include "unicode/uobject.h" +#include "unicode/ucol.h" + +#include "unicode/colldata.h" + +U_NAMESPACE_BEGIN + +class BadCharacterTable; +class GoodSuffixTable; +class Target; + +/** + * BoyerMooreSearch + * + * This object holds the information needed to do a Collation sensitive Boyer-Moore search. It encapulates + * the pattern, the "bad character" and "good suffix" tables, the Collator-based data needed to compute them, + * and a reference to the text being searched. + * + * To do a search, you fist need to get a CollData object by calling CollData::open. + * Then you construct a BoyerMooreSearch object from the CollData object, the pattern + * string and the target string. Then you call the search method. Here's a code sample: + * + *
+ * void boyerMooreExample(UCollator *collator, UnicodeString *pattern, UnicodeString *target)
+ * {
+ *     UErrorCode status = U_ZERO_ERROR;
+ *     CollData *collData = CollData::open(collator, status);
+ *
+ *     if (U_FAILURE(status)) {
+ *         // could not create a CollData object
+ *         return;
+ *     }
+ *
+ *     BoyerMooreSearch *search = new BoyerMooreSearch(collData, *pattern, target, status);
+ *
+ *     if (U_FAILURE(status)) {
+ *         // could not create a BoyerMooreSearch object;
+ *         // the only safe thing to do with it is to destroy it
+ *         delete search;
+ *         CollData::close(collData);
+ *         return;
+ *     }
+ *
+ *     int32_t offset = 0, start = -1, end = -1;
+ *
+ *     // Find all matches
+ *     while (search->search(offset, start, end)) {
+ *         // process the match between start and end
+ *         ...
+ *         // advance past the match
+ *         offset = end; 
+ *     }
+ *
+ *     // at this point, if offset == 0, there were no matches
+ *     if (offset == 0) {
+ *         // handle the case of no matches
+ *     }
+ *
+ *     delete search;
+ *     CollData::close(collData);
+ *
+ *     // CollData objects are cached, so the call to
+ *     // CollData::close doesn't delete the object.
+ *     // Call this if you don't need the object any more.
+ *     CollData::flushCollDataCache();
+ * }
+ * 
+ * + * NOTE: This is a technology preview. The final version of this API may not bear any resemblance to this API. + * + * Known limitations: + * 1) Backwards searching has not been implemented. + * + * 2) For Han and Hangul characters, this code ignores any Collation tailorings. In general, + * this isn't a problem, but in Korean locales, at strength 1, Hangul characters are tailored + * to be equal to Han characters with the same pronunciation. Because this code ignores + * tailorings, searching for a Hangul character will not find a Han character and vice versa. + * + * 3) In some cases, searching for a pattern that needs to be normalized and ends + * in a discontiguous contraction may fail. The only known cases of this are with + * the Tibetan script. For example searching for the pattern + * "\u0F7F\u0F80\u0F81\u0F82\u0F83\u0F84\u0F85" will fail. (This case is artificial. We've + * been unable to find a practical, real-world example of this failure.) + * + * @internal ICU 4.0.1 technology preview + * + * @see CollData + */ +class U_I18N_API BoyerMooreSearch : public UObject +{ +public: + /** + * Construct a BoyerMooreSearch object. + * + * @param theData - A CollData object holding the Collator-sensitive data + * @param patternString - the string for which to search + * @param targetString - the string in which to search or NULL if you will + * set it later by calling setTargetString. + * @param status - will be set if any errors occur. + * + * Note: if on return, status is set to an error code, + * the only safe thing to do with this object is to call + * the destructor. + * + * @internal ICU 4.0.1 technology preview + */ + BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString, UErrorCode &status); + + /** + * The destructor + * + * @internal ICU 4.0.1 technology preview + */ + ~BoyerMooreSearch(); + + /** + * Test the pattern to see if it generates any CEs.
+ * + * @return TRUE if the pattern string did not generate any CEs + * + * @internal ICU 4.0.1 technology preview + */ + UBool empty(); + + /** + * Search for the pattern string in the target string. + * + * @param offset - the offset in the target string at which to begin the search + * @param start - will be set to the starting offset of the match, or -1 if there's no match + * @param end - will be set to the ending offset of the match, or -1 if there's no match + * + * @return TRUE if the match succeeds, FALSE otherwise. + * + * @internal ICU 4.0.1 technology preview + */ + UBool search(int32_t offset, int32_t &start, int32_t &end); + + /** + * Set the target string for the match. + * + * @param targetString - the new target string + * @param status - will be set if any errors occur. + * + * @internal ICU 4.0.1 technology preview + */ + void setTargetString(const UnicodeString *targetString, UErrorCode &status); + + // **** no longer need these? **** + /** + * Return the CollData object used for searching + * + * @return the CollData object used for searching + * + * @internal ICU 4.0.1 technology preview + */ + CollData *getData(); + + /** + * Return the CEs generated by the pattern string. + * + * @return a CEList object holding the CEs generated by the pattern string. + * + * @internal ICU 4.0.1 technology preview + */ + CEList *getPatternCEs(); + + /** + * Return the BadCharacterTable object computed for the pattern string. + * + * @return the BadCharacterTable object. + * + * @internal ICU 4.0.1 technology preview + */ + BadCharacterTable *getBadCharacterTable(); + + /** + * Return the GoodSuffixTable object computed for the pattern string. + * + * @return the GoodSuffixTable object computed for the pattern string. + * + * @internal ICU 4.0.1 technology preview + */ + GoodSuffixTable *getGoodSuffixTable(); + + /* + * UObject glue... 
+ */ + virtual UClassID getDynamicClassID() const; + static UClassID getStaticClassID(); + +private: + CollData *data; + CEList *patCEs; + BadCharacterTable *badCharacterTable; + GoodSuffixTable *goodSuffixTable; + UnicodeString pattern; + Target *target; +}; + +U_NAMESPACE_END + +#endif // #if !UCONFIG_NO_COLLATION +#endif // #ifndef B_M_SEARCH_H diff --git a/icu4c/source/i18n/unicode/colldata.h b/icu4c/source/i18n/unicode/colldata.h new file mode 100644 index 0000000000..ce0c0e150b --- /dev/null +++ b/icu4c/source/i18n/unicode/colldata.h @@ -0,0 +1,430 @@ +/* + ****************************************************************************** + * Copyright (C) 1996-2009, International Business Machines * + * Corporation and others. All Rights Reserved. * + ****************************************************************************** + */ + +/** + * \file + * \brief C++ API: Collation data used to compute minLengthInChars. + * \internal + */ + +#ifndef COLL_DATA_H +#define COLL_DATA_H + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_COLLATION + +#include "unicode/uobject.h" +#include "unicode/ucol.h" + +U_NAMESPACE_BEGIN + +/* + * The size of the internal buffer for the Collator's short description string. + */ +#define KEY_BUFFER_SIZE 64 + + /* + * The size of the internal CE buffer in a CEList object + */ +#define CELIST_BUFFER_SIZE 4 + +/* + * Define this to enable the CEList objects to collect + * statistics. + */ +//#define INSTRUMENT_CELIST + + /* + * The size of the initial list in a StringList object. + */ +#define STRING_LIST_BUFFER_SIZE 16 + +/* + * Define this to enable the StringList objects to + * collect statistics. + */ +//#define INSTRUMENT_STRING_LIST + + /** + * CEList + * + * This object holds a list of CEs generated from a particular + * UnicodeString + * + * @internal ICU 4.0.1 technology preview + */ +class U_I18N_API CEList : public UObject +{ +public: + /** + * Construct a CEList object. 
+ * + * @param coll - the Collator used to collect the CEs. + * @param string - the string for which to collect the CEs. + * @param status - will be set if any errors occur. + * + * Note: if on return, status is set to an error code, + * the only safe thing to do with this object is to call + * the destructor. + * + * @internal ICU 4.0.1 technology preview + */ + CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status); + + /** + * The destructor. + */ + ~CEList(); + + /** + * Return the number of CEs in the list. + * + * @return the number of CEs in the list. + * + * @internal ICU 4.0.1 technology preview + */ + int32_t size() const; + + /** + * Get a particular CE from the list. + * + * @param index - the index of the CE to return + * + * @return the CE, or 0 if index is out of range + * + * @internal ICU 4.0.1 technology preview + */ + uint32_t get(int32_t index) const; + + /** + * Check if the CEs in another CEList match the + * suffix of this list starting at a give offset. + * + * @param offsset - the offset of the suffix + * @param other - the other CEList + * + * @return TRUE if the CEs match, FALSE otherwise. + * + * @internal ICU 4.0.1 technology preview + */ + UBool matchesAt(int32_t offset, const CEList *other) const; + + /** + * The index operator. + * + * @param index - the index + * + * @return a reference to the given CE in the list + * + * @internal ICU 4.0.1 technology preview + */ + uint32_t &operator[](int32_t index) const; + + /* + * UObject glue... + */ + virtual UClassID getDynamicClassID() const; + static UClassID getStaticClassID(); + +private: + void add(uint32_t ce, UErrorCode &status); + + uint32_t ceBuffer[CELIST_BUFFER_SIZE]; + uint32_t *ces; + int32_t listMax; + int32_t listSize; + +#ifdef INSTRUMENT_CELIST + static int32_t _active; + static int32_t _histogram[10]; +#endif +}; + +/** + * StringList + * + * This object holds a list of UnicodeString objects. 
+ * + * @internal ICU 4.0.1 technology preview + */ +class U_I18N_API StringList : public UObject +{ +public: + /** + * Construct an empty StringList + * + * @param status - will be set if any errors occur. + * + * Note: if on return, status is set to an error code, + * the only safe thing to do with this object is to call + * the destructor. + * + * @internal ICU 4.0.1 technology preview + */ + StringList(UErrorCode &status); + + /** + * The destructor. + * + * @internal ICU 4.0.1 technology preview + */ + ~StringList(); + + /** + * Add a string to the list. + * + * @param string - the string to add + * @param status - will be set if any errors occur. + * + * @internal ICU 4.0.1 technology preview + */ + void add(const UnicodeString *string, UErrorCode &status); + + /** + * Add an array of Unicode code points to the list. + * + * @param chars - the address of the array of code points + * @param count - the number of code points in the array + * @param status - will be set if any errors occur. + * + * @internal ICU 4.0.1 technology preview + */ + void add(const UChar *chars, int32_t count, UErrorCode &status); + + /** + * Get a particular string from the list. + * + * @param index - the index of the string + * + * @return a pointer to the UnicodeString or NULL + * if index is out of bounds. + * + * @internal ICU 4.0.1 technology preview + */ + const UnicodeString *get(int32_t index) const; + + /** + * Get the number of stings in the list. + * + * @return the number of strings in the list. + * + * @internal ICU 4.0.1 technology preview + */ + int32_t size() const; + + /* + * the UObject glue... + */ + virtual UClassID getDynamicClassID() const; + static UClassID getStaticClassID(); + +private: + UnicodeString *strings; + int32_t listMax; + int32_t listSize; + +#ifdef INSTRUMENT_STRING_LIST + static int32_t _lists; + static int32_t _strings; + static int32_t _histogram[101]; +#endif +}; + +/* + * Forward references to internal classes. 
+ */ +class StringToCEsMap; +class CEToStringsMap; +class CollDataCache; + +/** + * CollData + * + * This class holds the Collator-specific data needed to + * compute the length of the shortest string that can + * generate a partcular list of CEs. + * + * CollData objects are quite expensive to compute. Because + * of this, they are cached. When you call CollData::open it + * returns a reference counted cached object. When you call CollData::close + * the reference count on the object is decremented but the object is not deleted. + * + * If you do not need to reuse any unreferenced objects in the cache, you can call + * CollData::flushCollDataCache. If you no longer need any CollData + * objects, you can call CollData::freeCollDataCache + * + * @internal ICU 4.0.1 technology preview + */ +class U_I18N_API CollData : public UObject +{ +public: + /** + * Construct a CollData object. + * + * @param collator - the collator + * @param status - will be set if any errors occur. + * + * @return the CollData object. You must call + * close when you are done using the object. + * + * Note: if on return, status is set to an error code, + * the only safe thing to do with this object is to call + * CollData::close. + * + * @internal ICU 4.0.1 technology preview + */ + static CollData *open(UCollator *collator, UErrorCode &status); + + /** + * Release a CollData object. + * + * @param collData - the object + * + * @internal ICU 4.0.1 technology preview + */ + static void close(CollData *collData); + + /** + * Get the UCollator object used to create this object. + * The object returned may not be the exact object that was used to + * create this object, but it will have the same behavior. + */ + UCollator *getCollator() const; + + /** + * Get a list of all the strings which generate a list + * of CEs starting with a given CE. + * + * @param ce - the CE + * + * return a StringList object containing all + * the stirngs, or NULL if there are + * no such strings. 
+ * + * @internal ICU 4.0.1 technology preview. + */ + const StringList *getStringList(int32_t ce) const; + + /** + * Get a list of the CEs generated by a partcular stirng. + * + * @param string - the string + * + * @return a CEList object containt the CEs. You + * must call freeCEList when you are finished + * using the CEList/ + * + * @internal ICU 4.0.1 technology preview. + */ + const CEList *getCEList(const UnicodeString *string) const; + + /** + * Release a CEList returned by getCEList. + * + * @param list - the to free. + * + * @internal ICU 4.0.1 technology preview + */ + void freeCEList(const CEList *list); + + /** + * Return the length of the shortest string that will generate + * the given list of CEs. + * + * @param ces - the CEs + * @param offset - the offset of the first CE in the list to use. + * + * @return the length of the shortest string. + * + * @internal ICU 4.0.1 technology preview + */ + int32_t minLengthInChars(const CEList *ces, int32_t offset) const; + + + /** + * Return the length of the shortest string that will generate + * the given list of CEs. + * + * Note: the algorithm used to do this computation is recursive. To + * limit the amount of recursion, a "history" list is used to record + * the best answer starting at a particular offset in the list of CEs. + * If the same offset is visited again during the recursion, the answer + * in the history list is used. + * + * @param ces - the CEs + * @param offset - the offset of the first CE in the list to use. + * param history - the history list. Must be at least as long as + * the number of cEs in the CEList + * + * @return the length of the shortest string. + * + * @internal ICU 4.0.1 technology preview + */ + int32_t minLengthInChars(const CEList *ces, int32_t offset, int32_t *history) const; + + /* + * UObject glue... + */ + virtual UClassID getDynamicClassID() const; + static UClassID getStaticClassID(); + + /** + * CollData objects are expensive to compute, and so + * may be cached. 
This routine will free the cached objects and delete + * the cache. + * + * WARNING: Don't call this until you are have called close + * for each CollData object that you have used. also, + * DO NOT call this if another thread may be calling flushCollDataCache + * at the same time. + * + * @internal 4.0.1 technology preview + */ + static void freeCollDataCache(); + + /** + * CollData objects are expensive to compute, and so + * may be cached. This routine will remove any unused CollData + * objects from the cache. + * + * @internal 4.0.1 technology preview + */ + static void flushCollDataCache(); + +private: + friend class CollDataCache; + friend class CollDataCacheEntry; + + CollData(UCollator *collator, char *cacheKey, int32_t cachekeyLength, UErrorCode &status); + ~CollData(); + + CollData(); + + static char *getCollatorKey(UCollator *collator, char *buffer, int32_t bufferLength); + + static CollDataCache *getCollDataCache(); + + UCollator *coll; + StringToCEsMap *charsToCEList; + CEToStringsMap *ceToCharsStartingWith; + + char keyBuffer[KEY_BUFFER_SIZE]; + char *key; + + static CollDataCache *collDataCache; + + uint32_t minHan; + uint32_t maxHan; + + uint32_t jamoLimits[4]; +}; + +U_NAMESPACE_END + +#endif // #if !UCONFIG_NO_COLLATION +#endif // #ifndef COLL_DATA_H diff --git a/icu4c/source/i18n/unicode/ucoleitr.h b/icu4c/source/i18n/unicode/ucoleitr.h index 9c951a9ab8..419cb9f7de 100644 --- a/icu4c/source/i18n/unicode/ucoleitr.h +++ b/icu4c/source/i18n/unicode/ucoleitr.h @@ -1,6 +1,6 @@ /* ******************************************************************************* -* Copyright (C) 2001-2008, International Business Machines +* Copyright (C) 2001-2009, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * @@ -121,6 +121,7 @@ ucol_openElements(const UCollator *coll, int32_t textLength, UErrorCode *status); + /** * get a hash code for a key... 
Not very useful! * @param key the given key. @@ -152,6 +153,20 @@ ucol_closeElements(UCollationElements *elems); U_STABLE void U_EXPORT2 ucol_reset(UCollationElements *elems); +/** + * Set the collation elements to use implicit ordering for Han + * even if they've been tailored. This will also force Hangul + * syllables to be ordered by decomposing them to their component + * Jamo. + * + * @param elems The UCollationElements containing the text. + * @param status A pointer to a UErrorCode to reveive any errors. + * + * @internal + */ +U_INTERNAL void U_EXPORT2 +ucol_forceHanImplicit(UCollationElements *elems, UErrorCode *status); + /** * Get the ordering priority of the next collation element in the text. * A single character may contain more than one collation element. diff --git a/icu4c/source/i18n/usearch.cpp b/icu4c/source/i18n/usearch.cpp index 6078cddc2b..3a446534d9 100644 --- a/icu4c/source/i18n/usearch.cpp +++ b/icu4c/source/i18n/usearch.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 2001-2008 IBM and others. All rights reserved. +* Copyright (C) 2001-2009 IBM and others. All rights reserved. ********************************************************************** * Date Name Description * 07/02/2001 synwee Creation. @@ -3785,7 +3785,7 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, found = TRUE; // Inner loop checks for a match beginning at each // position from the outer loop. - for (patIx=0; patIxpattern.CELength; patIx++) { + for (patIx=0; patIxpattern.PCELength; patIx++) { int64_t patCE = strsrch->pattern.PCE[patIx]; targetCEI = ceb.get(targetIx+patIx); // Compare CE from target string with CE from the pattern. @@ -3814,11 +3814,9 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, // an acceptable character range. 
// const CEI *firstCEI = ceb.get(targetIx); - const CEI *lastCEI = ceb.get(targetIx + strsrch->pattern.CELength - 1); - const CEI *nextCEI = ceb.get(targetIx + strsrch->pattern.CELength); + const CEI *lastCEI = ceb.get(targetIx + strsrch->pattern.PCELength - 1); + const CEI *nextCEI = ceb.get(targetIx + strsrch->pattern.PCELength); - // targetCEI = ceb.get(targetIx+strsrch->pattern.CELength); - // maxLimit = targetCEI->lowIndex; mStart = firstCEI->lowIndex; minLimit = lastCEI->lowIndex; maxLimit = nextCEI->lowIndex; @@ -3883,7 +3881,7 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, found = FALSE; } - if (!checkIdentical(strsrch, mStart, mLimit)) { + if (! checkIdentical(strsrch, mStart, mLimit)) { found = FALSE; } @@ -4006,10 +4004,10 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, found = TRUE; // Inner loop checks for a match beginning at each // position from the outer loop. - for (patIx = strsrch->pattern.CELength - 1; patIx >= 0; patIx -= 1) { + for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) { int64_t patCE = strsrch->pattern.PCE[patIx]; - targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.CELength - 1 - patIx); + targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx); // Compare CE from target string with CE from the pattern. // Note that the target CE will be UCOL_NULLORDER if we reach the end of input, // which will fail the compare, below. @@ -4035,7 +4033,7 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, // There still is a chance of match failure if the CE range not correspond to // an acceptable character range. // - const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.CELength - 1); + const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1); const CEI *lastCEI = ceb.getPrevious(targetIx); const CEI *nextCEI = targetIx > 0? 
ceb.getPrevious(targetIx - 1) : NULL; @@ -4102,6 +4100,10 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, found = FALSE; } + if (! checkIdentical(strsrch, mStart, mLimit)) { + found = FALSE; + } + if (found) { break; } diff --git a/icu4c/source/test/cintltst/callcoll.c b/icu4c/source/test/cintltst/callcoll.c index 4bf383feef..cf76b28fb5 100644 --- a/icu4c/source/test/cintltst/callcoll.c +++ b/icu4c/source/test/cintltst/callcoll.c @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2008, International Business Machines Corporation and + * Copyright (c) 1997-2009, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ /******************************************************************************* @@ -515,7 +515,7 @@ backAndForth(UCollationElements *iter) } if (o != orders[index].order) { - log_err("Mismatched order at index %d: 0x%0:8X vs. 0x%0:8X\n", index, + log_err("Mismatched order at index %d: 0x%8.8X vs. 0x%8.8X\n", index, orders[index].order, o); goto bail; } diff --git a/icu4c/source/test/intltest/ssearch.cpp b/icu4c/source/test/intltest/ssearch.cpp index 3eab390d60..a0e1c29722 100644 --- a/icu4c/source/test/intltest/ssearch.cpp +++ b/icu4c/source/test/intltest/ssearch.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2008, International Business Machines + * Copyright (C) 2005-2009, International Business Machines * Corporation and others. All Rights Reserved. 
********************************************************************** */ @@ -32,7 +32,12 @@ #include "intltest.h" #include "ssearch.h" +#include "unicode/colldata.h" +#include "unicode/bmsearch.h" +#include "unicode/bms.h" + #include "xmlparser.h" +#include "ucbuf.h" #include #include @@ -51,6 +56,8 @@ char testId[100]; __FILE__, __LINE__, testId, u_errorName(errcode));}} #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) +#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) +#define DELETE_ARRAY(array) uprv_free((void *) (array)) //--------------------------------------------------------------------------- // @@ -81,6 +88,34 @@ void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, case 2: name = "monkeyTest"; if (exec) monkeyTest(params); break; + + case 3: name = "bmMonkeyTest"; + if (exec) bmMonkeyTest(params); + break; + + case 4: name = "boyerMooreTest"; + if (exec) boyerMooreTest(); + break; + + case 5: name = "goodSuffixTest"; + if (exec) goodSuffixTest(); + break; + + case 6: name = "searchTime"; + if (exec) searchTime(); + break; + + case 7: name = "bmsTest"; + if (exec) bmsTest(); + break; + + case 8: name = "bmSearchTest"; + if (exec) bmSearchTest(); + break; + + case 9: name = "udhrTest"; + if (exec) udhrTest(); + break; #endif default: name = ""; break; //needed to end loop @@ -181,6 +216,16 @@ void SSearchTest::searchTest() normalize = UCOL_ON; } + // + // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE. 
+ // + UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE; + const UnicodeString *alt = testCase->getAttribute("alternate_handling"); + TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE"); + if (alt != NULL && *alt == "SHIFTED") { + alternateHandling = UCOL_SHIFTED; + } + const UnicodeString defLocale("en"); char clocale[100]; const UnicodeString *locale = testCase->getAttribute("locale"); @@ -196,7 +241,7 @@ void SSearchTest::searchTest() int32_t expectedMatchStart = -1; int32_t expectedMatchLimit = -1; const UXMLElement *n; - int nodeCount = 0; + int32_t nodeCount = 0; n = testCase->getChildElement("pattern"); TEST_ASSERT(n != NULL); @@ -237,13 +282,14 @@ void SSearchTest::searchTest() // Check that there weren't extra things in the XML TEST_ASSERT(nodeCount == testCase->countChildren()); - // Open a collotor and StringSearch based on the parameters + // Open a collator and StringSearch based on the parameters // obtained from the XML. // status = U_ZERO_ERROR; UCollator *collator = ucol_open(clocale, &status); ucol_setStrength(collator, collatorStrength); ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status); + ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, alternateHandling, &status); UStringSearch *uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(), target.getBuffer(), target.length(), collator, @@ -315,6 +361,323 @@ void SSearchTest::searchTest() #endif } +struct UdhrTestCase +{ + char *locale; + char *file; +}; + +void SSearchTest::udhrTest() +{ + UErrorCode status = U_ZERO_ERROR; + char path[PATH_BUFFER_SIZE]; + const char *udhrPath = getPath(path, "udhr"); + + if (udhrPath == NULL) { + // couldn't get path: error message already output... 
+ return; + } + + UdhrTestCase testCases[] = { + {"en", "udhr_eng.txt"}, + {"de", "udhr_deu_1996.txt"}, + {"fr", "udhr_fra.txt"}, + {"ru", "udhr_rus.txt"}, + {"th", "udhr_tha.txt"}, + {"ja", "udhr_jpn.txt"}, + {"ko", "udhr_kor.txt"}, + {"zh", "udhr_cmn_hans.txt"}, + {"zh_Hant", "udhr_cmn_hant.txt"} + }; + + int32_t testCount = ARRAY_SIZE(testCases); + + for (int32_t t = 0; t < testCount; t += 1) { + int32_t len = 0; + char *resolvedFileName = NULL; + const char *encoding = NULL; + UCHARBUF *ucharBuf = NULL; + + ucbuf_resolveFileName(udhrPath, testCases[t].file, NULL, &len, &status); + resolvedFileName = NEW_ARRAY(char, len); + + if(resolvedFileName == NULL){ + continue; + } + + if(status == U_BUFFER_OVERFLOW_ERROR){ + status = U_ZERO_ERROR; + } + + ucbuf_resolveFileName(udhrPath, testCases[t].file, resolvedFileName, &len, &status); + ucharBuf = ucbuf_open(resolvedFileName, &encoding, TRUE, FALSE, &status); + + DELETE_ARRAY(resolvedFileName); + + if(U_FAILURE(status)){ + infoln("Could not open the input file %s. 
Test skipped\n", testCases[t].file); + continue; + } + + int32_t targetLen = 0; + const UChar *target = ucbuf_getBuffer(ucharBuf, &targetLen, &status); + + /* The first line of the file contains the pattern */ + int32_t start = 0, end = 0, plen = 0; + + for(end = start; ; end += 1) { + UChar ch = target[end]; + + if (ch == 0x000A || ch == 0x000D || ch == 0x2028) { + break; + } + } + + plen = end - start; + + UChar *pattern = NEW_ARRAY(UChar, plen); + for (int32_t i = 0; i < plen; i += 1) { + pattern[i] = target[start++]; + } + + int32_t offset = 0; + UCollator *coll = ucol_open(testCases[t].locale, &status); + UCD *ucd = NULL; + BMS *bms = NULL; + + if (U_FAILURE(status)) { + errln("Could not open collator for %s", testCases[t].locale); + goto delete_collator; + } + + ucd = ucd_open(coll, &status); + + if (U_FAILURE(status)) { + errln("Could not open CollData object for %s", testCases[t].locale); + goto delete_ucd; + } + + bms = bms_open(ucd, pattern, plen, target, targetLen, &status); + + if (U_FAILURE(status)) { + errln("Could not open search object for %s", testCases[t].locale); + goto delete_bms; + } + + start = end = -1; + while (bms_search(bms, offset, &start, &end)) { + offset = end; + } + + if (offset == 0) { + errln("Could not find pattern - locale: %s, file: %s ", testCases[t].locale, testCases[t].file); + } + +delete_bms: + bms_close(bms); + +delete_ucd: + ucd_close(ucd); + +delete_collator: + ucol_close(coll); + + DELETE_ARRAY(pattern); + ucbuf_close(ucharBuf); + } + + ucd_flushCache(); +} + +void SSearchTest::bmSearchTest() +{ +#if !UCONFIG_NO_REGULAR_EXPRESSIONS + UErrorCode status = U_ZERO_ERROR; + char path[PATH_BUFFER_SIZE]; + const char *testFilePath = getPath(path, "ssearch.xml"); + + if (testFilePath == NULL) { + return; /* Couldn't get path: error message already output. 
*/ + } + + UXMLParser *parser = UXMLParser::createParser(status); + TEST_ASSERT_SUCCESS(status); + UXMLElement *root = parser->parseFile(testFilePath, status); + TEST_ASSERT_SUCCESS(status); + if (U_FAILURE(status)) { + return; + } + + const UnicodeString *debugTestCase = root->getAttribute("debug"); + if (debugTestCase != NULL) { +// setenv("USEARCH_DEBUG", "1", 1); + } + + + const UXMLElement *testCase; + int32_t tc = 0; + + while((testCase = root->nextChildElement(tc)) != NULL) { + + if (testCase->getTagName().compare("test-case") != 0) { + errln("ssearch, unrecognized XML Element in test file"); + continue; + } + const UnicodeString *id = testCase->getAttribute("id"); + *testId = 0; + if (id != NULL) { + id->extract(0, id->length(), testId, sizeof(testId), US_INV); + } + + // If debugging test case has been specified and this is not it, skip to next. + if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) { + continue; + } + // + // Get the requested collation strength. + // Default is tertiary if the XML attribute is missing from the test case. + // + const UnicodeString *strength = testCase->getAttribute("strength"); + UColAttributeValue collatorStrength; + if (strength==NULL) { collatorStrength = UCOL_TERTIARY;} + else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;} + else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;} + else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;} + else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;} + else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;} + else { + // Bogus value supplied for strength. Shouldn't happen, even from + // typos, if the XML source has been validated. + // This assert is a little deceiving in that strength can be + // any of the allowed values, not just TERTIARY, but it will + // do the job of getting the error output. + TEST_ASSERT(*strength=="TERTIARY") + } + + // + // Get the collator normalization flag. 
Default is UCOL_OFF. + // + UColAttributeValue normalize = UCOL_OFF; + const UnicodeString *norm = testCase->getAttribute("norm"); + TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF"); + if (norm!=NULL && *norm=="ON") { + normalize = UCOL_ON; + } + + // + // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE. + // + UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE; + const UnicodeString *alt = testCase->getAttribute("alternate_handling"); + TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE"); + if (alt != NULL && *alt == "SHIFTED") { + alternateHandling = UCOL_SHIFTED; + } + + const UnicodeString defLocale("en"); + char clocale[100]; + const UnicodeString *locale = testCase->getAttribute("locale"); + if (locale == NULL || locale->length()==0) { + locale = &defLocale; + }; + locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL); + + + UnicodeString text; + UnicodeString target; + UnicodeString pattern; + int32_t expectedMatchStart = -1; + int32_t expectedMatchLimit = -1; + const UXMLElement *n; + int32_t nodeCount = 0; + + n = testCase->getChildElement("pattern"); + TEST_ASSERT(n != NULL); + if (n==NULL) { + continue; + } + text = n->getText(FALSE); + text = text.unescape(); + pattern.append(text); + nodeCount++; + + n = testCase->getChildElement("pre"); + if (n!=NULL) { + text = n->getText(FALSE); + text = text.unescape(); + target.append(text); + nodeCount++; + } + + n = testCase->getChildElement("m"); + if (n!=NULL) { + expectedMatchStart = target.length(); + text = n->getText(FALSE); + text = text.unescape(); + target.append(text); + expectedMatchLimit = target.length(); + nodeCount++; + } + + n = testCase->getChildElement("post"); + if (n!=NULL) { + text = n->getText(FALSE); + text = text.unescape(); + target.append(text); + nodeCount++; + } + + // Check that there weren't extra things in the XML + TEST_ASSERT(nodeCount == testCase->countChildren()); + + // Open a collator and StringSearch based on 
the parameters + // obtained from the XML. + // + status = U_ZERO_ERROR; + UCollator *collator = ucol_open(clocale, &status); + ucol_setStrength(collator, collatorStrength); + ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status); + ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, alternateHandling, &status); + UCD *ucd = ucd_open(collator, &status); + BMS *bms = bms_open(ucd, pattern.getBuffer(), pattern.length(), target.getBuffer(), target.length(), &status); + + TEST_ASSERT_SUCCESS(status); + if (U_FAILURE(status)) { + bms_close(bms); + ucd_close(ucd); + ucol_close(collator); + continue; + } + + int32_t foundStart = 0; + int32_t foundLimit = 0; + UBool foundMatch; + + // + // Do the search, check the match result against the expected results. + // + foundMatch = bms_search(bms, 0, &foundStart, &foundLimit); + //TEST_ASSERT_SUCCESS(status); + if (foundMatch && expectedMatchStart < 0 || + foundStart != expectedMatchStart || + foundLimit != expectedMatchLimit) { + TEST_ASSERT(FALSE); // output generic error position + infoln("Found, expected match start = %d, %d \n" + "Found, expected match limit = %d, %d", + foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); + } + + bms_close(bms); + ucd_close(ucd); + ucol_close(collator); + } + + ucd_flushCache(); + delete root; + delete parser; +#endif +} + struct Order { int32_t order; @@ -549,6 +912,10 @@ static char *printOrders(char *buffer, OrderList &list) void SSearchTest::offsetTest() { const char *test[] = { + // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous + // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71.
+ "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0", + "\\ua191\\u16ef\\u2036\\u017a", #if 0 @@ -673,341 +1040,6 @@ void SSearchTest::offsetTest() delete col; } -class CEList -{ -public: - CEList(UCollator *coll, const UnicodeString &string); - ~CEList(); - - int32_t size() const; - int32_t get(int32_t index) const; - UBool matchesAt(int32_t offset, const CEList *other) const; - -private: - void add(int32_t ce); - - int32_t *ces; - int32_t listMax; - int32_t listSize; -}; - -CEList::CEList(UCollator *coll, const UnicodeString &string) - : ces(NULL), listMax(8), listSize(0) -{ - UErrorCode status = U_ZERO_ERROR; - UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); - uint32_t strengthMask = 0; - int32_t order; - -#if 0 - switch (ucol_getStrength(coll)) - { - default: - strengthMask |= UCOL_TERTIARYORDERMASK; - /* fall through */ - - case UCOL_SECONDARY: - strengthMask |= UCOL_SECONDARYORDERMASK; - /* fall through */ - - case UCOL_PRIMARY: - strengthMask |= UCOL_PRIMARYORDERMASK; - } -#else - strengthMask = UCOL_PRIMARYORDERMASK; -#endif - - ces = new int32_t[listMax]; - - while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) { - order &= strengthMask; - - if (order == UCOL_IGNORABLE) { - continue; - } - - add(order); - } - - ucol_closeElements(elems); -} - -CEList::~CEList() -{ - delete[] ces; -} - -void CEList::add(int32_t ce) -{ - if (listSize >= listMax) { - listMax *= 2; - - int32_t *newCEs = new int32_t[listMax]; - - uprv_memcpy(newCEs, ces, listSize * sizeof(int32_t)); - delete[] ces; - ces = newCEs; - } - - ces[listSize++] = ce; -} - -int32_t CEList::get(int32_t index) const -{ - if (index >= 0 && index < listSize) { - return ces[index]; - } - - return -1; -} - -UBool CEList::matchesAt(int32_t offset, const CEList *other) const -{ - if (listSize - offset < other->size()) { - return FALSE; - } - - for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) { - if (ces[i] != other->get(j)) { 
- return FALSE; - } - } - - return TRUE; -} - -int32_t CEList::size() const -{ - return listSize; -} - -class StringList -{ -public: - StringList(); - ~StringList(); - - void add(const UnicodeString *string); - void add(const UChar *chars, int32_t count); - const UnicodeString *get(int32_t index) const; - int32_t size() const; - -private: - UnicodeString *strings; - int32_t listMax; - int32_t listSize; -}; - -StringList::StringList() - : strings(NULL), listMax(16), listSize(0) -{ - strings = new UnicodeString [listMax]; -} - -StringList::~StringList() -{ - delete[] strings; -} - -void StringList::add(const UnicodeString *string) -{ - if (listSize >= listMax) { - listMax *= 2; - - UnicodeString *newStrings = new UnicodeString[listMax]; - - uprv_memcpy(newStrings, strings, listSize * sizeof(UnicodeString)); - - delete[] strings; - strings = newStrings; - } - - // The ctor initialized all the strings in - // the array to empty strings, so this - // is the same as copying the source string. 
- strings[listSize++].append(*string); -} - -void StringList::add(const UChar *chars, int32_t count) -{ - const UnicodeString string(chars, count); - - add(&string); -} - -const UnicodeString *StringList::get(int32_t index) const -{ - if (index >= 0 && index < listSize) { - return &strings[index]; - } - - return NULL; -} - -int32_t StringList::size() const -{ - return listSize; -} - - -U_CFUNC void deleteStringList(void *obj); - -class CEToStringsMap -{ -public: - - CEToStringsMap(); - ~CEToStringsMap(); - - void put(int32_t ce, UnicodeString *string); - StringList *getStringList(int32_t ce) const; - -private: - - void putStringList(int32_t ce, StringList *stringList); - UHashtable *map; -}; - -CEToStringsMap::CEToStringsMap() -{ - UErrorCode status = U_ZERO_ERROR; - - map = uhash_open(uhash_hashLong, uhash_compareLong, - uhash_compareCaselessUnicodeString, - &status); - - uhash_setValueDeleter(map, deleteStringList); -} - -CEToStringsMap::~CEToStringsMap() -{ - uhash_close(map); -} - -void CEToStringsMap::put(int32_t ce, UnicodeString *string) -{ - StringList *strings = getStringList(ce); - - if (strings == NULL) { - strings = new StringList(); - putStringList(ce, strings); - } - - strings->add(string); -} - -StringList *CEToStringsMap::getStringList(int32_t ce) const -{ - return (StringList *) uhash_iget(map, ce); -} - -void CEToStringsMap::putStringList(int32_t ce, StringList *stringList) -{ - UErrorCode status = U_ZERO_ERROR; - - uhash_iput(map, ce, (void *) stringList, &status); -} - -U_CFUNC void deleteStringList(void *obj) -{ - StringList *strings = (StringList *) obj; - - delete strings; -} - -U_CFUNC void deleteCEList(void *obj); -U_CFUNC void deleteUnicodeStringKey(void *obj); - -class StringToCEsMap -{ -public: - StringToCEsMap(); - ~StringToCEsMap(); - - void put(const UnicodeString *string, const CEList *ces); - const CEList *get(const UnicodeString *string); - -private: - - - UHashtable *map; -}; - -StringToCEsMap::StringToCEsMap() -{ - UErrorCode 
status = U_ZERO_ERROR; - - map = uhash_open(uhash_hashCaselessUnicodeString, - uhash_compareCaselessUnicodeString, - uhash_compareLong, - &status); - - uhash_setValueDeleter(map, deleteCEList); - uhash_setKeyDeleter(map, deleteUnicodeStringKey); -} - -StringToCEsMap::~StringToCEsMap() -{ - uhash_close(map); -} - -void StringToCEsMap::put(const UnicodeString *string, const CEList *ces) -{ - UErrorCode status = U_ZERO_ERROR; - - uhash_put(map, (void *) string, (void *) ces, &status); -} - -const CEList *StringToCEsMap::get(const UnicodeString *string) -{ - return (const CEList *) uhash_get(map, string); -} - -U_CFUNC void deleteCEList(void *obj) -{ - CEList *list = (CEList *) obj; - - delete list; -} - -U_CFUNC void deleteUnicodeStringKey(void *obj) -{ - UnicodeString *key = (UnicodeString *) obj; - - delete key; -} - -static void buildData(UCollator *coll, USet *charsToTest, StringToCEsMap *charsToCEList, CEToStringsMap *ceToCharsStartingWith) -{ - int32_t itemCount = uset_getItemCount(charsToTest); - UErrorCode status = U_ZERO_ERROR; - - for(int32_t item = 0; item < itemCount; item += 1) { - UChar32 start = 0, end = 0; - UChar buffer[16]; - int32_t len = uset_getItem(charsToTest, item, &start, &end, - buffer, 16, &status); - - if (len == 0) { - for (UChar32 ch = start; ch <= end; ch += 1) { - UnicodeString *st = new UnicodeString(ch); - CEList *ceList = new CEList(coll, *st); - - charsToCEList->put(st, ceList); - ceToCharsStartingWith->put(ceList->get(0), st); - } - } else if (len > 0) { - UnicodeString *st = new UnicodeString(buffer, len); - CEList *ceList = new CEList(coll, *st); - - charsToCEList->put(st, ceList); - ceToCharsStartingWith->put(ceList->get(0), st); - } else { - // shouldn't happen... 
- } - } -} - static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer) { for(int32_t i = 0; i < string.length(); i += 1) { @@ -1038,69 +1070,502 @@ static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer) return buffer; } +static USet *uset_openEmpty(); +#if 1 -static int32_t minLengthInChars(const CEList *ceList, int32_t offset, StringToCEsMap *charsToCEList, CEToStringsMap *ceToCharsStartingWith, - UnicodeString &debug) +struct PCE { - // find out shortest string for the longest sequence of ces. - // needs to be refined to use dynamic programming, but will be roughly right - int32_t totalStringLength = 0; - - while (offset < ceList->size()) { - int32_t ce = ceList->get(offset); - int32_t bestLength = INT32_MIN; - const UnicodeString *bestString = NULL; - int32_t bestCeLength = 0; - const StringList *strings = ceToCharsStartingWith->getStringList(ce); - int32_t stringCount = strings->size(); - - for (int32_t s = 0; s < stringCount; s += 1) { - const UnicodeString *string = strings->get(s); - const CEList *ceList2 = charsToCEList->get(string); + uint64_t ce; + int32_t lowOffset; + int32_t highOffset; +}; - if (ceList->matchesAt(offset, ceList2)) { - int32_t length = ceList2->size() - string->length(); +class PCEList +{ +public: + PCEList(UCollator *coll, const UnicodeString &string); + ~PCEList(); - if (bestLength < length) { - bestLength = length; - bestCeLength = ceList2->size(); - bestString = string; - } - } - } - - totalStringLength += bestString->length(); - escape(*bestString, debug).append("/"); - offset += bestCeLength; - } + int32_t size() const; - debug.append((UChar)0x0000); - return totalStringLength; + const PCE *get(int32_t index) const; + + int32_t getLowOffset(int32_t index) const; + int32_t getHighOffset(int32_t index) const; + uint64_t getOrder(int32_t index) const; + + UBool matchesAt(int32_t offset, const PCEList &other) const; + + uint64_t operator[](int32_t index) const; + +private: + void 
add(uint64_t ce, int32_t low, int32_t high); + + PCE *list; + int32_t listMax; + int32_t listSize; +}; + +PCEList::PCEList(UCollator *coll, const UnicodeString &string) +{ + UErrorCode status = U_ZERO_ERROR; + UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); + uint64_t order; + int32_t low, high; + + list = new PCE[listMax]; + + ucol_setOffset(elems, 0, &status); + + do { + order = ucol_nextProcessed(elems, &low, &high, &status); + add(order, low, high); + } while (order != UCOL_PROCESSED_NULLORDER); + + ucol_closeElements(elems); } -static void minLengthTest(UCollator *coll, StringToCEsMap *charsToCEList, CEToStringsMap *ceToCharsStartingWith) +PCEList::~PCEList() { - UnicodeString examples[] = {"fuss", "fiss", "affliss", "VII"}; - UnicodeString debug; - int32_t nExamples = sizeof(examples) / sizeof(examples[0]); + delete[] list; +} - for (int32_t s = 0; s < nExamples; s += 1) { - CEList *ceList = new CEList(coll, examples[s]); +void PCEList::add(uint64_t order, int32_t low, int32_t high) +{ + if (listSize >= listMax) { + listMax *= 2; - //infoln("%S:", examples[s].getTerminatedBuffer()); + PCE *newList = new PCE[listMax]; - for(int32_t i = 0; i < examples[s].length(); i += 1) { - debug.remove(); + uprv_memcpy(newList, list, listSize * sizeof(Order)); + delete[] list; + list = newList; + } - int32_t minLength = minLengthInChars(ceList, i, charsToCEList, ceToCharsStartingWith, debug); - //infoln("\t%d\t%S", minLength, debug.getTerminatedBuffer()); + list[listSize].ce = order; + list[listSize].lowOffset = low; + list[listSize].highOffset = high; + + listSize += 1; +} + +const PCE *PCEList::get(int32_t index) const +{ + if (index >= listSize) { + return NULL; + } + + return &list[index]; +} + +int32_t PCEList::getLowOffset(int32_t index) const +{ + const PCE *pce = get(index); + + if (pce != NULL) { + return pce->lowOffset; + } + + return -1; +} + +int32_t PCEList::getHighOffset(int32_t index) const +{ + const PCE 
*pce = get(index); + + if (pce != NULL) { + return pce->highOffset; + } + + return -1; +} + +uint64_t PCEList::getOrder(int32_t index) const +{ + const PCE *pce = get(index); + + if (pce != NULL) { + return pce->ce; + } + + return UCOL_PROCESSED_NULLORDER; +} + +int32_t PCEList::size() const +{ + return listSize; +} + +UBool PCEList::matchesAt(int32_t offset, const PCEList &other) const +{ + // NOTE: sizes include the NULLORDER, which we don't want to compare. + int32_t otherSize = other.size() - 1; + + if (listSize - 1 - offset < otherSize) { + return FALSE; + } + + for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) { + if (getOrder(i) != other.getOrder(j)) { + return FALSE; + } + } + + return TRUE; +} + +uint64_t PCEList::operator[](int32_t index) const +{ + return getOrder(index); +} + +void SSearchTest::boyerMooreTest() +{ + UErrorCode status = U_ZERO_ERROR; + UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status); + CollData *data = NULL; + UnicodeString lp = "fuss"; + UnicodeString sp = "fu\\u00DF"; + BoyerMooreSearch *longPattern = NULL; + BoyerMooreSearch *shortPattern = NULL; + UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball", + "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF", + "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"}; + int32_t start = -1, end = -1; + + coll = ucol_openFromShortString("S1", FALSE, NULL, &status); + if (U_FAILURE(status)) { + errln("Could not open collator."); + return; + } + + data = CollData::open(coll, status); + if (U_FAILURE(status)) { + errln("Could not open CollData object."); + goto close_data; + } + + + longPattern = new BoyerMooreSearch(data, lp.unescape(), NULL, status); + shortPattern = new BoyerMooreSearch(data, sp.unescape(), NULL, status); + if (U_FAILURE(status)) { + errln("Could not create pattern 
objects."); + goto close_patterns; + } + + for (int32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) { + UnicodeString target = targets[t].unescape(); + + longPattern->setTargetString(&target, status); + if (longPattern->search(0, start, end)) { + logln("Test %d: found long pattern at [%d, %d].", t, start, end); + } else { + errln("Test %d: did not find long pattern.", t); } - //infoln(); - delete ceList; + shortPattern->setTargetString(&target, status); + if (shortPattern->search(0, start, end)) { + logln("Test %d: found short pattern at [%d, %d].", t, start, end); + } else { + errln("Test %d: did not find short pattern.", t); + } } + +close_patterns: + delete shortPattern; + delete longPattern; + +close_data: + CollData::close(data); + ucol_close(coll); } +void SSearchTest::bmsTest() +{ + UErrorCode status = U_ZERO_ERROR; + UCollator *coll = NULL; + UCD *data = NULL; + UnicodeString lp = "fuss"; + UnicodeString lpu = lp.unescape(); + UnicodeString sp = "fu\\u00DF"; + UnicodeString spu = sp.unescape(); + BMS *longPattern = NULL; + BMS *shortPattern = NULL; + UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball", + "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF", + "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"}; + int32_t start = -1, end = -1; + + coll = ucol_openFromShortString("S1", FALSE, NULL, &status); + if (U_FAILURE(status)) { + errln("Could not open collator."); + return; + } + + data = ucd_open(coll, &status); + if (U_FAILURE(status)) { + errln("Could not open CollData object."); + goto close_data; + } + + longPattern = bms_open(data, lpu.getBuffer(), lpu.length(), NULL, 0, &status); + shortPattern = bms_open(data, spu.getBuffer(), spu.length(), NULL, 0, &status); + if (U_FAILURE(status)) { + errln("Couldn't open pattern objects."); + goto close_patterns; + } + + 
for (int32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) { + UnicodeString target = targets[t].unescape(); + + bms_setTargetString(longPattern, target.getBuffer(), target.length(), &status); + if (bms_search(longPattern, 0, &start, &end)) { + logln("Test %d: found long pattern at [%d, %d].", t, start, end); + } else { + errln("Test %d: did not find long pattern.", t); + } + + bms_setTargetString(shortPattern, target.getBuffer(), target.length(), &status); + if (bms_search(shortPattern, 0, &start, &end)) { + logln("Test %d: found short pattern at [%d, %d].", t, start, end); + } else { + errln("Test %d: did not find short pattern.", t); + } + } + +close_patterns: + bms_close(shortPattern); + bms_close(longPattern); + +close_data: + ucd_close(data); + ucol_close(coll); +} + +void SSearchTest::goodSuffixTest() +{ + UErrorCode status = U_ZERO_ERROR; + UCollator *coll = NULL; + CollData *data = NULL; + UnicodeString pat = /*"gcagagag"*/ "fxeld"; + UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld"; + BoyerMooreSearch *pattern = NULL; + int32_t start = -1, end = -1; + + coll = ucol_open(NULL, &status); + if (U_FAILURE(status)) { + errln("Couldn't open collator."); + return; + } + + data = CollData::open(coll, status); + if (U_FAILURE(status)) { + errln("Couldn't open CollData object."); + goto close_data; + } + + pattern = new BoyerMooreSearch(data, pat, &target, status); + if (U_FAILURE(status)) { + errln("Couldn't open pattern object."); + goto close_pattern; + } + + if (pattern->search(0, start, end)) { + logln("Found pattern at [%d, %d].", start, end); + } else { + errln("Did not find pattern."); + } + +close_pattern: + delete pattern; + +close_data: + CollData::close(data); + ucol_close(coll); +} + +// +// searchTime() A quick and dirty performance test for string search. 
+// Probably doesn't really belong as part of intltest, but it +// does check that the search succeeds, and gets the right result, +// so it serves as a functionality test also. +// +// To run as a perf test, up the loop count, select by commenting +// and uncommenting in the code the operation to be measured, +// rebuild, and measure the running time of this test alone. +// +// time LD_LIBRARY_PATH=whatever ./intltest collate/SSearchTest/searchTime +// +void SSearchTest::searchTime() { + static const char *longishText = +"Whylom, as olde stories tellen us,\n" +"Ther was a duk that highte Theseus:\n" +"Of Athenes he was lord and governour,\n" +"And in his tyme swich a conquerour,\n" +"That gretter was ther noon under the sonne.\n" +"Ful many a riche contree hadde he wonne;\n" +"What with his wisdom and his chivalrye,\n" +"He conquered al the regne of Femenye,\n" +"That whylom was y-cleped Scithia;\n" +"And weddede the quene Ipolita,\n" +"And broghte hir hoom with him in his contree\n" +"With muchel glorie and greet solempnitee,\n" +"And eek hir yonge suster Emelye.\n" +"And thus with victorie and with melodye\n" +"Lete I this noble duk to Athenes ryde,\n" +"And al his hoost, in armes, him bisyde.\n" +"And certes, if it nere to long to here,\n" +"I wolde han told yow fully the manere,\n" +"How wonnen was the regne of Femenye\n" +"By Theseus, and by his chivalrye;\n" +"And of the grete bataille for the nones\n" +"Bitwixen Athen's and Amazones;\n" +"And how asseged was Ipolita,\n" +"The faire hardy quene of Scithia;\n" +"And of the feste that was at hir weddinge,\n" +"And of the tempest at hir hoom-cominge;\n" +"But al that thing I moot as now forbere.\n" +"I have, God woot, a large feeld to ere,\n" +"And wayke been the oxen in my plough.\n" +"The remenant of the tale is long y-nough.\n" +"I wol nat letten eek noon of this route;\n" +"Lat every felawe telle his tale aboute,\n" +"And lat see now who shal the soper winne;\n" +"And ther I lefte, I wol ageyn biginne.\n" 
+"This duk, of whom I make mencioun,\n" +"When he was come almost unto the toun,\n" +"In al his wele and in his moste pryde,\n" +"He was war, as he caste his eye asyde,\n" +"Wher that ther kneled in the hye weye\n" +"A companye of ladies, tweye and tweye,\n" +"Ech after other, clad in clothes blake; \n" +"But swich a cry and swich a wo they make,\n" +"That in this world nis creature livinge,\n" +"That herde swich another weymentinge;\n" +"And of this cry they nolde never stenten,\n" +"Til they the reynes of his brydel henten.\n" +"'What folk ben ye, that at myn hoomcominge\n" +"Perturben so my feste with cryinge'?\n" +"Quod Theseus, 'have ye so greet envye\n" +"Of myn honour, that thus compleyne and crye? \n" +"Or who hath yow misboden, or offended?\n" +"And telleth me if it may been amended;\n" +"And why that ye ben clothed thus in blak'?\n" +"The eldest lady of hem alle spak,\n" +"When she hadde swowned with a deedly chere,\n" +"That it was routhe for to seen and here,\n" +"And seyde: 'Lord, to whom Fortune hath yiven\n" +"Victorie, and as a conquerour to liven,\n" +"Noght greveth us your glorie and your honour;\n" +"But we biseken mercy and socour.\n" +"Have mercy on our wo and our distresse.\n" +"Som drope of pitee, thurgh thy gentilesse,\n" +"Up-on us wrecched wommen lat thou falle.\n" +"For certes, lord, ther nis noon of us alle,\n" +"That she nath been a duchesse or a quene;\n" +"Now be we caitifs, as it is wel sene:\n" +"Thanked be Fortune, and hir false wheel,\n" +"That noon estat assureth to be weel.\n" +"And certes, lord, t'abyden your presence,\n" +"Here in the temple of the goddesse Clemence\n" +"We han ben waytinge al this fourtenight;\n" +"Now help us, lord, sith it is in thy might.\n" +"I wrecche, which that wepe and waille thus,\n" +"Was whylom wyf to king Capaneus,\n" +"That starf at Thebes, cursed be that day!\n" +"And alle we, that been in this array,\n" +"And maken al this lamentacioun,\n" +"We losten alle our housbondes at that toun,\n" +"Whyl 
that the sege ther-aboute lay.\n" +"And yet now th'olde Creon, weylaway!\n" +"The lord is now of Thebes the citee, \n" +"Fulfild of ire and of iniquitee,\n" +"He, for despyt, and for his tirannye,\n" +"To do the dede bodyes vileinye,\n" +"Of alle our lordes, whiche that ben slawe,\n" +"Hath alle the bodyes on an heep y-drawe,\n" +"And wol nat suffren hem, by noon assent,\n" +"Neither to been y-buried nor y-brent,\n" +"But maketh houndes ete hem in despyt. zet'\n"; + +#define TEST_BOYER_MOORE 1 +const char *cPattern = "maketh houndes ete hem"; +//const char *cPattern = "Whylom"; +//const char *cPattern = "zet"; + const char *testId = "searchTime()"; // for error macros. + UnicodeString target = longishText; + UErrorCode status = U_ZERO_ERROR; + + + UCollator *collator = ucol_open("en", &status); + CollData *data = CollData::open(collator, status); + TEST_ASSERT_SUCCESS(status); + //ucol_setStrength(collator, collatorStrength); + //ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status); + UnicodeString uPattern = cPattern; +#ifndef TEST_BOYER_MOORE + UStringSearch *uss = usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(), + target.getBuffer(), target.length(), + collator, + NULL, // the break iterator + &status); + TEST_ASSERT_SUCCESS(status); +#else + BoyerMooreSearch bms(data, uPattern, &target, status); + TEST_ASSERT_SUCCESS(status); +#endif + +// int32_t foundStart; +// int32_t foundEnd; + UBool found; + + // Find the match position usgin strstr + const char *pm = strstr(longishText, cPattern); + TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr"); + int32_t refMatchPos = (int32_t)(pm - longishText); + int32_t icuMatchPos; + int32_t icuMatchEnd; +#ifndef TEST_BOYER_MOORE + usearch_search(uss, 0, &icuMatchPos, &icuMatchEnd, &status); + TEST_ASSERT_SUCCESS(status); +#else + found = bms.search(0, icuMatchPos, icuMatchEnd); +#endif + TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions."); + + 
int32_t i; + int32_t j=0; + + // Try loopcounts around 100000 to some millions, depending on the operation, + // to get runtimes of at least several seconds. + for (i=0; i<10000; i++) { +#ifndef TEST_BOYER_MOORE + found = usearch_search(uss, 0, &icuMatchPos, &icuMatchEnd, &status); +#else + found = bms.search(0, icuMatchPos, icuMatchEnd); +#endif + //TEST_ASSERT_SUCCESS(status); + //TEST_ASSERT(found); + + // usearch_setOffset(uss, 0, &status); + // icuMatchPos = usearch_next(uss, &status); + + // The i+j stuff is to confuse the optimizer and get it to actually leave the + // call to strstr in place. + //pm = strstr(longishText+j, cPattern); + //j = (j + i)%5; + } + + printf("%d\n", pm-longishText, j); +#ifndef TEST_BOYER_MOORE + usearch_close(uss); +#else + CollData::close(data); +#endif + ucol_close(collator); +} +#endif + //---------------------------------------------------------------------------------------- // // Random Numbers. Similar to standard lib rand() and srand() @@ -1174,7 +1639,7 @@ void SetMonkey::append(UnicodeString &test, UnicodeString &alternate) class StringSetMonkey : public Monkey { public: - StringSetMonkey(const USet *theSet, UCollator *theCollator, StringToCEsMap *theCharsToCEList, CEToStringsMap *theCeToCharsStartingWith); + StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData); ~StringSetMonkey(); void append(UnicodeString &testCase, UnicodeString &alternate); @@ -1183,13 +1648,12 @@ private: UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate); const USet *set; - UCollator *coll; - StringToCEsMap *charsToCEList; - CEToStringsMap *ceToCharsStartingWith; + UCollator *coll; + CollData *collData; }; -StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, StringToCEsMap *theCharsToCEList, CEToStringsMap *theCeToCharsStartingWith) -: Monkey(), set(theSet), coll(theCollator), charsToCEList(theCharsToCEList), 
ceToCharsStartingWith(theCeToCharsStartingWith) +StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData) +: Monkey(), set(theSet), coll(theCollator), collData(theCollData) { // ook. } @@ -1231,7 +1695,8 @@ UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCas { // find out shortest string for the longest sequence of ces. // needs to be refined to use dynamic programming, but will be roughly right - CEList ceList(coll, testCase); + UErrorCode status = U_ZERO_ERROR; + CEList ceList(coll, testCase, status); UnicodeString alt; int32_t offset = 0; @@ -1241,7 +1706,7 @@ UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCas while (offset < ceList.size()) { int32_t ce = ceList.get(offset); - const StringList *strings = ceToCharsStartingWith->getStringList(ce); + const StringList *strings = collData->getStringList(ce); if (strings == NULL) { return alternate.append(testCase); @@ -1251,8 +1716,9 @@ UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCas int32_t tries = 0; // find random string that generates the same CEList - const CEList *ceList2; - const UnicodeString *string; + const CEList *ceList2 = NULL; + const UnicodeString *string = NULL; + UBool matches = FALSE; do { int32_t s = m_rand() % stringCount; @@ -1263,14 +1729,20 @@ UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCas } string = strings->get(s); - ceList2 = charsToCEList->get(string); - } while (! ceList.matchesAt(offset, ceList2)); + ceList2 = collData->getCEList(string); + matches = ceList.matchesAt(offset, ceList2); + + if (! matches) { + collData->freeCEList((CEList *) ceList2); + } + } while (! 
matches); alt.append(*string); offset += ceList2->size(); + collData->freeCEList(ceList2); } - const CEList altCEs(coll, alt); + const CEList altCEs(coll, alt, status); if (ceList.matchesAt(0, &altCEs)) { return alternate.append(alt); @@ -1282,6 +1754,7 @@ UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCas static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate) { int32_t pieces = (m_rand() % 4) + 1; + UErrorCode status = U_ZERO_ERROR; UBool matches; do { @@ -1295,8 +1768,8 @@ static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyC monkeys[monkey]->append(testCase, alternate); } - const CEList ceTest(coll, testCase); - const CEList ceAlt(coll, alternate); + const CEList ceTest(coll, testCase, status); + const CEList ceAlt(coll, alternate, status); matches = ceTest.matchesAt(0, &ceAlt); } while (! matches); @@ -1391,7 +1864,8 @@ static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t target.getBuffer(), target.length(), &status); if (patternSize == 0) { - matchStart = matchEnd = 0; + // Searching for an empty pattern always fails + matchStart = matchEnd = -1; return FALSE; } @@ -1512,14 +1986,9 @@ int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCa // **** TODO: find *all* matches, not just first one **** simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd); -#if 0 usearch_search(uss, 0, &actualStart, &actualEnd, &status); -#else - actualStart = usearch_next(uss, &status); - actualEnd = actualStart + usearch_getMatchedLength(uss); -#endif - if (actualStart != expectedStart || actualEnd != expectedEnd) { + if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) { errln("Search for in <%s> failed: expected [%d, %d], got [%d, %d]\n" " strength=%s seed=%d", name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed); @@ 
-1534,15 +2003,9 @@ int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCa usearch_setPattern(uss, altPattern.getBuffer(), altPattern.length(), &status); -#if 0 usearch_search(uss, 0, &actualStart, &actualEnd, &status); -#else - usearch_reset(uss); - actualStart = usearch_next(uss, &status); - actualEnd = actualStart + usearch_getMatchedLength(uss); -#endif - if (actualStart != expectedStart || actualEnd != expectedEnd) { + if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) { errln("Search for in <%s> failed: expected [%d, %d], got [%d, %d]\n" " strength=%s seed=%d", name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed); @@ -1554,6 +2017,52 @@ int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCa usearch_close(uss); + return notFoundCount; +} + +int32_t SSearchTest::bmMonkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern, + BoyerMooreSearch *bms, BoyerMooreSearch *abms, + const char *name, const char *strength, uint32_t seed) +{ + UErrorCode status = U_ZERO_ERROR; + int32_t actualStart = -1, actualEnd = -1; + //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length(); + int32_t expectedStart = -1, expectedEnd = -1; + int32_t notFoundCount = 0; + + // **** TODO: find *all* matches, not just first one **** + simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd); + + bms->setTargetString(&testCase, status); + bms->search(0, actualStart, actualEnd); + + if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) { + errln("Boyer-Moore Search for in <%s> failed: expected [%d, %d], got [%d, %d]\n" + " strength=%s seed=%d", + name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed); + } + + if (expectedStart == -1 && actualStart == -1) { + notFoundCount += 1; + } + + // **** TODO: find *all* matches, not 
just first one **** + simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd); + + abms->setTargetString(&testCase, status); + abms->search(0, actualStart, actualEnd); + + if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) { + errln("Boyer-Moore Search for in <%s> failed: expected [%d, %d], got [%d, %d]\n" + " strength=%s seed=%d", + name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed); + } + + if (expectedStart == -1 && actualStart == -1) { + notFoundCount += 1; + } + + return notFoundCount; } #endif @@ -1562,33 +2071,27 @@ void SSearchTest::monkeyTest(char *params) { // ook! UErrorCode status = U_ZERO_ERROR; - U_STRING_DECL(test_pattern, "[[:assigned:]-[:ideographic:]-[:hangul:]-[:c:]]", 47); - U_STRING_INIT(test_pattern, "[[:assigned:]-[:ideographic:]-[:hangul:]-[:c:]]", 47); - UCollator *coll = ucol_open(NULL, &status); + //UCollator *coll = ucol_open(NULL, &status); + UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status); + if (U_FAILURE(status)) { errln("Failed to create collator in MonkeyTest!"); return; } - USet *charsToTest = uset_openPattern(test_pattern, 47, &status); + + CollData *monkeyData = CollData::open(coll, status); + USet *expansions = uset_openEmpty(); USet *contractions = uset_openEmpty(); - StringToCEsMap *charsToCEList = new StringToCEsMap(); - CEToStringsMap *ceToCharsStartingWith = new CEToStringsMap(); ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); - uset_addAll(charsToTest, contractions); - uset_addAll(charsToTest, expansions); - - // TODO: set strength to UCOL_PRIMARY, change CEList to use strength? 
- buildData(coll, charsToTest, charsToCEList, ceToCharsStartingWith); - U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); USet *letters = uset_openPattern(letter_pattern, 39, &status); SetMonkey letterMonkey(letters); - StringSetMonkey contractionMonkey(contractions, coll, charsToCEList, ceToCharsStartingWith); - StringSetMonkey expansionMonkey(expansions, coll, charsToCEList, ceToCharsStartingWith); + StringSetMonkey contractionMonkey(contractions, coll, monkeyData); + StringSetMonkey expansionMonkey(expansions, coll, monkeyData); UnicodeString testCase; UnicodeString alternate; UnicodeString pattern, altPattern; @@ -1613,7 +2116,7 @@ void SSearchTest::monkeyTest(char *params) int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]); int32_t loopCount = quick? 1000 : 10000; int32_t firstStrength = 0; - int32_t lastStrength = strengthCount - 1; + int32_t lastStrength = strengthCount - 1; //*/ 0; if (params != NULL) { #if !UCONFIG_NO_REGULAR_EXPRESSIONS @@ -1654,15 +2157,12 @@ void SSearchTest::monkeyTest(char *params) for(int32_t s = firstStrength; s <= lastStrength; s += 1) { int32_t notFoundCount = 0; + logln("Setting strength to %s.", strengthNames[s]); ucol_setStrength(coll, strengths[s]); - - int32_t i = 1000; //Reduce the number of iteration in non-exhaustive mode - if(!quick){ - i = 10000; - } + // TODO: try alternate prefix and suffix too? // TODO: alterntaes are only equal at primary strength. Is this OK? 
- for(int32_t t = 0; t < i; t += 1) { + for(int32_t t = 0; t < loopCount; t += 1) { uint32_t seed = m_seed; int32_t nmc = 0; @@ -1693,16 +2193,166 @@ void SSearchTest::monkeyTest(char *params) notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed); } - logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount); + logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount); } - delete ceToCharsStartingWith; - delete charsToCEList; - uset_close(contractions); uset_close(expansions); - uset_close(charsToTest); uset_close(letters); + + CollData::close(monkeyData); + + ucol_close(coll); +} + +void SSearchTest::bmMonkeyTest(char *params) +{ + // ook! + UErrorCode status = U_ZERO_ERROR; + UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status); + + if (U_FAILURE(status)) { + errln("Failed to create collator in MonkeyTest!"); + return; + } + + CollData *monkeyData = CollData::open(coll, status); + + USet *expansions = uset_openEmpty(); + USet *contractions = uset_openEmpty(); + + ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); + + U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); + U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); + USet *letters = uset_openPattern(letter_pattern, 39, &status); + SetMonkey letterMonkey(letters); + StringSetMonkey contractionMonkey(contractions, coll, monkeyData); + StringSetMonkey expansionMonkey(expansions, coll, monkeyData); + UnicodeString testCase; + UnicodeString alternate; + UnicodeString pattern, altPattern; + UnicodeString prefix, altPrefix; + UnicodeString suffix, altSuffix; + + Monkey *monkeys[] = { + &letterMonkey, + &contractionMonkey, + &expansionMonkey, + &contractionMonkey, + &expansionMonkey, + &contractionMonkey, + &expansionMonkey, + &contractionMonkey, + &expansionMonkey}; + int32_t monkeyCount = 
sizeof(monkeys) / sizeof(monkeys[0]); + int32_t nonMatchCount = 0; + + UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY}; + const char *strengthNames[] = {"primary", "secondary", "tertiary"}; + int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]); + int32_t loopCount = quick? 1000 : 10000; + int32_t firstStrength = 0; + int32_t lastStrength = strengthCount - 1; //*/ 0; + + if (params != NULL) { +#if !UCONFIG_NO_REGULAR_EXPRESSIONS + UnicodeString p(params); + + loopCount = getIntParam("loop", p, loopCount); + m_seed = getIntParam("seed", p, m_seed); + + RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status); + if (m.find()) { + UnicodeString breakType = m.group(1, status); + + for (int32_t s = 0; s < strengthCount; s += 1) { + if (breakType == strengthNames[s]) { + firstStrength = lastStrength = s; + break; + } + } + + m.reset(); + p = m.replaceFirst("", status); + } + + if (RegexMatcher("\\S", p, 0, status).find()) { + // Each option is stripped out of the option string as it is processed. + // All options have been checked. The option string should have been completely emptied.. + char buf[100]; + p.extract(buf, sizeof(buf), NULL, status); + buf[sizeof(buf)-1] = 0; + errln("Unrecognized or extra parameter: %s\n", buf); + return; + } +#else + infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters."); +#endif + } + + for(int32_t s = firstStrength; s <= lastStrength; s += 1) { + int32_t notFoundCount = 0; + + logln("Setting strength to %s.", strengthNames[s]); + ucol_setStrength(coll, strengths[s]); + + CollData *data = CollData::open(coll, status); + + // TODO: try alternate prefix and suffix too? + // TODO: alterntaes are only equal at primary strength. Is this OK? 
+ for(int32_t t = 0; t < loopCount; t += 1) { + uint32_t seed = m_seed; + int32_t nmc = 0; + + generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern); + generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix); + generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix); + + BoyerMooreSearch pat(data, pattern, NULL, status); + BoyerMooreSearch alt(data, altPattern, NULL, status); + + // **** need a better way to deal with this **** +#if 0 + if (pat.empty() || + alt.empty()) { + continue; + } +#endif + + // pattern + notFoundCount += bmMonkeyTestCase(coll, pattern, pattern, altPattern, &pat, &alt, "pattern", strengthNames[s], seed); + + testCase.remove(); + testCase.append(prefix); + testCase.append(/*alt*/pattern); + + // prefix + pattern + notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "prefix + pattern", strengthNames[s], seed); + + testCase.append(suffix); + + // prefix + pattern + suffix + notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "prefix + pattern + suffix", strengthNames[s], seed); + + testCase.remove(); + testCase.append(pattern); + testCase.append(suffix); + + // pattern + suffix + notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "pattern + suffix", strengthNames[s], seed); + } + + CollData::close(data); + + logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount); + } + + uset_close(contractions); + uset_close(expansions); + uset_close(letters); + + CollData::close(monkeyData); ucol_close(coll); } diff --git a/icu4c/source/test/intltest/ssearch.h b/icu4c/source/test/intltest/ssearch.h index 4afc7bf832..d888413641 100644 --- a/icu4c/source/test/intltest/ssearch.h +++ b/icu4c/source/test/intltest/ssearch.h @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2008, International Business Machines + * Copyright (C) 2005-2009, International 
Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -11,6 +11,7 @@ #include "unicode/utypes.h" #include "unicode/unistr.h" #include "unicode/ucol.h" +#include "unicode/bmsearch.h" #include "intltest.h" @@ -34,10 +35,24 @@ public: virtual void offsetTest(); virtual void monkeyTest(char *params); + virtual void bmMonkeyTest(char *params); + virtual void boyerMooreTest(); + virtual void goodSuffixTest(); + virtual void searchTime(); + + virtual void bmsTest(); + virtual void bmSearchTest(); + + virtual void udhrTest(); + private: virtual const char *getPath(char buffer[2048], const char *filename); virtual int32_t monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern, const char *name, const char *strength, uint32_t seed); + + virtual int32_t bmMonkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern, + BoyerMooreSearch *bms, BoyerMooreSearch *abms, + const char *name, const char *strength, uint32_t seed); #endif }; diff --git a/icu4c/source/test/perf/strsrchperf/strsrchperf.cpp b/icu4c/source/test/perf/strsrchperf/strsrchperf.cpp index 4d89921164..23d16d7de2 100644 --- a/icu4c/source/test/perf/strsrchperf/strsrchperf.cpp +++ b/icu4c/source/test/perf/strsrchperf/strsrchperf.cpp @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (C) 2008 IBM, Inc. All Rights Reserved. + * Copyright (C) 2008-2009 IBM, Inc. All Rights Reserved. 
* ********************************************************************/ /** @@ -14,7 +14,13 @@ StringSearchPerformanceTest::StringSearchPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status) :UPerfTest(argc,argv,status){ int32_t start, end; + +#ifdef TEST_BOYER_MOORE_SEARCH + bms = NULL; +#else srch = NULL; +#endif + pttrn = NULL; if(status== U_ILLEGAL_ARGUMENT_ERROR || line_mode){ fprintf(stderr,gUsageString, "strsrchperf"); @@ -22,7 +28,8 @@ StringSearchPerformanceTest::StringSearchPerformanceTest(int32_t argc, const cha } /* Get the Text */ src = getBuffer(srcLen, status); - + +#if 0 /* Get a word to find. Do this by selecting a random word with a word breakiterator. */ UBreakIterator* brk = ubrk_open(UBRK_WORD, locale, src, srcLen, &status); if(U_FAILURE(status)){ @@ -38,9 +45,38 @@ StringSearchPerformanceTest::StringSearchPerformanceTest(int32_t argc, const cha } pttrn = temp; /* store word in pttrn */ ubrk_close(brk); +#else + /* The first line of the file contains the pattern */ + start = 0; + + for(end = start; ; end += 1) { + UChar ch = src[end]; + + if (ch == 0x000A || ch == 0x000D || ch == 0x2028) { + break; + } + } + + pttrnLen = end - start; + UChar* temp = (UChar*)malloc(sizeof(UChar)*(pttrnLen)); + for (int i = 0; i < pttrnLen; i++) { + temp[i] = src[start++]; + } + pttrn = temp; /* store word in pttrn */ +#endif +#ifdef TEST_BOYER_MOORE_SEARCH + UnicodeString patternString(pttrn, pttrnLen); + UCollator *coll = ucol_open(locale, &status); + CollData *data = CollData::open(coll, status); + + targetString = new UnicodeString(src, srcLen); + bms = new BoyerMooreSearch(data, patternString, targetString, status); +#else /* Create the StringSearch object to be use in performance test. */ srch = usearch_open(pttrn, pttrnLen, src, srcLen, locale, NULL, &status); +#endif + if(U_FAILURE(status)){ fprintf(stderr, "FAILED to create UPerfTest object. 
Error: %s\n", u_errorName(status)); return; @@ -49,12 +85,23 @@ StringSearchPerformanceTest::StringSearchPerformanceTest(int32_t argc, const cha } StringSearchPerformanceTest::~StringSearchPerformanceTest() { + CollData *data = bms->getData(); + UCollator *coll = data->getCollator(); + + delete bms; + delete targetString; + CollData::close(data); + ucol_close(coll); + if (pttrn != NULL) { free(pttrn); } + +#ifndef TEST_BOYER_MOORE_SEARCH if (srch != NULL) { usearch_close(srch); } +#endif } UPerfFunction* StringSearchPerformanceTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char *par) { @@ -70,12 +117,20 @@ UPerfFunction* StringSearchPerformanceTest::runIndexedTest(int32_t index, UBool } UPerfFunction* StringSearchPerformanceTest::Test_ICU_Forward_Search(){ +#ifdef TEST_BOYER_MOORE_SEARCH + StringSearchPerfFunction *func = new StringSearchPerfFunction(ICUForwardSearch, bms, src, srcLen, pttrn, pttrnLen); +#else StringSearchPerfFunction* func = new StringSearchPerfFunction(ICUForwardSearch, srch, src, srcLen, pttrn, pttrnLen); +#endif return func; } UPerfFunction* StringSearchPerformanceTest::Test_ICU_Backward_Search(){ +#ifdef TEST_BOYER_MOORE_SEARCH + StringSearchPerfFunction *func = new StringSearchPerfFunction(ICUBackwardSearch, bms, src, srcLen, pttrn, pttrnLen); +#else StringSearchPerfFunction* func = new StringSearchPerfFunction(ICUBackwardSearch, srch, src, srcLen, pttrn, pttrnLen); +#endif return func; } diff --git a/icu4c/source/test/perf/strsrchperf/strsrchperf.h b/icu4c/source/test/perf/strsrchperf/strsrchperf.h index 3c01279398..6f2281c585 100644 --- a/icu4c/source/test/perf/strsrchperf/strsrchperf.h +++ b/icu4c/source/test/perf/strsrchperf/strsrchperf.h @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (C) 2008 IBM, Inc. All Rights Reserved. + * Copyright (C) 2008-2009 IBM, Inc. All Rights Reserved. 
* ********************************************************************/ #ifndef _STRSRCHPERF_H @@ -8,11 +8,19 @@ #include "unicode/ubrk.h" #include "unicode/usearch.h" +#include "unicode/colldata.h" +#include "unicode/bmsearch.h" #include "unicode/uperf.h" #include #include +#define TEST_BOYER_MOORE_SEARCH + +#ifdef TEST_BOYER_MOORE_SEARCH +typedef void (*StrSrchFn) (BoyerMooreSearch * bms, const UChar *src, int32_t srcLen, const UChar *pttrn, int32_t pttrnLen, UErrorCode *status); +#else typedef void (*StrSrchFn)(UStringSearch* srch, const UChar* src,int32_t srcLen, const UChar* pttrn, int32_t pttrnLen, UErrorCode* status); +#endif class StringSearchPerfFunction : public UPerfFunction { private: @@ -21,17 +29,39 @@ private: int32_t srcLen; const UChar* pttrn; int32_t pttrnLen; +#ifdef TEST_BOYER_MOORE_SEARCH + BoyerMooreSearch *bms; +#else UStringSearch* srch; +#endif public: virtual void call(UErrorCode* status) { +#ifdef TEST_BOYER_MOORE_SEARCH + (*fn)(bms, src, srcLen, pttrn, pttrnLen, status); +#else (*fn)(srch, src, srcLen, pttrn, pttrnLen, status); +#endif } virtual long getOperationsPerIteration() { +#if 0 return (long)(srcLen/pttrnLen); +#else + return (long) srcLen; +#endif } +#ifdef TEST_BOYER_MOORE_SEARCH + StringSearchPerfFunction(StrSrchFn func, BoyerMooreSearch *search, const UChar *source, int32_t sourceLen, const UChar *pattern, int32_t patternLen) { + fn = func; + src = source; + srcLen = sourceLen; + pttrn = pattern; + pttrnLen = patternLen; + bms = search; + } +#else StringSearchPerfFunction(StrSrchFn func, UStringSearch* search, const UChar* source,int32_t sourceLen, const UChar* pattern, int32_t patternLen) { fn = func; src = source; @@ -40,6 +70,7 @@ public: pttrnLen = patternLen; srch = search; } +#endif }; class StringSearchPerformanceTest : public UPerfTest { @@ -48,7 +79,12 @@ private: int32_t srcLen; UChar* pttrn; int32_t pttrnLen; +#ifdef TEST_BOYER_MOORE_SEARCH + UnicodeString *targetString; + BoyerMooreSearch *bms; +#else 
UStringSearch* srch; +#endif public: StringSearchPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status); @@ -56,9 +92,29 @@ public: virtual UPerfFunction* runIndexedTest(int32_t index, UBool exec, const char *&name, char *par = NULL); UPerfFunction* Test_ICU_Forward_Search(); + UPerfFunction* Test_ICU_Backward_Search(); }; + +#ifdef TEST_BOYER_MOORE_SEARCH +void ICUForwardSearch(BoyerMooreSearch *bms, const UChar *source, int32_t sourceLen, const UChar *pattern, int32_t patternLen, UErrorCode * /*status*/) { + int32_t offset = 0, start = -1, end = -1; + + while (bms->search(offset, start, end)) { + offset = end; + } +} + +void ICUBackwardSearch(BoyerMooreSearch *bms, const UChar *source, int32_t sourceLen, const UChar *pattern, int32_t patternLen, UErrorCode * /*status*/) { + int32_t offset = 0, start = -1, end = -1; + + /* NOTE: No Boyer-Moore backward search yet... */ + while (bms->search(offset, start, end)) { + offset = end; + } +} +#else void ICUForwardSearch(UStringSearch *srch, const UChar* source, int32_t sourceLen, const UChar* pattern, int32_t patternLen, UErrorCode* status) { int32_t match; @@ -76,5 +132,6 @@ void ICUBackwardSearch(UStringSearch *srch, const UChar* source, int32_t sourceL match = usearch_previous(srch, status); } } +#endif #endif /* _STRSRCHPERF_H */ diff --git a/icu4c/source/test/testdata/ssearch.xml b/icu4c/source/test/testdata/ssearch.xml index 26d676ea87..c4beaf24cb 100644 --- a/icu4c/source/test/testdata/ssearch.xml +++ b/icu4c/source/test/testdata/ssearch.xml @@ -1,6 +1,6 @@ - + @@ -20,7 +21,7 @@ ]> - + @@ -174,8 +175,15 @@ A\u0300
At IDENTICAL, shoud this match?  
\u00c0 - - + + + A\u0300 +
At IDENTICAL, shoud this match?  
+ \u00c0 + +
+ + Ű
12
ű Ű
@@ -285,11 +293,13 @@ + + A\u0301\u0301\u0301\u0301
A\u0301\u0301\u0301\u0301\u0301
@@ -409,5 +419,27 @@ VII \u2166
+ + + Universal Declaration of Human Rights +
Proclaims this 
Universal Declaration of Human Rights as a common standard of achievement for all peoples and all nations +
+ + + Universal Declaration of Human Rights +
Proclaims this 
+ Universal-Declaration-of-Human-Rights + as a common standard of achievement for all peoples and all nations +
+ + + \u05E9\u0591\u05E9 + \u05E9\u0592\u05E9 + + + + \u05E9\u0591\u05E9 +
\u05E9\u0592\u05E9
+