ICU-6659 Merge changes from branches/eric/boyer-moore

X-SVN-Rev: 25282
2009-01-22 00:24:48 +00:00 · 2009-01-22 00:24:48 +00:00 · 5f73103b5a
commit 5f73103b5a
parent d9737d2f4a
21 changed files with 4476 additions and 531 deletions
--- a/icu4c/source/i18n/Makefile.in
+++ b/icu4c/source/i18n/Makefile.in
@ -1,6 +1,6 @@
 #******************************************************************************
 #
-#   Copyright (C) 1998-2008, International Business Machines
+#   Copyright (C) 1998-2009, International Business Machines
 #   Corporation and others.  All Rights Reserved.
 #
 #******************************************************************************
@ -81,7 +81,7 @@ ulocdata.o measfmt.o currfmt.o curramt.o currunit.o measure.o utmscale.o \
 csdetect.o csmatch.o csr2022.o csrecog.o csrmbcs.o csrsbcs.o csrucode.o csrutf8.o inputext.o \
 wintzimpl.o windtfmt.o winnmfmt.o basictz.o dtrule.o rbtz.o tzrule.o tztrans.o vtzone.o \
 zonemeta.o zstrfmt.o plurrule.o plurfmt.o dtitvfmt.o dtitvinf.o \
-tmunit.o tmutamt.o tmutfmt.o
+tmunit.o tmutamt.o tmutfmt.o colldata.o bmsearch.o bms.o

 ## Header files to install
 HEADERS = $(srcdir)/unicode/*.h
--- a/icu4c/source/i18n/bms.cpp
+++ b/icu4c/source/i18n/bms.cpp
@ -0,0 +1,145 @@
+/*
+ * Copyright (C) 2008-2009, International Business Machines Corporation and Others.
+ * All rights reserved.
+ */
+
+#include "unicode/utypes.h"
+#include "cmemory.h"
+#include "unicode/bms.h"
+#include "unicode/unistr.h"
+#include "unicode/colldata.h"
+#include "unicode/bmsearch.h"
+
+// Cast helpers used by the C wrappers in this file. Defining USE_SAFE_CASTS
+// selects the C++ named casts; otherwise both macros fall back to a C-style
+// cast. The C-style expansions are wrapped in an outer pair of parentheses so
+// that the cast binds to the whole macro result: STATIC_CAST(T *, p)->m()
+// casts p, not the result of p->m().
+//#define USE_SAFE_CASTS
+#ifdef USE_SAFE_CASTS
+#define STATIC_CAST(type,value) static_cast<type>(value)
+#define CONST_CAST(type,value) const_cast<type>(value)
+#else
+#define STATIC_CAST(type,value) ((type) (value))
+#define CONST_CAST(type,value) ((type) (value))
+#endif
+
+// C entry point: obtains the CollData for the given collator through
+// CollData::open() and hands it back as an opaque UCD pointer.
+U_CAPI UCD * U_EXPORT2
+ucd_open(UCollator *coll, UErrorCode *status)
+{
+    CollData *collData = CollData::open(coll, *status);
+
+    return STATIC_CAST(UCD *, collData);
+}
+
+// C entry point: passes the CollData behind the opaque UCD pointer
+// to CollData::close().
+U_CAPI void U_EXPORT2
+ucd_close(UCD *ucd)
+{
+    CollData::close(STATIC_CAST(CollData *, ucd));
+}
+
+// C entry point: returns the collator reported by the CollData
+// behind the opaque UCD pointer.
+U_CAPI UCollator * U_EXPORT2
+ucd_getCollator(UCD *ucd)
+{
+    CollData *collData = STATIC_CAST(CollData *, ucd);
+    UCollator *collator = collData->getCollator();
+
+    return collator;
+}
+
+// C entry point: forwards to CollData::freeCollDataCache().
+U_CAPI void U_EXPORT2
+ucd_freeCache()
+{
+    CollData::freeCollDataCache();
+}
+
+// C entry point: forwards to CollData::flushCollDataCache().
+U_CAPI void U_EXPORT2
+ucd_flushCache()
+{
+    CollData::flushCollDataCache();
+}
+
+// State behind the opaque BMS handle of the C API.
+struct BMS
+{
+    // The C++ searcher that does the work; created in bms_open(), deleted in bms_close().
+    BoyerMooreSearch *bms;
+    // Copy of the caller's target text (NULL when no target is set); owned by
+    // this struct and deleted in bms_close() / bms_setTargetString().
+    const UnicodeString *targetString;
+};
+
+// C entry point: creates a Boyer-Moore searcher for the given pattern and
+// (optional) target text.
+//
+// ucd            collation data obtained from ucd_open(); must not be NULL
+// pattern/       the pattern to search for
+//   patternLength
+// target/        the text to search in; target may be NULL, in which case a
+//   targetLength   target can be supplied later with bms_setTargetString()
+// status         in/out error code
+//
+// Returns NULL if status already holds a failure, if ucd is NULL, or if the
+// BMS struct itself cannot be allocated. Otherwise returns a BMS that must be
+// released with bms_close(), even if *status reports a failure from a later
+// allocation (in that case bms->bms may be NULL).
+U_CAPI BMS * U_EXPORT2
+bms_open(UCD *ucd,
+         const UChar *pattern, int32_t patternLength,
+         const UChar *target,  int32_t targetLength,
+         UErrorCode  *status)
+{
+    // Do no work when the caller already has an error pending.
+    if (status == NULL || U_FAILURE(*status)) {
+        return NULL;
+    }
+
+    // The BoyerMooreSearch constructor dereferences the collation data.
+    if (ucd == NULL) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return NULL;
+    }
+
+    BMS *bms = STATIC_CAST(BMS *, uprv_malloc(sizeof(BMS)));
+
+    if (bms == NULL) {
+        *status = U_MEMORY_ALLOCATION_ERROR;
+        return NULL;
+    }
+
+    // uprv_malloc() does not initialize the struct; give both members a
+    // defined value so bms_close() is safe on every path below.
+    bms->bms = NULL;
+    bms->targetString = NULL;
+
+    CollData *data = STATIC_CAST(CollData *, ucd);
+    UnicodeString patternString(pattern, patternLength);
+
+    if (target != NULL) {
+        bms->targetString = new UnicodeString(target, targetLength);
+
+        if (bms->targetString == NULL) {
+            *status = U_MEMORY_ALLOCATION_ERROR;
+            return bms;
+        }
+    }
+
+    bms->bms = new BoyerMooreSearch(data, patternString, bms->targetString, *status);
+
+    if (bms->bms == NULL) {
+        *status = U_MEMORY_ALLOCATION_ERROR;
+    }
+
+    return bms;
+}
+
+// C entry point: releases a BMS returned by bms_open(), including the
+// searcher and the copy of the target text it owns. Passing NULL is a no-op,
+// since bms_open() returns NULL when the BMS itself could not be allocated.
+U_CAPI void U_EXPORT2
+bms_close(BMS *bms)
+{
+    if (bms == NULL) {
+        return;
+    }
+
+    delete bms->bms;
+
+    delete bms->targetString;
+
+    uprv_free(bms);
+}
+
+// C entry point: reports the result of BoyerMooreSearch::empty()
+// for the searcher held by this BMS.
+U_CAPI UBool U_EXPORT2
+bms_empty(BMS *bms)
+{
+    BoyerMooreSearch *searcher = bms->bms;
+
+    return searcher->empty();
+}
+
+// C entry point: returns the searcher's CollData as an opaque UCD pointer.
+U_CAPI UCD * U_EXPORT2
+bms_getData(BMS *bms)
+{
+    CollData *collData = bms->bms->getData();
+
+    return STATIC_CAST(UCD *, collData);
+}
+
+// C entry point: runs BoyerMooreSearch::search() from the given offset.
+// The match boundaries are written through the start and end pointers.
+U_CAPI UBool U_EXPORT2
+bms_search(BMS *bms, int32_t offset, int32_t *start, int32_t *end)
+{
+    BoyerMooreSearch *searcher = bms->bms;
+    int32_t &matchStart = *start;
+    int32_t &matchEnd   = *end;
+
+    return searcher->search(offset, matchStart, matchEnd);
+}
+
+// C entry point: replaces the text to be searched.
+//
+// target/targetLength  the new text; target may be NULL to clear the target
+// status               in/out error code; set to U_MEMORY_ALLOCATION_ERROR if
+//                      the copy of the text cannot be allocated, in which case
+//                      the previous target is left in place
+U_CAPI void U_EXPORT2
+bms_setTargetString(BMS *bms, const UChar *target, int32_t targetLength, UErrorCode *status)
+{
+    if (U_FAILURE(*status)) {
+        return;
+    }
+
+    // Build the replacement first so that an allocation failure leaves the
+    // current target (and the searcher that points at it) untouched.
+    const UnicodeString *newTarget = NULL;
+
+    if (target != NULL) {
+        newTarget = new UnicodeString(target, targetLength);
+
+        if (newTarget == NULL) {
+            *status = U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
+    }
+
+    const UnicodeString *oldTarget = bms->targetString;
+
+    bms->targetString = newTarget;
+    bms->bms->setTargetString(newTarget, *status);
+
+    // Only free the old text once the searcher no longer refers to it.
+    delete oldTarget;
+}
--- a/icu4c/source/i18n/bmsearch.cpp
+++ b/icu4c/source/i18n/bmsearch.cpp
@ -0,0 +1,864 @@
+/*
+ ******************************************************************************
+ *   Copyright (C) 1996-2009, International Business Machines                 *
+ *   Corporation and others.  All Rights Reserved.                            *
+ ******************************************************************************
+ */
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_COLLATION
+
+#include "unicode/unistr.h"
+#include "unicode/putil.h"
+#include "unicode/usearch.h"
+
+#include "cmemory.h"
+#include "unicode/coll.h"
+#include "unicode/tblcoll.h"
+#include "unicode/coleitr.h"
+#include "unicode/ucoleitr.h"
+
+#include "unicode/regex.h"        // TODO: make conditional on regexp being built.
+
+#include "unicode/uniset.h"
+#include "unicode/uset.h"
+#include "unicode/ustring.h"
+#include "hash.h"
+#include "uhash.h"
+#include "ucol_imp.h"
+#include "unormimp.h"
+
+#include "unicode/colldata.h"
+#include "unicode/bmsearch.h"
+
+U_NAMESPACE_BEGIN
+
+// Number of elements in a true array (not a pointer).
+#define ARRAY_SIZE(array) (sizeof(array)/sizeof((array)[0]))
+// Allocates uninitialized storage for count objects of the given type with
+// uprv_malloc(); yields NULL on failure. The expansion is fully parenthesized
+// so it can be used inside larger expressions.
+#define NEW_ARRAY(type, count) ((type *) uprv_malloc((count) * sizeof(type)))
+// Releases storage obtained with NEW_ARRAY.
+#define DELETE_ARRAY(array) uprv_free((void *) (array))
+
+
+// One buffered collation element together with the collation element
+// iterator's offsets recorded around the call that produced it.
+struct CEI
+{
+    uint32_t order;       // the (masked) collation element, or UCOL_NULLORDER
+    int32_t  lowOffset;   // smaller of the two iterator offsets around the CE
+    int32_t  highOffset;  // larger of the two iterator offsets around the CE
+};
+
+// Wraps the text being searched: owns a collation element iterator and a
+// character break iterator over it, and buffers the CEs read from the
+// iterator so the search can index them.
+class Target : public UMemory
+{
+public:
+    Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status);
+    ~Target();
+
+    // Switches to a new target string (may be NULL); the string is not copied.
+    void setTargetString(const UnicodeString *target);
+
+    // Return the buffered CE at the given buffer index, reading one more CE
+    // forward (nextCE) or backward (prevCE) when the index is the next free slot.
+    const CEI *nextCE(int32_t offset);
+    const CEI *prevCE(int32_t offset);
+
+    int32_t stringLength();
+    UChar charAt(int32_t offset);
+
+    UBool isBreakBoundary(int32_t offset);
+    int32_t nextBreakBoundary(int32_t offset);
+    int32_t nextSafeBoundary(int32_t offset);
+
+    UBool isIdentical(UnicodeString &pattern, int32_t start, int32_t end);
+
+    // Reset the CE buffer and reposition the collation element iterator.
+    void setOffset(int32_t offset);
+    void setLast(int32_t last);
+    int32_t getOffset();
+
+private:
+    CEI *ceb;              // CE buffer, bufferSize entries
+    int32_t bufferSize;
+    int32_t bufferMin;     // valid entries are ceb[bufferMin] .. ceb[bufferMax - 1]
+    int32_t bufferMax;
+
+    uint32_t strengthMask; // applied to every CE read from the iterator
+    UCollationStrength strength;
+    uint32_t variableTop;
+    UBool toShift;         // TRUE when alternate handling is UCOL_SHIFTED
+    UCollator *coll;
+
+    const UnicodeString *targetString; // not owned
+    const UChar *targetBuffer;         // targetString's buffer
+    int32_t targetLength;
+
+    UCollationElements *elements;      // owned; iterates over targetBuffer
+    UBreakIterator *charBreakIterator; // owned; character boundaries in targetBuffer
+};
+
+// Captures the collator's strength, alternate-handling and variable-top
+// settings, allocates the CE buffer, and (if target is not NULL) opens the
+// iterators over the target text.
+//
+// patternLength is used only to size the CE buffer.
+// status is set to U_MEMORY_ALLOCATION_ERROR if the buffer cannot be
+// allocated; in that case the target string and strength mask are not set up.
+Target::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status)
+    : bufferSize(0), bufferMin(0), bufferMax(0),
+      strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator),
+      targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL)
+{
+    strength = ucol_getStrength(coll);
+    toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
+    variableTop = ucol_getVariableTop(coll, &status);
+
+    // find the largest expansion
+    // (expansionCESize is scanned up to its zero terminator)
+    uint8_t maxExpansion = 0;
+    for (const uint8_t *expansion = coll->expansionCESize; *expansion != 0; expansion += 1) {
+        if (*expansion > maxExpansion) {
+            maxExpansion = *expansion;
+        }
+    }
+
+    // room for an extra character on each end, plus 4 for safety
+    bufferSize = patternLength + (2 * maxExpansion) + 4;
+
+    ceb = NEW_ARRAY(CEI, bufferSize);
+
+    if (ceb == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+
+    if (target != NULL) {
+        setTargetString(target);
+    }
+
+    // Build the CE mask cumulatively: every strength keeps the primary bits,
+    // secondary and above add the secondary bits, and anything other than
+    // primary/secondary (the default label) also adds the tertiary bits.
+    switch (strength) 
+    {
+    default:
+        strengthMask |= UCOL_TERTIARYORDERMASK;
+        /* fall through */
+
+    case UCOL_SECONDARY:
+        strengthMask |= UCOL_SECONDARYORDERMASK;
+        /* fall through */
+
+    case UCOL_PRIMARY:
+        strengthMask |= UCOL_PRIMARYORDERMASK;
+    }
+}
+
+// Frees the CE buffer and closes the two iterators owned by this object.
+Target::~Target()
+{
+    DELETE_ARRAY(ceb);
+
+    ucol_closeElements(elements);
+    ubrk_close(charBreakIterator);
+}
+
+// Points this Target at a new string (or at nothing, if target is NULL).
+// The string is not copied, so it must outlive its use here. Iterators over
+// the previous string are closed; new ones are opened over the new string.
+// Errors from opening the iterators are not reported to the caller.
+void Target::setTargetString(const UnicodeString *target)
+{
+    // Close each iterator independently and clear the pointer afterwards.
+    // Otherwise a later call with a NULL target would leave stale pointers
+    // behind for the destructor (or the next call) to close a second time.
+    if (charBreakIterator != NULL) {
+        ubrk_close(charBreakIterator);
+        charBreakIterator = NULL;
+    }
+
+    if (elements != NULL) {
+        ucol_closeElements(elements);
+        elements = NULL;
+    }
+
+    targetString = target;
+
+    if (targetString != NULL) {
+        UErrorCode status = U_ZERO_ERROR;
+
+        targetBuffer = targetString->getBuffer();
+        targetLength = targetString->length();
+
+        elements = ucol_openElements(coll, target->getBuffer(), target->length(), &status);
+        ucol_forceHanImplicit(elements, &status);
+
+        charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocale(coll, ULOC_VALID_LOCALE, &status),
+                                      targetBuffer, targetLength, &status);
+    } else {
+        targetBuffer = NULL;
+        targetLength = 0;
+    }
+}
+
+// Returns the CE at buffer index offset, reading forward from the collation
+// element iterator if that index is the next unfilled slot.
+//
+// Returns NULL if the index is neither already buffered nor the next free
+// slot, or if the buffer is full. At the end of the text the stored order is
+// UCOL_NULLORDER.
+const CEI *Target::nextCE(int32_t offset)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    int32_t low = -1, high = -1;
+    uint32_t order;
+    UBool cont = FALSE;
+
+    // already buffered
+    if (offset >= bufferMin && offset < bufferMax) {
+        return &ceb[offset];
+    }
+
+    // only the slot immediately after the buffered range can be filled
+    if (bufferMax >= bufferSize || offset != bufferMax) {
+        return NULL;
+    }
+
+    // read CEs until one survives masking / shifting (i.e. is not ignorable)
+    do {
+        low   = ucol_getOffset(elements);
+        order = ucol_next(elements, &status);
+        high  = ucol_getOffset(elements);
+
+        if (order == UCOL_NULLORDER) {
+          //high = low = -1;
+            break;
+        }
+
+        // remember the continuation flag before the mask strips it
+        cont = isContinuation(order);
+        order &= strengthMask;
+
+        // shifted alternate handling: a CE below variableTop with a non-zero
+        // primary keeps only its primary bits at quaternary strength or above,
+        // and is treated as ignorable otherwise
+        if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
+            if (strength >= UCOL_QUATERNARY) {
+                order &= UCOL_PRIMARYORDERMASK;
+            } else {
+                order = UCOL_IGNORABLE;
+            }
+        }
+    } while (order == UCOL_IGNORABLE);
+
+    if (cont) {
+        order |= UCOL_CONTINUATION_MARKER;
+    }
+
+    ceb[offset].order = order;
+    ceb[offset].lowOffset = low;
+    ceb[offset].highOffset = high;
+
+    bufferMax += 1;
+
+    return &ceb[offset];
+}
+
+// Returns the CE at buffer index offset, reading backward from the collation
+// element iterator if that index is the next unfilled slot. Because the
+// iterator moves backward, increasing buffer indexes hold CEs that are
+// earlier in the text.
+//
+// Returns NULL if the index is neither already buffered nor the next free
+// slot, or if the buffer is full. At the start of the text the stored order
+// is UCOL_NULLORDER.
+const CEI *Target::prevCE(int32_t offset)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    int32_t low = -1, high = -1;
+    uint32_t order;
+    UBool cont = FALSE;
+
+    // already buffered
+    if (offset >= bufferMin && offset < bufferMax) {
+        return &ceb[offset];
+    }
+
+    // only the slot immediately after the buffered range can be filled
+    if (bufferMax >= bufferSize || offset != bufferMax) {
+        return NULL;
+    }
+
+    // read CEs until one survives masking / shifting (i.e. is not ignorable)
+    do {
+        high  = ucol_getOffset(elements);
+        order = ucol_previous(elements, &status);
+        low   = ucol_getOffset(elements);
+
+        if (order == UCOL_NULLORDER) {
+            break;
+        }
+
+        // remember the continuation flag before the mask strips it
+        cont = isContinuation(order);
+        order &= strengthMask;
+
+        // same shifted-alternate handling as in nextCE()
+        if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
+            if (strength >= UCOL_QUATERNARY) {
+                order &= UCOL_PRIMARYORDERMASK;
+            } else {
+                order = UCOL_IGNORABLE;
+            }
+        }
+    } while (order == UCOL_IGNORABLE);
+
+    bufferMax += 1;
+
+    if (cont) {
+        order |= UCOL_CONTINUATION_MARKER;
+    }
+
+    ceb[offset].order       = order;
+    ceb[offset].lowOffset   = low;
+    ceb[offset].highOffset = high;
+
+    return &ceb[offset];
+}
+
+// Length of the current target text, or 0 when no target is set.
+int32_t Target::stringLength()
+{
+    return (targetString == NULL) ? 0 : targetLength;
+}
+
+// Code unit at the given index of the target text, or U+0000 when no
+// target is set. The index is not range checked.
+UChar Target::charAt(int32_t offset)
+{
+    if (targetString == NULL) {
+        return 0x0000;
+    }
+
+    return targetBuffer[offset];
+}
+
+// Empties the CE buffer and moves the collation element iterator to the
+// given text offset. Any error from the iterator is discarded.
+void Target::setOffset(int32_t offset)
+{
+    UErrorCode errorCode = U_ZERO_ERROR;
+
+    bufferMin = bufferMax = 0;
+
+    ucol_setOffset(elements, offset, &errorCode);
+}
+
+// Resets the CE buffer so that it holds a single UCOL_NULLORDER entry whose
+// low and high offsets are both `last`, then moves the collation element
+// iterator to that text offset. Any error from the iterator is discarded.
+void Target::setLast(int32_t last)
+{
+    UErrorCode errorCode = U_ZERO_ERROR;
+    CEI &sentinel = ceb[0];
+
+    sentinel.order      = UCOL_NULLORDER;
+    sentinel.lowOffset  = last;
+    sentinel.highOffset = last;
+
+    bufferMin = 0;
+    bufferMax = 1;
+
+    ucol_setOffset(elements, last, &errorCode);
+}
+
+// Current offset of the collation element iterator (ucol_getOffset).
+int32_t Target::getOffset()
+{
+    return ucol_getOffset(elements);
+}
+
+// Asks the character break iterator whether offset is a boundary (ubrk_isBoundary).
+UBool Target::isBreakBoundary(int32_t offset)
+{
+    return ubrk_isBoundary(charBreakIterator, offset);
+}
+
+// Returns the character break iterator's boundary following offset (ubrk_following).
+int32_t Target::nextBreakBoundary(int32_t offset)
+{
+    return ubrk_following(charBreakIterator, offset);
+}
+
+// Scans forward from offset and returns the index of the first code unit
+// that is either a lead surrogate or not flagged by ucol_unsafeCP() for this
+// collator. Returns the target length if no such code unit is found.
+int32_t Target::nextSafeBoundary(int32_t offset)
+{
+    for (int32_t index = offset; index < targetLength; index += 1) {
+        UChar unit = targetBuffer[index];
+        UBool isSafe = U_IS_LEAD(unit) || ! ucol_unsafeCP(unit, coll);
+
+        if (isSafe) {
+            return index;
+        }
+    }
+
+    return targetLength;
+}
+
+// Identical-strength check for a candidate match: compares the decomposition
+// (unorm_decompose) of target[start, end) with the decomposition of the
+// whole pattern.
+//
+// Returns TRUE without comparing when the collator strength is below
+// UCOL_IDENTICAL. Returns FALSE if the decompositions differ, if a scratch
+// buffer cannot be allocated, or if decomposition fails.
+UBool Target::isIdentical(UnicodeString &pattern, int32_t start, int32_t end)
+{
+    if (strength < UCOL_IDENTICAL) {
+        return TRUE;
+    }
+
+    // stack buffers for the common, short case
+    UChar t2[32], p2[32];
+    const UChar *pBuffer = pattern.getBuffer();
+    int32_t pLength = pattern.length();
+    int32_t length = end - start;
+
+    UErrorCode status = U_ZERO_ERROR, status2 = U_ZERO_ERROR;
+
+    int32_t decomplength = unorm_decompose(t2, ARRAY_SIZE(t2), 
+                                       targetBuffer + start, length, 
+                                       FALSE, 0, &status);
+
+    // use separate status2 in case of buffer overflow
+    if (decomplength != unorm_decompose(p2, ARRAY_SIZE(p2),
+                                        pBuffer, pLength,
+                                        FALSE, 0, &status2)) {
+        return FALSE; // lengths are different
+    }
+
+    // compare contents
+    UChar *text, *pat;
+
+    if(U_SUCCESS(status)) {
+        text = t2;
+        pat = p2;
+    } else if(status == U_BUFFER_OVERFLOW_ERROR) {
+        // the stack buffers were too small; redo both decompositions on the heap
+        status = U_ZERO_ERROR;
+
+        // allocate one buffer for both decompositions
+        text = NEW_ARRAY(UChar, decomplength * 2);
+
+        // Check for allocation failure.
+        if (text == NULL) {
+        	return FALSE;
+        }
+
+        pat = text + decomplength;
+
+        unorm_decompose(text, decomplength, targetBuffer + start, 
+                        length, FALSE, 0, &status);
+
+        unorm_decompose(pat, decomplength, pBuffer, 
+                        pLength, FALSE, 0, &status);
+    } else {
+        // NFD failed, make sure that u_memcmp() does not overrun t2 & p2
+        // and that we don't uprv_free() an undefined text pointer
+        text = pat = t2;
+        decomplength = 0;
+    }
+
+    UBool result = (UBool)(u_memcmp(pat, text, decomplength) == 0);
+
+    // free the heap buffer if one was used
+    if(text != t2) {
+        DELETE_ARRAY(text);
+    }
+
+    // return FALSE if NFD failed
+    return U_SUCCESS(status) && result;
+}
+
+// Number of slots in the bad-character table; CEs are mapped to a slot by hash().
+#define HASH_TABLE_SIZE 257
+
+// Bad-character shift table for the Boyer-Moore search, keyed by a hash of
+// the collation element. Also caches, for each pattern CE index, the value
+// CollData::minLengthInChars() reports for the pattern from that index on.
+class BadCharacterTable : public UMemory
+{
+public:
+    BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status);
+    ~BadCharacterTable();
+
+    // Shift distance stored for the slot that ce hashes to.
+    int32_t operator[](uint32_t ce) const;
+    // Default shift distance: the cached minimum length for the whole pattern.
+    int32_t getMaxSkip() const;
+    // Cached minimum length for the pattern starting at CE index `index`.
+    int32_t minLengthInChars(int32_t index);
+
+private:
+    static int32_t hash(uint32_t ce);
+
+    int32_t maxSkip;
+    int32_t badCharacterTable[HASH_TABLE_SIZE];
+
+    int32_t *minLengthCache;   // plen + 1 entries, allocated by the constructor
+};
+
+// Builds the bad-character table for the given pattern CEs.
+//
+// patternCEs  the pattern's collation elements
+// data        supplies minLengthInChars() for suffixes of the pattern
+// status      in/out error code; set to U_MEMORY_ALLOCATION_ERROR when a
+//             work buffer cannot be allocated
+//
+// If status already indicates failure, the pattern is empty, or an
+// allocation fails, the object is left with maxSkip == 0, an all-zero table
+// and (except after partial construction) no minLengthCache.
+BadCharacterTable::BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status)
+    : maxSkip(0), minLengthCache(NULL)
+{
+    int32_t plen = patternCEs.size();
+
+    // Give every slot a defined value before any early return, so that
+    // getMaxSkip() and operator[] never read uninitialized memory.
+    for (int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {
+        badCharacterTable[j] = 0;
+    }
+
+    // **** need a better way to deal with this ****
+    if (U_FAILURE(status) || plen == 0) {
+        return;
+    }
+
+    // scratch array handed to CollData::minLengthInChars()
+    int32_t *history = NEW_ARRAY(int32_t, plen);
+
+    if (history == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+
+    for (int32_t i = 0; i < plen; i += 1) {
+        history[i] = -1;
+    }
+
+    minLengthCache = NEW_ARRAY(int32_t, plen + 1);
+
+    if (minLengthCache == NULL) {
+        DELETE_ARRAY(history);
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+
+    maxSkip = minLengthCache[0] = data->minLengthInChars(&patternCEs, 0, history);
+
+    // by default, a mismatch skips the minimum length of the whole pattern
+    for (int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {
+        badCharacterTable[j] = maxSkip;
+    }
+
+    for (int32_t p = 1; p < plen; p += 1) {
+        minLengthCache[p] = data->minLengthInChars(&patternCEs, p, history);
+
+        // Make sure this entry is not bigger than the previous one.
+        // Otherwise, we might skip too far in some cases.
+        if (minLengthCache[p] < 0 || minLengthCache[p] > minLengthCache[p - 1]) {
+            minLengthCache[p] = minLengthCache[p - 1];
+        }
+    }
+
+    minLengthCache[plen] = 0;
+
+    // for each pattern CE except the last, record the minimum length of the
+    // pattern that follows it; later occurrences overwrite earlier ones
+    for (int32_t p = 0; p < plen - 1; p += 1) {
+        badCharacterTable[hash(patternCEs[p])] = minLengthCache[p + 1];
+    }
+
+    DELETE_ARRAY(history);
+}
+
+// Frees the minimum-length cache allocated by the constructor.
+BadCharacterTable::~BadCharacterTable()
+{
+    DELETE_ARRAY(minLengthCache);
+}
+
+// Shift distance stored in the slot that the given CE hashes to.
+int32_t BadCharacterTable::operator[](uint32_t ce) const
+{
+    return badCharacterTable[hash(ce)];
+}
+
+// Returns maxSkip, the default value the constructor stores in every table slot.
+int32_t BadCharacterTable::getMaxSkip() const
+{
+    return maxSkip;
+}
+
+// Returns the cached minimum length for the given pattern CE index.
+// The index is not range checked.
+int32_t BadCharacterTable::minLengthInChars(int32_t index)
+{
+    return minLengthCache[index];
+}
+
+// Maps a CE to a table slot: its primary order modulo HASH_TABLE_SIZE.
+// Different CEs can therefore share a slot.
+int32_t BadCharacterTable::hash(uint32_t ce)
+{
+    return UCOL_PRIMARYORDER(ce) % HASH_TABLE_SIZE;
+}
+
+// Good-suffix shift table for the Boyer-Moore search, indexed by the
+// pattern CE index at which a mismatch occurred.
+class GoodSuffixTable : public UMemory
+{
+public:
+    GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status);
+    ~GoodSuffixTable();
+
+    // Shift distance for a mismatch at the given pattern CE index (not range checked).
+    int32_t operator[](int32_t offset) const;
+
+private:
+    int32_t *goodSuffixTable;   // one entry per pattern CE; NULL if construction bailed out
+};
+
+// Builds the good-suffix table for the given pattern CEs. Shift distances
+// are expressed via badCharacterTable.minLengthInChars() rather than CE counts.
+//
+// status is set to U_MEMORY_ALLOCATION_ERROR if a work buffer cannot be
+// allocated. If status already indicates failure or the pattern is empty,
+// nothing is built and goodSuffixTable stays NULL.
+GoodSuffixTable::GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status)
+    : goodSuffixTable(NULL)
+{
+    int32_t patlen = patternCEs.size();
+
+    // **** need a better way to deal with this ****
+    if (U_FAILURE(status) || patlen <= 0) {
+        return;
+    }
+
+    // suff[i] = length of the longest run of CEs ending at i that matches a suffix of the pattern
+    int32_t *suff  = NEW_ARRAY(int32_t, patlen);
+    int32_t start = patlen - 1, end = - 1;
+    int32_t maxSkip = badCharacterTable.getMaxSkip();
+
+    if (suff == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+
+    // initialize suff
+    suff[patlen - 1] = patlen;
+
+    for (int32_t i = patlen - 2; i >= 0; i -= 1) {
+        // (i > start) means we're inside the last suffix match we found
+        // ((patlen - 1) - end) is how far the end of that match is from end of pattern
+        // (i - start) is how far we are from start of that match
+        // (i + (patlen - 1) - end) is index of same character at end of pattern
+        // so if any suffix match at that character doesn't extend beyond the last match,
+        // it's the suffix for this character as well
+        if (i > start && suff[i + patlen - 1 - end] < i - start) {
+            suff[i] = suff[i + patlen - 1 - end];
+        } else {
+            start = end = i;
+
+            int32_t s = patlen;
+
+            // extend the match leftward from i, comparing against the end of the pattern
+            while (start >= 0 && patternCEs[start] == patternCEs[--s]) {
+                start -= 1;
+            }
+
+            suff[i] = end - start;
+        }
+    }
+
+    // now build goodSuffixTable
+    goodSuffixTable  = NEW_ARRAY(int32_t, patlen);
+
+    if (goodSuffixTable == NULL) {
+        DELETE_ARRAY(suff);
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+
+
+    // initialize entries to minLengthInChars of the pattern
+    for (int32_t i = 0; i < patlen; i += 1) {
+        goodSuffixTable[i] = maxSkip;
+    }
+
+    int32_t prefix = 0;
+
+    for (int32_t i = patlen - /*1*/ 2; i >= 0; i -= 1) {
+        if (suff[i] == i + 1) {
+            // this matching suffix is a prefix of the pattern
+            int32_t prefixSkip = badCharacterTable.minLengthInChars(i + 1);
+
+            // for any mis-match before this suffix, we should skip
+            // so that the front of the pattern (i.e. the prefix)
+            // lines up with the front of the suffix.
+            // (patlen - 1 - i) is the start of the suffix
+            while (prefix < patlen - 1 - i) {
+                // value of maxSkip means never set...
+                if (goodSuffixTable[prefix] == maxSkip) {
+                    goodSuffixTable[prefix] = prefixSkip;
+                }
+
+                prefix += 1;
+            }
+        }
+    }
+
+    // a mismatch just before a suffix of length suff[i] can shift so that the
+    // occurrence ending at i lines up with that suffix
+    for (int32_t i = 0; i < patlen - 1; i += 1) {
+        goodSuffixTable[patlen - 1 - suff[i]] = badCharacterTable.minLengthInChars(i + 1);
+    }
+
+    DELETE_ARRAY(suff);
+}
+
+// Frees the table allocated by the constructor.
+GoodSuffixTable::~GoodSuffixTable()
+{
+    DELETE_ARRAY(goodSuffixTable);
+}
+
+// Returns the table entry at the given index; the index is not range checked.
+int32_t GoodSuffixTable::operator[](int32_t offset) const
+{
+    return goodSuffixTable[offset];
+}
+
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BoyerMooreSearch)
+
+
+// Returns TRUE if the pattern produced no collation elements. A searcher
+// whose pattern CE list was never created (construction failed) is also
+// reported as empty instead of dereferencing a NULL list.
+UBool BoyerMooreSearch::empty()
+{
+    return patCEs == NULL || patCEs->size() <= 0;
+}
+
+// Returns the CollData pointer this searcher was constructed with.
+CollData *BoyerMooreSearch::getData()
+{
+    return data;
+}
+
+// Returns the pattern's CE list; NULL if the constructor did not create one.
+CEList *BoyerMooreSearch::getPatternCEs()
+{
+    return patCEs;
+}
+
+// Returns the bad-character table; NULL if the constructor did not create one.
+BadCharacterTable *BoyerMooreSearch::getBadCharacterTable()
+{
+    return badCharacterTable;
+}
+
+// Returns the good-suffix table; NULL if the constructor did not create one.
+GoodSuffixTable *BoyerMooreSearch::getGoodSuffixTable()
+{
+    return goodSuffixTable;
+}
+
+// Builds a searcher for patternString using the collator from theData.
+//
+// theData        collation data; its collator is used for the pattern and target
+// patternString  the pattern; copied into this object
+// targetString   the text to search, or NULL to set one later with
+//                setTargetString(); not copied
+// status         in/out error code. Nothing is built if it already indicates
+//                failure. Set to U_MEMORY_ALLOCATION_ERROR when any of the
+//                helper objects cannot be allocated; construction stops at
+//                the first failure and the remaining members stay NULL.
+BoyerMooreSearch::BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString,
+                                   UErrorCode &status)
+    : data(theData), patCEs(NULL), badCharacterTable(NULL), goodSuffixTable(NULL), pattern(patternString), target(NULL)
+{
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    UCollator *collator = data->getCollator();
+
+    patCEs = new CEList(collator, patternString, status);
+
+    // new returning NULL means the constructor never ran, so status is
+    // still a success code and the failure has to be reported here.
+    if (patCEs == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
+
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    badCharacterTable = new BadCharacterTable(*patCEs, data, status);
+
+    if (badCharacterTable == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
+
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    goodSuffixTable = new GoodSuffixTable(*patCEs, *badCharacterTable, status);
+
+    if (goodSuffixTable == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
+
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    if (targetString != NULL) {
+        target = new Target(collator, targetString, patCEs->size(), status);
+
+        if (target == NULL) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+        }
+    }
+}
+
+// Deletes the helper objects created by the constructor / setTargetString().
+// The CollData and the target string itself are not deleted here.
+BoyerMooreSearch::~BoyerMooreSearch()
+{
+    delete target;
+    delete goodSuffixTable;
+    delete badCharacterTable;
+    delete patCEs;
+}
+
+// Sets (or replaces) the text to be searched. The string is not copied.
+//
+// status  in/out error code. Nothing happens if it already indicates
+//         failure. Set to U_INVALID_STATE_ERROR if no Target exists yet and
+//         the pattern CE list is missing (construction failed), and to
+//         U_MEMORY_ALLOCATION_ERROR if a new Target cannot be allocated.
+void BoyerMooreSearch::setTargetString(const UnicodeString *targetString, UErrorCode &status)
+{
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    if (target != NULL) {
+        target->setTargetString(targetString);
+        return;
+    }
+
+    // Creating a Target needs the pattern's CE count to size its buffer.
+    if (patCEs == NULL) {
+        status = U_INVALID_STATE_ERROR;
+        return;
+    }
+
+    target = new Target(data->getCollator(), targetString, patCEs->size(), status);
+
+    if (target == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
+}
+
+// **** main flow of this code from Laura Werner's "Unicode Text Searching in Java" paper. ****
+/*
+ * TODO:
+ *  * deal with trailing (and leading?) ignorables.
+ *  * Adding BoyerMooreSearch object slowed it down. How can we speed it up?
+ */
+// Searches the target text for the pattern, starting at text offset `offset`.
+//
+// On success returns TRUE and sets start/end to the text offsets of the
+// match. Otherwise returns FALSE and sets both to -1. A search always fails
+// when the pattern is empty, when no target has been set, or when the
+// searcher was not fully constructed.
+UBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end)
+{
+    // Fail cleanly instead of dereferencing NULL when construction did not
+    // complete or no target string has been supplied yet.
+    if (patCEs == NULL || badCharacterTable == NULL || goodSuffixTable == NULL || target == NULL) {
+        start = end = -1;
+        return FALSE;
+    }
+
+    int32_t plen = patCEs->size();
+
+    if (plen <= 0) {
+        // Searching for a zero length pattern always fails.
+        start = end = -1;
+        return FALSE;
+    }
+
+    int32_t tlen = target->stringLength();
+    int32_t maxSkip = badCharacterTable->getMaxSkip();
+    int32_t tOffset = offset + maxSkip;
+
+    // tOffset is the text offset at which the end of the pattern is currently aligned
+    while (tOffset <= tlen) {
+        int32_t pIndex = plen - 1;
+        int32_t tIndex = 0;
+        int32_t lIndex = 0;
+
+        if (tOffset < tlen) {
+            // **** we really want to skip ahead enough to  ****
+            // **** be sure we get at least 1 non-ignorable ****
+            // **** CE after the end of the pattern.        ****
+            int32_t next = target->nextSafeBoundary(tOffset + 1);
+
+            target->setOffset(next);
+
+            // Walk backward from the safe boundary to find the CE that ends at or before tOffset.
+            // NOTE(review): prevCE() returns NULL once the CE buffer is full; this loop
+            // assumes that never happens before the condition below is met - confirm.
+            for (lIndex = 0; ; lIndex += 1) {
+                const CEI *cei = target->prevCE(lIndex);
+                int32_t low = cei->lowOffset;
+                int32_t high = cei->highOffset;
+
+                if (high == 0 || (low < high && low <= tOffset)) {
+                    if (low < tOffset) {
+                        while (lIndex >= 0 && target->prevCE(lIndex)->highOffset == high) {
+                            lIndex -= 1;
+                        }
+
+                        if (high > tOffset) {
+                            tOffset = high;
+                        }
+                    }
+
+                    break;
+                }
+            }
+        } else {
+            target->setLast(tOffset);
+            lIndex = 0;
+        }
+
+        tIndex = ++lIndex;
+
+        // Iterate backward until we hit the beginning of the pattern
+        while (pIndex >= 0) {
+            uint32_t pce = (*patCEs)[pIndex];
+            const CEI *tcei = target->prevCE(tIndex++);
+
+
+            if (tcei->order != pce) {
+                // There is a mismatch at this position.  Decide how far
+                // over to shift the pattern, then try again.
+
+                int32_t gsOffset = tOffset + (*goodSuffixTable)[pIndex];
+#ifdef EXTRA_CAUTIOUS
+                int32_t old = tOffset;
+#endif
+
+                tOffset += (*badCharacterTable)[tcei->order] - badCharacterTable->minLengthInChars(pIndex + 1);
+
+                if (gsOffset > tOffset) {
+                    tOffset = gsOffset;
+                }
+
+#ifdef EXTRA_CAUTIOUS
+                // Make sure we don't skip backwards...
+                if (tOffset <= old) {
+                    tOffset = old + 1;
+                }
+#endif
+
+                break;
+            }
+
+            pIndex -= 1;
+        }
+
+        if (pIndex < 0) {
+            // We made it back to the beginning of the pattern,
+            // which means we matched it all.  Return the location.
+            const CEI firstCEI = *target->prevCE(tIndex - 1);
+            const CEI lastCEI  = *target->prevCE(lIndex);
+            int32_t mStart   = firstCEI.lowOffset;
+            int32_t minLimit = lastCEI.lowOffset;
+            int32_t maxLimit = lastCEI.highOffset;
+            int32_t mLimit;
+            UBool found = TRUE;
+
+            // Look at the CE that follows the match to decide where the match may end.
+            target->setOffset(/*tOffset*/maxLimit);
+
+            const CEI nextCEI = *target->nextCE(0);
+
+            if (nextCEI.lowOffset > maxLimit) {
+                maxLimit = nextCEI.lowOffset;
+            }
+
+            // The following CE has equal low and high offsets and is not the end-of-text marker: reject.
+            if (nextCEI.lowOffset == nextCEI.highOffset && nextCEI.order != UCOL_NULLORDER) {
+                found = FALSE;
+            }
+
+            if (! target->isBreakBoundary(mStart)) {
+                found = FALSE;
+            }
+
+            if (firstCEI.lowOffset == firstCEI.highOffset) {
+                found = FALSE;
+            }
+
+            mLimit = maxLimit;
+            if (minLimit < maxLimit) {
+                int32_t nbb = target->nextBreakBoundary(minLimit);
+
+                if (nbb >= lastCEI.highOffset) {
+                    mLimit = nbb;
+                }
+            }
+
+            if (mLimit > maxLimit) {
+                found = FALSE;
+            }
+
+            if (! target->isBreakBoundary(mLimit)) {
+                found = FALSE;
+            }
+
+            if (! target->isIdentical(pattern, mStart, mLimit)) {
+                found = FALSE;
+            }
+
+            if (found) {
+                start = mStart;
+                end   = mLimit;
+
+                return TRUE;
+            }
+
+            tOffset += (*goodSuffixTable)[0]; // really? Maybe += 1 or += maxSkip?
+        }
+        // Otherwise, we're here because of a mismatch, so keep going....
+    }
+
+    // no match
+    start = -1;
+    end = -1;
+    return FALSE;
+}
+
+U_NAMESPACE_END
+
+#endif // #if !UCONFIG_NO_COLLATION
--- a/icu4c/source/i18n/colldata.cpp
+++ b/icu4c/source/i18n/colldata.cpp
--- a/icu4c/source/i18n/i18n.vcproj
+++ b/icu4c/source/i18n/i18n.vcproj
@ -408,6 +408,40 @@
 		<Filter
 			Name="collation"
 			>
+			<File
+				RelativePath=".\bms.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\unicode\bms.h"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode"
+						Outputs="..\..\include\unicode\$(InputFileName)"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\bmsearch.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\unicode\bmsearch.h"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode"
+						Outputs="..\..\include\unicode\$(InputFileName)"
+					/>
+				</FileConfiguration>
+			</File>
 			<File
 				RelativePath=".\bocsu.c"
 				>
@ -504,6 +538,23 @@
 					/>
 				</FileConfiguration>
 			</File>
+			<File
+				RelativePath=".\colldata.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\unicode\colldata.h"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode"
+						Outputs="..\..\include\unicode\$(InputFileName)"
+					/>
+				</FileConfiguration>
+			</File>
 			<File
 				RelativePath=".\search.cpp"
 				>
--- a/icu4c/source/i18n/ucln_in.h
+++ b/icu4c/source/i18n/ucln_in.h
@ -1,7 +1,7 @@
 /*
 ******************************************************************************
 *                                                                            *
-* Copyright (C) 2001-2008, International Business Machines                   *
+* Copyright (C) 2001-2009, International Business Machines                   *
 *                Corporation and others. All Rights Reserved.                *
 *                                                                            *
 ******************************************************************************
@ -45,6 +45,7 @@ typedef enum ECleanupI18NType {
    UCLN_I18N_UCOL_RES,
    UCLN_I18N_UCOL_BLD,
    UCLN_I18N_CSDET,
+    UCLN_I18N_COLL_DATA,
    UCLN_I18N_COUNT /* This must be last */
 } ECleanupI18NType;

--- a/icu4c/source/i18n/ucol.cpp
+++ b/icu4c/source/i18n/ucol.cpp
@ -1,6 +1,6 @@
 /*
 *******************************************************************************
-*   Copyright (C) 1996-2008, International Business Machines
+*   Copyright (C) 1996-2009, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *******************************************************************************
 *   file name:  ucol.cpp
@ -123,7 +123,6 @@ uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
    IInit_collIterate(collator, sourceString, sourceLen, s);
 }

-
 /**
 * Backup the state of the collIterate struct data
 * @param data collIterate to backup
@ -1499,10 +1498,30 @@ inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou
    }
    else
    {
-        order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
+        // Always use UCA for Han, Hangul
+        // (Han extension A is before main Han block)
+        // **** Han compatibility chars ?? ****
+        if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
+            (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
+            if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
+                // between the two target ranges; do normal lookup
+                // **** this range is YI, Modifier tone letters, ****
+                // **** Latin-D, Syloti Nagri, Phags-pa.         ****
+                // **** Latin-D might be tailored, so we need to ****
+                // **** do the normal lookup for these guys.     ****
+                order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
+            } else {
+                // in one of the target ranges; use UCA
+                order = UCOL_NOT_FOUND;
+            }
+        } else {
+            order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
+        }
+
        if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
            order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
        }
+
        if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
            /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
            order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
@ -1939,7 +1958,23 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
                result = coll->latinOneMapping[ch];
            }
            else {
-                result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
+                // Always use UCA for [3400..9FFF], [AC00..D7AF]
+                // **** [FA0E..FA2F] ?? ****
+                if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
+                    (ch >= 0x3400 && ch <= 0xD7AF)) {
+                    if (ch > 0x9FFF && ch < 0xAC00) {
+                        // between the two target ranges; do normal lookup
+                        // **** this range is YI, Modifier tone letters, ****
+                        // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
+                        // **** Latin-D might be tailored, so we need to ****
+                        // **** do the normal lookup for these guys.     ****
+                         result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
+                    } else {
+                        result = UCOL_NOT_FOUND;
+                    }
+                } else {
+                    result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
+                }
            }
            if (result > UCOL_NOT_FOUND) {
                result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
@ -3545,38 +3580,12 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,

            int32_t offsetBias;

-#if 0
-            if (source->offsetReturn != NULL) {
-                source->offsetStore = source->offsetReturn - noChars;
-            }
-
            // **** doesn't work if using iterator ****
            if (source->flags & UCOL_ITER_INNORMBUF) {
-                if (source->fcdPosition == NULL) {
-                    offsetBias = 0;
-                } else {
-                    offsetBias = (int32_t)(source->fcdPosition - source->string);
-                }
-            } else {
-                offsetBias = (int32_t)(source->pos - source->string);
-            }
-
-#else
-            // **** doesn't work if using iterator ****
-            if (source->flags & UCOL_ITER_INNORMBUF) {
-#if 1
                offsetBias = -1;
-#else
-              if (source->fcdPosition == NULL) {
-                  offsetBias = 0;
-              } else {
-                  offsetBias = (int32_t)(source->fcdPosition - source->string);
-              }
-#endif
            } else {
                offsetBias = (int32_t)(source->pos - source->string);
            }
-#endif

            /* a new collIterate is used to simplify things, since using the current
            collIterate will mean that the forward and backwards iteration will
@ -3584,9 +3593,9 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
            collIterate temp;
            int32_t rawOffset;

-            //IInit_collIterate(coll, UCharOffset, -1, &temp);
            IInit_collIterate(coll, UCharOffset, noChars, &temp);
            temp.flags &= ~UCOL_ITER_NORM;
+            temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;

            rawOffset = temp.pos - temp.string; // should always be zero?
            CE = ucol_IGetNextCE(coll, &temp, status);
@ -3679,7 +3688,12 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
                    }
                }

-                rawOffset = temp.pos - temp.string;
+                if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
+                    rawOffset = temp.fcdPosition - temp.string;
+                } else {
+                    rawOffset = temp.pos - temp.string;
+                }
+
                CE = ucol_IGetNextCE(coll, &temp, status);
            }

@ -4136,29 +4150,6 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
            }

        case IMPLICIT_TAG:        /* everything that is not defined otherwise */
-#if 0
-			if (source->offsetBuffer == NULL) {
-				source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
-				source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
-				source->offsetStore = source->offsetBuffer;
-			}
-
-			// **** doesn't work if using iterator ****
-			if (source->flags & UCOL_ITER_INNORMBUF) {
-			  source->offsetRepeatCount = 1;
-			} else {
-			  int32_t firstOffset = (int32_t)(source->pos - source->string);
-
-			  *(source->offsetStore++) = firstOffset;
-			  *(source->offsetStore++) = firstOffset + 1;
-
-				source->offsetReturn = source->offsetStore - 1;
-				if (source->offsetReturn == source->offsetBuffer) {
-					source->offsetStore = source->offsetBuffer;
-				}
-			}
-#endif
-
            return getPrevImplicit(ch, source);

            // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
--- a/icu4c/source/i18n/ucol_imp.h
+++ b/icu4c/source/i18n/ucol_imp.h
@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-*   Copyright (C) 1998-2008, International Business Machines
+*   Copyright (C) 1998-2009, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
@ -260,6 +260,8 @@ minimum number for special Jamo
                              /* by index */
 #define UCOL_USE_ITERATOR   64

+#define UCOL_FORCE_HAN_IMPLICIT 128
+
 #define NFC_ZERO_CC_BLOCK_LIMIT_  0x300

 typedef struct collIterate {
@ -390,6 +392,29 @@ uprv_init_pce(const struct UCollationElements *elems);
                         (((uint32_t)(ch) - 0x1161) <= (0x1175 - 0x1161)) || \
                         (((uint32_t)(ch) - 0x11A8) <= (0x11C2 - 0x11A8)))

+/* Han character ranges: main block, extension A, unified ideographs in the compatibility block */
+#define UCOL_FIRST_HAN 0x4E00
+#define UCOL_LAST_HAN  0x9FFF
+#define UCOL_FIRST_HAN_A 0x3400
+#define UCOL_LAST_HAN_A  0x4DBF
+#define UCOL_FIRST_HAN_COMPAT 0xFA0E  /* U+FA0E; FIRST must not exceed LAST or the range is empty */
+#define UCOL_LAST_HAN_COMPAT  0xFA2F
+
+/* Han extension B is in plane 2 */
+#define UCOL_FIRST_HAN_B_LEAD  0xD840
+#define UCOL_FIRST_HAN_B_TRAIL 0xDC00
+#define UCOL_LAST_HAN_B_LEAD   0xD869
+#define UCOL_LAST_HAN_B_TRAIL  0xDEDF
+
+/* Hangul range */
+#define UCOL_FIRST_HANGUL 0xAC00
+#define UCOL_LAST_HANGUL  0xD7AF
+
+/* Jamo ranges */
+#define UCOL_FIRST_L_JAMO 0x1100
+#define UCOL_FIRST_V_JAMO 0x1161
+#define UCOL_FIRST_T_JAMO 0x11A8
+#define UCOL_LAST_T_JAMO  0x11F9


 #if 0
--- a/icu4c/source/i18n/ucol_sit.cpp
+++ b/icu4c/source/i18n/ucol_sit.cpp
@ -1,6 +1,6 @@
 /*
 *******************************************************************************
-*   Copyright (C) 2004-2008, International Business Machines
+*   Copyright (C) 2004-2009, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *******************************************************************************
 *   file name:  ucol_sit.cpp
@ -578,15 +578,15 @@ ucol_getShortDefinitionString(const UCollator *coll,
    if(elementSize) {
        // we should probably canonicalize here...
        elementSize = uloc_getLanguage(locBuff, tempbuff, internalBufferSize, status);
-        appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, languageArg);
+        appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, languageArg);
        elementSize = uloc_getCountry(locBuff, tempbuff, internalBufferSize, status);
-        appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, regionArg);
+        appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, regionArg);
        elementSize = uloc_getScript(locBuff, tempbuff, internalBufferSize, status);
-        appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, scriptArg);
+        appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, scriptArg);
        elementSize = uloc_getVariant(locBuff, tempbuff, internalBufferSize, status);
-        appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, variantArg);
+        appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, variantArg);
        elementSize = uloc_getKeywordValue(locBuff, "collation", tempbuff, internalBufferSize, status);
-        appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, keywordArg);
+        appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, keywordArg);
    } 

    int32_t i = 0;
@ -597,7 +597,7 @@ ucol_getShortDefinitionString(const UCollator *coll,
            if(attribute != UCOL_DEFAULT) {
                char letter = ucol_sit_attributeValueToLetter(attribute, status);
                appendShortStringElement(&letter, 1, 
-                    buffer, &resultSize, capacity, options[i].optionStart);
+                    buffer, &resultSize, /*capacity*/internalBufferSize, options[i].optionStart);
            }
        }
    }
--- a/icu4c/source/i18n/ucoleitr.cpp
+++ b/icu4c/source/i18n/ucoleitr.cpp
@ -1,6 +1,6 @@
 /*
 ******************************************************************************
-*   Copyright (C) 2001-2008, International Business Machines
+*   Copyright (C) 2001-2009, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 ******************************************************************************
 *
@ -263,7 +263,14 @@ inline uint64_t processCE(UCollationElements *elems, uint32_t ce)
        primary = ucol_primaryOrder(ce);
    }

-    // Continuation?
+    // **** This should probably handle continuations too.  ****
+    // **** That means that we need 24 bits for the primary ****
+    // **** instead of the 16 that we're currently using.   ****
+    // **** So we can lay out the 64 bits as: 24.12.12.16.  ****
+    // **** Another complication with continuations is that ****
+    // **** the *second* CE is marked as a continuation, so ****
+    // **** we always have to peek ahead to know how long   ****
+    // **** the primary is...                               ****
    if (elems->pce->toShift && (elems->pce->variableTop > ce && primary != 0)
                || (elems->pce->isShifted && primary == 0)) {

@ -285,7 +292,6 @@ inline uint64_t processCE(UCollationElements *elems, uint32_t ce)
        elems->pce->isShifted = FALSE;
    }

-
    return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
 }

@ -332,6 +338,7 @@ ucol_openElements(const UCollator  *coll,
    return result;
 }

+
 U_CAPI void U_EXPORT2
 ucol_closeElements(UCollationElements *elems)
 {
@ -375,7 +382,7 @@ ucol_reset(UCollationElements *elems)
        ci->endp      = ci->string + u_strlen(ci->string);
    }
    ci->CEpos       = ci->toReturn = ci->CEs;
-    ci->flags       = UCOL_ITER_HASLEN;
+    ci->flags       = (ci->flags & UCOL_FORCE_HAN_IMPLICIT) | UCOL_ITER_HASLEN;
    if (ci->coll->normalizationMode == UCOL_ON) {
        ci->flags |= UCOL_ITER_NORM;
    }
@ -391,6 +398,21 @@ ucol_reset(UCollationElements *elems)
 	ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
 }

+U_CAPI void U_EXPORT2
+ucol_forceHanImplicit(UCollationElements *elems, UErrorCode *status)
+{   /* Sets UCOL_FORCE_HAN_IMPLICIT on the iterator; status is dereferenced unconditionally, so it must be non-NULL. */
+    if (U_FAILURE(*status)) {   /* an incoming failure is left as-is and the iterator is not modified */
+        return;
+    }
+
+    if (elems == NULL) {        /* no iterator to modify: reported through status */
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
+    elems->iteratordata_.flags |= UCOL_FORCE_HAN_IMPLICIT;   /* OR-ed in, so the other flag bits are kept */
+}
+
 U_CAPI int32_t U_EXPORT2
 ucol_next(UCollationElements *elems, 
          UErrorCode         *status)
--- a/icu4c/source/i18n/unicode/bms.h
+++ b/icu4c/source/i18n/unicode/bms.h
@ -0,0 +1,265 @@
+/*
+ * Copyright (C) 1996-2009, International Business Machines Corporation and Others.
+ * All rights reserved.
+ */
+
+/**
+ * \file 
+ * \brief C API: Boyer-Moore StringSearch prototype.
+ * \internal
+ */
+
+#ifndef _BMS_H
+#define _BMS_H
+
+#include "unicode/utypes.h"
+#include "unicode/ucol.h"
+
+/**
+ * A <code>UCD</code> object holds the Collator-specific data needed to
+ * compute the length of the shortest string that can
+ * generate a particular list of CEs.
+ *
+ * <code>UCD</code> objects are quite expensive to compute. Because
+ * of this, they are cached. When you call <code>ucd_open</code> it
+ * returns a reference counted cached object. When you call <code>ucd_close</code>
+ * the reference count on the object is decremented but the object is not deleted.
+ *
+ * If you do not need to reuse any unreferenced objects in the cache, you can call
+ * <code>ucd_flushCache</code>. If you no longer need any <code>UCD</code>
+ * objects, you can call <code>ucd_freeCache</code>.
+ */
+typedef void UCD;
+
+/**
+ * Open a <code>UCD</code> object.
+ *
+ * @param coll - the collator
+ * @param status - will be set if any errors occur. 
+ *
+ * @return the <code>UCD</code> object. You must call
+ *         <code>ucd_close</code> when you are done using the object.
+ *
+ * Note: if on return status is set to an error, the only safe
+ * thing to do with the returned object is to call <code>ucd_close</code>.
+ *
+ * @internal ICU 4.0.1 technology preview
+ */
+U_CAPI UCD * U_EXPORT2
+ucd_open(UCollator *coll, UErrorCode *status);
+
+/**
+ * Release a <code>UCD</code> object.
+ *
+ * @param ucd - the object
+ *
+ * @internal ICU 4.0.1 technology preview
+ */
+U_CAPI void U_EXPORT2
+ucd_close(UCD *ucd);
+
+/**
+ * Get the <code>UCollator</code> object used to create a <code>UCD</code> object.
+ * The <code>UCollator</code> object returned may not be the exact
+ * object that was used to create this object, but it will have the
+ * same behavior.
+ *
+ * @param ucd - the <code>UCD</code> object
+ *
+ * @return the <code>UCollator</code> used to create the given
+ *         <code>UCD</code> object.
+ *
+ * @internal ICU 4.0.1 technology preview
+ */
+U_CAPI UCollator * U_EXPORT2
+ucd_getCollator(UCD *ucd);
+
+/**
+ * <code>UCD</code> objects are expensive to compute, and so
+ * may be cached. This routine will free the cached objects and delete
+ * the cache.
+ *
+ * WARNING: Don't call this until you have called <code>ucd_close</code>
+ * for each <code>UCD</code> object that you have used. Also,
+ * DO NOT call this if another thread may be calling <code>ucd_flushCache</code>
+ * at the same time.
+ *
+ * @internal ICU 4.0.1 technology preview
+ */
+U_CAPI void U_EXPORT2
+ucd_freeCache();
+
+/**
+ * <code>UCD</code> objects are expensive to compute, and so
+ * may be cached. This routine will remove any unused <code>UCD</code>
+ * objects from the cache.
+ *
+ * @internal ICU 4.0.1 technology preview
+ */
+U_CAPI void U_EXPORT2
+ucd_flushCache();
+
+/**
+ * BMS
+ *
+ * This object holds the information needed to do a Collation sensitive Boyer-Moore search. It encapsulates
+ * the pattern, the "bad character" and "good suffix" tables, the Collator-based data needed to compute them,
+ * and a reference to the text being searched.
+ *
+ * To do a search, you first need to get a <code>UCD</code> object by calling <code>ucd_open</code>.
+ * Then you construct a <code>BMS</code> object from the <code>UCD</code> object, the pattern
+ * string and the target string. Then you call <code>bms_search</code>. Here's a code sample:
+ *
+ * <pre>
+ * void boyerMooreExample(UCollator *collator, UChar *pattern, int32_t patternLen, UChar *target, int32_t targetLength)
+ * {
+ *     UErrorCode status = U_ZERO_ERROR;
+ *     int32_t offset = 0, start = -1, end = -1;
+ *     UCD *ucd = NULL;
+ *     BMS *bms = NULL;
+ *
+ *     ucd = ucd_open(collator, &status);
+ *     if (U_FAILURE(status)) {
+ *         // could not create a UCD object
+ *         return;
+ *     }
+ *
+ *     bms = bms_open(ucd, pattern, patternLen, target, targetLength, &status);
+ *     if (U_FAILURE(status)) {
+ *         // could not create a BMS object
+ *         ucd_close(ucd);
+ *         return;
+ *     }
+ *
+ *
+ *     // Find all matches
+ *     while (bms_search(bms, offset, &start, &end)) {
+ *         // process the match between start and end
+ *         ...
+ *
+ *         // advance past the match
+ *         offset = end; 
+ *     }
+ *
+ *     // at this point, if offset == 0, there were no matches
+ *     if (offset == 0) {
+ *         // handle the case of no matches
+ *     }
+ *
+ *     bms_close(bms);
+ *     ucd_close(ucd);
+ *
+ *     // UCD objects are cached, so the call to
+ *     // ucd_close doesn't delete the object.
+ *     // Call this if you don't need the object any more.
+ *     ucd_flushCache();
+ * }
+ * </pre>
+ *
+ * NOTE: This is a technology preview. The final version of this API may not bear any resemblance to this API.
+ *
+ * Known limitations:
+ *   1) Backwards searching has not been implemented.
+ *
+ *   2) For Han and Hangul characters, this code ignores any Collation tailorings. In general,
+ *      this isn't a problem, but in Korean locales, at strength 1, Hangul characters are tailored
+ *      to be equal to Han characters with the same pronunciation. Because this code ignores
+ *      tailorings, searching for a Hangul character will not find a Han character and vice versa.
+ *
+ *   3) In some cases, searching for a pattern that needs to be normalized and ends
+ *      in a discontiguous contraction may fail. The only known cases of this are with
+ *      the Tibetan script. For example searching for the pattern
+ *      "\u0F7F\u0F80\u0F81\u0F82\u0F83\u0F84\u0F85" will fail. (This case is artificial. We've
+ *      been unable to find a practical, real-world example of this failure.)
+ *
+ * NOTE: This is a technology preview. The final version of this API may not bear any resemblance to this API.
+ *
+ * @internal ICU 4.0.1 technology preview
+ */
+struct BMS;
+typedef struct BMS BMS;
+
+/**
+ * Construct a <code>BMS</code> object.
+ *
+ * @param ucd - A <code>UCD</code> object holding the Collator-sensitive data
+ * @param pattern - the string for which to search
+ * @param patternLength - the length of the string for which to search
+ * @param target - the string in which to search
+ * @param targetLength - the length of the string in which to search
+ * @param status - will be set if any errors occur. 
+ *
+ * @return the <code>BMS</code> object.
+ *
+ * Note: if on return status is set to an error, the only safe
+ * thing to do with the returned object is to call
+ * <code>bms_close</code>.
+ *
+ * @internal ICU 4.0.1 technology preview
+ */
+U_CAPI BMS * U_EXPORT2
+bms_open(UCD *ucd,
+         const UChar *pattern, int32_t patternLength,
+         const UChar *target,  int32_t targetLength,
+         UErrorCode  *status);
+
+/**
+ * Close a <code>BMS</code> object and release all the
+ * storage associated with it.
+ *
+ * @param bms - the <code>BMS</code> object to close.
+ */
+U_CAPI void U_EXPORT2
+bms_close(BMS *bms);
+
+/**
+ * Test the pattern to see if it generates any CEs.
+ *
+ * @return <code>TRUE</code> if the pattern string did not generate any CEs
+ *
+ * @internal ICU 4.0.1 technology preview
+ */
+U_CAPI UBool U_EXPORT2
+bms_empty(BMS *bms);
+
+/**
+ * Get the <code>UCD</code> object used to create
+ * a given <code>BMS</code> object.
+ *
+ * @param bms - the <code>BMS</code> object
+ *
+ * @return - the <code>UCD</code> object used to create
+ *           the given <code>BMS</code> object.
+ *
+ * @internal ICU 4.0.1 technology preview
+ */
+U_CAPI UCD * U_EXPORT2
+bms_getData(BMS *bms);
+
+/**
+ * Search for the pattern string in the target string.
+ *
+ * @param offset - the offset in the target string at which to begin the search
+ * @param start - will be set to the starting offset of the match, or -1 if there's no match
+ * @param end - will be set to the ending offset of the match, or -1 if there's no match
+ *
+ * @return <code>TRUE</code> if the match succeeds, <code>FALSE</code> otherwise.
+ *
+ * @internal ICU 4.0.1 technology preview
+ */
+U_CAPI UBool U_EXPORT2
+bms_search(BMS *bms, int32_t offset, int32_t *start, int32_t *end);
+
+/**
+ * Set the target string for the match.
+ *
+ * @param target - the new target string
+ * @param targetLength - the length of the new target string
+ * @param status - will be set if any errors occur. 
+ *
+ * @internal ICU 4.0.1 technology preview
+ */
+U_CAPI void U_EXPORT2
+bms_setTargetString(BMS *bms, const UChar *target, int32_t targetLength, UErrorCode *status);
+
+#endif /* _BMS_H */
--- a/icu4c/source/i18n/unicode/bmsearch.h
+++ b/icu4c/source/i18n/unicode/bmsearch.h
@ -0,0 +1,221 @@
+/*
+ ******************************************************************************
+ *   Copyright (C) 1996-2009, International Business Machines                 *
+ *   Corporation and others.  All Rights Reserved.                            *
+ ******************************************************************************
+ */
+
+/**
+ * \file 
+ * \brief C++ API: Boyer-Moore StringSearch technology preview
+ * \internal ICU 4.0.1 technology preview
+ */
+ 
+#ifndef B_M_SEARCH_H
+#define B_M_SEARCH_H
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_COLLATION
+
+#include "unicode/uobject.h"
+#include "unicode/ucol.h"
+
+#include "unicode/colldata.h"
+
+U_NAMESPACE_BEGIN
+
+class BadCharacterTable;
+class GoodSuffixTable;
+class Target;
+
+/**
+ * BoyerMooreSearch
+ *
+ * This object holds the information needed to do a Collation sensitive Boyer-Moore search. It encapsulates
+ * the pattern, the "bad character" and "good suffix" tables, the Collator-based data needed to compute them,
+ * and a reference to the text being searched.
+ *
+ * To do a search, you first need to get a <code>CollData</code> object by calling <code>CollData::open</code>.
+ * Then you construct a <code>BoyerMooreSearch</code> object from the <code>CollData</code> object, the pattern
+ * string and the target string. Then you call the <code>search</code> method. Here's a code sample:
+ *
+ * <pre>
+ * void boyerMooreExample(UCollator *collator, UnicodeString *pattern, UnicodeString *target)
+ * {
+ *     UErrorCode status = U_ZERO_ERROR;
+ *     CollData *collData = CollData::open(collator, status);
+ *
+ *     if (U_FAILURE(status)) {
+ *         // could not create a CollData object
+ *         return;
+ *     }
+ *
+ *     BoyerMooreSearch *search = new BoyerMooreSearch(collData, *pattern, target, status);
+ *
+ *     if (U_FAILURE(status)) {
+ *         // could not create a BoyerMooreSearch object
+ *         CollData::close(collData);
+ *         return;
+ *     }
+ *
+ *     int32_t offset = 0, start = -1, end = -1;
+ *
+ *     // Find all matches
+ *     while (search->search(offset, start, end)) {
+ *         // process the match between start and end
+ *         ...
+ *         // advance past the match
+ *         offset = end; 
+ *     }
+ *
+ *     // at this point, if offset == 0, there were no matches
+ *     if (offset == 0) {
+ *         // handle the case of no matches
+ *     }
+ *
+ *     delete search;
+ *     CollData::close(collData);
+ *
+ *     // CollData objects are cached, so the call to
+ *     // CollData::close doesn't delete the object.
+ *     // Call this if you don't need the object any more.
+ *     CollData::flushCollDataCache();
+ * }
+ * </pre>
+ *
+ * NOTE: This is a technology preview. The final version of this API may not bear any resemblance to this API.
+ *
+ * Known limitations:
+ *   1) Backwards searching has not been implemented.
+ *
+ *   2) For Han and Hangul characters, this code ignores any Collation tailorings. In general,
+ *      this isn't a problem, but in Korean locales, at strength 1, Hangul characters are tailored
+ *      to be equal to Han characters with the same pronunciation. Because this code ignores
+ *      tailorings, searching for a Hangul character will not find a Han character and vice versa.
+ *
+ *   3) In some cases, searching for a pattern that needs to be normalized and ends
+ *      in a discontiguous contraction may fail. The only known cases of this are with
+ *      the Tibetan script. For example searching for the pattern
+ *      "\u0F7F\u0F80\u0F81\u0F82\u0F83\u0F84\u0F85" will fail. (This case is artificial. We've
+ *      been unable to find a practical, real-world example of this failure.)
+ *
+ * @internal ICU 4.0.1 technology preview
+ *
+ * @see CollData
+ */
+class U_I18N_API BoyerMooreSearch : public UObject
+{
+public:
+    /**
+     * Construct a <code>BoyerMooreSearch</code> object.
+     *
+     * @param theData - A <code>CollData</code> object holding the Collator-sensitive data
+     * @param patternString - the string for which to search
+     * @param targetString - the string in which to search or <code>NULL</code> if you will
+     *                       set it later by calling <code>setTargetString</code>.
+     * @param status - will be set if any errors occur.
+     *
+     * Note: if on return, status is set to an error code,
+     * the only safe thing to do with this object is to call
+     * the destructor.
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString, UErrorCode &status);
+
+    /**
+     * The destructor
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    ~BoyerMooreSearch();
+
+    /**
+     * Test the pattern to see if it generates any CEs.
+     *
+     * @return <code>TRUE</code> if the pattern string did not generate any CEs
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    UBool empty();
+
+    /**
+     * Search for the pattern string in the target string.
+     *
+     * @param offset - the offset in the target string at which to begin the search
+     * @param start - will be set to the starting offset of the match, or -1 if there's no match
+     * @param end - will be set to the ending offset of the match, or -1 if there's no match
+     *
+     * @return <code>TRUE</code> if the match succeeds, <code>FALSE</code> otherwise.
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    UBool search(int32_t offset, int32_t &start, int32_t &end);
+
+    /**
+     * Set the target string for the match.
+     *
+     * @param targetString - the new target string
+     * @param status - will be set if any errors occur.
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    void setTargetString(const UnicodeString *targetString, UErrorCode &status);
+
+    // **** no longer need these? ****
+    /**
+     * Return the <code>CollData</code> object used for searching
+     *
+     * @return the <code>CollData</code> object used for searching
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    CollData *getData();
+
+    /**
+     * Return the CEs generated by the pattern string.
+     *
+     * @return a <code>CEList</code> object holding the CEs generated by the pattern string.
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    CEList   *getPatternCEs();
+
+    /**
+     * Return the <code>BadCharacterTable</code> object computed for the pattern string.
+     *
+     * @return the <code>BadCharacterTable</code> object.
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    BadCharacterTable *getBadCharacterTable();
+
+    /**
+     * Return the <code>GoodSuffixTable</code> object computed for the pattern string.
+     *
+     * @return the <code>GoodSuffixTable</code> object computed for the pattern string.
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    GoodSuffixTable   *getGoodSuffixTable();
+
+    /*
+     * UObject glue...
+     */
+    virtual UClassID getDynamicClassID() const;
+    static UClassID getStaticClassID();
+    
+private:
+    CollData *data;                         // NOTE(review): presumably what getData() returns; ownership not visible here - confirm in bmsearch.cpp
+    CEList *patCEs;                         // NOTE(review): presumably what getPatternCEs() returns - confirm
+    BadCharacterTable *badCharacterTable;   // NOTE(review): presumably what getBadCharacterTable() returns - confirm
+    GoodSuffixTable   *goodSuffixTable;     // NOTE(review): presumably what getGoodSuffixTable() returns - confirm
+    UnicodeString pattern;                  // held by value, unlike the pointer members above
+    Target *target;                         // Target is only forward-declared in this header
+};
+
+U_NAMESPACE_END
+
+#endif // #if !UCONFIG_NO_COLLATION
+#endif // #ifndef B_M_SEARCH_H
--- a/icu4c/source/i18n/unicode/colldata.h
+++ b/icu4c/source/i18n/unicode/colldata.h
@ -0,0 +1,430 @@
+/*
+ ******************************************************************************
+ *   Copyright (C) 1996-2009, International Business Machines                 *
+ *   Corporation and others.  All Rights Reserved.                            *
+ ******************************************************************************
+ */
+
+/**
+ * \file 
+ * \brief C++ API: Collation data used to compute minLengthInChars.
+ * \internal
+ */
+ 
+#ifndef COLL_DATA_H
+#define COLL_DATA_H
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_COLLATION
+
+#include "unicode/uobject.h"
+#include "unicode/ucol.h"
+
+U_NAMESPACE_BEGIN
+
+/*
+ * The size of the internal buffer for the Collator's short description string.
+ */
+#define KEY_BUFFER_SIZE 64
+
+ /*
+  * The size of the internal CE buffer in a <code>CEList</code> object
+  */
+#define CELIST_BUFFER_SIZE 4
+
+/*
+ * Define this to enable the <code>CEList</code> objects to collect
+ * statistics.
+ */
+//#define INSTRUMENT_CELIST
+
+ /*
+  * The size of the initial list in a <code>StringList</code> object.
+  */
+#define STRING_LIST_BUFFER_SIZE 16
+
+/*
+ * Define this to enable the <code>StringList</code> objects to
+ * collect statistics.
+ */
+//#define INSTRUMENT_STRING_LIST
+
+ /**
+  * CEList
+  *
+  * This object holds a list of CEs generated from a particular
+  * <code>UnicodeString</code>.
+  *
+  * @internal ICU 4.0.1 technology preview
+  */
+class U_I18N_API CEList : public UObject
+{
+public:
+    /**
+     * Construct a <code>CEList</code> object.
+     *
+     * @param coll - the Collator used to collect the CEs.
+     * @param string - the string for which to collect the CEs.
+     * @param status - will be set if any errors occur.
+     *
+     * Note: if on return, status is set to an error code,
+     * the only safe thing to do with this object is to call
+     * the destructor.
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status);
+
+    /**
+     * The destructor.
+     */
+    ~CEList();
+
+    /**
+     * Return the number of CEs in the list.
+     *
+     * @return the number of CEs in the list.
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    int32_t size() const;
+
+    /**
+     * Get a particular CE from the list.
+     *
+     * @param index - the index of the CE to return
+     *
+     * @return the CE, or <code>0</code> if <code>index</code> is out of range
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    uint32_t get(int32_t index) const;
+
+    /**
+     * Check if the CEs in another <code>CEList</code> match the
+     * suffix of this list starting at a given offset.
+     *
+     * @param offset - the offset of the suffix
+     * @param other - the other <code>CEList</code>
+     *
+     * @return <code>TRUE</code> if the CEs match, <code>FALSE</code> otherwise.
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    UBool matchesAt(int32_t offset, const CEList *other) const; 
+
+    /**
+     * The index operator. Note: this method is const, but the reference it returns is not.
+     *
+     * @param index - the index
+     *
+     * @return a reference to the given CE in the list
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    uint32_t &operator[](int32_t index) const;
+
+    /*
+     * UObject glue...
+     */
+    virtual UClassID getDynamicClassID() const;
+    static UClassID getStaticClassID();
+
+private:
+    void add(uint32_t ce, UErrorCode &status);
+
+    uint32_t ceBuffer[CELIST_BUFFER_SIZE];
+    uint32_t *ces;
+    int32_t listMax;
+    int32_t listSize;
+
+#ifdef INSTRUMENT_CELIST
+    static int32_t _active;
+    static int32_t _histogram[10];
+#endif
+};
+
+/**
+ * StringList
+ *
+ * This object holds a list of <code>UnicodeString</code> objects.
+ *
+ * @internal ICU 4.0.1 technology preview
+ */
+class U_I18N_API StringList : public UObject
+{
+public:
+    /**
+     * Construct an empty <code>StringList</code>
+     *
+     * @param status - will be set if any errors occur.
+     *
+     * Note: if on return, status is set to an error code,
+     * the only safe thing to do with this object is to call
+     * the destructor.
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    StringList(UErrorCode &status);
+
+    /**
+     * The destructor.
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    ~StringList();
+
+    /**
+     * Add a string to the list.
+     *
+     * @param string - the string to add
+     * @param status - will be set if any errors occur.
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    void add(const UnicodeString *string, UErrorCode &status);
+
+    /**
+     * Add an array of <code>UChar</code>s to the list.
+     *
+     * @param chars - the address of the array of <code>UChar</code>s
+     * @param count - the number of <code>UChar</code>s in the array
+     * @param status - will be set if any errors occur.
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    void add(const UChar *chars, int32_t count, UErrorCode &status);
+
+    /**
+     * Get a particular string from the list.
+     *
+     * @param index - the index of the string
+     *
+     * @return a pointer to the <code>UnicodeString</code> or <code>NULL</code>
+     *         if <code>index</code> is out of bounds.
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    const UnicodeString *get(int32_t index) const;
+
+    /**
+     * Get the number of strings in the list.
+     *
+     * @return the number of strings in the list.
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    int32_t size() const;
+
+    /*
+     * the UObject glue...
+     */
+    virtual UClassID getDynamicClassID() const;
+    static UClassID getStaticClassID();
+
+private:
+    UnicodeString *strings;
+    int32_t listMax;
+    int32_t listSize;
+
+#ifdef INSTRUMENT_STRING_LIST
+    static int32_t _lists;
+    static int32_t _strings;
+    static int32_t _histogram[101];
+#endif
+};
+
+/*
+ * Forward references to internal classes.
+ */
+class StringToCEsMap;
+class CEToStringsMap;
+class CollDataCache;
+
+/**
+ * CollData
+ *
+ * This class holds the Collator-specific data needed to
+ * compute the length of the shortest string that can
+ * generate a particular list of CEs.
+ *
+ * <code>CollData</code> objects are quite expensive to compute. Because
+ * of this, they are cached. When you call <code>CollData::open</code> it
+ * returns a reference counted cached object. When you call <code>CollData::close</code>
+ * the reference count on the object is decremented but the object is not deleted.
+ *
+ * If you do not need to reuse any unreferenced objects in the cache, you can call
+ * <code>CollData::flushCollDataCache</code>. If you no longer need any <code>CollData</code>
+ * objects, you can call <code>CollData::freeCollDataCache</code>.
+ *
+ * @internal ICU 4.0.1 technology preview
+ */
+class U_I18N_API CollData : public UObject
+{
+public:
+    /**
+     * Get the <code>CollData</code> object for a collator (a reference counted, cached object).
+     *
+     * @param collator - the collator
+     * @param status - will be set if any errors occur.
+     *
+     * @return the <code>CollData</code> object. You must call
+     *         <code>close</code> when you are done using the object.
+     *
+     * Note: if on return, status is set to an error code,
+     * the only safe thing to do with this object is to call
+     * <code>CollData::close</code>.
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    static CollData *open(UCollator *collator, UErrorCode &status);
+
+    /**
+     * Release a <code>CollData</code> object.
+     *
+     * @param collData - the object
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    static void close(CollData *collData);
+
+    /**
+     * Get the <code>UCollator</code> object used to create this object.
+     * The object returned may not be the exact object that was used to
+     * create this object, but it will have the same behavior.
+     */
+    UCollator *getCollator() const;
+
+    /**
+     * Get a list of all the strings which generate a list
+     * of CEs starting with a given CE.
+     *
+     * @param ce - the CE
+     *
+     * @return a <code>StringList</code> object containing all
+     *         the strings, or <code>NULL</code> if there are
+     *         no such strings.
+     *
+     * @internal ICU 4.0.1 technology preview.
+     */
+    const StringList *getStringList(int32_t ce) const;
+
+    /**
+     * Get a list of the CEs generated by a particular string.
+     *
+     * @param string - the string
+     *
+     * @return a <code>CEList</code> object containing the CEs. You
+     *         must call <code>freeCEList</code> when you are finished
+     *         using the <code>CEList</code>.
+     *
+     * @internal ICU 4.0.1 technology preview.
+     */
+    const CEList *getCEList(const UnicodeString *string) const;
+
+    /**
+     * Release a <code>CEList</code> returned by <code>getCEList</code>.
+     *
+     * @param list - the <code>CEList</code> to free.
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    void freeCEList(const CEList *list);
+
+    /**
+     * Return the length of the shortest string that will generate
+     * the given list of CEs.
+     *
+     * @param ces - the CEs
+     * @param offset - the offset of the first CE in the list to use.
+     *
+     * @return the length of the shortest string.
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+    int32_t minLengthInChars(const CEList *ces, int32_t offset) const;
+
+ 
+    /**
+     * Return the length of the shortest string that will generate
+     * the given list of CEs.
+     *
+     * Note: the algorithm used to do this computation is recursive. To
+     * limit the amount of recursion, a "history" list is used to record
+     * the best answer starting at a particular offset in the list of CEs.
+     * If the same offset is visited again during the recursion, the answer
+     * in the history list is used.
+     *
+     * @param ces - the CEs
+     * @param offset - the offset of the first CE in the list to use.
+     * @param history - the history list. Must be at least as long as
+     *                 the number of CEs in the <code>CEList</code>
+     *
+     * @return the length of the shortest string.
+     *
+     * @internal ICU 4.0.1 technology preview
+     */
+   int32_t minLengthInChars(const CEList *ces, int32_t offset, int32_t *history) const;
+
+   /*
+    * UObject glue...
+    */
+    virtual UClassID getDynamicClassID() const;
+    static UClassID getStaticClassID();
+
+    /**
+     * <code>CollData</code> objects are expensive to compute, and so
+     * may be cached. This routine will free the cached objects and delete
+     * the cache.
+     *
+     * WARNING: Don't call this until you have called <code>close</code>
+     * for each <code>CollData</code> object that you have used. Also,
+     * DO NOT call this if another thread may be calling <code>flushCollDataCache</code>
+     * at the same time.
+     *
+     * @internal 4.0.1 technology preview
+     */
+    static void freeCollDataCache();
+
+    /**
+     * <code>CollData</code> objects are expensive to compute, and so
+     * may be cached. This routine will remove any unused <code>CollData</code>
+     * objects from the cache.
+     *
+     * @internal 4.0.1 technology preview
+     */
+    static void flushCollDataCache();
+
+private:
+    friend class CollDataCache;
+    friend class CollDataCacheEntry;
+
+    CollData(UCollator *collator, char *cacheKey, int32_t cachekeyLength, UErrorCode &status);
+    ~CollData();
+
+    CollData();
+
+    static char *getCollatorKey(UCollator *collator, char *buffer, int32_t bufferLength);
+
+    static CollDataCache *getCollDataCache();
+
+    UCollator      *coll;
+    StringToCEsMap *charsToCEList;
+    CEToStringsMap *ceToCharsStartingWith;
+
+    char keyBuffer[KEY_BUFFER_SIZE];
+    char *key;
+
+    static CollDataCache *collDataCache;
+
+    uint32_t minHan;
+    uint32_t maxHan;
+
+    uint32_t jamoLimits[4];
+};
+
+U_NAMESPACE_END
+
+#endif // #if !UCONFIG_NO_COLLATION
+#endif // #ifndef COLL_DATA_H
--- a/icu4c/source/i18n/unicode/ucoleitr.h
+++ b/icu4c/source/i18n/unicode/ucoleitr.h
@ -1,6 +1,6 @@
 /*
 *******************************************************************************
-*   Copyright (C) 2001-2008, International Business Machines
+*   Copyright (C) 2001-2009, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *******************************************************************************
 *
@ -121,6 +121,7 @@ ucol_openElements(const UCollator  *coll,
                        int32_t    textLength,
                        UErrorCode *status);

+
 /**
 * get a hash code for a key... Not very useful!
 * @param key    the given key.
@ -152,6 +153,20 @@ ucol_closeElements(UCollationElements *elems);
 U_STABLE void U_EXPORT2 
 ucol_reset(UCollationElements *elems);

+/**
+ * Set the collation elements to use implicit ordering for Han
+ * even if they've been tailored. This will also force Hangul
+ * syllables to be ordered by decomposing them to their component
+ * Jamo.
+ *
+ * @param elems The UCollationElements containing the text.
+ * @param status A pointer to a UErrorCode to receive any errors.
+ *
+ * @internal
+ */
+U_INTERNAL void U_EXPORT2
+ucol_forceHanImplicit(UCollationElements *elems, UErrorCode *status);
+
 /**
 * Get the ordering priority of the next collation element in the text.
 * A single character may contain more than one collation element.
--- a/icu4c/source/i18n/usearch.cpp
+++ b/icu4c/source/i18n/usearch.cpp
@ -1,6 +1,6 @@
 /*
 **********************************************************************
-*   Copyright (C) 2001-2008 IBM and others. All rights reserved.
+*   Copyright (C) 2001-2009 IBM and others. All rights reserved.
 **********************************************************************
 *   Date        Name        Description
 *  07/02/2001   synwee      Creation.
@ -3785,7 +3785,7 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
        found = TRUE;
        //  Inner loop checks for a match beginning at each
        //  position from the outer loop.
-        for (patIx=0; patIx<strsrch->pattern.CELength; patIx++) {
+        for (patIx=0; patIx<strsrch->pattern.PCELength; patIx++) {
            int64_t patCE = strsrch->pattern.PCE[patIx];
            targetCEI = ceb.get(targetIx+patIx);
            //  Compare CE from target string with CE from the pattern.
@ -3814,11 +3814,9 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
        //     an acceptable character range.
        //
        const CEI *firstCEI = ceb.get(targetIx);
-        const CEI *lastCEI  = ceb.get(targetIx + strsrch->pattern.CELength - 1);
-        const CEI *nextCEI  = ceb.get(targetIx + strsrch->pattern.CELength);
+        const CEI *lastCEI  = ceb.get(targetIx + strsrch->pattern.PCELength - 1);
+        const CEI *nextCEI  = ceb.get(targetIx + strsrch->pattern.PCELength);

-     // targetCEI = ceb.get(targetIx+strsrch->pattern.CELength);
-     // maxLimit = targetCEI->lowIndex;
        mStart   = firstCEI->lowIndex;
        minLimit = lastCEI->lowIndex;
        maxLimit = nextCEI->lowIndex;
@ -3883,7 +3881,7 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
            found = FALSE;
        }

-        if (!checkIdentical(strsrch, mStart, mLimit)) {
+        if (! checkIdentical(strsrch, mStart, mLimit)) {
            found = FALSE;
        }

@ -4006,10 +4004,10 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
        found = TRUE;
        //  Inner loop checks for a match beginning at each
        //  position from the outer loop.
-        for (patIx = strsrch->pattern.CELength - 1; patIx >= 0; patIx -= 1) {
+        for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) {
            int64_t patCE = strsrch->pattern.PCE[patIx];

-            targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.CELength - 1 - patIx);
+            targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx);
            //  Compare CE from target string with CE from the pattern.
            //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
            //    which will fail the compare, below.
@ -4035,7 +4033,7 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
        //  There still is a chance of match failure if the CE range not correspond to
        //     an acceptable character range.
        //
-        const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.CELength - 1);
+        const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1);
        const CEI *lastCEI  = ceb.getPrevious(targetIx);
        const CEI *nextCEI  = targetIx > 0? ceb.getPrevious(targetIx - 1) : NULL;

@ -4102,6 +4100,10 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
            found = FALSE;
        }

+        if (! checkIdentical(strsrch, mStart, mLimit)) {
+            found = FALSE;
+        }
+
        if (found) {
            break;
        }
--- a/icu4c/source/test/cintltst/callcoll.c
+++ b/icu4c/source/test/cintltst/callcoll.c
@ -1,6 +1,6 @@
 /********************************************************************
 * COPYRIGHT: 
- * Copyright (c) 1997-2008, International Business Machines Corporation and
+ * Copyright (c) 1997-2009, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
 /*******************************************************************************
@ -515,7 +515,7 @@ backAndForth(UCollationElements *iter)
          }

          if (o != orders[index].order) {
-              log_err("Mismatched order at index %d: 0x%0:8X vs. 0x%0:8X\n", index,
+              log_err("Mismatched order at index %d: 0x%8.8X vs. 0x%8.8X\n", index,
                orders[index].order, o);
            goto bail;
          }
--- a/icu4c/source/test/intltest/ssearch.cpp
+++ b/icu4c/source/test/intltest/ssearch.cpp
--- a/icu4c/source/test/intltest/ssearch.h
+++ b/icu4c/source/test/intltest/ssearch.h
@ -1,6 +1,6 @@
 /*
 **********************************************************************
- *   Copyright (C) 2005-2008, International Business Machines
+ *   Copyright (C) 2005-2009, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
@ -11,6 +11,7 @@
 #include "unicode/utypes.h"
 #include "unicode/unistr.h"
 #include "unicode/ucol.h"
+#include "unicode/bmsearch.h"

 #include "intltest.h"

@ -34,10 +35,24 @@ public:
    virtual void offsetTest();
    virtual void monkeyTest(char *params);

+    virtual void bmMonkeyTest(char *params);
+    virtual void boyerMooreTest();
+    virtual void goodSuffixTest();
+    virtual void searchTime();
+    
+    virtual void bmsTest();
+    virtual void bmSearchTest();
+
+    virtual void udhrTest();
+
 private:
    virtual const char   *getPath(char buffer[2048], const char *filename);
    virtual       int32_t monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
                                         const char *name, const char *strength, uint32_t seed);
+
+    virtual       int32_t bmMonkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
+                                         BoyerMooreSearch *bms, BoyerMooreSearch *abms,
+                                         const char *name, const char *strength, uint32_t seed);
 #endif
                                         
 };
--- a/icu4c/source/test/perf/strsrchperf/strsrchperf.cpp
+++ b/icu4c/source/test/perf/strsrchperf/strsrchperf.cpp
@ -1,6 +1,6 @@
 /********************************************************************
 * COPYRIGHT:
- * Copyright (C) 2008 IBM, Inc.   All Rights Reserved.
+ * Copyright (C) 2008-2009 IBM, Inc.   All Rights Reserved.
 *
 ********************************************************************/
 /** 
@ -14,7 +14,13 @@
 StringSearchPerformanceTest::StringSearchPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status)
 :UPerfTest(argc,argv,status){
    int32_t start, end;
+
+#ifdef TEST_BOYER_MOORE_SEARCH
+    bms = NULL;
+#else
    srch = NULL;
+#endif
+
    pttrn = NULL;
    if(status== U_ILLEGAL_ARGUMENT_ERROR || line_mode){
       fprintf(stderr,gUsageString, "strsrchperf");
@ -22,7 +28,8 @@ StringSearchPerformanceTest::StringSearchPerformanceTest(int32_t argc, const cha
    }
    /* Get the Text */
    src = getBuffer(srcLen, status);
-    
+
+#if 0
    /* Get a word to find. Do this by selecting a random word with a word breakiterator. */
    UBreakIterator* brk = ubrk_open(UBRK_WORD, locale, src, srcLen, &status);
    if(U_FAILURE(status)){
@ -38,9 +45,38 @@ StringSearchPerformanceTest::StringSearchPerformanceTest(int32_t argc, const cha
    }
    pttrn = temp; /* store word in pttrn */
    ubrk_close(brk);
+#else
+    /* The first line of the file contains the pattern */
+    start = 0;
+
+    for(end = start; ; end += 1) {
+        UChar ch = src[end];
+
+        if (ch == 0x000A || ch == 0x000D || ch == 0x2028) {
+            break;
+        }
+    }
+
+    pttrnLen = end - start;
+    UChar* temp = (UChar*)malloc(sizeof(UChar)*(pttrnLen));
+    for (int i = 0; i < pttrnLen; i++) {
+        temp[i] = src[start++];
+    }
+    pttrn = temp; /* store word in pttrn */
+#endif
    
+#ifdef TEST_BOYER_MOORE_SEARCH
+    UnicodeString patternString(pttrn, pttrnLen);
+    UCollator *coll = ucol_open(locale, &status);
+    CollData *data = CollData::open(coll, status);
+
+    targetString = new UnicodeString(src, srcLen);
+    bms = new BoyerMooreSearch(data, patternString, targetString, status);
+#else
    /* Create the StringSearch object to be use in performance test. */
    srch = usearch_open(pttrn, pttrnLen, src, srcLen, locale, NULL, &status);
+#endif
+
    if(U_FAILURE(status)){
        fprintf(stderr, "FAILED to create UPerfTest object. Error: %s\n", u_errorName(status));
        return;
@ -49,12 +85,23 @@ StringSearchPerformanceTest::StringSearchPerformanceTest(int32_t argc, const cha
 }

 StringSearchPerformanceTest::~StringSearchPerformanceTest() {
+#ifdef TEST_BOYER_MOORE_SEARCH
+    /*
+     * bms and targetString are only declared when TEST_BOYER_MOORE_SEARCH is
+     * defined, so their cleanup must be compiled under the same condition.
+     * The constructor sets bms to NULL before doing anything else and assigns
+     * targetString immediately before creating bms, so a NULL bms means there
+     * is nothing Boyer-Moore related to tear down.
+     */
+    if (bms != NULL) {
+        /* Fetch the CollData and the collator first: this code only reaches them through bms. */
+        CollData  *data = bms->getData();
+        UCollator *coll = (data != NULL) ? data->getCollator() : NULL;
+
+        delete bms;
+        delete targetString;
+
+        if (data != NULL) {
+            CollData::close(data);
+        }
+
+        if (coll != NULL) {
+            ucol_close(coll);
+        }
+    }
+#endif
+
     if (pttrn != NULL) {
         free(pttrn);
     }
+
+#ifndef TEST_BOYER_MOORE_SEARCH
     if (srch != NULL) {
         usearch_close(srch);
     }
+#endif
 }

 UPerfFunction* StringSearchPerformanceTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char *par) {
@ -70,12 +117,20 @@ UPerfFunction* StringSearchPerformanceTest::runIndexedTest(int32_t index, UBool
 }

 UPerfFunction* StringSearchPerformanceTest::Test_ICU_Forward_Search(){
+#ifdef TEST_BOYER_MOORE_SEARCH
+    StringSearchPerfFunction *func = new StringSearchPerfFunction(ICUForwardSearch, bms, src, srcLen, pttrn, pttrnLen);
+#else
    StringSearchPerfFunction* func = new StringSearchPerfFunction(ICUForwardSearch, srch, src, srcLen, pttrn, pttrnLen);
+#endif
    return func;
 }

 UPerfFunction* StringSearchPerformanceTest::Test_ICU_Backward_Search(){
+#ifdef TEST_BOYER_MOORE_SEARCH
+    StringSearchPerfFunction *func = new StringSearchPerfFunction(ICUBackwardSearch, bms, src, srcLen, pttrn, pttrnLen);
+#else
    StringSearchPerfFunction* func = new StringSearchPerfFunction(ICUBackwardSearch, srch, src, srcLen, pttrn, pttrnLen);
+#endif
    return func;
 }

--- a/icu4c/source/test/perf/strsrchperf/strsrchperf.h
+++ b/icu4c/source/test/perf/strsrchperf/strsrchperf.h
@ -1,6 +1,6 @@
 /********************************************************************
 * COPYRIGHT:
- * Copyright (C) 2008 IBM, Inc.   All Rights Reserved.
+ * Copyright (C) 2008-2009 IBM, Inc.   All Rights Reserved.
 *
 ********************************************************************/
 #ifndef _STRSRCHPERF_H
@ -8,11 +8,19 @@

 #include "unicode/ubrk.h"
 #include "unicode/usearch.h"
+#include "unicode/colldata.h"
+#include "unicode/bmsearch.h"
 #include "unicode/uperf.h"
 #include <stdlib.h>
 #include <stdio.h>

+#define TEST_BOYER_MOORE_SEARCH
+
+#ifdef TEST_BOYER_MOORE_SEARCH
+typedef void (*StrSrchFn) (BoyerMooreSearch * bms, const UChar *src, int32_t srcLen, const UChar *pttrn, int32_t pttrnLen, UErrorCode *status);
+#else
 typedef void (*StrSrchFn)(UStringSearch* srch, const UChar* src,int32_t srcLen, const UChar* pttrn, int32_t pttrnLen, UErrorCode* status);
+#endif

 class StringSearchPerfFunction : public UPerfFunction {
 private:
@ -21,17 +29,39 @@ private:
    int32_t srcLen;
    const UChar* pttrn;
    int32_t pttrnLen;
+#ifdef TEST_BOYER_MOORE_SEARCH
+    BoyerMooreSearch *bms;
+#else
    UStringSearch* srch;
+#endif
    
 public:
    virtual void call(UErrorCode* status) {
+#ifdef TEST_BOYER_MOORE_SEARCH
+        (*fn)(bms, src, srcLen, pttrn, pttrnLen, status);
+#else
        (*fn)(srch, src, srcLen, pttrn, pttrnLen, status);
+#endif
    }
    
    virtual long getOperationsPerIteration() {
+#if 0
        return (long)(srcLen/pttrnLen);
+#else
+        return (long) srcLen;
+#endif
    }
    
+#ifdef TEST_BOYER_MOORE_SEARCH
+    StringSearchPerfFunction(StrSrchFn func, BoyerMooreSearch *search, const UChar *source, int32_t sourceLen, const UChar *pattern, int32_t patternLen) {
+        fn       = func;
+        src      = source;
+        srcLen   = sourceLen;
+        pttrn    = pattern;
+        pttrnLen = patternLen;
+        bms      = search;
+    }
+#else
    StringSearchPerfFunction(StrSrchFn func, UStringSearch* search, const UChar* source,int32_t sourceLen, const UChar* pattern, int32_t patternLen) {
        fn = func;
        src = source;
@ -40,6 +70,7 @@ public:
        pttrnLen = patternLen;
        srch = search;
    }
+#endif
 };

 class StringSearchPerformanceTest : public UPerfTest {
@ -48,7 +79,12 @@ private:
    int32_t srcLen;
    UChar* pttrn;
    int32_t pttrnLen;
+#ifdef TEST_BOYER_MOORE_SEARCH
+    UnicodeString *targetString;
+    BoyerMooreSearch *bms;
+#else
    UStringSearch* srch;
+#endif
    
 public:
    StringSearchPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status);
@ -56,9 +92,29 @@ public:
    virtual UPerfFunction* runIndexedTest(int32_t index, UBool exec, const char *&name, char *par = NULL);
    
    UPerfFunction* Test_ICU_Forward_Search();
+
    UPerfFunction* Test_ICU_Backward_Search();
 };

+
+#ifdef TEST_BOYER_MOORE_SEARCH
+void ICUForwardSearch(BoyerMooreSearch *bms, const UChar *source, int32_t sourceLen, const UChar *pattern, int32_t patternLen, UErrorCode * /*status*/) { 
+    int32_t matchStart = -1, matchEnd = -1;
+
+    /* Scan from offset 0: every hit restarts the search at the end of that match. */
+    for (int32_t from = 0; bms->search(from, matchStart, matchEnd); from = matchEnd) {
+    }
+}
+
+void ICUBackwardSearch(BoyerMooreSearch *bms, const UChar *source, int32_t sourceLen, const UChar *pattern, int32_t patternLen, UErrorCode * /*status*/) { 
+    int32_t matchStart = -1, matchEnd = -1;
+
+    /* NOTE: No Boyer-Moore backward search yet, so this repeats the forward scan:
+       every hit restarts the search at the end of that match. */
+    for (int32_t from = 0; bms->search(from, matchStart, matchEnd); from = matchEnd) {
+    }
+}
+#else
 void ICUForwardSearch(UStringSearch *srch, const UChar* source, int32_t sourceLen, const UChar* pattern, int32_t patternLen, UErrorCode* status) {
    int32_t match;
    
@ -76,5 +132,6 @@ void ICUBackwardSearch(UStringSearch *srch, const UChar* source, int32_t sourceL
        match = usearch_previous(srch, status);
    }
 }
+#endif

 #endif /* _STRSRCHPERF_H */
--- a/icu4c/source/test/testdata/ssearch.xml
+++ b/icu4c/source/test/testdata/ssearch.xml
@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>

-<!-- Copyright (c) 2007-2008 IBM Corporation and others. All rights reserved -->
+<!-- Copyright (c) 2007-2009 IBM Corporation and others. All rights reserved -->

 <!-- Test data file for string search  -->
 <!DOCTYPE stringsearch-tests [
@ -12,6 +12,7 @@
          locale CDATA "en" 
          strength (PRIMARY | SECONDARY | TERTIARY | QUATERNARY | IDENTICAL) "TERTIARY" 
          norm (ON | OFF) "OFF"
+          alternate_handling (NON_IGNORABLE | SHIFTED) "NON_IGNORABLE"
          >

 <!ELEMENT pattern (#PCDATA)>
@ -20,7 +21,7 @@
 <!ELEMENT post (#PCDATA)>
 ]>

-<stringsearch-tests debug="test32">
+<stringsearch-tests>
  <!-- debug="test11"     (for copying into the above element)  -->
    
    <!-- Very simple match  -->
@ -174,8 +175,15 @@
      <pattern>A\u0300</pattern>
      <pre>At IDENTICAL, shoud this match?  </pre><m>\u00c0</m><post></post>
    </test-case>
-    
-    <test-case id="test25" strength="SECONDARY" locale="en">
+
+  <test-case id="test24b" strength="IDENTICAL" alternate_handling="SHIFTED" locale="en">
+    <pattern>A\u0300</pattern>
+    <pre>At IDENTICAL, shoud this match?  </pre>
+    <m>\u00c0</m>
+    <post></post>
+  </test-case>
+
+  <test-case id="test25" strength="SECONDARY" locale="en">
      <pattern>Ű</pattern>
      <pre>12</pre><m>ű</m><post> Ű</post>
    </test-case>
@ -285,11 +293,13 @@
    

    <!-- Long combining sequences  -->
+    <!-- Backwards search fails because the pattern ends w/ ignorables
    <test-case id="test60" strength="PRIMARY">
      <pattern>A\u0301\u0301\u0301\u0301</pattern>
      <m>A\u0301\u0301\u0301\u0301\u0301</m>
    </test-case>
-    
+    -->
+
    <test-case id="test61" strength="TERTIARY">
      <pattern>A\u0301\u0301\u0301\u0301</pattern>
          <pre>A\u0301\u0301\u0301\u0301\u0301</pre>
@ -409,5 +419,27 @@
    <pattern>VII</pattern>
    <m>\u2166</m>
  </test-case>
+
+  <test-case id="test83" strength="IDENTICAL" alternate_handling="SHIFTED" locale="en">
+    <pattern>Universal Declaration of Human Rights</pattern>
+    <pre>Proclaims this </pre><m>Universal Declaration of Human Rights</m><post> as a common standard of achievement for all peoples and all nations</post>
+  </test-case>
+
+  <test-case id="test83b" strength="TERTIARY" alternate_handling="SHIFTED" locale="en">
+    <pattern>Universal Declaration of Human Rights</pattern>
+    <pre>Proclaims this </pre>
+    <m>Universal-Declaration-of-Human-Rights</m>
+    <post> as a common standard of achievement for all peoples and all nations</post>
+  </test-case>
+
+  <test-case id="test84" strength="TERTIARY" locale="en">
+    <pattern>\u05E9\u0591\u05E9</pattern>
+    <m>\u05E9\u0592\u05E9</m>
+  </test-case>
+
+  <test-case id="test84b" strength="IDENTICAL" locale="en">
+    <pattern>\u05E9\u0591\u05E9</pattern>
+    <pre>\u05E9\u0592\u05E9</pre>
+  </test-case>
 </stringsearch-tests>