ICU-6659 Merge changes from branches/eric/boyer-moore
X-SVN-Rev: 25282
This commit is contained in:
parent
d9737d2f4a
commit
5f73103b5a
@ -1,6 +1,6 @@
|
||||
#******************************************************************************
|
||||
#
|
||||
# Copyright (C) 1998-2008, International Business Machines
|
||||
# Copyright (C) 1998-2009, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
#******************************************************************************
|
||||
@ -81,7 +81,7 @@ ulocdata.o measfmt.o currfmt.o curramt.o currunit.o measure.o utmscale.o \
|
||||
csdetect.o csmatch.o csr2022.o csrecog.o csrmbcs.o csrsbcs.o csrucode.o csrutf8.o inputext.o \
|
||||
wintzimpl.o windtfmt.o winnmfmt.o basictz.o dtrule.o rbtz.o tzrule.o tztrans.o vtzone.o \
|
||||
zonemeta.o zstrfmt.o plurrule.o plurfmt.o dtitvfmt.o dtitvinf.o \
|
||||
tmunit.o tmutamt.o tmutfmt.o
|
||||
tmunit.o tmutamt.o tmutfmt.o colldata.o bmsearch.o bms.o
|
||||
|
||||
## Header files to install
|
||||
HEADERS = $(srcdir)/unicode/*.h
|
||||
|
145
icu4c/source/i18n/bms.cpp
Normal file
145
icu4c/source/i18n/bms.cpp
Normal file
@ -0,0 +1,145 @@
|
||||
/*
|
||||
* Copyright (C) 2008-2009, International Business Machines Corporation and Others.
|
||||
* All rights reserved.
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "cmemory.h"
|
||||
#include "unicode/bms.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/colldata.h"
|
||||
#include "unicode/bmsearch.h"
|
||||
|
||||
// Define USE_SAFE_CASTS to route the cast macros through the C++ named casts;
// otherwise they fall back to plain C-style casts.
//#define USE_SAFE_CASTS
#ifdef USE_SAFE_CASTS
#define STATIC_CAST(type,value) static_cast<type>(value)
#define CONST_CAST(type,value) const_cast<type>(value)
#else
// The whole expansion is parenthesized so that a postfix operator applied to
// the macro (e.g. STATIC_CAST(T *, p)->member) binds to the cast result and
// not to the un-cast operand.
#define STATIC_CAST(type,value) ((type) (value))
#define CONST_CAST(type,value) ((type) (value))
#endif
|
||||
|
||||
/**
 * C entry point that forwards to CollData::open() and hands the result back
 * as an opaque UCD pointer.
 *
 * @param coll   collator passed through to CollData::open()
 * @param status error code, passed to CollData::open() by reference
 * @return whatever CollData::open() returned, cast to UCD *
 */
U_CAPI UCD * U_EXPORT2
ucd_open(UCollator *coll, UErrorCode *status)
{
    CollData *opened = CollData::open(coll, *status);

    return STATIC_CAST(UCD *, opened);
}
|
||||
|
||||
/**
 * C entry point that forwards an opaque UCD pointer to CollData::close().
 *
 * @param ucd handle previously obtained from ucd_open()
 */
U_CAPI void U_EXPORT2
ucd_close(UCD *ucd)
{
    CollData::close(STATIC_CAST(CollData *, ucd));
}
|
||||
|
||||
/**
 * Returns the collator held by the CollData behind an opaque UCD handle.
 *
 * @param ucd handle previously obtained from ucd_open(); dereferenced
 *            without a NULL check
 * @return the result of CollData::getCollator()
 */
U_CAPI UCollator * U_EXPORT2
ucd_getCollator(UCD *ucd)
{
    CollData *collData = STATIC_CAST(CollData *, ucd);
    UCollator *collator = collData->getCollator();

    return collator;
}
|
||||
|
||||
/**
 * C entry point for CollData::freeCollDataCache().
 */
U_CAPI void U_EXPORT2
ucd_freeCache()
{
    CollData::freeCollDataCache();
}
|
||||
|
||||
/**
 * C entry point for CollData::flushCollDataCache().
 */
U_CAPI void U_EXPORT2
ucd_flushCache()
{
    CollData::flushCollDataCache();
}
|
||||
|
||||
struct BMS
|
||||
{
|
||||
BoyerMooreSearch *bms;
|
||||
const UnicodeString *targetString;
|
||||
};
|
||||
|
||||
/**
 * Creates a BMS handle wrapping a BoyerMooreSearch for the given pattern and
 * (optional) target text.
 *
 * @param ucd           collation data handle, used as a CollData *
 * @param pattern       pattern text
 * @param patternLength length of pattern
 * @param target        target text, or NULL to open without a target
 * @param targetLength  length of target
 * @param status        set to U_MEMORY_ALLOCATION_ERROR if an allocation fails
 * @return NULL if the handle itself could not be allocated. If a later
 *         allocation fails the handle is still returned (with a NULL searcher)
 *         so the caller can pass it to bms_close().
 */
U_CAPI BMS * U_EXPORT2
bms_open(UCD *ucd,
         const UChar *pattern, int32_t patternLength,
         const UChar *target, int32_t targetLength,
         UErrorCode *status)
{
    BMS *handle = STATIC_CAST(BMS *, uprv_malloc(sizeof(BMS)));

    if (handle == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    CollData *data = (CollData *) ucd;
    UnicodeString patternString(pattern, patternLength);

    // Start with no target; replaced below when the caller supplied one.
    handle->targetString = NULL;

    if (target != NULL) {
        handle->targetString = new UnicodeString(target, targetLength);

        if (handle->targetString == NULL) {
            // Hand back a half-built handle with no searcher.
            handle->bms = NULL;
            *status = U_MEMORY_ALLOCATION_ERROR;
            return handle;
        }
    }

    handle->bms = new BoyerMooreSearch(data, patternString, handle->targetString, *status);

    if (handle->bms == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
    }

    return handle;
}
|
||||
|
||||
/**
 * Releases a BMS handle together with the searcher and target copy it owns.
 *
 * @param bms handle returned by bms_open(); NULL is accepted and ignored so
 *            that cleanup code can call this unconditionally (bms_open()
 *            returns NULL when the handle itself cannot be allocated).
 */
U_CAPI void U_EXPORT2
bms_close(BMS *bms)
{
    if (bms == NULL) {
        return;
    }

    // delete on a NULL member is a no-op, so a half-built handle is fine here.
    delete bms->bms;

    delete bms->targetString;

    uprv_free(bms);
}
|
||||
|
||||
/**
 * Forwards to BoyerMooreSearch::empty() on the wrapped searcher.
 *
 * @param bms handle from bms_open(); neither it nor its searcher is checked
 *            for NULL
 * @return the result of BoyerMooreSearch::empty()
 */
U_CAPI UBool U_EXPORT2
bms_empty(BMS *bms)
{
    BoyerMooreSearch *searcher = bms->bms;

    return searcher->empty();
}
|
||||
|
||||
/**
 * Returns the collation data used by the wrapped searcher as an opaque UCD.
 *
 * @param bms handle from bms_open(); neither it nor its searcher is checked
 *            for NULL
 * @return the result of BoyerMooreSearch::getData(), cast to UCD *
 */
U_CAPI UCD * U_EXPORT2
bms_getData(BMS *bms)
{
    BoyerMooreSearch *searcher = bms->bms;

    return STATIC_CAST(UCD *, searcher->getData());
}
|
||||
|
||||
/**
 * Forwards to BoyerMooreSearch::search() on the wrapped searcher.
 *
 * @param bms    handle from bms_open()
 * @param offset offset in the target at which to start searching
 * @param start  receives the match start; must not be NULL
 * @param end    receives the match limit; must not be NULL
 * @return the result of BoyerMooreSearch::search()
 */
U_CAPI UBool U_EXPORT2
bms_search(BMS *bms, int32_t offset, int32_t *start, int32_t *end)
{
    BoyerMooreSearch *searcher = bms->bms;

    return searcher->search(offset, *start, *end);
}
|
||||
|
||||
/**
 * Replaces the target text searched by a BMS handle.
 *
 * The handle keeps its own heap copy of the text. The new copy is created
 * and installed in the searcher before the old copy is released, so the
 * searcher is never left pointing at freed memory. If the new copy cannot be
 * allocated, the previous target stays in place.
 *
 * @param bms          handle from bms_open()
 * @param target       new target text, or NULL to clear the target
 * @param targetLength length of target
 * @param status       does nothing if already a failure on entry; set to
 *                     U_MEMORY_ALLOCATION_ERROR if the copy cannot be allocated
 */
U_CAPI void U_EXPORT2
bms_setTargetString(BMS *bms, const UChar *target, int32_t targetLength, UErrorCode *status)
{
    if (U_FAILURE(*status)) {
        return;
    }

    const UnicodeString *newTarget = NULL;

    if (target != NULL) {
        newTarget = new UnicodeString(target, targetLength);

        if (newTarget == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
    }

    const UnicodeString *oldTarget = bms->targetString;

    bms->targetString = newTarget;
    bms->bms->setTargetString(bms->targetString, *status);

    // Only now is it safe to drop the text the searcher was using before.
    delete oldTarget;
}
|
864
icu4c/source/i18n/bmsearch.cpp
Normal file
864
icu4c/source/i18n/bmsearch.cpp
Normal file
@ -0,0 +1,864 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 1996-2009, International Business Machines *
|
||||
* Corporation and others. All Rights Reserved. *
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_COLLATION
|
||||
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/usearch.h"
|
||||
|
||||
#include "cmemory.h"
|
||||
#include "unicode/coll.h"
|
||||
#include "unicode/tblcoll.h"
|
||||
#include "unicode/coleitr.h"
|
||||
#include "unicode/ucoleitr.h"
|
||||
|
||||
#include "unicode/regex.h" // TODO: make conditional on regexp being built.
|
||||
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/uset.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "hash.h"
|
||||
#include "uhash.h"
|
||||
#include "ucol_imp.h"
|
||||
#include "unormimp.h"
|
||||
|
||||
#include "unicode/colldata.h"
|
||||
#include "unicode/bmsearch.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Number of elements in a true array (not a pointer).
#define ARRAY_SIZE(array) (sizeof(array)/sizeof((array)[0]))
// Allocates uninitialized storage for `count` objects of `type` with uprv_malloc();
// the expansion is parenthesized so it can be used inside larger expressions.
#define NEW_ARRAY(type, count) ((type *) uprv_malloc((count) * sizeof(type)))
// Releases storage obtained with NEW_ARRAY.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
|
||||
|
||||
|
||||
struct CEI
|
||||
{
|
||||
uint32_t order;
|
||||
int32_t lowOffset;
|
||||
int32_t highOffset;
|
||||
};
|
||||
|
||||
class Target : public UMemory
|
||||
{
|
||||
public:
|
||||
Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status);
|
||||
~Target();
|
||||
|
||||
void setTargetString(const UnicodeString *target);
|
||||
|
||||
const CEI *nextCE(int32_t offset);
|
||||
const CEI *prevCE(int32_t offset);
|
||||
|
||||
int32_t stringLength();
|
||||
UChar charAt(int32_t offset);
|
||||
|
||||
UBool isBreakBoundary(int32_t offset);
|
||||
int32_t nextBreakBoundary(int32_t offset);
|
||||
int32_t nextSafeBoundary(int32_t offset);
|
||||
|
||||
UBool isIdentical(UnicodeString &pattern, int32_t start, int32_t end);
|
||||
|
||||
void setOffset(int32_t offset);
|
||||
void setLast(int32_t last);
|
||||
int32_t getOffset();
|
||||
|
||||
private:
|
||||
CEI *ceb;
|
||||
int32_t bufferSize;
|
||||
int32_t bufferMin;
|
||||
int32_t bufferMax;
|
||||
|
||||
uint32_t strengthMask;
|
||||
UCollationStrength strength;
|
||||
uint32_t variableTop;
|
||||
UBool toShift;
|
||||
UCollator *coll;
|
||||
|
||||
const UnicodeString *targetString;
|
||||
const UChar *targetBuffer;
|
||||
int32_t targetLength;
|
||||
|
||||
UCollationElements *elements;
|
||||
UBreakIterator *charBreakIterator;
|
||||
};
|
||||
|
||||
/**
 * Captures the collator's strength and alternate-handling settings, sizes
 * and allocates the collation element buffer, and attaches the target text.
 *
 * @param theCollator   collator used for all iteration; not owned
 * @param target        text to search, or NULL to attach one later
 * @param patternLength pattern length, used to size the element buffer
 * @param status        set to U_MEMORY_ALLOCATION_ERROR if the buffer cannot
 *                      be allocated; also receives errors from the collator
 *                      attribute queries
 */
Target::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status)
    : bufferSize(0), bufferMin(0), bufferMax(0),
      strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator),
      targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL)
{
    strength    = ucol_getStrength(coll);
    toShift     = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
    variableTop = ucol_getVariableTop(coll, &status);

    // Scan the zero-terminated expansion size list for its largest entry.
    uint8_t widestExpansion = 0;
    const uint8_t *sizes = coll->expansionCESize;

    while (*sizes != 0) {
        if (*sizes > widestExpansion) {
            widestExpansion = *sizes;
        }

        sizes += 1;
    }

    // room for an extra character on each end, plus 4 for safety
    bufferSize = patternLength + (2 * widestExpansion) + 4;
    ceb = NEW_ARRAY(CEI, bufferSize);

    if (ceb == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    if (target != NULL) {
        setTargetString(target);
    }

    // Primary weights always take part; secondary and tertiary weights are
    // added for every strength above primary / secondary respectively.
    strengthMask |= UCOL_PRIMARYORDERMASK;

    if (strength != UCOL_PRIMARY) {
        strengthMask |= UCOL_SECONDARYORDERMASK;

        if (strength != UCOL_SECONDARY) {
            strengthMask |= UCOL_TERTIARYORDERMASK;
        }
    }
}
|
||||
|
||||
/**
 * Closes both iterators and frees the collation element buffer.
 */
Target::~Target()
{
    ubrk_close(charBreakIterator);
    ucol_closeElements(elements);
    DELETE_ARRAY(ceb);
}
|
||||
|
||||
void Target::setTargetString(const UnicodeString *target)
|
||||
{
|
||||
if (charBreakIterator != NULL) {
|
||||
ubrk_close(charBreakIterator);
|
||||
ucol_closeElements(elements);
|
||||
}
|
||||
|
||||
targetString = target;
|
||||
|
||||
if (targetString != NULL) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
targetBuffer = targetString->getBuffer();
|
||||
targetLength = targetString->length();
|
||||
|
||||
elements = ucol_openElements(coll, target->getBuffer(), target->length(), &status);
|
||||
ucol_forceHanImplicit(elements, &status);
|
||||
|
||||
charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocale(coll, ULOC_VALID_LOCALE, &status),
|
||||
targetBuffer, targetLength, &status);
|
||||
} else {
|
||||
targetBuffer = NULL;
|
||||
targetLength = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Returns the buffered collation element at index `offset`, reading one more
 * element forward from the collation iterator when `offset` is exactly the
 * next free buffer slot.
 *
 * Elements that are ignorable after masking (and, with shifted alternate
 * handling below quaternary strength, variable elements) are skipped.
 *
 * @param offset index into the element buffer
 * @return the buffered element, or NULL if the buffer is full or `offset` is
 *         neither buffered nor the next free slot
 */
const CEI *Target::nextCE(int32_t offset)
{
    if (offset >= bufferMin && offset < bufferMax) {
        return &ceb[offset];
    }

    if (bufferMax >= bufferSize || offset != bufferMax) {
        return NULL;
    }

    UErrorCode status = U_ZERO_ERROR;
    int32_t lowSide = -1, highSide = -1;
    uint32_t ce;
    UBool continuation = FALSE;

    for (;;) {
        lowSide  = ucol_getOffset(elements);
        ce       = ucol_next(elements, &status);
        highSide = ucol_getOffset(elements);

        if (ce == UCOL_NULLORDER) {
            break;
        }

        continuation = isContinuation(ce);
        ce &= strengthMask;

        if (toShift && variableTop > ce && (ce & UCOL_PRIMARYORDERMASK) != 0) {
            if (strength >= UCOL_QUATERNARY) {
                ce &= UCOL_PRIMARYORDERMASK;
            } else {
                ce = UCOL_IGNORABLE;
            }
        }

        if (ce != UCOL_IGNORABLE) {
            break;
        }
    }

    // The mask above strips the continuation flag; put it back.
    if (continuation) {
        ce |= UCOL_CONTINUATION_MARKER;
    }

    CEI *slot = &ceb[offset];

    slot->order      = ce;
    slot->lowOffset  = lowSide;
    slot->highOffset = highSide;

    bufferMax += 1;

    return slot;
}
|
||||
|
||||
/**
 * Returns the buffered collation element at index `offset`, reading one more
 * element backward from the collation iterator when `offset` is exactly the
 * next free buffer slot.
 *
 * Elements that are ignorable after masking (and, with shifted alternate
 * handling below quaternary strength, variable elements) are skipped.
 *
 * @param offset index into the element buffer
 * @return the buffered element, or NULL if the buffer is full or `offset` is
 *         neither buffered nor the next free slot
 */
const CEI *Target::prevCE(int32_t offset)
{
    if (offset >= bufferMin && offset < bufferMax) {
        return &ceb[offset];
    }

    if (bufferMax >= bufferSize || offset != bufferMax) {
        return NULL;
    }

    UErrorCode status = U_ZERO_ERROR;
    int32_t lowSide = -1, highSide = -1;
    uint32_t ce;
    UBool continuation = FALSE;

    for (;;) {
        // Moving backward: the position before the call is the high side.
        highSide = ucol_getOffset(elements);
        ce       = ucol_previous(elements, &status);
        lowSide  = ucol_getOffset(elements);

        if (ce == UCOL_NULLORDER) {
            break;
        }

        continuation = isContinuation(ce);
        ce &= strengthMask;

        if (toShift && variableTop > ce && (ce & UCOL_PRIMARYORDERMASK) != 0) {
            if (strength >= UCOL_QUATERNARY) {
                ce &= UCOL_PRIMARYORDERMASK;
            } else {
                ce = UCOL_IGNORABLE;
            }
        }

        if (ce != UCOL_IGNORABLE) {
            break;
        }
    }

    bufferMax += 1;

    // The mask above strips the continuation flag; put it back.
    if (continuation) {
        ce |= UCOL_CONTINUATION_MARKER;
    }

    CEI *slot = &ceb[offset];

    slot->order      = ce;
    slot->lowOffset  = lowSide;
    slot->highOffset = highSide;

    return slot;
}
|
||||
|
||||
/**
 * @return the length of the current target text, or 0 when none is attached
 */
int32_t Target::stringLength()
{
    return (targetString != NULL) ? targetLength : 0;
}
|
||||
|
||||
/**
 * @param offset index into the target buffer; not range-checked
 * @return the code unit at `offset`, or 0x0000 when no target is attached
 */
UChar Target::charAt(int32_t offset)
{
    if (targetString == NULL) {
        return 0x0000;
    }

    return targetBuffer[offset];
}
|
||||
|
||||
/**
 * Empties the element buffer and moves the collation iterator to `offset`.
 * Any error from ucol_setOffset() is discarded.
 */
void Target::setOffset(int32_t offset)
{
    bufferMin = 0;
    bufferMax = 0;

    UErrorCode ignored = U_ZERO_ERROR;

    ucol_setOffset(elements, offset, &ignored);
}
|
||||
|
||||
/**
 * Resets the element buffer so that it holds a single UCOL_NULLORDER entry
 * positioned at `last`, and moves the collation iterator there.
 * Any error from ucol_setOffset() is discarded.
 */
void Target::setLast(int32_t last)
{
    CEI &sentinel = ceb[0];

    sentinel.order      = UCOL_NULLORDER;
    sentinel.lowOffset  = last;
    sentinel.highOffset = last;

    bufferMin = 0;
    bufferMax = 1;

    UErrorCode ignored = U_ZERO_ERROR;

    ucol_setOffset(elements, last, &ignored);
}
|
||||
|
||||
/**
 * @return the current offset of the collation element iterator
 */
int32_t Target::getOffset()
{
    const int32_t current = ucol_getOffset(elements);

    return current;
}
|
||||
|
||||
/**
 * @return the result of ubrk_isBoundary() for `offset` on the character
 *         break iterator
 */
UBool Target::isBreakBoundary(int32_t offset)
{
    const UBool onBoundary = ubrk_isBoundary(charBreakIterator, offset);

    return onBoundary;
}
|
||||
|
||||
/**
 * @return the result of ubrk_following() for `offset` on the character
 *         break iterator
 */
int32_t Target::nextBreakBoundary(int32_t offset)
{
    const int32_t boundary = ubrk_following(charBreakIterator, offset);

    return boundary;
}
|
||||
|
||||
/**
 * Scans forward from `offset` for the first code unit that is either a lead
 * surrogate or not flagged unsafe by ucol_unsafeCP() for this collator.
 *
 * @return the offset of that code unit, or the target length if there is none
 */
int32_t Target::nextSafeBoundary(int32_t offset)
{
    for (int32_t pos = offset; pos < targetLength; pos += 1) {
        const UChar unit = targetBuffer[pos];

        if (U_IS_LEAD(unit) || ! ucol_unsafeCP(unit, coll)) {
            return pos;
        }
    }

    return targetLength;
}
|
||||
|
||||
/**
 * For identical-strength collators, checks that the canonical decomposition
 * of target[start, end) equals the canonical decomposition of the pattern.
 *
 * @param pattern the pattern text
 * @param start   start offset of the candidate match in the target
 * @param end     limit offset of the candidate match in the target
 * @return TRUE if strength is below UCOL_IDENTICAL, or if both decompositions
 *         are the same; FALSE otherwise, including when decomposition or
 *         allocation fails
 */
UBool Target::isIdentical(UnicodeString &pattern, int32_t start, int32_t end)
{
    if (strength < UCOL_IDENTICAL) {
        return TRUE;
    }

    UChar targetStack[32], patternStack[32];
    const UChar *patternChars  = pattern.getBuffer();
    const int32_t patternCount = pattern.length();
    const int32_t matchCount   = end - start;

    UErrorCode status = U_ZERO_ERROR, status2 = U_ZERO_ERROR;

    int32_t decomplength = unorm_decompose(targetStack, ARRAY_SIZE(targetStack),
                                           targetBuffer + start, matchCount,
                                           FALSE, 0, &status);

    // use separate status2 in case of buffer overflow
    const int32_t patternDecompLength = unorm_decompose(patternStack, ARRAY_SIZE(patternStack),
                                                        patternChars, patternCount,
                                                        FALSE, 0, &status2);

    if (decomplength != patternDecompLength) {
        return FALSE; // lengths are different
    }

    // compare contents
    UChar *text, *pat;

    if (U_SUCCESS(status)) {
        text = targetStack;
        pat  = patternStack;
    } else if (status == U_BUFFER_OVERFLOW_ERROR) {
        status = U_ZERO_ERROR;

        // The stack buffers were too small: allocate one heap buffer that
        // holds both decompositions back to back.
        text = NEW_ARRAY(UChar, decomplength * 2);

        if (text == NULL) {
            return FALSE;
        }

        pat = text + decomplength;

        unorm_decompose(text, decomplength, targetBuffer + start,
                        matchCount, FALSE, 0, &status);

        unorm_decompose(pat, decomplength, patternChars,
                        patternCount, FALSE, 0, &status);
    } else {
        // NFD failed, make sure that u_memcmp() does not overrun the stack
        // buffers and that we don't free an undefined text pointer
        text = pat = targetStack;
        decomplength = 0;
    }

    UBool same = (UBool)(u_memcmp(pat, text, decomplength) == 0);

    if (text != targetStack) {
        DELETE_ARRAY(text);
    }

    // return FALSE if NFD failed
    return U_SUCCESS(status) && same;
}
|
||||
|
||||
#define HASH_TABLE_SIZE 257
|
||||
|
||||
class BadCharacterTable : public UMemory
|
||||
{
|
||||
public:
|
||||
BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status);
|
||||
~BadCharacterTable();
|
||||
|
||||
int32_t operator[](uint32_t ce) const;
|
||||
int32_t getMaxSkip() const;
|
||||
int32_t minLengthInChars(int32_t index);
|
||||
|
||||
private:
|
||||
static int32_t hash(uint32_t ce);
|
||||
|
||||
int32_t maxSkip;
|
||||
int32_t badCharacterTable[HASH_TABLE_SIZE];
|
||||
|
||||
int32_t *minLengthCache;
|
||||
};
|
||||
|
||||
/**
 * Builds the bad-character skip table and the minimum-length cache for a
 * pattern's collation elements.
 *
 * maxSkip and every table entry are zeroed before anything can fail, so
 * getMaxSkip() and operator[] return defined values even when construction
 * returns early (failed status, empty pattern, or allocation failure).
 *
 * @param patternCEs collation elements of the pattern
 * @param data       collation data used to compute minimum lengths
 * @param status     nothing is built if already a failure on entry; set to
 *                   U_MEMORY_ALLOCATION_ERROR if an allocation fails
 */
BadCharacterTable::BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status)
    : maxSkip(0), minLengthCache(NULL)
{
    for (int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {
        badCharacterTable[j] = 0;
    }

    int32_t plen = patternCEs.size();

    // **** need a better way to deal with this ****
    if (U_FAILURE(status) || plen == 0) {
        return;
    }

    // Scratch array handed to CollData::minLengthInChars().
    int32_t *history = NEW_ARRAY(int32_t, plen);

    if (history == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    for (int32_t i = 0; i < plen; i += 1) {
        history[i] = -1;
    }

    minLengthCache = NEW_ARRAY(int32_t, plen + 1);

    if (minLengthCache == NULL) {
        DELETE_ARRAY(history);
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    maxSkip = minLengthCache[0] = data->minLengthInChars(&patternCEs, 0, history);

    // Default for CEs that do not occur in the pattern: skip the whole pattern.
    for (int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {
        badCharacterTable[j] = maxSkip;
    }

    for (int32_t p = 1; p < plen; p += 1) {
        minLengthCache[p] = data->minLengthInChars(&patternCEs, p, history);

        // Make sure this entry is not bigger than the previous one.
        // Otherwise, we might skip too far in some cases.
        if (minLengthCache[p] < 0 || minLengthCache[p] > minLengthCache[p - 1]) {
            minLengthCache[p] = minLengthCache[p - 1];
        }
    }

    minLengthCache[plen] = 0;

    // Later occurrences of the same bucket overwrite earlier ones; the last
    // pattern CE is deliberately not entered.
    for (int32_t p = 0; p < plen - 1; p += 1) {
        badCharacterTable[hash(patternCEs[p])] = minLengthCache[p + 1];
    }

    DELETE_ARRAY(history);
}
|
||||
|
||||
/**
 * Frees the minimum-length cache.
 */
BadCharacterTable::~BadCharacterTable()
{
    DELETE_ARRAY(minLengthCache);
}
|
||||
|
||||
/**
 * @return the skip distance stored in the hash bucket for `ce`
 */
int32_t BadCharacterTable::operator[](uint32_t ce) const
{
    const int32_t bucket = hash(ce);

    return badCharacterTable[bucket];
}
|
||||
|
||||
/**
 * @return the largest skip distance recorded by the constructor
 */
int32_t BadCharacterTable::getMaxSkip() const
{
    return this->maxSkip;
}
|
||||
|
||||
/**
 * @param index pattern CE index; not range-checked
 * @return the cached minimum length for the pattern from CE `index` on
 */
int32_t BadCharacterTable::minLengthInChars(int32_t index)
{
    const int32_t cached = minLengthCache[index];

    return cached;
}
|
||||
|
||||
/**
 * Maps a collation element to a table bucket using its primary weight.
 *
 * @return a bucket index in [0, HASH_TABLE_SIZE)
 */
int32_t BadCharacterTable::hash(uint32_t ce)
{
    const int32_t bucket = UCOL_PRIMARYORDER(ce) % HASH_TABLE_SIZE;

    return bucket;
}
|
||||
|
||||
class GoodSuffixTable : public UMemory
|
||||
{
|
||||
public:
|
||||
GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status);
|
||||
~GoodSuffixTable();
|
||||
|
||||
int32_t operator[](int32_t offset) const;
|
||||
|
||||
private:
|
||||
int32_t *goodSuffixTable;
|
||||
};
|
||||
|
||||
/**
 * Builds the good-suffix skip table for a pattern's collation elements.
 * Skip distances are taken from the bad character table's minimum-length
 * cache.
 *
 * @param patternCEs        collation elements of the pattern
 * @param badCharacterTable supplies getMaxSkip() and minLengthInChars()
 * @param status            nothing is built if already a failure on entry or
 *                          the pattern is empty; set to
 *                          U_MEMORY_ALLOCATION_ERROR if an allocation fails
 */
GoodSuffixTable::GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status)
    : goodSuffixTable(NULL)
{
    const int32_t patlen = patternCEs.size();

    // **** need a better way to deal with this ****
    if (U_FAILURE(status) || patlen <= 0) {
        return;
    }

    // suffixLen[i] is the length of the longest run ending at i that matches
    // the end of the pattern.
    int32_t *suffixLen = NEW_ARRAY(int32_t, patlen);
    int32_t matchStart = patlen - 1, matchEnd = -1;
    const int32_t maxSkip = badCharacterTable.getMaxSkip();

    if (suffixLen == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    // initialize suffixLen
    suffixLen[patlen - 1] = patlen;

    for (int32_t i = patlen - 2; i >= 0; i -= 1) {
        // (i > matchStart) means we're inside the last suffix match we found
        // ((patlen - 1) - matchEnd) is how far the end of that match is from end of pattern
        // (i - matchStart) is how far we are from start of that match
        // (i + (patlen - 1) - matchEnd) is index of same character at end of pattern
        // so if any suffix match at that character doesn't extend beyond the last match,
        // it's the suffix for this character as well
        if (i > matchStart && suffixLen[i + patlen - 1 - matchEnd] < i - matchStart) {
            suffixLen[i] = suffixLen[i + patlen - 1 - matchEnd];
        } else {
            matchStart = matchEnd = i;

            int32_t tail = patlen;

            while (matchStart >= 0 && patternCEs[matchStart] == patternCEs[--tail]) {
                matchStart -= 1;
            }

            suffixLen[i] = matchEnd - matchStart;
        }
    }

    // now build goodSuffixTable
    goodSuffixTable = NEW_ARRAY(int32_t, patlen);

    if (goodSuffixTable == NULL) {
        DELETE_ARRAY(suffixLen);
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    // initialize entries to minLengthInChars of the pattern
    for (int32_t i = 0; i < patlen; i += 1) {
        goodSuffixTable[i] = maxSkip;
    }

    int32_t prefix = 0;

    for (int32_t i = patlen - /*1*/ 2; i >= 0; i -= 1) {
        if (suffixLen[i] != i + 1) {
            continue;
        }

        // this matching suffix is a prefix of the pattern
        const int32_t prefixSkip = badCharacterTable.minLengthInChars(i + 1);

        // for any mis-match before this suffix, we should skip
        // so that the front of the pattern (i.e. the prefix)
        // lines up with the front of the suffix.
        // (patlen - 1 - i) is the start of the suffix
        while (prefix < patlen - 1 - i) {
            // value of maxSkip means never set...
            if (goodSuffixTable[prefix] == maxSkip) {
                goodSuffixTable[prefix] = prefixSkip;
            }

            prefix += 1;
        }
    }

    for (int32_t i = 0; i < patlen - 1; i += 1) {
        goodSuffixTable[patlen - 1 - suffixLen[i]] = badCharacterTable.minLengthInChars(i + 1);
    }

    DELETE_ARRAY(suffixLen);
}
|
||||
|
||||
/**
 * Frees the skip table.
 */
GoodSuffixTable::~GoodSuffixTable()
{
    DELETE_ARRAY(goodSuffixTable);
}
|
||||
|
||||
/**
 * @param offset pattern CE index; not range-checked
 * @return the skip distance stored for that index
 */
int32_t GoodSuffixTable::operator[](int32_t offset) const
{
    const int32_t skip = goodSuffixTable[offset];

    return skip;
}
|
||||
|
||||
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BoyerMooreSearch)
|
||||
|
||||
|
||||
/**
 * @return TRUE when the pattern produced no collation elements
 */
UBool BoyerMooreSearch::empty()
{
    const int32_t ceCount = patCEs->size();

    return ceCount <= 0;
}
|
||||
|
||||
/**
 * @return the collation data this searcher was constructed with
 */
CollData *BoyerMooreSearch::getData()
{
    return this->data;
}
|
||||
|
||||
/**
 * @return the pattern's collation element list; NULL if it was never created
 */
CEList *BoyerMooreSearch::getPatternCEs()
{
    return this->patCEs;
}
|
||||
|
||||
/**
 * @return the bad character table; NULL if it was never created
 */
BadCharacterTable *BoyerMooreSearch::getBadCharacterTable()
{
    return this->badCharacterTable;
}
|
||||
|
||||
/**
 * @return the good suffix table; NULL if it was never created
 */
GoodSuffixTable *BoyerMooreSearch::getGoodSuffixTable()
{
    return this->goodSuffixTable;
}
|
||||
|
||||
/**
 * Builds a searcher for `patternString`: converts the pattern to collation
 * elements, builds both skip tables and, if a target is supplied, wraps it
 * in a Target.
 *
 * Every allocation is checked, and a failed one is reported through `status`
 * as U_MEMORY_ALLOCATION_ERROR instead of leaving a silently half-built object.
 *
 * @param theData       collation data (supplies the collator); not owned
 * @param patternString pattern to search for
 * @param targetString  text to search, or NULL to set one later; not owned
 * @param status        nothing is built if already a failure on entry
 */
BoyerMooreSearch::BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString,
                                   UErrorCode &status)
    : data(theData), patCEs(NULL), badCharacterTable(NULL), goodSuffixTable(NULL), pattern(patternString), target(NULL)
{
    if (U_FAILURE(status)) {
        return;
    }

    UCollator *collator = data->getCollator();

    patCEs = new CEList(collator, patternString, status);

    if (patCEs == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    if (U_FAILURE(status)) {
        return;
    }

    badCharacterTable = new BadCharacterTable(*patCEs, data, status);

    if (badCharacterTable == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    if (U_FAILURE(status)) {
        return;
    }

    goodSuffixTable = new GoodSuffixTable(*patCEs, *badCharacterTable, status);

    if (goodSuffixTable == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    if (targetString != NULL) {
        target = new Target(collator, targetString, patCEs->size(), status);

        // Do not overwrite an error already reported by an earlier step.
        if (target == NULL && U_SUCCESS(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }
}
|
||||
|
||||
/**
 * Releases the target wrapper, both skip tables and the pattern CE list.
 * The collation data and the target string are not owned and are left alone.
 */
BoyerMooreSearch::~BoyerMooreSearch()
{
    delete target;
    delete goodSuffixTable;
    delete badCharacterTable;
    delete patCEs;
}
|
||||
|
||||
void BoyerMooreSearch::setTargetString(const UnicodeString *targetString, UErrorCode &status)
|
||||
{
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (target == NULL) {
|
||||
target = new Target(data->getCollator(), targetString, patCEs->size(), status);
|
||||
} else {
|
||||
target->setTargetString(targetString);
|
||||
}
|
||||
}
|
||||
|
||||
// **** main flow of this code from Laura Werner's "Unicode Text Searching in Java" paper. ****
|
||||
/*
|
||||
* TODO:
|
||||
* * deal with trailing (and leading?) ignorables.
|
||||
* * Adding BoyerMooreSearch object slowed it down. How can we speed it up?
|
||||
*/
|
||||
/**
 * Searches the target for the pattern, starting at `offset`, by comparing
 * collation elements from the end of the pattern backward and using the
 * bad-character and good-suffix tables to decide how far to advance after
 * a mismatch.
 *
 * A candidate match is accepted only if its start and limit fall on
 * character break boundaries, it is not cut off inside an expansion, and it
 * passes the identical-strength check.
 *
 * If the searcher is not fully built (no target set, or construction failed
 * so that the pattern CEs or a skip table are missing) the search reports
 * "no match" instead of dereferencing a NULL member.
 *
 * @param offset offset in the target at which to start
 * @param start  receives the match start, or -1 if there is no match
 * @param end    receives the match limit, or -1 if there is no match
 * @return TRUE if a match was found
 */
UBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end)
{
    if (target == NULL || patCEs == NULL || badCharacterTable == NULL || goodSuffixTable == NULL) {
        start = end = -1;
        return FALSE;
    }

    int32_t plen = patCEs->size();

    if (plen <= 0) {
        // Searching for a zero length pattern always fails.
        start = end = -1;
        return FALSE;
    }

    int32_t tlen = target->stringLength();
    int32_t maxSkip = badCharacterTable->getMaxSkip();
    int32_t tOffset = offset + maxSkip;

    // NOTE(review): prevCE()/nextCE() return NULL when the element buffer is
    // full; the loops below assume that never happens — confirm the buffer
    // sizing in Target guarantees it.
    while (tOffset <= tlen) {
        int32_t pIndex = plen - 1;
        int32_t tIndex = 0;
        int32_t lIndex = 0;

        if (tOffset < tlen) {
            // **** we really want to skip ahead enough to  ****
            // **** be sure we get at least 1 non-ignorable ****
            // **** CE after the end of the pattern.        ****
            int32_t next = target->nextSafeBoundary(tOffset + 1);

            target->setOffset(next);

            // Walk backward from the safe boundary to the CE that ends at
            // (or straddles) tOffset.
            for (lIndex = 0; ; lIndex += 1) {
                const CEI *cei = target->prevCE(lIndex);
                int32_t low = cei->lowOffset;
                int32_t high = cei->highOffset;

                if (high == 0 || (low < high && low <= tOffset)) {
                    if (low < tOffset) {
                        while (lIndex >= 0 && target->prevCE(lIndex)->highOffset == high) {
                            lIndex -= 1;
                        }

                        if (high > tOffset) {
                            tOffset = high;
                        }
                    }

                    break;
                }
            }
        } else {
            target->setLast(tOffset);
            lIndex = 0;
        }

        tIndex = ++lIndex;

        // Iterate backward until we hit the beginning of the pattern
        while (pIndex >= 0) {
            uint32_t pce = (*patCEs)[pIndex];
            const CEI *tcei = target->prevCE(tIndex++);

            if (tcei->order != pce) {
                // There is a mismatch at this position.  Decide how far
                // over to shift the pattern, then try again.
                int32_t gsOffset = tOffset + (*goodSuffixTable)[pIndex];
#ifdef EXTRA_CAUTIOUS
                int32_t old = tOffset;
#endif

                tOffset += (*badCharacterTable)[tcei->order] - badCharacterTable->minLengthInChars(pIndex + 1);

                if (gsOffset > tOffset) {
                    tOffset = gsOffset;
                }

#ifdef EXTRA_CAUTIOUS
                // Make sure we don't skip backwards...
                if (tOffset <= old) {
                    tOffset = old + 1;
                }
#endif

                break;
            }

            pIndex -= 1;
        }

        if (pIndex < 0) {
            // We made it back to the beginning of the pattern,
            // which means we matched it all.  Return the location.
            const CEI firstCEI = *target->prevCE(tIndex - 1);
            const CEI lastCEI  = *target->prevCE(lIndex);
            int32_t mStart   = firstCEI.lowOffset;
            int32_t minLimit = lastCEI.lowOffset;
            int32_t maxLimit = lastCEI.highOffset;
            int32_t mLimit;
            UBool found = TRUE;

            target->setOffset(/*tOffset*/maxLimit);

            const CEI nextCEI = *target->nextCE(0);

            if (nextCEI.lowOffset > maxLimit) {
                maxLimit = nextCEI.lowOffset;
            }

            // The element after the match has zero width: the match would
            // end in the middle of an expansion.
            if (nextCEI.lowOffset == nextCEI.highOffset && nextCEI.order != UCOL_NULLORDER) {
                found = FALSE;
            }

            if (! target->isBreakBoundary(mStart)) {
                found = FALSE;
            }

            // Likewise for a zero-width first element.
            if (firstCEI.lowOffset == firstCEI.highOffset) {
                found = FALSE;
            }

            mLimit = maxLimit;
            if (minLimit < maxLimit) {
                int32_t nbb = target->nextBreakBoundary(minLimit);

                if (nbb >= lastCEI.highOffset) {
                    mLimit = nbb;
                }
            }

            if (mLimit > maxLimit) {
                found = FALSE;
            }

            if (! target->isBreakBoundary(mLimit)) {
                found = FALSE;
            }

            if (! target->isIdentical(pattern, mStart, mLimit)) {
                found = FALSE;
            }

            if (found) {
                start = mStart;
                end   = mLimit;

                return TRUE;
            }

            tOffset += (*goodSuffixTable)[0]; // really? Maybe += 1 or += maxSkip?
        }
        // Otherwise, we're here because of a mismatch, so keep going....
    }

    // no match
    start = -1;
    end = -1;
    return FALSE;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // #if !UCONFIG_NO_COLLATION
|
1104
icu4c/source/i18n/colldata.cpp
Normal file
1104
icu4c/source/i18n/colldata.cpp
Normal file
File diff suppressed because it is too large
Load Diff
@ -408,6 +408,40 @@
|
||||
<Filter
|
||||
Name="collation"
|
||||
>
|
||||
<File
|
||||
RelativePath=".\bms.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unicode\bms.h"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode"
|
||||
Outputs="..\..\include\unicode\$(InputFileName)"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\bmsearch.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unicode\bmsearch.h"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode"
|
||||
Outputs="..\..\include\unicode\$(InputFileName)"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\bocsu.c"
|
||||
>
|
||||
@ -504,6 +538,23 @@
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\colldata.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unicode\colldata.h"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode"
|
||||
Outputs="..\..\include\unicode\$(InputFileName)"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\search.cpp"
|
||||
>
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
* *
|
||||
* Copyright (C) 2001-2008, International Business Machines *
|
||||
* Copyright (C) 2001-2009, International Business Machines *
|
||||
* Corporation and others. All Rights Reserved. *
|
||||
* *
|
||||
******************************************************************************
|
||||
@ -45,6 +45,7 @@ typedef enum ECleanupI18NType {
|
||||
UCLN_I18N_UCOL_RES,
|
||||
UCLN_I18N_UCOL_BLD,
|
||||
UCLN_I18N_CSDET,
|
||||
UCLN_I18N_COLL_DATA,
|
||||
UCLN_I18N_COUNT /* This must be last */
|
||||
} ECleanupI18NType;
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2008, International Business Machines
|
||||
* Copyright (C) 1996-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: ucol.cpp
|
||||
@ -123,7 +123,6 @@ uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
|
||||
IInit_collIterate(collator, sourceString, sourceLen, s);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Backup the state of the collIterate struct data
|
||||
* @param data collIterate to backup
|
||||
@ -1499,10 +1498,30 @@ inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou
|
||||
}
|
||||
else
|
||||
{
|
||||
order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
|
||||
// Always use UCA for Han, Hangul
|
||||
// (Han extension A is before main Han block)
|
||||
// **** Han compatibility chars ?? ****
|
||||
if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
|
||||
(ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
|
||||
if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
|
||||
// between the two target ranges; do normal lookup
|
||||
// **** this range is YI, Modifier tone letters, ****
|
||||
// **** Latin-D, Syloti Nagari, Phags-pa. ****
|
||||
// **** Latin-D might be tailored, so we need to ****
|
||||
// **** do the normal lookup for these guys. ****
|
||||
order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
|
||||
} else {
|
||||
// in one of the target ranges; use UCA
|
||||
order = UCOL_NOT_FOUND;
|
||||
}
|
||||
} else {
|
||||
order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
|
||||
}
|
||||
|
||||
if(order > UCOL_NOT_FOUND) { /* if a CE is special */
|
||||
order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */
|
||||
}
|
||||
|
||||
if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */
|
||||
/* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
|
||||
order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
|
||||
@ -1939,7 +1958,23 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
|
||||
result = coll->latinOneMapping[ch];
|
||||
}
|
||||
else {
|
||||
result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
|
||||
// Always use UCA for [3400..9FFF], [AC00..D7AF]
|
||||
// **** [FA0E..FA2F] ?? ****
|
||||
if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
|
||||
(ch >= 0x3400 && ch <= 0xD7AF)) {
|
||||
if (ch > 0x9FFF && ch < 0xAC00) {
|
||||
// between the two target ranges; do normal lookup
|
||||
// **** this range is YI, Modifier tone letters, ****
|
||||
// **** Latin-D, Syloti Nagari, Phags-pa. ****
|
||||
// **** Latin-D might be tailored, so we need to ****
|
||||
// **** do the normal lookup for these guys. ****
|
||||
result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
|
||||
} else {
|
||||
result = UCOL_NOT_FOUND;
|
||||
}
|
||||
} else {
|
||||
result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
|
||||
}
|
||||
}
|
||||
if (result > UCOL_NOT_FOUND) {
|
||||
result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
|
||||
@ -3545,38 +3580,12 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
||||
|
||||
int32_t offsetBias;
|
||||
|
||||
#if 0
|
||||
if (source->offsetReturn != NULL) {
|
||||
source->offsetStore = source->offsetReturn - noChars;
|
||||
}
|
||||
|
||||
// **** doesn't work if using iterator ****
|
||||
if (source->flags & UCOL_ITER_INNORMBUF) {
|
||||
if (source->fcdPosition == NULL) {
|
||||
offsetBias = 0;
|
||||
} else {
|
||||
offsetBias = (int32_t)(source->fcdPosition - source->string);
|
||||
}
|
||||
} else {
|
||||
offsetBias = (int32_t)(source->pos - source->string);
|
||||
}
|
||||
|
||||
#else
|
||||
// **** doesn't work if using iterator ****
|
||||
if (source->flags & UCOL_ITER_INNORMBUF) {
|
||||
#if 1
|
||||
offsetBias = -1;
|
||||
#else
|
||||
if (source->fcdPosition == NULL) {
|
||||
offsetBias = 0;
|
||||
} else {
|
||||
offsetBias = (int32_t)(source->fcdPosition - source->string);
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
offsetBias = (int32_t)(source->pos - source->string);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* a new collIterate is used to simplify things, since using the current
|
||||
collIterate will mean that the forward and backwards iteration will
|
||||
@ -3584,9 +3593,9 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
||||
collIterate temp;
|
||||
int32_t rawOffset;
|
||||
|
||||
//IInit_collIterate(coll, UCharOffset, -1, &temp);
|
||||
IInit_collIterate(coll, UCharOffset, noChars, &temp);
|
||||
temp.flags &= ~UCOL_ITER_NORM;
|
||||
temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
|
||||
|
||||
rawOffset = temp.pos - temp.string; // should always be zero?
|
||||
CE = ucol_IGetNextCE(coll, &temp, status);
|
||||
@ -3679,7 +3688,12 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
||||
}
|
||||
}
|
||||
|
||||
rawOffset = temp.pos - temp.string;
|
||||
if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
|
||||
rawOffset = temp.fcdPosition - temp.string;
|
||||
} else {
|
||||
rawOffset = temp.pos - temp.string;
|
||||
}
|
||||
|
||||
CE = ucol_IGetNextCE(coll, &temp, status);
|
||||
}
|
||||
|
||||
@ -4136,29 +4150,6 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
||||
}
|
||||
|
||||
case IMPLICIT_TAG: /* everything that is not defined otherwise */
|
||||
#if 0
|
||||
if (source->offsetBuffer == NULL) {
|
||||
source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
|
||||
source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}
|
||||
|
||||
// **** doesn't work if using iterator ****
|
||||
if (source->flags & UCOL_ITER_INNORMBUF) {
|
||||
source->offsetRepeatCount = 1;
|
||||
} else {
|
||||
int32_t firstOffset = (int32_t)(source->pos - source->string);
|
||||
|
||||
*(source->offsetStore++) = firstOffset;
|
||||
*(source->offsetStore++) = firstOffset + 1;
|
||||
|
||||
source->offsetReturn = source->offsetStore - 1;
|
||||
if (source->offsetReturn == source->offsetBuffer) {
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return getPrevImplicit(ch, source);
|
||||
|
||||
// TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1998-2008, International Business Machines
|
||||
* Copyright (C) 1998-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
@ -260,6 +260,8 @@ minimum number for special Jamo
|
||||
/* by index */
|
||||
#define UCOL_USE_ITERATOR 64
|
||||
|
||||
#define UCOL_FORCE_HAN_IMPLICIT 128
|
||||
|
||||
#define NFC_ZERO_CC_BLOCK_LIMIT_ 0x300
|
||||
|
||||
typedef struct collIterate {
|
||||
@ -390,6 +392,29 @@ uprv_init_pce(const struct UCollationElements *elems);
|
||||
(((uint32_t)(ch) - 0x1161) <= (0x1175 - 0x1161)) || \
|
||||
(((uint32_t)(ch) - 0x11A8) <= (0x11C2 - 0x11A8)))
|
||||
|
||||
/* Han character ranges */
|
||||
#define UCOL_FIRST_HAN 0x4E00
|
||||
#define UCOL_LAST_HAN 0x9FFF
|
||||
#define UCOL_FIRST_HAN_A 0x3400
|
||||
#define UCOL_LAST_HAN_A 0x4DBF
|
||||
#define UCOL_FIRST_HAN_COMPAT 0xFA0E /* start of the Han compatibility range; must not exceed UCOL_LAST_HAN_COMPAT (0xFA2F) */
|
||||
#define UCOL_LAST_HAN_COMPAT 0xFA2F
|
||||
|
||||
/* Han extension B is in plane 2 */
|
||||
#define UCOL_FIRST_HAN_B_LEAD 0xD840
|
||||
#define UCOL_FIRST_HAN_B_TRAIL 0xDC00
|
||||
#define UCOL_LAST_HAN_B_LEAD 0xD869
|
||||
#define UCOL_LAST_HAN_B_TRAIL 0xDEDF
|
||||
|
||||
/* Hangul range */
|
||||
#define UCOL_FIRST_HANGUL 0xAC00
|
||||
#define UCOL_LAST_HANGUL 0xD7AF
|
||||
|
||||
/* Jamo ranges */
|
||||
#define UCOL_FIRST_L_JAMO 0x1100
|
||||
#define UCOL_FIRST_V_JAMO 0x1161
|
||||
#define UCOL_FIRST_T_JAMO 0x11A8
|
||||
#define UCOL_LAST_T_JAMO 0x11F9
|
||||
|
||||
|
||||
#if 0
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2004-2008, International Business Machines
|
||||
* Copyright (C) 2004-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: ucol_sit.cpp
|
||||
@ -578,15 +578,15 @@ ucol_getShortDefinitionString(const UCollator *coll,
|
||||
if(elementSize) {
|
||||
// we should probably canonicalize here...
|
||||
elementSize = uloc_getLanguage(locBuff, tempbuff, internalBufferSize, status);
|
||||
appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, languageArg);
|
||||
appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, languageArg);
|
||||
elementSize = uloc_getCountry(locBuff, tempbuff, internalBufferSize, status);
|
||||
appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, regionArg);
|
||||
appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, regionArg);
|
||||
elementSize = uloc_getScript(locBuff, tempbuff, internalBufferSize, status);
|
||||
appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, scriptArg);
|
||||
appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, scriptArg);
|
||||
elementSize = uloc_getVariant(locBuff, tempbuff, internalBufferSize, status);
|
||||
appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, variantArg);
|
||||
appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, variantArg);
|
||||
elementSize = uloc_getKeywordValue(locBuff, "collation", tempbuff, internalBufferSize, status);
|
||||
appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, keywordArg);
|
||||
appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, keywordArg);
|
||||
}
|
||||
|
||||
int32_t i = 0;
|
||||
@ -597,7 +597,7 @@ ucol_getShortDefinitionString(const UCollator *coll,
|
||||
if(attribute != UCOL_DEFAULT) {
|
||||
char letter = ucol_sit_attributeValueToLetter(attribute, status);
|
||||
appendShortStringElement(&letter, 1,
|
||||
buffer, &resultSize, capacity, options[i].optionStart);
|
||||
buffer, &resultSize, /*capacity*/internalBufferSize, options[i].optionStart);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 2001-2008, International Business Machines
|
||||
* Copyright (C) 2001-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
******************************************************************************
|
||||
*
|
||||
@ -263,7 +263,14 @@ inline uint64_t processCE(UCollationElements *elems, uint32_t ce)
|
||||
primary = ucol_primaryOrder(ce);
|
||||
}
|
||||
|
||||
// Continuation?
|
||||
// **** This should probably handle continuations too. ****
|
||||
// **** That means that we need 24 bits for the primary ****
|
||||
// **** instead of the 16 that we're currently using. ****
|
||||
// **** So we can lay out the 64 bits as: 24.12.12.16. ****
|
||||
// **** Another complication with continuations is that ****
|
||||
// **** the *second* CE is marked as a continuation, so ****
|
||||
// **** we always have to peek ahead to know how long ****
|
||||
// **** the primary is... ****
|
||||
if (elems->pce->toShift && (elems->pce->variableTop > ce && primary != 0)
|
||||
|| (elems->pce->isShifted && primary == 0)) {
|
||||
|
||||
@ -285,7 +292,6 @@ inline uint64_t processCE(UCollationElements *elems, uint32_t ce)
|
||||
elems->pce->isShifted = FALSE;
|
||||
}
|
||||
|
||||
|
||||
return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
|
||||
}
|
||||
|
||||
@ -332,6 +338,7 @@ ucol_openElements(const UCollator *coll,
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucol_closeElements(UCollationElements *elems)
|
||||
{
|
||||
@ -375,7 +382,7 @@ ucol_reset(UCollationElements *elems)
|
||||
ci->endp = ci->string + u_strlen(ci->string);
|
||||
}
|
||||
ci->CEpos = ci->toReturn = ci->CEs;
|
||||
ci->flags = UCOL_ITER_HASLEN;
|
||||
ci->flags = (ci->flags & UCOL_FORCE_HAN_IMPLICIT) | UCOL_ITER_HASLEN;
|
||||
if (ci->coll->normalizationMode == UCOL_ON) {
|
||||
ci->flags |= UCOL_ITER_NORM;
|
||||
}
|
||||
@ -391,6 +398,21 @@ ucol_reset(UCollationElements *elems)
|
||||
ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucol_forceHanImplicit(UCollationElements *elems, UErrorCode *status)
|
||||
{
|
||||
if (U_FAILURE(*status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (elems == NULL) {
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
elems->iteratordata_.flags |= UCOL_FORCE_HAN_IMPLICIT;
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucol_next(UCollationElements *elems,
|
||||
UErrorCode *status)
|
||||
|
265
icu4c/source/i18n/unicode/bms.h
Normal file
265
icu4c/source/i18n/unicode/bms.h
Normal file
@ -0,0 +1,265 @@
|
||||
/*
|
||||
* Copyright (C) 1996-2009, International Business Machines Corporation and Others.
|
||||
* All rights reserved.
|
||||
*/
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C API: Boyer-Moore StringSearch prototype.
|
||||
* \internal
|
||||
*/
|
||||
|
||||
#ifndef _BMS_H
|
||||
#define _BMS_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/ucol.h"
|
||||
|
||||
/**
|
||||
* A <code>UCD</code> object holds the Collator-specific data needed to
|
||||
* compute the length of the shortest string that can
|
||||
* generate a particular list of CEs.
|
||||
*
|
||||
* <code>UCD</code> objects are quite expensive to compute. Because
|
||||
* of this, they are cached. When you call <code>ucd_open</code> it
|
||||
* returns a reference counted cached object. When you call <code>ucd_close</code>
|
||||
* the reference count on the object is decremented but the object is not deleted.
|
||||
*
|
||||
* If you do not need to reuse any unreferenced objects in the cache, you can call
|
||||
* <code>ucd_flushCache</code>. If you no longer need any <code>UCD</code>
|
||||
* objects, you can call <code>ucd_freeCache</code>
|
||||
*/
|
||||
typedef void UCD;
|
||||
|
||||
/**
|
||||
* Open a <code>UCD</code> object.
|
||||
*
|
||||
* @param coll - the collator
|
||||
* @param status - will be set if any errors occur.
|
||||
*
|
||||
* @return the <code>UCD</code> object. You must call
|
||||
* <code>ucd_close</code> when you are done using the object.
|
||||
*
|
||||
* Note: if on return status is set to an error, the only safe
|
||||
* thing to do with the returned object is to call <code>ucd_close</code>.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
U_CAPI UCD * U_EXPORT2
|
||||
ucd_open(UCollator *coll, UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Release a <code>UCD</code> object.
|
||||
*
|
||||
* @param ucd - the object
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
ucd_close(UCD *ucd);
|
||||
|
||||
/**
|
||||
* Get the <code>UCollator</code> object used to create a <code>UCD</code> object.
|
||||
* The <code>UCollator</code> object returned may not be the exact
|
||||
* object that was used to create this object, but it will have the
|
||||
* same behavior.
|
||||
*
|
||||
* @param ucd - the <code>UCD</code> object
|
||||
*
|
||||
* @return the <code>UCollator</code> used to create the given
|
||||
* <code>UCD</code> object.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
U_CAPI UCollator * U_EXPORT2
|
||||
ucd_getCollator(UCD *ucd);
|
||||
|
||||
/**
|
||||
* <code>UCD</code> objects are expensive to compute, and so
|
||||
* may be cached. This routine will free the cached objects and delete
|
||||
* the cache.
|
||||
*
|
||||
* WARNING: Don't call this until you have called <code>ucd_close</code>
|
||||
* for each <code>UCD</code> object that you have used. Also,
|
||||
* DO NOT call this if another thread may be calling <code>ucd_flushCache</code>
|
||||
* at the same time.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
ucd_freeCache();
|
||||
|
||||
/**
|
||||
* <code>UCD</code> objects are expensive to compute, and so
|
||||
* may be cached. This routine will remove any unused <code>UCD</code>
|
||||
* objects from the cache.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
ucd_flushCache();
|
||||
|
||||
/**
|
||||
* BMS
|
||||
*
|
||||
* This object holds the information needed to do a Collation sensitive Boyer-Moore search. It encapsulates
|
||||
* the pattern, the "bad character" and "good suffix" tables, the Collator-based data needed to compute them,
|
||||
* and a reference to the text being searched.
|
||||
*
|
||||
* To do a search, you first need to get a <code>UCD</code> object by calling <code>ucd_open</code>.
|
||||
* Then you construct a <code>BMS</code> object from the <code>UCD</code> object, the pattern
|
||||
* string and the target string. Then you call the <code>search</code> method. Here's a code sample:
|
||||
*
|
||||
* <pre>
|
||||
* void boyerMooreExample(UCollator *collator, UChar *pattern, int32_t patternLength, UChar *target, int32_t targetLength)
|
||||
* {
|
||||
* UErrorCode status = U_ZERO_ERROR;
|
||||
* int32_t offset = 0, start = -1, end = -1;
|
||||
* UCD *ucd = NULL;
|
||||
* BMS *bms = NULL;
|
||||
*
|
||||
* ucd = ucd_open(collator, &status);
|
||||
* if (U_FAILURE(status)) {
|
||||
* // could not create a UCD object
|
||||
* return;
|
||||
* }
|
||||
*
|
||||
* bms = bms_open(ucd, pattern, patternLength, target, targetLength, &status);
|
||||
* if (U_FAILURE(status)) {
|
||||
* // could not create a BMS object
|
||||
* ucd_close(ucd);
|
||||
* return;
|
||||
* }
|
||||
*
|
||||
*
|
||||
* // Find all matches
|
||||
* while (bms_search(bms, offset, &start, &end)) {
|
||||
* // process the match between start and end
|
||||
* ...
|
||||
*
|
||||
* // advance past the match
|
||||
* offset = end;
|
||||
* }
|
||||
*
|
||||
* // at this point, if offset == 0, there were no matches
|
||||
* if (offset == 0) {
|
||||
* // handle the case of no matches
|
||||
* }
|
||||
*
|
||||
* bms_close(bms);
|
||||
* ucd_close(ucd);
|
||||
*
|
||||
* // UCD objects are cached, so the call to
|
||||
* // ucd_close doesn't delete the object.
|
||||
* // Call this if you don't need the object any more.
|
||||
* ucd_flushCache();
|
||||
* }
|
||||
* </pre>
|
||||
*
|
||||
* NOTE: This is a technology preview. The final version of this API may not bear any resemblance to this API.
|
||||
*
|
||||
* Known limitations:
|
||||
* 1) Backwards searching has not been implemented.
|
||||
*
|
||||
* 2) For Han and Hangul characters, this code ignores any Collation tailorings. In general,
|
||||
* this isn't a problem, but in Korean locales, at strength 1, Hangul characters are tailored
|
||||
* to be equal to Han characters with the same pronunciation. Because this code ignores
|
||||
* tailorings, searching for a Hangul character will not find a Han character and vice versa.
|
||||
*
|
||||
* 3) In some cases, searching for a pattern that needs to be normalized and ends
|
||||
* in a discontiguous contraction may fail. The only known cases of this are with
|
||||
* the Tibetan script. For example searching for the pattern
|
||||
* "\u0F7F\u0F80\u0F81\u0F82\u0F83\u0F84\u0F85" will fail. (This case is artificial. We've
|
||||
* been unable to find a practical, real-world example of this failure.)
|
||||
*
|
||||
* NOTE: This is a technology preview. The final version of this API may not bear any resemblance to this API.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
struct BMS;
|
||||
typedef struct BMS BMS;
|
||||
|
||||
/**
|
||||
* Construct a <code>BMS</code> object.
|
||||
*
|
||||
* @param ucd - A <code>UCD</code> object holding the Collator-sensitive data
|
||||
* @param pattern - the string for which to search
|
||||
* @param patternLength - the length of the string for which to search
|
||||
* @param target - the string in which to search
|
||||
* @param targetLength - the length of the string in which to search
|
||||
* @param status - will be set if any errors occur.
|
||||
*
|
||||
* @return the <code>BMS</code> object.
|
||||
*
|
||||
* Note: if on return status is set to an error, the only safe
|
||||
* thing to do with the returned object is to call
|
||||
* <code>bms_close</code>.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
U_CAPI BMS * U_EXPORT2
|
||||
bms_open(UCD *ucd,
|
||||
const UChar *pattern, int32_t patternLength,
|
||||
const UChar *target, int32_t targetLength,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Close a <code>BMS</code> object and release all the
|
||||
* storage associated with it.
|
||||
*
|
||||
* @param bms - the <code>BMS</code> object to close.
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
bms_close(BMS *bms);
|
||||
|
||||
/**
|
||||
* Test the pattern to see if it generates any CEs.
|
||||
*
|
||||
* @return <code>TRUE</code> if the pattern string did not generate any CEs
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
bms_empty(BMS *bms);
|
||||
|
||||
/**
|
||||
* Get the <code>UCD</code> object used to create
|
||||
* a given <code>BMS</code> object.
|
||||
*
|
||||
* @param bms - the <code>BMS</code> object
|
||||
*
|
||||
* @return - the <code>UCD</code> object used to create
|
||||
* the given <code>BMS</code> object.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
U_CAPI UCD * U_EXPORT2
|
||||
bms_getData(BMS *bms);
|
||||
|
||||
/**
|
||||
* Search for the pattern string in the target string.
|
||||
*
|
||||
* @param offset - the offset in the target string at which to begin the search
|
||||
* @param start - will be set to the starting offset of the match, or -1 if there's no match
|
||||
* @param end - will be set to the ending offset of the match, or -1 if there's no match
|
||||
*
|
||||
* @return <code>TRUE</code> if the match succeeds, <code>FALSE</code> otherwise.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
bms_search(BMS *bms, int32_t offset, int32_t *start, int32_t *end);
|
||||
|
||||
/**
|
||||
* Set the target string for the match.
|
||||
*
|
||||
* @param target - the new target string
|
||||
* @param targetLength - the length of the new target string
|
||||
* @param status - will be set if any errors occur.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
bms_setTargetString(BMS *bms, const UChar *target, int32_t targetLength, UErrorCode *status);
|
||||
|
||||
#endif /* _BMS_H */
|
221
icu4c/source/i18n/unicode/bmsearch.h
Normal file
221
icu4c/source/i18n/unicode/bmsearch.h
Normal file
@ -0,0 +1,221 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 1996-2009, International Business Machines *
|
||||
* Corporation and others. All Rights Reserved. *
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C++ API: Boyer-Moore StringSearch technology preview
|
||||
* \internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
|
||||
#ifndef B_M_SEARCH_H
|
||||
#define B_M_SEARCH_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_COLLATION
|
||||
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/ucol.h"
|
||||
|
||||
#include "unicode/colldata.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class BadCharacterTable;
|
||||
class GoodSuffixTable;
|
||||
class Target;
|
||||
|
||||
/**
|
||||
* BoyerMooreSearch
|
||||
*
|
||||
* This object holds the information needed to do a Collation sensitive Boyer-Moore search. It encapsulates
|
||||
* the pattern, the "bad character" and "good suffix" tables, the Collator-based data needed to compute them,
|
||||
* and a reference to the text being searched.
|
||||
*
|
||||
* To do a search, you first need to get a <code>CollData</code> object by calling <code>CollData::open</code>.
|
||||
* Then you construct a <code>BoyerMooreSearch</code> object from the <code>CollData</code> object, the pattern
|
||||
* string and the target string. Then you call the <code>search</code> method. Here's a code sample:
|
||||
*
|
||||
* <pre>
|
||||
* void boyerMooreExample(UCollator *collator, UnicodeString *pattern, UnicodeString *target)
|
||||
* {
|
||||
* UErrorCode status = U_ZERO_ERROR;
|
||||
* CollData *collData = CollData::open(collator, status);
|
||||
*
|
||||
* if (U_FAILURE(status)) {
|
||||
* // could not create a CollData object
|
||||
* return;
|
||||
* }
|
||||
*
|
||||
* BoyerMooreSearch *search = new BoyerMooreSearch(collData, *pattern, target, status);
|
||||
*
|
||||
* if (U_FAILURE(status)) {
|
||||
* // could not create a BoyerMooreSearch object
|
||||
* CollData::close(collData);
|
||||
* return;
|
||||
* }
|
||||
*
|
||||
* int32_t offset = 0, start = -1, end = -1;
|
||||
*
|
||||
* // Find all matches
|
||||
* while (search->search(offset, start, end)) {
|
||||
* // process the match between start and end
|
||||
* ...
|
||||
* // advance past the match
|
||||
* offset = end;
|
||||
* }
|
||||
*
|
||||
* // at this point, if offset == 0, there were no matches
|
||||
* if (offset == 0) {
|
||||
* // handle the case of no matches
|
||||
* }
|
||||
*
|
||||
* delete search;
|
||||
* CollData::close(collData);
|
||||
*
|
||||
* // CollData objects are cached, so the call to
|
||||
* // CollData::close doesn't delete the object.
|
||||
* // Call this if you don't need the object any more.
|
||||
* CollData::flushCollDataCache();
|
||||
* }
|
||||
* </pre>
|
||||
*
|
||||
* NOTE: This is a technology preview. The final version of this API may not bear any resemblance to this API.
|
||||
*
|
||||
* Known limitations:
|
||||
* 1) Backwards searching has not been implemented.
|
||||
*
|
||||
* 2) For Han and Hangul characters, this code ignores any Collation tailorings. In general,
|
||||
* this isn't a problem, but in Korean locales, at strength 1, Hangul characters are tailored
|
||||
* to be equal to Han characters with the same pronunciation. Because this code ignores
|
||||
* tailorings, searching for a Hangul character will not find a Han character and vice versa.
|
||||
*
|
||||
* 3) In some cases, searching for a pattern that needs to be normalized and ends
|
||||
* in a discontiguous contraction may fail. The only known cases of this are with
|
||||
* the Tibetan script. For example searching for the pattern
|
||||
* "\u0F7F\u0F80\u0F81\u0F82\u0F83\u0F84\u0F85" will fail. (This case is artificial. We've
|
||||
* been unable to find a practical, real-world example of this failure.)
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*
|
||||
* @see CollData
|
||||
*/
|
||||
class U_I18N_API BoyerMooreSearch : public UObject
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Construct a <code>BoyerMooreSearch</code> object.
|
||||
*
|
||||
* @param theData - A <code>CollData</code> object holding the Collator-sensitive data
|
||||
* @param patternString - the string for which to search
|
||||
* @param targetString - the string in which to search or <code>NULL</code> if you will
|
||||
* set it later by calling <code>setTargetString</code>.
|
||||
* @param status - will be set if any errors occur.
|
||||
*
|
||||
* Note: if on return, status is set to an error code,
|
||||
* the only safe thing to do with this object is to call
|
||||
* the destructor.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* The destructor
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
~BoyerMooreSearch();
|
||||
|
||||
/**
|
||||
* Test the pattern to see if it generates any CEs.
|
||||
*
|
||||
* @return <code>TRUE</code> if the pattern string did not generate any CEs
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
UBool empty();
|
||||
|
||||
/**
|
||||
* Search for the pattern string in the target string.
|
||||
*
|
||||
* @param offset - the offset in the target string at which to begin the search
|
||||
* @param start - will be set to the starting offset of the match, or -1 if there's no match
|
||||
* @param end - will be set to the ending offset of the match, or -1 if there's no match
|
||||
*
|
||||
* @return <code>TRUE</code> if the match succeeds, <code>FALSE</code> otherwise.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
UBool search(int32_t offset, int32_t &start, int32_t &end);
|
||||
|
||||
/**
|
||||
* Set the target string for the match.
|
||||
*
|
||||
* @param targetString - the new target string
|
||||
* @param status - will be set if any errors occur.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
void setTargetString(const UnicodeString *targetString, UErrorCode &status);
|
||||
|
||||
// **** no longer need these? ****
|
||||
/**
|
||||
* Return the <code>CollData</code> object used for searching
|
||||
*
|
||||
* @return the <code>CollData</code> object used for searching
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
CollData *getData();
|
||||
|
||||
/**
|
||||
* Return the CEs generated by the pattern string.
|
||||
*
|
||||
* @return a <code>CEList</code> object holding the CEs generated by the pattern string.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
CEList *getPatternCEs();
|
||||
|
||||
/**
|
||||
* Return the <code>BadCharacterTable</code> object computed for the pattern string.
|
||||
*
|
||||
* @return the <code>BadCharacterTable</code> object.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
BadCharacterTable *getBadCharacterTable();
|
||||
|
||||
/**
|
||||
* Return the <code>GoodSuffixTable</code> object computed for the pattern string.
|
||||
*
|
||||
* @return the <code>GoodSuffixTable</code> object computed for the pattern string.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
GoodSuffixTable *getGoodSuffixTable();
|
||||
|
||||
/*
|
||||
* UObject glue...
|
||||
*/
|
||||
virtual UClassID getDynamicClassID() const;
|
||||
static UClassID getStaticClassID();
|
||||
|
||||
private:
|
||||
CollData *data;
|
||||
CEList *patCEs;
|
||||
BadCharacterTable *badCharacterTable;
|
||||
GoodSuffixTable *goodSuffixTable;
|
||||
UnicodeString pattern;
|
||||
Target *target;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // #if !UCONFIG_NO_COLLATION
|
||||
#endif // #ifndef B_M_SEARCH_H
|
430
icu4c/source/i18n/unicode/colldata.h
Normal file
430
icu4c/source/i18n/unicode/colldata.h
Normal file
@ -0,0 +1,430 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 1996-2009, International Business Machines *
|
||||
* Corporation and others. All Rights Reserved. *
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C++ API: Collation data used to compute minLengthInChars.
|
||||
* \internal
|
||||
*/
|
||||
|
||||
#ifndef COLL_DATA_H
|
||||
#define COLL_DATA_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_COLLATION
|
||||
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/ucol.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/*
|
||||
* The size of the internal buffer for the Collator's short description string.
|
||||
*/
|
||||
#define KEY_BUFFER_SIZE 64
|
||||
|
||||
/*
|
||||
* The size of the internal CE buffer in a <code>CEList</code> object
|
||||
*/
|
||||
#define CELIST_BUFFER_SIZE 4
|
||||
|
||||
/*
|
||||
* Define this to enable the <code>CEList</code> objects to collect
|
||||
* statistics.
|
||||
*/
|
||||
//#define INSTRUMENT_CELIST
|
||||
|
||||
/*
|
||||
* The size of the initial list in a <code>StringList</code> object.
|
||||
*/
|
||||
#define STRING_LIST_BUFFER_SIZE 16
|
||||
|
||||
/*
|
||||
* Define this to enable the <code>StringList</code> objects to
|
||||
* collect statistics.
|
||||
*/
|
||||
//#define INSTRUMENT_STRING_LIST
|
||||
|
||||
/**
|
||||
* CEList
|
||||
*
|
||||
* This object holds a list of CEs generated from a particular
|
||||
* <code>UnicodeString</code>
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
class U_I18N_API CEList : public UObject
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Construct a <code>CEList</code> object.
|
||||
*
|
||||
* @param coll - the Collator used to collect the CEs.
|
||||
* @param string - the string for which to collect the CEs.
|
||||
* @param status - will be set if any errors occur.
|
||||
*
|
||||
* Note: if on return, status is set to an error code,
|
||||
* the only safe thing to do with this object is to call
|
||||
* the destructor.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* The destructor.
|
||||
*/
|
||||
~CEList();
|
||||
|
||||
/**
|
||||
* Return the number of CEs in the list.
|
||||
*
|
||||
* @return the number of CEs in the list.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
int32_t size() const;
|
||||
|
||||
/**
|
||||
* Get a particular CE from the list.
|
||||
*
|
||||
* @param index - the index of the CE to return
|
||||
*
|
||||
* @return the CE, or <code>0</code> if <code>index</code> is out of range
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
uint32_t get(int32_t index) const;
|
||||
|
||||
/**
|
||||
* Check if the CEs in another <code>CEList</code> match the
|
||||
* suffix of this list starting at a given offset.
|
||||
*
|
||||
* @param offset - the offset of the suffix
|
||||
* @param other - the other <code>CEList</code>
|
||||
*
|
||||
* @return <code>TRUE</code> if the CEs match, <code>FALSE</code> otherwise.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
UBool matchesAt(int32_t offset, const CEList *other) const;
|
||||
|
||||
/**
|
||||
* The index operator.
|
||||
*
|
||||
* @param index - the index
|
||||
*
|
||||
* @return a reference to the given CE in the list
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
uint32_t &operator[](int32_t index) const;
|
||||
|
||||
/*
|
||||
* UObject glue...
|
||||
*/
|
||||
virtual UClassID getDynamicClassID() const;
|
||||
static UClassID getStaticClassID();
|
||||
|
||||
private:
|
||||
void add(uint32_t ce, UErrorCode &status);
|
||||
|
||||
uint32_t ceBuffer[CELIST_BUFFER_SIZE];
|
||||
uint32_t *ces;
|
||||
int32_t listMax;
|
||||
int32_t listSize;
|
||||
|
||||
#ifdef INSTRUMENT_CELIST
|
||||
static int32_t _active;
|
||||
static int32_t _histogram[10];
|
||||
#endif
|
||||
};
|
||||
|
||||
/**
|
||||
* StringList
|
||||
*
|
||||
* This object holds a list of <code>UnicodeString</code> objects.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
class U_I18N_API StringList : public UObject
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Construct an empty <code>StringList</code>
|
||||
*
|
||||
* @param status - will be set if any errors occur.
|
||||
*
|
||||
* Note: if on return, status is set to an error code,
|
||||
* the only safe thing to do with this object is to call
|
||||
* the destructor.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
StringList(UErrorCode &status);
|
||||
|
||||
/**
|
||||
* The destructor.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
~StringList();
|
||||
|
||||
/**
|
||||
* Add a string to the list.
|
||||
*
|
||||
* @param string - the string to add
|
||||
* @param status - will be set if any errors occur.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
void add(const UnicodeString *string, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* Add an array of Unicode code points to the list.
|
||||
*
|
||||
* @param chars - the address of the array of code points
|
||||
* @param count - the number of code points in the array
|
||||
* @param status - will be set if any errors occur.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
void add(const UChar *chars, int32_t count, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* Get a particular string from the list.
|
||||
*
|
||||
* @param index - the index of the string
|
||||
*
|
||||
* @return a pointer to the <code>UnicodeString</code> or <code>NULL</code>
|
||||
* if <code>index</code> is out of bounds.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
const UnicodeString *get(int32_t index) const;
|
||||
|
||||
/**
|
||||
* Get the number of strings in the list.
|
||||
*
|
||||
* @return the number of strings in the list.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
int32_t size() const;
|
||||
|
||||
/*
|
||||
* the UObject glue...
|
||||
*/
|
||||
virtual UClassID getDynamicClassID() const;
|
||||
static UClassID getStaticClassID();
|
||||
|
||||
private:
|
||||
UnicodeString *strings;
|
||||
int32_t listMax;
|
||||
int32_t listSize;
|
||||
|
||||
#ifdef INSTRUMENT_STRING_LIST
|
||||
static int32_t _lists;
|
||||
static int32_t _strings;
|
||||
static int32_t _histogram[101];
|
||||
#endif
|
||||
};
|
||||
|
||||
/*
|
||||
* Forward references to internal classes.
|
||||
*/
|
||||
class StringToCEsMap;
|
||||
class CEToStringsMap;
|
||||
class CollDataCache;
|
||||
|
||||
/**
|
||||
* CollData
|
||||
*
|
||||
* This class holds the Collator-specific data needed to
|
||||
* compute the length of the shortest string that can
|
||||
* generate a particular list of CEs.
|
||||
*
|
||||
* <code>CollData</code> objects are quite expensive to compute. Because
|
||||
* of this, they are cached. When you call <code>CollData::open</code> it
|
||||
* returns a reference counted cached object. When you call <code>CollData::close</code>
|
||||
* the reference count on the object is decremented but the object is not deleted.
|
||||
*
|
||||
* If you do not need to reuse any unreferenced objects in the cache, you can call
|
||||
* <code>CollData::flushCollDataCache</code>. If you no longer need any <code>CollData</code>
|
||||
* objects, you can call <code>CollData::freeCollDataCache</code>
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
class U_I18N_API CollData : public UObject
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Construct a <code>CollData</code> object.
|
||||
*
|
||||
* @param collator - the collator
|
||||
* @param status - will be set if any errors occur.
|
||||
*
|
||||
* @return the <code>CollData</code> object. You must call
|
||||
* <code>close</code> when you are done using the object.
|
||||
*
|
||||
* Note: if on return, status is set to an error code,
|
||||
* the only safe thing to do with this object is to call
|
||||
* <code>CollData::close</code>.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
static CollData *open(UCollator *collator, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* Release a <code>CollData</code> object.
|
||||
*
|
||||
* @param collData - the object
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
static void close(CollData *collData);
|
||||
|
||||
/**
|
||||
* Get the <code>UCollator</code> object used to create this object.
|
||||
* The object returned may not be the exact object that was used to
|
||||
* create this object, but it will have the same behavior.
|
||||
*/
|
||||
UCollator *getCollator() const;
|
||||
|
||||
/**
|
||||
* Get a list of all the strings which generate a list
|
||||
* of CEs starting with a given CE.
|
||||
*
|
||||
* @param ce - the CE
|
||||
*
|
||||
* @return a <code>StringList</code> object containing all
|
||||
* the strings, or <code>NULL</code> if there are
|
||||
* no such strings.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview.
|
||||
*/
|
||||
const StringList *getStringList(int32_t ce) const;
|
||||
|
||||
/**
|
||||
* Get a list of the CEs generated by a particular string.
|
||||
*
|
||||
* @param string - the string
|
||||
*
|
||||
* @return a <code>CEList</code> object containing the CEs. You
|
||||
* must call <code>freeCEList</code> when you are finished
|
||||
* using the <code>CEList</code>.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview.
|
||||
*/
|
||||
const CEList *getCEList(const UnicodeString *string) const;
|
||||
|
||||
/**
|
||||
* Release a <code>CEList</code> returned by <code>getCEList</code>.
|
||||
*
|
||||
* @param list - the <code>CEList</code> to free.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
void freeCEList(const CEList *list);
|
||||
|
||||
/**
|
||||
* Return the length of the shortest string that will generate
|
||||
* the given list of CEs.
|
||||
*
|
||||
* @param ces - the CEs
|
||||
* @param offset - the offset of the first CE in the list to use.
|
||||
*
|
||||
* @return the length of the shortest string.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
int32_t minLengthInChars(const CEList *ces, int32_t offset) const;
|
||||
|
||||
|
||||
/**
|
||||
* Return the length of the shortest string that will generate
|
||||
* the given list of CEs.
|
||||
*
|
||||
* Note: the algorithm used to do this computation is recursive. To
|
||||
* limit the amount of recursion, a "history" list is used to record
|
||||
* the best answer starting at a particular offset in the list of CEs.
|
||||
* If the same offset is visited again during the recursion, the answer
|
||||
* in the history list is used.
|
||||
*
|
||||
* @param ces - the CEs
|
||||
* @param offset - the offset of the first CE in the list to use.
|
||||
* @param history - the history list. Must be at least as long as
|
||||
* the number of CEs in the <code>CEList</code>
|
||||
*
|
||||
* @return the length of the shortest string.
|
||||
*
|
||||
* @internal ICU 4.0.1 technology preview
|
||||
*/
|
||||
int32_t minLengthInChars(const CEList *ces, int32_t offset, int32_t *history) const;
|
||||
|
||||
/*
|
||||
* UObject glue...
|
||||
*/
|
||||
virtual UClassID getDynamicClassID() const;
|
||||
static UClassID getStaticClassID();
|
||||
|
||||
/**
|
||||
* <code>CollData</code> objects are expensive to compute, and so
|
||||
* may be cached. This routine will free the cached objects and delete
|
||||
* the cache.
|
||||
*
|
||||
* WARNING: Don't call this until you have called <code>close</code>
|
||||
* for each <code>CollData</code> object that you have used. Also,
|
||||
* DO NOT call this if another thread may be calling <code>flushCollDataCache</code>
|
||||
* at the same time.
|
||||
*
|
||||
* @internal 4.0.1 technology preview
|
||||
*/
|
||||
static void freeCollDataCache();
|
||||
|
||||
/**
|
||||
* <code>CollData</code> objects are expensive to compute, and so
|
||||
* may be cached. This routine will remove any unused <code>CollData</code>
|
||||
* objects from the cache.
|
||||
*
|
||||
* @internal 4.0.1 technology preview
|
||||
*/
|
||||
static void flushCollDataCache();
|
||||
|
||||
private:
|
||||
friend class CollDataCache;
|
||||
friend class CollDataCacheEntry;
|
||||
|
||||
CollData(UCollator *collator, char *cacheKey, int32_t cachekeyLength, UErrorCode &status);
|
||||
~CollData();
|
||||
|
||||
CollData();
|
||||
|
||||
static char *getCollatorKey(UCollator *collator, char *buffer, int32_t bufferLength);
|
||||
|
||||
static CollDataCache *getCollDataCache();
|
||||
|
||||
UCollator *coll;
|
||||
StringToCEsMap *charsToCEList;
|
||||
CEToStringsMap *ceToCharsStartingWith;
|
||||
|
||||
char keyBuffer[KEY_BUFFER_SIZE];
|
||||
char *key;
|
||||
|
||||
static CollDataCache *collDataCache;
|
||||
|
||||
uint32_t minHan;
|
||||
uint32_t maxHan;
|
||||
|
||||
uint32_t jamoLimits[4];
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // #if !UCONFIG_NO_COLLATION
|
||||
#endif // #ifndef COLL_DATA_H
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2001-2008, International Business Machines
|
||||
* Copyright (C) 2001-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*
|
||||
@ -121,6 +121,7 @@ ucol_openElements(const UCollator *coll,
|
||||
int32_t textLength,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
/**
|
||||
* get a hash code for a key... Not very useful!
|
||||
* @param key the given key.
|
||||
@ -152,6 +153,20 @@ ucol_closeElements(UCollationElements *elems);
|
||||
U_STABLE void U_EXPORT2
|
||||
ucol_reset(UCollationElements *elems);
|
||||
|
||||
/**
|
||||
* Set the collation elements to use implicit ordering for Han
|
||||
* even if they've been tailored. This will also force Hangul
|
||||
* syllables to be ordered by decomposing them to their component
|
||||
* Jamo.
|
||||
*
|
||||
* @param elems The UCollationElements containing the text.
|
||||
* @param status A pointer to a UErrorCode to receive any errors.
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
U_INTERNAL void U_EXPORT2
|
||||
ucol_forceHanImplicit(UCollationElements *elems, UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Get the ordering priority of the next collation element in the text.
|
||||
* A single character may contain more than one collation element.
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2001-2008 IBM and others. All rights reserved.
|
||||
* Copyright (C) 2001-2009 IBM and others. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 07/02/2001 synwee Creation.
|
||||
@ -3785,7 +3785,7 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
|
||||
found = TRUE;
|
||||
// Inner loop checks for a match beginning at each
|
||||
// position from the outer loop.
|
||||
for (patIx=0; patIx<strsrch->pattern.CELength; patIx++) {
|
||||
for (patIx=0; patIx<strsrch->pattern.PCELength; patIx++) {
|
||||
int64_t patCE = strsrch->pattern.PCE[patIx];
|
||||
targetCEI = ceb.get(targetIx+patIx);
|
||||
// Compare CE from target string with CE from the pattern.
|
||||
@ -3814,11 +3814,9 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
|
||||
// an acceptable character range.
|
||||
//
|
||||
const CEI *firstCEI = ceb.get(targetIx);
|
||||
const CEI *lastCEI = ceb.get(targetIx + strsrch->pattern.CELength - 1);
|
||||
const CEI *nextCEI = ceb.get(targetIx + strsrch->pattern.CELength);
|
||||
const CEI *lastCEI = ceb.get(targetIx + strsrch->pattern.PCELength - 1);
|
||||
const CEI *nextCEI = ceb.get(targetIx + strsrch->pattern.PCELength);
|
||||
|
||||
// targetCEI = ceb.get(targetIx+strsrch->pattern.CELength);
|
||||
// maxLimit = targetCEI->lowIndex;
|
||||
mStart = firstCEI->lowIndex;
|
||||
minLimit = lastCEI->lowIndex;
|
||||
maxLimit = nextCEI->lowIndex;
|
||||
@ -3883,7 +3881,7 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
|
||||
found = FALSE;
|
||||
}
|
||||
|
||||
if (!checkIdentical(strsrch, mStart, mLimit)) {
|
||||
if (! checkIdentical(strsrch, mStart, mLimit)) {
|
||||
found = FALSE;
|
||||
}
|
||||
|
||||
@ -4006,10 +4004,10 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
|
||||
found = TRUE;
|
||||
// Inner loop checks for a match beginning at each
|
||||
// position from the outer loop.
|
||||
for (patIx = strsrch->pattern.CELength - 1; patIx >= 0; patIx -= 1) {
|
||||
for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) {
|
||||
int64_t patCE = strsrch->pattern.PCE[patIx];
|
||||
|
||||
targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.CELength - 1 - patIx);
|
||||
targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx);
|
||||
// Compare CE from target string with CE from the pattern.
|
||||
// Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
|
||||
// which will fail the compare, below.
|
||||
@ -4035,7 +4033,7 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
|
||||
// There still is a chance of match failure if the CE range not correspond to
|
||||
// an acceptable character range.
|
||||
//
|
||||
const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.CELength - 1);
|
||||
const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1);
|
||||
const CEI *lastCEI = ceb.getPrevious(targetIx);
|
||||
const CEI *nextCEI = targetIx > 0? ceb.getPrevious(targetIx - 1) : NULL;
|
||||
|
||||
@ -4102,6 +4100,10 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
|
||||
found = FALSE;
|
||||
}
|
||||
|
||||
if (! checkIdentical(strsrch, mStart, mLimit)) {
|
||||
found = FALSE;
|
||||
}
|
||||
|
||||
if (found) {
|
||||
break;
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2008, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2009, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/*******************************************************************************
|
||||
@ -515,7 +515,7 @@ backAndForth(UCollationElements *iter)
|
||||
}
|
||||
|
||||
if (o != orders[index].order) {
|
||||
log_err("Mismatched order at index %d: 0x%0:8X vs. 0x%0:8X\n", index,
|
||||
log_err("Mismatched order at index %d: 0x%8.8X vs. 0x%8.8X\n", index,
|
||||
orders[index].order, o);
|
||||
goto bail;
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2005-2008, International Business Machines
|
||||
* Copyright (C) 2005-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
@ -11,6 +11,7 @@
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/ucol.h"
|
||||
#include "unicode/bmsearch.h"
|
||||
|
||||
#include "intltest.h"
|
||||
|
||||
@ -34,10 +35,24 @@ public:
|
||||
virtual void offsetTest();
|
||||
virtual void monkeyTest(char *params);
|
||||
|
||||
virtual void bmMonkeyTest(char *params);
|
||||
virtual void boyerMooreTest();
|
||||
virtual void goodSuffixTest();
|
||||
virtual void searchTime();
|
||||
|
||||
virtual void bmsTest();
|
||||
virtual void bmSearchTest();
|
||||
|
||||
virtual void udhrTest();
|
||||
|
||||
private:
|
||||
virtual const char *getPath(char buffer[2048], const char *filename);
|
||||
virtual int32_t monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
|
||||
const char *name, const char *strength, uint32_t seed);
|
||||
|
||||
virtual int32_t bmMonkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
|
||||
BoyerMooreSearch *bms, BoyerMooreSearch *abms,
|
||||
const char *name, const char *strength, uint32_t seed);
|
||||
#endif
|
||||
|
||||
};
|
||||
|
@ -1,6 +1,6 @@
|
||||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (C) 2008 IBM, Inc. All Rights Reserved.
|
||||
* Copyright (C) 2008-2009 IBM, Inc. All Rights Reserved.
|
||||
*
|
||||
********************************************************************/
|
||||
/**
|
||||
@ -14,7 +14,13 @@
|
||||
StringSearchPerformanceTest::StringSearchPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status)
|
||||
:UPerfTest(argc,argv,status){
|
||||
int32_t start, end;
|
||||
|
||||
#ifdef TEST_BOYER_MOORE_SEARCH
|
||||
bms = NULL;
|
||||
#else
|
||||
srch = NULL;
|
||||
#endif
|
||||
|
||||
pttrn = NULL;
|
||||
if(status== U_ILLEGAL_ARGUMENT_ERROR || line_mode){
|
||||
fprintf(stderr,gUsageString, "strsrchperf");
|
||||
@ -22,7 +28,8 @@ StringSearchPerformanceTest::StringSearchPerformanceTest(int32_t argc, const cha
|
||||
}
|
||||
/* Get the Text */
|
||||
src = getBuffer(srcLen, status);
|
||||
|
||||
|
||||
#if 0
|
||||
/* Get a word to find. Do this by selecting a random word with a word breakiterator. */
|
||||
UBreakIterator* brk = ubrk_open(UBRK_WORD, locale, src, srcLen, &status);
|
||||
if(U_FAILURE(status)){
|
||||
@ -38,9 +45,38 @@ StringSearchPerformanceTest::StringSearchPerformanceTest(int32_t argc, const cha
|
||||
}
|
||||
pttrn = temp; /* store word in pttrn */
|
||||
ubrk_close(brk);
|
||||
#else
|
||||
/* The first line of the file contains the pattern */
|
||||
start = 0;
|
||||
|
||||
for(end = start; ; end += 1) {
|
||||
UChar ch = src[end];
|
||||
|
||||
if (ch == 0x000A || ch == 0x000D || ch == 0x2028) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
pttrnLen = end - start;
|
||||
UChar* temp = (UChar*)malloc(sizeof(UChar)*(pttrnLen));
|
||||
for (int i = 0; i < pttrnLen; i++) {
|
||||
temp[i] = src[start++];
|
||||
}
|
||||
pttrn = temp; /* store word in pttrn */
|
||||
#endif
|
||||
|
||||
#ifdef TEST_BOYER_MOORE_SEARCH
|
||||
UnicodeString patternString(pttrn, pttrnLen);
|
||||
UCollator *coll = ucol_open(locale, &status);
|
||||
CollData *data = CollData::open(coll, status);
|
||||
|
||||
targetString = new UnicodeString(src, srcLen);
|
||||
bms = new BoyerMooreSearch(data, patternString, targetString, status);
|
||||
#else
|
||||
/* Create the StringSearch object to be use in performance test. */
|
||||
srch = usearch_open(pttrn, pttrnLen, src, srcLen, locale, NULL, &status);
|
||||
#endif
|
||||
|
||||
if(U_FAILURE(status)){
|
||||
fprintf(stderr, "FAILED to create UPerfTest object. Error: %s\n", u_errorName(status));
|
||||
return;
|
||||
@ -49,12 +85,23 @@ StringSearchPerformanceTest::StringSearchPerformanceTest(int32_t argc, const cha
|
||||
}
|
||||
|
||||
/**
 * Release the resources held by the performance test.
 *
 * In the Boyer-Moore build (TEST_BOYER_MOORE_SEARCH defined) this deletes the
 * search object and the target string, then closes the CollData and the
 * collator obtained from it. In the default build it closes the UStringSearch.
 * The pattern buffer is freed in both builds.
 */
StringSearchPerformanceTest::~StringSearchPerformanceTest() {
#ifdef TEST_BOYER_MOORE_SEARCH
    // The constructor initialises bms to NULL and only assigns it (right after
    // targetString) once setup has got that far, so everything that hangs off
    // the search object is released only when it actually exists.
    if (bms != NULL) {
        CollData  *data = bms->getData();
        UCollator *coll = (data != NULL) ? data->getCollator() : NULL;

        delete bms;
        delete targetString;

        // Close the CollData before the collator it was opened with.
        if (data != NULL) {
            CollData::close(data);
        }

        if (coll != NULL) {
            ucol_close(coll);
        }
    }
#else
    if (srch != NULL) {
        usearch_close(srch);
    }
#endif

    if (pttrn != NULL) {
        free(pttrn);
    }
}
|
||||
|
||||
UPerfFunction* StringSearchPerformanceTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char *par) {
|
||||
@ -70,12 +117,20 @@ UPerfFunction* StringSearchPerformanceTest::runIndexedTest(int32_t index, UBool
|
||||
}
|
||||
|
||||
UPerfFunction* StringSearchPerformanceTest::Test_ICU_Forward_Search(){
|
||||
#ifdef TEST_BOYER_MOORE_SEARCH
|
||||
StringSearchPerfFunction *func = new StringSearchPerfFunction(ICUForwardSearch, bms, src, srcLen, pttrn, pttrnLen);
|
||||
#else
|
||||
StringSearchPerfFunction* func = new StringSearchPerfFunction(ICUForwardSearch, srch, src, srcLen, pttrn, pttrnLen);
|
||||
#endif
|
||||
return func;
|
||||
}
|
||||
|
||||
UPerfFunction* StringSearchPerformanceTest::Test_ICU_Backward_Search(){
|
||||
#ifdef TEST_BOYER_MOORE_SEARCH
|
||||
StringSearchPerfFunction *func = new StringSearchPerfFunction(ICUBackwardSearch, bms, src, srcLen, pttrn, pttrnLen);
|
||||
#else
|
||||
StringSearchPerfFunction* func = new StringSearchPerfFunction(ICUBackwardSearch, srch, src, srcLen, pttrn, pttrnLen);
|
||||
#endif
|
||||
return func;
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (C) 2008 IBM, Inc. All Rights Reserved.
|
||||
* Copyright (C) 2008-2009 IBM, Inc. All Rights Reserved.
|
||||
*
|
||||
********************************************************************/
|
||||
#ifndef _STRSRCHPERF_H
|
||||
@ -8,11 +8,19 @@
|
||||
|
||||
#include "unicode/ubrk.h"
|
||||
#include "unicode/usearch.h"
|
||||
#include "unicode/colldata.h"
|
||||
#include "unicode/bmsearch.h"
|
||||
#include "unicode/uperf.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#define TEST_BOYER_MOORE_SEARCH
|
||||
|
||||
#ifdef TEST_BOYER_MOORE_SEARCH
|
||||
typedef void (*StrSrchFn) (BoyerMooreSearch * bms, const UChar *src, int32_t srcLen, const UChar *pttrn, int32_t pttrnLen, UErrorCode *status);
|
||||
#else
|
||||
typedef void (*StrSrchFn)(UStringSearch* srch, const UChar* src,int32_t srcLen, const UChar* pttrn, int32_t pttrnLen, UErrorCode* status);
|
||||
#endif
|
||||
|
||||
class StringSearchPerfFunction : public UPerfFunction {
|
||||
private:
|
||||
@ -21,17 +29,39 @@ private:
|
||||
int32_t srcLen;
|
||||
const UChar* pttrn;
|
||||
int32_t pttrnLen;
|
||||
#ifdef TEST_BOYER_MOORE_SEARCH
|
||||
BoyerMooreSearch *bms;
|
||||
#else
|
||||
UStringSearch* srch;
|
||||
#endif
|
||||
|
||||
public:
|
||||
virtual void call(UErrorCode* status) {
|
||||
#ifdef TEST_BOYER_MOORE_SEARCH
|
||||
(*fn)(bms, src, srcLen, pttrn, pttrnLen, status);
|
||||
#else
|
||||
(*fn)(srch, src, srcLen, pttrn, pttrnLen, status);
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual long getOperationsPerIteration() {
|
||||
#if 0
|
||||
return (long)(srcLen/pttrnLen);
|
||||
#else
|
||||
return (long) srcLen;
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef TEST_BOYER_MOORE_SEARCH
|
||||
StringSearchPerfFunction(StrSrchFn func, BoyerMooreSearch *search, const UChar *source, int32_t sourceLen, const UChar *pattern, int32_t patternLen) {
|
||||
fn = func;
|
||||
src = source;
|
||||
srcLen = sourceLen;
|
||||
pttrn = pattern;
|
||||
pttrnLen = patternLen;
|
||||
bms = search;
|
||||
}
|
||||
#else
|
||||
StringSearchPerfFunction(StrSrchFn func, UStringSearch* search, const UChar* source,int32_t sourceLen, const UChar* pattern, int32_t patternLen) {
|
||||
fn = func;
|
||||
src = source;
|
||||
@ -40,6 +70,7 @@ public:
|
||||
pttrnLen = patternLen;
|
||||
srch = search;
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
class StringSearchPerformanceTest : public UPerfTest {
|
||||
@ -48,7 +79,12 @@ private:
|
||||
int32_t srcLen;
|
||||
UChar* pttrn;
|
||||
int32_t pttrnLen;
|
||||
#ifdef TEST_BOYER_MOORE_SEARCH
|
||||
UnicodeString *targetString;
|
||||
BoyerMooreSearch *bms;
|
||||
#else
|
||||
UStringSearch* srch;
|
||||
#endif
|
||||
|
||||
public:
|
||||
StringSearchPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status);
|
||||
@ -56,9 +92,29 @@ public:
|
||||
virtual UPerfFunction* runIndexedTest(int32_t index, UBool exec, const char *&name, char *par = NULL);
|
||||
|
||||
UPerfFunction* Test_ICU_Forward_Search();
|
||||
|
||||
UPerfFunction* Test_ICU_Backward_Search();
|
||||
};
|
||||
|
||||
|
||||
#ifdef TEST_BOYER_MOORE_SEARCH
|
||||
void ICUForwardSearch(BoyerMooreSearch *bms, const UChar *source, int32_t sourceLen, const UChar *pattern, int32_t patternLen, UErrorCode * /*status*/) {
|
||||
int32_t offset = 0, start = -1, end = -1;
|
||||
|
||||
while (bms->search(offset, start, end)) {
|
||||
offset = end;
|
||||
}
|
||||
}
|
||||
|
||||
void ICUBackwardSearch(BoyerMooreSearch *bms, const UChar *source, int32_t sourceLen, const UChar *pattern, int32_t patternLen, UErrorCode * /*status*/) {
|
||||
int32_t offset = 0, start = -1, end = -1;
|
||||
|
||||
/* NOTE: No Boyer-Moore backward search yet... */
|
||||
while (bms->search(offset, start, end)) {
|
||||
offset = end;
|
||||
}
|
||||
}
|
||||
#else
|
||||
void ICUForwardSearch(UStringSearch *srch, const UChar* source, int32_t sourceLen, const UChar* pattern, int32_t patternLen, UErrorCode* status) {
|
||||
int32_t match;
|
||||
|
||||
@ -76,5 +132,6 @@ void ICUBackwardSearch(UStringSearch *srch, const UChar* source, int32_t sourceL
|
||||
match = usearch_previous(srch, status);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _STRSRCHPERF_H */
|
||||
|
42
icu4c/source/test/testdata/ssearch.xml
vendored
42
icu4c/source/test/testdata/ssearch.xml
vendored
@ -1,6 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
|
||||
<!-- Copyright (c) 2007-2008 IBM Corporation and others. All rights reserved -->
|
||||
<!-- Copyright (c) 2007-2009 IBM Corporation and others. All rights reserved -->
|
||||
|
||||
<!-- Test data file for string search -->
|
||||
<!DOCTYPE stringsearch-tests [
|
||||
@ -12,6 +12,7 @@
|
||||
locale CDATA "en"
|
||||
strength (PRIMARY | SECONDARY | TERTIARY | QUATERNARY | IDENTICAL) "TERTIARY"
|
||||
norm (ON | OFF) "OFF"
|
||||
alternate_handling (NON_IGNORABLE | SHIFTED) "NON_IGNORABLE"
|
||||
>
|
||||
|
||||
<!ELEMENT pattern (#PCDATA)>
|
||||
@ -20,7 +21,7 @@
|
||||
<!ELEMENT post (#PCDATA)>
|
||||
]>
|
||||
|
||||
<stringsearch-tests debug="test32">
|
||||
<stringsearch-tests>
|
||||
<!-- debug="test11" (for copying into the above element) -->
|
||||
|
||||
<!-- Very simple match -->
|
||||
@ -174,8 +175,15 @@
|
||||
<pattern>A\u0300</pattern>
|
||||
<pre>At IDENTICAL, shoud this match? </pre><m>\u00c0</m><post></post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test25" strength="SECONDARY" locale="en">
|
||||
|
||||
<test-case id="test24b" strength="IDENTICAL" alternate_handling="SHIFTED" locale="en">
|
||||
<pattern>A\u0300</pattern>
|
||||
<pre>At IDENTICAL, shoud this match? </pre>
|
||||
<m>\u00c0</m>
|
||||
<post></post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test25" strength="SECONDARY" locale="en">
|
||||
<pattern>Ű</pattern>
|
||||
<pre>12</pre><m>ű</m><post> Ű</post>
|
||||
</test-case>
|
||||
@ -285,11 +293,13 @@
|
||||
|
||||
|
||||
<!-- Long combining sequences -->
|
||||
<!-- Backwards search fails because patterns ends w/ ignorables
|
||||
<test-case id="test60" strength="PRIMARY">
|
||||
<pattern>A\u0301\u0301\u0301\u0301</pattern>
|
||||
<m>A\u0301\u0301\u0301\u0301\u0301</m>
|
||||
</test-case>
|
||||
|
||||
-->
|
||||
|
||||
<test-case id="test61" strength="TERTIARY">
|
||||
<pattern>A\u0301\u0301\u0301\u0301</pattern>
|
||||
<pre>A\u0301\u0301\u0301\u0301\u0301</pre>
|
||||
@ -409,5 +419,27 @@
|
||||
<pattern>VII</pattern>
|
||||
<m>\u2166</m>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test83" strength="IDENTICAL" alternate_handling="SHIFTED" locale="en">
|
||||
<pattern>Universal Declaration of Human Rights</pattern>
|
||||
<pre>Proclaims this </pre><m>Universal Declaration of Human Rights</m><post> as a common standard of achievement for all peoples and all nations</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test83b" strength="TERTIARY" alternate_handling="SHIFTED" locale="en">
|
||||
<pattern>Universal Declaration of Human Rights</pattern>
|
||||
<pre>Proclaims this </pre>
|
||||
<m>Universal-Declaration-of-Human-Rights</m>
|
||||
<post> as a common standard of achievement for all peoples and all nations</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test84" strength="TERTIARY" locale="en">
|
||||
<pattern>\u05E9\u0591\u05E9</pattern>
|
||||
<m>\u05E9\u0592\u05E9</m>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test84b" strength="IDENTICAL" locale="en">
|
||||
<pattern>\u05E9\u0591\u05E9</pattern>
|
||||
<pre>\u05E9\u0592\u05E9</pre>
|
||||
</test-case>
|
||||
</stringsearch-tests>
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user