/*
**********************************************************************
*   Copyright (C) 2001-2014 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*  08/13/2001   synwee      Creation.
**********************************************************************
*/

#ifndef USRCHIMP_H
#define USRCHIMP_H

#include "unicode/utypes.h"

/* Everything up to the matching #endif is compiled only when collation is enabled. */
#if !UCONFIG_NO_COLLATION

#include "unicode/normalizer2.h"
#include "unicode/ucol.h"
#include "unicode/ucoleitr.h"
#include "unicode/ubrk.h"

/*
 * Bit layout of a 32-bit collation element (CE) as encoded by the masks
 * and shifts below:
 *   bits 31..16  primary weight
 *   bits 15..8   secondary weight
 *   bits  7..0   tertiary weight
 */

/* mask off anything but primary order */
#define UCOL_PRIMARYORDERMASK    0xffff0000
/* mask off anything but secondary order */
#define UCOL_SECONDARYORDERMASK  0x0000ff00
/* mask off anything but tertiary order */
#define UCOL_TERTIARYORDERMASK   0x000000ff
/* primary order shift */
#define UCOL_PRIMARYORDERSHIFT   16
/* secondary order shift */
#define UCOL_SECONDARYORDERSHIFT 8

/* NOTE(review): presumably the CE value of a completely ignorable element -- confirm against callers. */
#define UCOL_IGNORABLE 0

/*
 * get weights from a CE
 *
 * The primary accessor shifts first and masks afterwards, so a negative
 * (sign-extended) signed argument still yields a value in 0..0xffff.
 */
#define UCOL_PRIMARYORDER(order)   (((order) >> UCOL_PRIMARYORDERSHIFT) & 0xffff)
#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK) >> UCOL_SECONDARYORDERSHIFT)
#define UCOL_TERTIARYORDER(order)  ((order) & UCOL_TERTIARYORDERMASK)

/* The two most significant bits of the tertiary byte. */
#define UCOL_CONTINUATION_MARKER 0xC0

/* Non-zero when every bit of UCOL_CONTINUATION_MARKER is set in (CE). */
#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)

/**
 * This indicates an error has occurred during processing or there are no more CEs
 * to be returned.
 */
#define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX)

U_NAMESPACE_BEGIN

/* Forward declarations; this header only names these types through a pointer or a reference. */
class CollationElementIterator;
class Collator;

/**
 * One buffered entry: a 64-bit collation element value plus two int32_t
 * index values. PCEBuffer stores and returns entries of this type.
 */
struct PCEI
{
    uint64_t ce;    /* 64-bit collation element value */
    int32_t  low;   /* NOTE(review): presumably the ixLow given to PCEBuffer::put() -- confirm in the implementation */
    int32_t  high;  /* NOTE(review): presumably the ixHigh given to PCEBuffer::put() -- confirm in the implementation */
};

struct PCEBuffer
|
|
|
|
{
|
|
|
|
PCEI defaultBuffer[16];
|
|
|
|
PCEI *buffer;
|
|
|
|
int32_t bufferIndex;
|
|
|
|
int32_t bufferSize;
|
|
|
|
|
|
|
|
PCEBuffer();
|
|
|
|
~PCEBuffer();
|
|
|
|
|
|
|
|
void reset();
|
|
|
|
UBool empty() const;
|
|
|
|
void put(uint64_t ce, int32_t ixLow, int32_t ixHigh);
|
|
|
|
const PCEI *get();
|
|
|
|
};
|
|
|
|
|
|
|
|
class UCollationPCE : public UMemory {
|
|
|
|
private:
|
|
|
|
PCEBuffer pceBuffer;
|
|
|
|
CollationElementIterator *cei;
|
|
|
|
UCollationStrength strength;
|
|
|
|
UBool toShift;
|
|
|
|
UBool isShifted;
|
|
|
|
uint32_t variableTop;
|
|
|
|
|
|
|
|
public:
|
|
|
|
UCollationPCE(UCollationElements *elems);
|
|
|
|
UCollationPCE(CollationElementIterator *iter);
|
|
|
|
~UCollationPCE();
|
|
|
|
|
|
|
|
void init(UCollationElements *elems);
|
|
|
|
void init(CollationElementIterator *iter);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Get the processed ordering priority of the next collation element in the text.
|
|
|
|
* A single character may contain more than one collation element.
|
|
|
|
*
|
|
|
|
* @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE.
|
|
|
|
* @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE.
|
|
|
|
* @param status A pointer to an UErrorCode to receive any errors.
|
|
|
|
* @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER
|
|
|
|
* if an error has occured or if the end of string has been reached
|
|
|
|
*/
|
|
|
|
int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
|
|
|
|
/**
|
|
|
|
* Get the processed ordering priority of the previous collation element in the text.
|
|
|
|
* A single character may contain more than one collation element.
|
|
|
|
*
|
|
|
|
* @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
|
|
|
|
* @param ixHigh A pointer to an int32_t to receiver the iterator index before fetching the CE
|
|
|
|
* @param status A pointer to an UErrorCode to receive any errors. Noteably
|
|
|
|
* a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
|
|
|
|
* buffer has been exhausted.
|
|
|
|
* @return The previous collation elements ordering, otherwise returns
|
|
|
|
* UCOL_PROCESSED_NULLORDER if an error has occured or if the start of
|
|
|
|
* string has been reached.
|
|
|
|
*/
|
|
|
|
int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
|
|
|
|
|
|
|
|
private:
|
|
|
|
void init(const Collator &coll);
|
|
|
|
uint64_t processCE(uint32_t ce);
|
|
|
|
};
|
|
|
|
|
|
|
|
U_NAMESPACE_END
|
|
|
|
|
2001-08-25 02:03:53 +00:00
|
|
|
/* Element count of the fixed-size arrays embedded in UPattern and UStringSearch. */
#define INITIAL_ARRAY_SIZE_ 256
/* Entry count of UPattern's shift and backShift tables. */
#define MAX_TABLE_SIZE_ 257

struct USearch {
|
|
|
|
// required since collation element iterator does not have a getText API
|
|
|
|
const UChar *text;
|
|
|
|
int32_t textLength; // exact length
|
|
|
|
UBool isOverlap;
|
|
|
|
UBool isCanonicalMatch;
|
2010-01-15 00:55:58 +00:00
|
|
|
int16_t elementComparisonType;
|
|
|
|
UBreakIterator *internalBreakIter; //internal character breakiterator
|
2001-08-25 02:03:53 +00:00
|
|
|
UBreakIterator *breakIter;
|
|
|
|
// value USEARCH_DONE is the default value
|
|
|
|
// if we are not at the start of the text or the end of the text,
|
|
|
|
// depending on the iteration direction and matchedIndex is USEARCH_DONE
|
2008-05-23 04:22:28 +00:00
|
|
|
// it means that we can't find any more matches in that particular direction
|
|
|
|
int32_t matchedIndex;
|
2001-08-25 02:03:53 +00:00
|
|
|
int32_t matchedLength;
|
|
|
|
UBool isForwardSearching;
|
|
|
|
UBool reset;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct UPattern {
|
2001-09-07 21:56:18 +00:00
|
|
|
const UChar *text;
|
|
|
|
int32_t textLength; // exact length
|
2001-08-25 02:03:53 +00:00
|
|
|
// length required for backwards ce comparison
|
2001-09-07 21:56:18 +00:00
|
|
|
int32_t CELength;
|
2004-04-01 22:04:52 +00:00
|
|
|
int32_t *CE;
|
|
|
|
int32_t CEBuffer[INITIAL_ARRAY_SIZE_];
|
2008-05-23 04:22:28 +00:00
|
|
|
int32_t PCELength;
|
|
|
|
int64_t *PCE;
|
|
|
|
int64_t PCEBuffer[INITIAL_ARRAY_SIZE_];
|
2001-09-07 21:56:18 +00:00
|
|
|
UBool hasPrefixAccents;
|
|
|
|
UBool hasSuffixAccents;
|
|
|
|
int16_t defaultShiftSize;
|
|
|
|
int16_t shift[MAX_TABLE_SIZE_];
|
|
|
|
int16_t backShift[MAX_TABLE_SIZE_];
|
2001-08-25 02:03:53 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
struct UStringSearch {
|
|
|
|
struct USearch *search;
|
|
|
|
struct UPattern pattern;
|
|
|
|
const UCollator *collator;
|
2011-07-06 04:03:35 +00:00
|
|
|
const icu::Normalizer2 *nfd;
|
2001-08-25 02:03:53 +00:00
|
|
|
// positions within the collation element iterator is used to determine
|
|
|
|
// if we are at the start of the text.
|
|
|
|
UCollationElements *textIter;
|
2014-02-25 21:21:49 +00:00
|
|
|
icu::UCollationPCE *textProcessedIter;
|
2001-09-07 21:56:18 +00:00
|
|
|
// utility collation element, used throughout program for temporary
|
|
|
|
// iteration.
|
|
|
|
UCollationElements *utilIter;
|
2001-08-25 02:03:53 +00:00
|
|
|
UBool ownCollator;
|
|
|
|
UCollationStrength strength;
|
|
|
|
uint32_t ceMask;
|
|
|
|
uint32_t variableTop;
|
|
|
|
UBool toShift;
|
|
|
|
UChar canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
|
|
|
|
UChar canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
|
|
|
|
};
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Exact matches without checking for the ends for extra accents.
|
|
|
|
* The match after the position within the collation element iterator is to be
|
|
|
|
* found.
|
|
|
|
* After a match is found the offset in the collation element iterator will be
|
|
|
|
* shifted to the start of the match.
|
|
|
|
* Implementation note:
|
|
|
|
* For tertiary we can't use the collator->tertiaryMask, that is a
|
|
|
|
* preprocessed mask that takes into account case options. since we are only
|
|
|
|
* concerned with exact matches, we don't need that.
|
|
|
|
* Alternate handling - since only the 16 most significant digits is only used,
|
|
|
|
* we can safely do a compare without masking if the ce is a variable, we mask
|
|
|
|
* and get only the primary values no shifting to quartenary is required since
|
|
|
|
* all primary values less than variabletop will need to be masked off anyway.
|
|
|
|
* If the end character is composite and the pattern ce does not match the text
|
|
|
|
* ce, we skip it until we find a match in the end composite character or when
|
|
|
|
* it has passed the character. This is so that we can match pattern "a" with
|
|
|
|
* the text "\u00e6"
|
|
|
|
* @param strsrch string search data
|
|
|
|
* @param status error status if any
|
|
|
|
* @return TRUE if an exact match is found, FALSE otherwise
|
|
|
|
*/
|
2001-10-20 01:09:31 +00:00
|
|
|
U_CFUNC
|
2001-08-25 02:03:53 +00:00
|
|
|
UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Canonical matches.
|
|
|
|
* According to the definition, matches found here will include the whole span
|
|
|
|
* of beginning and ending accents if it overlaps that region.
|
|
|
|
* @param strsrch string search data
|
|
|
|
* @param status error status if any
|
|
|
|
* @return TRUE if a canonical match is found, FALSE otherwise
|
|
|
|
*/
|
2001-10-20 01:09:31 +00:00
|
|
|
U_CFUNC
|
2001-08-25 02:03:53 +00:00
|
|
|
UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Gets the previous match.
|
|
|
|
* Comments follows from handleNextExact
|
|
|
|
* @param strsrch string search data
|
|
|
|
* @param status error status if any
|
2002-07-01 11:04:45 +00:00
|
|
|
* @return True if a exact math is found, FALSE otherwise.
|
2001-08-25 02:03:53 +00:00
|
|
|
*/
|
2001-10-20 01:09:31 +00:00
|
|
|
U_CFUNC
|
2001-08-25 02:03:53 +00:00
|
|
|
UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Canonical matches.
|
|
|
|
* According to the definition, matches found here will include the whole span
|
|
|
|
* of beginning and ending accents if it overlaps that region.
|
|
|
|
* @param strsrch string search data
|
|
|
|
* @param status error status if any
|
|
|
|
* @return TRUE if a canonical match is found, FALSE otherwise
|
|
|
|
*/
|
2001-10-20 01:09:31 +00:00
|
|
|
U_CFUNC
|
2001-08-25 02:03:53 +00:00
|
|
|
UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
|
|
|
|
UErrorCode *status);
|
|
|
|
|
2002-09-20 01:54:48 +00:00
|
|
|
#endif /* #if !UCONFIG_NO_COLLATION */

#endif /* USRCHIMP_H */