scuffed-code/icu4c/source/i18n/usrchimp.h

/*
**********************************************************************
*   Copyright (C) 2001-2015 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*  08/13/2001   synwee      Creation.
**********************************************************************
*/
#ifndef USRCHIMP_H
#define USRCHIMP_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/normalizer2.h"
#include "unicode/ucol.h"
#include "unicode/ucoleitr.h"
#include "unicode/ubrk.h"

/* bit mask that keeps only the primary weight of a CE */
#define UCOL_PRIMARYORDERMASK    0xffff0000
/* bit mask that keeps only the secondary weight of a CE */
#define UCOL_SECONDARYORDERMASK  0x0000ff00
/* bit mask that keeps only the tertiary weight of a CE */
#define UCOL_TERTIARYORDERMASK   0x000000ff
/* number of bits the primary weight sits above bit 0 */
#define UCOL_PRIMARYORDERSHIFT   16
/* number of bits the secondary weight sits above bit 0 */
#define UCOL_SECONDARYORDERSHIFT 8

/* the zero value, used for "ignorable" */
#define UCOL_IGNORABLE 0

/*
 * Weight extractors. Each yields the requested weight of a CE, moved down
 * so that it starts at bit 0.
 */
#define UCOL_PRIMARYORDER(order)   (((order) >> UCOL_PRIMARYORDERSHIFT) & 0xffff)
#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK) >> UCOL_SECONDARYORDERSHIFT)
#define UCOL_TERTIARYORDER(order)  ((order) & UCOL_TERTIARYORDERMASK)

/* bits that must both be set for a CE to count as a continuation */
#define UCOL_CONTINUATION_MARKER 0xC0

/* non-zero when every bit of UCOL_CONTINUATION_MARKER is set in CE */
#define isContinuation(CE) (UCOL_CONTINUATION_MARKER == ((CE) & UCOL_CONTINUATION_MARKER))

/**
 * Sentinel value: indicates that an error has occurred during processing or
 * that there are no more CEs to be returned.
 */
#define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX)

U_NAMESPACE_BEGIN

class CollationElementIterator;
class Collator;

/**
 * One buffered entry: a 64-bit collation element value together with a pair
 * of int32_t indexes.
 * NOTE(review): low/high presumably carry the ixLow/ixHigh values handed to
 * PCEBuffer::put() -- confirm against the implementation.
 */
struct PCEI
{
    /* the 64-bit collation element value */
    uint64_t ce;
    /* lower of the two indexes stored with the CE (see NOTE above) */
    int32_t  low;
    /* higher of the two indexes stored with the CE (see NOTE above) */
    int32_t  high;
};

/**
 * Buffer of PCEI entries with inline storage for 16 of them.
 * Only the declaration lives in this header; the member functions are
 * defined elsewhere.
 */
struct PCEBuffer
{
    /* inline storage for 16 entries */
    PCEI    defaultBuffer[16];
    /*
     * Pointer to the entry storage.
     * NOTE(review): presumably points at defaultBuffer until more room is
     * needed -- confirm in the implementation.
     */
    PCEI   *buffer;
    /* NOTE(review): presumably the current position/count within buffer -- confirm */
    int32_t bufferIndex;
    /* NOTE(review): presumably the capacity of buffer, in entries -- confirm */
    int32_t bufferSize;

    PCEBuffer();
    ~PCEBuffer();

    void  reset();
    /* Query only: declared const, so it does not modify the buffer. */
    UBool isEmpty() const;
    /*
     * Takes a CE plus its two indexes; errorCode is passed by reference so
     * the callee can report a failure through it.
     */
    void  put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
    /* Hands back a read-only pointer to a PCEI entry. */
    const PCEI *get();
};

/**
 * Hands out "processed" collation elements as int64_t values, walking
 * forward (nextProcessed) or backward (previousProcessed).
 * It holds a pointer to a CollationElementIterator and can be set up either
 * from a C UCollationElements handle or from a C++ CollationElementIterator.
 */
class UCollationPCE : public UMemory {
private:
    /* buffer of processed CEs with their index pairs */
    PCEBuffer          pceBuffer;
    /* the collation element iterator this object works on */
    CollationElementIterator *cei;
    /* NOTE(review): strength/toShift/isShifted/variableTop are presumably
     * collator settings captured by init(const Collator &) -- confirm */
    UCollationStrength strength;
    UBool              toShift;
    UBool              isShifted;
    uint32_t           variableTop;

public:
    UCollationPCE(UCollationElements *elems);
    UCollationPCE(CollationElementIterator *iter);
    ~UCollationPCE();

    /* (Re)initialization from either flavor of collation element iterator. */
    void init(UCollationElements *elems);
    void init(CollationElementIterator *iter);

    /**
     * Get the processed ordering priority of the next collation element in the text.
     * A single character may contain more than one collation element.
     *
     * @param ixLow A pointer to an int32_t to receive the iterator index before fetching the CE.
     * @param ixHigh A pointer to an int32_t to receive the iterator index after fetching the CE.
     * @param status A pointer to a UErrorCode to receive any errors.
     * @return The next collation element's ordering, otherwise returns UCOL_PROCESSED_NULLORDER
     *         if an error has occurred or if the end of string has been reached.
     */
    int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
    /**
     * Get the processed ordering priority of the previous collation element in the text.
     * A single character may contain more than one collation element.
     *
     * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE.
     * @param ixHigh A pointer to an int32_t to receive the iterator index before fetching the CE.
     * @param status A pointer to a UErrorCode to receive any errors. Notably
     *               a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
     *               buffer has been exhausted.
     * @return The previous collation element's ordering, otherwise returns
     *         UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of
     *         string has been reached.
     */
    int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);

private:
    /* Private overload of init() that takes the Collator itself. */
    void init(const Collator &coll);
    /* Maps a 32-bit CE to a 64-bit value; the mapping itself is in the implementation. */
    uint64_t processCE(uint32_t ce);
};

U_NAMESPACE_END

/* Element count of the fixed-size arrays in this header: UPattern::cesBuffer,
 * UPattern::pcesBuffer and the canonical prefix/suffix accent arrays of
 * UStringSearch. */
#define INITIAL_ARRAY_SIZE_       256
/* Element count of the UPattern::shift and UPattern::backShift tables. */
#define MAX_TABLE_SIZE_           257

/**
 * Per-search state: the text, the search options, the break iterators and
 * the position/length of the most recent match.
 */
struct USearch {
    // The text is kept here because the collation element iterator does not
    // have a getText API.
    const UChar              *text;
          int32_t             textLength; // exact length
          UBool               isOverlap;
          UBool               isCanonicalMatch;
          int16_t             elementComparisonType;
          UBreakIterator     *internalBreakIter;  // internal character break iterator
          UBreakIterator     *breakIter;
    // USEARCH_DONE is the default value of matchedIndex.
    // If we are not at the start or the end of the text (depending on the
    // iteration direction) and matchedIndex is USEARCH_DONE, it means that
    // no more matches can be found in that particular direction.
          int32_t             matchedIndex; 
          int32_t             matchedLength;
          UBool               isForwardSearching;
          UBool               reset;
};

/**
 * The search pattern: its text, its collation elements in 32-bit (ces) and
 * 64-bit (pces) form, accent flags and the shift tables.
 */
struct UPattern {
    const UChar              *text;
          int32_t             textLength; // exact length
          // length required for backwards ce comparison
          int32_t             cesLength;
          // NOTE(review): ces presumably points at cesBuffer unless more than
          // INITIAL_ARRAY_SIZE_ elements are needed -- confirm in usearch.cpp
          int32_t            *ces;
          int32_t             cesBuffer[INITIAL_ARRAY_SIZE_];
          // 64-bit counterpart of cesLength / ces / cesBuffer
          int32_t             pcesLength;
          int64_t            *pces;
          int64_t             pcesBuffer[INITIAL_ARRAY_SIZE_];
          UBool               hasPrefixAccents;
          UBool               hasSuffixAccents;
          int16_t             defaultShiftSize;
          // shift tables, MAX_TABLE_SIZE_ entries each
          int16_t             shift[MAX_TABLE_SIZE_];
          int16_t             backShift[MAX_TABLE_SIZE_];
};

/**
 * The complete string search object: search state, the pattern (embedded by
 * value), the collator and the iterators used for matching.
 */
struct UStringSearch {
    struct USearch            *search;
    struct UPattern            pattern;
    const  UCollator          *collator;
    const  icu::Normalizer2   *nfd;
    // Positions within this collation element iterator are used to determine
    // whether we are at the start of the text.
           UCollationElements *textIter;
           icu::UCollationPCE *textProcessedIter;
    // Utility collation element iterator, used throughout the program for
    // temporary iteration.
           UCollationElements *utilIter;
    // NOTE(review): presumably TRUE when this object owns the collator and
    // must close it -- confirm
           UBool               ownCollator;
           UCollationStrength  strength;
           uint32_t            ceMask;
           uint32_t            variableTop;
           UBool               toShift;
           UChar               canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
           UChar               canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
};

/**
* Exact matches, without checking the ends for extra accents.
* The match after the current position within the collation element iterator
* is to be found.
* After a match is found, the offset in the collation element iterator will
* be shifted to the start of the match.
* Implementation note:
* For tertiary we can't use the collator->tertiaryMask; that is a
* preprocessed mask that takes into account case options. Since we are only
* concerned with exact matches, we don't need that.
* Alternate handling - since only the 16 most significant digits are used,
* we can safely do a compare without masking if the ce is a variable, we mask
* and get only the primary values no shifting to quaternary is required since
* all primary values less than variabletop will need to be masked off anyway.
* If the end character is composite and the pattern ce does not match the text
* ce, we skip it until we find a match in the end composite character or when
* it has passed the character. This is so that we can match pattern "a" with
* the text "\u00e6".
* @param strsrch string search data
* @param status error status, if any
* @return TRUE if an exact match is found, FALSE otherwise
*/
U_CFUNC
UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);

/**
* Finds the next canonical match.
* According to the definition, matches found here will include the whole span
* of beginning and ending accents if it overlaps that region.
* @param strsrch string search data
* @param status error status, if any
* @return TRUE if a canonical match is found, FALSE otherwise
*/
U_CFUNC
UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);

/**
* Gets the previous exact match.
* The comments on usearch_handleNextExact apply here as well.
* @param strsrch string search data
* @param status error status, if any
* @return TRUE if an exact match is found, FALSE otherwise
*/
U_CFUNC
UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);

/**
* Finds the previous canonical match.
* According to the definition, matches found here will include the whole span
* of beginning and ending accents if it overlaps that region.
* @param strsrch string search data
* @param status error status, if any
* @return TRUE if a canonical match is found, FALSE otherwise
*/
U_CFUNC
UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, 
                                      UErrorCode    *status);

#endif /* #if !UCONFIG_NO_COLLATION */

#endif
ICU-1030 New implementation for Unicode Boyer Moore string search. X-SVN-Rev: 5587 2001-08-25 02:03:53 +00:00			`/*`
			`**********************************************************************`
ICU-12564 Reverted r38761 and r38762, because we want to prepend the Unicode copyright for existing source files, instead of replacing copyright comments. X-SVN-Rev: 38776 2016-05-31 21:45:07 +00:00			`* Copyright (C) 2001-2015 IBM and others. All rights reserved.`
ICU-1030 New implementation for Unicode Boyer Moore string search. X-SVN-Rev: 5587 2001-08-25 02:03:53 +00:00			`**********************************************************************`
			`* Date Name Description`
			`* 08/13/2001 synwee Creation.`
			`**********************************************************************`
			`*/`
			`#ifndef USRCHIMP_H`
			`#define USRCHIMP_H`

			`#include "unicode/utypes.h"`
ICU-2248 modularize icu, allow parts to not be built X-SVN-Rev: 9900 2002-09-20 01:54:48 +00:00
			`#if !UCONFIG_NO_COLLATION`

ICU-7273 merge in Normalizer2 API & code, and ICU-5785 UnicodeSet::span(UnicodeString) and ICU-7296 tempSubString()/retainBetween(); merge -r 26971:27150 branches/markus/norm2 X-SVN-Rev: 27155 2010-01-06 23:50:03 +00:00			`#include "unicode/normalizer2.h"`
ICU-1030 New implementation for Unicode Boyer Moore string search. X-SVN-Rev: 5587 2001-08-25 02:03:53 +00:00			`#include "unicode/ucol.h"`
			`#include "unicode/ucoleitr.h"`
			`#include "unicode/ubrk.h"`

ICU-9101 merge branches/markus/collv2@35225 into the trunk X-SVN-Rev: 35227 2014-02-25 21:21:49 +00:00			`/* mask off anything but primary order */`
			`#define UCOL_PRIMARYORDERMASK 0xffff0000`
			`/* mask off anything but secondary order */`
			`#define UCOL_SECONDARYORDERMASK 0x0000ff00`
			`/* mask off anything but tertiary order */`
			`#define UCOL_TERTIARYORDERMASK 0x000000ff`
			`/* primary order shift */`
			`#define UCOL_PRIMARYORDERSHIFT 16`
			`/* secondary order shift */`
			`#define UCOL_SECONDARYORDERSHIFT 8`

			`#define UCOL_IGNORABLE 0`

			`/* get weights from a CE */`
			`#define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff)`
			`#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)`
			`#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)`

			`#define UCOL_CONTINUATION_MARKER 0xC0`

			`#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)`

			`/**`
			`* This indicates an error has occured during processing or there are no more CEs`
			`* to be returned.`
			`*/`
			`#define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX)`

			`U_NAMESPACE_BEGIN`

			`class CollationElementIterator;`
			`class Collator;`

			`struct PCEI`
			`{`
			`uint64_t ce;`
			`int32_t low;`
			`int32_t high;`
			`};`

			`struct PCEBuffer`
			`{`
			`PCEI defaultBuffer[16];`
			`PCEI *buffer;`
			`int32_t bufferIndex;`
			`int32_t bufferSize;`

			`PCEBuffer();`
			`~PCEBuffer();`

			`void reset();`
ICU-11832 some better error checking based on static code analysis X-SVN-Rev: 37930 2015-09-10 01:42:34 +00:00			`UBool isEmpty() const;`
			`void put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);`
ICU-9101 merge branches/markus/collv2@35225 into the trunk X-SVN-Rev: 35227 2014-02-25 21:21:49 +00:00			`const PCEI *get();`
			`};`

			`class UCollationPCE : public UMemory {`
			`private:`
			`PCEBuffer pceBuffer;`
			`CollationElementIterator *cei;`
			`UCollationStrength strength;`
			`UBool toShift;`
			`UBool isShifted;`
			`uint32_t variableTop;`

			`public:`
			`UCollationPCE(UCollationElements *elems);`
			`UCollationPCE(CollationElementIterator *iter);`
			`~UCollationPCE();`

			`void init(UCollationElements *elems);`
			`void init(CollationElementIterator *iter);`

			`/**`
			`* Get the processed ordering priority of the next collation element in the text.`
			`* A single character may contain more than one collation element.`
			`*`
			`* @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE.`
			`* @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE.`
			`* @param status A pointer to an UErrorCode to receive any errors.`
			`* @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER`
			`* if an error has occured or if the end of string has been reached`
			`*/`
			`int64_t nextProcessed(int32_t ixLow, int32_t ixHigh, UErrorCode *status);`
			`/**`
			`* Get the processed ordering priority of the previous collation element in the text.`
			`* A single character may contain more than one collation element.`
			`*`
			`* @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE`
			`* @param ixHigh A pointer to an int32_t to receiver the iterator index before fetching the CE`
			`* @param status A pointer to an UErrorCode to receive any errors. Noteably`
			`* a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack`
			`* buffer has been exhausted.`
			`* @return The previous collation elements ordering, otherwise returns`
			`* UCOL_PROCESSED_NULLORDER if an error has occured or if the start of`
			`* string has been reached.`
			`*/`
			`int64_t previousProcessed(int32_t ixLow, int32_t ixHigh, UErrorCode *status);`

			`private:`
			`void init(const Collator &coll);`
			`uint64_t processCE(uint32_t ce);`
			`};`

			`U_NAMESPACE_END`

ICU-1030 New implementation for Unicode Boyer Moore string search. X-SVN-Rev: 5587 2001-08-25 02:03:53 +00:00			`#define INITIAL_ARRAY_SIZE_ 256`
ICU-1030 Code review and optimizations updates for String Search. X-SVN-Rev: 5717 2001-09-07 21:56:18 +00:00			`#define MAX_TABLE_SIZE_ 257`
ICU-1030 New implementation for Unicode Boyer Moore string search. X-SVN-Rev: 5587 2001-08-25 02:03:53 +00:00
			`struct USearch {`
			`// required since collation element iterator does not have a getText API`
			`const UChar *text;`
			`int32_t textLength; // exact length`
			`UBool isOverlap;`
			`UBool isCanonicalMatch;`
ICU-7093 Add USearchAttribute USEARCH_ELEMENT_COMPARISON and values X-SVN-Rev: 27264 2010-01-15 00:55:58 +00:00			`int16_t elementComparisonType;`
			`UBreakIterator *internalBreakIter; //internal character breakiterator`
ICU-1030 New implementation for Unicode Boyer Moore string search. X-SVN-Rev: 5587 2001-08-25 02:03:53 +00:00			`UBreakIterator *breakIter;`
			`// value USEARCH_DONE is the default value`
			`// if we are not at the start of the text or the end of the text,`
			`// depending on the iteration direction and matchedIndex is USEARCH_DONE`
ICU-5420 merge changes from branches/eric/string-search r.23303 - r.23976 X-SVN-Rev: 23977 2008-05-23 04:22:28 +00:00			`// it means that we can't find any more matches in that particular direction`
			`int32_t matchedIndex;`
ICU-1030 New implementation for Unicode Boyer Moore string search. X-SVN-Rev: 5587 2001-08-25 02:03:53 +00:00			`int32_t matchedLength;`
			`UBool isForwardSearching;`
			`UBool reset;`
			`};`

			`struct UPattern {`
ICU-1030 Code review and optimizations updates for String Search. X-SVN-Rev: 5717 2001-09-07 21:56:18 +00:00			`const UChar *text;`
			`int32_t textLength; // exact length`
ICU-1030 New implementation for Unicode Boyer Moore string search. X-SVN-Rev: 5587 2001-08-25 02:03:53 +00:00			`// length required for backwards ce comparison`
ICU-11047 disambiguate various CEBuffer-s X-SVN-Rev: 36217 2014-08-20 23:55:04 +00:00			`int32_t cesLength;`
			`int32_t *ces;`
			`int32_t cesBuffer[INITIAL_ARRAY_SIZE_];`
			`int32_t pcesLength;`
			`int64_t *pces;`
			`int64_t pcesBuffer[INITIAL_ARRAY_SIZE_];`
ICU-1030 Code review and optimizations updates for String Search. X-SVN-Rev: 5717 2001-09-07 21:56:18 +00:00			`UBool hasPrefixAccents;`
			`UBool hasSuffixAccents;`
			`int16_t defaultShiftSize;`
			`int16_t shift[MAX_TABLE_SIZE_];`
			`int16_t backShift[MAX_TABLE_SIZE_];`
ICU-1030 New implementation for Unicode Boyer Moore string search. X-SVN-Rev: 5587 2001-08-25 02:03:53 +00:00			`};`

			`struct UStringSearch {`
			`struct USearch *search;`
			`struct UPattern pattern;`
			`const UCollator *collator;`
ICU-8680 require C++ namespace, replace most U_NAMESPACE_QUALIFIER with icu::, remove still-draft U_STD_NS, U_STD_NSQ, and U_STD_NS_USE X-SVN-Rev: 30281 2011-07-06 04:03:35 +00:00			`const icu::Normalizer2 *nfd;`
ICU-1030 New implementation for Unicode Boyer Moore string search. X-SVN-Rev: 5587 2001-08-25 02:03:53 +00:00			`// positions within the collation element iterator is used to determine`
			`// if we are at the start of the text.`
			`UCollationElements *textIter;`
ICU-9101 merge branches/markus/collv2@35225 into the trunk X-SVN-Rev: 35227 2014-02-25 21:21:49 +00:00			`icu::UCollationPCE *textProcessedIter;`
ICU-1030 Code review and optimizations updates for String Search. X-SVN-Rev: 5717 2001-09-07 21:56:18 +00:00			`// utility collation element, used throughout program for temporary`
			`// iteration.`
			`UCollationElements *utilIter;`
ICU-1030 New implementation for Unicode Boyer Moore string search. X-SVN-Rev: 5587 2001-08-25 02:03:53 +00:00			`UBool ownCollator;`
			`UCollationStrength strength;`
			`uint32_t ceMask;`
			`uint32_t variableTop;`
			`UBool toShift;`
			`UChar canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];`
			`UChar canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];`
			`};`

			`/**`
			`* Exact matches without checking for the ends for extra accents.`
			`* The match after the position within the collation element iterator is to be`
			`* found.`
			`* After a match is found the offset in the collation element iterator will be`
			`* shifted to the start of the match.`
			`* Implementation note:`
			`* For tertiary we can't use the collator->tertiaryMask, that is a`
			`* preprocessed mask that takes into account case options. since we are only`
			`* concerned with exact matches, we don't need that.`
			`* Alternate handling - since only the 16 most significant digits is only used,`
			`* we can safely do a compare without masking if the ce is a variable, we mask`
			`* and get only the primary values no shifting to quartenary is required since`
			`* all primary values less than variabletop will need to be masked off anyway.`
			`* If the end character is composite and the pattern ce does not match the text`
			`* ce, we skip it until we find a match in the end composite character or when`
			`* it has passed the character. This is so that we can match pattern "a" with`
			`* the text "\u00e6"`
			`* @param strsrch string search data`
			`* @param status error status if any`
			`* @return TRUE if an exact match is found, FALSE otherwise`
			`*/`
ICU-1211 correcting what gets exported where X-SVN-Rev: 6361 2001-10-20 01:09:31 +00:00			`U_CFUNC`
ICU-1030 New implementation for Unicode Boyer Moore string search. X-SVN-Rev: 5587 2001-08-25 02:03:53 +00:00			`UBool usearch_handleNextExact(UStringSearch strsrch, UErrorCode status);`

			`/**`
			`* Canonical matches.`
			`* According to the definition, matches found here will include the whole span`
			`* of beginning and ending accents if it overlaps that region.`
			`* @param strsrch string search data`
			`* @param status error status if any`
			`* @return TRUE if a canonical match is found, FALSE otherwise`
			`*/`
ICU-1211 correcting what gets exported where X-SVN-Rev: 6361 2001-10-20 01:09:31 +00:00			`U_CFUNC`
ICU-1030 New implementation for Unicode Boyer Moore string search. X-SVN-Rev: 5587 2001-08-25 02:03:53 +00:00			`UBool usearch_handleNextCanonical(UStringSearch strsrch, UErrorCode status);`

			`/**`
			`* Gets the previous match.`
			`* Comments follows from handleNextExact`
			`* @param strsrch string search data`
			`* @param status error status if any`
ICU-1953 API docs have complete set of @param, @return... X-SVN-Rev: 8983 2002-07-01 11:04:45 +00:00			`* @return True if a exact math is found, FALSE otherwise.`
ICU-1030 New implementation for Unicode Boyer Moore string search. X-SVN-Rev: 5587 2001-08-25 02:03:53 +00:00			`*/`
ICU-1211 correcting what gets exported where X-SVN-Rev: 6361 2001-10-20 01:09:31 +00:00			`U_CFUNC`
ICU-1030 New implementation for Unicode Boyer Moore string search. X-SVN-Rev: 5587 2001-08-25 02:03:53 +00:00			`UBool usearch_handlePreviousExact(UStringSearch strsrch, UErrorCode status);`

			`/**`
			`* Canonical matches.`
			`* According to the definition, matches found here will include the whole span`
			`* of beginning and ending accents if it overlaps that region.`
			`* @param strsrch string search data`
			`* @param status error status if any`
			`* @return TRUE if a canonical match is found, FALSE otherwise`
			`*/`
ICU-1211 correcting what gets exported where X-SVN-Rev: 6361 2001-10-20 01:09:31 +00:00			`U_CFUNC`
ICU-1030 New implementation for Unicode Boyer Moore string search. X-SVN-Rev: 5587 2001-08-25 02:03:53 +00:00			`UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,`
			`UErrorCode *status);`

ICU-2248 modularize icu, allow parts to not be built X-SVN-Rev: 9900 2002-09-20 01:54:48 +00:00			`#endif /* #if !UCONFIG_NO_COLLATION */`
ICU-1030 New implementation for Unicode Boyer Moore string search. X-SVN-Rev: 5587 2001-08-25 02:03:53 +00:00
ICU-2248 modularize icu, allow parts to not be built X-SVN-Rev: 9900 2002-09-20 01:54:48 +00:00			`#endif`