ICU-352 rewrite to use inversion list; upgrade API to use UChar32

X-SVN-Rev: 1491
This commit is contained in:
Alan Liu 2000-05-25 19:27:14 +00:00
parent edd13c1b92
commit 55bf569c3a
2 changed files with 801 additions and 574 deletions

View File

@ -248,19 +248,17 @@ class TransliterationRule;
*/
class U_I18N_API UnicodeSet : public UnicodeFilter {
/**
* The internal representation is a UnicodeString of even length.
* Each pair of characters represents a range that is included in
* the set. A single character c is represented as cc. Thus, the
* ranges in the set are (a,b), a and b inclusive, where a =
* pairs.charAt(i) and b = pairs.charAt(i+1) for all even i, 0 <=
* i <= pairs.length()-2. Pairs are always stored in ascending
* Unicode order. Pairs are always stored in shortest form. For
* example, if the pair "hh", representing the single character
* 'h', is added to the pairs list "agik", representing the ranges
* 'a'-'g' and 'i'-'k', the result is "ak", not "aghhik".
*/
UnicodeString pairs;
int32_t len; // length of list used; 0 <= len <= capacity
int32_t capacity; // capacity of list
int32_t bufferCapacity; // capacity of buffer
UChar32* list; // MUST be terminated with HIGH
UChar32* buffer; // internal buffer, may be NULL
static const UChar32 LOW; // LOW <= all valid values. ZERO for codepoints
static const UChar32 HIGH; // HIGH > all valid values. 110000 for codepoints
static const int32_t START_EXTRA; // initial storage. Must be >= 0
static const int32_t GROW_EXTRA; // extra amount for growth. Must be >= 0
static const UnicodeString CATEGORY_NAMES;
@ -269,7 +267,7 @@ class U_I18N_API UnicodeSet : public UnicodeFilter {
* Unicode::getType(), to pairs strings. Entries are initially
* zero length and are filled in on demand.
*/
static UnicodeString* CATEGORY_PAIRS_CACHE;
static UnicodeSet* CATEGORY_CACHE;
/**
* Delimiter string used in patterns to close a category reference:
@ -285,19 +283,19 @@ class U_I18N_API UnicodeSet : public UnicodeFilter {
static const UChar COLON;
static const UChar BACKSLASH;
static const UChar INTERSECTION;
static const UChar UPPER_U;
//----------------------------------------------------------------
// Debugging and testing
//----------------------------------------------------------------
public:
/**
* Return the representation of this set as a list of character
* ranges. Ranges are listed in ascending Unicode order. For
* example, the set [a-zA-M3] is represented as "33AMaz".
* Minimum value that can be stored in a UnicodeSet.
*/
const UnicodeString& getPairs(void) const;
static const UChar32 MIN_VALUE;
/**
* Maximum value that can be stored in a UnicodeSet.
*/
static const UChar32 MAX_VALUE;
//----------------------------------------------------------------
// Constructors &c
@ -311,6 +309,15 @@ public:
*/
UnicodeSet();
/**
* Constructs a set containing the given range. If <code>start >
* end</code> then an empty set is created.
*
* @param start first character, inclusive, of range
* @param end last character, inclusive, of range
*/
UnicodeSet(UChar32 start, UChar32 end);
/**
* Constructs a set from the given pattern. See the class
* description for the syntax of the pattern language.
@ -391,6 +398,16 @@ public:
// Public API
//----------------------------------------------------------------
/**
* Make this object represent the range <code>start - end</code>.
* If <code>start > end</code> then this object is set to an
* empty range.
*
* @param start first character in the set, inclusive
* @param end last character in the set, inclusive
*/
void set(UChar32 start, UChar32 end);
/**
* Modifies this set to represent the set specified by the given
* pattern, optionally ignoring white space. See the class
@ -436,7 +453,7 @@ public:
* of chars.
* @draft
*/
virtual UBool contains(UChar first, UChar last) const;
virtual UBool contains(UChar32 start, UChar32 end) const;
/**
* Returns <tt>true</tt> if this set contains the specified char.
@ -444,21 +461,30 @@ public:
* @return <tt>true</tt> if this set contains the specified char.
* @draft
*/
virtual UBool contains(UChar32 c) const;
/**
* Implement UnicodeFilter:
* Returns <tt>true</tt> if this set contains the specified char.
*
* @return <tt>true</tt> if this set contains the specified char.
* @draft
*/
virtual UBool contains(UChar c) const;
/**
* Adds the specified range to this set if it is not already
* present. If this set already contains the specified range,
* the call leaves this set unchanged. If <code>last > first</code>
* the call leaves this set unchanged. If <code>start > end</code>
* then an empty range is added, leaving the set unchanged.
*
* @param first first character, inclusive, of range to be added
* @param start first character, inclusive, of range to be added
* to this set.
* @param last last character, inclusive, of range to be added
* @param end last character, inclusive, of range to be added
* to this set.
* @draft
*/
virtual void add(UChar first, UChar last);
virtual void add(UChar32 start, UChar32 end);
/**
* Adds the specified character to this set if it is not already
@ -466,21 +492,39 @@ public:
* the call leaves this set unchanged.
* @draft
*/
virtual void add(UChar c);
void add(UChar32 c);
/**
* Retain only the elements in this set that are contained in the
* specified range. If <code>start > end</code> then an empty range is
* retained, leaving the set empty.
*
* @param start first character, inclusive, of range to be retained
* in this set.
* @param end last character, inclusive, of range to be retained
* in this set.
*/
virtual void retain(UChar32 start, UChar32 end);
/**
* Retain the specified character from this set if it is present.
*/
void retain(UChar32 c);
/**
* Removes the specified range from this set if it is present.
* The set will not contain the specified range once the call
* returns. If <code>last > first</code> then an empty range is
* returns. If <code>start > end</code> then an empty range is
* removed, leaving the set unchanged.
*
* @param first first character, inclusive, of range to be removed
* @param start first character, inclusive, of range to be removed
* from this set.
* @param last last character, inclusive, of range to be removed
* @param end last character, inclusive, of range to be removed
* from this set.
* @draft
*/
virtual void remove(UChar first, UChar last);
virtual void remove(UChar32 start, UChar32 end);
/**
* Removes the specified character from this set if it is present.
@ -488,7 +532,28 @@ public:
* returns.
* @draft
*/
virtual void remove(UChar c);
void remove(UChar32 c);
/**
* Complements the specified range in this set. Any character in
* the range will be removed if it is in this set, or will be
* added if it is not in this set. If <code>start > end</code>
* then an empty range is complemented, leaving the set unchanged.
*
* @param start first character, inclusive, of range to be complemented
* in this set.
* @param end last character, inclusive, of range to be complemented
* in this set.
*/
virtual void xor(UChar32 start, UChar32 end);
/**
* Complements the specified character in this set. The character
* will be removed if it is in this set, or will be added if it is
* not in this set.
*/
void xor(UChar32 c);
/**
* Returns <tt>true</tt> if the specified set is a <i>subset</i>
@ -538,6 +603,16 @@ public:
*/
virtual void removeAll(const UnicodeSet& c);
/**
* Complements in this set all elements contained in the specified
* set. Any character in the other set will be removed if it is
* in this set, or will be added if it is not in this set.
*
* @param c set that defines which elements will be complemented in
* this set.
*/
virtual void xorAll(const UnicodeSet& c);
/**
* Inverts this set. This operation modifies this set so that
* its value is its complement. This is equivalent to the pseudo code:
@ -553,6 +628,36 @@ public:
*/
virtual void clear(void);
/**
* Iteration method that returns the number of ranges contained in
* this set.
* @see #getRangeStart
* @see #getRangeEnd
*/
virtual int32_t getRangeCount(void) const;
/**
* Iteration method that returns the first character in the
* specified range of this set.
* @see #getRangeCount
* @see #getRangeEnd
*/
virtual UChar32 getRangeStart(int32_t index) const;
/**
* Iteration method that returns the last character in the
* specified range of this set.
* @see #getRangeStart
* @see #getRangeCount
*/
virtual UChar32 getRangeEnd(int32_t index) const;
/**
* Reallocate this object's internal structures to take up the least
* possible space, without changing this object's value.
*/
virtual void compact();
private:
//----------------------------------------------------------------
@ -621,70 +726,17 @@ private:
* substring of <code>pattern</code>
* @exception IllegalArgumentException if the parse fails.
*/
static UnicodeString& parse(UnicodeString& pairsBuf /*result*/,
const UnicodeString& pattern,
ParsePosition& pos,
const SymbolTable* symbols,
UErrorCode& status);
//----------------------------------------------------------------
// Implementation: Efficient in-place union & difference
//----------------------------------------------------------------
/**
* Performs a union operation: adds the range 'c'-'d' to the given
* pairs list. The pairs list is modified in place. The result
* is normalized (in order and as short as possible). For
* example, addPair("am", 'l', 'q') => "aq". addPair("ampz", 'n',
* 'o') => "az".
*/
static void addPair(UnicodeString& pairs, UChar c, UChar d);
/**
* Performs an asymmetric difference: removes the range 'c'-'d'
* from the pairs list. The pairs list is modified in place. The
* result is normalized (in order and as short as possible). For
* example, removePair("am", 'l', 'q') => "ak".
* removePair("ampz", 'l', 'q') => "akrz".
*/
static void removePair(UnicodeString& pairs, UChar c, UChar d);
//----------------------------------------------------------------
// Implementation: Fundamental operators
//----------------------------------------------------------------
/**
* Changes the pairs list to represent the complement of the set it
* currently represents. The pairs list will be normalized (in
* order and in shortest possible form) if the original pairs list
* was normalized.
*/
static void doComplement(UnicodeString& pairs);
/**
* Given two pairs lists, changes the first in place to represent
* the union of the two sets.
*/
static void doUnion(UnicodeString& pairs, const UnicodeString& c2);
/**
* Given two pairs lists, changes the first in place to represent
* the asymmetric difference of the two sets.
*/
static void doDifference(UnicodeString& pairs, const UnicodeString& pairs2);
/**
* Given two pairs lists, changes the first in place to represent
* the intersection of the two sets.
*/
static void doIntersection(UnicodeString& pairs, const UnicodeString& c2);
void applyPattern(const UnicodeString& pattern,
ParsePosition& pos,
const SymbolTable* symbols,
UErrorCode& status);
//----------------------------------------------------------------
// Implementation: Generation of pairs for Unicode categories
//----------------------------------------------------------------
/**
* Returns a pairs string for the given category, given its name.
* Sets this object to the given category, given its name.
* The category name must be either a two-letter name, such as
* "Lu", or a one letter name, such as "L". One-letter names
* indicate the logical union of all two-letter names that start
@ -697,16 +749,15 @@ private:
* complements such as "^Lu" or "^L". It would be easy to cache
* these as well in a hashtable should the need arise.
*/
static UnicodeString& getCategoryPairs(UnicodeString& result,
const UnicodeString& catName,
UErrorCode& status);
void applyCategory(const UnicodeString& catName,
UErrorCode& status);
/**
* Returns a set for the given category. This set is
* cached and returned again if this method is called again with
* the same parameter.
*/
static const UnicodeString& getCategoryPairs(int8_t cat);
static const UnicodeSet& getCategorySet(int8_t cat);
//----------------------------------------------------------------
// Implementation: Utility methods
@ -717,6 +768,26 @@ private:
* there is none.
*/
static UChar charAfter(const UnicodeString& str, int32_t i);
void ensureCapacity(int32_t newLen);
void ensureBufferCapacity(int32_t newLen);
void swapBuffers(void);
static const UChar HEX[16];
static void _toPat(UnicodeString& buf, UChar32 c);
//----------------------------------------------------------------
// Implementation: Fundamental operators
//----------------------------------------------------------------
void xor(const UChar32* other, int32_t otherLen, int8_t polarity);
void add(const UChar32* other, int32_t otherLen, int8_t polarity);
void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
};
inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {

File diff suppressed because it is too large Load Diff