681c0468a4
X-SVN-Rev: 9810
292 lines
10 KiB
C++
292 lines
10 KiB
C++
/*
|
|
**********************************************************************
|
|
* Copyright (c) 2001-2002, International Business Machines
|
|
* Corporation and others. All Rights Reserved.
|
|
**********************************************************************
|
|
*/
|
|
#ifndef _UPROPSET_H_
|
|
#define _UPROPSET_H_
|
|
|
|
#include "unicode/utypes.h"
|
|
#include "unicode/uobject.h"
|
|
#include "unicode/uscript.h"
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
// Forward declarations only: within this header these types appear solely
// in function declarations and function-pointer typedefs (as parameter or
// return types), so their full definitions are not needed here.
class UnicodeString;
|
|
class UnicodeSet;
|
|
class ParsePosition;
|
|
class Hashtable;
|
|
|
|
/**
|
|
* INTERNAL CLASS implementing the UnicodeSet properties as outlined
|
|
* at:
|
|
*
|
|
* http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/unicodeset_properties.html
|
|
*
|
|
* Recognized syntax:
|
|
*
|
|
* [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
|
|
* \p{foo} \P{foo} - white space not allowed within "\p" or "\P"
|
|
* \N{name} - white space not allowed within "\N"
|
|
*
|
|
* Other than the above restrictions, white space is ignored. Case
|
|
* is ignored except in "\p" and "\P" and "\N". In 'name' leading
|
|
* and trailing space is deleted, and internal runs of whitespace
|
|
* are collapsed to a single space.
|
|
*
|
|
* This class cannot be instantiated. It has a public static method,
* createFromPattern(), which takes a pattern to be parsed and returns
* a new UnicodeSet. Another public static method,
* resemblesPattern(), returns true if a given pattern string appears
* to be a property set pattern, and therefore should be passed in to
* createFromPattern().
|
|
*
|
|
* NOTE: Current implementation is incomplete. The following list
|
|
* indicates which properties are supported.
|
|
*
|
|
* + GeneralCategory
|
|
* + CombiningClass
|
|
* + BidiClass
|
|
* DecompositionType
|
|
* + NumericValue
|
|
* NumericType
|
|
* EastAsianWidth
|
|
* LineBreak
|
|
* JoiningType
|
|
* + Script
|
|
* + Binary properties
|
|
* + Name
|
|
*
|
|
* '+' indicates a supported property.
|
|
*
|
|
* @author Alan Liu
|
|
* @internal
|
|
*/
|
|
class UnicodePropertySet /* not : public UObject because all methods are static */ {
|
|
|
|
public:
|
|
|
|
//----------------------------------------------------------------
|
|
// Public API
|
|
//----------------------------------------------------------------
|
|
|
|
/**
|
|
* Return true if the given position, in the given pattern, appears
|
|
* to be the start of a property set pattern [:foo:], \p{foo}, or
|
|
* \P{foo}, or \N{name}.
|
|
*/
|
|
static UBool resemblesPattern(const UnicodeString& pattern, int32_t pos);
|
|
|
|
/**
|
|
* Create a UnicodeSet by parsing the given pattern at the given
|
|
* parse position.
|
|
*
|
|
* @param pattern the pattern string
|
|
* @param ppos on entry, the position at which to begin parsing.
|
|
* This should be one of the locations marked '^':
|
|
*
|
|
* [:blah:] \p{blah} \P{blah} \N{name}
|
|
* ^ % ^ % ^ % ^ %
|
|
*
|
|
* On return, the position after the last character parsed, that is,
|
|
* the locations marked '%'. If the parse fails, ppos is returned
|
|
* unchanged.
|
|
* @return a newly-constructed UnicodeSet object, or null upon
|
|
* failure.
|
|
*/
|
|
static UnicodeSet* createFromPattern(const UnicodeString& pattern,
|
|
ParsePosition& ppos,
|
|
UErrorCode &status);
|
|
|
|
|
|
typedef UnicodeSet* (*SetFactory)(const UnicodeString& valueName,
|
|
UErrorCode &status);
|
|
|
|
/**
|
|
* "white space" in the sense of ICU rule parsers
|
|
* @internal
|
|
*/
|
|
static UnicodeSet getRuleWhiteSpaceSet(UErrorCode &status);
|
|
|
|
private:
|
|
|
|
//----------------------------------------------------------------
|
|
// Property set factory static methods
|
|
// NOTE: This will change/go away when we implement UCharacter
|
|
// based property retrieval.
|
|
//----------------------------------------------------------------
|
|
|
|
static UnicodeSet* createNumericValueSet(const UnicodeString& valueName,
|
|
UErrorCode &status);
|
|
|
|
static UnicodeSet* createNameSet(const UnicodeString& valueName,
|
|
UErrorCode& status);
|
|
|
|
/**
|
|
* Given a combining class name, or number, create a corresponding
|
|
* set and return it. The name may be numeric, as in \p{cc=0}, or
|
|
* symbolic, as in \p{cc=Below Left}. If the name is invalid,
|
|
* return an empty set.
|
|
*/
|
|
static UnicodeSet* createCombiningClassSet(const UnicodeString& valueName,
|
|
UErrorCode &status);
|
|
|
|
/**
|
|
* Given a bidi class name create a corresponding set and return
|
|
* it. If the name is invalid, return NULL.
|
|
*/
|
|
static UnicodeSet* createBidiClassSet(const UnicodeString& valueName,
|
|
UErrorCode &status);
|
|
|
|
/**
|
|
* Given a general category value name, create a corresponding
|
|
* set and return it, or return null if the name is invalid.
|
|
* @param valueName a pre-munged general category value name
|
|
*/
|
|
static UnicodeSet* createCategorySet(const UnicodeString& valueName,
|
|
UErrorCode &status);
|
|
|
|
/**
|
|
* Given a script value name, create a corresponding set and
|
|
* return it, or return null if the name is invalid.
|
|
* @param valueName a pre-munged script value name
|
|
*/
|
|
static UnicodeSet* createScriptSet(const UnicodeString& valueName,
|
|
UErrorCode &status);
|
|
|
|
/**
|
|
* Given a binary property name, create a corresponding
|
|
* set and return it, or return null if the name is invalid.
|
|
* @param valueName a pre-munged binary property name
|
|
*/
|
|
static UnicodeSet* createBinaryPropertySet(const UnicodeString& valueName,
|
|
UErrorCode &status);
|
|
|
|
//----------------------------------------------------------------
|
|
// Utility methods
|
|
//----------------------------------------------------------------
|
|
|
|
/**
|
|
* Returns a UnicodeSet for the given category. This set is
|
|
* cached and returned again if this method is called again with
|
|
* the same parameter.
|
|
*
|
|
* Callers MUST NOT MODIFY the returned set.
|
|
*/
|
|
static const UnicodeSet& getCategorySet(int32_t cat);
|
|
|
|
/**
|
|
* Returns a UnicodeSet for the given script. This set is
|
|
* cached and returned again if this method is called again with
|
|
* the same parameter.
|
|
*
|
|
* Callers MUST NOT MODIFY the returned set.
|
|
*/
|
|
static const UnicodeSet* getScriptSet(UScriptCode script,
|
|
UErrorCode &status);
|
|
|
|
/**
|
|
* Given a string, munge it to lose the whitespace, underscores, and hyphens.
|
|
* So "General Category " or "General_Category" or " General-Category"
|
|
* become "GENERALCATEGORY". We munge all type and value
|
|
* strings, and store all type and value keys pre-munged. NOTE:
|
|
* Unlike the Java version, we do not modify the case, since we use a
|
|
* case-insensitive compare function.
|
|
* @param keepSpace if false, completely delete white space.
|
|
* Otherwise compress runs of whitespace to a single space,
|
|
* and delete leading and trailing whitespace. If keepSpace
|
|
* is true, we also keep underscores and hyphens.
|
|
*/
|
|
static UnicodeString munge(const UnicodeString& str,
|
|
int32_t start, int32_t limit,
|
|
UBool keepSpace);
|
|
|
|
/**
|
|
* Skip over a sequence of zero or more white space characters
|
|
* at pos. Return the index of the first non-white-space character
|
|
* at or after pos, or str.length(), if there is none.
|
|
*/
|
|
static int32_t skipWhitespace(const UnicodeString& str, int32_t pos);
|
|
|
|
//----------------------------------------------------------------
|
|
// Generic filter-based scanning code
|
|
//
|
|
// NOTE: In general, we don't want to do this! This is a temporary
|
|
// implementation until we have time for something that examines
|
|
// the underlying UCharacter data structures in an intelligent
|
|
// way. Iterating over all code points is dumb. What we want to
|
|
// do, for instance, is iterate over internally-stored ranges
|
|
// of characters that have a given property.
|
|
//----------------------------------------------------------------
|
|
|
|
/**
|
|
* A filter that returns TRUE if the given code point should be
|
|
* included in the UnicodeSet being constructed.
|
|
*/
|
|
typedef UBool (*Filter)(UChar32 codePoint, void* context);
|
|
|
|
/**
|
|
* Set the given UnicodeSet to contain all code points for which
|
|
* filter returns TRUE. The context parameter is passed unchanged
|
|
* to the filter function.
|
|
*/
|
|
static void initSetFromFilter(UnicodeSet& set, Filter filter,
|
|
void* context,
|
|
UErrorCode &status);
|
|
|
|
//----------------------------------------------------------------
|
|
// Type and value name maps
|
|
//----------------------------------------------------------------
|
|
|
|
/**
|
|
* Add a type mapping to the name map.
|
|
*/
|
|
static void addType(const UnicodeString& shortName,
|
|
const UnicodeString& longName,
|
|
SetFactory factory);
|
|
|
|
/**
|
|
* Add a value mapping to the name map.
|
|
*/
|
|
static void addValue(Hashtable* map,
|
|
const UnicodeString& shortName,
|
|
const UnicodeString& longName,
|
|
int32_t value);
|
|
|
|
static void init(UErrorCode &status);
|
|
|
|
private:
|
|
// do not instantiate
|
|
UnicodePropertySet();
|
|
|
|
//----------------------------------------------------------------
|
|
// SetFactory <=> void*
|
|
// I don't know why the compiler won't cast between these types.
|
|
// They should be interconvertible. Does C++ distinguish between
|
|
// pointers into code and pointers into data? In any case, we
|
|
// convert between these types in a safe way here.
|
|
//----------------------------------------------------------------
|
|
|
|
union SetFactoryTok {
|
|
void* voidPointer;
|
|
SetFactory functionPointer;
|
|
};
|
|
|
|
inline static void* setFactoryToVoidPtr(SetFactory f) {
|
|
SetFactoryTok tok;
|
|
tok.functionPointer = f;
|
|
return tok.voidPointer;
|
|
}
|
|
|
|
inline static SetFactory voidPtrToSetFactory(void* p) {
|
|
SetFactoryTok tok;
|
|
tok.voidPointer = p;
|
|
return tok.functionPointer;
|
|
}
|
|
};
|
|
|
|
U_NAMESPACE_END
|
|
|
|
#endif
|