From e6a0df52eeede89ee095c09c293bc12e35dd3ed3 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Sat, 28 May 2005 22:54:36 +0000 Subject: [PATCH] ICU-4199 enum/name API support for C/POSIX character classes, and UnicodeSet support for [:Assigned:] X-SVN-Rev: 17730 --- icu4c/source/common/uchar.c | 57 +- icu4c/source/common/unicode/uchar.h | 55 +- icu4c/source/common/unicode/uniset.h | 3 +- icu4c/source/common/unicode/uset.h | 3 +- icu4c/source/common/uniset_props.cpp | 69 +- icu4c/source/common/uprops.c | 27 +- icu4c/source/common/uprops.h | 27 + icu4c/source/test/intltest/usettest.cpp | 65 +- .../genpname/SyntheticPropertyAliases.txt | 10 +- icu4c/source/tools/genpname/data.h | 2131 +++++++++-------- icu4c/source/tools/genpname/preparse.pl | 27 +- 11 files changed, 1299 insertions(+), 1175 deletions(-) diff --git a/icu4c/source/common/uchar.c b/icu4c/source/common/uchar.c index 5f3a656bb2..5b00a493e6 100644 --- a/icu4c/source/common/uchar.c +++ b/icu4c/source/common/uchar.c @@ -504,7 +504,7 @@ u_isUAlphabetic(UChar32 c) { return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_ALPHABETIC))!=0; } -/* Checks if ch is a letter or a decimal digit */ +/* Checks if c is a letter or a decimal digit */ U_CAPI UBool U_EXPORT2 u_isalnum(UChar32 c) { uint32_t props; @@ -512,6 +512,15 @@ u_isalnum(UChar32 c) { return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_ND_MASK))!=0); } +/** + * Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM. + * @internal + */ +U_CFUNC UBool +u_isalnumPOSIX(UChar32 c) { + return (UBool)(u_isUAlphabetic(c) || u_isdigit(c)); +} + /* Checks if ch is a unicode character with assigned character type.*/ U_CAPI UBool U_EXPORT2 u_isdefined(UChar32 c) { @@ -577,8 +586,10 @@ u_isblank(UChar32 c) { if((uint32_t)c<=0x9f) { return c==9 || c==0x20; /* TAB or SPACE */ } else { - /* White_Space but not LS (Zl) or PS (Zp) */ - return u_isUWhiteSpace(c) && ((c&0xfffffffe)!=0x2028); + /* Zs */ + uint32_t props; + GET_PROPS(c, props); + return (UBool)(GET_CATEGORY(props)==U_SPACE_SEPARATOR); } } @@ -596,6 +607,22 @@ u_isprint(UChar32 c) { return (UBool)((CAT_MASK(props)&U_GC_C_MASK)==0); } +/** + * Checks if c is in \p{graph}\p{blank} - \p{cntrl}. + * Implements UCHAR_POSIX_PRINT. + * @internal + */ +U_CFUNC UBool +u_isprintPOSIX(UChar32 c) { + uint32_t props; + GET_PROPS(c, props); + /* + * The only cntrl character in graph+blank is TAB (in blank). + * Here we implement (blank-TAB)=Zs instead of calling u_isblank(). + */ + return (UBool)((GET_CATEGORY(props)==U_SPACE_SEPARATOR) || u_isgraphPOSIX(c)); +} + U_CAPI UBool U_EXPORT2 u_isgraph(UChar32 c) { uint32_t props; @@ -606,6 +633,24 @@ u_isgraph(UChar32 c) { ==0); } +/** + * Checks if c is in + * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}] + * with space=\p{Whitespace} and Control=Cc. + * Implements UCHAR_POSIX_GRAPH. + * @internal + */ +U_CFUNC UBool +u_isgraphPOSIX(UChar32 c) { + uint32_t props; + GET_PROPS(c, props); + /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */ + /* comparing ==0 returns FALSE for the categories mentioned */ + return (UBool)((CAT_MASK(props)& + (U_GC_CC_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK)) + ==0); +} + U_CAPI UBool U_EXPORT2 u_ispunct(UChar32 c) { uint32_t props; @@ -1003,9 +1048,11 @@ uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { /* add code points with hardcoded properties, plus the ones following them */ + /* add for u_isblank() */ + USET_ADD_CP_AND_NEXT(sa, TAB); + /* add for IS_THAT_CONTROL_SPACE() */ - sa->add(sa->set, TAB); /* range TAB..CR */ - sa->add(sa->set, CR+1); + sa->add(sa->set, CR+1); /* range TAB..CR */ sa->add(sa->set, 0x1c); sa->add(sa->set, 0x1f+1); USET_ADD_CP_AND_NEXT(sa, NL); diff --git a/icu4c/source/common/unicode/uchar.h b/icu4c/source/common/unicode/uchar.h index c868b9ec67..16a64342ac 100644 --- a/icu4c/source/common/unicode/uchar.h +++ b/icu4c/source/common/unicode/uchar.h @@ -77,12 +77,31 @@ U_CDECL_BEGIN * (In ICU, BreakIterator is the most sophisticated API for word boundaries.) * Another example: There is no "istitle()" class for titlecase characters. * - * A summary of the behavior of some C/POSIX character classification implementations - * for Unicode is available at http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/posix_classes.html + * ICU 3.4 and later provides API access for all twelve C/POSIX character classes. + * ICU implements them according to the Standard Recommendations in + * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions + * (http://www.unicode.org/reports/tr18/#Compatibility_Properties). * - * Important: - * The behavior of the ICU C/POSIX-style character classification - * functions is subject to change according to discussion of the above summary. + * API access for C/POSIX character classes is as follows: + * - alpha: u_isUAlphabetic(c) or u_hasBinaryProperty(c, UCHAR_ALPHABETIC) + * - lower: u_isULowercase(c) or u_hasBinaryProperty(c, UCHAR_LOWERCASE) + * - upper: u_isUUppercase(c) or u_hasBinaryProperty(c, UCHAR_UPPERCASE) + * - punct: u_ispunct(c) + * - digit: u_charType(c)==U_DECIMAL_DIGIT_NUMBER + * - xdigit: u_isxdigit(c) or u_hasBinaryProperty(c, UCHAR_POSIX_XDIGIT) + * - alnum: u_hasBinaryProperty(c, UCHAR_POSIX_ALNUM) + * - space: u_isUWhiteSpace(c) or u_hasBinaryProperty(c, UCHAR_WHITE_SPACE) + * - blank: u_isblank(c) or u_hasBinaryProperty(c, UCHAR_POSIX_BLANK) + * - cntrl: u_charType(c)==U_CONTROL_CHAR + * - graph: u_hasBinaryProperty(c, UCHAR_POSIX_GRAPH) + * - print: u_hasBinaryProperty(c, UCHAR_POSIX_PRINT) + * + * Note: Some of the u_isxyz() functions in uchar.h predate, and do not match, + * the Standard Recommendations in UTS #18. Instead, they match Java + * functions according to their API documentation. + * + * The C/POSIX character classes are also available in UnicodeSet patterns, + * using patterns like [:graph:] or \p{graph}. * * Note: There are several ICU whitespace functions. * Comparison: @@ -368,6 +387,31 @@ typedef enum UProperty { (http://www.unicode.org/reports/tr31/) @draft ICU 3.4 */ UCHAR_PATTERN_WHITE_SPACE, + /** Binary property alnum (a C/POSIX character class). + Implemented according to the UTS #18 Annex C Standard Recommendation. + See the uchar.h file documentation. + @draft ICU 3.4 */ + UCHAR_POSIX_ALNUM, + /** Binary property blank (a C/POSIX character class). + Implemented according to the UTS #18 Annex C Standard Recommendation. + See the uchar.h file documentation. + @draft ICU 3.4 */ + UCHAR_POSIX_BLANK, + /** Binary property graph (a C/POSIX character class). + Implemented according to the UTS #18 Annex C Standard Recommendation. + See the uchar.h file documentation. + @draft ICU 3.4 */ + UCHAR_POSIX_GRAPH, + /** Binary property print (a C/POSIX character class). + Implemented according to the UTS #18 Annex C Standard Recommendation. + See the uchar.h file documentation. + @draft ICU 3.4 */ + UCHAR_POSIX_PRINT, + /** Binary property xdigit (a C/POSIX character class). + Implemented according to the UTS #18 Annex C Standard Recommendation. + See the uchar.h file documentation. + @draft ICU 3.4 */ + UCHAR_POSIX_XDIGIT, /** One more than the last constant for binary Unicode properties. @stable ICU 2.1 */ UCHAR_BINARY_LIMIT, @@ -1739,7 +1783,6 @@ u_getNumericValue(UChar32 c); * @see UCHAR_LOWERCASE * @see u_isupper * @see u_istitle - * @see u_islower * @stable ICU 2.0 */ U_STABLE UBool U_EXPORT2 diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h index 9e45b52f1c..639305e7c3 100644 --- a/icu4c/source/common/unicode/uniset.h +++ b/icu4c/source/common/unicode/uniset.h @@ -569,7 +569,8 @@ public: * correspond to the following sets: * * "ANY" = [\\u0000-\\U0010FFFF], - * "ASCII" = [\\u0000-\\u007F]. + * "ASCII" = [\\u0000-\\u007F], + * "Assigned" = [:^Cn:]. * * @param value a value alias, either short or long. The name is matched * loosely. See PropertyValueAliases.txt for names and a description of diff --git a/icu4c/source/common/unicode/uset.h b/icu4c/source/common/unicode/uset.h index 6a243d7fbe..9a2066c943 100644 --- a/icu4c/source/common/unicode/uset.h +++ b/icu4c/source/common/unicode/uset.h @@ -265,7 +265,8 @@ uset_applyIntPropertyValue(USet* set, * matched loosely and correspond to the following sets: * * "ANY" = [\\u0000-\\U0010FFFF], - * "ASCII" = [\\u0000-\\u007F]. + * "ASCII" = [\\u0000-\\u007F], + * "Assigned" = [:^Cn:]. * * @param propLength the length of the prop, or -1 if NULL * diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 069c59fe2f..590c5a84b1 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -77,42 +77,12 @@ static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/ // Special property set IDs static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF] static const char ASCII[] = "ASCII"; // [\u0000-\u007F] +static const char ASSIGNED[] = "Assigned"; // [:^Cn:] // Unicode name property alias #define NAME_PROP "na" #define NAME_PROP_LENGTH 2 -// TODO: Remove the following special-case code when -// these four C99-compatibility properties are implemented -// as enums/names. -U_CDECL_BEGIN - typedef UBool (U_CALLCONV *C99_Property_Function)(UChar32); -U_CDECL_END -static const struct C99_Map { - const char* name; - C99_Property_Function func; - UPropertySource src; -} C99_DISPATCH[] = { - // These three entries omitted; they clash with PropertyAliases - // names for Unicode properties, so UnicodeSet already maps them - // to those properties. - //{ "alpha", u_isalpha, UPROPS_SRC_PROPSVEC }, - //{ "lower", u_islower, UPROPS_SRC_CASE }, - //{ "upper", u_isupper, UPROPS_SRC_CASE }, - - // MUST be in SORTED order - { "alnum", u_isalnum, UPROPS_SRC_CHAR }, - { "blank", u_isblank, UPROPS_SRC_PROPSVEC }, - // new alias in Unicode 4.1 { "cntrl", u_iscntrl, UPROPS_SRC_CHAR }, - // new alias in Unicode 4.1 { "digit", u_isdigit, UPROPS_SRC_CHAR }, - { "graph", u_isgraph, UPROPS_SRC_CHAR }, - { "print", u_isprint, UPROPS_SRC_CHAR }, - // new alias in Unicode 4.1 { "punct", u_ispunct, UPROPS_SRC_CHAR }, - // new alias in Unicode 4.1 { "space", u_isspace, UPROPS_SRC_CHAR }, - { "title", u_istitle, UPROPS_SRC_CHAR }, - { "xdigit", u_isxdigit, UPROPS_SRC_CHAR } -}; - // TEMPORARY: Remove when deprecated category code constructor is removed. static const UChar CATEGORY_NAMES[] = { // Must be kept in sync with uchar.h/UCharCategory @@ -931,14 +901,6 @@ static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { #define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;} -// TODO: Remove the following special-case code when -// these four C99-compatibility properties are implemented -// as enums/names. -static UBool c99Filter(UChar32 ch, void* context) { - struct C99_Map* m = (struct C99_Map*) context; - return m->func(ch); -} - UnicodeSet& UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) { if (U_FAILURE(ec)) return *this; @@ -974,7 +936,7 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop, UProperty p; int32_t v; - UBool mustNotBeEmpty = FALSE; + UBool mustNotBeEmpty = FALSE, invert = FALSE; if (value.length() > 0) { p = u_getPropertyEnum(pname); @@ -1081,22 +1043,12 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop, } else if (0 == uprv_comparePropertyNames(ASCII, pname)) { set(0, 0x7F); return *this; + } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname)) { + // [:Assigned:]=[:^Cn:] + p = UCHAR_GENERAL_CATEGORY_MASK; + v = U_GC_CN_MASK; + invert = TRUE; } else { - - // TODO: Remove the following special-case code when - // these four C99-compatibility properties are implemented - // as enums/names. - for (int32_t i=0; i{$long_name} . "|" . $long_name; + my $value; + if($pa->{$long_name} =~ m|^n/a\d*$|) { + $value = $long_name; + } else { + $value = $pa->{$long_name} . "|" . $long_name; + } if (exists $additional_property_aliases{$long_name}) { $value .= "|" . $additional_property_aliases{$long_name}; } @@ -689,8 +697,8 @@ sub merge_PropertyValueAliases { my $l = $n; my $r = $pva->{$n}; # convert |n/a\d+| to blank - $l = '' if ($l =~ m|^n/a\d+$|); - $r = '' if ($r =~ m|^n/a\d+$|); + $l = '' if ($l =~ m|^n/a\d*$|); + $r = '' if ($r =~ m|^n/a\d*$|); $hh->{$enum} = "$l|$r"; # Don't delete the 'gc' properties because we need to share @@ -766,8 +774,6 @@ sub read_PropertyAliases { my $in = new FileHandle($filename, 'r'); die "Error: Cannot open $filename" if (!defined $in); - my $sym = 0; # Used to make "n/a" strings unique - while (<$in>) { # Read version (embedded in a comment) @@ -795,9 +801,12 @@ sub read_PropertyAliases { } # Make "n/a" strings unique + if ($short eq 'n/a') { + $short .= sprintf("%03d", $propNA++); + } my $long = $fields[0]; if ($long eq 'n/a') { - $long .= sprintf("%03d", $sym++); + $long .= sprintf("%03d", $propNA++); } # Add long name->short name to the hash=pa hash table @@ -847,7 +856,7 @@ sub read_PropertyValueAliases { my $in = new FileHandle($filename, 'r'); die "Error: Cannot open $filename" if (!defined $in); - my $sym = 0; # Used to make "n/a" strings unique + my $valueNA = 0; # Used to make "n/a" strings unique while (<$in>) { @@ -868,7 +877,7 @@ sub read_PropertyValueAliases { die "Error: Wrong number of fields in $filename" if (@fields < 2 || @fields > 3); # Make "n/a" strings unique - $fields[0] .= sprintf("%03d", $sym++) if ($fields[0] eq 'n/a'); + $fields[0] .= sprintf("%03d", $valueNA++) if ($fields[0] eq 'n/a'); # Squash extra fields together while (@fields > 2) { my $f = pop @fields;