ICU-2657 add/change properties functions to match Java

X-SVN-Rev: 10999
2003-02-09 21:02:26 +00:00 · 2003-02-09 21:02:26 +00:00 · 77a600126a
commit 77a600126a
parent e87f9e1104
3 changed files with 143 additions and 90 deletions
--- a/icu4c/source/common/uchar.c
+++ b/icu4c/source/common/uchar.c
@ -316,10 +316,7 @@ U_CAPI UBool U_EXPORT2
 u_isdigit(UChar32 c) {
    uint32_t props;
    GET_PROPS(c, props);
-    return (UBool)(((1UL<<GET_CATEGORY(props))&
-            (1UL<<U_DECIMAL_DIGIT_NUMBER|1UL<<U_OTHER_NUMBER|1UL<<U_LETTER_NUMBER)
-           )!=0);
-    /* ### TODO: should this not check only U_DECIMAL_DIGIT_NUMBER?! */
+    return (UBool)(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER);
 }

 /* Checks if the Unicode character is a letter.*/
@ -327,9 +324,7 @@ U_CAPI UBool U_EXPORT2
 u_isalpha(UChar32 c) {
    uint32_t props;
    GET_PROPS(c, props);
-    return (UBool)(((1UL<<GET_CATEGORY(props))&
-            (1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER)
-           )!=0);
+    return (UBool)((U_MASK(GET_CATEGORY(props))&U_GC_L_MASK)!=0);
 }

 /* Checks if ch is a letter or a decimal digit */
@ -337,10 +332,7 @@ U_CAPI UBool U_EXPORT2
 u_isalnum(UChar32 c) {
    uint32_t props;
    GET_PROPS(c, props);
-    return (UBool)(((1UL<<GET_CATEGORY(props))&
-            (1UL<<U_DECIMAL_DIGIT_NUMBER|1UL<<U_OTHER_NUMBER|1UL<<U_LETTER_NUMBER|
-             1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER)
-           )!=0);
+    return (UBool)((U_MASK(GET_CATEGORY(props))&(U_GC_L_MASK|U_GC_ND_MASK))!=0);
 }

 /* Checks if ch is a unicode character with assigned character type.*/
@ -374,6 +366,11 @@ u_iscntrl(UChar32 c) {
           )!=0);
 }

+U_CAPI UBool U_EXPORT2
+u_isISOControl(UChar32 c) { /* TRUE for U+0000..U+001F and U+007F..U+009F */
+    return (uint32_t)c<=0x9f && (c<=0x1f || c>=0x7f); /* the unsigned cast makes a negative c fail the <=0x9f test (assumes UChar32 is signed -- confirm) */
+}
+
 /* Some control characters that are used as space. */
 #define IS_THAT_CONTROL_SPACE(c) \
    ((c>=TAB && c<=CR) || (c>=0x1c && c <=0x1f) || c==NL)
@ -388,16 +385,23 @@ u_isspace(UChar32 c) {
           )!=0) || IS_THAT_CONTROL_SPACE(c));
 }

+U_CAPI UBool U_EXPORT2
+u_isJavaSpaceChar(UChar32 c) { /* TRUE for general category "Z" (separators); no control-code special cases */
+    uint32_t props;
+    GET_PROPS(c, props);
+    return (UBool)((U_MASK(GET_CATEGORY(props))&U_GC_Z_MASK)!=0); /* is c's category bit among the "Z" categories? */
+}
+
 /* Checks if the Unicode character is a whitespace character.*/
 U_CAPI UBool U_EXPORT2
 u_isWhitespace(UChar32 c) {
    uint32_t props;
    GET_PROPS(c, props);
-    return (UBool)((((1UL<<GET_CATEGORY(props))&
-            (1UL<<U_SPACE_SEPARATOR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
-           )!=0 &&
-           c!=NBSP && c!=NNBSP && c!=ZWNBSP) || /* exclude no-break spaces */
-           IS_THAT_CONTROL_SPACE(c));
+    return (UBool)(
+                ((U_MASK(GET_CATEGORY(props))&U_GC_Z_MASK)!=0 && /* any "Z" separator category ... */
+                    c!=NBSP && c!=FIGURESP && c!=NNBSP) || /* ... except the no-break spaces U+00A0, U+2007, U+202F */
+                IS_THAT_CONTROL_SPACE(c) /* or one of the control codes TAB..CR, U+001C..U+001F, NL (U+0085) */
+           );
 }

 /* Checks if the Unicode character is printable.*/
@ -419,9 +423,7 @@ u_isIDStart(UChar32 c) {
    /* same as u_isalpha() */
    uint32_t props;
    GET_PROPS(c, props);
-    return (UBool)(((1UL<<GET_CATEGORY(props))&
-            (1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER)
-           )!=0);
+    return (UBool)((U_MASK(GET_CATEGORY(props))&(U_GC_L_MASK|U_GC_NL_MASK))!=0);
 }

 /* Checks if the Unicode character can be a Unicode identifier part other than starting the
@ -442,12 +444,13 @@ u_isIDPart(UChar32 c) {
 /*Checks if the Unicode character can be ignorable in a Java or Unicode identifier.*/
 U_CAPI UBool U_EXPORT2
 u_isIDIgnorable(UChar32 c) {
-    return (UBool)((uint32_t)c<=8 ||
-           (uint32_t)(c-0xe)<=(0x1b-0xe) ||
-           (uint32_t)(c-0x7f)<=(0x9f-0x7f) ||
-           (uint32_t)(c-0x200a)<=(0x200f-0x200a) ||
-           (uint32_t)(c-0x206a)<=(0x206f-0x206a) ||
-           c==0xfeff);
+    if(c<=0x9f) { /* at or below the ISO control range: decided by range tests only, no GET_PROPS */
+        return u_isISOControl(c) && !IS_THAT_CONTROL_SPACE(c); /* ISO controls minus TAB..CR, U+001C..U+001F and NL (U+0085) */
+    } else {
+        uint32_t props;
+        GET_PROPS(c, props);
+        return (UBool)(GET_CATEGORY(props)==U_FORMAT_CHAR); /* above U+009F: only general category "Cf" is ignorable */
+    }
 }

 /*Checks if the Unicode character can start a Java identifier.*/
@ -916,6 +919,7 @@ uchar_addPropertyStarts(USet *set) {

    /* add no-break spaces for u_isWhitespace() what was not added above */
    USET_ADD_CP_AND_NEXT(set, NBSP);
+    USET_ADD_CP_AND_NEXT(set, FIGURESP);
    USET_ADD_CP_AND_NEXT(set, NNBSP);

    /* add for u_charDigitValue() */
--- a/icu4c/source/common/unicode/uchar.h
+++ b/icu4c/source/common/unicode/uchar.h
@ -55,6 +55,11 @@ U_CDECL_BEGIN
 * as well as semantic information such as whether a character is a digit or 
 * uppercase, lowercase, or uncased.
 * <P>
+ *
+ * Many functions are designed to match java.lang.Character functions.
+ * See the individual function documentation,
+ * and see the JDK 1.4.1 java.lang.Character documentation
+ * at http://java.sun.com/j2se/1.4.1/docs/api/java/lang/Character.html
 */

 /**
@ -1445,6 +1450,7 @@ u_isUUppercase(UChar32 c);
 * @see UCHAR_WHITE_SPACE
 * @see u_isWhitespace
 * @see u_isspace
+ * @see u_isJavaSpaceChar
 * @see u_hasBinaryProperty
 * @draft ICU 2.1
 */
@ -1618,21 +1624,30 @@ U_CAPI UBool U_EXPORT2
 u_istitle(UChar32 c);

 /**
- * Determines whether the specified character is a digit according to UnicodeData.txt.
+ * Determines whether the specified code point is a digit character according to Java.
+ * True for characters with general category "Nd" (decimal digit numbers).
+ *
+ * Same as java.lang.Character.isDigit().
+ *
+ * @param c the code point to be tested
+ * @return TRUE if the code point is a digit character according to Character.isDigit()
 *
- * @param c    the character to be tested
- * @return  true if the character is a digit; false otherwise.
 * @stable ICU 2.0
 */
 U_CAPI UBool U_EXPORT2
 u_isdigit(UChar32 c);

 /**
- * Determines whether the specified character is an alphanumeric character
- * (letter or digit)according to UnicodeData.txt.
+ * Determines whether the specified code point is an alphanumeric character
+ * (letter or digit) according to Java.
+ * True for characters with general categories
+ * "L" (letters) and "Nd" (decimal digit numbers).
+ *
+ * Same as java.lang.Character.isLetterOrDigit().
+ *
+ * @param c the code point to be tested
+ * @return TRUE if the code point is an alphanumeric character according to Character.isLetterOrDigit()
 *
- * @param c    the character to be tested
- * @return  true if the character is a letter or a digit; false otherwise.
 * @stable ICU 2.0
 */
 U_CAPI UBool U_EXPORT2
@ -1672,40 +1687,63 @@ u_isalpha(UChar32 c);

 /**
 * Determines if the specified character is a space character or not.
+ * Same as u_isJavaSpaceChar() except that it is also TRUE for the ISO control codes used as whitespace (TAB..CR, U+001C..U+001F, U+0085).
 *
 * @param c    the character to be tested
 * @return  true if the character is a space character; false otherwise.
+ *
+ * @see u_isJavaSpaceChar
+ * @see u_isWhitespace
+ * @see u_isUWhiteSpace
 * @stable ICU 2.0
 */
 U_CAPI UBool U_EXPORT2
 u_isspace(UChar32 c);

 /**
- * Determines if the specified character is white space according to ICU.
- * A character is considered to be an ICU whitespace character if and only
- * if it satisfies one of the following criteria:
- * <ul>
- * <li> It is a Unicode space separator (category "Zs"), but is not
- *      a no-break space (&#92;u00A0 or &#92;uFEFF).
- * <li> It is a Unicode line separator (category "Zl").
- * <li> It is a Unicode paragraph separator (category "Zp").
- * <li> It is &#92;u0009, HORIZONTAL TABULATION.
- * <li> It is &#92;u000A, LINE FEED.
- * <li> It is &#92;u000B, VERTICAL TABULATION.
- * <li> It is &#92;u000C, FORM FEED.
- * <li> It is &#92;u000D, CARRIAGE RETURN.
- * <li> It is &#92;u001C, FILE SEPARATOR.
- * <li> It is &#92;u001D, GROUP SEPARATOR.
- * <li> It is &#92;u001E, RECORD SEPARATOR.
- * <li> It is &#92;u001F, UNIT SEPARATOR.
- * </ul>
- * Note: This method corresponds to the Java method
- * <tt>java.lang.Character.isWhitespace()</tt>.
+ * Determine if the specified code point is a space character according to Java.
+ * True for characters with general categories "Z" (separators),
+ * which does not include control codes (e.g., TAB or Line Feed).
 *
- * @param   ch  the character to be tested.
- * @return  true if the character is an ICU whitespace character;
- *          false otherwise.
- * @see     #u_isspace
+ * Same as java.lang.Character.isSpaceChar().
+ *
+ * @param c the code point to be tested
+ * @return TRUE if the code point is a space character according to Character.isSpaceChar()
+ *
+ * @see u_isspace
+ * @see u_isWhitespace
+ * @see u_isUWhiteSpace
+ * @draft ICU 2.6
+ */
+U_CAPI UBool U_EXPORT2
+u_isJavaSpaceChar(UChar32 c);
+
+/**
+ * Determines if the specified code point is a whitespace character according to Java/ICU.
+ * A character is considered to be a Java whitespace character if and only
+ * if it satisfies one of the following criteria:
+ *
+ * - It is a Unicode separator (categories "Z"), but is not
+ *      a no-break space (U+00A0 NBSP or U+2007 Figure Space or U+202F Narrow NBSP).
+ * - It is U+0009 HORIZONTAL TABULATION.
+ * - It is U+000A LINE FEED.
+ * - It is U+000B VERTICAL TABULATION.
+ * - It is U+000C FORM FEED.
+ * - It is U+000D CARRIAGE RETURN.
+ * - It is U+001C FILE SEPARATOR.
+ * - It is U+001D GROUP SEPARATOR.
+ * - It is U+001E RECORD SEPARATOR.
+ * - It is U+001F UNIT SEPARATOR.
+ * - It is U+0085 NEXT LINE.
+ *
+ * Same as java.lang.Character.isWhitespace() except that Java omits U+0085.
+ *
+ * @param c the code point to be tested
+ * @return TRUE if the code point is a whitespace character according to Java/ICU
+ *
+ * @see u_isspace
+ * @see u_isJavaSpaceChar
+ * @see u_isUWhiteSpace
 * @stable ICU 2.0
 */
 U_CAPI UBool U_EXPORT2
@ -1729,6 +1767,20 @@ u_isWhitespace(UChar32 c);
 U_CAPI UBool U_EXPORT2
 u_iscntrl(UChar32 c);

+/**
+ * Determines whether the specified code point is an ISO control code.
+ * True for U+0000..U+001f and U+007f..U+009f.
+ *
+ * Same as java.lang.Character.isISOControl().
+ *
+ * @param c the code point to be tested
+ * @return TRUE if the code point is an ISO control code
+ *
+ * @see u_iscntrl
+ * @draft ICU 2.6
+ */
+U_CAPI UBool U_EXPORT2
+u_isISOControl(UChar32 c);

 /**
 * Determines whether the specified character is a printable character according 
@ -2248,20 +2300,19 @@ U_CAPI int32_t U_EXPORT2
 u_getPropertyValueEnum(UProperty property,
                       const char* alias);

-/** 
- * The following functions are java specific.
- */
 /**
- * A convenience method for determining if a Unicode character 
- * is allowed to start in a Unicode identifier.
- * A character may start a Unicode identifier if and only if
- * it is a letter.
+ * Determines if the specified character is permissible as the
+ * first character in a Unicode identifier according to Java.
+ * TODO: confirm whether this also matches the Unicode definition of an identifier start.
+ * True for characters with general categories "L" (letters) and "Nl" (letter numbers).
 *
- * @param   c  the Unicode character.
- * @return  TRUE if the character may start a Unicode identifier;
- *          FALSE otherwise.
- * @see     u_isalpha
- * @see     u_isIDPart
+ * Same as java.lang.Character.isUnicodeIdentifierStart().
+ *
+ * @param c the code point to be tested
+ * @return TRUE if the code point may start an identifier according to Java
+ *
+ * @see u_isalpha
+ * @see u_isIDPart
 * @stable ICU 2.0
 */
 U_CAPI UBool U_EXPORT2
@ -2287,35 +2338,32 @@ u_isIDStart(UChar32 c);
 * @param   c  the Unicode character.
 * @return  TRUE if the character may be part of a Unicode identifier;
 *          FALSE otherwise.
- * @see     u_isIDIgnorable
+ *
 * @see     u_isIDStart
+ * @see     u_isIDIgnorable
 * @stable ICU 2.0
 */
 U_CAPI UBool U_EXPORT2
 u_isIDPart(UChar32 c);

 /**
- * A convenience method for determining if a Unicode character 
- * should be regarded as an ignorable character 
- * in a Unicode identifier.
- * <P>
- * The following Unicode characters are ignorable in a 
- * Unicode identifier:
- * <table>
- * <tr><td>0x0000 through 0x0008,</td>
- *                                 <td>ISO control characters that</td></tr>
- * <tr><td>0x000E through 0x001B,</td> <td>are not whitespace</td></tr>
- * <tr><td>and 0x007F through 0x009F</td></tr>
- * <tr><td>0x200C through 0x200F</td>  <td>join controls</td></tr>
- * <tr><td>0x200A through 0x200E</td>  <td>bidirectional controls</td></tr>
- * <tr><td>0x206A through 0x206F</td>  <td>format controls</td></tr>
- * <tr><td>0xFEFF</td>               <td>zero-width no-break space</td></tr>
- * </table>
- * 
- * @param   c  the Unicode character.
- * @return  TRUE if the character may be part of a Unicode identifier;
- *          FALSE otherwise.
- * @see     u_isIDPart
+ * Determines if the specified character should be regarded
+ * as an ignorable character in an identifier,
+ * according to Java.
+ * True for characters with general category "Cf" (format controls) as well as
+ * non-whitespace ISO controls
+ * (U+0000..U+0008, U+000E..U+001B, U+007F..U+0084, U+0086..U+009F).
+ *
+ * Same as java.lang.Character.isIdentifierIgnorable()
+ * except that Java also returns TRUE for U+0085 Next Line
+ * (it omits U+0085 from whitespace ISO controls).
+ *
+ * @param c the code point to be tested
+ * @return TRUE if the code point is ignorable in identifiers according to Java
+ *
+ * @see UCHAR_DEFAULT_IGNORABLE_CODE_POINT
+ * @see u_isIDStart
+ * @see u_isIDPart
 * @stable ICU 2.0
 */
 U_CAPI UBool U_EXPORT2
--- a/icu4c/source/common/uprops.h
+++ b/icu4c/source/common/uprops.h
@ -247,6 +247,7 @@ enum {
    NL      =0x0085,
    NBSP    =0x00a0,
    CGJ     =0x034f,
+    FIGURESP=0x2007,
    HAIRSP  =0x200a,
    ZWNJ    =0x200c,
    ZWJ     =0x200d,