ICU-2831 add 4 new C ctype.h/POSIX-style functions

X-SVN-Rev: 11687
2003-04-24 23:09:26 +00:00 · 2003-04-24 23:09:26 +00:00 · 9f32af0974
commit 9f32af0974
parent 55f4855c84
2 changed files with 216 additions and 13 deletions
--- a/icu4c/source/common/uchar.c
+++ b/icu4c/source/common/uchar.c
@ -323,6 +323,22 @@ u_isdigit(UChar32 c) {
    return (UBool)(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER);
 }

+U_CAPI UBool U_EXPORT2
+u_isxdigit(UChar32 c) {
+    uint32_t props;
+
+    /* check ASCII and Fullwidth ASCII a-fA-F */
+    if(
+        (c<=0x66 && c>=0x41 && (c<=0x46 || c>=0x61)) ||
+        (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41))
+    ) {
+        return TRUE;
+    }
+
+    GET_PROPS(c, props);
+    return (UBool)(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER);
+}
+
 /* Checks if the Unicode character is a letter.*/
 U_CAPI UBool U_EXPORT2
 u_isalpha(UChar32 c) {
@ -399,6 +415,16 @@ u_isWhitespace(UChar32 c) {
           );
 }

+U_CAPI UBool U_EXPORT2
+u_isblank(UChar32 c) {
+    if((uint32_t)c<=0x9f) {
+        return c==9 || c==0x20; /* TAB or SPACE */
+    } else {
+        /* White_Space but not LS (Zl) or PS (Zp) */
+        return u_isUWhiteSpace(c) && ((c&0xfffffffe)!=0x2028);
+    }
+}
+
 /* Checks if the Unicode character is printable.*/
 U_CAPI UBool U_EXPORT2
 u_isprint(UChar32 c) {
@ -408,6 +434,23 @@ u_isprint(UChar32 c) {
    return (UBool)((CAT_MASK(props)&U_GC_C_MASK)==0);
 }

+U_CAPI UBool U_EXPORT2
+u_isgraph(UChar32 c) {
+    uint32_t props;
+    GET_PROPS(c, props);
+    /* comparing ==0 returns FALSE for the categories mentioned */
+    return (UBool)((CAT_MASK(props)&
+                    (U_GC_CC_MASK|U_GC_CF_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK))
+                   ==0);
+}
+
+U_CAPI UBool U_EXPORT2
+u_ispunct(UChar32 c) {
+    uint32_t props;
+    GET_PROPS(c, props);
+    return (UBool)((CAT_MASK(props)&U_GC_P_MASK)!=0);
+}
+
 /* Checks if the Unicode character can start a Unicode identifier.*/
 U_CAPI UBool U_EXPORT2
 u_isIDStart(UChar32 c) {
@ -706,6 +749,10 @@ u_digit(UChar32 ch, int8_t radix) {
                value=(int8_t)(ch-0x57);  /* ch - 'a' + 10 */
            } else if(ch>=0x41 && ch<=0x5A) {
                value=(int8_t)(ch-0x37);  /* ch - 'A' + 10 */
+            } else if(ch>=0xFF41 && ch<=0xFF5A) {
+                value=(int8_t)(ch-0xFF37);  /* fullwidth ASCII a-z */
+            } else if(ch>=0xFF21 && ch<=0xFF3A) {
+                value=(int8_t)(ch-0xFF17);  /* fullwidth ASCII A-Z */
            }
        }
    } else {
--- a/icu4c/source/common/unicode/uchar.h
+++ b/icu4c/source/common/unicode/uchar.h
@ -62,6 +62,27 @@ U_CDECL_BEGIN
 * See the individual function documentation,
 * and see the JDK 1.4.1 java.lang.Character documentation
 * at http://java.sun.com/j2se/1.4.1/docs/api/java/lang/Character.html
+ *
+ * There are also functions that provide easy migration from C/POSIX functions
+ * like isblank(). Their use is generally discouraged because the C/POSIX
+ * standards do not define their semantics beyond the ASCII range, which means
+ * that different implementations exhibit very different behavior.
+ * Instead, Unicode properties should be used directly.
+ *
+ * There are also only a few, broad C/POSIX character classes, and they tend
+ * to be used for conflicting purposes. For example, the "isalpha()" class
+ * is sometimes used to determine word boundaries, while a more sophisticated
+ * approach would at least distinguish initial letters from continuation
+ * characters (the latter including combining marks).
+ * (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
+ * Another example: There is no "istitle()" class for titlecase characters.
+ *
+ * A summary of the behavior of some C/POSIX character classification implementations
+ * for Unicode is available at http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/posix_classes.html
+ *
+ * <strong>Important</strong>:
+ * The behavior of the ICU C/POSIX-style character classification
+ * functions is subject to change according to discussion of the above summary.
 */

 /**
@ -1332,11 +1353,12 @@ u_isUUppercase(UChar32 c);
 *
 * Comparison:
 * - u_isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
- *       general categories "Z" (separators) + whitespace ISO controls
- *       (including no-break spaces)
+ *       most of general categories "Z" (separators) + most whitespace ISO controls
+ *       (including no-break spaces, but excluding IS1..IS4 and ZWSP)
 * - u_isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
 * - u_isJavaSpaceChar: Java isSpaceChar; just Z (including no-break spaces)
 * - u_isspace: Z + whitespace ISO controls (including no-break spaces)
+ * - u_isblank: "horizontal spaces" = TAB + Zs - ZWSP
 *
 * @param c Code point to test
 * @return true if the code point has the White_Space Unicode property, false otherwise.
@ -1483,6 +1505,11 @@ u_getNumericValue(UChar32 c);
 * have a different general category value.
 * In order to include those, use UCHAR_LOWERCASE.
 *
+ * In addition to being equivalent to a Java function, this also serves
+ * as a C/POSIX migration function.
+ * See the comments about C/POSIX character classification functions in the
+ * documentation at the top of this header file.
+ *
 * @param c the code point to be tested
 * @return TRUE if the code point is an Ll lowercase letter
 *
@ -1505,6 +1532,11 @@ u_islower(UChar32 c);
 * have a different general category value.
 * In order to include those, use UCHAR_UPPERCASE.
 *
+ * In addition to being equivalent to a Java function, this also serves
+ * as a C/POSIX migration function.
+ * See the comments about C/POSIX character classification functions in the
+ * documentation at the top of this header file.
+ *
 * @param c the code point to be tested
 * @return TRUE if the code point is an Lu uppercase letter
 *
@ -1542,6 +1574,11 @@ u_istitle(UChar32 c);
 *
 * Same as java.lang.Character.isDigit().
 *
+ * In addition to being equivalent to a Java function, this also serves
+ * as a C/POSIX migration function.
+ * See the comments about C/POSIX character classification functions in the
+ * documentation at the top of this header file.
+ *
 * @param c the code point to be tested
 * @return TRUE if the code point is a digit character according to Character.isDigit()
 *
@ -1556,6 +1593,11 @@ u_isdigit(UChar32 c);
 *
 * Same as java.lang.Character.isLetter().
 *
+ * In addition to being equivalent to a Java function, this also serves
+ * as a C/POSIX migration function.
+ * See the comments about C/POSIX character classification functions in the
+ * documentation at the top of this header file.
+ *
 * @param c the code point to be tested
 * @return TRUE if the code point is a letter character
 *
@ -1574,6 +1616,11 @@ u_isalpha(UChar32 c);
 *
 * Same as java.lang.Character.isLetterOrDigit().
 *
+ * In addition to being equivalent to a Java function, this also serves
+ * as a C/POSIX migration function.
+ * See the comments about C/POSIX character classification functions in the
+ * documentation at the top of this header file.
+ *
 * @param c the code point to be tested
 * @return TRUE if the code point is an alphanumeric character according to Character.isLetterOrDigit()
 *
@ -1582,14 +1629,107 @@ u_isalpha(UChar32 c);
 U_CAPI UBool U_EXPORT2
 u_isalnum(UChar32 c);

+/**
+ * Determines whether the specified code point is a hexadecimal digit.
+ * This is equivalent to u_digit(c, 16)>=0.
+ * True for characters with general category "Nd" (decimal digit numbers)
+ * as well as Latin letters a-f and A-F in both ASCII and Fullwidth ASCII.
+ * (That is, for letters with code points
+ * 0041..0046, 0061..0066, FF21..FF26, FF41..FF46.)
+ *
+ * In order to narrow the definition of hexadecimal digits to only ASCII
+ * characters, use (c<=0x7f && u_isxdigit(c)).
+ *
+ * This is a C/POSIX migration function.
+ * See the comments about C/POSIX character classification functions in the
+ * documentation at the top of this header file.
+ *
+ * @param c the code point to be tested
+ * @return TRUE if the code point is a hexadecimal digit
+ *
+ * @draft ICU 2.6
+ */
+U_CAPI UBool U_EXPORT2
+u_isxdigit(UChar32 c);
+
+/**
+ * Determines whether the specified code point is a punctuation character.
+ * True for characters with general categories "P" (punctuation).
+ *
+ * This is a C/POSIX migration function.
+ * See the comments about C/POSIX character classification functions in the
+ * documentation at the top of this header file.
+ *
+ * @param c the code point to be tested
+ * @return TRUE if the code point is a punctuation character
+ *
+ * @draft ICU 2.6
+ */
+U_CAPI UBool U_EXPORT2
+u_ispunct(UChar32 c);
+
+/**
+ * Determines whether the specified code point is a "graphic" character
+ * (printable, excluding spaces).
+ * TRUE for all characters except those with general categories
+ * "Cc" (control codes), "Cf" (format controls), "Cs" (surrogates),
+ * "Cn" (unassigned), and "Z" (separators).
+ *
+ * This is a C/POSIX migration function.
+ * See the comments about C/POSIX character classification functions in the
+ * documentation at the top of this header file.
+ *
+ * @param c the code point to be tested
+ * @return TRUE if the code point is a "graphic" character
+ *
+ * @draft ICU 2.6
+ */
+U_CAPI UBool U_EXPORT2
+u_isgraph(UChar32 c);
+
+/**
+ * Determines whether the specified code point is a "blank" or "horizontal space",
+ * a character that visibly separates words on a line.
+ * The following are equivalent definitions:
+ *
+ * TRUE for Unicode White_Space characters except for "vertical space controls"
+ * where "vertical space controls" contains
+ * U+000A (LF) U+000B (VT) U+000C (FF) U+000D (CR) U+0085 (NEL) U+2028 (LS) U+2029 (PS)
+ *
+ * same as
+ *
+ * TRUE for U+0009 (TAB) and characters with general category "Zs" (space separators)
+ * except Zero Width Space (ZWSP, U+200B).
+ *
+ * Comparison:
+ * - u_isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
+ *       most of general categories "Z" (separators) + most whitespace ISO controls
+ *       (including no-break spaces, but excluding IS1..IS4 and ZWSP)
+ * - u_isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
+ * - u_isJavaSpaceChar: Java isSpaceChar; just Z (including no-break spaces)
+ * - u_isspace: Z + whitespace ISO controls (including no-break spaces)
+ * - u_isblank: "horizontal spaces" = TAB + Zs - ZWSP
+ *
+ * This is a C/POSIX migration function.
+ * See the comments about C/POSIX character classification functions in the
+ * documentation at the top of this header file.
+ *
+ * @param c the code point to be tested
+ * @return TRUE if the code point is a "blank"
+ *
+ * @draft ICU 2.6
+ */
+U_CAPI UBool U_EXPORT2
+u_isblank(UChar32 c);
+
 /**
 * Determines whether the specified code point is "defined",
 * which usually means that it is assigned a character.
 * True for general categories other than "Cn" (other, not assigned),
 * i.e., true for all code points mentioned in UnicodeData.txt.
 *
- * Note that it is true for non-character code points (e.g., U+FDD0)
- * but not for surrogate code points (Cs).
+ * Note that non-character code points (e.g., U+FDD0) are not "defined"
+ * (they are Cn), but surrogate code points are "defined" (Cs).
 *
 * Same as java.lang.Character.isDefined().
 *
@ -1612,11 +1752,16 @@ u_isdefined(UChar32 c);
 *
 * Comparison:
 * - u_isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
- *       general categories "Z" (separators) + whitespace ISO controls
- *       (including no-break spaces)
+ *       most of general categories "Z" (separators) + most whitespace ISO controls
+ *       (including no-break spaces, but excluding IS1..IS4 and ZWSP)
 * - u_isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
 * - u_isJavaSpaceChar: Java isSpaceChar; just Z (including no-break spaces)
 * - u_isspace: Z + whitespace ISO controls (including no-break spaces)
+ * - u_isblank: "horizontal spaces" = TAB + Zs - ZWSP
+ *
+ * This is a C/POSIX migration function.
+ * See the comments about C/POSIX character classification functions in the
+ * documentation at the top of this header file.
 *
 * @param c    the character to be tested
 * @return  true if the character is a space character; false otherwise.
@ -1638,11 +1783,12 @@ u_isspace(UChar32 c);
 *
 * Comparison:
 * - u_isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
- *       general categories "Z" (separators) + whitespace ISO controls
- *       (including no-break spaces)
+ *       most of general categories "Z" (separators) + most whitespace ISO controls
+ *       (including no-break spaces, but excluding IS1..IS4 and ZWSP)
 * - u_isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
 * - u_isJavaSpaceChar: Java isSpaceChar; just Z (including no-break spaces)
 * - u_isspace: Z + whitespace ISO controls (including no-break spaces)
+ * - u_isblank: "horizontal spaces" = TAB + Zs - ZWSP
 *
 * @param c the code point to be tested
 * @return TRUE if the code point is a space character according to Character.isSpaceChar()
@ -1677,11 +1823,12 @@ u_isJavaSpaceChar(UChar32 c);
 *
 * Comparison:
 * - u_isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
- *       general categories "Z" (separators) + whitespace ISO controls
- *       (including no-break spaces)
+ *       most of general categories "Z" (separators) + most whitespace ISO controls
+ *       (including no-break spaces, but excluding IS1..IS4 and ZWSP)
 * - u_isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
 * - u_isJavaSpaceChar: Java isSpaceChar; just Z (including no-break spaces)
 * - u_isspace: Z + whitespace ISO controls (including no-break spaces)
+ * - u_isblank: "horizontal spaces" = TAB + Zs - ZWSP
 *
 * @param c the code point to be tested
 * @return TRUE if the code point is a whitespace character according to Java/ICU
@ -1704,6 +1851,10 @@ u_isWhitespace(UChar32 c);
 * - U_LINE_SEPARATOR (Zl)
 * - U_PARAGRAPH_SEPARATOR (Zp)
 *
+ * This is a C/POSIX migration function.
+ * See the comments about C/POSIX character classification functions in the
+ * documentation at the top of this header file.
+ *
 * @param c the code point to be tested
 * @return TRUE if the code point is a control character
 *
@ -1733,6 +1884,10 @@ u_isISOControl(UChar32 c);
 * Determines whether the specified code point is a printable character.
 * True for general categories <em>other</em> than "C" (controls).
 *
+ * This is a C/POSIX migration function.
+ * See the comments about C/POSIX character classification functions in the
+ * documentation at the top of this header file.
+ *
 * @param c the code point to be tested
 * @return TRUE if the code point is a printable character
 *
@ -2520,11 +2675,12 @@ u_foldCase(UChar32 c, uint32_t options);
 * <li>The character is one of the lowercase Latin letters
 *     <code>'a'</code> through <code>'z'</code>.
 *     In this case the value is <code>ch-'a'+10</code>.</li>
- * <li>The character is one of the primary numeric Han characters.
- *     In this case, the value is the digit value for that Han character.</li>
+ * <li>Latin letters from both the ASCII range (0061..007A, 0041..005A)
+ *     as well as from the Fullwidth ASCII range (FF41..FF5A, FF21..FF3A)
+ *     are recognized.</li>
 * </ul>
 *
- * Same as java.lang.Character.digit() except that Java does not handle Han digits.
+ * Same as java.lang.Character.digit().
 *
 * @param   c       the code point to be tested.
 * @param   radix   the radix.