ICU-2831 add 4 new C ctype.h/POSIX-style functions

X-SVN-Rev: 11687
This commit is contained in:
Markus Scherer 2003-04-24 23:09:26 +00:00
parent 55f4855c84
commit 9f32af0974
2 changed files with 216 additions and 13 deletions

View File

@ -323,6 +323,22 @@ u_isdigit(UChar32 c) {
return (UBool)(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER);
}
/*
 * Checks if the code point is a hexadecimal digit:
 * a Latin letter a-f/A-F (ASCII or Fullwidth ASCII) or a decimal digit (Nd).
 */
U_CAPI UBool U_EXPORT2
u_isxdigit(UChar32 c) {
    uint32_t props;

    /* ASCII A-F (U+0041..U+0046) and a-f (U+0061..U+0066) */
    if((c>=0x41 && c<=0x46) || (c>=0x61 && c<=0x66)) {
        return TRUE;
    }

    /* Fullwidth ASCII A-F (U+FF21..U+FF26) and a-f (U+FF41..U+FF46) */
    if((c>=0xff21 && c<=0xff26) || (c>=0xff41 && c<=0xff46)) {
        return TRUE;
    }

    /* not a hex letter: accept only general category Nd */
    GET_PROPS(c, props);
    return (UBool)(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER);
}
/* Checks if the Unicode character is a letter.*/
U_CAPI UBool U_EXPORT2
u_isalpha(UChar32 c) {
@ -399,6 +415,16 @@ u_isWhitespace(UChar32 c) {
);
}
/*
 * Checks if the code point is a "blank" (horizontal space):
 * TAB, SPACE, or a White_Space character above U+009F other than LS/PS.
 */
U_CAPI UBool U_EXPORT2
u_isblank(UChar32 c) {
    /* the unsigned comparison also routes negative values to the general case */
    if((uint32_t)c>0x9f) {
        /* White_Space, but neither LS (Zl, U+2028) nor PS (Zp, U+2029) */
        return (UBool)(u_isUWhiteSpace(c) && c!=0x2028 && c!=0x2029);
    }

    /* U+0000..U+009F: only SPACE and TAB are blanks */
    return (UBool)(c==0x20 || c==9);
}
/* Checks if the Unicode character is printable.*/
U_CAPI UBool U_EXPORT2
u_isprint(UChar32 c) {
@ -408,6 +434,23 @@ u_isprint(UChar32 c) {
return (UBool)((CAT_MASK(props)&U_GC_C_MASK)==0);
}
/*
 * Checks if the code point is a "graphic" character:
 * anything that is not Cc, Cf, Cs, Cn or a separator (Z).
 */
U_CAPI UBool U_EXPORT2
u_isgraph(UChar32 c) {
    uint32_t props;

    GET_PROPS(c, props);

    /* a hit on any of the excluded category bits makes the result FALSE */
    return (UBool)!(CAT_MASK(props)&
                    (U_GC_Z_MASK|U_GC_CN_MASK|U_GC_CS_MASK|U_GC_CF_MASK|U_GC_CC_MASK));
}
/* Checks if the code point has one of the punctuation (P) general categories. */
U_CAPI UBool U_EXPORT2
u_ispunct(UChar32 c) {
    uint32_t props;

    GET_PROPS(c, props);

    /* TRUE exactly when the category bit of c lies inside the P mask */
    return (UBool)(0!=(U_GC_P_MASK&CAT_MASK(props)));
}
/* Checks if the Unicode character can start a Unicode identifier.*/
U_CAPI UBool U_EXPORT2
u_isIDStart(UChar32 c) {
@ -706,6 +749,10 @@ u_digit(UChar32 ch, int8_t radix) {
value=(int8_t)(ch-0x57); /* ch - 'a' + 10 */
} else if(ch>=0x41 && ch<=0x5A) {
value=(int8_t)(ch-0x37); /* ch - 'A' + 10 */
} else if(ch>=0xFF41 && ch<=0xFF5A) {
value=(int8_t)(ch-0xFF37); /* fullwidth ASCII a-z */
} else if(ch>=0xFF21 && ch<=0xFF3A) {
value=(int8_t)(ch-0xFF17); /* fullwidth ASCII A-Z */
}
}
} else {

View File

@ -62,6 +62,27 @@ U_CDECL_BEGIN
* See the individual function documentation,
* and see the JDK 1.4.1 java.lang.Character documentation
* at http://java.sun.com/j2se/1.4.1/docs/api/java/lang/Character.html
*
* There are also functions that provide easy migration from C/POSIX functions
* like isblank(). Their use is generally discouraged because the C/POSIX
* standards do not define their semantics beyond the ASCII range, which means
* that different implementations exhibit very different behavior.
* Instead, Unicode properties should be used directly.
*
* There are also only a few, broad C/POSIX character classes, and they tend
* to be used for conflicting purposes. For example, the "isalpha()" class
* is sometimes used to determine word boundaries, while a more sophisticated
* approach would at least distinguish initial letters from continuation
* characters (the latter including combining marks).
* (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
* Another example: There is no "istitle()" class for titlecase characters.
*
* A summary of the behavior of some C/POSIX character classification implementations
* for Unicode is available at http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/posix_classes.html
*
* <strong>Important</strong>:
* The behavior of the ICU C/POSIX-style character classification
* functions is subject to change according to discussion of the above summary.
*/
/**
@ -1332,11 +1353,12 @@ u_isUUppercase(UChar32 c);
*
* Comparison:
* - u_isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
* general categories "Z" (separators) + whitespace ISO controls
* (including no-break spaces)
* most of general categories "Z" (separators) + most whitespace ISO controls
* (including no-break spaces, but excluding IS1..IS4 and ZWSP)
* - u_isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
* - u_isJavaSpaceChar: Java isSpaceChar; just Z (including no-break spaces)
* - u_isspace: Z + whitespace ISO controls (including no-break spaces)
* - u_isblank: "horizontal spaces" = TAB + Zs - ZWSP
*
* @param c Code point to test
* @return true if the code point has the White_Space Unicode property, false otherwise.
@ -1483,6 +1505,11 @@ u_getNumericValue(UChar32 c);
* have a different general category value.
* In order to include those, use UCHAR_LOWERCASE.
*
* In addition to being equivalent to a Java function, this also serves
* as a C/POSIX migration function.
* See the comments about C/POSIX character classification functions in the
* documentation at the top of this header file.
*
* @param c the code point to be tested
* @return TRUE if the code point is an Ll lowercase letter
*
@ -1505,6 +1532,11 @@ u_islower(UChar32 c);
* have a different general category value.
* In order to include those, use UCHAR_UPPERCASE.
*
* In addition to being equivalent to a Java function, this also serves
* as a C/POSIX migration function.
* See the comments about C/POSIX character classification functions in the
* documentation at the top of this header file.
*
* @param c the code point to be tested
* @return TRUE if the code point is an Lu uppercase letter
*
@ -1542,6 +1574,11 @@ u_istitle(UChar32 c);
*
* Same as java.lang.Character.isDigit().
*
* In addition to being equivalent to a Java function, this also serves
* as a C/POSIX migration function.
* See the comments about C/POSIX character classification functions in the
* documentation at the top of this header file.
*
* @param c the code point to be tested
* @return TRUE if the code point is a digit character according to Character.isDigit()
*
@ -1556,6 +1593,11 @@ u_isdigit(UChar32 c);
*
* Same as java.lang.Character.isLetter().
*
* In addition to being equivalent to a Java function, this also serves
* as a C/POSIX migration function.
* See the comments about C/POSIX character classification functions in the
* documentation at the top of this header file.
*
* @param c the code point to be tested
* @return TRUE if the code point is a letter character
*
@ -1574,6 +1616,11 @@ u_isalpha(UChar32 c);
*
* Same as java.lang.Character.isLetterOrDigit().
*
* In addition to being equivalent to a Java function, this also serves
* as a C/POSIX migration function.
* See the comments about C/POSIX character classification functions in the
* documentation at the top of this header file.
*
* @param c the code point to be tested
* @return TRUE if the code point is an alphanumeric character according to Character.isLetterOrDigit()
*
@ -1582,14 +1629,107 @@ u_isalpha(UChar32 c);
U_CAPI UBool U_EXPORT2
u_isalnum(UChar32 c);
/**
* Determines whether the specified code point is a hexadecimal digit.
* This is equivalent to u_digit(c, 16)>=0.
* TRUE for characters with general category "Nd" (decimal digit numbers)
* as well as Latin letters a-f and A-F in both ASCII and Fullwidth ASCII.
* (That is, for letters with code points
* 0041..0046, 0061..0066, FF21..FF26, FF41..FF46.)
*
* Because "Nd" is not limited to the ASCII digits 0-9, decimal digits
* outside of ASCII are accepted as well.
* In order to narrow the definition of hexadecimal digits to only ASCII
* characters, use (c<=0x7f && u_isxdigit(c)).
*
* This is a C/POSIX migration function.
* See the comments about C/POSIX character classification functions in the
* documentation at the top of this header file.
*
* @param c the code point to be tested
* @return TRUE if the code point is a hexadecimal digit, FALSE otherwise
*
* @draft ICU 2.6
*/
U_CAPI UBool U_EXPORT2
u_isxdigit(UChar32 c);
/**
* Determines whether the specified code point is a punctuation character.
* TRUE for characters with any of the general categories "P" (punctuation).
* Characters with other general categories, for example "S" (symbols),
* are not included.
*
* This is a C/POSIX migration function.
* See the comments about C/POSIX character classification functions in the
* documentation at the top of this header file.
*
* @param c the code point to be tested
* @return TRUE if the code point is a punctuation character, FALSE otherwise
*
* @draft ICU 2.6
*/
U_CAPI UBool U_EXPORT2
u_ispunct(UChar32 c);
/**
* Determines whether the specified code point is a "graphic" character
* (printable, excluding spaces).
* TRUE for all characters except those with general categories
* "Cc" (control codes), "Cf" (format controls), "Cs" (surrogates),
* "Cn" (unassigned), and "Z" (separators).
* Characters with any other general category are "graphic".
*
* This is a C/POSIX migration function.
* See the comments about C/POSIX character classification functions in the
* documentation at the top of this header file.
*
* @param c the code point to be tested
* @return TRUE if the code point is a "graphic" character, FALSE otherwise
*
* @draft ICU 2.6
*/
U_CAPI UBool U_EXPORT2
u_isgraph(UChar32 c);
/**
* Determines whether the specified code point is a "blank" or "horizontal space",
* a character that visibly separates words on a line.
* The following are equivalent definitions:
*
* TRUE for Unicode White_Space characters except for "vertical space controls"
* where "vertical space controls" contains
* U+000A (LF) U+000B (VT) U+000C (FF) U+000D (CR) U+0085 (NEL) U+2028 (LS) U+2029 (PS)
*
* same as
*
* TRUE for U+0009 (TAB) and characters with general category "Zs" (space separators)
* except Zero Width Space (ZWSP, U+200B).
*
* In the range U+0000..U+009F, only U+0009 (TAB) and U+0020 (SPACE) are "blank".
* Above that range, no-break spaces are included because they have
* the White_Space property.
*
* Comparison:
* - u_isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
* most of general categories "Z" (separators) + most whitespace ISO controls
* (including no-break spaces, but excluding IS1..IS4 and ZWSP)
* - u_isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
* - u_isJavaSpaceChar: Java isSpaceChar; just Z (including no-break spaces)
* - u_isspace: Z + whitespace ISO controls (including no-break spaces)
* - u_isblank: "horizontal spaces" = TAB + Zs - ZWSP
*
* This is a C/POSIX migration function.
* See the comments about C/POSIX character classification functions in the
* documentation at the top of this header file.
*
* @param c the code point to be tested
* @return TRUE if the code point is a "blank", FALSE otherwise
*
* @draft ICU 2.6
*/
U_CAPI UBool U_EXPORT2
u_isblank(UChar32 c);
/**
* Determines whether the specified code point is "defined",
* which usually means that it is assigned a character.
* True for general categories other than "Cn" (other, not assigned),
* i.e., true for all code points mentioned in UnicodeData.txt.
*
* Note that it is true for non-character code points (e.g., U+FDD0)
* but not for surrogate code points (Cs).
* Note that non-character code points (e.g., U+FDD0) are not "defined"
* (they are Cn), but surrogate code points are "defined" (Cs).
*
* Same as java.lang.Character.isDefined().
*
@ -1612,11 +1752,16 @@ u_isdefined(UChar32 c);
*
* Comparison:
* - u_isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
* general categories "Z" (separators) + whitespace ISO controls
* (including no-break spaces)
* most of general categories "Z" (separators) + most whitespace ISO controls
* (including no-break spaces, but excluding IS1..IS4 and ZWSP)
* - u_isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
* - u_isJavaSpaceChar: Java isSpaceChar; just Z (including no-break spaces)
* - u_isspace: Z + whitespace ISO controls (including no-break spaces)
* - u_isblank: "horizontal spaces" = TAB + Zs - ZWSP
*
* This is a C/POSIX migration function.
* See the comments about C/POSIX character classification functions in the
* documentation at the top of this header file.
*
* @param c the character to be tested
* @return true if the character is a space character; false otherwise.
@ -1638,11 +1783,12 @@ u_isspace(UChar32 c);
*
* Comparison:
* - u_isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
* general categories "Z" (separators) + whitespace ISO controls
* (including no-break spaces)
* most of general categories "Z" (separators) + most whitespace ISO controls
* (including no-break spaces, but excluding IS1..IS4 and ZWSP)
* - u_isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
* - u_isJavaSpaceChar: Java isSpaceChar; just Z (including no-break spaces)
* - u_isspace: Z + whitespace ISO controls (including no-break spaces)
* - u_isblank: "horizontal spaces" = TAB + Zs - ZWSP
*
* @param c the code point to be tested
* @return TRUE if the code point is a space character according to Character.isSpaceChar()
@ -1677,11 +1823,12 @@ u_isJavaSpaceChar(UChar32 c);
*
* Comparison:
* - u_isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
* general categories "Z" (separators) + whitespace ISO controls
* (including no-break spaces)
* most of general categories "Z" (separators) + most whitespace ISO controls
* (including no-break spaces, but excluding IS1..IS4 and ZWSP)
* - u_isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
* - u_isJavaSpaceChar: Java isSpaceChar; just Z (including no-break spaces)
* - u_isspace: Z + whitespace ISO controls (including no-break spaces)
* - u_isblank: "horizontal spaces" = TAB + Zs - ZWSP
*
* @param c the code point to be tested
* @return TRUE if the code point is a whitespace character according to Java/ICU
@ -1704,6 +1851,10 @@ u_isWhitespace(UChar32 c);
* - U_LINE_SEPARATOR (Zl)
* - U_PARAGRAPH_SEPARATOR (Zp)
*
* This is a C/POSIX migration function.
* See the comments about C/POSIX character classification functions in the
* documentation at the top of this header file.
*
* @param c the code point to be tested
* @return TRUE if the code point is a control character
*
@ -1733,6 +1884,10 @@ u_isISOControl(UChar32 c);
* Determines whether the specified code point is a printable character.
* True for general categories <em>other</em> than "C" (controls).
*
* This is a C/POSIX migration function.
* See the comments about C/POSIX character classification functions in the
* documentation at the top of this header file.
*
* @param c the code point to be tested
* @return TRUE if the code point is a printable character
*
@ -2520,11 +2675,12 @@ u_foldCase(UChar32 c, uint32_t options);
* <li>The character is one of the lowercase Latin letters
* <code>'a'</code> through <code>'z'</code>.
* In this case the value is <code>ch-'a'+10</code>.</li>
* <li>The character is one of the primary numeric Han characters.
* In this case, the value is the digit value for that Han character.</li>
* <li>Latin letters from both the ASCII range (0061..007A, 0041..005A)
* as well as from the Fullwidth ASCII range (FF41..FF5A, FF21..FF3A)
* are recognized.</li>
* </ul>
*
* Same as java.lang.Character.digit() except that Java does not handle Han digits.
* Same as java.lang.Character.digit().
*
* @param c the code point to be tested.
* @param radix the radix.