1999-08-16 21:50:52 +00:00
|
|
|
/*
|
|
|
|
*******************************************************************************
|
1999-11-22 20:25:35 +00:00
|
|
|
* Copyright (C) 1996-1999, International Business Machines Corporation and *
|
|
|
|
* others. All Rights Reserved. *
|
1999-08-16 21:50:52 +00:00
|
|
|
*******************************************************************************
|
|
|
|
*/
|
|
|
|
// FILE NAME : unicode.cpp
|
|
|
|
//
|
|
|
|
// CREATED
|
|
|
|
// Wednesday, December 11, 1996
|
|
|
|
//
|
|
|
|
// CHANGES
|
|
|
|
// Wednesday, February 4, 1998
|
|
|
|
// Changed logic in toUpperCase and toLowerCase in order
|
|
|
|
// to avoid 0xFFFF to be returned when receiving
|
|
|
|
// confusing Unichar to lowercase or to uppercase
|
|
|
|
// (e.g. Letterlike symbols)
|
|
|
|
//
|
|
|
|
// CHANGES BY
|
|
|
|
// Bertramd A. DAMIBA
|
|
|
|
//
|
|
|
|
// CREATED BY
|
|
|
|
// Helena Shih
|
|
|
|
//
|
|
|
|
// CHANGES
|
|
|
|
// Thursday, April 15, 1999
|
|
|
|
// Modified the definitions of all the functions
|
|
|
|
// C++ Wrappers for Unicode
|
|
|
|
// CHANGES BY
|
|
|
|
// Madhu Katragadda
|
|
|
|
// 5/20/99 Madhu Added the function u_getVersion()
|
|
|
|
// 07/09/99 stephen Added definition for {MIN,MAX}_VALUE
|
1999-11-23 01:25:27 +00:00
|
|
|
// 11/22/99 aliu Added MIN_RADIX, MAX_RADIX, digit, forDigit
|
1999-08-16 21:50:52 +00:00
|
|
|
//********************************************************************************************
|
|
|
|
|
1999-12-28 23:39:02 +00:00
|
|
|
#include "unicode/unicode.h"
|
1999-08-16 21:50:52 +00:00
|
|
|
|
1999-12-28 23:39:02 +00:00
|
|
|
#include "unicode/uchar.h"
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
|
|
|
|
const UChar Unicode::MIN_VALUE = 0x0000;
|
|
|
|
const UChar Unicode::MAX_VALUE = 0xFFFF;
|
1999-11-23 01:25:27 +00:00
|
|
|
const int8_t Unicode::MIN_RADIX = 2;
|
|
|
|
const int8_t Unicode::MAX_RADIX = 36;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
// Default constructor. The body is empty: nothing is initialized here.
Unicode::Unicode()
{
    // Intentionally empty.
}
|
|
|
|
|
|
|
|
// Copy constructor. The body is empty: nothing is read from `other`.
Unicode::Unicode(const Unicode& other)
{
    // Intentionally empty.
}
|
|
|
|
|
|
|
|
// Destructor. The body is empty: nothing is released here.
Unicode::~Unicode()
{
    // Intentionally empty.
}
|
|
|
|
|
|
|
|
// Assignment operator. Nothing is copied from `other`; the operator only
// hands back a const reference to this object.
const Unicode&
Unicode::operator=(const Unicode& other)
{
    return *this;
}
|
|
|
|
|
|
|
|
// Checks if ch is a lower case letter.
|
|
|
|
bool_t
|
|
|
|
Unicode::isLowerCase(UChar ch)
|
|
|
|
{
|
|
|
|
return (u_islower(ch) );
|
|
|
|
}
|
|
|
|
|
|
|
|
// Checks if ch is a upper case letter.
|
|
|
|
bool_t
|
|
|
|
Unicode::isUpperCase(UChar ch)
|
|
|
|
{
|
|
|
|
return (u_isupper(ch) );
|
|
|
|
}
|
|
|
|
|
|
|
|
// Checks if ch is a title case letter; usually upper case letters.
|
|
|
|
bool_t
|
|
|
|
Unicode::isTitleCase(UChar ch)
|
|
|
|
{
|
|
|
|
return (u_istitle(ch) );
|
|
|
|
}
|
|
|
|
|
|
|
|
// Checks if ch is a decimal digit.
|
|
|
|
bool_t
|
|
|
|
Unicode::isDigit(UChar ch)
|
|
|
|
{
|
|
|
|
return (u_isdigit(ch) );
|
|
|
|
}
|
|
|
|
|
|
|
|
// Checks if ch is a unicode character with assigned character type.
|
|
|
|
bool_t
|
|
|
|
Unicode::isDefined(UChar ch)
|
|
|
|
{
|
|
|
|
return (u_isdefined(ch) );
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Gets the character's linguistic directionality.
|
|
|
|
Unicode::EDirectionProperty
|
|
|
|
Unicode::characterDirection( UChar ch )
|
|
|
|
{
|
|
|
|
|
|
|
|
return ((EDirectionProperty)u_charDirection(ch) );
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get the script associated with the character
|
|
|
|
Unicode::EUnicodeScript
|
|
|
|
Unicode::getScript(UChar ch)
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
|
|
return ((EUnicodeScript) u_charScript(ch) );
|
|
|
|
}
|
|
|
|
|
|
|
|
// Checks if the Unicode character is a base form character that can take a diacritic.
|
|
|
|
bool_t
|
|
|
|
Unicode::isBaseForm(UChar ch)
|
|
|
|
{
|
|
|
|
return (u_isbase(ch) );
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
// Checks if the Unicode character is a control character.
|
|
|
|
bool_t
|
|
|
|
Unicode::isControl(UChar ch)
|
|
|
|
{
|
|
|
|
return( u_iscntrl(ch) );
|
|
|
|
}
|
|
|
|
|
|
|
|
// Checks if the Unicode character is printable.
|
|
|
|
bool_t
|
|
|
|
Unicode::isPrintable(UChar ch)
|
|
|
|
{
|
|
|
|
return( u_isprint(ch) );
|
|
|
|
}
|
|
|
|
|
|
|
|
// Checks if the Unicode character is a letter.
|
|
|
|
bool_t
|
|
|
|
Unicode::isLetter(UChar ch)
|
|
|
|
{
|
|
|
|
return(u_isalpha(ch) );
|
|
|
|
}
|
|
|
|
|
|
|
|
// Checks if the Unicode character can start a Java identifier.
|
|
|
|
bool_t
|
|
|
|
Unicode::isJavaIdentifierStart(UChar ch)
|
|
|
|
{
|
|
|
|
return( u_isJavaIDStart(ch) );
|
|
|
|
}
|
|
|
|
|
|
|
|
// Checks if the Unicode character can be a Java identifier part other than starting the
|
|
|
|
// identifier.
|
|
|
|
bool_t
|
|
|
|
Unicode::isJavaIdentifierPart(UChar ch)
|
|
|
|
{
|
|
|
|
return (u_isJavaIDPart(ch) );
|
|
|
|
}
|
|
|
|
|
|
|
|
// Checks if the Unicode character can start a Unicode identifier.
|
|
|
|
bool_t
|
|
|
|
Unicode::isUnicodeIdentifierStart(UChar ch)
|
|
|
|
{
|
|
|
|
return(u_isIDStart(ch));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Checks if the Unicode character can be a Unicode identifier part other than starting the
|
|
|
|
// identifier.
|
|
|
|
bool_t
|
|
|
|
Unicode::isUnicodeIdentifierPart(UChar ch)
|
|
|
|
{
|
|
|
|
return (u_isIDPart(ch) );
|
|
|
|
}
|
|
|
|
|
|
|
|
// Checks if the Unicode character can be ignorable in a Java or Unicode identifier.
|
|
|
|
bool_t
|
|
|
|
Unicode::isIdentifierIgnorable(UChar ch)
|
|
|
|
{
|
|
|
|
return( u_isIDIgnorable(ch) );
|
|
|
|
}
|
|
|
|
|
|
|
|
// Transforms the Unicode character to its lower case equivalent.
|
|
|
|
UChar
|
|
|
|
Unicode::toLowerCase(UChar ch)
|
|
|
|
{
|
|
|
|
return (u_tolower(ch) );
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
// Transforms the Unicode character to its upper case equivalent.
|
|
|
|
UChar
|
|
|
|
Unicode::toUpperCase(UChar ch)
|
|
|
|
{
|
|
|
|
return(u_toupper(ch) );
|
|
|
|
}
|
|
|
|
|
|
|
|
// Transforms the Unicode character to its title case equivalent.
|
|
|
|
UChar
|
|
|
|
Unicode::toTitleCase(UChar ch)
|
|
|
|
{
|
|
|
|
return(u_totitle(ch) );
|
|
|
|
}
|
|
|
|
|
2000-01-12 18:01:51 +00:00
|
|
|
/**
|
|
|
|
* Determines if the specified character is ISO-LATIN-1 white space.
|
|
|
|
* This method returns <code>true</code> for the following five
|
|
|
|
* characters only:
|
|
|
|
* <table>
|
|
|
|
* <tr><td>'\t'</td> <td>\u0009</td>
|
|
|
|
* <td><code>HORIZONTAL TABULATION</code></td></tr>
|
|
|
|
* <tr><td>'\n'</td> <td>\u000A</td>
|
|
|
|
* <td><code>NEW LINE</code></td></tr>
|
|
|
|
* <tr><td>'\f'</td> <td>\u000C</td>
|
|
|
|
* <td><code>FORM FEED</code></td></tr>
|
|
|
|
* <tr><td>'\r'</td> <td>\u000D</td>
|
|
|
|
* <td><code>CARRIAGE RETURN</code></td></tr>
|
|
|
|
* <tr><td>' '</td> <td>\u0020</td>
|
|
|
|
* <td><code>SPACE</code></td></tr>
|
|
|
|
* </table>
|
|
|
|
*
|
|
|
|
* @param ch the character to be tested.
|
|
|
|
* @return <code>true</code> if the character is ISO-LATIN-1 white
|
|
|
|
* space; <code>false</code> otherwise.
|
|
|
|
* @see #isSpaceChar
|
|
|
|
* @see #isWhitespace
|
|
|
|
* @deprecated Replaced by isWhitespace(char).
|
|
|
|
*/
|
|
|
|
bool_t
|
|
|
|
Unicode::isSpace(UChar ch) {
|
|
|
|
return (ch <= 0x0020) &&
|
|
|
|
(((((int32_t(1) << 0x0009) |
|
|
|
|
(int32_t(1) << 0x000A) |
|
|
|
|
(int32_t(1) << 0x000C) |
|
|
|
|
(int32_t(1) << 0x000D) |
|
|
|
|
(int32_t(1) << 0x0020)) >> ch) & int32_t(1)) != 0);
|
|
|
|
}
|
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
// Checks if the Unicode character is a space character.
|
|
|
|
bool_t
|
|
|
|
Unicode::isSpaceChar(UChar ch)
|
|
|
|
{
|
|
|
|
return(u_isspace(ch) );
|
|
|
|
}
|
|
|
|
|
2000-01-12 18:01:51 +00:00
|
|
|
/**
|
|
|
|
* Determines if the specified character is white space according to ICU.
|
|
|
|
* A character is considered to be an ICU whitespace character if and only
|
|
|
|
* if it satisfies one of the following criteria:
|
|
|
|
* <ul>
|
|
|
|
* <li> It is a Unicode space separator (category "Zs"), but is not
|
|
|
|
* a no-break space (\u00A0 or \uFEFF).
|
|
|
|
* <li> It is a Unicode line separator (category "Zl").
|
|
|
|
* <li> It is a Unicode paragraph separator (category "Zp").
|
|
|
|
* <li> It is \u0009, HORIZONTAL TABULATION.
|
|
|
|
* <li> It is \u000A, LINE FEED.
|
|
|
|
* <li> It is \u000B, VERTICAL TABULATION.
|
|
|
|
* <li> It is \u000C, FORM FEED.
|
|
|
|
* <li> It is \u000D, CARRIAGE RETURN.
|
|
|
|
* <li> It is \u001C, FILE SEPARATOR.
|
|
|
|
* <li> It is \u001D, GROUP SEPARATOR.
|
|
|
|
* <li> It is \u001E, RECORD SEPARATOR.
|
|
|
|
* <li> It is \u001F, UNIT SEPARATOR.
|
|
|
|
* </ul>
|
|
|
|
*
|
|
|
|
* @param ch the character to be tested.
|
|
|
|
* @return true if the character is a Java whitespace character;
|
|
|
|
* false otherwise.
|
|
|
|
* @see #isSpaceChar
|
|
|
|
*/
|
|
|
|
bool_t
|
|
|
|
Unicode::isWhitespace(UChar ch) {
|
|
|
|
// From Mark Davis:
|
|
|
|
//| What we should do is to make sure that the special Cc characters like CR
|
|
|
|
//| have either Zs, Zl, or Zp in the property database. We can then just call
|
|
|
|
//| the equivalent of:
|
|
|
|
//|
|
|
|
|
//| public static boolean isWhileSpace(char ch) {
|
|
|
|
//| return ((1 << Character.getType(c)) & WHITESPACE_MASK) != 0; }
|
|
|
|
//|
|
|
|
|
//| where WHITESPACE_MASK = (1 << Zs) | (1 << Zl) | (1 << Zp);
|
|
|
|
//|
|
|
|
|
//| This is much faster code, since it just looksup the property value and does
|
|
|
|
//| a couple of arithmetics to get the right answer.
|
|
|
|
|
|
|
|
// TEMPORARY IMPLEMENTATION until the tables are updated to
|
|
|
|
// modify Cc character categories:
|
|
|
|
int8_t cat = Unicode::getType(ch);
|
|
|
|
return
|
|
|
|
(cat == SPACE_SEPARATOR && ch != 0x00A0 && ch != 0xFEFF) ||
|
|
|
|
(cat == LINE_SEPARATOR) ||
|
|
|
|
(cat == PARAGRAPH_SEPARATOR) ||
|
|
|
|
(ch <= 0x1F && ((((int32_t(1) << 0x0009) |
|
|
|
|
(int32_t(1) << 0x000A) |
|
|
|
|
(int32_t(1) << 0x000B) |
|
|
|
|
(int32_t(1) << 0x000C) |
|
|
|
|
(int32_t(1) << 0x000D) |
|
|
|
|
(int32_t(1) << 0x001C) |
|
|
|
|
(int32_t(1) << 0x001D) |
|
|
|
|
(int32_t(1) << 0x001E) |
|
|
|
|
(int32_t(1) << 0x001F)) >> ch) & int32_t(1)) != 0);
|
|
|
|
}
|
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
// Gets if the Unicode character's character property.
|
|
|
|
int8_t
|
|
|
|
Unicode::getType(UChar ch)
|
|
|
|
{
|
|
|
|
return(u_charType(ch) );
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Gets table cell width of the Unicode character.
|
|
|
|
uint16_t
|
|
|
|
Unicode::getCellWidth(UChar ch)
|
|
|
|
{
|
|
|
|
return (u_charCellWidth(ch) );
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the digit value of ch as reported by u_charDigitValue().
int32_t
Unicode::digitValue(UChar ch)
{
    return u_charDigitValue(ch);
}
|
|
|
|
|
1999-11-23 01:25:27 +00:00
|
|
|
// Returns the numeric value of ch when interpreted as a digit in the given
// radix, or -1 when there is none.
//
// radix must lie in [MIN_RADIX, MAX_RADIX]; any other radix yields -1.
// The value is taken from u_charDigitValue(); when that is negative, the
// ASCII letters 'A'..'Z' and 'a'..'z' are mapped to 10..35. A value that is
// not smaller than radix is rejected with -1.
int8_t
Unicode::digit(UChar ch, int8_t radix) {
    // An out-of-range radix can never produce a digit.
    if (radix < MIN_RADIX || radix > MAX_RADIX) {
        return -1;
    }

    int8_t result = (int8_t) u_charDigitValue(ch);
    if (result < 0) {
        // No digit value of its own: fall back to the ASCII letters.
        if (ch >= (UChar)'a' && ch <= (UChar)'z') {
            result = ch - ((UChar)'a' - 10);
        } else if (ch >= (UChar)'A' && ch <= (UChar)'Z') {
            result = ch - ((UChar)'A' - 10);
        }
    }

    // Also covers result == -1, which is always smaller than a valid radix.
    return (result < radix) ? result : -1;
}
|
|
|
|
|
|
|
|
// Returns the character that represents `digit` in the given radix.
//
// Digits 0..9 map to '0'..'9' and digits 10..35 map to 'a'..'z'.
// Returns (UChar)0 when radix is outside [MIN_RADIX, MAX_RADIX] or when
// digit is negative or not smaller than radix.
UChar
Unicode::forDigit(int32_t digit, int8_t radix) {
    if (radix < MIN_RADIX || radix > MAX_RADIX) {
        return (UChar)0;
    }
    if (digit < 0 || digit >= radix) {
        return (UChar)0;
    }

    if (digit < 10) {
        return (UChar)((UChar)'0' + digit);
    }
    // 'a' stands for 10, so offset the letter range by 10.
    return (UChar)(((UChar)'a' - 10) + digit);
}
|
|
|
|
|
2000-01-12 19:50:27 +00:00
|
|
|
void
|
2000-01-12 20:20:38 +00:00
|
|
|
Unicode::getUnicodeVersion(UVersionInfo versionArray)
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
2000-01-12 19:50:27 +00:00
|
|
|
u_getUnicodeVersion(versionArray);
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|