scuffed-code/icu4c/source/common/unicode.cpp

/*
*******************************************************************************
* Copyright (C) 1996-1999, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*/
//  FILE NAME : unicode.cpp
//
//  CREATED
//      Wednesday, December 11, 1996
//
//  CHANGES
//      Wednesday, February 4,  1998
//      Changed the logic in toUpperCase and toLowerCase in order
//      to avoid returning 0xFFFF when asked to convert a
//      confusing UChar to lowercase or to uppercase
//      (e.g. Letterlike symbols)
//
//  CHANGES BY
//  Bertramd A. DAMIBA
//
//  CREATED BY
//      Helena Shih
//
//  CHANGES
//      Thursday, April 15, 1999
//      Modified the definitions of all the functions
//      C++ Wrappers for Unicode
//  CHANGES BY
//      Madhu Katragadda
//   5/20/99     Madhu		Added the function u_getVersion()
//  07/09/99     stephen        Added definition for {MIN,MAX}_VALUE
//  11/22/99     aliu       Added MIN_RADIX, MAX_RADIX, digit, forDigit
//********************************************************************************************

#include "unicode/unicode.h"

#include "unicode/uchar.h"


// Definitions of the static constants of class Unicode.
// MIN_VALUE / MAX_VALUE: the UChar values 0x0000 and 0xFFFF.
const UChar Unicode::MIN_VALUE = 0x0000;
const UChar Unicode::MAX_VALUE = 0xFFFF;
// MIN_RADIX / MAX_RADIX: inclusive bounds of the radix argument accepted by
// digit() and forDigit(); both functions reject any radix outside this range.
const int8_t Unicode::MIN_RADIX = 2;
const int8_t Unicode::MAX_RADIX = 36;

// Default constructor. Performs no work.
Unicode::Unicode() {}

// Copy constructor. The argument is not read; nothing is copied.
Unicode::Unicode(const Unicode& other) {}

// Destructor. Performs no work.
Unicode::~Unicode() {}

// Assignment operator. The right-hand side is not read; the object is left
// untouched and a reference to it is returned.
const Unicode& Unicode::operator=(const Unicode& other) {
    return *this;
}

// Reports whether ch is a lower case letter.
// C++ wrapper that forwards to u_islower().
bool_t Unicode::isLowerCase(UChar ch) {
    return u_islower(ch);
}

// Reports whether ch is an upper case letter.
// C++ wrapper that forwards to u_isupper().
bool_t Unicode::isUpperCase(UChar ch) {
    return u_isupper(ch);
}

// Reports whether ch is a title case letter (usually an upper case letter).
// C++ wrapper that forwards to u_istitle().
bool_t Unicode::isTitleCase(UChar ch) {
    return u_istitle(ch);
}

// Reports whether ch is a decimal digit.
// C++ wrapper that forwards to u_isdigit().
bool_t Unicode::isDigit(UChar ch) {
    return u_isdigit(ch);
}

// Reports whether ch is a Unicode character with an assigned character type.
// C++ wrapper that forwards to u_isdefined().
bool_t Unicode::isDefined(UChar ch) {
    return u_isdefined(ch);
}


// Returns the linguistic directionality of ch.
// The value produced by u_charDirection() is converted to the class's own
// EDirectionProperty enumeration.
Unicode::EDirectionProperty Unicode::characterDirection(UChar ch) {
    return static_cast<EDirectionProperty>(u_charDirection(ch));
}

// Returns the script associated with ch.
// The value produced by u_charScript() is converted to the class's own
// EUnicodeScript enumeration.
Unicode::EUnicodeScript Unicode::getScript(UChar ch) {
    return static_cast<EUnicodeScript>(u_charScript(ch));
}

// Reports whether ch is a base form character, i.e. one that can take a
// diacritic. C++ wrapper that forwards to u_isbase().
bool_t Unicode::isBaseForm(UChar ch) {
    return u_isbase(ch);
}

// Reports whether ch is a control character.
// C++ wrapper that forwards to u_iscntrl().
bool_t Unicode::isControl(UChar ch) {
    return u_iscntrl(ch);
}

// Reports whether ch is a printable character.
// C++ wrapper that forwards to u_isprint().
bool_t Unicode::isPrintable(UChar ch) {
    return u_isprint(ch);
}

// Reports whether ch is a letter.
// C++ wrapper that forwards to u_isalpha().
bool_t Unicode::isLetter(UChar ch) {
    return u_isalpha(ch);
}

// Reports whether ch may be the first character of a Java identifier.
// C++ wrapper that forwards to u_isJavaIDStart().
bool_t Unicode::isJavaIdentifierStart(UChar ch) {
    return u_isJavaIDStart(ch);
}

// Reports whether ch may appear in a Java identifier at a position other
// than the first. C++ wrapper that forwards to u_isJavaIDPart().
bool_t Unicode::isJavaIdentifierPart(UChar ch) {
    return u_isJavaIDPart(ch);
}

// Reports whether ch may be the first character of a Unicode identifier.
// C++ wrapper that forwards to u_isIDStart().
bool_t Unicode::isUnicodeIdentifierStart(UChar ch) {
    return u_isIDStart(ch);
}

// Reports whether ch may appear in a Unicode identifier at a position other
// than the first. C++ wrapper that forwards to u_isIDPart().
bool_t Unicode::isUnicodeIdentifierPart(UChar ch) {
    return u_isIDPart(ch);
}

// Reports whether ch is ignorable inside a Java or Unicode identifier.
// C++ wrapper that forwards to u_isIDIgnorable().
bool_t Unicode::isIdentifierIgnorable(UChar ch) {
    return u_isIDIgnorable(ch);
}

// Maps ch to its lower case equivalent.
// C++ wrapper that forwards to u_tolower().
UChar Unicode::toLowerCase(UChar ch) {
    return u_tolower(ch);
}

// Maps ch to its upper case equivalent.
// C++ wrapper that forwards to u_toupper().
UChar Unicode::toUpperCase(UChar ch) {
    return u_toupper(ch);
}

// Maps ch to its title case equivalent.
// C++ wrapper that forwards to u_totitle().
UChar Unicode::toTitleCase(UChar ch) {
    return u_totitle(ch);
}

// Reports whether ch is a space character.
// C++ wrapper that forwards to u_isspace().
bool_t Unicode::isSpaceChar(UChar ch) {
    return u_isspace(ch);
}

// Determines if the specified character is white space according to ICU.
//
// Returns true when any one of the following holds, false otherwise:
//   - the type of ch (see getType()) is SPACE_SEPARATOR and ch is neither
//     U+00A0 nor U+FEFF;
//   - the type of ch is LINE_SEPARATOR or PARAGRAPH_SEPARATOR;
//   - ch lies in U+0009 - U+000D or in U+001C - U+001F.
bool_t
Unicode::isWhitespace(UChar ch) {
    // TODO Move this implementation to C, and make this call the C
    //      implementation.
    // TODO Optional -- reimplement in terms of modified category
    //      code -- see Mark Davis's note (below).  If this is done,
    //      the implementation still must conform to the specified
    //      semantics.  That is, U+00A0 and U+FEFF must return false,
    //      and the ranges U+0009 - U+000D and U+001C - U+001F must
    //      return true.  Characters other than these in Zs, Zl, or Zp
    //      must return true.

    // The tests below are written as plain comparisons rather than as
    // shifted bit sets: a bit set covering U+001F needs bit 31, and
    // producing it with (int32_t(1) << 31) overflows a signed 32-bit value.
    // Comparisons also stay well defined for any value of cat.
    const int8_t cat = Unicode::getType(ch);

    const bool spaceSeparator =
        (cat == SPACE_SEPARATOR) && (ch != 0x00A0) && (ch != 0xFEFF);

    const bool lineOrParagraphSeparator =
        (cat == LINE_SEPARATOR) || (cat == PARAGRAPH_SEPARATOR);

    const bool whitespaceControl =
        (ch >= 0x0009 && ch <= 0x000D) ||
        (ch >= 0x001C && ch <= 0x001F);

    return spaceSeparator || lineOrParagraphSeparator || whitespaceControl;

    // From Mark Davis:
    //| What we should do is to make sure that the special Cc characters like CR
    //| have either Zs, Zl, or Zp in the property database. We can then just call
    //| the equivalent of:
    //|
    //|  public static boolean isWhiteSpace(char ch) {
    //|   return ((1 << Character.getType(ch)) & WHITESPACE_MASK) != 0; }
    //|
    //| where WHITESPACE_MASK = (1 << Zs) | (1 << Zl) | (1 << Zp);
    //|
    //| This is much faster code, since it just looks up the property value and
    //| does a couple of arithmetic operations to get the right answer.
    //
    // (We still have to make sure U+00A0 and U+FEFF are excluded, so the code
    //  might not be as simple as this. - aliu)
}

// Returns the character type of ch.
// C++ wrapper that forwards to u_charType().
int8_t Unicode::getType(UChar ch) {
    return u_charType(ch);
}


// Returns the table cell width of ch.
// C++ wrapper that forwards to u_charCellWidth().
uint16_t Unicode::getCellWidth(UChar ch) {
    return u_charCellWidth(ch);
}

// Returns the digit value that u_charDigitValue() reports for ch.
int32_t Unicode::digitValue(UChar ch) {
    return u_charDigitValue(ch);
}

// Returns the numeric value of ch interpreted as a digit in the given radix.
//
// The value is taken from u_charDigitValue(); when that is negative, the
// ASCII letters A-Z and a-z are mapped to 10..35 (case-insensitively).
// Returns -1 if radix lies outside [MIN_RADIX, MAX_RADIX] or if the value
// found is not smaller than radix.
int8_t
Unicode::digit(UChar ch, int8_t radix) {
    // An unsupported radix never yields a digit.
    if (radix < MIN_RADIX || radix > MAX_RADIX) {
        return -1;
    }

    int8_t result = static_cast<int8_t>(u_charDigitValue(ch));
    if (result < 0) {
        // Not a digit according to the character database: try Latin letters.
        const bool isUpperLatin = (ch >= 0x0041/*A*/ && ch <= 0x005A/*Z*/);
        const bool isLowerLatin = (ch >= 0x0061/*a*/ && ch <= 0x007A/*z*/);
        if (isUpperLatin) {
            result = static_cast<int8_t>(ch - 0x0041/*A*/ + 10);
        } else if (isLowerLatin) {
            result = static_cast<int8_t>(ch - 0x0061/*a*/ + 10);
        }
    }

    // Reject values that are not valid digits in this radix.
    return (result < radix) ? result : -1;
}

// Returns the character that represents the value digit in the given radix.
//
// Values 0..9 map to '0'..'9' and values 10 and above map to the lower case
// letters starting at 'a'. Returns 0 if radix lies outside
// [MIN_RADIX, MAX_RADIX] or if digit is not in [0, radix).
UChar
Unicode::forDigit(int32_t digit, int8_t radix) {
    if (radix < MIN_RADIX || radix > MAX_RADIX) {
        return static_cast<UChar>(0);
    }
    if (digit < 0 || digit >= radix) {
        return static_cast<UChar>(0);
    }

    if (digit < 10) {
        return static_cast<UChar>(0x0030/*0*/ + digit);
    }
    return static_cast<UChar>((0x0061/*a*/ - 10) + digit);
}

// Passes versionArray to u_getUnicodeVersion(), which is expected to fill it
// with the Unicode version information.
void Unicode::getUnicodeVersion(UVersionInfo versionArray) {
    u_getUnicodeVersion(versionArray);
}