ICU-76 ucharacter semantic changes

jitterbug 77: add additional API for numeric values
jitterbug 78: UCharacter.getNumericValue superscript bugs

X-SVN-Rev: 6085
This commit is contained in:
Doug Felt 2001-10-05 18:42:33 +00:00
parent 5746e4c2fc
commit 21b8e60b11
2 changed files with 600 additions and 110 deletions

View File

@ -5,8 +5,8 @@
******************************************************************************* *******************************************************************************
* *
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/lang/UCharacter.java,v $ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/lang/UCharacter.java,v $
* $Date: 2001/08/22 22:38:30 $ * $Date: 2001/10/05 18:42:33 $
* $Revision: 1.11 $ * $Revision: 1.12 $
* *
******************************************************************************* *******************************************************************************
*/ */
@ -57,8 +57,11 @@ import java.util.Locale;
* does it include the Java-specific character information, such as * does it include the Java-specific character information, such as
* boolean isJavaIdentifierPart(char ch). * boolean isJavaIdentifierPart(char ch).
* <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
* values '10' - '35'. UCharacter does not treat the above code points * values '10' - '35'. UCharacter also does this in digit and
* as having numeric values * getNumericValue, to adhere to the java semantics of these
* methods. New methods unicodeDigit, and
* getUnicodeNumericValue do not treat the above code points
* as having numeric values. This is a semantic change from ICU4J 1.3.1.
* <li> For consistency with ICU4C's data, control code points below have their * <li> For consistency with ICU4C's data, control code points below have their
* Unicode general category reset to the types below. * Unicode general category reset to the types below.
* <ul> * <ul>
@ -349,19 +352,25 @@ public final class UCharacter
// public methods =================================================== // public methods ===================================================
/** /**
* Retrieves the decimal numeric value of a digit code point.<br> * Retrieves the numeric value of a decimal digit code point.
* A code point is a valid digit if the following is true: * <br>This method observes the semantics of
* <code>java.lang.Character.digit()</code>. Note that this
* will return positive values for code points for which isDigit
* returns false, just like java.lang.Character.
* <br><em>Semantic Change:</em> In release 1.3.1 and
* prior, this did not treat the European letters as having a
* digit value, and also treated numeric letters and other numbers as digits.
* This has been changed to conform to the java semantics.
* <br>A code point is a valid digit if and only if:
* <ul> * <ul>
* <li> The method isDigit(ch) is true and the Unicode decimal digit value of * <li>ch is a decimal digit or one of the european letters, and
* ch is less than the specified radix. * <li>the value of ch is less than the specified radix.
* </ul> * </ul>
* Note this method, unlike java.lang.Character.digit() does not regard the * @param ch the code point to query
* ascii characters 'A' - 'Z' and 'a' - 'z' as digits. * @param radix the radix
* @param ch the code point whose numeric value is to be determined * @return the numeric value represented by the code point in the
* @param radix the radix which the digit is to be converted to * specified radix, or -1 if the code point is not a decimal digit
* @return the numeric value of the code point ch in the argument radix, * or if its value is too large for the radix
* this method returns -1 if ch is not a valid digit code point or
* if its digit value exceeds the radix.
*/ */
public static int digit(int ch, int radix) public static int digit(int ch, int radix)
{ {
@ -375,6 +384,7 @@ public final class UCharacter
result = UCharacterPropertyDB.getSignedValue(props); result = UCharacterPropertyDB.getSignedValue(props);
} }
} }
/*
else { else {
// contained in exception data // contained in exception data
int index = UCharacterPropertyDB.getExceptionIndex(props); int index = UCharacterPropertyDB.getExceptionIndex(props);
@ -398,6 +408,11 @@ public final class UCharacter
if (result < 0) { if (result < 0) {
result = getHanDigit(ch); result = getHanDigit(ch);
} }
*/
if (result < 0 && radix > 10) {
result = getEuropeanDigit(ch);
}
if (result < 0 || result >= radix) { if (result < 0 || result >= radix) {
return -1; return -1;
@ -405,14 +420,39 @@ public final class UCharacter
return result; return result;
} }
/**
 * Tests whether a code point is one of the Latin letters that
 * <code>java.lang.Character.digit()</code> accepts as a digit for radixes
 * above ten: ASCII 'A'-'Z' and 'a'-'z', or their fullwidth counterparts
 * U+FF21-U+FF3A and U+FF41-U+FF5A.
 * @param ch the code point to query
 * @return true if ch is one of those letters
 */
private static boolean isEuropeanDigit(int ch) {
    if (ch <= 0x7a) {
        // ASCII half: 'A'..'Z', or anything from 'a' up to the 'z' bound
        // already established by the enclosing test.
        boolean upperCase = ch >= 0x41 && ch <= 0x5a;
        return upperCase || ch >= 0x61;
    }
    if (ch < 0xff21) {
        // Between 'z' and FULLWIDTH LATIN CAPITAL LETTER A: never a match.
        return false;
    }
    // Fullwidth half: capitals end at U+FF3A, small letters are U+FF41..U+FF5A.
    return ch <= 0xff3a || (ch >= 0xff41 && ch <= 0xff5a);
}
/**
 * Returns the digit value (10 through 35) of a Latin letter used as a digit
 * in radixes above ten. ASCII 'A'-'Z' and 'a'-'z' and the fullwidth letters
 * U+FF21-U+FF3A and U+FF41-U+FF5A are recognized; case is ignored.
 * @param ch the code point to query
 * @return 10 for 'A'/'a' through 35 for 'Z'/'z', or -1 if ch is not one of
 *         the recognized letters
 */
private static int getEuropeanDigit(int ch) {
    // Find the first letter of whichever 26-letter run contains ch.
    final int firstLetter;
    if (ch >= 0x41 && ch <= 0x5a) {
        firstLetter = 0x41;     // 'A'..'Z'
    } else if (ch >= 0x61 && ch <= 0x7a) {
        firstLetter = 0x61;     // 'a'..'z'
    } else if (ch >= 0xff21 && ch <= 0xff3a) {
        firstLetter = 0xff21;   // fullwidth 'A'..'Z'
    } else if (ch >= 0xff41 && ch <= 0xff5a) {
        firstLetter = 0xff41;   // fullwidth 'a'..'z'
    } else {
        return -1;
    }
    // The first letter of each run carries the value ten.
    return ch - firstLetter + 10;
}
/** /**
* Retrieves the decimal numeric value of a digit code point in radix 10<br> * Retrieves the numeric value of a decimal digit code point.
* Note this method, unlike java.lang.Character.digit() does not regard the * <br>This is a convenience overload of <code>digit(int, int)</code>
* ascii characters 'A' - 'Z' and 'a' - 'z' as digits. * that provides a decimal radix.
* @param ch the code point whose numeric value is to be determined * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
* @return the numeric value of the code point ch, this method returns -1 if * treated numeric letters and other numbers as digits. This has
* ch is not a valid digit code point * been changed to conform to the java semantics.
*/ * @param ch the code point to query
* @return the numeric value represented by the code point,
* or -1 if the code point is not a decimal digit or if its
* value is too large for a decimal radix */
public static int digit(int ch) public static int digit(int ch)
{ {
return digit(ch, DECIMAL_RADIX_); return digit(ch, DECIMAL_RADIX_);
@ -420,17 +460,43 @@ public final class UCharacter
/** /**
* Returns the Unicode numeric value of the code point as a nonnegative * Returns the Unicode numeric value of the code point as a nonnegative
* integer. <br> * integer.
* If the code point does not have a numeric value, then -1 is returned. <br> * <br>If the code point does not have a numeric value, then -1 is returned. <br>
* If the code point has a numeric value that cannot be represented as a * If the code point has a numeric value that cannot be represented as a
* nonnegative integer (for example, a fractional value), then -2 is returned. * nonnegative integer (for example, a fractional value), then -2 is returned.
* <br> * <br><em>Semantic Change:</em> In release 1.3.1 and
* Note this method, unlike java.lang.Character.digit() does not regard the * prior, this returned -1 for ASCII letters and their
* ascii characters 'A' - 'Z' and 'a' - 'z' as numbers. * fullwidth counterparts. This has been changed to
* @param ch Unicode code point * conform to the java semantics.
* @return numeric value of the code point as a nonnegative integer * @param ch the code point to query
* @return the numeric value of the code point, or -1 if it has no numeric value,
* or -2 if it has a numeric value that cannot be represented as a nonnegative
* integer
*/ */
public static int getNumericValue(int ch) public static int getNumericValue(int ch)
{
return getNumericValueInternal(ch, true);
}
/**
* Returns the Unicode numeric value of the code point as a nonnegative
* integer.
* <br>If the code point does not have a numeric value, then -1 is returned. <br>
* If the code point has a numeric value that cannot be represented as a
* nonnegative integer (for example, a fractional value), then -2 is returned.
* This returns values other than -1 for all and only those code points whose
* type is a numeric type.
* @param ch the code point to query
* @return the numeric value of the code point, or -1 if it has no numeric value,
* or -2 if it has a numeric value that cannot be represented as a nonnegative
* integer
*/
public static int getUnicodeNumericValue(int ch)
{
return getNumericValueInternal(ch, false);
}
private static int getNumericValueInternal(int ch, boolean useEuropean)
{ {
int props = getProps(ch); int props = getProps(ch);
int type = UCharacterPropertyDB.getPropType(props); int type = UCharacterPropertyDB.getPropType(props);
@ -439,7 +505,8 @@ public final class UCharacter
if (type != UCharacterCategory.DECIMAL_DIGIT_NUMBER && if (type != UCharacterCategory.DECIMAL_DIGIT_NUMBER &&
type != UCharacterCategory.LETTER_NUMBER && type != UCharacterCategory.LETTER_NUMBER &&
type != UCharacterCategory.OTHER_NUMBER) { type != UCharacterCategory.OTHER_NUMBER) {
return -1;
return useEuropean ? getEuropeanDigit(ch) : -1;
} }
int result = -1; int result = -1;
@ -453,7 +520,8 @@ public final class UCharacter
if (PROPERTY_DB_.hasExceptionValue(index, if (PROPERTY_DB_.hasExceptionValue(index,
UCharacterPropertyDB.EXC_DIGIT_VALUE_)) { UCharacterPropertyDB.EXC_DIGIT_VALUE_)) {
result = PROPERTY_DB_.getException(index, result = PROPERTY_DB_.getException(index,
UCharacterPropertyDB.EXC_DIGIT_VALUE_); UCharacterPropertyDB.EXC_DIGIT_VALUE_) &
LAST_CHAR_MASK_;
} }
else { else {
if (!PROPERTY_DB_.hasExceptionValue(index, if (!PROPERTY_DB_.hasExceptionValue(index,
@ -466,10 +534,6 @@ public final class UCharacter
} }
} }
if (result < 0) {
result = getHanDigit(ch);
}
if (result < 0) { if (result < 0) {
return -2; return -2;
} }
@ -506,19 +570,18 @@ public final class UCharacter
} }
/** /**
* Determines if a code point is a digit.<br> * Determines if a code point is a Java digit.
* Note this method, unlike java.lang.Character.isDigit() does not regard the * <br>This method observes the semantics of
* ascii characters 'A' - 'Z' and 'a' - 'z' as digits.<br> * <code>java.lang.Character.isDigit()</code>. It returns true for
* @param ch code point to determine if it is a digit * decimal digits only.
* @return true if this code point is a digit * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
*/ * treated numeric letters and other numbers as digits. This has
* been changed to conform to the java semantics.
* @param ch code point to query
* @return true if this code point is a digit */
public static boolean isDigit(int ch) public static boolean isDigit(int ch)
{ {
int cat = getType(ch); return getType(ch) == UCharacterCategory.DECIMAL_DIGIT_NUMBER;
// if props == 0, it will just fall through and return false
return cat == UCharacterCategory.DECIMAL_DIGIT_NUMBER ||
cat == UCharacterCategory.OTHER_NUMBER ||
cat == UCharacterCategory.LETTER_NUMBER;
} }
/** /**
@ -1527,38 +1590,220 @@ public final class UCharacter
} }
/** /**
* Getting Han character digit values * Return numeric value of Han code points.
* @param ch code point to test if it is a Han character * <br> This returns the value of Han 'numeric' code points,
* @return Han digit value if ch is a Han digit character * including those for zero, ten, hundred, thousand, ten thousand,
* and hundred million. Unicode does not consider these to be
* numeric. This includes both the standard and 'checkwriting'
* characters, the 'big circle' zero character, and the standard
* zero character.
* @param ch code point to query
* @return value if it is a Han 'numeric character,' otherwise return -1.
*/ */
private static int getHanDigit(int ch) public static int getHanNumericValue(int ch)
{ {
switch(ch) switch(ch)
{ {
case IDEOGRAPHIC_NUMBER_ZERO_ : case IDEOGRAPHIC_NUMBER_ZERO_ :
case CJK_IDEOGRAPH_COMPLEX_ZERO:
return 0; // Han Zero return 0; // Han Zero
case CJK_IDEOGRAPH_FIRST_ : case CJK_IDEOGRAPH_FIRST_ :
case CJK_IDEOGRAPH_COMPLEX_ONE:
return 1; // Han One return 1; // Han One
case CJK_IDEOGRAPH_SECOND_ : case CJK_IDEOGRAPH_SECOND_ :
case CJK_IDEOGRAPH_COMPLEX_TWO:
return 2; // Han Two return 2; // Han Two
case CJK_IDEOGRAPH_THIRD_ : case CJK_IDEOGRAPH_THIRD_ :
case CJK_IDEOGRAPH_COMPLEX_THREE:
return 3; // Han Three return 3; // Han Three
case CJK_IDEOGRAPH_FOURTH_ : case CJK_IDEOGRAPH_FOURTH_ :
case CJK_IDEOGRAPH_COMPLEX_FOUR:
return 4; // Han Four return 4; // Han Four
case CJK_IDEOGRAPH_FIFTH_ : case CJK_IDEOGRAPH_FIFTH_ :
case CJK_IDEOGRAPH_COMPLEX_FIVE:
return 5; // Han Five return 5; // Han Five
case CJK_IDEOGRAPH_SIXTH_ : case CJK_IDEOGRAPH_SIXTH_ :
case CJK_IDEOGRAPH_COMPLEX_SIX:
return 6; // Han Six return 6; // Han Six
case CJK_IDEOGRAPH_SEVENTH_ : case CJK_IDEOGRAPH_SEVENTH_ :
case CJK_IDEOGRAPH_COMPLEX_SEVEN:
return 7; // Han Seven return 7; // Han Seven
case CJK_IDEOGRAPH_EIGHTH_ : case CJK_IDEOGRAPH_EIGHTH_ :
case CJK_IDEOGRAPH_COMPLEX_EIGHT:
return 8; // Han Eight return 8; // Han Eight
case CJK_IDEOGRAPH_NINETH_ : case CJK_IDEOGRAPH_NINETH_ :
case CJK_IDEOGRAPH_COMPLEX_NINE:
return 9; // Han Nine return 9; // Han Nine
case CJK_IDEOGRAPH_TEN:
case CJK_IDEOGRAPH_COMPLEX_TEN:
return 10;
case CJK_IDEOGRAPH_HUNDRED:
case CJK_IDEOGRAPH_COMPLEX_HUNDRED:
return 100;
case CJK_IDEOGRAPH_THOUSAND:
case CJK_IDEOGRAPH_COMPLEX_THOUSAND:
return 1000;
case CJK_IDEOGRAPH_TEN_THOUSAND:
return 10000;
case CJK_IDEOGRAPH_HUNDRED_MILLION:
return 100000000;
} }
return -1; // no value return -1; // no value
} }
/**
 * Returns the code point that represents a digit value in a given script.
 * The script is chosen by a DIGIT_RANGE selector, see below. Most ranges
 * only accept values between 0 and 9; the four EUROPEAN_EX ranges also
 * accept values between 10 and 35, which map to the letters 'A'-'Z' in the
 * case and width named by the selector. Ranges whose script has no digit
 * zero (Tamil, Ethiopic) return -1 for the value 0.
 * @param digit the value to represent, from 0 to 9 for most ranges, and
 *        from 0 to 35 for the EUROPEAN_EX ranges
 * @param digitRange one of the DIGIT_RANGE selectors
 * @return the code point, or -1 if the range has no code point for that value
 * @throws IllegalArgumentException if digitRange is not a DIGIT_RANGE selector
 */
public static int getCodePointForDigit(int digit, int digitRange) {
    // DIGIT_RANGE_LIMIT is one past the last valid selector, so it is rejected too.
    if (digitRange < 0 || digitRange >= DIGIT_RANGE_LIMIT) {
        throw new IllegalArgumentException("invalid digit range selector: " + digitRange);
    }
    // Only selectors with a real entry in exbases (index 0 is a placeholder)
    // can represent values above nine.
    boolean hasLetterDigits = digitRange > 0 && digitRange < exbases.length;
    int maxDigit = hasLetterDigits ? 35 : 9;
    if (digit < 0 || digit > maxDigit) {
        return -1;
    }
    if (digit >= 10) {
        // exbases holds the letter for value ten, so offset from there.
        return exbases[digitRange] + (digit - 10);
    }
    if (digitRange == DIGIT_RANGE_HAN) {
        return hanmap[digit];
    }
    if (digitRange > DIGIT_RANGE_HAN) {
        return exhanmap[digit];
    }
    int base = bases[digitRange];
    if ((base & DIGIT_BASE_NO_ZERO) != 0) {
        if (digit == 0) {
            return -1;
        }
        base &= ~DIGIT_BASE_NO_ZERO;
    }
    return base + digit;
}

/** Flag or'ed into a bases entry when the script has no code point for zero. */
private static final int DIGIT_BASE_NO_ZERO = 0x40000000;

/**
 * Code point corresponding to the value zero for each contiguous digit
 * range, indexed by DIGIT_RANGE selector (all selectors below
 * DIGIT_RANGE_HAN). Entries flagged with DIGIT_BASE_NO_ZERO only serve the
 * values 1-9.
 */
private static final int[] bases = {
    0x0030, 0x0030, 0x0030, 0xff10, 0xff10,
    0x0660, 0x06f0, 0x0966, 0x09e6, 0x0a66,
    0x0ae6, 0x0b66, 0x0be6 | DIGIT_BASE_NO_ZERO, 0x0c66, 0x0ce6,
    0x0d66, 0x0e50, 0x0ed0, 0x0f20, 0x1040,
    0x1368 | DIGIT_BASE_NO_ZERO, 0x17e0, 0x1810
};

/**
 * Code point of the letter with value ten for each EUROPEAN_EX selector.
 * Index 0 (DIGIT_RANGE_EUROPEAN) has no letters and is only a placeholder.
 */
private static final int[] exbases = {
    0x0000, 0x0041, 0x0061, 0xff21, 0xff41
};

/* standard Han digits zero through nine, using the 'big circle' ling for zero */
private static final int[] hanmap = {
    0x3007, 0x4e00, 0x4e8c, 0x4e09, 0x56db, 0x4e94, 0x516d, 0x4e03, 0x516b, 0x4e5d
};

/* 'checkwriting' Han digits zero through nine */
private static final int[] exhanmap = {
    0x96f6, 0x58f9, 0x8cb3, 0x53c3, 0x8086, 0x4f0d, 0x9678, 0x67d2, 0x634c, 0x7396
};

private static final int CJK_IDEOGRAPH_COMPLEX_ZERO = 0x96f6;
private static final int CJK_IDEOGRAPH_COMPLEX_ONE = 0x58f9;
private static final int CJK_IDEOGRAPH_COMPLEX_TWO = 0x8cb3;
private static final int CJK_IDEOGRAPH_COMPLEX_THREE = 0x53c3;
private static final int CJK_IDEOGRAPH_COMPLEX_FOUR = 0x8086;
private static final int CJK_IDEOGRAPH_COMPLEX_FIVE = 0x4f0d;
private static final int CJK_IDEOGRAPH_COMPLEX_SIX = 0x9678;
private static final int CJK_IDEOGRAPH_COMPLEX_SEVEN = 0x67d2;
private static final int CJK_IDEOGRAPH_COMPLEX_EIGHT = 0x634c;
private static final int CJK_IDEOGRAPH_COMPLEX_NINE = 0x7396;
private static final int CJK_IDEOGRAPH_TEN = 0x5341;
private static final int CJK_IDEOGRAPH_COMPLEX_TEN = 0x62fe;
private static final int CJK_IDEOGRAPH_HUNDRED = 0x767e;
private static final int CJK_IDEOGRAPH_COMPLEX_HUNDRED = 0x4f70;
private static final int CJK_IDEOGRAPH_THOUSAND = 0x5343;
private static final int CJK_IDEOGRAPH_COMPLEX_THOUSAND = 0x4edf;
// U+842C; the previous value 0x824c was a transposition and is not a numeric ideograph.
private static final int CJK_IDEOGRAPH_TEN_THOUSAND = 0x842c;
private static final int CJK_IDEOGRAPH_HUNDRED_MILLION = 0x5104;

/** European (ASCII) digits for values 0-9 */
public static final int DIGIT_RANGE_EUROPEAN = 0;
/** European (ASCII) digits for values 0-9 and upper case letters for values 10-35 */
public static final int DIGIT_RANGE_EUROPEAN_EX_UC = 1;
/** European (ASCII) digits for values 0-9 and lower case letters for values 10-35 */
public static final int DIGIT_RANGE_EUROPEAN_EX_LC = 2;
/** European (FullWidth) digits for values 0-9 and fullwidth upper case letters for values 10-35 */
public static final int DIGIT_RANGE_EUROPEAN_EX_FW_UC = 3;
/** European (FullWidth) digits for values 0-9 and fullwidth lower case letters for values 10-35 */
public static final int DIGIT_RANGE_EUROPEAN_EX_FW_LC = 4;
/** Arabic digits for values 0-9 */
public static final int DIGIT_RANGE_ARABIC = 5;
/** Eastern Arabic (Persian) digits for values 0-9 */
public static final int DIGIT_RANGE_EASTERN_ARABIC = 6;
/** Devanagari digits for values 0-9 */
public static final int DIGIT_RANGE_DEVANAGARI = 7;
/** Bengali digits for values 0-9 */
public static final int DIGIT_RANGE_BENGALI = 8;
/** Gurmukhi digits for values 0-9 */
public static final int DIGIT_RANGE_GURMUKHI = 9;
/** Gujarati digits for values 0-9 */
public static final int DIGIT_RANGE_GUJARATI = 10;
/** Oriya digits for values 0-9 */
public static final int DIGIT_RANGE_ORIYA = 11;
/** Tamil digits for values 1-9, Tamil has no digit for zero. */
public static final int DIGIT_RANGE_TAMIL = 12;
/** Telugu digits for values 0-9 */
public static final int DIGIT_RANGE_TELUGU = 13;
/** Kannada digits for values 0-9 */
public static final int DIGIT_RANGE_KANNADA = 14;
/** Malayalam digits for values 0-9 */
public static final int DIGIT_RANGE_MALAYAM = 15;
/** Thai digits for values 0-9 */
public static final int DIGIT_RANGE_THAI = 16;
/** Lao digits for values 0-9 */
public static final int DIGIT_RANGE_LAO = 17;
/** Tibetan digits for values 0-9 */
public static final int DIGIT_RANGE_TIBETAN = 18;
/** Myanmar digits for values 0-9 */
public static final int DIGIT_RANGE_MYANMAR = 19;
/** Ethiopic digits for values 1-9, Ethiopic has no digit for zero. */
public static final int DIGIT_RANGE_ETHIOPIC = 20;
/** Khmer digits for values 0-9 */
public static final int DIGIT_RANGE_KHMER = 21;
/** Mongolian digits for values 0-9 */
public static final int DIGIT_RANGE_MONGOLIAN = 22;
/** Han digits for values 0-9 */
public static final int DIGIT_RANGE_HAN = 23;
/** Han ("checkwriting") digits for values 0-9 */
public static final int DIGIT_RANGE_HAN_CW = 24;

/** One past the last valid DIGIT_RANGE selector. */
private static final int DIGIT_RANGE_LIMIT = 25;
/** /**
* Special casing uppercase management * Special casing uppercase management
* @param ch code point to convert * @param ch code point to convert

View File

@ -5,8 +5,8 @@
******************************************************************************* *******************************************************************************
* *
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UCharacter.java,v $ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UCharacter.java,v $
* $Date: 2001/08/22 22:38:30 $ * $Date: 2001/10/05 18:42:33 $
* $Revision: 1.11 $ * $Revision: 1.12 $
* *
******************************************************************************* *******************************************************************************
*/ */
@ -57,8 +57,11 @@ import java.util.Locale;
* does it include the Java-specific character information, such as * does it include the Java-specific character information, such as
* boolean isJavaIdentifierPart(char ch). * boolean isJavaIdentifierPart(char ch).
* <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
* values '10' - '35'. UCharacter does not treat the above code points * values '10' - '35'. UCharacter also does this in digit and
* as having numeric values * getNumericValue, to adhere to the java semantics of these
* methods. New methods unicodeDigit, and
* getUnicodeNumericValue do not treat the above code points
* as having numeric values. This is a semantic change from ICU4J 1.3.1.
* <li> For consistency with ICU4C's data, control code points below have their * <li> For consistency with ICU4C's data, control code points below have their
* Unicode general category reset to the types below. * Unicode general category reset to the types below.
* <ul> * <ul>
@ -349,19 +352,25 @@ public final class UCharacter
// public methods =================================================== // public methods ===================================================
/** /**
* Retrieves the decimal numeric value of a digit code point.<br> * Retrieves the numeric value of a decimal digit code point.
* A code point is a valid digit if the following is true: * <br>This method observes the semantics of
* <code>java.lang.Character.digit()</code>. Note that this
* will return positive values for code points for which isDigit
* returns false, just like java.lang.Character.
* <br><em>Semantic Change:</em> In release 1.3.1 and
* prior, this did not treat the European letters as having a
* digit value, and also treated numeric letters and other numbers as digits.
* This has been changed to conform to the java semantics.
* <br>A code point is a valid digit if and only if:
* <ul> * <ul>
* <li> The method isDigit(ch) is true and the Unicode decimal digit value of * <li>ch is a decimal digit or one of the european letters, and
* ch is less than the specified radix. * <li>the value of ch is less than the specified radix.
* </ul> * </ul>
* Note this method, unlike java.lang.Character.digit() does not regard the * @param ch the code point to query
* ascii characters 'A' - 'Z' and 'a' - 'z' as digits. * @param radix the radix
* @param ch the code point whose numeric value is to be determined * @return the numeric value represented by the code point in the
* @param radix the radix which the digit is to be converted to * specified radix, or -1 if the code point is not a decimal digit
* @return the numeric value of the code point ch in the argument radix, * or if its value is too large for the radix
* this method returns -1 if ch is not a valid digit code point or
* if its digit value exceeds the radix.
*/ */
public static int digit(int ch, int radix) public static int digit(int ch, int radix)
{ {
@ -375,6 +384,7 @@ public final class UCharacter
result = UCharacterPropertyDB.getSignedValue(props); result = UCharacterPropertyDB.getSignedValue(props);
} }
} }
/*
else { else {
// contained in exception data // contained in exception data
int index = UCharacterPropertyDB.getExceptionIndex(props); int index = UCharacterPropertyDB.getExceptionIndex(props);
@ -398,6 +408,11 @@ public final class UCharacter
if (result < 0) { if (result < 0) {
result = getHanDigit(ch); result = getHanDigit(ch);
} }
*/
if (result < 0 && radix > 10) {
result = getEuropeanDigit(ch);
}
if (result < 0 || result >= radix) { if (result < 0 || result >= radix) {
return -1; return -1;
@ -405,14 +420,39 @@ public final class UCharacter
return result; return result;
} }
/**
 * Tests whether a code point is one of the Latin letters that
 * <code>java.lang.Character.digit()</code> accepts as a digit for radixes
 * above ten: ASCII 'A'-'Z' and 'a'-'z', or their fullwidth counterparts
 * U+FF21-U+FF3A and U+FF41-U+FF5A.
 * @param ch the code point to query
 * @return true if ch is one of those letters
 */
private static boolean isEuropeanDigit(int ch) {
    if (ch <= 0x7a) {
        // ASCII half: 'A'..'Z', or anything from 'a' up to the 'z' bound
        // already established by the enclosing test.
        boolean upperCase = ch >= 0x41 && ch <= 0x5a;
        return upperCase || ch >= 0x61;
    }
    if (ch < 0xff21) {
        // Between 'z' and FULLWIDTH LATIN CAPITAL LETTER A: never a match.
        return false;
    }
    // Fullwidth half: capitals end at U+FF3A, small letters are U+FF41..U+FF5A.
    return ch <= 0xff3a || (ch >= 0xff41 && ch <= 0xff5a);
}
/**
 * Returns the digit value (10 through 35) of a Latin letter used as a digit
 * in radixes above ten. ASCII 'A'-'Z' and 'a'-'z' and the fullwidth letters
 * U+FF21-U+FF3A and U+FF41-U+FF5A are recognized; case is ignored.
 * @param ch the code point to query
 * @return 10 for 'A'/'a' through 35 for 'Z'/'z', or -1 if ch is not one of
 *         the recognized letters
 */
private static int getEuropeanDigit(int ch) {
    // Find the first letter of whichever 26-letter run contains ch.
    final int firstLetter;
    if (ch >= 0x41 && ch <= 0x5a) {
        firstLetter = 0x41;     // 'A'..'Z'
    } else if (ch >= 0x61 && ch <= 0x7a) {
        firstLetter = 0x61;     // 'a'..'z'
    } else if (ch >= 0xff21 && ch <= 0xff3a) {
        firstLetter = 0xff21;   // fullwidth 'A'..'Z'
    } else if (ch >= 0xff41 && ch <= 0xff5a) {
        firstLetter = 0xff41;   // fullwidth 'a'..'z'
    } else {
        return -1;
    }
    // The first letter of each run carries the value ten.
    return ch - firstLetter + 10;
}
/** /**
* Retrieves the decimal numeric value of a digit code point in radix 10<br> * Retrieves the numeric value of a decimal digit code point.
* Note this method, unlike java.lang.Character.digit() does not regard the * <br>This is a convenience overload of <code>digit(int, int)</code>
* ascii characters 'A' - 'Z' and 'a' - 'z' as digits. * that provides a decimal radix.
* @param ch the code point whose numeric value is to be determined * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
* @return the numeric value of the code point ch, this method returns -1 if * treated numeric letters and other numbers as digits. This has
* ch is not a valid digit code point * been changed to conform to the java semantics.
*/ * @param ch the code point to query
* @return the numeric value represented by the code point,
* or -1 if the code point is not a decimal digit or if its
* value is too large for a decimal radix */
public static int digit(int ch) public static int digit(int ch)
{ {
return digit(ch, DECIMAL_RADIX_); return digit(ch, DECIMAL_RADIX_);
@ -420,17 +460,43 @@ public final class UCharacter
/** /**
* Returns the Unicode numeric value of the code point as a nonnegative * Returns the Unicode numeric value of the code point as a nonnegative
* integer. <br> * integer.
* If the code point does not have a numeric value, then -1 is returned. <br> * <br>If the code point does not have a numeric value, then -1 is returned. <br>
* If the code point has a numeric value that cannot be represented as a * If the code point has a numeric value that cannot be represented as a
* nonnegative integer (for example, a fractional value), then -2 is returned. * nonnegative integer (for example, a fractional value), then -2 is returned.
* <br> * <br><em>Semantic Change:</em> In release 1.3.1 and
* Note this method, unlike java.lang.Character.digit() does not regard the * prior, this returned -1 for ASCII letters and their
* ascii characters 'A' - 'Z' and 'a' - 'z' as numbers. * fullwidth counterparts. This has been changed to
* @param ch Unicode code point * conform to the java semantics.
* @return numeric value of the code point as a nonnegative integer * @param ch the code point to query
* @return the numeric value of the code point, or -1 if it has no numeric value,
* or -2 if it has a numeric value that cannot be represented as a nonnegative
* integer
*/ */
public static int getNumericValue(int ch) public static int getNumericValue(int ch)
{
return getNumericValueInternal(ch, true);
}
/**
* Returns the Unicode numeric value of the code point as a nonnegative
* integer.
* <br>If the code point does not have a numeric value, then -1 is returned. <br>
* If the code point has a numeric value that cannot be represented as a
* nonnegative integer (for example, a fractional value), then -2 is returned.
* This returns values other than -1 for all and only those code points whose
* type is a numeric type.
* @param ch the code point to query
* @return the numeric value of the code point, or -1 if it has no numeric value,
* or -2 if it has a numeric value that cannot be represented as a nonnegative
* integer
*/
public static int getUnicodeNumericValue(int ch)
{
return getNumericValueInternal(ch, false);
}
private static int getNumericValueInternal(int ch, boolean useEuropean)
{ {
int props = getProps(ch); int props = getProps(ch);
int type = UCharacterPropertyDB.getPropType(props); int type = UCharacterPropertyDB.getPropType(props);
@ -439,7 +505,8 @@ public final class UCharacter
if (type != UCharacterCategory.DECIMAL_DIGIT_NUMBER && if (type != UCharacterCategory.DECIMAL_DIGIT_NUMBER &&
type != UCharacterCategory.LETTER_NUMBER && type != UCharacterCategory.LETTER_NUMBER &&
type != UCharacterCategory.OTHER_NUMBER) { type != UCharacterCategory.OTHER_NUMBER) {
return -1;
return useEuropean ? getEuropeanDigit(ch) : -1;
} }
int result = -1; int result = -1;
@ -453,7 +520,8 @@ public final class UCharacter
if (PROPERTY_DB_.hasExceptionValue(index, if (PROPERTY_DB_.hasExceptionValue(index,
UCharacterPropertyDB.EXC_DIGIT_VALUE_)) { UCharacterPropertyDB.EXC_DIGIT_VALUE_)) {
result = PROPERTY_DB_.getException(index, result = PROPERTY_DB_.getException(index,
UCharacterPropertyDB.EXC_DIGIT_VALUE_); UCharacterPropertyDB.EXC_DIGIT_VALUE_) &
LAST_CHAR_MASK_;
} }
else { else {
if (!PROPERTY_DB_.hasExceptionValue(index, if (!PROPERTY_DB_.hasExceptionValue(index,
@ -466,10 +534,6 @@ public final class UCharacter
} }
} }
if (result < 0) {
result = getHanDigit(ch);
}
if (result < 0) { if (result < 0) {
return -2; return -2;
} }
@ -506,19 +570,18 @@ public final class UCharacter
} }
/** /**
* Determines if a code point is a digit.<br> * Determines if a code point is a Java digit.
* Note this method, unlike java.lang.Character.isDigit() does not regard the * <br>This method observes the semantics of
* ascii characters 'A' - 'Z' and 'a' - 'z' as digits.<br> * <code>java.lang.Character.isDigit()</code>. It returns true for
* @param ch code point to determine if it is a digit * decimal digits only.
* @return true if this code point is a digit * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
*/ * treated numeric letters and other numbers as digits. This has
* been changed to conform to the java semantics.
* @param ch code point to query
* @return true if this code point is a digit */
public static boolean isDigit(int ch) public static boolean isDigit(int ch)
{ {
int cat = getType(ch); return getType(ch) == UCharacterCategory.DECIMAL_DIGIT_NUMBER;
// if props == 0, it will just fall through and return false
return cat == UCharacterCategory.DECIMAL_DIGIT_NUMBER ||
cat == UCharacterCategory.OTHER_NUMBER ||
cat == UCharacterCategory.LETTER_NUMBER;
} }
/** /**
@ -1527,38 +1590,220 @@ public final class UCharacter
} }
/** /**
* Getting Han character digit values * Return numeric value of Han code points.
* @param ch code point to test if it is a Han character * <br> This returns the value of Han 'numeric' code points,
* @return Han digit value if ch is a Han digit character * including those for zero, ten, hundred, thousand, ten thousand,
* and hundred million. Unicode does not consider these to be
* numeric. This includes both the standard and 'checkwriting'
* characters, the 'big circle' zero character, and the standard
* zero character.
* @param ch code point to query
* @return value if it is a Han 'numeric character,' otherwise return -1.
*/ */
private static int getHanDigit(int ch) public static int getHanNumericValue(int ch)
{ {
switch(ch) switch(ch)
{ {
case IDEOGRAPHIC_NUMBER_ZERO_ : case IDEOGRAPHIC_NUMBER_ZERO_ :
case CJK_IDEOGRAPH_COMPLEX_ZERO:
return 0; // Han Zero return 0; // Han Zero
case CJK_IDEOGRAPH_FIRST_ : case CJK_IDEOGRAPH_FIRST_ :
case CJK_IDEOGRAPH_COMPLEX_ONE:
return 1; // Han One return 1; // Han One
case CJK_IDEOGRAPH_SECOND_ : case CJK_IDEOGRAPH_SECOND_ :
case CJK_IDEOGRAPH_COMPLEX_TWO:
return 2; // Han Two return 2; // Han Two
case CJK_IDEOGRAPH_THIRD_ : case CJK_IDEOGRAPH_THIRD_ :
case CJK_IDEOGRAPH_COMPLEX_THREE:
return 3; // Han Three return 3; // Han Three
case CJK_IDEOGRAPH_FOURTH_ : case CJK_IDEOGRAPH_FOURTH_ :
case CJK_IDEOGRAPH_COMPLEX_FOUR:
return 4; // Han Four return 4; // Han Four
case CJK_IDEOGRAPH_FIFTH_ : case CJK_IDEOGRAPH_FIFTH_ :
case CJK_IDEOGRAPH_COMPLEX_FIVE:
return 5; // Han Five return 5; // Han Five
case CJK_IDEOGRAPH_SIXTH_ : case CJK_IDEOGRAPH_SIXTH_ :
case CJK_IDEOGRAPH_COMPLEX_SIX:
return 6; // Han Six return 6; // Han Six
case CJK_IDEOGRAPH_SEVENTH_ : case CJK_IDEOGRAPH_SEVENTH_ :
case CJK_IDEOGRAPH_COMPLEX_SEVEN:
return 7; // Han Seven return 7; // Han Seven
case CJK_IDEOGRAPH_EIGHTH_ : case CJK_IDEOGRAPH_EIGHTH_ :
case CJK_IDEOGRAPH_COMPLEX_EIGHT:
return 8; // Han Eight return 8; // Han Eight
case CJK_IDEOGRAPH_NINETH_ : case CJK_IDEOGRAPH_NINETH_ :
case CJK_IDEOGRAPH_COMPLEX_NINE:
return 9; // Han Nine return 9; // Han Nine
case CJK_IDEOGRAPH_TEN:
case CJK_IDEOGRAPH_COMPLEX_TEN:
return 10;
case CJK_IDEOGRAPH_HUNDRED:
case CJK_IDEOGRAPH_COMPLEX_HUNDRED:
return 100;
case CJK_IDEOGRAPH_THOUSAND:
case CJK_IDEOGRAPH_COMPLEX_THOUSAND:
return 1000;
case CJK_IDEOGRAPH_TEN_THOUSAND:
return 10000;
case CJK_IDEOGRAPH_HUNDRED_MILLION:
return 100000000;
} }
return -1; // no value return -1; // no value
} }
/**
 * Returns the code point that represents a digit value in a given script.
 * The script is chosen by a DIGIT_RANGE selector. Most ranges only accept
 * values between 0 and 9; the four EUROPEAN_EX ranges also accept values
 * between 10 and 35, which map to the letters 'A'-'Z' in the case and width
 * named by the selector. Ranges whose script has no digit zero (Tamil,
 * Ethiopic) return -1 for the value 0.
 * @param digit the value to represent, from 0 to 9 for most ranges, and
 *        from 0 to 35 for the EUROPEAN_EX ranges
 * @param digitRange one of the DIGIT_RANGE selectors
 * @return the code point, or -1 if the range has no code point for that value
 * @throws IllegalArgumentException if digitRange is not a DIGIT_RANGE selector
 */
public static int getCodePointForDigit(int digit, int digitRange) {
    // DIGIT_RANGE_LIMIT is one past the last valid selector, so it is rejected too.
    if (digitRange < 0 || digitRange >= DIGIT_RANGE_LIMIT) {
        throw new IllegalArgumentException("invalid digit range selector: " + digitRange);
    }
    // Only selectors with a real entry in exbases (index 0 is a placeholder)
    // can represent values above nine.
    boolean hasLetterDigits = digitRange > 0 && digitRange < exbases.length;
    int maxDigit = hasLetterDigits ? 35 : 9;
    if (digit < 0 || digit > maxDigit) {
        return -1;
    }
    if (digit >= 10) {
        // exbases holds the letter for value ten, so offset from there.
        return exbases[digitRange] + (digit - 10);
    }
    if (digitRange == DIGIT_RANGE_HAN) {
        return hanmap[digit];
    }
    if (digitRange > DIGIT_RANGE_HAN) {
        return exhanmap[digit];
    }
    int base = bases[digitRange];
    if ((base & DIGIT_BASE_NO_ZERO) != 0) {
        if (digit == 0) {
            return -1;
        }
        base &= ~DIGIT_BASE_NO_ZERO;
    }
    return base + digit;
}

/** Flag or'ed into a bases entry when the script has no code point for zero. */
private static final int DIGIT_BASE_NO_ZERO = 0x40000000;

/**
 * Code point corresponding to the value zero for each contiguous digit
 * range, indexed by DIGIT_RANGE selector (all selectors below
 * DIGIT_RANGE_HAN). Entries flagged with DIGIT_BASE_NO_ZERO (Tamil at
 * index 12, Ethiopic at index 20) only serve the values 1-9.
 */
private static final int[] bases = {
    0x0030, 0x0030, 0x0030, 0xff10, 0xff10,
    0x0660, 0x06f0, 0x0966, 0x09e6, 0x0a66,
    0x0ae6, 0x0b66, 0x0be6 | DIGIT_BASE_NO_ZERO, 0x0c66, 0x0ce6,
    0x0d66, 0x0e50, 0x0ed0, 0x0f20, 0x1040,
    0x1368 | DIGIT_BASE_NO_ZERO, 0x17e0, 0x1810
};

/**
 * Code point of the letter with value ten for each EUROPEAN_EX selector.
 * Index 0 has no letters and is only a placeholder.
 */
private static final int[] exbases = {
    0x0000, 0x0041, 0x0061, 0xff21, 0xff41
};

/* standard Han digits zero through nine, using the 'big circle' ling for zero */
private static final int[] hanmap = {
    0x3007, 0x4e00, 0x4e8c, 0x4e09, 0x56db, 0x4e94, 0x516d, 0x4e03, 0x516b, 0x4e5d
};

/* 'checkwriting' Han digits zero through nine */
private static final int[] exhanmap = {
    0x96f6, 0x58f9, 0x8cb3, 0x53c3, 0x8086, 0x4f0d, 0x9678, 0x67d2, 0x634c, 0x7396
};
/*
 * Han ideographs with numeric meaning. The COMPLEX forms are the
 * check-writing variants of the corresponding plain numerals.
 */
private static final int CJK_IDEOGRAPH_COMPLEX_ZERO     = 0x96f6;
private static final int CJK_IDEOGRAPH_COMPLEX_ONE      = 0x58f9;
private static final int CJK_IDEOGRAPH_COMPLEX_TWO      = 0x8cb3;
private static final int CJK_IDEOGRAPH_COMPLEX_THREE    = 0x53c3;
private static final int CJK_IDEOGRAPH_COMPLEX_FOUR     = 0x8086;
private static final int CJK_IDEOGRAPH_COMPLEX_FIVE     = 0x4f0d;
private static final int CJK_IDEOGRAPH_COMPLEX_SIX      = 0x9678;
private static final int CJK_IDEOGRAPH_COMPLEX_SEVEN    = 0x67d2;
private static final int CJK_IDEOGRAPH_COMPLEX_EIGHT    = 0x634c;
private static final int CJK_IDEOGRAPH_COMPLEX_NINE     = 0x7396;
private static final int CJK_IDEOGRAPH_TEN              = 0x5341;
private static final int CJK_IDEOGRAPH_COMPLEX_TEN      = 0x62fe;
private static final int CJK_IDEOGRAPH_HUNDRED          = 0x767e;
private static final int CJK_IDEOGRAPH_COMPLEX_HUNDRED  = 0x4f70;
private static final int CJK_IDEOGRAPH_THOUSAND         = 0x5343;
private static final int CJK_IDEOGRAPH_COMPLEX_THOUSAND = 0x4edf;
// U+842C (wan); the previous value 0x824c was a digit transposition and
// named an ideograph with no numeric meaning.
private static final int CJK_IDEOGRAPH_TEN_THOUSAND     = 0x842c;
private static final int CJK_IDEOGRAPH_HUNDRED_MILLION  = 0x5104;
/** European (ASCII) digits for values 0-9 */
public static final int DIGIT_RANGE_EUROPEAN = 0;
/** European (ASCII) digits for values 0-9 and upper case letters for values 10-35 */
public static final int DIGIT_RANGE_EUROPEAN_EX_UC = 1;
/** European (ASCII) digits for values 0-9 and lower case letters for values 10-35 */
public static final int DIGIT_RANGE_EUROPEAN_EX_LC = 2;
/** European (FullWidth) digits for values 0-9 and fullwidth upper case letters for values 10-35 */
public static final int DIGIT_RANGE_EUROPEAN_EX_FW_UC = 3;
/** European (FullWidth) digits for values 0-9 and fullwidth lower case letters for values 10-35 */
public static final int DIGIT_RANGE_EUROPEAN_EX_FW_LC = 4;
/** Arabic digits for values 0-9 */
public static final int DIGIT_RANGE_ARABIC = 5;
/** Eastern Arabic (Persian) digits for values 0-9 */
public static final int DIGIT_RANGE_EASTERN_ARABIC = 6;
/** Devanagari digits for values 0-9 */
public static final int DIGIT_RANGE_DEVANAGARI = 7;
/** Bengali digits for values 0-9 */
public static final int DIGIT_RANGE_BENGALI = 8;
/** Gurmukhi digits for values 0-9 */
public static final int DIGIT_RANGE_GURMUKHI = 9;
/** Gujarati digits for values 0-9 */
public static final int DIGIT_RANGE_GUJARATI = 10;
/** Oriya digits for values 0-9 */
public static final int DIGIT_RANGE_ORIYA = 11;
/** Tamil digits for values 1-9, Tamil has no digit for zero. */
public static final int DIGIT_RANGE_TAMIL = 12;
/** Telugu digits for values 0-9 */
public static final int DIGIT_RANGE_TELUGU = 13;
/** Kannada digits for values 0-9 */
public static final int DIGIT_RANGE_KANNADA = 14;
/** Malayalam digits for values 0-9 (the constant name keeps its historical spelling) */
public static final int DIGIT_RANGE_MALAYAM = 15;
/** Thai digits for values 0-9 */
public static final int DIGIT_RANGE_THAI = 16;
/** Lao digits for values 0-9 */
public static final int DIGIT_RANGE_LAO = 17;
/** Tibetan digits for values 0-9 */
public static final int DIGIT_RANGE_TIBETAN = 18;
/** Myanmar digits for values 0-9 */
public static final int DIGIT_RANGE_MYANMAR = 19;
/** Ethiopic digits for values 1-9, Ethiopic has no digit for zero. */
public static final int DIGIT_RANGE_ETHIOPIC = 20;
/** Khmer digits for values 0-9 */
public static final int DIGIT_RANGE_KHMER = 21;
/** Mongolian digits for values 0-9 */
public static final int DIGIT_RANGE_MONGOLIAN = 22;
/** Han digits for values 0-9 */
public static final int DIGIT_RANGE_HAN = 23;
/** Han ("checkwriting") digits for values 0-9 */
public static final int DIGIT_RANGE_HAN_CW = 24;
/* Number of digit range selectors; one greater than the largest selector defined above. */
private static final int DIGIT_RANGE_LIMIT = 25;
/** /**
* Special casing uppercase management * Special casing uppercase management
* @param ch code point to convert * @param ch code point to convert