ICU-76 ucharacter semantic changes

jitterbug 77: add additional API for numeric values jitterbug 78: UCharacter.getNumericValue superscript bugs X-SVN-Rev: 6085
2001-10-05 18:42:33 +00:00 · 2001-10-05 18:42:33 +00:00 · 21b8e60b11
commit 21b8e60b11
parent 5746e4c2fc
2 changed files with 600 additions and 110 deletions
--- a/icu4j/src/com/ibm/icu/lang/UCharacter.java
+++ b/icu4j/src/com/ibm/icu/lang/UCharacter.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/lang/UCharacter.java,v $ 
-* $Date: 2001/08/22 22:38:30 $ 
+* $Date: 2001/10/05 18:42:33 $ 
-* $Revision: 1.11 $
+* $Revision: 1.12 $
 *
 *******************************************************************************
 */
@ -57,8 +57,11 @@ import java.util.Locale;
 *      does it include the Java-specific character information, such as 
 *      boolean isJavaIdentifierPart(char ch).
 * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric 
-*      values '10' - '35'. UCharacter does not treat the above code points 
+*      values '10' - '35'. UCharacter also does this in digit and
-*      as having numeric values
+*      getNumericValue, to adhere to the java semantics of these
 *      methods.  New methods unicodeDigit, and
 *      getUnicodeNumericValue do not treat the above code points 
 *      as having numeric values.  This is a semantic change from ICU4J 1.3.1.
 * <li> For consistency with ICU4C's data, control code points below have their 
 *      Unicode general category reset to the types below.
 *      <ul>
@ -349,19 +352,25 @@ public final class UCharacter
    // public methods ===================================================
    /**
-    * Retrieves the decimal numeric value of a digit code point.<br>
+    * Retrieves the numeric value of a decimal digit code point.
-    * A code point is a valid digit if the following is true: 
+    * <br>This method observes the semantics of
    * <code>java.lang.Character.digit()</code>.  Note that this
    * will return positive values for code points for which isDigit
    * returns false, just like java.lang.Character.
    * <br><em>Semantic Change:</em> In release 1.3.1 and
    * prior, this did not treat the European letters as having a
    * digit value, and also treated numeric letters and other numbers as digits.  
    * This has been changed to conform to the java semantics.
    * <br>A code point is a valid digit if and only if:
    * <ul>
-    * <li> The method isDigit(ch) is true and the Unicode decimal digit value of 
+    *   <li>ch is a decimal digit or one of the european letters, and
-    *      ch is less than the specified radix. 
+    *   <li>the value of ch is less than the specified radix.
    * </ul>
-    * Note this method, unlike java.lang.Character.digit() does not regard the 
+    * @param ch the code point to query
-    * ascii characters 'A' - 'Z' and 'a' - 'z' as digits. 
+    * @param radix the radix
-    * @param ch the code point whose numeric value is to be determined
+    * @return the numeric value represented by the code point in the
-    * @param radix the radix which the digit is to be converted to
+    * specified radix, or -1 if the code point is not a decimal digit
-    * @return the numeric value of the code point ch in the argument radix,
+    * or if its value is too large for the radix
    *         this method returns -1 if ch is not a valid digit code point or 
    *         if its digit value exceeds the radix.
    */
    public static int digit(int ch, int radix)
    {
@ -375,6 +384,7 @@ public final class UCharacter
                result = UCharacterPropertyDB.getSignedValue(props);
            }
        }
 /*
        else {
            // contained in exception data
            int index = UCharacterPropertyDB.getExceptionIndex(props);
@ -398,6 +408,11 @@ public final class UCharacter
        if (result < 0) {
            result = getHanDigit(ch);
        }
 */
        if (result < 0 && radix > 10) {
            result = getEuropeanDigit(ch);
        }
        if (result < 0 || result >= radix) {
            return -1;
@ -405,14 +420,39 @@ public final class UCharacter
        return result;
    }
    private static boolean isEuropeanDigit(int ch) {
        return (ch <= 0x7a && ((ch >= 0x41 && ch <= 0x5a) || ch >= 0x61)) ||
            (ch >= 0xff21 && (ch <= 0xff3a || (ch >= 0xff41 && ch <= 0xff5a)));
    }
    private static int getEuropeanDigit(int ch) {
        if (ch <= 0x7a) {
            if (ch >= 0x41 && ch <= 0x5a) {
                return ch + 10 - 0x41;
            } else if (ch >= 0x61) {
                return ch + 10 - 0x61;
            }
        } else if (ch >= 0xff21) {
            if (ch <= 0xff3a) {
                return ch + 10 - 0xff21;
            } else if (ch >= 0xff41 && ch <= 0xff5a) {
                return ch + 10 - 0xff41;
            }
        }
        return -1;
    }
    /**
-    * Retrieves the decimal numeric value of a digit code point in radix 10<br>
+    * Retrieves the numeric value of a decimal digit code point.
-    * Note this method, unlike java.lang.Character.digit() does not regard the 
+    * <br>This is a convenience overload of <code>digit(int, int)</code> 
-    * ascii characters 'A' - 'Z' and 'a' - 'z' as digits. 
+    * that provides a decimal radix.
-    * @param ch the code point whose numeric value is to be determined
+    * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
-    * @return the numeric value of the code point ch, this method returns -1 if 
+    * treated numeric letters and other numbers as digits.  This has
-    *         ch is not a valid digit code point
+    * been changed to conform to the java semantics.
-    */
+    * @param ch the code point to query
    * @return the numeric value represented by the code point,
    * or -1 if the code point is not a decimal digit or if its
    * value is too large for a decimal radix */
    public static int digit(int ch)
    {
        return digit(ch, DECIMAL_RADIX_);
@ -420,17 +460,43 @@ public final class UCharacter
   /**
    * Returns the Unicode numeric value of the code point as a nonnegative 
-    * integer. <br>
+    * integer.
-    * If the code point does not have a numeric value, then -1 is returned. <br>
+    * <br>If the code point does not have a numeric value, then -1 is returned. <br>
    * If the code point has a numeric value that cannot be represented as a 
    * nonnegative integer (for example, a fractional value), then -2 is returned.
-    * <br>
+    * <br><em>Semantic Change:</em> In release 1.3.1 and
-    * Note this method, unlike java.lang.Character.digit() does not regard the 
+    * prior, this returned -1 for ASCII letters and their
-    * ascii characters 'A' - 'Z' and 'a' - 'z' as numbers. 
+    * fullwidth counterparts.  This has been changed to
-    * @param ch Unicode code point
+    * conform to the java semantics.
-    * @return numeric value of the code point as a nonnegative integer
+    * @param ch the code point to query
    * @return the numeric value of the code point, or -1 if it has no numeric value,
    * or -2 if it has a numeric value that cannot be represented as a nonnegative 
    * integer
    */
    public static int getNumericValue(int ch)
    {
        return getNumericValueInternal(ch, true);
    }
   /**
    * Returns the Unicode numeric value of the code point as a nonnegative 
    * integer.
    * <br>If the code point does not have a numeric value, then -1 is returned. <br>
    * If the code point has a numeric value that cannot be represented as a 
    * nonnegative integer (for example, a fractional value), then -2 is returned.
    * This returns values other than -1 for all and only those code points whose
    * type is a numeric type.
    * @param ch the code point to query
    * @return the numeric value of the code point, or -1 if it has no numeric value,
    * or -2 if it has a numeric value that cannot be represented as a nonnegative 
    * integer
    */
    public static int getUnicodeNumericValue(int ch)
    {
        return getNumericValueInternal(ch, false);
    }
    private static int getNumericValueInternal(int ch, boolean useEuropean)
    {
        int props = getProps(ch);
        int type = UCharacterPropertyDB.getPropType(props);
@ -439,7 +505,8 @@ public final class UCharacter
        if (type != UCharacterCategory.DECIMAL_DIGIT_NUMBER &&
            type != UCharacterCategory.LETTER_NUMBER &&
            type != UCharacterCategory.OTHER_NUMBER) {
-            return -1;
+
            return useEuropean ? getEuropeanDigit(ch) : -1;
        }
        int result = -1;
@ -453,7 +520,8 @@ public final class UCharacter
            if (PROPERTY_DB_.hasExceptionValue(index, 
                                       UCharacterPropertyDB.EXC_DIGIT_VALUE_)) {
                result  = PROPERTY_DB_.getException(index, 
-                                        UCharacterPropertyDB.EXC_DIGIT_VALUE_); 
+                                        UCharacterPropertyDB.EXC_DIGIT_VALUE_) &
                    LAST_CHAR_MASK_; 
            }
            else {
                if (!PROPERTY_DB_.hasExceptionValue(index, 
@ -466,10 +534,6 @@ public final class UCharacter
            }
        }
        if (result < 0) {
          result = getHanDigit(ch);
        }
        if (result < 0) {
            return -2;
        }
@ -506,19 +570,18 @@ public final class UCharacter
    }
   /**
-    * Determines if a code point is a digit.<br>
+    * Determines if a code point is a Java digit.
-    * Note this method, unlike java.lang.Character.isDigit() does not regard the 
+    * <br>This method observes the semantics of
-    * ascii characters 'A' - 'Z' and 'a' - 'z' as digits.<br>
+    * <code>java.lang.Character.isDigit()</code>.  It returns true for
-    * @param ch code point to determine if it is a digit
+    * decimal digits only.
-    * @return true if this code point is a digit
+    * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
-    */
+    * treated numeric letters and other numbers as digits.  This has
    * been changed to conform to the java semantics.
    * @param ch code point to query
    * @return true if this code point is a digit */
    public static boolean isDigit(int ch)
    {
-        int cat = getType(ch);
+        return getType(ch) == UCharacterCategory.DECIMAL_DIGIT_NUMBER;
        // if props == 0, it will just fall through and return false
        return cat == UCharacterCategory.DECIMAL_DIGIT_NUMBER || 
            cat == UCharacterCategory.OTHER_NUMBER ||
            cat == UCharacterCategory.LETTER_NUMBER;
    }
    /**
@ -1527,38 +1590,220 @@ public final class UCharacter
    }
    /**
-    * Getting Han character digit values
+    * Return numeric value of Han code points.
-    * @param ch code point to test if it is a Han character
+    * <br> This returns the value of Han 'numeric' code points,
-    * @return Han digit value if ch is a Han digit character
+    * including those for zero, ten, hundred, thousand, ten thousand,
    * and hundred million.  Unicode does not consider these to be
    * numeric. This includes both the standard and 'checkwriting'
    * characters, the 'big circle' zero character, and the standard
    * zero character.
    * @param ch code point to query
    * @return value if it is a Han 'numeric character,' otherwise return -1.  
    */
-    private static int getHanDigit(int ch)
+    public static int getHanNumericValue(int ch)
    {
        switch(ch)
        {
        case IDEOGRAPHIC_NUMBER_ZERO_ :
        case CJK_IDEOGRAPH_COMPLEX_ZERO:
            return 0; // Han Zero
        case CJK_IDEOGRAPH_FIRST_ :
        case CJK_IDEOGRAPH_COMPLEX_ONE:
            return 1; // Han One
        case CJK_IDEOGRAPH_SECOND_ :
        case CJK_IDEOGRAPH_COMPLEX_TWO:
            return 2; // Han Two
        case CJK_IDEOGRAPH_THIRD_ :
        case CJK_IDEOGRAPH_COMPLEX_THREE:
            return 3; // Han Three
        case CJK_IDEOGRAPH_FOURTH_ :
        case CJK_IDEOGRAPH_COMPLEX_FOUR:
            return 4; // Han Four
        case CJK_IDEOGRAPH_FIFTH_ :
        case CJK_IDEOGRAPH_COMPLEX_FIVE:
            return 5; // Han Five
        case CJK_IDEOGRAPH_SIXTH_ :
        case CJK_IDEOGRAPH_COMPLEX_SIX:
            return 6; // Han Six
        case CJK_IDEOGRAPH_SEVENTH_ :
        case CJK_IDEOGRAPH_COMPLEX_SEVEN:
            return 7; // Han Seven
        case CJK_IDEOGRAPH_EIGHTH_ : 
        case CJK_IDEOGRAPH_COMPLEX_EIGHT:
            return 8; // Han Eight
        case CJK_IDEOGRAPH_NINETH_ :
        case CJK_IDEOGRAPH_COMPLEX_NINE:
            return 9; // Han Nine
        case CJK_IDEOGRAPH_TEN:
        case CJK_IDEOGRAPH_COMPLEX_TEN:
            return 10;
        case CJK_IDEOGRAPH_HUNDRED:
        case CJK_IDEOGRAPH_COMPLEX_HUNDRED:
            return 100;
        case CJK_IDEOGRAPH_THOUSAND:
        case CJK_IDEOGRAPH_COMPLEX_THOUSAND:
            return 1000;
        case CJK_IDEOGRAPH_TEN_THOUSAND:
            return 10000;
        case CJK_IDEOGRAPH_HUNDRED_MILLION:
            return 100000000;
        }
        return -1; // no value
    }
    /*
     * Return a decimal code point in the given range for the provided value.
     * The range is defined by a DIGIT_RANGE selector, see below.  Most ranges
     * only accept values between 0 and 9, some ranges (EUROPEAN_EX) accept
     * values between 0 and 35.
     * <br>
     * @param value a decimal value, from 0 to 9 for most standard ranges, and
     * from 0 to 35 for the EUROPEAN_EX ranges.
     * @param digitRange one of the DIGIT_RANGE selectors.
     * @returns the code point, or -1 if no valid code point exists for that decimal.
     */
    public int getCodePointForDigit(int digit, int digitRange) {
        if (digitRange < 0 || digitRange > DIGIT_RANGE_LIMIT) {
            throw new IllegalArgumentException("invalid digit range selector: " + digitRange);
        }
        if (digit < 0 || digit > ((digitRange < 1 || digitRange > 4) ? 9 : 35)) {
            return -1;
        }
        if (digit < 10) {
            if (digitRange < DIGIT_RANGE_HAN) {
                if (digit == 0 && digitRange == DIGIT_RANGE_TAMIL) {
                    return -1;
                }
                return bases[digitRange] + digit;
            } else if (digitRange == DIGIT_RANGE_HAN) {
                return hanmap[digit];
            } else {
                return exhanmap[digit];
            }
        } else {
            return exbases[digitRange] + digit;
        }
    }
    private static int[] bases = {
        0x0030, 0x0030, 0x0030, 0x0030, 0x0030, 
        0x0660, 0x06f0, 0x0966, 0x09e6, 0x0a66,
        0x0ae6, 0x0b66, 0x0be6, 0x0c66, 0x0ce6,
        0x0d66, 0x0e50, 0x0ed0, 0x0f20, 0x1040,
        0x1369, 0x17e0, 0x1810
    };
    private static int[] exbases = {
        0x0000, 0x0040, 0x0060, 0xff21, 0xff41
    };
    /* uses 'big circle' ling, includes shi, bai, qian, wan, yi */
    private static int[] hanmap = {
        0x3007, 0x4e00, 0x48ec, 0x4e09, 0x56d8, 0x4e94, 0x516d, 0x4e03, 0x516b, 0x4e5d
    };
    /* uses lingsuide ling, includes shi, bai, qian, wan, yi */
    private static int[] exhanmap = {
        0x96f6, 0x58f9, 0x8cb3, 0x53c3, 0x8086, 0x4f0d, 0x9678, 0x67d2, 0x634c, 0x7396
    };
    private static final int CJK_IDEOGRAPH_COMPLEX_ZERO = 0x96f6;
    private static final int CJK_IDEOGRAPH_COMPLEX_ONE  = 0x58f9;
    private static final int CJK_IDEOGRAPH_COMPLEX_TWO = 0x8cb3;
    private static final int CJK_IDEOGRAPH_COMPLEX_THREE = 0x53c3;
    private static final int CJK_IDEOGRAPH_COMPLEX_FOUR = 0x8086;
    private static final int CJK_IDEOGRAPH_COMPLEX_FIVE = 0x4f0d;
    private static final int CJK_IDEOGRAPH_COMPLEX_SIX = 0x9678;
    private static final int CJK_IDEOGRAPH_COMPLEX_SEVEN = 0x67d2;
    private static final int CJK_IDEOGRAPH_COMPLEX_EIGHT = 0x634c;
    private static final int CJK_IDEOGRAPH_COMPLEX_NINE = 0x7396;
    private static final int CJK_IDEOGRAPH_TEN = 0x5341;
    private static final int CJK_IDEOGRAPH_COMPLEX_TEN = 0x62fe;
    private static final int CJK_IDEOGRAPH_HUNDRED = 0x767e;
    private static final int CJK_IDEOGRAPH_COMPLEX_HUNDRED = 0x4f70;
    private static final int CJK_IDEOGRAPH_THOUSAND = 0x5343;
    private static final int CJK_IDEOGRAPH_COMPLEX_THOUSAND = 0x4edf;
    private static final int CJK_IDEOGRAPH_TEN_THOUSAND = 0x824c;
    private static final int CJK_IDEOGRAPH_HUNDRED_MILLION = 0x5104;
    /** European (ASCII) digits for values 0-9 */
    public static final int DIGIT_RANGE_EUROPEAN = 0;
    /** European (ASCII) digits for values 0-9 and upper case letters for values 10-35 */
    public static final int DIGIT_RANGE_EUROPEAN_EX_UC = 1;
    /** European (ASCII) digits for values 0-9 and lower case letters for values 10-35 */
    public static final int DIGIT_RANGE_EUROPEAN_EX_LC = 2;
    /** European (FullWidth) digits for values 0-9 and fullwidth upper case letters for values 10-35 */
    public static final int DIGIT_RANGE_EUROPEAN_EX_FW_UC = 3;
    /** European (FullWidth) digits for values 0-9 and fullwidth lower case letters for values 10-35 */
    public static final int DIGIT_RANGE_EUROPEAN_EX_FW_LC = 4;
    /** Arabic digits for values 0-9 */
    public static final int DIGIT_RANGE_ARABIC = 5;
    /** Eastern Arabic (Persian) digits for values 0-9 */
    public static final int DIGIT_RANGE_EASTERN_ARABIC = 6;
    /** Devanagari digits for values 0-9 */
    public static final int DIGIT_RANGE_DEVANAGARI = 7;
    /** Bengali digits for values 0-9 */
    public static final int DIGIT_RANGE_BENGALI = 8;
    /** Gurmukhi digits for values 0-9 */
    public static final int DIGIT_RANGE_GURMUKHI = 9;
    /** Gurjarati digits for values 0-9 */
    public static final int DIGIT_RANGE_GUJARATI = 10;
    /** Oriya digits for values 0-9 */
    public static final int DIGIT_RANGE_ORIYA = 11;
    /** Tamil digits for values 1-9, Tamil has no digit for zero. */
    public static final int DIGIT_RANGE_TAMIL = 12;
    /** Telugu digits for values 0-9 */
    public static final int DIGIT_RANGE_TELUGU = 13;
    /** Kannada digits for values 0-9 */
    public static final int DIGIT_RANGE_KANNADA = 14;
    /** Malayam digits for values 0-9 */
    public static final int DIGIT_RANGE_MALAYAM = 15;
    /** Thai digits for values 0-9 */
    public static final int DIGIT_RANGE_THAI = 16;
    /** Lao digits for values 0-9 */
    public static final int DIGIT_RANGE_LAO = 17;
    /** Tibetan digits for values 0-9 */
    public static final int DIGIT_RANGE_TIBETAN = 18;
    /** Myanmar digits for values 0-9 */
    public static final int DIGIT_RANGE_MYANMAR = 19;
    /** Ethiopic digits for values 0-9 */
    public static final int DIGIT_RANGE_ETHIOPIC = 20;
    /** Khmer digits for values 0-9 */
    public static final int DIGIT_RANGE_KHMER = 21;
    /** Montolian digits for values 0-9 */
    public static final int DIGIT_RANGE_MONGOLIAN = 22;
    /** Han digits for values 0-9 */
    public static final int DIGIT_RANGE_HAN = 23;
    /** Han ("checkwriting") digits for values 0-9 */
    public static final int DIGIT_RANGE_HAN_CW = 24;
    private static final int DIGIT_RANGE_LIMIT = 25;
    /**
    * Special casing uppercase management
    * @param ch code point to convert
--- a/icu4j/src/com/ibm/text/UCharacter.java
+++ b/icu4j/src/com/ibm/text/UCharacter.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UCharacter.java,v $ 
-* $Date: 2001/08/22 22:38:30 $ 
+* $Date: 2001/10/05 18:42:33 $ 
-* $Revision: 1.11 $
+* $Revision: 1.12 $
 *
 *******************************************************************************
 */
@ -57,8 +57,11 @@ import java.util.Locale;
 *      does it include the Java-specific character information, such as 
 *      boolean isJavaIdentifierPart(char ch).
 * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric 
-*      values '10' - '35'. UCharacter does not treat the above code points 
+*      values '10' - '35'. UCharacter also does this in digit and
-*      as having numeric values
+*      getNumericValue, to adhere to the java semantics of these
 *      methods.  New methods unicodeDigit, and
 *      getUnicodeNumericValue do not treat the above code points 
 *      as having numeric values.  This is a semantic change from ICU4J 1.3.1.
 * <li> For consistency with ICU4C's data, control code points below have their 
 *      Unicode general category reset to the types below.
 *      <ul>
@ -349,19 +352,25 @@ public final class UCharacter
    // public methods ===================================================
    /**
-    * Retrieves the decimal numeric value of a digit code point.<br>
+    * Retrieves the numeric value of a decimal digit code point.
-    * A code point is a valid digit if the following is true: 
+    * <br>This method observes the semantics of
    * <code>java.lang.Character.digit()</code>.  Note that this
    * will return positive values for code points for which isDigit
    * returns false, just like java.lang.Character.
    * <br><em>Semantic Change:</em> In release 1.3.1 and
    * prior, this did not treat the European letters as having a
    * digit value, and also treated numeric letters and other numbers as digits.  
    * This has been changed to conform to the java semantics.
    * <br>A code point is a valid digit if and only if:
    * <ul>
-    * <li> The method isDigit(ch) is true and the Unicode decimal digit value of 
+    *   <li>ch is a decimal digit or one of the european letters, and
-    *      ch is less than the specified radix. 
+    *   <li>the value of ch is less than the specified radix.
    * </ul>
-    * Note this method, unlike java.lang.Character.digit() does not regard the 
+    * @param ch the code point to query
-    * ascii characters 'A' - 'Z' and 'a' - 'z' as digits. 
+    * @param radix the radix
-    * @param ch the code point whose numeric value is to be determined
+    * @return the numeric value represented by the code point in the
-    * @param radix the radix which the digit is to be converted to
+    * specified radix, or -1 if the code point is not a decimal digit
-    * @return the numeric value of the code point ch in the argument radix,
+    * or if its value is too large for the radix
    *         this method returns -1 if ch is not a valid digit code point or 
    *         if its digit value exceeds the radix.
    */
    public static int digit(int ch, int radix)
    {
@ -375,6 +384,7 @@ public final class UCharacter
                result = UCharacterPropertyDB.getSignedValue(props);
            }
        }
 /*
        else {
            // contained in exception data
            int index = UCharacterPropertyDB.getExceptionIndex(props);
@ -398,6 +408,11 @@ public final class UCharacter
        if (result < 0) {
            result = getHanDigit(ch);
        }
 */
        if (result < 0 && radix > 10) {
            result = getEuropeanDigit(ch);
        }
        if (result < 0 || result >= radix) {
            return -1;
@ -405,14 +420,39 @@ public final class UCharacter
        return result;
    }
    private static boolean isEuropeanDigit(int ch) {
        return (ch <= 0x7a && ((ch >= 0x41 && ch <= 0x5a) || ch >= 0x61)) ||
            (ch >= 0xff21 && (ch <= 0xff3a || (ch >= 0xff41 && ch <= 0xff5a)));
    }
    private static int getEuropeanDigit(int ch) {
        if (ch <= 0x7a) {
            if (ch >= 0x41 && ch <= 0x5a) {
                return ch + 10 - 0x41;
            } else if (ch >= 0x61) {
                return ch + 10 - 0x61;
            }
        } else if (ch >= 0xff21) {
            if (ch <= 0xff3a) {
                return ch + 10 - 0xff21;
            } else if (ch >= 0xff41 && ch <= 0xff5a) {
                return ch + 10 - 0xff41;
            }
        }
        return -1;
    }
    /**
-    * Retrieves the decimal numeric value of a digit code point in radix 10<br>
+    * Retrieves the numeric value of a decimal digit code point.
-    * Note this method, unlike java.lang.Character.digit() does not regard the 
+    * <br>This is a convenience overload of <code>digit(int, int)</code> 
-    * ascii characters 'A' - 'Z' and 'a' - 'z' as digits. 
+    * that provides a decimal radix.
-    * @param ch the code point whose numeric value is to be determined
+    * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
-    * @return the numeric value of the code point ch, this method returns -1 if 
+    * treated numeric letters and other numbers as digits.  This has
-    *         ch is not a valid digit code point
+    * been changed to conform to the java semantics.
-    */
+    * @param ch the code point to query
    * @return the numeric value represented by the code point,
    * or -1 if the code point is not a decimal digit or if its
    * value is too large for a decimal radix */
    public static int digit(int ch)
    {
        return digit(ch, DECIMAL_RADIX_);
@ -420,17 +460,43 @@ public final class UCharacter
   /**
    * Returns the Unicode numeric value of the code point as a nonnegative 
-    * integer. <br>
+    * integer.
-    * If the code point does not have a numeric value, then -1 is returned. <br>
+    * <br>If the code point does not have a numeric value, then -1 is returned. <br>
    * If the code point has a numeric value that cannot be represented as a 
    * nonnegative integer (for example, a fractional value), then -2 is returned.
-    * <br>
+    * <br><em>Semantic Change:</em> In release 1.3.1 and
-    * Note this method, unlike java.lang.Character.digit() does not regard the 
+    * prior, this returned -1 for ASCII letters and their
-    * ascii characters 'A' - 'Z' and 'a' - 'z' as numbers. 
+    * fullwidth counterparts.  This has been changed to
-    * @param ch Unicode code point
+    * conform to the java semantics.
-    * @return numeric value of the code point as a nonnegative integer
+    * @param ch the code point to query
    * @return the numeric value of the code point, or -1 if it has no numeric value,
    * or -2 if it has a numeric value that cannot be represented as a nonnegative 
    * integer
    */
    public static int getNumericValue(int ch)
    {
        return getNumericValueInternal(ch, true);
    }
   /**
    * Returns the Unicode numeric value of the code point as a nonnegative 
    * integer.
    * <br>If the code point does not have a numeric value, then -1 is returned. <br>
    * If the code point has a numeric value that cannot be represented as a 
    * nonnegative integer (for example, a fractional value), then -2 is returned.
    * This returns values other than -1 for all and only those code points whose
    * type is a numeric type.
    * @param ch the code point to query
    * @return the numeric value of the code point, or -1 if it has no numeric value,
    * or -2 if it has a numeric value that cannot be represented as a nonnegative 
    * integer
    */
    public static int getUnicodeNumericValue(int ch)
    {
        return getNumericValueInternal(ch, false);
    }
    private static int getNumericValueInternal(int ch, boolean useEuropean)
    {
        int props = getProps(ch);
        int type = UCharacterPropertyDB.getPropType(props);
@ -439,7 +505,8 @@ public final class UCharacter
        if (type != UCharacterCategory.DECIMAL_DIGIT_NUMBER &&
            type != UCharacterCategory.LETTER_NUMBER &&
            type != UCharacterCategory.OTHER_NUMBER) {
-            return -1;
+
            return useEuropean ? getEuropeanDigit(ch) : -1;
        }
        int result = -1;
@ -453,7 +520,8 @@ public final class UCharacter
            if (PROPERTY_DB_.hasExceptionValue(index, 
                                       UCharacterPropertyDB.EXC_DIGIT_VALUE_)) {
                result  = PROPERTY_DB_.getException(index, 
-                                        UCharacterPropertyDB.EXC_DIGIT_VALUE_); 
+                                        UCharacterPropertyDB.EXC_DIGIT_VALUE_) &
                    LAST_CHAR_MASK_; 
            }
            else {
                if (!PROPERTY_DB_.hasExceptionValue(index, 
@ -466,10 +534,6 @@ public final class UCharacter
            }
        }
        if (result < 0) {
          result = getHanDigit(ch);
        }
        if (result < 0) {
            return -2;
        }
@ -506,19 +570,18 @@ public final class UCharacter
    }
   /**
-    * Determines if a code point is a digit.<br>
+    * Determines if a code point is a Java digit.
-    * Note this method, unlike java.lang.Character.isDigit() does not regard the 
+    * <br>This method observes the semantics of
-    * ascii characters 'A' - 'Z' and 'a' - 'z' as digits.<br>
+    * <code>java.lang.Character.isDigit()</code>.  It returns true for
-    * @param ch code point to determine if it is a digit
+    * decimal digits only.
-    * @return true if this code point is a digit
+    * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
-    */
+    * treated numeric letters and other numbers as digits.  This has
    * been changed to conform to the java semantics.
    * @param ch code point to query
    * @return true if this code point is a digit */
    public static boolean isDigit(int ch)
    {
-        int cat = getType(ch);
+        return getType(ch) == UCharacterCategory.DECIMAL_DIGIT_NUMBER;
        // if props == 0, it will just fall through and return false
        return cat == UCharacterCategory.DECIMAL_DIGIT_NUMBER || 
            cat == UCharacterCategory.OTHER_NUMBER ||
            cat == UCharacterCategory.LETTER_NUMBER;
    }
    /**
@ -1527,38 +1590,220 @@ public final class UCharacter
    }
    /**
-    * Getting Han character digit values
+    * Return numeric value of Han code points.
-    * @param ch code point to test if it is a Han character
+    * <br> This returns the value of Han 'numeric' code points,
-    * @return Han digit value if ch is a Han digit character
+    * including those for zero, ten, hundred, thousand, ten thousand,
    * and hundred million.  Unicode does not consider these to be
    * numeric. This includes both the standard and 'checkwriting'
    * characters, the 'big circle' zero character, and the standard
    * zero character.
    * @param ch code point to query
    * @return value if it is a Han 'numeric character,' otherwise return -1.  
    */
-    private static int getHanDigit(int ch)
+    public static int getHanNumericValue(int ch)
    {
        switch(ch)
        {
        case IDEOGRAPHIC_NUMBER_ZERO_ :
        case CJK_IDEOGRAPH_COMPLEX_ZERO:
            return 0; // Han Zero
        case CJK_IDEOGRAPH_FIRST_ :
        case CJK_IDEOGRAPH_COMPLEX_ONE:
            return 1; // Han One
        case CJK_IDEOGRAPH_SECOND_ :
        case CJK_IDEOGRAPH_COMPLEX_TWO:
            return 2; // Han Two
        case CJK_IDEOGRAPH_THIRD_ :
        case CJK_IDEOGRAPH_COMPLEX_THREE:
            return 3; // Han Three
        case CJK_IDEOGRAPH_FOURTH_ :
        case CJK_IDEOGRAPH_COMPLEX_FOUR:
            return 4; // Han Four
        case CJK_IDEOGRAPH_FIFTH_ :
        case CJK_IDEOGRAPH_COMPLEX_FIVE:
            return 5; // Han Five
        case CJK_IDEOGRAPH_SIXTH_ :
        case CJK_IDEOGRAPH_COMPLEX_SIX:
            return 6; // Han Six
        case CJK_IDEOGRAPH_SEVENTH_ :
        case CJK_IDEOGRAPH_COMPLEX_SEVEN:
            return 7; // Han Seven
        case CJK_IDEOGRAPH_EIGHTH_ : 
        case CJK_IDEOGRAPH_COMPLEX_EIGHT:
            return 8; // Han Eight
        case CJK_IDEOGRAPH_NINETH_ :
        case CJK_IDEOGRAPH_COMPLEX_NINE:
            return 9; // Han Nine
        case CJK_IDEOGRAPH_TEN:
        case CJK_IDEOGRAPH_COMPLEX_TEN:
            return 10;
        case CJK_IDEOGRAPH_HUNDRED:
        case CJK_IDEOGRAPH_COMPLEX_HUNDRED:
            return 100;
        case CJK_IDEOGRAPH_THOUSAND:
        case CJK_IDEOGRAPH_COMPLEX_THOUSAND:
            return 1000;
        case CJK_IDEOGRAPH_TEN_THOUSAND:
            return 10000;
        case CJK_IDEOGRAPH_HUNDRED_MILLION:
            return 100000000;
        }
        return -1; // no value
    }
    /*
     * Return a decimal code point in the given range for the provided value.
     * The range is defined by a DIGIT_RANGE selector, see below.  Most ranges
     * only accept values between 0 and 9, some ranges (EUROPEAN_EX) accept
     * values between 0 and 35.
     * <br>
     * @param value a decimal value, from 0 to 9 for most standard ranges, and
     * from 0 to 35 for the EUROPEAN_EX ranges.
     * @param digitRange one of the DIGIT_RANGE selectors.
     * @returns the code point, or -1 if no valid code point exists for that decimal.
     */
    public int getCodePointForDigit(int digit, int digitRange) {
        if (digitRange < 0 || digitRange > DIGIT_RANGE_LIMIT) {
            throw new IllegalArgumentException("invalid digit range selector: " + digitRange);
        }
        if (digit < 0 || digit > ((digitRange < 1 || digitRange > 4) ? 9 : 35)) {
            return -1;
        }
        if (digit < 10) {
            if (digitRange < DIGIT_RANGE_HAN) {
                if (digit == 0 && digitRange == DIGIT_RANGE_TAMIL) {
                    return -1;
                }
                return bases[digitRange] + digit;
            } else if (digitRange == DIGIT_RANGE_HAN) {
                return hanmap[digit];
            } else {
                return exhanmap[digit];
            }
        } else {
            return exbases[digitRange] + digit;
        }
    }
    private static int[] bases = {
        0x0030, 0x0030, 0x0030, 0x0030, 0x0030, 
        0x0660, 0x06f0, 0x0966, 0x09e6, 0x0a66,
        0x0ae6, 0x0b66, 0x0be6, 0x0c66, 0x0ce6,
        0x0d66, 0x0e50, 0x0ed0, 0x0f20, 0x1040,
        0x1369, 0x17e0, 0x1810
    };
    private static int[] exbases = {
        0x0000, 0x0040, 0x0060, 0xff21, 0xff41
    };
    /* uses 'big circle' ling, includes shi, bai, qian, wan, yi */
    private static int[] hanmap = {
        0x3007, 0x4e00, 0x48ec, 0x4e09, 0x56d8, 0x4e94, 0x516d, 0x4e03, 0x516b, 0x4e5d
    };
    /* uses lingsuide ling, includes shi, bai, qian, wan, yi */
    private static int[] exhanmap = {
        0x96f6, 0x58f9, 0x8cb3, 0x53c3, 0x8086, 0x4f0d, 0x9678, 0x67d2, 0x634c, 0x7396
    };
    private static final int CJK_IDEOGRAPH_COMPLEX_ZERO = 0x96f6;
    private static final int CJK_IDEOGRAPH_COMPLEX_ONE  = 0x58f9;
    private static final int CJK_IDEOGRAPH_COMPLEX_TWO = 0x8cb3;
    private static final int CJK_IDEOGRAPH_COMPLEX_THREE = 0x53c3;
    private static final int CJK_IDEOGRAPH_COMPLEX_FOUR = 0x8086;
    private static final int CJK_IDEOGRAPH_COMPLEX_FIVE = 0x4f0d;
    private static final int CJK_IDEOGRAPH_COMPLEX_SIX = 0x9678;
    private static final int CJK_IDEOGRAPH_COMPLEX_SEVEN = 0x67d2;
    private static final int CJK_IDEOGRAPH_COMPLEX_EIGHT = 0x634c;
    private static final int CJK_IDEOGRAPH_COMPLEX_NINE = 0x7396;
    private static final int CJK_IDEOGRAPH_TEN = 0x5341;
    private static final int CJK_IDEOGRAPH_COMPLEX_TEN = 0x62fe;
    private static final int CJK_IDEOGRAPH_HUNDRED = 0x767e;
    private static final int CJK_IDEOGRAPH_COMPLEX_HUNDRED = 0x4f70;
    private static final int CJK_IDEOGRAPH_THOUSAND = 0x5343;
    private static final int CJK_IDEOGRAPH_COMPLEX_THOUSAND = 0x4edf;
    private static final int CJK_IDEOGRAPH_TEN_THOUSAND = 0x824c;
    private static final int CJK_IDEOGRAPH_HUNDRED_MILLION = 0x5104;
    /** European (ASCII) digits for values 0-9 */
    public static final int DIGIT_RANGE_EUROPEAN = 0;
    /** European (ASCII) digits for values 0-9 and upper case letters for values 10-35 */
    public static final int DIGIT_RANGE_EUROPEAN_EX_UC = 1;
    /** European (ASCII) digits for values 0-9 and lower case letters for values 10-35 */
    public static final int DIGIT_RANGE_EUROPEAN_EX_LC = 2;
    /** European (FullWidth) digits for values 0-9 and fullwidth upper case letters for values 10-35 */
    public static final int DIGIT_RANGE_EUROPEAN_EX_FW_UC = 3;
    /** European (FullWidth) digits for values 0-9 and fullwidth lower case letters for values 10-35 */
    public static final int DIGIT_RANGE_EUROPEAN_EX_FW_LC = 4;
    /** Arabic digits for values 0-9 */
    public static final int DIGIT_RANGE_ARABIC = 5;
    /** Eastern Arabic (Persian) digits for values 0-9 */
    public static final int DIGIT_RANGE_EASTERN_ARABIC = 6;
    /** Devanagari digits for values 0-9 */
    public static final int DIGIT_RANGE_DEVANAGARI = 7;
    /** Bengali digits for values 0-9 */
    public static final int DIGIT_RANGE_BENGALI = 8;
    /** Gurmukhi digits for values 0-9 */
    public static final int DIGIT_RANGE_GURMUKHI = 9;
    /** Gurjarati digits for values 0-9 */
    public static final int DIGIT_RANGE_GUJARATI = 10;
    /** Oriya digits for values 0-9 */
    public static final int DIGIT_RANGE_ORIYA = 11;
    /** Tamil digits for values 1-9, Tamil has no digit for zero. */
    public static final int DIGIT_RANGE_TAMIL = 12;
    /** Telugu digits for values 0-9 */
    public static final int DIGIT_RANGE_TELUGU = 13;
    /** Kannada digits for values 0-9 */
    public static final int DIGIT_RANGE_KANNADA = 14;
    /** Malayam digits for values 0-9 */
    public static final int DIGIT_RANGE_MALAYAM = 15;
    /** Thai digits for values 0-9 */
    public static final int DIGIT_RANGE_THAI = 16;
    /** Lao digits for values 0-9 */
    public static final int DIGIT_RANGE_LAO = 17;
    /** Tibetan digits for values 0-9 */
    public static final int DIGIT_RANGE_TIBETAN = 18;
    /** Myanmar digits for values 0-9 */
    public static final int DIGIT_RANGE_MYANMAR = 19;
    /** Ethiopic digits for values 0-9 */
    public static final int DIGIT_RANGE_ETHIOPIC = 20;
    /** Khmer digits for values 0-9 */
    public static final int DIGIT_RANGE_KHMER = 21;
    /** Montolian digits for values 0-9 */
    public static final int DIGIT_RANGE_MONGOLIAN = 22;
    /** Han digits for values 0-9 */
    public static final int DIGIT_RANGE_HAN = 23;
    /** Han ("checkwriting") digits for values 0-9 */
    public static final int DIGIT_RANGE_HAN_CW = 24;
    private static final int DIGIT_RANGE_LIMIT = 25;
    /**
    * Special casing uppercase management
    * @param ch code point to convert