ICU-76 ucharacter semantic changes

jitterbug 77: add additional API for numeric values jitterbug 78: UCharacter.getNumericValue superscript bugs X-SVN-Rev: 6085
2001-10-05 18:42:33 +00:00 · 2001-10-05 18:42:33 +00:00 · 21b8e60b11
commit 21b8e60b11
parent 5746e4c2fc
2 changed files with 600 additions and 110 deletions
--- a/icu4j/src/com/ibm/icu/lang/UCharacter.java
+++ b/icu4j/src/com/ibm/icu/lang/UCharacter.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/lang/UCharacter.java,v $ 
-* $Date: 2001/08/22 22:38:30 $ 
-* $Revision: 1.11 $
+* $Date: 2001/10/05 18:42:33 $ 
+* $Revision: 1.12 $
 *
 *******************************************************************************
 */
@ -57,8 +57,11 @@ import java.util.Locale;
 *      does it include the Java-specific character information, such as 
 *      boolean isJavaIdentifierPart(char ch).
 * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric 
-*      values '10' - '35'. UCharacter does not treat the above code points 
-*      as having numeric values
+*      values '10' - '35'. UCharacter also does this in digit and
+*      getNumericValue, to adhere to the java semantics of these
+*      methods.  The new methods unicodeDigit and
+*      getUnicodeNumericValue do not treat the above code points 
+*      as having numeric values.  This is a semantic change from ICU4J 1.3.1.
 * <li> For consistency with ICU4C's data, control code points below have their 
 *      Unicode general category reset to the types below.
 *      <ul>
@ -349,19 +352,25 @@ public final class UCharacter
    // public methods ===================================================
      
    /**
-    * Retrieves the decimal numeric value of a digit code point.<br>
-    * A code point is a valid digit if the following is true: 
+    * Retrieves the numeric value of a decimal digit code point.
+    * <br>This method observes the semantics of
+    * <code>java.lang.Character.digit()</code>.  Note that this
+    * will return positive values for code points for which isDigit
+    * returns false, just like java.lang.Character.
+    * <br><em>Semantic Change:</em> In release 1.3.1 and
+    * prior, this did not treat the European letters as having a
+    * digit value, and also treated numeric letters and other numbers as digits.  
+    * This has been changed to conform to the java semantics.
+    * <br>A code point is a valid digit if and only if:
    * <ul>
-    * <li> The method isDigit(ch) is true and the Unicode decimal digit value of 
-    *      ch is less than the specified radix. 
+    *   <li>ch is a decimal digit or one of the european letters, and
+    *   <li>the value of ch is less than the specified radix.
    * </ul>
-    * Note this method, unlike java.lang.Character.digit() does not regard the 
-    * ascii characters 'A' - 'Z' and 'a' - 'z' as digits. 
-    * @param ch the code point whose numeric value is to be determined
-    * @param radix the radix which the digit is to be converted to
-    * @return the numeric value of the code point ch in the argument radix,
-    *         this method returns -1 if ch is not a valid digit code point or 
-    *         if its digit value exceeds the radix.
+    * @param ch the code point to query
+    * @param radix the radix
+    * @return the numeric value represented by the code point in the
+    * specified radix, or -1 if the code point is not a decimal digit
+    * or if its value is too large for the radix
    */
    public static int digit(int ch, int radix)
    {
@ -375,6 +384,7 @@ public final class UCharacter
                result = UCharacterPropertyDB.getSignedValue(props);
            }
        }
+ /*
        else {
            // contained in exception data
            int index = UCharacterPropertyDB.getExceptionIndex(props);
@ -398,6 +408,11 @@ public final class UCharacter
        if (result < 0) {
            result = getHanDigit(ch);
        }
+ */
+
+        if (result < 0 && radix > 10) {
+            result = getEuropeanDigit(ch);
+        }
        
        if (result < 0 || result >= radix) {
            return -1;
@ -405,14 +420,39 @@ public final class UCharacter
        return result;
    }
    
+    /**
+     * Tests whether a code point is one of the European letters that carry
+     * a digit value: ASCII 'A'-'Z' and 'a'-'z', or their fullwidth
+     * counterparts U+FF21-U+FF3A and U+FF41-U+FF5A.
+     * @param ch the code point to query
+     * @return true if ch lies in one of the four letter ranges
+     */
+    private static boolean isEuropeanDigit(int ch) {
+        if (ch >= 0x41 && ch <= 0x5a) {
+            return true;    // ASCII upper case
+        }
+        if (ch >= 0x61 && ch <= 0x7a) {
+            return true;    // ASCII lower case
+        }
+        if (ch >= 0xff21 && ch <= 0xff3a) {
+            return true;    // fullwidth upper case
+        }
+        return ch >= 0xff41 && ch <= 0xff5a;    // fullwidth lower case
+    }
+
+    /**
+     * Maps a European letter to the digit value 10-35.
+     * <br>ASCII 'A'-'Z' and 'a'-'z' and their fullwidth counterparts
+     * (U+FF21-U+FF3A, U+FF41-U+FF5A) map to 10 through 35 in alphabetical
+     * order; case is ignored.
+     * @param ch the code point to query
+     * @return the value from 10 to 35, or -1 if ch is not such a letter
+     */
+    private static int getEuropeanDigit(int ch) {
+        int first;  // code point of the letter whose value is 10
+        if (ch >= 0x41 && ch <= 0x5a) {
+            first = 0x41;
+        } else if (ch >= 0x61 && ch <= 0x7a) {
+            first = 0x61;
+        } else if (ch >= 0xff21 && ch <= 0xff3a) {
+            first = 0xff21;
+        } else if (ch >= 0xff41 && ch <= 0xff5a) {
+            first = 0xff41;
+        } else {
+            return -1;
+        }
+        return ch + 10 - first;
+    }
+
    /**
-    * Retrieves the decimal numeric value of a digit code point in radix 10<br>
-    * Note this method, unlike java.lang.Character.digit() does not regard the 
-    * ascii characters 'A' - 'Z' and 'a' - 'z' as digits. 
-    * @param ch the code point whose numeric value is to be determined
-    * @return the numeric value of the code point ch, this method returns -1 if 
-    *         ch is not a valid digit code point
-    */
+    * Retrieves the numeric value of a decimal digit code point.
+    * <br>This is a convenience overload of <code>digit(int, int)</code> 
+    * that provides a decimal radix.
+    * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
+    * treated numeric letters and other numbers as digits.  This has
+    * been changed to conform to the java semantics.
+    * @param ch the code point to query
+    * @return the numeric value represented by the code point,
+    * or -1 if the code point is not a decimal digit or if its
+    * value is too large for a decimal radix */
    public static int digit(int ch)
    {
        return digit(ch, DECIMAL_RADIX_);
@ -420,17 +460,43 @@ public final class UCharacter

   /**
    * Returns the Unicode numeric value of the code point as a nonnegative 
-    * integer. <br>
-    * If the code point does not have a numeric value, then -1 is returned. <br>
+    * integer.
+    * <br>If the code point does not have a numeric value, then -1 is returned. <br>
    * If the code point has a numeric value that cannot be represented as a 
    * nonnegative integer (for example, a fractional value), then -2 is returned.
-    * <br>
-    * Note this method, unlike java.lang.Character.digit() does not regard the 
-    * ascii characters 'A' - 'Z' and 'a' - 'z' as numbers. 
-    * @param ch Unicode code point
-    * @return numeric value of the code point as a nonnegative integer
+    * <br><em>Semantic Change:</em> In release 1.3.1 and
+    * prior, this returned -1 for ASCII letters and their
+    * fullwidth counterparts.  This has been changed to
+    * conform to the java semantics.
+    * @param ch the code point to query
+    * @return the numeric value of the code point, or -1 if it has no numeric value,
+    * or -2 if it has a numeric value that cannot be represented as a nonnegative 
+    * integer
    */
    public static int getNumericValue(int ch)
+    {
+        return getNumericValueInternal(ch, true);
+    }
+
+   /**
+    * Returns the Unicode numeric value of a code point as a nonnegative
+    * integer, without the Java-style letter values.
+    * <br>Code points whose type is not a numeric type always yield -1;
+    * in particular the European letters 'A' - 'Z' and 'a' - 'z' are not
+    * given the values 10 - 35 here.
+    * <br>A code point with a numeric value that cannot be represented as a
+    * nonnegative integer (for example, a fractional value) yields -2.
+    * @param ch the code point to query
+    * @return the numeric value of the code point, -1 if it has no numeric
+    * value, or -2 if its numeric value cannot be represented as a
+    * nonnegative integer
+    */
+    public static int getUnicodeNumericValue(int ch)
+    {
+        // no fallback to the European letter values for non-numeric types
+        final boolean useEuropean = false;
+        return getNumericValueInternal(ch, useEuropean);
+    }
+
+    private static int getNumericValueInternal(int ch, boolean useEuropean)
    {
        int props = getProps(ch);
        int type = UCharacterPropertyDB.getPropType(props);
@ -439,7 +505,8 @@ public final class UCharacter
        if (type != UCharacterCategory.DECIMAL_DIGIT_NUMBER &&
            type != UCharacterCategory.LETTER_NUMBER &&
            type != UCharacterCategory.OTHER_NUMBER) {
-            return -1;
+
+            return useEuropean ? getEuropeanDigit(ch) : -1;
        }
          
        int result = -1;
@ -453,7 +520,8 @@ public final class UCharacter
            if (PROPERTY_DB_.hasExceptionValue(index, 
                                       UCharacterPropertyDB.EXC_DIGIT_VALUE_)) {
                result  = PROPERTY_DB_.getException(index, 
-                                        UCharacterPropertyDB.EXC_DIGIT_VALUE_); 
+                                        UCharacterPropertyDB.EXC_DIGIT_VALUE_) &
+                    LAST_CHAR_MASK_; 
            }
            else {
                if (!PROPERTY_DB_.hasExceptionValue(index, 
@ -466,10 +534,6 @@ public final class UCharacter
            }
        }
        
-        if (result < 0) {
-          result = getHanDigit(ch);
-        }
-        
        if (result < 0) {
            return -2;
        }
@ -506,19 +570,18 @@ public final class UCharacter
    }
                                    
   /**
-    * Determines if a code point is a digit.<br>
-    * Note this method, unlike java.lang.Character.isDigit() does not regard the 
-    * ascii characters 'A' - 'Z' and 'a' - 'z' as digits.<br>
-    * @param ch code point to determine if it is a digit
-    * @return true if this code point is a digit
-    */
+    * Determines if a code point is a Java digit.
+    * <br>This method observes the semantics of
+    * <code>java.lang.Character.isDigit()</code>.  It returns true for
+    * decimal digits only.
+    * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
+    * treated numeric letters and other numbers as digits.  This has
+    * been changed to conform to the java semantics.
+    * @param ch code point to query
+    * @return true if this code point is a digit */
    public static boolean isDigit(int ch)
    {
-        int cat = getType(ch);
-        // if props == 0, it will just fall through and return false
-        return cat == UCharacterCategory.DECIMAL_DIGIT_NUMBER || 
-            cat == UCharacterCategory.OTHER_NUMBER ||
-            cat == UCharacterCategory.LETTER_NUMBER;
+        return getType(ch) == UCharacterCategory.DECIMAL_DIGIT_NUMBER;
    }

    /**
@ -1527,38 +1590,220 @@ public final class UCharacter
    }
      
    /**
-    * Getting Han character digit values
-    * @param ch code point to test if it is a Han character
-    * @return Han digit value if ch is a Han digit character
+    * Return numeric value of Han code points.
+    * <br> This returns the value of Han 'numeric' code points,
+    * including those for zero, ten, hundred, thousand, ten thousand,
+    * and hundred million.  Unicode does not consider these to be
+    * numeric. This includes both the standard and 'checkwriting'
+    * characters, the 'big circle' zero character, and the standard
+    * zero character.
+    * @param ch code point to query
+    * @return value if it is a Han 'numeric character,' otherwise return -1.  
    */
-    private static int getHanDigit(int ch)
+    public static int getHanNumericValue(int ch)
    {
        switch(ch)
        {
        case IDEOGRAPHIC_NUMBER_ZERO_ :
+        case CJK_IDEOGRAPH_COMPLEX_ZERO:
            return 0; // Han Zero
        case CJK_IDEOGRAPH_FIRST_ :
+        case CJK_IDEOGRAPH_COMPLEX_ONE:
            return 1; // Han One
        case CJK_IDEOGRAPH_SECOND_ :
+        case CJK_IDEOGRAPH_COMPLEX_TWO:
            return 2; // Han Two
        case CJK_IDEOGRAPH_THIRD_ :
+        case CJK_IDEOGRAPH_COMPLEX_THREE:
            return 3; // Han Three
        case CJK_IDEOGRAPH_FOURTH_ :
+        case CJK_IDEOGRAPH_COMPLEX_FOUR:
            return 4; // Han Four
        case CJK_IDEOGRAPH_FIFTH_ :
+        case CJK_IDEOGRAPH_COMPLEX_FIVE:
            return 5; // Han Five
        case CJK_IDEOGRAPH_SIXTH_ :
+        case CJK_IDEOGRAPH_COMPLEX_SIX:
            return 6; // Han Six
        case CJK_IDEOGRAPH_SEVENTH_ :
+        case CJK_IDEOGRAPH_COMPLEX_SEVEN:
            return 7; // Han Seven
        case CJK_IDEOGRAPH_EIGHTH_ : 
+        case CJK_IDEOGRAPH_COMPLEX_EIGHT:
            return 8; // Han Eight
        case CJK_IDEOGRAPH_NINETH_ :
+        case CJK_IDEOGRAPH_COMPLEX_NINE:
            return 9; // Han Nine
+        case CJK_IDEOGRAPH_TEN:
+        case CJK_IDEOGRAPH_COMPLEX_TEN:
+            return 10;
+        case CJK_IDEOGRAPH_HUNDRED:
+        case CJK_IDEOGRAPH_COMPLEX_HUNDRED:
+            return 100;
+        case CJK_IDEOGRAPH_THOUSAND:
+        case CJK_IDEOGRAPH_COMPLEX_THOUSAND:
+            return 1000;
+        case CJK_IDEOGRAPH_TEN_THOUSAND:
+            return 10000;
+        case CJK_IDEOGRAPH_HUNDRED_MILLION:
+            return 100000000;
        }
        return -1; // no value
    }

+    /**
+     * Returns the code point that represents a digit value in a digit range.
+     * <br>The range is chosen with one of the DIGIT_RANGE selectors.  Most
+     * ranges only have code points for the values 0 to 9; the EUROPEAN_EX
+     * ranges additionally use letters for the values 10 to 35.  Tamil and
+     * Ethiopic have no digit for zero, so the value 0 has no code point in
+     * those two ranges.
+     * @param digit the value to represent, from 0 to 9 for most ranges and
+     * from 0 to 35 for the EUROPEAN_EX ranges
+     * @param digitRange one of the DIGIT_RANGE selectors
+     * @return the code point, or -1 if the range has no code point for the
+     * value
+     * @throws IllegalArgumentException if digitRange is not a DIGIT_RANGE
+     * selector
+     */
+    public static int getCodePointForDigit(int digit, int digitRange) {
+        // DIGIT_RANGE_LIMIT is one past the last valid selector
+        if (digitRange < 0 || digitRange >= DIGIT_RANGE_LIMIT) {
+            throw new IllegalArgumentException("invalid digit range selector: " + digitRange);
+        }
+
+        // only the EUROPEAN_EX ranges have letters for the values 10 to 35
+        boolean hasLetters = digitRange >= DIGIT_RANGE_EUROPEAN_EX_UC
+            && digitRange <= DIGIT_RANGE_EUROPEAN_EX_FW_LC;
+        if (digit < 0 || digit > (hasLetters ? 35 : 9)) {
+            return -1;
+        }
+
+        if (digit >= 10) {
+            // exbases holds the letter for the value 10 ('A', 'a', ...)
+            return exbases[digitRange] + (digit - 10);
+        }
+        if (digitRange == DIGIT_RANGE_HAN) {
+            return hanmap[digit];
+        }
+        if (digitRange == DIGIT_RANGE_HAN_CW) {
+            return exhanmap[digit];
+        }
+        if (digit == 0 && (digitRange == DIGIT_RANGE_TAMIL
+                           || digitRange == DIGIT_RANGE_ETHIOPIC)) {
+            return -1;  // these scripts have no digit zero
+        }
+        return bases[digitRange] + digit;
+    }
+
+    /*
+     * Code point of the digit zero for each contiguous digit range, indexed
+     * by DIGIT_RANGE selector (EUROPEAN through MONGOLIAN).  For Tamil and
+     * Ethiopic the entry is the position one below the digit one, since
+     * those scripts have no digit zero.
+     */
+    private static final int[] bases = {
+        0x0030, 0x0030, 0x0030, 0xff10, 0xff10, 
+        0x0660, 0x06f0, 0x0966, 0x09e6, 0x0a66,
+        0x0ae6, 0x0b66, 0x0be6, 0x0c66, 0x0ce6,
+        0x0d66, 0x0e50, 0x0ed0, 0x0f20, 0x1040,
+        0x1368, 0x17e0, 0x1810
+    };
+
+    /*
+     * Code point of the letter for the value 10 in each EUROPEAN_EX range,
+     * indexed by DIGIT_RANGE selector.  Entry 0 (EUROPEAN) is unused.
+     */
+    private static final int[] exbases = {
+        0x0000, 0x0041, 0x0061, 0xff21, 0xff41
+    };
+
+    /* standard Han digits 0-9, uses 'big circle' ling for zero */
+    private static final int[] hanmap = {
+        0x3007, 0x4e00, 0x4e8c, 0x4e09, 0x56db, 0x4e94, 0x516d, 0x4e03, 0x516b, 0x4e5d
+    };
+
+    /* 'checkwriting' Han digits 0-9, uses the standard ling for zero */
+    private static final int[] exhanmap = {
+        0x96f6, 0x58f9, 0x8cb3, 0x53c3, 0x8086, 0x4f0d, 0x9678, 0x67d2, 0x634c, 0x7396
+    };
+
+    // Han 'checkwriting' (complex) digits and the Han numerals for ten and
+    // above, as matched by getHanNumericValue.
+    private static final int CJK_IDEOGRAPH_COMPLEX_ZERO = 0x96f6;
+    private static final int CJK_IDEOGRAPH_COMPLEX_ONE  = 0x58f9;
+    private static final int CJK_IDEOGRAPH_COMPLEX_TWO = 0x8cb3;
+    private static final int CJK_IDEOGRAPH_COMPLEX_THREE = 0x53c3;
+    private static final int CJK_IDEOGRAPH_COMPLEX_FOUR = 0x8086;
+    private static final int CJK_IDEOGRAPH_COMPLEX_FIVE = 0x4f0d;
+    private static final int CJK_IDEOGRAPH_COMPLEX_SIX = 0x9678;
+    private static final int CJK_IDEOGRAPH_COMPLEX_SEVEN = 0x67d2;
+    private static final int CJK_IDEOGRAPH_COMPLEX_EIGHT = 0x634c;
+    private static final int CJK_IDEOGRAPH_COMPLEX_NINE = 0x7396;
+    private static final int CJK_IDEOGRAPH_TEN = 0x5341;
+    private static final int CJK_IDEOGRAPH_COMPLEX_TEN = 0x62fe;
+    private static final int CJK_IDEOGRAPH_HUNDRED = 0x767e;
+    private static final int CJK_IDEOGRAPH_COMPLEX_HUNDRED = 0x4f70;
+    private static final int CJK_IDEOGRAPH_THOUSAND = 0x5343;
+    private static final int CJK_IDEOGRAPH_COMPLEX_THOUSAND = 0x4edf;
+    // U+842C; the earlier value 0x824c had two hex digits transposed
+    private static final int CJK_IDEOGRAPH_TEN_THOUSAND = 0x842c;
+    private static final int CJK_IDEOGRAPH_HUNDRED_MILLION = 0x5104;
+
+    /** European (ASCII) digits for values 0-9 */
+    public static final int DIGIT_RANGE_EUROPEAN = 0;
+
+    /** European (ASCII) digits for values 0-9 and upper case letters for values 10-35 */
+    public static final int DIGIT_RANGE_EUROPEAN_EX_UC = 1;
+
+    /** European (ASCII) digits for values 0-9 and lower case letters for values 10-35 */
+    public static final int DIGIT_RANGE_EUROPEAN_EX_LC = 2;
+
+    /** European (FullWidth) digits for values 0-9 and fullwidth upper case letters for values 10-35 */
+    public static final int DIGIT_RANGE_EUROPEAN_EX_FW_UC = 3;
+
+    /** European (FullWidth) digits for values 0-9 and fullwidth lower case letters for values 10-35 */
+    public static final int DIGIT_RANGE_EUROPEAN_EX_FW_LC = 4;
+
+    /** Arabic digits for values 0-9 */
+    public static final int DIGIT_RANGE_ARABIC = 5;
+
+    /** Eastern Arabic (Persian) digits for values 0-9 */
+    public static final int DIGIT_RANGE_EASTERN_ARABIC = 6;
+
+    /** Devanagari digits for values 0-9 */
+    public static final int DIGIT_RANGE_DEVANAGARI = 7;
+
+    /** Bengali digits for values 0-9 */
+    public static final int DIGIT_RANGE_BENGALI = 8;
+
+    /** Gurmukhi digits for values 0-9 */
+    public static final int DIGIT_RANGE_GURMUKHI = 9;
+
+    /** Gujarati digits for values 0-9 */
+    public static final int DIGIT_RANGE_GUJARATI = 10;
+
+    /** Oriya digits for values 0-9 */
+    public static final int DIGIT_RANGE_ORIYA = 11;
+
+    /** Tamil digits for values 1-9, Tamil has no digit for zero. */
+    public static final int DIGIT_RANGE_TAMIL = 12;
+
+    /** Telugu digits for values 0-9 */
+    public static final int DIGIT_RANGE_TELUGU = 13;
+
+    /** Kannada digits for values 0-9 */
+    public static final int DIGIT_RANGE_KANNADA = 14;
+
+    /** Malayalam digits for values 0-9 */
+    public static final int DIGIT_RANGE_MALAYAM = 15;
+
+    /** Thai digits for values 0-9 */
+    public static final int DIGIT_RANGE_THAI = 16;
+
+    /** Lao digits for values 0-9 */
+    public static final int DIGIT_RANGE_LAO = 17;
+
+    /** Tibetan digits for values 0-9 */
+    public static final int DIGIT_RANGE_TIBETAN = 18;
+
+    /** Myanmar digits for values 0-9 */
+    public static final int DIGIT_RANGE_MYANMAR = 19;
+
+    /** Ethiopic digits for values 1-9, the Ethiopic script has no digit for zero. */
+    public static final int DIGIT_RANGE_ETHIOPIC = 20;
+
+    /** Khmer digits for values 0-9 */
+    public static final int DIGIT_RANGE_KHMER = 21;
+
+    /** Mongolian digits for values 0-9 */
+    public static final int DIGIT_RANGE_MONGOLIAN = 22;
+
+    /** Han digits for values 0-9 */
+    public static final int DIGIT_RANGE_HAN = 23;
+
+    /** Han ("checkwriting") digits for values 0-9 */
+    public static final int DIGIT_RANGE_HAN_CW = 24;
+
+    // one past the last valid DIGIT_RANGE selector
+    private static final int DIGIT_RANGE_LIMIT = 25;
+      
    /**
    * Special casing uppercase management
    * @param ch code point to convert
--- a/icu4j/src/com/ibm/text/UCharacter.java
+++ b/icu4j/src/com/ibm/text/UCharacter.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UCharacter.java,v $ 
-* $Date: 2001/08/22 22:38:30 $ 
-* $Revision: 1.11 $
+* $Date: 2001/10/05 18:42:33 $ 
+* $Revision: 1.12 $
 *
 *******************************************************************************
 */
@ -57,8 +57,11 @@ import java.util.Locale;
 *      does it include the Java-specific character information, such as 
 *      boolean isJavaIdentifierPart(char ch).
 * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric 
-*      values '10' - '35'. UCharacter does not treat the above code points 
-*      as having numeric values
+*      values '10' - '35'. UCharacter also does this in digit and
+*      getNumericValue, to adhere to the java semantics of these
+*      methods.  The new methods unicodeDigit and
+*      getUnicodeNumericValue do not treat the above code points 
+*      as having numeric values.  This is a semantic change from ICU4J 1.3.1.
 * <li> For consistency with ICU4C's data, control code points below have their 
 *      Unicode general category reset to the types below.
 *      <ul>
@ -349,19 +352,25 @@ public final class UCharacter
    // public methods ===================================================
      
    /**
-    * Retrieves the decimal numeric value of a digit code point.<br>
-    * A code point is a valid digit if the following is true: 
+    * Retrieves the numeric value of a decimal digit code point.
+    * <br>This method observes the semantics of
+    * <code>java.lang.Character.digit()</code>.  Note that this
+    * will return positive values for code points for which isDigit
+    * returns false, just like java.lang.Character.
+    * <br><em>Semantic Change:</em> In release 1.3.1 and
+    * prior, this did not treat the European letters as having a
+    * digit value, and also treated numeric letters and other numbers as digits.  
+    * This has been changed to conform to the java semantics.
+    * <br>A code point is a valid digit if and only if:
    * <ul>
-    * <li> The method isDigit(ch) is true and the Unicode decimal digit value of 
-    *      ch is less than the specified radix. 
+    *   <li>ch is a decimal digit or one of the european letters, and
+    *   <li>the value of ch is less than the specified radix.
    * </ul>
-    * Note this method, unlike java.lang.Character.digit() does not regard the 
-    * ascii characters 'A' - 'Z' and 'a' - 'z' as digits. 
-    * @param ch the code point whose numeric value is to be determined
-    * @param radix the radix which the digit is to be converted to
-    * @return the numeric value of the code point ch in the argument radix,
-    *         this method returns -1 if ch is not a valid digit code point or 
-    *         if its digit value exceeds the radix.
+    * @param ch the code point to query
+    * @param radix the radix
+    * @return the numeric value represented by the code point in the
+    * specified radix, or -1 if the code point is not a decimal digit
+    * or if its value is too large for the radix
    */
    public static int digit(int ch, int radix)
    {
@ -375,6 +384,7 @@ public final class UCharacter
                result = UCharacterPropertyDB.getSignedValue(props);
            }
        }
+ /*
        else {
            // contained in exception data
            int index = UCharacterPropertyDB.getExceptionIndex(props);
@ -398,6 +408,11 @@ public final class UCharacter
        if (result < 0) {
            result = getHanDigit(ch);
        }
+ */
+
+        if (result < 0 && radix > 10) {
+            result = getEuropeanDigit(ch);
+        }
        
        if (result < 0 || result >= radix) {
            return -1;
@ -405,14 +420,39 @@ public final class UCharacter
        return result;
    }
    
+    /**
+     * Tests whether a code point is one of the European letters that carry
+     * a digit value: ASCII 'A'-'Z' and 'a'-'z', or their fullwidth
+     * counterparts U+FF21-U+FF3A and U+FF41-U+FF5A.
+     * @param ch the code point to query
+     * @return true if ch lies in one of the four letter ranges
+     */
+    private static boolean isEuropeanDigit(int ch) {
+        if (ch >= 0x41 && ch <= 0x5a) {
+            return true;    // ASCII upper case
+        }
+        if (ch >= 0x61 && ch <= 0x7a) {
+            return true;    // ASCII lower case
+        }
+        if (ch >= 0xff21 && ch <= 0xff3a) {
+            return true;    // fullwidth upper case
+        }
+        return ch >= 0xff41 && ch <= 0xff5a;    // fullwidth lower case
+    }
+
+    /**
+     * Maps a European letter to the digit value 10-35.
+     * <br>ASCII 'A'-'Z' and 'a'-'z' and their fullwidth counterparts
+     * (U+FF21-U+FF3A, U+FF41-U+FF5A) map to 10 through 35 in alphabetical
+     * order; case is ignored.
+     * @param ch the code point to query
+     * @return the value from 10 to 35, or -1 if ch is not such a letter
+     */
+    private static int getEuropeanDigit(int ch) {
+        int first;  // code point of the letter whose value is 10
+        if (ch >= 0x41 && ch <= 0x5a) {
+            first = 0x41;
+        } else if (ch >= 0x61 && ch <= 0x7a) {
+            first = 0x61;
+        } else if (ch >= 0xff21 && ch <= 0xff3a) {
+            first = 0xff21;
+        } else if (ch >= 0xff41 && ch <= 0xff5a) {
+            first = 0xff41;
+        } else {
+            return -1;
+        }
+        return ch + 10 - first;
+    }
+
    /**
-    * Retrieves the decimal numeric value of a digit code point in radix 10<br>
-    * Note this method, unlike java.lang.Character.digit() does not regard the 
-    * ascii characters 'A' - 'Z' and 'a' - 'z' as digits. 
-    * @param ch the code point whose numeric value is to be determined
-    * @return the numeric value of the code point ch, this method returns -1 if 
-    *         ch is not a valid digit code point
-    */
+    * Retrieves the numeric value of a decimal digit code point.
+    * <br>This is a convenience overload of <code>digit(int, int)</code> 
+    * that provides a decimal radix.
+    * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
+    * treated numeric letters and other numbers as digits.  This has
+    * been changed to conform to the java semantics.
+    * @param ch the code point to query
+    * @return the numeric value represented by the code point,
+    * or -1 if the code point is not a decimal digit or if its
+    * value is too large for a decimal radix */
    public static int digit(int ch)
    {
        return digit(ch, DECIMAL_RADIX_);
@ -420,17 +460,43 @@ public final class UCharacter

   /**
    * Returns the Unicode numeric value of the code point as a nonnegative 
-    * integer. <br>
-    * If the code point does not have a numeric value, then -1 is returned. <br>
+    * integer.
+    * <br>If the code point does not have a numeric value, then -1 is returned. <br>
    * If the code point has a numeric value that cannot be represented as a 
    * nonnegative integer (for example, a fractional value), then -2 is returned.
-    * <br>
-    * Note this method, unlike java.lang.Character.digit() does not regard the 
-    * ascii characters 'A' - 'Z' and 'a' - 'z' as numbers. 
-    * @param ch Unicode code point
-    * @return numeric value of the code point as a nonnegative integer
+    * <br><em>Semantic Change:</em> In release 1.3.1 and
+    * prior, this returned -1 for ASCII letters and their
+    * fullwidth counterparts.  This has been changed to
+    * conform to the java semantics.
+    * @param ch the code point to query
+    * @return the numeric value of the code point, or -1 if it has no numeric value,
+    * or -2 if it has a numeric value that cannot be represented as a nonnegative 
+    * integer
    */
    public static int getNumericValue(int ch)
+    {
+        return getNumericValueInternal(ch, true);
+    }
+
+   /**
+    * Returns the Unicode numeric value of a code point as a nonnegative
+    * integer, without the Java-style letter values.
+    * <br>Code points whose type is not a numeric type always yield -1;
+    * in particular the European letters 'A' - 'Z' and 'a' - 'z' are not
+    * given the values 10 - 35 here.
+    * <br>A code point with a numeric value that cannot be represented as a
+    * nonnegative integer (for example, a fractional value) yields -2.
+    * @param ch the code point to query
+    * @return the numeric value of the code point, -1 if it has no numeric
+    * value, or -2 if its numeric value cannot be represented as a
+    * nonnegative integer
+    */
+    public static int getUnicodeNumericValue(int ch)
+    {
+        // no fallback to the European letter values for non-numeric types
+        final boolean useEuropean = false;
+        return getNumericValueInternal(ch, useEuropean);
+    }
+
+    private static int getNumericValueInternal(int ch, boolean useEuropean)
    {
        int props = getProps(ch);
        int type = UCharacterPropertyDB.getPropType(props);
@ -439,7 +505,8 @@ public final class UCharacter
        if (type != UCharacterCategory.DECIMAL_DIGIT_NUMBER &&
            type != UCharacterCategory.LETTER_NUMBER &&
            type != UCharacterCategory.OTHER_NUMBER) {
-            return -1;
+
+            return useEuropean ? getEuropeanDigit(ch) : -1;
        }
          
        int result = -1;
@ -453,7 +520,8 @@ public final class UCharacter
            if (PROPERTY_DB_.hasExceptionValue(index, 
                                       UCharacterPropertyDB.EXC_DIGIT_VALUE_)) {
                result  = PROPERTY_DB_.getException(index, 
-                                        UCharacterPropertyDB.EXC_DIGIT_VALUE_); 
+                                        UCharacterPropertyDB.EXC_DIGIT_VALUE_) &
+                    LAST_CHAR_MASK_; 
            }
            else {
                if (!PROPERTY_DB_.hasExceptionValue(index, 
@ -466,10 +534,6 @@ public final class UCharacter
            }
        }
        
-        if (result < 0) {
-          result = getHanDigit(ch);
-        }
-        
        if (result < 0) {
            return -2;
        }
@ -506,19 +570,18 @@ public final class UCharacter
    }
                                    
   /**
-    * Determines if a code point is a digit.<br>
-    * Note this method, unlike java.lang.Character.isDigit() does not regard the 
-    * ascii characters 'A' - 'Z' and 'a' - 'z' as digits.<br>
-    * @param ch code point to determine if it is a digit
-    * @return true if this code point is a digit
-    */
+    * Determines if a code point is a Java digit.
+    * <br>This method observes the semantics of
+    * <code>java.lang.Character.isDigit()</code>.  It returns true for
+    * decimal digits only.
+    * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
+    * treated numeric letters and other numbers as digits.  This has
+    * been changed to conform to the java semantics.
+    * @param ch code point to query
+    * @return true if this code point is a digit */
    public static boolean isDigit(int ch)
    {
-        int cat = getType(ch);
-        // if props == 0, it will just fall through and return false
-        return cat == UCharacterCategory.DECIMAL_DIGIT_NUMBER || 
-            cat == UCharacterCategory.OTHER_NUMBER ||
-            cat == UCharacterCategory.LETTER_NUMBER;
+        return getType(ch) == UCharacterCategory.DECIMAL_DIGIT_NUMBER;
    }

    /**
@ -1527,38 +1590,220 @@ public final class UCharacter
    }
      
    /**
-    * Getting Han character digit values
-    * @param ch code point to test if it is a Han character
-    * @return Han digit value if ch is a Han digit character
+    * Returns the numeric value of a Han 'numeric' code point.
+    * <br>This covers the Han code points for zero through nine as well
+    * as those for ten, hundred, thousand, ten thousand, and hundred
+    * million.  Unicode does not consider these to be numeric.
+    * Both the standard and the 'checkwriting' (complex) characters are
+    * recognized for zero through thousand, zero being accepted as the
+    * 'big circle' zero character and as the complex zero character.
+    * Ten thousand and hundred million are matched in one form only.
+    * @param ch code point to query
+    * @return the value (0 - 9, 10, 100, 1000, 10000 or 100000000) if ch
+    *         is one of the Han numeric characters listed in the switch
+    *         below, otherwise -1.
    */
-    private static int getHanDigit(int ch)
+    public static int getHanNumericValue(int ch)
    {
        switch(ch)
        {
        case IDEOGRAPHIC_NUMBER_ZERO_ :
+        case CJK_IDEOGRAPH_COMPLEX_ZERO:
            return 0; // Han Zero
        case CJK_IDEOGRAPH_FIRST_ :
+        case CJK_IDEOGRAPH_COMPLEX_ONE:
            return 1; // Han One
        case CJK_IDEOGRAPH_SECOND_ :
+        case CJK_IDEOGRAPH_COMPLEX_TWO:
            return 2; // Han Two
        case CJK_IDEOGRAPH_THIRD_ :
+        case CJK_IDEOGRAPH_COMPLEX_THREE:
            return 3; // Han Three
        case CJK_IDEOGRAPH_FOURTH_ :
+        case CJK_IDEOGRAPH_COMPLEX_FOUR:
            return 4; // Han Four
        case CJK_IDEOGRAPH_FIFTH_ :
+        case CJK_IDEOGRAPH_COMPLEX_FIVE:
            return 5; // Han Five
        case CJK_IDEOGRAPH_SIXTH_ :
+        case CJK_IDEOGRAPH_COMPLEX_SIX:
            return 6; // Han Six
        case CJK_IDEOGRAPH_SEVENTH_ :
+        case CJK_IDEOGRAPH_COMPLEX_SEVEN:
            return 7; // Han Seven
        case CJK_IDEOGRAPH_EIGHTH_ : 
+        case CJK_IDEOGRAPH_COMPLEX_EIGHT:
            return 8; // Han Eight
        case CJK_IDEOGRAPH_NINETH_ :
+        case CJK_IDEOGRAPH_COMPLEX_NINE:
            return 9; // Han Nine
+        case CJK_IDEOGRAPH_TEN:
+        case CJK_IDEOGRAPH_COMPLEX_TEN:
+            return 10; // Han Ten
+        case CJK_IDEOGRAPH_HUNDRED:
+        case CJK_IDEOGRAPH_COMPLEX_HUNDRED:
+            return 100; // Han Hundred
+        case CJK_IDEOGRAPH_THOUSAND:
+        case CJK_IDEOGRAPH_COMPLEX_THOUSAND:
+            return 1000; // Han Thousand
+        case CJK_IDEOGRAPH_TEN_THOUSAND:
+            return 10000; // Han Ten Thousand
+        case CJK_IDEOGRAPH_HUNDRED_MILLION:
+            return 100000000; // Han Hundred Million
        }
        return -1; // no value
    }

+    /**
+     * Returns the code point that represents a digit value in the given
+     * digit range.
+     * The range is chosen by one of the DIGIT_RANGE selectors.  Most ranges
+     * only accept values between 0 and 9; the four EUROPEAN_EX ranges also
+     * accept values between 10 and 35, which map to the letters A - Z
+     * (a - z) of that range.  Tamil and Ethiopic have no digit for zero,
+     * so the value 0 yields -1 in those ranges.
+     * <br>This method is static: it reads only static tables, like the
+     * other methods of this class.
+     * @param digit a value from 0 to 9 for most ranges, and from 0 to 35
+     *        for the EUROPEAN_EX ranges.
+     * @param digitRange one of the DIGIT_RANGE selectors.
+     * @return the code point, or -1 if the range has no code point for
+     *         that value.
+     * @exception IllegalArgumentException if digitRange is not one of the
+     *            DIGIT_RANGE selectors.
+     */
+    public static int getCodePointForDigit(int digit, int digitRange) {
+        // DIGIT_RANGE_LIMIT is one past the last selector, so it is invalid too
+        if (digitRange < 0 || digitRange >= DIGIT_RANGE_LIMIT) {
+            throw new IllegalArgumentException("invalid digit range selector: " + digitRange);
+        }
+
+        // only the EUROPEAN_EX ranges carry letters for the values 10 - 35
+        boolean hasLetters = digitRange >= DIGIT_RANGE_EUROPEAN_EX_UC
+                             && digitRange <= DIGIT_RANGE_EUROPEAN_EX_FW_LC;
+        if (digit < 0 || digit > (hasLetters ? 35 : 9)) {
+            return -1;
+        }
+
+        if (digit >= 10) {
+            // exbases holds the code point of the letter for the value 10
+            return exbases[digitRange] + (digit - 10);
+        }
+        if (digitRange == DIGIT_RANGE_HAN) {
+            return hanmap[digit];
+        }
+        if (digitRange == DIGIT_RANGE_HAN_CW) {
+            return exhanmap[digit];
+        }
+        // these scripts start at digit one, there is no code point for zero
+        if (digit == 0 && (digitRange == DIGIT_RANGE_TAMIL
+                           || digitRange == DIGIT_RANGE_ETHIOPIC)) {
+            return -1;
+        }
+        return bases[digitRange] + digit;
+    }
+
+    /*
+     * Code point standing for the value 0 in each contiguous range, indexed
+     * by DIGIT_RANGE selector (the Han ranges are not contiguous and use the
+     * maps below).  For Tamil (0x0be6) and Ethiopic (0x1368) the entry is
+     * the position just before digit one; zero is rejected by the caller.
+     * The two fullwidth ranges start at FULLWIDTH DIGIT ZERO (0xff10).
+     */
+    private static final int[] bases = {
+        0x0030, 0x0030, 0x0030, 0xff10, 0xff10, 
+        0x0660, 0x06f0, 0x0966, 0x09e6, 0x0a66,
+        0x0ae6, 0x0b66, 0x0be6, 0x0c66, 0x0ce6,
+        0x0d66, 0x0e50, 0x0ed0, 0x0f20, 0x1040,
+        0x1368, 0x17e0, 0x1810
+    };
+
+    /*
+     * Code point of the letter standing for the value 10 ('A', 'a' and
+     * their fullwidth forms), indexed by DIGIT_RANGE selector.  Entry 0
+     * (DIGIT_RANGE_EUROPEAN) is unused because that range has no letters.
+     */
+    private static final int[] exbases = {
+        0x0000, 0x0041, 0x0061, 0xff21, 0xff41
+    };
+
+    /* standard Han digits for the values 0 - 9, uses 'big circle' ling */
+    private static final int[] hanmap = {
+        0x3007, 0x4e00, 0x4e8c, 0x4e09, 0x56db, 0x4e94, 0x516d, 0x4e03, 0x516b, 0x4e5d
+    };
+
+    /* 'checkwriting' Han digits for the values 0 - 9, uses the complex ling */
+    private static final int[] exhanmap = {
+        0x96f6, 0x58f9, 0x8cb3, 0x53c3, 0x8086, 0x4f0d, 0x9678, 0x67d2, 0x634c, 0x7396
+    };
+
+    // Han 'checkwriting' (complex) digits and the Han characters for the
+    // larger round values, matched by getHanNumericValue
+    private static final int CJK_IDEOGRAPH_COMPLEX_ZERO = 0x96f6;
+    private static final int CJK_IDEOGRAPH_COMPLEX_ONE  = 0x58f9;
+    private static final int CJK_IDEOGRAPH_COMPLEX_TWO = 0x8cb3;
+    private static final int CJK_IDEOGRAPH_COMPLEX_THREE = 0x53c3;
+    private static final int CJK_IDEOGRAPH_COMPLEX_FOUR = 0x8086;
+    private static final int CJK_IDEOGRAPH_COMPLEX_FIVE = 0x4f0d;
+    private static final int CJK_IDEOGRAPH_COMPLEX_SIX = 0x9678;
+    private static final int CJK_IDEOGRAPH_COMPLEX_SEVEN = 0x67d2;
+    private static final int CJK_IDEOGRAPH_COMPLEX_EIGHT = 0x634c;
+    private static final int CJK_IDEOGRAPH_COMPLEX_NINE = 0x7396;
+    private static final int CJK_IDEOGRAPH_TEN = 0x5341;
+    private static final int CJK_IDEOGRAPH_COMPLEX_TEN = 0x62fe;
+    private static final int CJK_IDEOGRAPH_HUNDRED = 0x767e;
+    private static final int CJK_IDEOGRAPH_COMPLEX_HUNDRED = 0x4f70;
+    private static final int CJK_IDEOGRAPH_THOUSAND = 0x5343;
+    private static final int CJK_IDEOGRAPH_COMPLEX_THOUSAND = 0x4edf;
+    // U+842C is the ideograph for ten thousand (wan); the earlier 0x824c
+    // had two hex digits transposed and named an unrelated character
+    private static final int CJK_IDEOGRAPH_TEN_THOUSAND = 0x842c;
+    private static final int CJK_IDEOGRAPH_HUNDRED_MILLION = 0x5104;
+
+    /** European (ASCII) digits for values 0-9 */
+    public static final int DIGIT_RANGE_EUROPEAN = 0;
+
+    /** European (ASCII) digits for values 0-9 and upper case letters for values 10-35 */
+    public static final int DIGIT_RANGE_EUROPEAN_EX_UC = 1;
+
+    /** European (ASCII) digits for values 0-9 and lower case letters for values 10-35 */
+    public static final int DIGIT_RANGE_EUROPEAN_EX_LC = 2;
+
+    /** European (FullWidth) digits for values 0-9 and fullwidth upper case letters for values 10-35 */
+    public static final int DIGIT_RANGE_EUROPEAN_EX_FW_UC = 3;
+
+    /** European (FullWidth) digits for values 0-9 and fullwidth lower case letters for values 10-35 */
+    public static final int DIGIT_RANGE_EUROPEAN_EX_FW_LC = 4;
+
+    /** Arabic digits for values 0-9 */
+    public static final int DIGIT_RANGE_ARABIC = 5;
+
+    /** Eastern Arabic (Persian) digits for values 0-9 */
+    public static final int DIGIT_RANGE_EASTERN_ARABIC = 6;
+
+    /** Devanagari digits for values 0-9 */
+    public static final int DIGIT_RANGE_DEVANAGARI = 7;
+
+    /** Bengali digits for values 0-9 */
+    public static final int DIGIT_RANGE_BENGALI = 8;
+
+    /** Gurmukhi digits for values 0-9 */
+    public static final int DIGIT_RANGE_GURMUKHI = 9;
+
+    /** Gujarati digits for values 0-9 */
+    public static final int DIGIT_RANGE_GUJARATI = 10;
+
+    /** Oriya digits for values 0-9 */
+    public static final int DIGIT_RANGE_ORIYA = 11;
+
+    /** Tamil digits for values 1-9, Tamil has no digit for zero. */
+    public static final int DIGIT_RANGE_TAMIL = 12;
+
+    /** Telugu digits for values 0-9 */
+    public static final int DIGIT_RANGE_TELUGU = 13;
+
+    /** Kannada digits for values 0-9 */
+    public static final int DIGIT_RANGE_KANNADA = 14;
+
+    /** Malayalam digits for values 0-9 */
+    public static final int DIGIT_RANGE_MALAYAM = 15;
+
+    /** Thai digits for values 0-9 */
+    public static final int DIGIT_RANGE_THAI = 16;
+
+    /** Lao digits for values 0-9 */
+    public static final int DIGIT_RANGE_LAO = 17;
+
+    /** Tibetan digits for values 0-9 */
+    public static final int DIGIT_RANGE_TIBETAN = 18;
+
+    /** Myanmar digits for values 0-9 */
+    public static final int DIGIT_RANGE_MYANMAR = 19;
+
+    /** Ethiopic digits. NOTE(review): Unicode encodes Ethiopic digits for 1-9 only, there is no zero - confirm handling of value 0 */
+    public static final int DIGIT_RANGE_ETHIOPIC = 20;
+
+    /** Khmer digits for values 0-9 */
+    public static final int DIGIT_RANGE_KHMER = 21;
+
+    /** Mongolian digits for values 0-9 */
+    public static final int DIGIT_RANGE_MONGOLIAN = 22;
+
+    /** Han digits for values 0-9 */
+    public static final int DIGIT_RANGE_HAN = 23;
+
+    /** Han ("checkwriting") digits for values 0-9 */
+    public static final int DIGIT_RANGE_HAN_CW = 24;
+
+    private static final int DIGIT_RANGE_LIMIT = 25; // one greater than the largest selector value (DIGIT_RANGE_HAN_CW)
+      
    /**
    * Special casing uppercase management
    * @param ch code point to convert