ICU-5909 lenient number formatting, with tests

X-SVN-Rev: 23110
2007-12-18 23:04:19 +00:00 · 2007-12-18 23:04:19 +00:00 · 7347bcee9c
commit 7347bcee9c
parent bebc001574
4 changed files with 133 additions and 26 deletions
--- a/icu4j/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
@ -1110,19 +1110,21 @@ public class NumberFormatTest extends com.ibm.icu.dev.test.TestFmwk {
        /*6*/ "perr:", // <pattern or '-'> <invalid string>
        /*7*/ "pat:", // <pattern or '-'> <exp. toPattern or '-' or 'err'>
        /*8*/ "fpc:", // <loc or '-'> <curr.amt> <exp. string> <exp. curr.amt>
+        /*9*/ "strict=", // true or false
    };

    public void TestCases() {
        String caseFileName = "NumberFormatTestCases.txt";
        java.io.InputStream is = NumberFormatTest.class.getResourceAsStream(caseFileName);

-        ResourceReader reader = new ResourceReader(is, caseFileName);
+        ResourceReader reader = new ResourceReader(is, caseFileName, "utf-8");
        TokenIterator tokens = new TokenIterator(reader);

        Locale loc = new Locale("en", "US", "");
        DecimalFormat ref = null, fmt = null;
        MeasureFormat mfmt = null;
        String pat = null, str = null, mloc = null;
+        boolean strict = false;

        try {
            for (;;) {
@ -1137,10 +1139,14 @@ public class NumberFormatTest extends com.ibm.icu.dev.test.TestFmwk {
                    // ref= <reference pattern>
                    ref = new DecimalFormat(tokens.next(),
                                            new DecimalFormatSymbols(Locale.US));
+                    ref.setParseStrict(strict);
+                    logln("Setting reference pattern to:\t" + ref);
                    break;
                case 1:
                    // loc= <locale>
                    loc = LocaleUtility.getLocaleFromName(tokens.next());
+                    pat = ((DecimalFormat) NumberFormat.getInstance(loc)).toPattern();
+                    logln("Setting locale to:\t" + loc + ", \tand pattern to:\t" + pat);
                    break;
                case 2: // f:
                case 3: // fp:
@ -1149,18 +1155,19 @@ public class NumberFormatTest extends com.ibm.icu.dev.test.TestFmwk {
                    tok = tokens.next();
                    if (!tok.equals("-")) {
                        pat = tok;
-                        try {
-                            fmt = new DecimalFormat(pat, new DecimalFormatSymbols(loc));
-                        } catch (IllegalArgumentException iae) {
-                            errln(where + "Pattern \"" + pat + '"');
-                            iae.printStackTrace();
-                            tokens.next(); // consume remaining tokens
-                            tokens.next();
-                            if (cmd == 3) tokens.next();
-                            continue;
-                        }
                    }
-                    str = null;
+                    try {
+                        fmt = new DecimalFormat(pat, new DecimalFormatSymbols(loc));
+                        fmt.setParseStrict(strict);
+                    } catch (IllegalArgumentException iae) {
+                        errln(where + "Pattern \"" + pat + '"');
+                        iae.printStackTrace();
+                        tokens.next(); // consume remaining tokens
+                        //tokens.next();
+                        if (cmd == 3) tokens.next();
+                        continue;
+                    }
+                   str = null;
                    try {
                        if (cmd == 2 || cmd == 3 || cmd == 4) {
                            // f: <pattern or '-'> <number> <exp. string>
@ -1183,9 +1190,10 @@ public class NumberFormatTest extends com.ibm.icu.dev.test.TestFmwk {
                        else {
                            str = tokens.next();
                            String expstr = tokens.next();
+                            Number parsed = fmt.parse(str);
                            Number exp = (Number) ref.parse(expstr);
                            assertEquals(where + '"' + pat + "\".parse(\"" + str + "\")",
-                                         exp, fmt.parse(str));
+                                         exp, parsed);
                        }
                    } catch (ParseException e) {
                        errln(where + '"' + pat + "\".parse(\"" + str +
@ -1216,6 +1224,7 @@ public class NumberFormatTest extends com.ibm.icu.dev.test.TestFmwk {
                            f = fmt;
                        } else {
                            f = new DecimalFormat(testpat);
+                            f.setParseStrict(strict);
                        }
                        if (err) {
                            errln(where + "Invalid pattern \"" + testpat +
@ -1268,6 +1277,10 @@ public class NumberFormatTest extends com.ibm.icu.dev.test.TestFmwk {
                        e.printStackTrace();
                    }
                    break;
+                case 9: // strict= true or false
+                    strict = "true".equalsIgnoreCase(tokens.next());
+                    logln("Setting strict to:\t" + strict);
+                    break;
                case -1:
                    errln("Unknown command \"" + tok + "\" at " + tokens.describePosition());
                    return;
--- a/icu4j/src/com/ibm/icu/dev/test/format/NumberFormatTestCases.txt
+++ b/icu4j/src/com/ibm/icu/dev/test/format/NumberFormatTestCases.txt
@ -1,4 +1,4 @@
-######################################################################
+######################################################################
 # Copyright (c) 2004, International Business Machines
 # Corporation and others.  All Rights Reserved.
 ######################################################################
@ -73,7 +73,7 @@ rt:  ""       -123.456    "-123.456"

 # Currency
 fpc: "en_US"        1234.56/USD  "$1,234.56"    1234.56/USD
-fpc: -              1234.56/JPY  "\u00A51,235"  1235/JPY
+fpc: -              1234.56/JPY  "¥1,235"  1235/JPY
 # ISO codes that overlap display names (QQQ vs. Q)
 fpc: -              123/QQQ      "QQQ123.00"    123/QQQ   # QQQ is fake
 fpc: -              123/GTQ      "Q123.00"      123/GTQ
@ -83,3 +83,38 @@ fpc: -              2/INR        "Rs.2.00"      2/INR
 # Display names with shared prefix (YDD vs. Y)
 fpc: -              100/YDD      "YDD100.00"    100/YDD
 fpc: -              100/CNY      "Y100.00"      100/CNY
+
+# Lenient Tests
+
+loc= "en"
+p: -              "1,234.56" 1234.56
+p: -              "1'234.56" 1234.56
+p: -              "1 234.56" 1234.56
+p: -              "1٬234.56" 1234.56
+p: -              "1，234．56" 1234.56
+p: -              "1.234.56" 1.234
+p: -              "1、234。56" 1234.56
+
+loc= "fr"
+p: -              "1.234,56" 1234.56
+p: -              "1'234,56" 1234.56
+p: -              "1 234,56" 1234.56
+p: -              "1,234,56" 1.234
+p: -              "1。234、56" 1234.56
+
+loc= "ar"
+p: -              "1.234٫56" 1234.56
+p: -              "1'234،56" 1234.56
+p: -              "1٬234،56" 1234.56
+p: -              "1.234,56" 1234.56
+p: -              "1'234,56" 1234.56
+p: -              "1٬234,56" 1234.56
+
+strict= true
+loc= "en"
+p: -              "1、234。56" 1
+loc= "fr"
+p: -              "1。234、56" 1
+loc= "ar"
+p: -              "1'234،56" 1234
+p: -              "1٬234،56" 1234
--- a/icu4j/src/com/ibm/icu/impl/data/ResourceReader.java
+++ b/icu4j/src/com/ibm/icu/impl/data/ResourceReader.java
@ -81,16 +81,16 @@ public class ResourceReader {
        _reset();
    }

-    /**
-     * Construct a reader object for the input stream associated with
-     * the given resource name.
-     * @param is the input stream of the resource
-     * @param resourceName the name of the resource
-     */
-     public ResourceReader(InputStream is, String resourceName) {
-         this.root = null;
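+         /**
+          * Construct a reader object for the input stream associated with the given resource name.
+          * @param is the input stream of the resource
+          * @param resourceName the name of the resource
+          * @param encoding the encoding stored for this reader; may be null (the two-argument constructor passes null)
+          */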
+          public ResourceReader(InputStream is, String resourceName, String encoding) {
+                   this.root = null;
         this.resourceName = resourceName;
-         this.encoding = null;
+         this.encoding = encoding;

         this.lineNo = -1;
         try {
@ -105,6 +105,16 @@ public class ResourceReader {
         }
     }

+          /**
+           * Construct a reader object for the input stream associated with
+           * the given resource name, delegating to the three-argument constructor with a null encoding.
+           * @param is the input stream of the resource
+           * @param resourceName the name of the resource
+           */
+          public ResourceReader(InputStream is, String resourceName) {
+              this(is, resourceName, null);
+          }
+
    /**
     * Construct a reader object for the text file of the given name
     * in the given class's package, using the default encoding.
--- a/icu4j/src/com/ibm/icu/text/DecimalFormat.java
+++ b/icu4j/src/com/ibm/icu/text/DecimalFormat.java
@ -1709,6 +1709,24 @@ public class DecimalFormat extends NumberFormat {
    private static final int STATUS_INFINITE = 0;
    private static final int STATUS_POSITIVE = 1;
    private static final int STATUS_LENGTH   = 2;
+    private static final UnicodeSet dotEquivalents =(UnicodeSet) new UnicodeSet(
+        "[.\u2024\u3002\uFE12\uFE52\uFF0E\uFF61]").freeze();
+    private static final UnicodeSet commaEquivalents = (UnicodeSet) new UnicodeSet(
+        "[,\u060C\u066B\u3001\uFE10\uFE11\uFE50\uFE51\uFF0C\uFF64]").freeze();
+    private static final UnicodeSet otherGroupingSeparators = (UnicodeSet) new UnicodeSet(
+        "[\\ '\u00A0\u066C\u2000-\u200A\u2018\u2019\u202F\u205F\u3000\uFF07]").freeze();
+    // Strict variants: the dot and comma sets are subsets of the lenient ones; the other-grouping set is identical.
+    private static final UnicodeSet strictDotEquivalents =(UnicodeSet) new UnicodeSet(
+        "[.\u2024\uFE52\uFF0E\uFF61]").freeze();
+    private static final UnicodeSet strictCommaEquivalents = (UnicodeSet) new UnicodeSet(
+        "[,\u066B\uFE10\uFE50\uFF0C]").freeze();
+    private static final UnicodeSet strictOtherGroupingSeparators = (UnicodeSet) new UnicodeSet(
+        "[\\ '\u00A0\u066C\u2000-\u200A\u2018\u2019\u202F\u205F\u3000\uFF07]").freeze();
+    // Frozen unions of the three lenient sets and of the three strict sets, respectively.
+    private static final UnicodeSet defaultGroupingSeparators = (UnicodeSet) new UnicodeSet(
+        dotEquivalents).addAll(commaEquivalents).addAll(otherGroupingSeparators).freeze();
+    private static final UnicodeSet strictDefaultGroupingSeparators = (UnicodeSet) new UnicodeSet(
+            strictDotEquivalents).addAll(strictCommaEquivalents).addAll(strictOtherGroupingSeparators).freeze();

    /**
     * <strong><font face=helvetica color=red>CHANGED</font></strong>
@ -1783,6 +1801,7 @@ public class DecimalFormat extends NumberFormat {
            char decimal = isCurrencyFormat ?
            symbols.getMonetaryDecimalSeparator() : symbols.getDecimalSeparator();
            char grouping = symbols.getGroupingSeparator();
+                        
            String exponentSep = symbols.getExponentSeparator();
            boolean sawDecimal = false;
            boolean sawExponent = false;
@ -1797,6 +1816,19 @@ public class DecimalFormat extends NumberFormat {
            int lastGroup = -1; // where did we last see a grouping separator?
            int prevGroup = -1; // where did we see the grouping separator before that?
            int gs2 = groupingSize2 == 0 ? groupingSize : groupingSize2;
+            
+            // equivalent grouping and decimal support
+            
+            // TODO markdavis Cache these if it makes a difference in performance.
+            UnicodeSet decimalSet = new UnicodeSet(getSimilarDecimals(decimal, strictParse));
+            UnicodeSet groupingSet = new UnicodeSet(strictParse ? strictDefaultGroupingSeparators : defaultGroupingSeparators)
+                .add(grouping).removeAll(decimalSet);
+            
+            // We are guaranteed that
+            // decimalSet contains the decimal, and
+            // groupingSet contains the grouping separator
+            // (unless the grouping separator is also in decimalSet, which should never happen; in that case removeAll() drops it from groupingSet).
+

            // We have to track digitCount ourselves, because digits.count will
            // pin when the maximum allowable digits is reached.
@ -1894,7 +1926,7 @@ public class DecimalFormat extends NumberFormat {
                    // Cancel out backup setting (see grouping handler below)
                    backup = -1;
                }
-                else if (!isExponent && ch == decimal)
+                else if (!isExponent && decimalSet.contains(ch))
                {
                    if (strictParse) {
                        if (backup != -1 ||
@ -1910,7 +1942,7 @@ public class DecimalFormat extends NumberFormat {
                    sawDecimal = true;
                    leadingZero = false; // a single leading zero before a decimal is ok
                }
-                else if (!isExponent && ch == grouping && isGroupingUsed())
+                else if (!isExponent && isGroupingUsed() && groupingSet.contains(ch))
                {
                    if (sawDecimal) {
                        break;
@ -2067,6 +2099,23 @@ public class DecimalFormat extends NumberFormat {
        return true;
    }

+    /**
+     * Return characters that are used where this decimal is used.
+     * @param decimal the decimal separator to look up in the dot and comma equivalents
+     * @param strictParse if true, return the strict variant of the matching set
+     * @return one of the shared static sets on a match (not a copy), else a new set holding only decimal
+     */
+    private UnicodeSet getSimilarDecimals(char decimal, boolean strictParse) {
+        if (dotEquivalents.contains(decimal)) {
+            return strictParse ? strictDotEquivalents : dotEquivalents;
+        }
+        if (commaEquivalents.contains(decimal)) {
+            return strictParse ? strictCommaEquivalents : commaEquivalents;
+        }
+        // if there is no match, return a set containing only the character itself
+        return new UnicodeSet().add(decimal);
+    }
+
    /**
     * Starting at position, advance past a run of pad characters, if any.
     * Return the index of the first character after position that is not a pad