ICU-21071 Fix lenient parse rules

- Check non-lenient rules before call lenint parsing - Remove logKnownIssue 9503 from test code - Adjust TestAllLocales test on ICU4C - Add lenient checks on ICU4J
2020-04-21 18:25:46 -03:00 · 2020-04-21 18:25:46 -03:00 · 440cef61a7
commit 440cef61a7
parent 5944e1856c
6 changed files with 156 additions and 59 deletions
--- a/icu4c/source/i18n/nfrule.cpp
+++ b/icu4c/source/i18n/nfrule.cpp
@ -1297,6 +1297,10 @@ NFRule::prefixLength(const UnicodeString& str, const UnicodeString& prefix, UErr
 #if !UCONFIG_NO_COLLATION
    // go through all this grief if we're in lenient-parse mode
    if (formatter->isLenient()) {
+        // Check if non-lenient rule finds the text before call lenient parsing
+        if (str.startsWith(prefix)) {
+            return prefix.length();
+        }
        // get the formatter's collator and use it to create two
        // collation element iterators, one over the target string
        // and another over the prefix (right now, we'll throw an
@ -1505,9 +1509,15 @@ NFRule::findText(const UnicodeString& str,
        return str.indexOf(key, startingAt);
    }
    else {
-        // but if lenient parsing is turned ON, we've got some work
-        // ahead of us
-        return findTextLenient(str, key, startingAt, length);
+        // Check if non-lenient rule finds the text before call lenient parsing
+        *length = key.length();
+        int32_t pos = str.indexOf(key, startingAt);
+        if(pos >= 0) {
+            return pos;
+        } else {
+            // but if lenient parsing is turned ON, we've got some work ahead of us
+            return findTextLenient(str, key, startingAt, length);
+        }
    }
 }

--- a/icu4c/source/i18n/plurfmt.cpp
+++ b/icu4c/source/i18n/plurfmt.cpp
@ -549,9 +549,15 @@ void PluralFormat::parseType(const UnicodeString& source, const NFRule *rbnfLeni

        UnicodeString currArg = pattern.tempSubString(partStart->getLimit(), partLimit->getIndex() - partStart->getLimit());
        if (rbnfLenientScanner != NULL) {
-            // If lenient parsing is turned ON, we've got some time consuming parsing ahead of us.
-            int32_t length = -1;
-            currMatchIndex = rbnfLenientScanner->findTextLenient(source, currArg, startingAt, &length);
+            // Check if non-lenient rule finds the text before call lenient parsing
+            int32_t tempIndex = source.indexOf(currArg, startingAt);
+            if (tempIndex >= 0) {
+                currMatchIndex = tempIndex;
+            } else {
+                // If lenient parsing is turned ON, we've got some time consuming parsing ahead of us.
+                int32_t length = -1;
+                currMatchIndex = rbnfLenientScanner->findTextLenient(source, currArg, startingAt, &length);
+            }
        }
        else {
            currMatchIndex = source.indexOf(currArg, startingAt);
--- a/icu4c/source/test/intltest/itrbnf.cpp
+++ b/icu4c/source/test/intltest/itrbnf.cpp
@ -1889,16 +1889,19 @@ IntlTestRBNF::TestAllLocales()
            UErrorCode status = U_ZERO_ERROR;
            RuleBasedNumberFormat* f = new RuleBasedNumberFormat((URBNFRuleSetTag)j, *loc, status);

-            if (status == U_USING_DEFAULT_WARNING || status == U_USING_FALLBACK_WARNING) {
-                // Skip it.
-                delete f;
-                break;
-            }
            if (U_FAILURE(status)) {
                errln(UnicodeString(loc->getName()) + names[j]
                    + "ERROR could not instantiate -> " + u_errorName(status));
                continue;
            }
+
+            Locale actualLocale = f->getLocale(ULOC_ACTUAL_LOCALE, status);
+            if (actualLocale != *loc) {
+                // Skip the redundancy
+                delete f;
+                break;
+            }
+
 #if !UCONFIG_NO_COLLATION
            for (unsigned int numidx = 0; numidx < UPRV_LENGTHOF(numbers); numidx++) {
                double n = numbers[numidx];
@ -1936,28 +1939,26 @@ IntlTestRBNF::TestAllLocales()
                            + UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getDouble());
                    }
                }
-                if (!quick && !logKnownIssue("9503") ) {
-                    // lenient parse
-                    status = U_ZERO_ERROR;
-                    f->setLenient(TRUE);
-                    f->parse(str, num, status);
-                    if (U_FAILURE(status)) {
+                // lenient parse
+                status = U_ZERO_ERROR;
+                f->setLenient(TRUE);
+                f->parse(str, num, status);
+                if (U_FAILURE(status)) {
+                    errln(UnicodeString(loc->getName()) + names[j]
+                        + "ERROR could not parse(lenient) '" + str + "' -> " + u_errorName(status));
+                }
+                // We only check the spellout. The behavior is undefined for numbers < 1 and fractional numbers.
+                if (j == 0) {
+                    if (num.getType() == Formattable::kLong && num.getLong() != n) {
                        errln(UnicodeString(loc->getName()) + names[j]
-                            + "ERROR could not parse(lenient) '" + str + "' -> " + u_errorName(status));
+                            + UnicodeString("ERROR could not roundtrip ") + n
+                            + UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getLong());
                    }
-                    // We only check the spellout. The behavior is undefined for numbers < 1 and fractional numbers.
-                    if (j == 0) {
-                        if (num.getType() == Formattable::kLong && num.getLong() != n) {
-                            errln(UnicodeString(loc->getName()) + names[j]
-                                + UnicodeString("ERROR could not roundtrip ") + n
-                                + UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getLong());
-                        }
-                        else if (num.getType() == Formattable::kDouble && (int64_t)(num.getDouble() * 1000) != (int64_t)(n*1000)) {
-                            // The epsilon difference is too high.
-                            errln(UnicodeString(loc->getName()) + names[j]
-                                + UnicodeString("ERROR could not roundtrip ") + n
-                                + UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getDouble());
-                        }
+                    else if (num.getType() == Formattable::kDouble && (int64_t)(num.getDouble() * 1000) != (int64_t)(n*1000)) {
+                        // The epsilon difference is too high.
+                        errln(UnicodeString(loc->getName()) + names[j]
+                            + UnicodeString("ERROR could not roundtrip ") + n
+                            + UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getDouble());
                    }
                }
            }
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/NFRule.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/NFRule.java
@ -1241,6 +1241,10 @@ final class NFRule {

        RbnfLenientScanner scanner = formatter.getLenientScanner();
        if (scanner != null) {
+            // Check if non-lenient rule finds the text before call lenient parsing
+            if (str.startsWith(prefix)) {
+                return prefix.length();
+            }
            return scanner.prefixLength(str, prefix);
        }

@ -1290,9 +1294,14 @@ final class NFRule {
        }

        if (scanner != null) {
-            // if lenient parsing is turned ON, we've got some work
-            // ahead of us
-            return scanner.findText(str, key, startingAt);
+            // Check if non-lenient rule finds the text before call lenient parsing
+            int pos[] = new int[] { str.indexOf(key, startingAt), key.length() };
+            if (pos[0] >= 0) {
+                return pos;
+            } else {
+                // if lenient parsing is turned ON, we've got some work ahead of us
+                return scanner.findText(str, key, startingAt);
+            }
        }
        // if lenient parsing is turned off, this is easy. Just call
        // String.indexOf() and we're done
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/PluralFormat.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/PluralFormat.java
@ -760,9 +760,15 @@ public class PluralFormat extends UFormat {

            String currArg = pattern.substring(partStart.getLimit(), partLimit.getIndex());
            if (scanner != null) {
-                // If lenient parsing is turned ON, we've got some time consuming parsing ahead of us.
-                int[] scannerMatchResult = scanner.findText(source, currArg, startingAt);
-                currMatchIndex = scannerMatchResult[0];
+                // Check if non-lenient rule finds the text before call lenient parsing
+                int tempPos = source.indexOf(currArg, startingAt);
+                if (tempPos >= 0) {
+                    currMatchIndex = tempPos;
+                } else {
+                    // If lenient parsing is turned ON, we've got some time consuming parsing ahead of us.
+                    int[] scannerMatchResult = scanner.findText(source, currArg, startingAt);
+                    currMatchIndex = scannerMatchResult[0];
+                }
            }
            else {
                currMatchIndex = source.indexOf(currArg, startingAt);
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/RbnfTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/RbnfTest.java
@ -296,6 +296,17 @@ public class RbnfTest extends TestFmwk {
        };

        doTest(formatter, testData, true);
+
+        String[][] testDataLenient = {
+                { "fifty-7", "57" },
+                { " fifty-7", "57" },
+                { "  fifty-7", "57" },
+                { "2 thousand six    HUNDRED fifty-7", "2,657" },
+                { "fifteen hundred and zero", "1,500" },
+                { "FOurhundred     thiRTY six", "436" }
+        };
+
+        doParsingTest(formatter, testDataLenient, true);
    }

    /**
@ -350,6 +361,12 @@ public class RbnfTest extends TestFmwk {
        };

        doTest(formatter, testData, true);
+
+        String[][] testDataLenient = {
+                { "2-51-33", "10,293" },
+        };
+
+        doParsingTest(formatter, testDataLenient, true);
    }

    /**
@ -425,6 +442,13 @@ public class RbnfTest extends TestFmwk {
        };

        doTest(formatter, testData, true);
+
+        String[][] testDataLenient = {
+                { "trente-et-un", "31" },
+                { "un cent quatre vingt dix huit", "198" },
+        };
+
+        doParsingTest(formatter, testDataLenient, true);
    }

    /**
@ -529,6 +553,12 @@ public class RbnfTest extends TestFmwk {
        };

        doTest(formatter, testData, true);
+
+        String[][] testDataLenient = {
+                { "ein Tausend sechs Hundert fuenfunddreissig", "1,635" },
+        };
+
+        doParsingTest(formatter, testDataLenient, true);
    }

    /**
@ -1117,6 +1147,10 @@ public class RbnfTest extends TestFmwk {
                " (ordinal) "
                //" (duration) " // English only
        };
+        boolean[] lenientMode = {
+                false, // non-lenient mode
+                true // lenient mode
+        };
        double[] numbers = {45.678, 1, 2, 10, 11, 100, 110, 200, 1000, 1111, -1111};
        int count = numbers.length;
        Random r = (count <= numbers.length ? null : createRandom());
@ -1142,25 +1176,25 @@ public class RbnfTest extends TestFmwk {
                        logln(loc.getName() + names[j] + "success format: " + n + " -> " + s);
                    }

-                    try {
-                        // RBNF parse is extremely slow when lenient option is enabled.
-                        // non-lenient parse
-                        fmt.setLenientParseMode(false);
-                        Number num = fmt.parse(s);
-                        if (isVerbose()) {
-                            logln(loc.getName() + names[j] + "success parse: " + s + " -> " + num);
+                    for (int k = 0; k < lenientMode.length; k++) {
+                        try {
+                            fmt.setLenientParseMode(lenientMode[k]);
+                            Number num = fmt.parse(s);
+                            if (isVerbose()) {
+                                logln(loc.getName() + names[j] + "success parse: " + s + " -> " + num);
+                            }
+                            if (j != 0) {
+                                // TODO: Fix the ordinal rules.
+                                continue;
+                            }
+                            if (n != num.doubleValue()) {
+                                errors.append("\n" + loc + names[j] + "got " + num + " expected " + n);
+                            }
+                        } catch (ParseException pe) {
+                            String msg = loc.getName() + names[j] + "ERROR:" + pe.getMessage();
+                            logln(msg);
+                            errors.append("\n" + msg);
                        }
-                        if (j != 0) {
-                            // TODO: Fix the ordinal rules.
-                            continue;
-                        }
-                        if (n != num.doubleValue()) {
-                            errors.append("\n" + loc + names[j] + "got " + num + " expected " + n);
-                        }
-                    } catch (ParseException pe) {
-                        String msg = loc.getName() + names[j] + "ERROR:" + pe.getMessage();
-                        logln(msg);
-                        errors.append("\n" + msg);
                    }
                }
            }
@ -1170,10 +1204,12 @@ public class RbnfTest extends TestFmwk {
        }
    }

-    void doTest(RuleBasedNumberFormat formatter, String[][] testData,
-                boolean testParsing) {
-        //        NumberFormat decFmt = NumberFormat.getInstance(Locale.US);
-        NumberFormat decFmt = new DecimalFormat("#,###.################");
+    NumberFormat createDecimalFormatter() {
+        return new DecimalFormat("#,###.################");
+    }
+
+    void doTest(RuleBasedNumberFormat formatter, String[][] testData, boolean testParsing) {
+        NumberFormat decFmt = createDecimalFormatter();
        try {
            for (int i = 0; i < testData.length; i++) {
                String number = testData[i][0];
@ -1207,6 +1243,35 @@ public class RbnfTest extends TestFmwk {
        }
    }

+    void doParsingTest(RuleBasedNumberFormat formatter, String[][] testData, boolean lenient) {
+        NumberFormat decFmt = createDecimalFormatter();
+
+        if (lenient) {
+            formatter.setLenientParseMode(true);
+        }
+        for (int i = 0; i < testData.length; i++) {
+            try {
+                String s = testData[i][0];
+                Number expectedNumber = decFmt.parse(testData[i][1]);
+                if (isVerbose()) {
+                    logln("test[" + i + "] spellout value: (" + s + ") target: " + expectedNumber);
+                }
+
+                Number num = formatter.parse(s);
+                if (isVerbose()) {
+                    logln("success parse: (" + s + ") -> " + num);
+                }
+
+                if (expectedNumber.doubleValue() != num.doubleValue()) {
+                    errln("\nParsing (" + s + ") failed: got " + num + " expected " + expectedNumber);
+                }
+            } catch (Throwable e) {
+                e.printStackTrace();
+                errln("Test failed with exception: " + e.toString());
+            }
+        }
+    }
+
    /* Tests the method
     *      public boolean equals(Object that)
     */