From 9ae7e8eba17f23cdb9d27167d9862d320c7c8afe Mon Sep 17 00:00:00 2001 From: Shane Carr Date: Wed, 28 Feb 2018 03:42:32 +0000 Subject: [PATCH] ICU-13084 Updating set of ignorable control characters to [:DI:]. X-SVN-Rev: 41002 --- icu4c/source/i18n/numparse_unisets.cpp | 5 ++-- .../number/parse/UnicodeSetStaticCache.java | 5 ++-- .../src/com/ibm/icu/text/DecimalFormat.java | 3 ++- .../icu/dev/test/format/NumberFormatTest.java | 23 ++++++++++++++++++- 4 files changed, 30 insertions(+), 6 deletions(-) diff --git a/icu4c/source/i18n/numparse_unisets.cpp b/icu4c/source/i18n/numparse_unisets.cpp index fc0274f2a3..0a8ec2bebb 100644 --- a/icu4c/source/i18n/numparse_unisets.cpp +++ b/icu4c/source/i18n/numparse_unisets.cpp @@ -63,8 +63,9 @@ void U_CALLCONV initNumberParseUniSets(UErrorCode& status) { gUnicodeSets[EMPTY] = new UnicodeSet(); - // BiDi characters are skipped over and ignored at any point in the string, even in strict mode. - gUnicodeSets[BIDI] = new UnicodeSet(u"[[\\u200E\\u200F\\u061C]]", status); + // These characters are skipped over and ignored at any point in the string, even in strict mode. + // See ticket #13084. + gUnicodeSets[BIDI] = new UnicodeSet(u"[[:DI:]]", status); // This set was decided after discussion with icu-design@. See ticket #13309. // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property). diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java index 5ab7081704..edc0e99114 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java @@ -81,8 +81,9 @@ public class UnicodeSetStaticCache { } static { - // BiDi characters are skipped over and ignored at any point in the string, even in strict mode. - unicodeSets.put(Key.BIDI, new UnicodeSet("[[\\u200E\\u200F\\u061C]]").freeze()); + // These characters are skipped over and ignored at any point in the string, even in strict mode. + // See ticket #13084. + unicodeSets.put(Key.BIDI, new UnicodeSet("[[:DI:]]").freeze()); // This set was decided after discussion with icu-design@. See ticket #13309. // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property). diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java b/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java index 5f68fe6046..37e4064666 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java @@ -198,7 +198,7 @@ import com.ibm.icu.util.ULocale.Category; * example, a formatter instance gotten from NumberFormat.getInstance(ULocale, * NumberFormat.CURRENCYSTYLE) can parse both "USD1.00" and "3.00 US dollars". * - *

Whitespace characters (lenient mode) and bidi control characters (lenient and strict mode), + *

Whitespace characters (lenient mode) and control characters (lenient and strict mode), * collectively called "ignorables", do not need to match in identity or quantity between the * pattern string and the input string. For example, the pattern "# %" matches "35 %" (with a single * space), "35%" (with no space), "35 %" (with a non-breaking space), and "35  %" (with @@ -206,6 +206,7 @@ import com.ibm.icu.util.ULocale.Category; * number: prefix, number, exponent separator, and suffix. Ignorable whitespace characters are those * having the Unicode "blank" property for regular expressions, defined in UTS #18 Annex C, which is * "horizontal" whitespace, like spaces and tabs, but not "vertical" whitespace, like line breaks. + * Ignorable control characters are those in the Unicode set [:Default_Ignorable_Code_Point:]. * *

If {@link #parse(String, ParsePosition)} fails to parse a string, it returns null * and leaves the parse position unchanged. The convenience method {@link #parse(String)} indicates diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java index 4697abd5da..370a843f51 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java @@ -1722,11 +1722,32 @@ public class NumberFormatTest extends TestFmwk { // Test all characters in the UTS 18 "blank" set stated in the API docstring. UnicodeSet blanks = new UnicodeSet("[[:Zs:][\\u0009]]").freeze(); for (String space : blanks) { - String str = "a " + space + " b1234"; + String str = "a " + space + " b1234c "; + expect(fmt, str, n); + } + + // Arbitrary whitespace is not accepted in strict mode. + fmt.setParseStrict(true); + for (String space : blanks) { + String str = "a " + space + " b1234c "; + expectParseException(fmt, str, n); + } + + // Test default ignorable characters. These should work in both lenient and strict. + UnicodeSet defaultIgnorables = new UnicodeSet("[[:Default_Ignorable_Code_Point:]]").freeze(); + fmt.setParseStrict(false); + for (String ignorable : defaultIgnorables) { + String str = "a b " + ignorable + "1234c "; + expect(fmt, str, n); + } + fmt.setParseStrict(true); + for (String ignorable : defaultIgnorables) { + String str = "a b " + ignorable + "1234c "; expect(fmt, str, n); } // Test that other whitespace characters do not work + fmt.setParseStrict(false); UnicodeSet otherWhitespace = new UnicodeSet("[[:whitespace:]]").removeAll(blanks).freeze(); for (String space : otherWhitespace) { String str = "a " + space + " b1234";