ICU-13513 Assorted fixes to AffixPatternMatcher and other classes. Found and fixed exponential loop in non-greedy parser.

X-SVN-Rev: 40797
This commit is contained in:
Shane Carr 2018-01-24 02:32:03 +00:00
parent 77b084f6fa
commit a6a18243b1
19 changed files with 96 additions and 226 deletions

View File

@ -176,12 +176,6 @@ public class AffixMatcher implements NumberParseMatcher {
return leadCodePoints.freeze();
}
@Override
public boolean matchesEmpty() {
// This is a stub implementation.
throw new AssertionError();
}
@Override
public void postProcess(ParsedNumber result) {
// Check to see if our affix is the one that was matched. If so, set the flags in the result.

View File

@ -59,12 +59,17 @@ public class AffixPatternMatcher extends SeriesMatcher implements AffixUtils.Tok
@Override
public void consumeToken(int typeOrCp) {
// This is called by AffixUtils.iterateWithConsumer() for each token.
// Add an ignorables matcher between tokens except between two literals, and don't put two
// ignorables matchers in a row.
if (ignorables != null
&& length() > 0
&& (lastTypeOrCp < 0 || !ignorables.getSet().contains(lastTypeOrCp))) {
addMatcher(ignorables);
}
if (typeOrCp < 0) {
// Don't add more than two ignorables matchers in a row
if (ignorables != null
&& (lastTypeOrCp < 0 || !ignorables.getSet().contains(lastTypeOrCp))) {
addMatcher(ignorables);
}
// Case 1: the token is a symbol.
switch (typeOrCp) {
case AffixUtils.TYPE_MINUS_SIGN:
addMatcher(factory.minusSign());
@ -89,16 +94,13 @@ public class AffixPatternMatcher extends SeriesMatcher implements AffixUtils.Tok
default:
throw new AssertionError();
}
} else if (ignorables != null && ignorables.getSet().contains(typeOrCp)) {
// Don't add more than two ignorables matchers in a row
if (lastTypeOrCp < 0 || !ignorables.getSet().contains(lastTypeOrCp)) {
addMatcher(ignorables);
}
// Case 2: the token is an ignorable literal.
// No action necessary: the ignorables matcher has already been added.
} else {
// Start of a literal: add ignorables matcher if the previous token was a symbol
if (ignorables != null && lastTypeOrCp < 0) {
addMatcher(ignorables);
}
// Case 3: the token is a non-ignorable literal.
addMatcher(CodePointMatcher.getInstance(typeOrCp));
}
lastTypeOrCp = typeOrCp;

View File

@ -8,7 +8,8 @@ import java.util.List;
import com.ibm.icu.text.UnicodeSet;
/**
* Composes a number of matchers, and succeeds if any of the matchers succeed.
* Composes a number of matchers, and succeeds if any of the matchers succeed. Always greedily chooses
* the first matcher in the list to succeed.
*
* @author sffc
* @see SeriesMatcher
@ -37,20 +38,17 @@ public class AnyMatcher implements NumberParseMatcher {
return false;
}
// TODO: Give a nice way to reset ParsedNumber to avoid the copy here.
ParsedNumber backup = new ParsedNumber();
backup.copyFrom(result);
int initialOffset = segment.getOffset();
boolean maybeMore = false;
for (int i = 0; i < matchers.size(); i++) {
NumberParseMatcher matcher = matchers.get(i);
maybeMore = maybeMore || matcher.match(segment, result);
if (segment.getOffset() != initialOffset) {
// Match succeeded. Return true here to be safe.
// TODO: Better would be to run each matcher and return true only if at least one of the
// matchers returned true.
return true;
// Match succeeded.
// NOTE: Except for a couple edge cases, if a matcher accepted string A, then it will
// accept any string starting with A. Therefore, there is no possibility that matchers
// later in the list may be evaluated on longer strings, and we can exit the loop here.
break;
}
}
@ -65,6 +63,10 @@ public class AnyMatcher implements NumberParseMatcher {
return UnicodeSet.EMPTY;
}
if (matchers.size() == 1) {
return matchers.get(0).getLeadCodePoints();
}
UnicodeSet leadCodePoints = new UnicodeSet();
for (int i = 0; i < matchers.size(); i++) {
NumberParseMatcher matcher = matchers.get(i);
@ -73,22 +75,6 @@ public class AnyMatcher implements NumberParseMatcher {
return leadCodePoints.freeze();
}
@Override
public boolean matchesEmpty() {
assert frozen;
if (matchers == null) {
return true;
}
for (int i = 0; i < matchers.size(); i++) {
NumberParseMatcher matcher = matchers.get(i);
if (matcher.matchesEmpty()) {
return true;
}
}
return false;
}
@Override
public void postProcess(ParsedNumber result) {
assert frozen;
@ -104,7 +90,7 @@ public class AnyMatcher implements NumberParseMatcher {
@Override
public String toString() {
return "<SeriesMatcher " + matchers + ">";
return "<AnyMatcher " + matchers + ">";
}
}

View File

@ -36,11 +36,6 @@ public class CodePointMatcher implements NumberParseMatcher {
return new UnicodeSet().add(cp).freeze();
}
@Override
public boolean matchesEmpty() {
return false;
}
@Override
public void postProcess(ParsedNumber result) {
// No-op

View File

@ -58,11 +58,6 @@ public class CurrencyMatcher implements NumberParseMatcher {
return leadCodePoints.freeze();
}
@Override
public boolean matchesEmpty() {
return false;
}
@Override
public void postProcess(ParsedNumber result) {
// No-op

View File

@ -58,11 +58,6 @@ public class CurrencyTrieMatcher implements NumberParseMatcher {
return leadCodePoints.freeze();
}
@Override
public boolean matchesEmpty() {
return false;
}
@Override
public void postProcess(ParsedNumber result) {
// No-op

View File

@ -333,11 +333,6 @@ public class DecimalMatcher implements NumberParseMatcher {
return leadCodePoints.freeze();
}
@Override
public boolean matchesEmpty() {
return false;
}
@Override
public void postProcess(ParsedNumber result) {
// No-op

View File

@ -8,7 +8,7 @@ import com.ibm.icu.text.UnicodeSet;
* @author sffc
*
*/
public class IgnorablesMatcher extends RangeMatcher {
public class IgnorablesMatcher extends SymbolMatcher implements NumberParseMatcher.Flexible {
public static final IgnorablesMatcher DEFAULT = new IgnorablesMatcher(
UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.DEFAULT_IGNORABLES));
@ -22,18 +22,7 @@ public class IgnorablesMatcher extends RangeMatcher {
}
private IgnorablesMatcher(UnicodeSet ignorables) {
super(ignorables);
}
@Override
public UnicodeSet getLeadCodePoints() {
if (this == DEFAULT) {
return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.DEFAULT_IGNORABLES);
} else if (this == STRICT) {
return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.STRICT_IGNORABLES);
} else {
return super.getLeadCodePoints();
}
super("", ignorables);
}
@Override

View File

@ -5,10 +5,27 @@ package com.ibm.icu.impl.number.parse;
import com.ibm.icu.text.UnicodeSet;
/**
* @author sffc
* Given a string, there should NOT be more than one way to consume the string with the same matcher
* applied multiple times. If there is, the non-greedy parsing algorithm will be unhappy and may enter an
* exponential-time loop. For example, consider the "A Matcher" that accepts "any number of As". Given
* the string "AAAA", there are 2^N = 8 ways to apply the A Matcher to this string: you could have the A
* Matcher apply 4 times to each character; you could have it apply just once to all the characters; you
* could have it apply to the first 2 characters and the second 2 characters; and so on. A better version
* of the "A Matcher" would be for it to accept exactly one A, and allow the algorithm to run it
* repeatedly to consume a string of multiple As. The A Matcher can implement the Flexible interface
* below to signal that it can be applied multiple times in a row.
*
* @author sffc
*/
public interface NumberParseMatcher {
/**
* Matchers can implement the Flexible interface to indicate that they are optional and can be run
* repeatedly. Used by SeriesMatcher, primarily in the context of IgnorablesMatcher.
*/
public interface Flexible {
}
/**
* Runs this matcher starting at the beginning of the given StringSegment. If this matcher finds
* something interesting in the StringSegment, it should update the offset of the StringSegment
@ -32,15 +49,6 @@ public interface NumberParseMatcher {
*/
public UnicodeSet getLeadCodePoints();
/**
* Whether this matcher is well-defined for the empty string. Matchers that are looking for specific
* symbols should return false here. Matchers that are looking for any number of copies of a certain
* code point or string, like RangeMatcher and IgnorablesMatcher, should return true.
*
* @return Whether this matcher can accept the empty string.
*/
public boolean matchesEmpty();
/**
* Method called at the end of a parse, after all matchers have failed to consume any more chars.
* Allows a matcher to make final modifications to the result given the knowledge that no more

View File

@ -33,7 +33,9 @@ import com.ibm.icu.util.ULocale;
public class NumberParserImpl {
@Deprecated
public static NumberParserImpl createParserFromPattern(
ULocale locale, String pattern, boolean strictGrouping) {
ULocale locale,
String pattern,
boolean strictGrouping) {
// Temporary frontend for testing.
int parseFlags = ParsingUtils.PARSE_FLAG_IGNORE_CASE
@ -196,7 +198,7 @@ public class NumberParserImpl {
parser.addMatcher(InfinityMatcher.getInstance(symbols));
String padString = properties.getPadString();
if (padString != null && !ignorables.getSet().contains(padString)) {
parser.addMatcher(new PaddingMatcher(padString));
parser.addMatcher(PaddingMatcher.getInstance(padString));
}
parser.addMatcher(ignorables);
parser.addMatcher(DecimalMatcher.getInstance(symbols, grouper, parseFlags));
@ -366,6 +368,7 @@ public class NumberParserImpl {
int initialOffset = segment.getOffset();
for (int i = 0; i < matchers.size(); i++) {
NumberParseMatcher matcher = matchers.get(i);
// In a non-greedy parse, we attempt all possible matches and pick the best.
for (int charsToConsume = 0; charsToConsume < segment.length();) {
charsToConsume += Character.charCount(Character.codePointAt(segment, charsToConsume));

View File

@ -8,13 +8,14 @@ import com.ibm.icu.text.UnicodeSet;
* @author sffc
*
*/
public class PaddingMatcher extends RangeMatcher {
public class PaddingMatcher extends SymbolMatcher implements NumberParseMatcher.Flexible {
/**
* @param uniSet
*/
protected PaddingMatcher(String padString) {
super(new UnicodeSet().add(padString).freeze());
public static PaddingMatcher getInstance(String padString) {
return new PaddingMatcher(padString);
}
private PaddingMatcher(String symbolString) {
super(symbolString, UnicodeSet.EMPTY);
}
@Override
@ -29,6 +30,6 @@ public class PaddingMatcher extends RangeMatcher {
@Override
public String toString() {
return "<PaddingMatcher " + uniSet + ">";
return "<PaddingMatcher>";
}
}

View File

@ -1,66 +0,0 @@
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
package com.ibm.icu.impl.number.parse;
import com.ibm.icu.text.UnicodeSet;
/**
* @author sffc
*
*/
public abstract class RangeMatcher implements NumberParseMatcher {
protected final UnicodeSet uniSet;
protected RangeMatcher(UnicodeSet uniSet) {
this.uniSet = uniSet;
}
public UnicodeSet getSet() {
return uniSet;
}
@Override
public boolean match(StringSegment segment, ParsedNumber result) {
// Smoke test first; this matcher might be disabled.
if (isDisabled(result)) {
return false;
}
while (segment.length() > 0) {
int cp = segment.getCodePoint();
if (cp != -1 && uniSet.contains(cp)) {
segment.adjustOffset(Character.charCount(cp));
accept(segment, result);
continue;
}
// If we get here, the code point didn't match the uniSet.
return false;
}
// If we get here, we consumed the entire string segment.
return true;
}
@Override
public UnicodeSet getLeadCodePoints() {
UnicodeSet leadCodePoints = new UnicodeSet();
ParsingUtils.putLeadCodePoints(uniSet, leadCodePoints);
return leadCodePoints.freeze();
}
@Override
public boolean matchesEmpty() {
return true;
}
@Override
public void postProcess(ParsedNumber result) {
// No-op
}
protected abstract boolean isDisabled(ParsedNumber result);
protected abstract void accept(StringSegment segment, ParsedNumber result);
}

View File

@ -93,11 +93,6 @@ public class ScientificMatcher implements NumberParseMatcher {
}
}
@Override
public boolean matchesEmpty() {
return false;
}
@Override
public void postProcess(ParsedNumber result) {
// No-op

View File

@ -31,6 +31,10 @@ public class SeriesMatcher implements NumberParseMatcher {
frozen = true;
}
public int length() {
return matchers == null ? 0 : matchers.size();
}
@Override
public boolean match(StringSegment segment, ParsedNumber result) {
assert frozen;
@ -44,7 +48,7 @@ public class SeriesMatcher implements NumberParseMatcher {
int initialOffset = segment.getOffset();
boolean maybeMore = true;
for (int i = 0; i < matchers.size(); i++) {
for (int i = 0; i < matchers.size();) {
NumberParseMatcher matcher = matchers.get(i);
int matcherOffset = segment.getOffset();
if (segment.length() != 0) {
@ -53,8 +57,19 @@ public class SeriesMatcher implements NumberParseMatcher {
// Nothing for this matcher to match; ask for more.
maybeMore = true;
}
if (segment.getOffset() == matcherOffset && !matcher.matchesEmpty()) {
// Match failed.
boolean success = (segment.getOffset() != matcherOffset);
boolean isFlexible = matcher instanceof NumberParseMatcher.Flexible;
if (success && isFlexible) {
// Match succeeded, and this is a flexible matcher. Re-run it.
} else if (success) {
// Match succeeded, and this is NOT a flexible matcher. Proceed to the next matcher.
i++;
} else if (isFlexible) {
// Match failed, and this is a flexible matcher. Try again with the next matcher.
i++;
} else {
// Match failed, and this is NOT a flexible matcher. Exit.
segment.setOffset(initialOffset);
result.copyFrom(backup);
return maybeMore;
@ -72,35 +87,9 @@ public class SeriesMatcher implements NumberParseMatcher {
return UnicodeSet.EMPTY;
}
if (!matchers.get(0).matchesEmpty()) {
return matchers.get(0).getLeadCodePoints();
}
UnicodeSet leadCodePoints = new UnicodeSet();
for (int i = 0; i < matchers.size(); i++) {
NumberParseMatcher matcher = matchers.get(i);
leadCodePoints.addAll(matcher.getLeadCodePoints());
if (!matcher.matchesEmpty()) {
break;
}
}
return leadCodePoints.freeze();
}
@Override
public boolean matchesEmpty() {
assert frozen;
if (matchers == null) {
return true;
}
for (int i = 0; i < matchers.size(); i++) {
NumberParseMatcher matcher = matchers.get(i);
if (!matcher.matchesEmpty()) {
return false;
}
}
return true;
// SeriesMatchers are never allowed to start with a Flexible matcher.
assert !(matchers.get(0) instanceof NumberParseMatcher.Flexible);
return matchers.get(0).getLeadCodePoints();
}
@Override

View File

@ -25,6 +25,10 @@ public abstract class SymbolMatcher implements NumberParseMatcher {
uniSet = UnicodeSetStaticCache.get(key);
}
public UnicodeSet getSet() {
return uniSet;
}
@Override
public boolean match(StringSegment segment, ParsedNumber result) {
// Smoke test first; this matcher might be disabled.
@ -55,7 +59,7 @@ public abstract class SymbolMatcher implements NumberParseMatcher {
@Override
public UnicodeSet getLeadCodePoints() {
if (string == null || string.isEmpty()) {
if (string.isEmpty()) {
// Assumption: for sets from UnicodeSetStaticCache, uniSet == leadCodePoints.
return uniSet;
}
@ -66,11 +70,6 @@ public abstract class SymbolMatcher implements NumberParseMatcher {
return leadCodePoints.freeze();
}
@Override
public boolean matchesEmpty() {
return false;
}
@Override
public void postProcess(ParsedNumber result) {
// No-op

View File

@ -14,11 +14,6 @@ public abstract class ValidationMatcher implements NumberParseMatcher {
return false;
}
@Override
public boolean matchesEmpty() {
return false;
}
@Override
public UnicodeSet getLeadCodePoints() {
return UnicodeSet.EMPTY;

View File

@ -1492,13 +1492,12 @@ y gh56 -56 JK
y g h56 -56 JK
// S stops parsing after the 'i' for these and returns -56
// C stops before the 'i' and gets 56
// P does not allow ignorables between the 'j' and the 'k'
56ijk -56 CJK
56i jk -56 CJK
56ij k -56 CJKP
56ijk -56 CJKP
56ij k -56 CJK
56ijk -56 CJK
56ijk -56 CJK
56i jk -56 CJKP
56i jk -56 CJK
56i jk -56 CJK
// S and C get 56 (accepts ' ' gs grouping); J and K get null
5 6 fail CSP

View File

@ -5547,8 +5547,7 @@ public class NumberFormatTest extends TestFmwk {
ParsePosition ppos = new ParsePosition(0);
Number result = df.parse("42\u200E%\u200E ", ppos);
assertEquals("Should parse as percentage", new BigDecimal("0.42"), result);
// TODO: The following line breaks in ICU 61.
//assertEquals("Should consume the trailing bidi since it is in the symbol", 5, ppos.getIndex());
assertEquals("Should consume the trailing bidi since it is in the symbol", 5, ppos.getIndex());
ppos.setIndex(0);
result = df.parse("-42a\u200E ", ppos);
assertEquals("Should not parse as percent", new Long(-42), result);

View File

@ -3,7 +3,6 @@
package com.ibm.icu.dev.test.number;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
@ -84,6 +83,7 @@ public class NumberParserTest {
{ 3, "📺1.23", "📺0;📻0", 6, 1.23 },
{ 3, "📻1.23", "📺0;📻0", 6, -1.23 },
{ 3, ".00", "0", 3, 0.0 },
{ 3, " 0", "a0", 31, 0.0}, // should not hang
{ 3, "0", "0", 1, 0.0 } };
for (Object[] cas : cases) {
@ -147,7 +147,6 @@ public class NumberParserTest {
public void testSeriesMatcher() {
DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance(ULocale.ENGLISH);
SeriesMatcher series = new SeriesMatcher();
series.addMatcher(IgnorablesMatcher.DEFAULT);
series.addMatcher(PlusSignMatcher.getInstance(symbols));
series.addMatcher(MinusSignMatcher.getInstance(symbols));
series.addMatcher(IgnorablesMatcher.DEFAULT);
@ -155,23 +154,21 @@ public class NumberParserTest {
series.addMatcher(IgnorablesMatcher.DEFAULT);
series.freeze();
assertEquals(UnicodeSetStaticCache.get(Key.DEFAULT_IGNORABLES).cloneAsThawed()
.addAll(UnicodeSetStaticCache.get(Key.PLUS_SIGN)), series.getLeadCodePoints());
assertFalse(series.matchesEmpty());
assertEquals(UnicodeSetStaticCache.get(Key.PLUS_SIGN), series.getLeadCodePoints());
Object[][] cases = new Object[][] {
{ "", 0, true },
{ " ", 0, true },
{ " ", 0, false },
{ "$", 0, false },
{ "+", 0, true },
{ " +", 0, true },
{ " + ", 0, false },
{ " +", 0, false },
{ "+-", 0, true },
{ "+ -", 0, false },
{ "+- ", 0, true },
{ "+- $", 0, false },
{ "+-%", 3, true },
{ " +- % ", 9, true },
{ " +- % ", 0, false },
{ "+- % ", 7, true },
{ "+-%$", 3, false } };
for (Object[] cas : cases) {
String input = (String) cas[0];