ICU-11738 Updated to handle string ranges.
X-SVN-Rev: 37943
This commit is contained in:
parent
1f9540cce1
commit
003c9da518
282
icu4j/main/classes/core/src/com/ibm/icu/impl/StringRange.java
Normal file
282
icu4j/main/classes/core/src/com/ibm/icu/impl/StringRange.java
Normal file
@ -0,0 +1,282 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2015, Google, Inc., International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import com.ibm.icu.lang.CharSequences;
|
||||
import com.ibm.icu.util.ICUException;
|
||||
|
||||
@SuppressWarnings("deprecation")
|
||||
public class StringRange {
|
||||
private static final boolean DEBUG = false;
|
||||
|
||||
/**
 * Callback that receives the items produced by {@code compact}, one call per
 * single string or per string range.
 */
public interface Adder {
    /**
     * Receives one compacted item.
     *
     * @param start the single string, or the first string of the range
     * @param end the end of the range; may be null, for adding single string
     */
    void add(String start, String end);
}
|
||||
|
||||
public static final Comparator<int[]> COMPARE_INT_ARRAYS = new Comparator<int[]>() {
|
||||
public int compare(int[] o1, int[] o2) {
|
||||
int minIndex = Math.min(o1.length, o2.length);
|
||||
for (int i = 0; i < minIndex; ++i) {
|
||||
int diff = o1[i] - o2[i];
|
||||
if (diff != 0) {
|
||||
return diff;
|
||||
}
|
||||
}
|
||||
return o1.length - o2.length;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Compact the set of strings.
|
||||
* @param source
|
||||
* @param adder adds each pair to the output. See the {@link Adder} interface.
|
||||
* @param shorterPairs use abc-d instead of abc-abd
|
||||
* @param moreCompact use a more compact form, at the expense of more processing. If false, source must be sorted.
|
||||
*/
|
||||
public static void compact(Set<String> source, Adder adder, boolean shorterPairs, boolean moreCompact) {
|
||||
if (!moreCompact) {
|
||||
String start = null;
|
||||
String end = null;
|
||||
int lastCp = 0;
|
||||
int prefixLen = 0;
|
||||
for (String s : source) {
|
||||
if (start != null) { // We have something queued up
|
||||
if (s.regionMatches(0, start, 0, prefixLen)) {
|
||||
int currentCp = s.codePointAt(prefixLen);
|
||||
if (currentCp == 1+lastCp && s.length() == prefixLen + Character.charCount(currentCp)) {
|
||||
end = s;
|
||||
lastCp = currentCp;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// We failed to find continuation. Add what we have and restart
|
||||
adder.add(start, end == null ? null
|
||||
: !shorterPairs ? end
|
||||
: end.substring(prefixLen, end.length()));
|
||||
}
|
||||
// new possible range
|
||||
start = s;
|
||||
end = null;
|
||||
lastCp = s.codePointBefore(s.length());
|
||||
prefixLen = s.length() - Character.charCount(lastCp);
|
||||
}
|
||||
adder.add(start, end == null ? null
|
||||
: !shorterPairs ? end
|
||||
: end.substring(prefixLen, end.length()));
|
||||
} else {
|
||||
// not a fast algorithm, but ok for now
|
||||
// TODO rewire to use the first (slower) algorithm to generate the ranges, then compact them from there.
|
||||
// first sort by lengths
|
||||
Relation<Integer,Ranges> lengthToArrays = Relation.of(new TreeMap<Integer,Set<Ranges>>(), TreeSet.class);
|
||||
for (String s : source) {
|
||||
Ranges item = new Ranges(s);
|
||||
lengthToArrays.put(item.size(), item);
|
||||
}
|
||||
// then compact items of each length and emit compacted sets
|
||||
for (Entry<Integer, Set<Ranges>> entry : lengthToArrays.keyValuesSet()) {
|
||||
LinkedList<Ranges> compacted = compact(entry.getKey(), entry.getValue());
|
||||
for (Ranges ranges : compacted) {
|
||||
adder.add(ranges.start(), ranges.end(shorterPairs));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Faster but not as good compaction. Only looks at final codepoint.
|
||||
* @param source
|
||||
* @param adder
|
||||
* @param shorterPairs
|
||||
*/
|
||||
public static void compact(Set<String> source, Adder adder, boolean shorterPairs) {
|
||||
compact(source,adder,shorterPairs,false);
|
||||
}
|
||||
|
||||
private static LinkedList<Ranges> compact(int size, Set<Ranges> inputRanges) {
|
||||
LinkedList<Ranges> ranges = new LinkedList<Ranges>(inputRanges);
|
||||
for (int i = size-1; i >= 0; --i) {
|
||||
Ranges last = null;
|
||||
for (Iterator<Ranges> it = ranges.iterator(); it.hasNext();) {
|
||||
Ranges item = it.next();
|
||||
if (last == null) {
|
||||
last = item;
|
||||
} else if (last.merge(i, item)) {
|
||||
it.remove();
|
||||
} else {
|
||||
last = item; // go to next
|
||||
}
|
||||
}
|
||||
};
|
||||
return ranges;
|
||||
}
|
||||
|
||||
static final class Range implements Comparable<Range>{
|
||||
int min;
|
||||
int max;
|
||||
public Range(int min, int max) {
|
||||
this.min = min;
|
||||
this.max = max;
|
||||
}
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
return compareTo((Range)obj) == 0;
|
||||
}
|
||||
public int compareTo(Range that) {
|
||||
int diff = min - that.min;
|
||||
if (diff != 0) {
|
||||
return diff;
|
||||
}
|
||||
return max - that.max;
|
||||
}
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return min * 37 + max;
|
||||
}
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder result = new StringBuilder().appendCodePoint(min);
|
||||
return min == max ? result.toString() : result.append('~').appendCodePoint(max).toString();
|
||||
}
|
||||
}
|
||||
|
||||
static final class Ranges implements Comparable<Ranges> {
|
||||
private final Range[] ranges;
|
||||
public Ranges(String s) {
|
||||
int[] array = CharSequences.codePoints(s);
|
||||
ranges = new Range[array.length];
|
||||
for (int i = 0; i < array.length; ++i) {
|
||||
ranges[i] = new Range(array[i], array[i]);
|
||||
}
|
||||
}
|
||||
public boolean merge(int pivot, Ranges other) {
|
||||
// if (this.toString().equals("afz")) {
|
||||
// int debug = 0;
|
||||
// }
|
||||
// we will merge items if the pivot is adjacent, and all other ranges are equal
|
||||
for (int i = ranges.length-1; i >= 0; --i) {
|
||||
if (i == pivot) {
|
||||
if (ranges[i].max != other.ranges[i].min-1) { // not adjacent
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (!ranges[i].equals(other.ranges[i])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (DEBUG) System.out.print("Merging: " + this + ", " + other);
|
||||
ranges[pivot].max = other.ranges[pivot].max;
|
||||
if (DEBUG) System.out.println(" => " + this);
|
||||
return true;
|
||||
}
|
||||
|
||||
public String start() {
|
||||
StringBuilder result = new StringBuilder();
|
||||
for (int i = 0; i < ranges.length; ++i) {
|
||||
result.appendCodePoint(ranges[i].min);
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
public String end(boolean mostCompact) {
|
||||
int firstDiff = firstDifference();
|
||||
if (firstDiff == ranges.length) {
|
||||
return null;
|
||||
}
|
||||
StringBuilder result = new StringBuilder();
|
||||
for (int i = mostCompact ? firstDiff : 0; i < ranges.length; ++i) {
|
||||
result.appendCodePoint(ranges[i].max);
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
public int firstDifference() {
|
||||
for (int i = 0; i < ranges.length; ++i) {
|
||||
if (ranges[i].min != ranges[i].max){
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return ranges.length;
|
||||
}
|
||||
public Integer size() {
|
||||
return ranges.length;
|
||||
}
|
||||
public int compareTo(Ranges other) {
|
||||
int diff = ranges.length - other.ranges.length;
|
||||
if (diff != 0) {
|
||||
return diff;
|
||||
}
|
||||
for (int i = 0; i < ranges.length; ++i) {
|
||||
diff = ranges[i].compareTo(other.ranges[i]);
|
||||
if (diff != 0) {
|
||||
return diff;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@Override
|
||||
public String toString() {
|
||||
String start = start();
|
||||
String end = end(false);
|
||||
return end == null ? start : start + "~" + end;
|
||||
}
|
||||
}
|
||||
|
||||
public static Collection<String> expand(String start, String end, boolean requireSameLength, Collection<String> output) {
|
||||
if (start == null || end == null) {
|
||||
throw new ICUException("Range must have 2 valid strings");
|
||||
}
|
||||
int[] startCps = CharSequences.codePoints(start);
|
||||
int[] endCps = CharSequences.codePoints(end);
|
||||
int startOffset = startCps.length - endCps.length;
|
||||
|
||||
if (requireSameLength && startOffset != 0) {
|
||||
throw new ICUException("Range must have equal-length strings");
|
||||
} else if (startOffset < 0) {
|
||||
throw new ICUException("Range must have start-length ≥ end-length");
|
||||
} else if (endCps.length == 0) {
|
||||
throw new ICUException("Range must have end-length > 0");
|
||||
}
|
||||
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for (int i = 0; i < startOffset; ++i) {
|
||||
builder.appendCodePoint(startCps[i]);
|
||||
}
|
||||
add(0, startOffset, startCps, endCps, builder, output);
|
||||
return output;
|
||||
}
|
||||
|
||||
private static void add(int endIndex, int startOffset, int[] starts, int[] ends, StringBuilder builder, Collection<String> output) {
|
||||
int start = starts[endIndex+startOffset];
|
||||
int end = ends[endIndex];
|
||||
if (start > end) {
|
||||
throw new ICUException("Range must have xᵢ ≤ yᵢ for each index i");
|
||||
}
|
||||
boolean last = endIndex == ends.length - 1;
|
||||
int startLen = builder.length();
|
||||
for (int i = start; i <= end; ++i) {
|
||||
builder.appendCodePoint(i);
|
||||
if (last) {
|
||||
output.add(builder.toString());
|
||||
} else {
|
||||
add(endIndex+1, startOffset, starts, ends, builder, output);
|
||||
}
|
||||
builder.setLength(startLen);
|
||||
}
|
||||
}
|
||||
}
|
@ -20,6 +20,7 @@ import com.ibm.icu.impl.Norm2AllModes;
|
||||
import com.ibm.icu.impl.PatternProps;
|
||||
import com.ibm.icu.impl.RuleCharacterIterator;
|
||||
import com.ibm.icu.impl.SortedSetRelation;
|
||||
import com.ibm.icu.impl.StringRange;
|
||||
import com.ibm.icu.impl.UBiDiProps;
|
||||
import com.ibm.icu.impl.UCaseProps;
|
||||
import com.ibm.icu.impl.UCharacterProperty;
|
||||
@ -772,19 +773,19 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
T result, boolean escapeUnprintable, boolean includeStrings) {
|
||||
try {
|
||||
result.append('[');
|
||||
|
||||
|
||||
int count = getRangeCount();
|
||||
|
||||
|
||||
// If the set contains at least 2 intervals and includes both
|
||||
// MIN_VALUE and MAX_VALUE, then the inverse representation will
|
||||
// be more economical.
|
||||
if (count > 1 &&
|
||||
getRangeStart(0) == MIN_VALUE &&
|
||||
getRangeEnd(count-1) == MAX_VALUE) {
|
||||
|
||||
|
||||
// Emit the inverse
|
||||
result.append('^');
|
||||
|
||||
|
||||
for (int i = 1; i < count; ++i) {
|
||||
int start = getRangeEnd(i-1)+1;
|
||||
int end = getRangeStart(i)-1;
|
||||
@ -797,7 +798,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Default; emit the ranges as pairs
|
||||
else {
|
||||
for (int i = 0; i < count; ++i) {
|
||||
@ -812,7 +813,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (includeStrings && strings.size() > 0) {
|
||||
for (String s : strings) {
|
||||
result.append('{');
|
||||
@ -2431,6 +2432,21 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
return this;
|
||||
}
|
||||
|
||||
// Add constants to make the code easier to follow
|
||||
|
||||
static final int LAST0_START = 0,
|
||||
LAST1_RANGE = 1,
|
||||
LAST2_SET = 2;
|
||||
|
||||
static final int MODE0_NONE = 0,
|
||||
MODE1_INBRACKET = 1,
|
||||
MODE2_OUTBRACKET = 2;
|
||||
|
||||
static final int SETMODE0_NONE = 0,
|
||||
SETMODE1_UNICODESET = 1,
|
||||
SETMODE2_PROPERTYPAT = 2,
|
||||
SETMODE3_PREPARSED = 3;
|
||||
|
||||
/**
|
||||
* Parse the pattern from the given RuleCharacterIterator. The
|
||||
* iterator is advanced over the parsed pattern.
|
||||
@ -2465,14 +2481,15 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
|
||||
// mode: 0=before [, 1=between [...], 2=after ]
|
||||
// lastItem: 0=none, 1=char, 2=set
|
||||
int lastItem = 0, lastChar = 0, mode = 0;
|
||||
int lastItem = LAST0_START, lastChar = 0, mode = MODE0_NONE;
|
||||
char op = 0;
|
||||
|
||||
boolean invert = false;
|
||||
|
||||
clear();
|
||||
String lastString = null;
|
||||
|
||||
while (mode != 2 && !chars.atEnd()) {
|
||||
while (mode != MODE2_OUTBRACKET && !chars.atEnd()) {
|
||||
//Eclipse stated the following is "dead code"
|
||||
/*
|
||||
if (false) {
|
||||
@ -2491,9 +2508,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
// -------- Check for property pattern
|
||||
|
||||
// setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
|
||||
int setMode = 0;
|
||||
int setMode = SETMODE0_NONE;
|
||||
if (resemblesPropertyPattern(chars, opts)) {
|
||||
setMode = 2;
|
||||
setMode = SETMODE2_PROPERTYPAT;
|
||||
}
|
||||
|
||||
// -------- Parse '[' of opening delimiter OR nested set.
|
||||
@ -2511,12 +2528,12 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
literal = chars.isEscaped();
|
||||
|
||||
if (c == '[' && !literal) {
|
||||
if (mode == 1) {
|
||||
if (mode == MODE1_INBRACKET) {
|
||||
chars.setPos(backup); // backup
|
||||
setMode = 1;
|
||||
setMode = SETMODE1_UNICODESET;
|
||||
} else {
|
||||
// Handle opening '[' delimiter
|
||||
mode = 1;
|
||||
mode = MODE1_INBRACKET;
|
||||
patBuf.append('[');
|
||||
backup = chars.getPos(backup); // prepare to backup
|
||||
c = chars.next(opts);
|
||||
@ -2543,7 +2560,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
if (m != null) {
|
||||
try {
|
||||
nested = (UnicodeSet) m;
|
||||
setMode = 3;
|
||||
setMode = SETMODE3_PREPARSED;
|
||||
} catch (ClassCastException e) {
|
||||
syntaxError(chars, "Syntax error");
|
||||
}
|
||||
@ -2556,14 +2573,15 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
// previously been parsed and was looked up in the symbol
|
||||
// table.
|
||||
|
||||
if (setMode != 0) {
|
||||
if (lastItem == 1) {
|
||||
if (setMode != SETMODE0_NONE) {
|
||||
if (lastItem == LAST1_RANGE) {
|
||||
if (op != 0) {
|
||||
syntaxError(chars, "Char expected after operator");
|
||||
}
|
||||
add_unchecked(lastChar, lastChar);
|
||||
_appendToPat(patBuf, lastChar, false);
|
||||
lastItem = op = 0;
|
||||
lastItem = LAST0_START;
|
||||
op = 0;
|
||||
}
|
||||
|
||||
if (op == '-' || op == '&') {
|
||||
@ -2575,24 +2593,24 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
nested = scratch;
|
||||
}
|
||||
switch (setMode) {
|
||||
case 1:
|
||||
case SETMODE1_UNICODESET:
|
||||
nested.applyPattern(chars, symbols, patBuf, options);
|
||||
break;
|
||||
case 2:
|
||||
case SETMODE2_PROPERTYPAT:
|
||||
chars.skipIgnored(opts);
|
||||
nested.applyPropertyPattern(chars, patBuf, symbols);
|
||||
break;
|
||||
case 3: // `nested' already parsed
|
||||
case SETMODE3_PREPARSED: // `nested' already parsed
|
||||
nested._toPattern(patBuf, false);
|
||||
break;
|
||||
}
|
||||
|
||||
usePat = true;
|
||||
|
||||
if (mode == 0) {
|
||||
if (mode == MODE0_NONE) {
|
||||
// Entire pattern is a category; leave parse loop
|
||||
set(nested);
|
||||
mode = 2;
|
||||
mode = MODE2_OUTBRACKET;
|
||||
break;
|
||||
}
|
||||
|
||||
@ -2609,12 +2627,12 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
}
|
||||
|
||||
op = 0;
|
||||
lastItem = 2;
|
||||
lastItem = LAST2_SET;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (mode == 0) {
|
||||
if (mode == MODE0_NONE) {
|
||||
syntaxError(chars, "Missing '['");
|
||||
}
|
||||
|
||||
@ -2625,7 +2643,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
if (!literal) {
|
||||
switch (c) {
|
||||
case ']':
|
||||
if (lastItem == 1) {
|
||||
if (lastItem == LAST1_RANGE) {
|
||||
add_unchecked(lastChar, lastChar);
|
||||
_appendToPat(patBuf, lastChar, false);
|
||||
}
|
||||
@ -2637,11 +2655,14 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
syntaxError(chars, "Trailing '&'");
|
||||
}
|
||||
patBuf.append(']');
|
||||
mode = 2;
|
||||
mode = MODE2_OUTBRACKET;
|
||||
continue;
|
||||
case '-':
|
||||
if (op == 0) {
|
||||
if (lastItem != 0) {
|
||||
if (lastItem != LAST0_START) {
|
||||
op = (char) c;
|
||||
continue;
|
||||
} else if (lastString != null) {
|
||||
op = (char) c;
|
||||
continue;
|
||||
} else {
|
||||
@ -2651,15 +2672,15 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
literal = chars.isEscaped();
|
||||
if (c == ']' && !literal) {
|
||||
patBuf.append("-]");
|
||||
mode = 2;
|
||||
mode = MODE2_OUTBRACKET;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
syntaxError(chars, "'-' not after char or set");
|
||||
syntaxError(chars, "'-' not after char, string, or set");
|
||||
break;
|
||||
case '&':
|
||||
if (lastItem == 2 && op == 0) {
|
||||
if (lastItem == LAST2_SET && op == 0) {
|
||||
op = (char) c;
|
||||
continue;
|
||||
}
|
||||
@ -2669,14 +2690,14 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
syntaxError(chars, "'^' not after '['");
|
||||
break;
|
||||
case '{':
|
||||
if (op != 0) {
|
||||
if (op != 0 && op != '-') {
|
||||
syntaxError(chars, "Missing operand after operator");
|
||||
}
|
||||
if (lastItem == 1) {
|
||||
if (lastItem == LAST1_RANGE) {
|
||||
add_unchecked(lastChar, lastChar);
|
||||
_appendToPat(patBuf, lastChar, false);
|
||||
}
|
||||
lastItem = 0;
|
||||
lastItem = LAST0_START;
|
||||
if (buf == null) {
|
||||
buf = new StringBuilder();
|
||||
} else {
|
||||
@ -2698,9 +2719,27 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
// We have new string. Add it to set and continue;
|
||||
// we don't need to drop through to the further
|
||||
// processing
|
||||
add(buf.toString());
|
||||
String curString = buf.toString();
|
||||
if (op == '-') {
|
||||
int lastSingle = CharSequences.getSingleCodePoint(lastString == null ? "" : lastString);
|
||||
int curSingle = CharSequences.getSingleCodePoint(curString);
|
||||
if (lastSingle != Integer.MAX_VALUE && curSingle != Integer.MAX_VALUE) {
|
||||
add(lastSingle,curSingle);
|
||||
} else {
|
||||
try {
|
||||
StringRange.expand(lastString, curString, true, strings);
|
||||
} catch (Exception e) {
|
||||
syntaxError(chars, e.getMessage());
|
||||
}
|
||||
}
|
||||
lastString = null;
|
||||
op = 0;
|
||||
} else {
|
||||
add(curString);
|
||||
lastString = curString;
|
||||
}
|
||||
patBuf.append('{');
|
||||
_appendToPat(patBuf, buf.toString(), false);
|
||||
_appendToPat(patBuf, curString, false);
|
||||
patBuf.append('}');
|
||||
continue;
|
||||
case SymbolTable.SYMBOL_REF:
|
||||
@ -2720,14 +2759,14 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
break; // literal '$'
|
||||
}
|
||||
if (anchor && op == 0) {
|
||||
if (lastItem == 1) {
|
||||
if (lastItem == LAST1_RANGE) {
|
||||
add_unchecked(lastChar, lastChar);
|
||||
_appendToPat(patBuf, lastChar, false);
|
||||
}
|
||||
add_unchecked(UnicodeMatcher.ETHER);
|
||||
usePat = true;
|
||||
patBuf.append(SymbolTable.SYMBOL_REF).append(']');
|
||||
mode = 2;
|
||||
mode = MODE2_OUTBRACKET;
|
||||
continue;
|
||||
}
|
||||
syntaxError(chars, "Unquoted '$'");
|
||||
@ -2742,12 +2781,19 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
// ("a").
|
||||
|
||||
switch (lastItem) {
|
||||
case 0:
|
||||
lastItem = 1;
|
||||
case LAST0_START:
|
||||
if (op == '-' && lastString != null) {
|
||||
syntaxError(chars, "Invalid range");
|
||||
}
|
||||
lastItem = LAST1_RANGE;
|
||||
lastChar = c;
|
||||
lastString = null;
|
||||
break;
|
||||
case 1:
|
||||
case LAST1_RANGE:
|
||||
if (op == '-') {
|
||||
if (lastString != null) {
|
||||
syntaxError(chars, "Invalid range");
|
||||
}
|
||||
if (lastChar >= c) {
|
||||
// Don't allow redundant (a-a) or empty (b-a) ranges;
|
||||
// these are most likely typos.
|
||||
@ -2757,24 +2803,25 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
_appendToPat(patBuf, lastChar, false);
|
||||
patBuf.append(op);
|
||||
_appendToPat(patBuf, c, false);
|
||||
lastItem = op = 0;
|
||||
lastItem = LAST0_START;
|
||||
op = 0;
|
||||
} else {
|
||||
add_unchecked(lastChar, lastChar);
|
||||
_appendToPat(patBuf, lastChar, false);
|
||||
lastChar = c;
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
case LAST2_SET:
|
||||
if (op != 0) {
|
||||
syntaxError(chars, "Set expected after operator");
|
||||
}
|
||||
lastChar = c;
|
||||
lastItem = 1;
|
||||
lastItem = LAST1_RANGE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (mode != 2) {
|
||||
if (mode != MODE2_OUTBRACKET) {
|
||||
syntaxError(chars, "Missing ']'");
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2014, International Business Machines Corporation and
|
||||
* Copyright (C) 1996-2015, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -2611,7 +2611,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
assertEquals("CharSequence remove", new UnicodeSet("[Aa-c{qr}]"), new UnicodeSet("[a-cA{abc}{qr}]").remove(new StringBuilder("abc")) );
|
||||
assertEquals("CharSequence complement", new UnicodeSet("[Aa-c{qr}]"), new UnicodeSet("[a-cA{abc}{qr}]").complement(new StringBuilder("abc")) );
|
||||
assertEquals("CharSequence complement", new UnicodeSet("[Aa-c{abc}{qr}]"), new UnicodeSet("[a-cA{qr}]").complement(new StringBuilder("abc")) );
|
||||
|
||||
|
||||
assertEquals("CharSequence addAll", new UnicodeSet("[a-cABC]"), new UnicodeSet("[a-cA]").addAll(new StringBuilder("ABC")) );
|
||||
assertEquals("CharSequence retainAll", new UnicodeSet("[a-c]"), new UnicodeSet("[a-cA]").retainAll(new StringBuilder("abcB")) );
|
||||
assertEquals("CharSequence removeAll", new UnicodeSet("[Aab]"), new UnicodeSet("[a-cA]").removeAll(new StringBuilder("cC")) );
|
||||
@ -2621,7 +2621,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
assertEquals("CharSequence contains", true, new UnicodeSet("[a-cA{ab}]"). contains(new StringBuilder("ab")) );
|
||||
assertEquals("CharSequence containsNone", false, new UnicodeSet("[a-cA]"). containsNone(new StringBuilder("ab")) );
|
||||
assertEquals("CharSequence containsSome", true, new UnicodeSet("[a-cA{ab}]"). containsSome(new StringBuilder("ab")) );
|
||||
|
||||
|
||||
// spanning
|
||||
assertEquals("CharSequence span", 3, new UnicodeSet("[a-cA]"). span(new StringBuilder("abc"), SpanCondition.SIMPLE) );
|
||||
assertEquals("CharSequence span", 3, new UnicodeSet("[a-cA]"). span(new StringBuilder("abc"), 1, SpanCondition.SIMPLE) );
|
||||
@ -2636,4 +2636,34 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
assertEquals("CharSequence findLastIn", -1, new UnicodeSet("[a-cA]"). findLastIn(new StringBuilder("abc"), 1, true) );
|
||||
assertEquals("CharSequence add", "c", new UnicodeSet("[abA]"). stripFrom(new StringBuilder("abc"), true));
|
||||
}
|
||||
|
||||
public void TestAStringRange() {
|
||||
String[][] tests = {
|
||||
{"[{ax}-{bz}]", "[{ax}{ay}{az}{bx}{by}{bz}]"},
|
||||
{"[{a}-{c}]", "[a-c]"},
|
||||
//{"[a-{c}]", "[a-c]"}, // don't handle these yet: enable once we do
|
||||
//{"[{a}-c]", "[a-c]"}, // don't handle these yet: enable once we do
|
||||
{"[{ax}-{by}-{cz}]", "Error: '-' not after char, string, or set at \"[{ax}-{by}-{|cz}]\""},
|
||||
{"[{a}-{bz}]", "Error: Range must have equal-length strings at \"[{a}-{bz}|]\""},
|
||||
{"[{ax}-{b}]", "Error: Range must have equal-length strings at \"[{ax}-{b}|]\""},
|
||||
{"[{ax}-bz]", "Error: Invalid range at \"[{ax}-b|z]\""},
|
||||
{"[ax-{bz}]", "Error: Range must have 2 valid strings at \"[ax-{bz}|]\""},
|
||||
{"[{bx}-{az}]", "Error: Range must have xᵢ ≤ yᵢ for each index i at \"[{bx}-{az}|]\""},
|
||||
};
|
||||
int i = 0;
|
||||
for (String[] test : tests) {
|
||||
String expected = test[1];
|
||||
if (test[1].startsWith("[")) {
|
||||
expected = new UnicodeSet(expected).toPattern(false);
|
||||
}
|
||||
String actual;
|
||||
try {
|
||||
actual = new UnicodeSet(test[0]).toPattern(false);
|
||||
} catch (Exception e) {
|
||||
actual = e.getMessage();
|
||||
}
|
||||
assertEquals("StringRange " + i, expected, actual);
|
||||
++i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user