From f7c551d636f96de30cf657845e4c74ab7a42f1ed Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Mon, 18 Aug 2014 12:58:44 +0000 Subject: [PATCH] ICU-9131 update trunk from branch, after fixes as per core review. X-SVN-Rev: 36187 --- .../core/src/com/ibm/icu/impl/BMPSet.java | 47 +- .../ibm/icu/impl/UnicodeSetStringSpan.java | 588 ++++++++++++------ .../core/src/com/ibm/icu/text/UTF16.java | 57 +- .../core/src/com/ibm/icu/text/UnicodeSet.java | 295 ++++++--- .../com/ibm/icu/text/UnicodeSetSpanner.java | 333 ++++++++++ .../core/src/com/ibm/icu/util/OutputInt.java | 58 ++ .../com/ibm/icu/dev/test/lang/UTF16Test.java | 37 +- .../test/lang/UnicodeSetStringSpanTest.java | 137 ++-- .../ibm/icu/dev/test/lang/UnicodeSetTest.java | 309 +++++---- 9 files changed, 1408 insertions(+), 453 deletions(-) create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSetSpanner.java create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/util/OutputInt.java diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/BMPSet.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/BMPSet.java index 4b61bb526b..0125da650a 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/BMPSet.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/BMPSet.java @@ -1,7 +1,7 @@ /* ****************************************************************************** * - * Copyright (C) 2009-2011, International Business Machines + * Copyright (C) 2009-2014, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** @@ -10,23 +10,25 @@ package com.ibm.icu.impl; import com.ibm.icu.text.UnicodeSet.SpanCondition; +import com.ibm.icu.util.OutputInt; -/* +/** * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points. * - * Latin-1: Look up bytes. 2-byte characters: Bits organized vertically. 
3-byte characters: Use zero/one/mixed data - * per 64-block in U+0000..U+FFFF, with mixed for illegal ranges. Supplementary characters: Call contains() on the - * parent set. + * Latin-1: Look up bytes. + * 2-byte characters: Bits organized vertically. + * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges. + * Supplementary characters: Call contains() on the parent set. */ public final class BMPSet { public static int U16_SURROGATE_OFFSET = ((0xd800 << 10) + 0xdc00 - 0x10000); - /* + /** * One boolean ('true' or 'false') per Latin-1 character. */ private boolean[] latin1Contains; - /* + /** * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6} * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead) @@ -36,7 +38,7 @@ public final class BMPSet { */ private int[] table7FF; - /* + /** * One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks * correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12} * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit @@ -48,14 +50,14 @@ public final class BMPSet { */ private int[] bmpBlockBits; - /* + /** * Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000, * U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are * always looked up in the bit tables. The last pair of indexes is for finding supplementary code points. */ private int[] list4kStarts; - /* + /** * The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for * supplementary code points. The list is terminated with list[listLength-1]=0x110000. 
*/ @@ -120,22 +122,24 @@ public final class BMPSet { } } - /* + /** * Span the initial substring for which each character c has spanCondition==contains(c). It must be * spanCondition==0 or 1. * * @param start The start index - * @param end The end index - * @return The length of the span. + * @param outCount If not null: Receives the number of code points in the span. + * @return the limit (exclusive end) of the span * * NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for * sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points * as usual in ICU. */ - public final int span(CharSequence s, int start, int end, SpanCondition spanCondition) { + public final int span(CharSequence s, int start, SpanCondition spanCondition, + OutputInt outCount) { char c, c2; int i = start; - int limit = Math.min(s.length(), end); + int limit = s.length(); + int numSupplementary = 0; if (SpanCondition.NOT_CONTAINED != spanCondition) { // span while (i < limit) { @@ -170,6 +174,7 @@ public final class BMPSet { if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { break; } + ++numSupplementary; ++i; } ++i; @@ -208,15 +213,20 @@ public final class BMPSet { if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { break; } + ++numSupplementary; ++i; } ++i; } } - return i - start; + if (outCount != null) { + int spanLength = i - start; + outCount.value = spanLength - numSupplementary; // number of code points + } + return i; } - /* + /** * Symmetrical with span(). * Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >= * limit and spanCondition==0 or 1. 
@@ -226,7 +236,6 @@ public final class BMPSet { public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) { char c, c2; - limit = Math.min(s.length(), limit); if (SpanCondition.NOT_CONTAINED != spanCondition) { // span for (;;) { @@ -311,7 +320,7 @@ public final class BMPSet { return limit + 1; } - /* + /** * Set bits in a bit rectangle in "vertical" bit organization. start=0xfe. + /** The spanLength is >=0xfe. */ static final short LONG_SPAN = ALL_CP_CONTAINED - 1; - // Set for span(). Same as parent but without strings. + /** Set for span(). Same as parent but without strings. */ private UnicodeSet spanSet; - // Set for span(not contained). - // Same as spanSet, plus characters that start or end strings. + /** + * Set for span(not contained). + * Same as spanSet, plus characters that start or end strings. + */ private UnicodeSet spanNotSet; - // The strings of the parent set. + /** The strings of the parent set. */ private ArrayList strings; - // the lengths of span(), spanBack() etc. for each string. + /** The lengths of span(), spanBack() etc. for each string. */ private short[] spanLengths; - // Maximum lengths of relevant strings. + /** Maximum lengths of relevant strings. */ private int maxLength16; - // Set up for all variants of span()? + /** Are there strings that are not fully contained in the code point set? */ + private boolean someRelevant; + + /** Set up for all variants of span()? */ private boolean all; - // Span helper + /** Span helper */ private OffsetList offsets; - // Construct for all variants of span(), or only for any one variant. - // Initialize as little as possible, for single use. + /** + * Constructs for all variants of span(), or only for any one variant. + * Initializes as little as possible, for single use. 
+ */ public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList setStrings, int which) { spanSet = new UnicodeSet(0, 0x10ffff); + // TODO: With Java 6, just take the parent set's strings as is, + // as a NavigableSet, rather than as an ArrayList copy of the set of strings. + // Then iterate via the first() and higher() methods. + // (We do not want to create multiple Iterator objects in each span().) + // See ICU ticket #7454. strings = setStrings; all = (which == ALL); spanSet.retainAll(set); @@ -90,7 +106,7 @@ public class UnicodeSetStringSpan { int stringsLength = strings.size(); int i, spanLength; - boolean someRelevant = false; + someRelevant = false; for (i = 0; i < stringsLength; ++i) { String string = strings.get(i); int length16 = string.length(); @@ -98,12 +114,11 @@ public class UnicodeSetStringSpan { if (spanLength < length16) { // Relevant string. someRelevant = true; } - if ((0 != (which & UTF16)) && length16 > maxLength16) { + if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) { maxLength16 = length16; } } - if (!someRelevant) { - maxLength16 = 0; + if (!someRelevant && (which & WITH_COUNT) == 0) { return; } @@ -140,7 +155,7 @@ public class UnicodeSetStringSpan { int length16 = string.length(); spanLength = spanSet.span(string, SpanCondition.CONTAINED); if (spanLength < length16) { // Relevant string. - if (0 != (which & UTF16)) { + if (true /* 0 != (which & UTF16) */) { if (0 != (which & CONTAINED)) { if (0 != (which & FWD)) { spanLengths[i] = makeSpanLengthByte(spanLength); @@ -188,10 +203,12 @@ public class UnicodeSetStringSpan { * Constructs a copy of an existing UnicodeSetStringSpan. * Assumes which==ALL for a frozen set. 
*/ - public UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan, final ArrayList newParentSetStrings) { + public UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan, + final ArrayList newParentSetStrings) { spanSet = otherStringSpan.spanSet; strings = newParentSetStrings; maxLength16 = otherStringSpan.maxLength16; + someRelevant = otherStringSpan.someRelevant; all = true; if (otherStringSpan.spanNotSet == otherStringSpan.spanSet) { spanNotSet = spanSet; @@ -203,22 +220,25 @@ public class UnicodeSetStringSpan { spanLengths = otherStringSpan.spanLengths.clone(); } - /* + /** * Do the strings need to be checked in span() etc.? * - * @return TRUE if strings need to be checked (call span() here), FALSE if not (use a BMPSet for best performance). + * @return true if strings need to be checked (call span() here), + * false if not (use a BMPSet for best performance). */ public boolean needsStringSpanUTF16() { - return (maxLength16 != 0); + return someRelevant; } - // For fast UnicodeSet::contains(c). + /** For fast UnicodeSet::contains(c). */ public boolean contains(int c) { return spanSet.contains(c); } - // Add a starting or ending string character to the spanNotSet - // so that a character span ends before any string. + /** + * Adds a starting or ending string character to the spanNotSet + * so that a character span ends before any string. 
+ */ private void addToSpanNotSet(int c) { if (spanNotSet == null || spanNotSet == spanSet) { if (spanSet.contains(c)) { @@ -230,12 +250,14 @@ public class UnicodeSetStringSpan { } /* - * Note: In span() when spanLength==0 (after a string match, or at the beginning after an empty code point span) and - * in spanNot() and spanNotUTF8(), string matching could use a binary search because all string matches are done + * Note: In span() when spanLength==0 + * (after a string match, or at the beginning after an empty code point span) + * and in spanNot() and spanNotUTF8(), + * string matching could use a binary search because all string matches are done * from the same start index. - * + * * For UTF-8, this would require a comparison function that returns UTF-16 order. - * + * * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets * with strings have very few very short strings. For cases with many strings, it might be better to use a different * API and implementation with a DFA (state machine). @@ -244,84 +266,119 @@ public class UnicodeSetStringSpan { /* * Algorithm for span(SpanCondition.CONTAINED) * - * Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there - * is in the set, then remember to continue after it. + If a set string matches at the current position, then - * remember to continue after it. + Either recursively span for each code point or string match, or recursively span - * for all but the shortest one and iteratively continue the span with the shortest local match. + Remember the - * longest recursive span (the farthest end point). + If there is no match at the current position, neither for the - * code point there nor for any set string, then stop and return the longest recursive span length. 
- * + * Theoretical algorithm: + * - Iterate through the string, and at each code point boundary: + * + If the code point there is in the set, then remember to continue after it. + * + If a set string matches at the current position, then remember to continue after it. + * + Either recursively span for each code point or string match, or recursively span + * for all but the shortest one and iteratively continue the span with the shortest local match. + * + Remember the longest recursive span (the farthest end point). + * + If there is no match at the current position, + * neither for the code point there nor for any set string, + * then stop and return the longest recursive span length. + * * Optimized implementation: - * - * (We assume that most sets will have very few very short strings. A span using a string-less set is extremely - * fast.) - * - * Create and cache a spanSet which contains all of the single code points of the original set but none of its - * strings. - * - * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). - Loop: + Try to match each set - * string at the end of the spanLength. ~ Set strings that start with set-contained code points must be matched with - * a partial overlap because the recursive algorithm would have tried to match them at every position. ~ Set strings - * that entirely consist of set-contained code points are irrelevant for span(SpanCondition.CONTAINED) - * because the recursive algorithm would continue after them anyway and find the longest recursive match from their - * end. ~ Rather than recursing, note each end point of a set string match. + If no set string matched after - * spanSet.span(), then return with where the spanSet.span() ended. + If at least one set string matched after - * spanSet.span(), then pop the shortest string match end point and continue the loop, trying to match all set - * strings from there. 
+ If at least one more set string matched after a previous string match, then test if the - * code point after the previous string match is also contained in the set. Continue the loop with the shortest end - * point of either this code point or a matching set string. + If no more set string matched after a previous string - * match, then try another spanLength=spanSet.span(SpanCondition.CONTAINED). Stop if spanLength==0, - * otherwise continue the loop. - * + * + * (We assume that most sets will have very few very short strings. + * A span using a string-less set is extremely fast.) + * + * Create and cache a spanSet which contains all of the single code points of the original set + * but none of its strings. + * + * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). + * - Loop: + * + Try to match each set string at the end of the spanLength. + * ~ Set strings that start with set-contained code points + * must be matched with a partial overlap + * because the recursive algorithm would have tried to match them at every position. + * ~ Set strings that entirely consist of set-contained code points + * are irrelevant for span(SpanCondition.CONTAINED) + * because the recursive algorithm would continue after them anyway and + * find the longest recursive match from their end. + * ~ Rather than recursing, note each end point of a set string match. + * + If no set string matched after spanSet.span(), + * then return with where the spanSet.span() ended. + * + If at least one set string matched after spanSet.span(), + * then pop the shortest string match end point and continue the loop, + * trying to match all set strings from there. + * + If at least one more set string matched after a previous string match, then test if the + * code point after the previous string match is also contained in the set. + * Continue the loop with the shortest end point of + * either this code point or a matching set string. 
+ * + If no more set string matched after a previous string match, + * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). + * Stop if spanLength==0, otherwise continue the loop. + * * By noting each end point of a set string match, the function visits each string position at most once and * finishes in linear time. - * - * The recursive algorithm may visit the same string position many times if multiple paths lead to it and finishes - * in exponential time. + * + * The recursive algorithm may visit the same string position many times + * if multiple paths lead to it and finishes in exponential time. */ /* * Algorithm for span(SIMPLE) * - * Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there - * is in the set, then remember to continue after it. + If a set string matches at the current position, then - * remember to continue after it. + Continue from the farthest match position and ignore all others. + If there is - * no match at the current position, then stop and return the current position. - * + * Theoretical algorithm: + * - Iterate through the string, and at each code point boundary: + * + If the code point there is in the set, then remember to continue after it. + * + If a set string matches at the current position, then remember to continue after it. + * + Continue from the farthest match position and ignore all others. + * + If there is no match at the current position, then stop and return the current position. + * * Optimized implementation: - * + * * (Same assumption and spanSet as above.) - * - * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). - Loop: + Try to match each set - * string at the end of the spanLength. ~ Set strings that start with set-contained code points must be matched with - * a partial overlap because the standard algorithm would have tried to match them earlier. 
~ Set strings that - * entirely consist of set-contained code points must be matched with a full overlap because the longest-match - * algorithm would hide set string matches that end earlier. Such set strings need not be matched earlier inside the - * code point span because the standard algorithm would then have continued after the set string match anyway. ~ - * Remember the longest set string match (farthest end point) from the earliest starting point. + If no set string - * matched after spanSet.span(), then return with where the spanSet.span() ended. + If at least one set string - * matched, then continue the loop after the longest match from the earliest position. + If no more set string - * matched after a previous string match, then try another - * spanLength=spanSet.span(SpanCondition.CONTAINED). Stop if spanLength==0, otherwise continue the - * loop. + * + * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). + * - Loop: + * + Try to match each set string at the end of the spanLength. + * ~ Set strings that start with set-contained code points + * must be matched with a partial overlap + * because the standard algorithm would have tried to match them earlier. + * ~ Set strings that entirely consist of set-contained code points + * must be matched with a full overlap because the longest-match algorithm + * would hide set string matches that end earlier. + * Such set strings need not be matched earlier inside the code point span + * because the standard algorithm would then have + * continued after the set string match anyway. + * ~ Remember the longest set string match (farthest end point) + * from the earliest starting point. + * + If no set string matched after spanSet.span(), + * then return with where the spanSet.span() ended. + * + If at least one set string matched, + * then continue the loop after the longest match from the earliest position. 
+ * + If no more set string matched after a previous string match, + * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). + * Stop if spanLength==0, otherwise continue the loop. */ /** - * Span a string. + * Spans a string. * * @param s The string to be spanned * @param start The start index that the span begins * @param spanCondition The span condition - * @return the length of the span + * @return the limit (exclusive end) of the span */ - public synchronized int span(CharSequence s, int start, int length, SpanCondition spanCondition) { + public int span(CharSequence s, int start, SpanCondition spanCondition) { if (spanCondition == SpanCondition.NOT_CONTAINED) { - return spanNot(s, start, length); + return spanNot(s, start, null); } - int spanLength = spanSet.span(s.subSequence(start, start + length), SpanCondition.CONTAINED); - if (spanLength == length) { - return length; + int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED); + if (spanLimit == s.length()) { + return spanLimit; } + return spanWithStrings(s, start, spanLimit, spanCondition); + } + /** + * Synchronized method for complicated spans using the offsets. + * Avoids synchronization for simple cases. + * + * @param spanLimit = spanSet.span(s, start, CONTAINED) + */ + private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit, + SpanCondition spanCondition) { // Consider strings; they may overlap with the span. 
int initSize = 0; if (spanCondition == SpanCondition.CONTAINED) { @@ -329,7 +386,9 @@ public class UnicodeSetStringSpan { initSize = maxLength16; } offsets.setMaxLength(initSize); - int pos = start + spanLength, rest = length - spanLength; + int length = s.length(); + int pos = spanLimit, rest = length - spanLimit; + int spanLength = spanLimit - start; int i, stringsLength = strings.size(); for (;;) { if (spanCondition == SpanCondition.CONTAINED) { @@ -429,7 +488,7 @@ public class UnicodeSetStringSpan { // Otherwise, an unlimited code point span is only tried again when no // strings match, and if such a non-initial span fails we stop. if (offsets.isEmpty()) { - return pos - start; // No strings matched after a span. + return pos; // No strings matched after a span. } // Match strings from after the next string match. } else { @@ -437,11 +496,12 @@ public class UnicodeSetStringSpan { if (offsets.isEmpty()) { // No more strings matched after a previous string match. // Try another code point span from after the last string match. - spanLength = spanSet.span(s.subSequence(pos, pos + rest), SpanCondition.CONTAINED); + spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED); + spanLength = spanLimit - pos; if (spanLength == rest || // Reached the end of the string, or spanLength == 0 // neither strings nor span progressed. ) { - return pos + spanLength - start; + return spanLimit; } pos += spanLength; rest -= spanLength; @@ -467,13 +527,110 @@ public class UnicodeSetStringSpan { // Match strings from after the next string match. } } - int minOffset = offsets.popMinimum(); + int minOffset = offsets.popMinimum(null); pos += minOffset; rest -= minOffset; spanLength = 0; // Match strings from after a string match. } } + /** + * Spans a string and counts the smallest number of set elements on any path across the span. + * + *

For proper counting, we cannot ignore strings that are fully contained in code point spans. + * + *

If the set does not have any fully-contained strings, then we could optimize this + * like span(), but such sets are likely rare, and this is at least still linear. + * + * @param s The string to be spanned + * @param start The start index that the span begins + * @param spanCondition The span condition + * @param outCount The count + * @return the limit (exclusive end) of the span + */ + public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, + OutputInt outCount) { + if (spanCondition == SpanCondition.NOT_CONTAINED) { + return spanNot(s, start, outCount); + } + // Consider strings; they may overlap with the span, + // and they may result in a smaller count than with just code points. + if (spanCondition == SpanCondition.CONTAINED) { + return spanContainedAndCount(s, start, outCount); + } + // SIMPLE (not synchronized, does not use offsets) + int stringsLength = strings.size(); + int length = s.length(); + int pos = start; + int rest = length - start; + int count = 0; + while (rest != 0) { + // Try to match the next code point. + int cpLength = spanOne(spanSet, s, pos, rest); + int maxInc = (cpLength > 0) ? cpLength : 0; + // Try to match all of the strings. + for (int i = 0; i < stringsLength; ++i) { + String string = strings.get(i); + int length16 = string.length(); + if (maxInc < length16 && length16 <= rest && + matches16CPB(s, pos, length, string, length16)) { + maxInc = length16; + } + } + // We are done if there is no match beyond pos. + if (maxInc == 0) { + outCount.value = count; + return pos; + } + // Continue from the longest match. + ++count; + pos += maxInc; + rest -= maxInc; + } + outCount.value = count; + return pos; + } + + private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) { + // Use offset list to try all possibilities. 
+ offsets.setMaxLength(maxLength16); + int stringsLength = strings.size(); + int length = s.length(); + int pos = start; + int rest = length - start; + int count = 0; + while (rest != 0) { + // Try to match the next code point. + int cpLength = spanOne(spanSet, s, pos, rest); + if (cpLength > 0) { + offsets.addOffsetAndCount(cpLength, count + 1); + } + // Try to match all of the strings. + for (int i = 0; i < stringsLength; ++i) { + String string = strings.get(i); + int length16 = string.length(); + // Note: If the strings were sorted by length, then we could also + // avoid trying to match if there is already a match of the same length. + if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) && + matches16CPB(s, pos, length, string, length16)) { + offsets.addOffsetAndCount(length16, count + 1); + } + } + // We are done if there is no match beyond pos. + if (offsets.isEmpty()) { + outCount.value = count; + return pos; + } + // Continue from the nearest match. + int minOffset = offsets.popMinimum(outCount); + count = outCount.value; + pos += minOffset; + rest -= minOffset; + } + outCount.value = count; + return pos; + } + /** * Span a string backwards. * @@ -638,59 +795,72 @@ public class UnicodeSetStringSpan { // Match strings from before the next string match. } } - pos -= offsets.popMinimum(); + pos -= offsets.popMinimum(null); spanLength = 0; // Match strings from before a string match. } } - /* + /** * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED) * - * Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there - * is in the set, then return with the current position. + If a set string matches at the current position, then - * return with the current position. - * + * Theoretical algorithm: + * - Iterate through the string, and at each code point boundary: + * + If the code point there is in the set, then return with the current position. 
+ * + If a set string matches at the current position, then return with the current position. + * * Optimized implementation: - * + * * (Same assumption as for span() above.) - * - * Create and cache a spanNotSet which contains all of the single code points of the original set but none of its - * strings. For each set string add its initial code point to the spanNotSet. (Also add its final code point for - * spanNotBack().) - * + * + * Create and cache a spanNotSet which contains + * all of the single code points of the original set but none of its strings. + * For each set string add its initial code point to the spanNotSet. + * (Also add its final code point for spanNotBack().) + * * - Loop: * + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED). * + If the current code point is in the original set, then return the current position. * + If any set string matches at the current position, then return the current position. * + If there is no match at the current position, neither for the code point - * there nor for any set string, then skip this code point and continue the loop. This happens for - * set-string-initial code points that were added to spanNotSet when there is not actually a match for such a set - * string. + * there nor for any set string, then skip this code point and continue the loop. + * This happens for set-string-initial code points that were added to spanNotSet + * when there is not actually a match for such a set string. * - * @return the length of the span + * @param s The string to be spanned + * @param start The start index that the span begins + * @param outCount If not null: Receives the number of code points across the span. 
+ * @return the limit (exclusive end) of the span */ - private int spanNot(CharSequence s, int start, int length) { - int pos = start, rest = length; - int i, stringsLength = strings.size(); + private int spanNot(CharSequence s, int start, OutputInt outCount) { + int length = s.length(); + int pos = start, rest = length - start; + int stringsLength = strings.size(); + int count = 0; do { // Span until we find a code point from the set, // or a code point that starts or ends some string. - i = spanNotSet.span(s.subSequence(pos, pos + rest), SpanCondition.NOT_CONTAINED); - if (i == rest) { + int spanLimit; + if (outCount == null) { + spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED); + } else { + spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount); + outCount.value = count = count + outCount.value; + } + if (spanLimit == length) { return length; // Reached the end of the string. } - pos += i; - rest -= i; + pos = spanLimit; + rest = length - spanLimit; // Check whether the current code point is in the original set, // without the string starts and ends. int cpLength = spanOne(spanSet, s, pos, rest); if (cpLength > 0) { - return pos - start; // There is a set element at pos. + return pos; // There is a set element at pos. } // Try to match the strings at pos. - for (i = 0; i < stringsLength; ++i) { + for (int i = 0; i < stringsLength; ++i) { if (spanLengths[i] == ALL_CP_CONTAINED) { continue; // Irrelevant string. } @@ -698,7 +868,7 @@ public class UnicodeSetStringSpan { int length16 = string.length(); if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) { - return pos - start; // There is a set element at pos. + return pos; // There is a set element at pos. } } @@ -707,7 +877,11 @@ public class UnicodeSetStringSpan { // cpLength<0 pos -= cpLength; rest += cpLength; + ++count; } while (rest != 0); + if (outCount != null) { + outCount.value = count; + } return length; // Reached the end of the string. 
} @@ -773,20 +947,24 @@ public class UnicodeSetStringSpan { * Compare 16-bit Unicode strings (which may be malformed UTF-16) * at code point boundaries. * That is, each edge of a match must not be in the middle of a surrogate pair. + * @param s The string to match in. * @param start The start index of s. - * @param slength The length of s from start. + * @param limit The limit of the subsequence of s being spanned. + * @param t The substring to be matched in s. * @param tlength The length of t. */ - static boolean matches16CPB(CharSequence s, int start, int slength, final String t, int tlength) { - return !(0 < start && com.ibm.icu.text.UTF16.isLeadSurrogate (s.charAt(start - 1)) && - com.ibm.icu.text.UTF16.isTrailSurrogate(s.charAt(start + 0))) - && !(tlength < slength && com.ibm.icu.text.UTF16.isLeadSurrogate (s.charAt(start + tlength - 1)) && - com.ibm.icu.text.UTF16.isTrailSurrogate(s.charAt(start + tlength))) - && matches16(s, start, t, tlength); + static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) { + return matches16(s, start, t, tlength) + && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) && + Character.isLowSurrogate(s.charAt(start))) + && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) && + Character.isLowSurrogate(s.charAt(start + tlength))); } - // Does the set contain the next code point? - // If so, return its length; otherwise return its negative length. + /** + * Does the set contain the next code point? + * If so, return its length; otherwise return its negative length. + */ static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) { char c = s.charAt(start); if (c >= 0xd800 && c <= 0xdbff && length >= 2) { @@ -811,47 +989,57 @@ public class UnicodeSetStringSpan { return set.contains(c) ? 1 : -1; } - - /* + /** * Helper class for UnicodeSetStringSpan. 
* - * List of offsets from the current position from where to try matching a code point or a string. Store offsets rather - * than indexes to simplify the code and use the same list for both increments (in span()) and decrements (in - * spanBack()). - * - * Assumption: The maximum offset is limited, and the offsets that are stored at any one time are relatively dense, that - * is, there are normally no gaps of hundreds or thousands of offset values. - * - * The implementation uses a circular buffer of byte flags, each indicating whether the corresponding offset is in the - * list. This avoids inserting into a sorted list of offsets (or absolute indexes) and physically moving part of the - * list. - * - * Note: In principle, the caller should setMaxLength() to the maximum of the max string length and U16_LENGTH/U8_LENGTH + *

List of offsets from the current position from where to try matching + * a code point or a string. + * Stores offsets rather than indexes to simplify the code and use the same list + * for both increments (in span()) and decrements (in spanBack()). + * + *

Assumption: The maximum offset is limited, and the offsets that are stored at any one time + * are relatively dense, that is, + * there are normally no gaps of hundreds or thousands of offset values. + * + *

This class optionally also tracks the minimum non-negative count for each position, + * intended to count the smallest number of elements of any path leading to that position. + * + *

The implementation uses a circular buffer of count integers, + * each indicating whether the corresponding offset is in the list, + * and its path element count. + * This avoids inserting into a sorted list of offsets (or absolute indexes) + * and physically moving part of the list. + * + *

Note: In principle, the caller should setMaxLength() to + * the maximum of the max string length and U16_LENGTH/U8_LENGTH * to account for "long" single code points. - * - * Note: If maxLength were guaranteed to be no more than 32 or 64, the list could be stored as bit flags in a single - * integer. Rather than handling a circular buffer with a start list index, the integer would simply be shifted when - * lower offsets are removed. UnicodeSet does not have a limit on the lengths of strings. + * + *

Note: An earlier version did not track counts and stored only byte flags. + * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64, + * the list could be stored as bit flags in a single integer. + * Rather than handling a circular buffer with a start list index, + * the integer would simply be shifted when lower offsets are removed. + * UnicodeSet does not have a limit on the lengths of strings. */ - static class OffsetList { - private boolean[] list; + private static final class OffsetList { + private int[] list; private int length; private int start; public OffsetList() { - list = new boolean[16]; // default size + list = new int[16]; // default size } public void setMaxLength(int maxLength) { if (maxLength > list.length) { - list = new boolean[maxLength]; + list = new int[maxLength]; } clear(); } public void clear() { for (int i = list.length; i-- > 0;) { - list[i] = false; + list[i] = 0; } start = length = 0; } @@ -860,55 +1048,97 @@ public class UnicodeSetStringSpan { return (length == 0); } - // Reduce all stored offsets by delta, used when the current position - // moves by delta. - // There must not be any offsets lower than delta. - // If there is an offset equal to delta, it is removed. - // delta=[1..maxLength] + /** + * Reduces all stored offsets by delta, used when the current position moves by delta. + * There must not be any offsets lower than delta. + * If there is an offset equal to delta, it is removed. + * + * @param delta [1..maxLength] + */ public void shift(int delta) { int i = start + delta; if (i >= list.length) { i -= list.length; } - if (list[i]) { - list[i] = false; + if (list[i] != 0) { + list[i] = 0; --length; } start = i; } - // Add an offset. The list must not contain it yet. - // offset=[1..maxLength] + /** + * Adds an offset. The list must not contain it yet. 
+ * @param offset [1..maxLength] + */ public void addOffset(int offset) { int i = start + offset; if (i >= list.length) { i -= list.length; } - list[i] = true; + assert list[i] == 0; + list[i] = 1; ++length; } - // offset=[1..maxLength] + /** + * Adds an offset and updates its count. + * The list may already contain the offset. + * @param offset [1..maxLength] + */ + public void addOffsetAndCount(int offset, int count) { + assert count > 0; + int i = start + offset; + if (i >= list.length) { + i -= list.length; + } + if (list[i] == 0) { + list[i] = count; + ++length; + } else if (count < list[i]) { + list[i] = count; + } + } + + /** + * @param offset [1..maxLength] + */ public boolean containsOffset(int offset) { int i = start + offset; if (i >= list.length) { i -= list.length; } - return list[i]; + return list[i] != 0; } - // Find the lowest stored offset from a non-empty list, remove it, - // and reduce all other offsets by this minimum. - // Returns [1..maxLength]. - public int popMinimum() { + /** + * @param offset [1..maxLength] + */ + public boolean hasCountAtOffset(int offset, int count) { + int i = start + offset; + if (i >= list.length) { + i -= list.length; + } + int oldCount = list[i]; + return oldCount != 0 && oldCount <= count; + } + + /** + * Finds the lowest stored offset from a non-empty list, removes it, + * and reduces all other offsets by this minimum. + * @return min=[1..maxLength] + */ + public int popMinimum(OutputInt outCount) { // Look for the next offset in list[start+1..list.length-1]. int i = start, result; while (++i < list.length) { - if (list[i]) { - list[i] = false; + int count = list[i]; + if (count != 0) { + list[i] = 0; --length; result = i - start; start = i; + if (outCount != null) { outCount.value = count; } return result; } } @@ -918,12 +1148,14 @@ public class UnicodeSetStringSpan { // Since the list is not empty, there will be one. 
result = list.length - start; i = 0; - while (!list[i]) { + int count; + while ((count = list[i]) == 0) { ++i; } - list[i] = false; + list[i] = 0; --length; start = i; + if (outCount != null) { outCount.value = count; } return result + i; } } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UTF16.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UTF16.java index bb2d3d5005..0f3c77a7ac 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/UTF16.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UTF16.java @@ -1,6 +1,6 @@ /** ******************************************************************************* - * Copyright (C) 1996-2012, International Business Machines Corporation and + * Copyright (C) 1996-2014, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ @@ -2612,6 +2612,61 @@ public final class UTF16 { } } + /** + * Utility for getting a code point from a CharSequence that contains exactly one code point. + * @return a code point IF the string is non-null and consists of a single code point. + * otherwise returns -1. + * @param s to test + */ + public static int getSingleCodePoint(CharSequence s) { + if (s == null || s.length() == 0) { + return -1; + } else if (s.length() == 1) { + return s.charAt(0); + } else if (s.length() > 2) { + return -1; + } + + // at this point, len = 2 + int cp = UTF16.charAt(s, 0); + if (cp > 0xFFFF) { // is surrogate pair + return cp; + } + return -1; + } + + /** + * Utility for comparing a code point to a string without having to create a new string. Returns the same results + * as a code point comparison of UTF16.valueOf(codePoint) and s.toString(). More specifically, if + *

+     * sc = new StringComparator(true,false,0);
+     * fast = UTF16.compareCodePoint(codePoint, charSequence)
+     * slower = sc.compare(UTF16.valueOf(codePoint), charSequence == null ? "" : charSequence.toString())
+     * 
+ * then + * + * Integer.signum(fast) == Integer.signum(slower) + * + * @param codePoint to test + * @param s to test + * @return equivalent of code point comparator comparing two strings. + */ + public static int compareCodePoint(int codePoint, CharSequence s) { + if (s == null) { + return 1; + } + final int strLen = s.length(); + if (strLen == 0) { + return 1; + } + int second = Character.codePointAt(s, 0); + int diff = codePoint - second; + if (diff != 0) { + return diff; + } + return strLen == Character.charCount(codePoint) ? 0 : -1; + } + // private data members ------------------------------------------------- /** diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java index e67d8f3ec6..6fd9cef1bd 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java @@ -29,6 +29,7 @@ import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UProperty; import com.ibm.icu.lang.UScript; import com.ibm.icu.util.Freezable; +import com.ibm.icu.util.OutputInt; import com.ibm.icu.util.ULocale; import com.ibm.icu.util.VersionInfo; @@ -265,11 +266,20 @@ import com.ibm.icu.util.VersionInfo; * * * - *

To iterate over contents of UnicodeSet, use UnicodeSetIterator class. + *

To iterate over contents of UnicodeSet, the following are available: + *

  • {@link #ranges()} to iterate through the ranges
  • + *
  • {@link #strings()} to iterate through the strings
  • + *
  • {@link #iterator()} to iterate through the entire contents in a single loop. + * That method is, however, not particularly efficient, since it "boxes" each code point into a String. + *
+ * All of the above can be used in for loops. + * The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in for loops. + *

To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. * * @author Alan Liu * @stable ICU 2.0 * @see UnicodeSetIterator + * @see UnicodeSetSpanner */ public class UnicodeSet extends UnicodeFilter implements Iterable, Comparable, Freezable { @@ -283,7 +293,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @stable ICU 4.8 */ public static final UnicodeSet ALL_CODE_POINTS = new UnicodeSet(0, 0x10FFFF).freeze(); - + private static XSymbolTable XSYMBOL_TABLE = null; // for overriding the the function processing private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints @@ -338,7 +348,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa */ private static UnicodeSet INCLUSIONS[] = null; - private BMPSet bmpSet; // The set is frozen iff either bmpSet or stringSpan is not null. + private BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null. private UnicodeSetStringSpan stringSpan; //---------------------------------------------------------------- // Public API @@ -492,6 +502,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @stable ICU 2.0 */ public Object clone() { + if (isFrozen()) { + return this; + } UnicodeSet result = new UnicodeSet(this); result.bmpSet = this.bmpSet; result.stringSpan = this.stringSpan; @@ -588,27 +601,30 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa /** * Append the toPattern() representation of a * string to the given StringBuffer. 
+ * @return */ - private static void _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) { + private static StringBuffer _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) { int cp; for (int i = 0; i < s.length(); i += Character.charCount(cp)) { cp = s.codePointAt(i); _appendToPat(buf, cp, escapeUnprintable); } + return buf; } /** * Append the toPattern() representation of a * character to the given StringBuffer. + * @return */ - private static void _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) { + private static StringBuffer _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) { // "Utility.isUnprintable(c)" seems redundant since the the call // "Utility.escapeUnprintable(buf, c)" does it again inside the if statement if (escapeUnprintable && Utility.isUnprintable(c)) { // Use hex escape notation (uxxxx or Uxxxxxxxx) for anything // unprintable if (Utility.escapeUnprintable(buf, c)) { - return; + return buf; } } // Okay to let ':' pass through @@ -633,6 +649,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa break; } UTF16.append(buf, c); + return buf; } /** @@ -1279,9 +1296,11 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa } /** + * Utility for getting code point from single code point CharSequence. + * See the public UTF16.getSingleCodePoint() * @return a code point IF the string consists of a single one. * otherwise returns -1. 
- * @param string to test + * @param s to test */ private static int getSingleCP(CharSequence s) { if (s.length() < 1) { @@ -1322,7 +1341,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @return this object, for chaining * @stable ICU 2.0 */ - public final UnicodeSet retainAll(String s) { + public final UnicodeSet retainAll(CharSequence s) { return retainAll(fromAll(s)); } @@ -1333,7 +1352,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @return this object, for chaining * @stable ICU 2.0 */ - public final UnicodeSet complementAll(String s) { + public final UnicodeSet complementAll(CharSequence s) { return complementAll(fromAll(s)); } @@ -1344,7 +1363,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @return this object, for chaining * @stable ICU 2.0 */ - public final UnicodeSet removeAll(String s) { + public final UnicodeSet removeAll(CharSequence s) { return removeAll(fromAll(s)); } @@ -1369,7 +1388,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @return a newly created set containing the given string * @stable ICU 2.0 */ - public static UnicodeSet from(String s) { + public static UnicodeSet from(CharSequence s) { return new UnicodeSet().add(s); } @@ -1380,7 +1399,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @return a newly created set containing the given characters * @stable ICU 2.0 */ - public static UnicodeSet fromAll(String s) { + public static UnicodeSet fromAll(CharSequence s) { return new UnicodeSet().addAll(s); } @@ -1428,13 +1447,15 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * Retain the specified string in this set if it is present. * Upon return this set will be empty if it did not contain s, or * will only contain s if it did contain s. 
- * @param s the string to be retained + * @param cs the string to be retained * @return this object, for chaining * @stable ICU 2.0 */ - public final UnicodeSet retain(String s) { - int cp = getSingleCP(s); + public final UnicodeSet retain(CharSequence cs) { + + int cp = getSingleCP(cs); if (cp < 0) { + String s = cs.toString(); boolean isIn = strings.contains(s); if (isIn && size() == 1) { return this; @@ -1494,7 +1515,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @return this object, for chaining * @stable ICU 2.0 */ - public final UnicodeSet remove(String s) { + public final UnicodeSet remove(CharSequence s) { int cp = getSingleCP(s); if (cp < 0) { strings.remove(s); @@ -1571,14 +1592,14 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @return this object, for chaining * @stable ICU 2.0 */ - public final UnicodeSet complement(String s) { + public final UnicodeSet complement(CharSequence s) { checkFrozen(); int cp = getSingleCP(s); if (cp < 0) { if (strings.contains(s)) { strings.remove(s); } else { - strings.add(s); + strings.add(s.toString()); } pat = null; } else { @@ -1804,11 +1825,11 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @return true if this set contains the specified string * @stable ICU 2.0 */ - public final boolean contains(String s) { + public final boolean contains(CharSequence s) { int cp = getSingleCP(s); if (cp < 0) { - return strings.contains(s); + return strings.contains(s.toString()); } else { return contains(cp); } @@ -2072,7 +2093,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @return true if the test condition is met * @stable ICU 2.0 */ - public boolean containsNone(String s) { + public boolean containsNone(CharSequence s) { return span(s, SpanCondition.NOT_CONTAINED) == s.length(); } @@ -2106,7 +2127,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @return true if the condition 
is met * @stable ICU 2.0 */ - public final boolean containsSome(String s) { + public final boolean containsSome(CharSequence s) { return !containsNone(s); } @@ -2344,7 +2365,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa StringBuffer rebuiltPat = new StringBuffer(); RuleCharacterIterator chars = - new RuleCharacterIterator(pattern, symbols, pos); + new RuleCharacterIterator(pattern, symbols, pos); applyPattern(chars, symbols, rebuiltPat, options); if (chars.inVariable()) { syntaxError(chars, "Extra chars in variable value"); @@ -2388,7 +2409,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa // Recognized special forms for chars, sets: c-c s-s s&s int opts = RuleCharacterIterator.PARSE_VARIABLES | - RuleCharacterIterator.PARSE_ESCAPES; + RuleCharacterIterator.PARSE_ESCAPES; if ((options & IGNORE_SPACE) != 0) { opts |= RuleCharacterIterator.SKIP_WHITESPACE; } @@ -2740,7 +2761,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa private static void syntaxError(RuleCharacterIterator chars, String msg) { throw new IllegalArgumentException("Error: " + msg + " at \"" + Utility.escape(chars.toString()) + - '"'); + '"'); } /** @@ -2771,23 +2792,24 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa } /** - * Add the contents of the collection (as strings) into this UnicodeSet. + * Add the contents of the collection (as strings) into this UnicodeSet. + * The collection must not contain null. * @param source the collection to add * @return a reference to this object * @stable ICU 4.4 */ - public UnicodeSet add(Collection source) { + public UnicodeSet add(Iterable source) { return addAll(source); } /** - * Add the contents of the UnicodeSet (as strings) into a collection. + * Add a collection (as strings) into this UnicodeSet. * Uses standard naming convention. 
* @param source collection to add into * @return a reference to this object * @stable ICU 4.4 */ - public UnicodeSet addAll(Collection source) { + public UnicodeSet addAll(Iterable source) { checkFrozen(); for (Object o : source) { add(o.toString()); @@ -3104,7 +3126,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa // Reference comparison ok; VersionInfo caches and reuses // unique objects. return v != NO_VERSION && - v.compareTo(version) <= 0; + v.compareTo(version) <= 0; } } @@ -3297,7 +3319,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa public UnicodeSet applyPropertyAlias(String propertyAlias, String valueAlias) { return applyPropertyAlias(propertyAlias, valueAlias, null); } - + /** * Modifies this set to contain those code points which have the * given value for the given property. Prior contents of this @@ -3321,7 +3343,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa && ((XSymbolTable)symbols).applyPropertyAlias(propertyAlias, valueAlias, this)) { return this; } - + if (XSYMBOL_TABLE != null) { if (XSYMBOL_TABLE.applyPropertyAlias(propertyAlias, valueAlias, this)) { return this; @@ -3476,8 +3498,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa // Look for an opening [:, [:^, \p, or \P return pattern.regionMatches(pos, "[:", 0, 2) || - pattern.regionMatches(true, pos, "\\p", 0, 2) || - pattern.regionMatches(pos, "\\N", 0, 2); + pattern.regionMatches(true, pos, "\\p", 0, 2) || + pattern.regionMatches(pos, "\\N", 0, 2); } /** @@ -3879,17 +3901,14 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa // Optimize contains() and span() and similar functions. if (!strings.isEmpty()) { stringSpan = new UnicodeSetStringSpan(this, new ArrayList(strings), UnicodeSetStringSpan.ALL); - if (!stringSpan.needsStringSpanUTF16()) { - // All strings are irrelevant for span() etc. 
because - // all of each string's code points are contained in this set. - // Do not check needsStringSpanUTF8() because UTF-8 has at most as - // many relevant strings as UTF-16. - // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().) - stringSpan = null; - } } - if (stringSpan == null) { - // No span-relevant strings: Optimize for code point spans. + if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) { + // Optimize for code point spans. + // There are no strings, or + // all strings are irrelevant for span() etc. because + // all of each string's code points are contained in this set. + // However, fully contained strings are relevant for spanAndCount(), + // so we create both objects. bmpSet = new BMPSet(list, len); } } @@ -3898,7 +3917,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa /** * Span a string using this UnicodeSet. - * + *

To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. * @param s The string to be spanned * @param spanCondition The span condition * @return the length of the span @@ -3912,7 +3931,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * Span a string using this UnicodeSet. * If the start index is less than 0, span will start from 0. * If the start index is greater than the string length, span returns the string length. - * + *

To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. * @param s The string to be spanned * @param start The start index that the span begins * @param spanCondition The span condition @@ -3927,52 +3946,97 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa return end; } if (bmpSet != null) { - return start + bmpSet.span(s, start, end, spanCondition); + // Frozen set without strings, or no string is relevant for span(). + return bmpSet.span(s, start, spanCondition, null); } - int len = end - start; if (stringSpan != null) { - return start + stringSpan.span(s, start, len, spanCondition); + return stringSpan.span(s, start, spanCondition); } else if (!strings.isEmpty()) { int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList(strings), which); if (strSpan.needsStringSpanUTF16()) { - return start + strSpan.span(s, start, len, spanCondition); + return strSpan.span(s, start, spanCondition); } } + return spanCodePointsAndCount(s, start, spanCondition, null); + } + + /** + * Same as span() but also counts the smallest number of set elements on any path across the span. + *

To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. + * @param outCount An output-only object (must not be null) for returning the count. + * @return the limit (exclusive end) of the span + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) { + if (outCount == null) { + throw new IllegalArgumentException("outCount must not be null"); + } + int end = s.length(); + if (start < 0) { + start = 0; + } else if (start >= end) { + return end; + } + if (stringSpan != null) { + // We might also have bmpSet != null, + // but fully-contained strings are relevant for counting elements. + return stringSpan.spanAndCount(s, start, spanCondition, outCount); + } else if (bmpSet != null) { + return bmpSet.span(s, start, spanCondition, outCount); + } else if (!strings.isEmpty()) { + int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED + : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; + which |= UnicodeSetStringSpan.WITH_COUNT; + UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList(strings), which); + return strSpan.spanAndCount(s, start, spanCondition, outCount); + } + + return spanCodePointsAndCount(s, start, spanCondition, outCount); + } + + private int spanCodePointsAndCount(CharSequence s, int start, + SpanCondition spanCondition, OutputInt outCount) { // Pin to 0/1 values. 
boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); int c; int next = start; + int length = s.length(); + int count = 0; do { c = Character.codePointAt(s, next); if (spanContained != contains(c)) { break; } - next = Character.offsetByCodePoints(s, next, 1); - } while (next < end); + ++count; + next += Character.charCount(c); + } while (next < length); + if (outCount != null) { outCount.value = count; } return next; } /** * Span a string backwards (from the end) using this UnicodeSet. - * + *

To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. * @param s The string to be spanned * @param spanCondition The span condition * @return The string index which starts the span (i.e. inclusive). * @stable ICU 4.4 */ public int spanBack(CharSequence s, SpanCondition spanCondition) { - return spanBack(s, s.length(), spanCondition); + return spanBack(s, s.length(), spanCondition); } /** * Span a string backwards (from the fromIndex) using this UnicodeSet. * If the fromIndex is less than 0, spanBack will return 0. * If fromIndex is greater than the string length, spanBack will start from the string length. - * + *

To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. * @param s The string to be spanned * @param fromIndex The index of the char (exclusive) that the string should be spanned backwards * @param spanCondition The span condition @@ -3987,6 +4051,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa fromIndex = s.length(); } if (bmpSet != null) { + // Frozen set without strings, or no string is relevant for spanBack(). return bmpSet.spanBack(s, fromIndex, spanCondition); } if (stringSpan != null) { @@ -3994,7 +4059,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa } else if (!strings.isEmpty()) { int which = (spanCondition == SpanCondition.NOT_CONTAINED) ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED - : UnicodeSetStringSpan.BACK_UTF16_CONTAINED; + : UnicodeSetStringSpan.BACK_UTF16_CONTAINED; UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList(strings), which); if (strSpan.needsStringSpanUTF16()) { return strSpan.spanBack(s, fromIndex, spanCondition); @@ -4011,20 +4076,19 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa if (spanContained != contains(c)) { break; } - prev = Character.offsetByCodePoints(s, prev, -1); + prev -= Character.charCount(c); } while (prev > 0); return prev; } /** * Clone a thawed version of this class, according to the Freezable interface. 
- * @return this + * @return the clone, not frozen * @stable ICU 4.4 */ public UnicodeSet cloneAsThawed() { - UnicodeSet result = (UnicodeSet) clone(); - result.bmpSet = null; - result.stringSpan = null; + UnicodeSet result = new UnicodeSet(this); + assert !result.isFrozen(); return result; } @@ -4039,6 +4103,80 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa // Additional methods for integration with Generics and Collections // ************************ + /** + * A struct-like class used for iteration through ranges, for faster iteration than by String. + * Read about the restrictions on usage in {@link #UnicodeSet.ranges()}. + */ + public static class EntryRange { + /** + * The starting code point of the range. + */ + public int codepoint; + /** + * The ending code point of the range + */ + public int codepointEnd; + + @Override + public String toString() { + StringBuffer b = new StringBuffer(); + return ( + codepoint == codepointEnd ? _appendToPat(b, codepoint, false) + : _appendToPat(_appendToPat(b, codepoint, false).append('-'), codepointEnd, false)) + .toString(); + } + } + + /** + * Provide for faster iteration than by String. Returns an iterator over a range values. The UnicodeSet + * must not be altered during the iteration. The EntryRange is the same each time; the contents are just reset. + *
Warning: To iterate over the full contents, you have to also iterate over the strings. + * + *

+     * // Sample code
+     * for (EntryRange range : us1.ranges()) {
+     *     // do something with code points between range.codepoint and range.codepointEnd;
+     * }
+     * for (String s : us1.strings()) {
+     *     // do something with each string;
+     * }
+     * 
+ */ + public Iterable ranges() { + return new EntryRanges(); + } + + private class EntryRanges implements Iterable, Iterator { + int pos; + EntryRange result = new EntryRange(); + // Iterator stringIterator = strings == null ? null : strings.iterator(); + + public Iterator iterator() { + return this; + } + public boolean hasNext() { + return pos < len-1 + // || (stringIterator != null && stringIterator.hasNext()) + ; + } + public EntryRange next() { + if (pos < len-1) { + result.codepoint = list[pos++]; + result.codepointEnd = list[pos++]-1; +// result.string = null; + } else { + throw new ArrayIndexOutOfBoundsException(pos); +// result.codepoint = -1; +// result.string = stringIterator.next(); + } + return result; + } + public void remove() { + throw new UnsupportedOperationException(); + } + } + + /** * Returns a string iterator. Uses the same order of iteration as {@link UnicodeSetIterator}. * @see java.util.Set#iterator() @@ -4129,8 +4267,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @see #containsAll(com.ibm.icu.text.UnicodeSet) * @stable ICU 4.4 */ - public boolean containsAll(Collection collection) { - for (String o : collection) { + public boolean containsAll(Iterable collection) { + for (T o : collection) { if (!contains(o)) { return false; } @@ -4142,8 +4280,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @see #containsNone(com.ibm.icu.text.UnicodeSet) * @stable ICU 4.4 */ - public boolean containsNone(Collection collection) { - for (String o : collection) { + public boolean containsNone(Iterable collection) { + for (T o : collection) { if (contains(o)) { return false; } @@ -4155,7 +4293,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @see #containsAll(com.ibm.icu.text.UnicodeSet) * @stable ICU 4.4 */ - public final boolean containsSome(Collection collection) { + public final boolean containsSome(Iterable collection) { return !containsNone(collection); } @@ 
-4163,9 +4301,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @see #addAll(com.ibm.icu.text.UnicodeSet) * @stable ICU 4.4 */ - public UnicodeSet addAll(String... collection) { + public UnicodeSet addAll(T... collection) { checkFrozen(); - for (String str : collection) { + for (T str : collection) { add(str); } return this; @@ -4176,9 +4314,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @see #removeAll(com.ibm.icu.text.UnicodeSet) * @stable ICU 4.4 */ - public UnicodeSet removeAll(Collection collection) { + public UnicodeSet removeAll(Iterable collection) { checkFrozen(); - for (String o : collection) { + for (T o : collection) { remove(o); } return this; @@ -4188,7 +4326,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @see #retainAll(com.ibm.icu.text.UnicodeSet) * @stable ICU 4.4 */ - public UnicodeSet retainAll(Collection collection) { + public UnicodeSet retainAll(Iterable collection) { checkFrozen(); // TODO optimize UnicodeSet toRetain = new UnicodeSet(); @@ -4277,7 +4415,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @stable ICU 4.4 */ - public static int compare(String string, int codePoint) { + public static int compare(CharSequence string, int codePoint) { return CharSequences.compare(string, codePoint); } @@ -4288,7 +4426,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * Note that this (=String) order is UTF-16 order -- *not* code point order. 
* @stable ICU 4.4 */ - public static int compare(int codePoint, String string) { + public static int compare(int codePoint, CharSequence string) { return -CharSequences.compare(string, codePoint); } @@ -4304,7 +4442,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa public static > int compare(Iterable collection1, Iterable collection2) { return compare(collection1.iterator(), collection2.iterator()); } - + /** * Utility to compare two iterators. Warning: the ordering in iterables is important. For Collections that are ordered, * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration. @@ -4378,7 +4516,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * * @stable ICU 4.4 */ - public Iterable strings() { + public Collection strings() { return Collections.unmodifiableSortedSet(strings); } @@ -4417,7 +4555,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * If findNot is true, then reverse the sense of the match: find the first place where the UnicodeSet doesn't match. * If there is no match, length is returned. * @internal - * @deprecated This API is ICU internal only. + * @deprecated This API is ICU internal only. Use span instead. */ @Deprecated public int findIn(CharSequence value, int fromIndex, boolean findNot) { @@ -4438,7 +4576,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * If there is no match, -1 is returned. * BEFORE index is not in the UnicodeSet. * @internal - * @deprecated This API is ICU internal only. + * @deprecated This API is ICU internal only. Use spanBack instead. */ @Deprecated public int findLastIn(CharSequence value, int fromIndex, boolean findNot) { @@ -4460,7 +4598,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @param matches A boolean to either strip all that matches or don't match with the current UnicodeSet object. 
* @return The string after it has been stripped. * @internal - * @deprecated This API is ICU internal only. + * @deprecated This API is ICU internal only. Use replaceFrom. */ @Deprecated public String stripFrom(CharSequence source, boolean matches) { @@ -4593,6 +4731,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa */ @Deprecated public static void setDefaultXSymbolTable(XSymbolTable xSymbolTable) { + INCLUSIONS = null; // If the properties override inclusions, these have to be regenerated. XSYMBOL_TABLE = xSymbolTable; } } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSetSpanner.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSetSpanner.java new file mode 100644 index 0000000000..cac14805b8 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSetSpanner.java @@ -0,0 +1,333 @@ +/* + ******************************************************************************* + * Copyright (C) 2014, International Business Machines Corporation and + * others. All Rights Reserved. + ******************************************************************************* + */ +package com.ibm.icu.text; + +import com.ibm.icu.text.UnicodeSet.SpanCondition; +import com.ibm.icu.util.OutputInt; + +/** + * A helper class used to count, replace, and trim CharSequences based on UnicodeSet matches. + * An instance is immutable (and thus thread-safe) iff the source UnicodeSet is frozen. + */ +public class UnicodeSetSpanner { + + private final UnicodeSet unicodeSet; + + /** + * Create a spanner from a UnicodeSet. For speed and safety, the UnicodeSet should be frozen. However, this class + * can be used with a non-frozen version to avoid the cost of freezing. + * + * @param source + * the original UnicodeSet + */ + public UnicodeSetSpanner(UnicodeSet source) { + unicodeSet = source; + } + + /** + * Returns the UnicodeSet used for processing. It is frozen iff the original was. + * + * @return the construction set. 
+ */ + public UnicodeSet getUnicodeSet() { + return unicodeSet; + } + + + /* + * (non-Javadoc) + * + * @see java.lang.Object#equals(java.lang.Object) + */ + @Override + public boolean equals(Object other) { + return other instanceof UnicodeSetSpanner && unicodeSet.equals(((UnicodeSetSpanner) other).unicodeSet); + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#hashCode() + */ + @Override + public int hashCode() { + return unicodeSet.hashCode(); + } + + /** + * Options for replaceFrom and countIn to control how to treat each matched span. The name is from "qualifier" as used in regex, + * since it is similar to whether one is replacing [abc] by x, or [abc]* by x. + * + */ + public enum Quantifier { + /** + * Collapse spans. That is, modify/count the entire matching span as a single item, instead of separate + * code points. + * + */ + SPAN, + /** + * Use the smallest number of elements in the spanned range for counting and modification. In other words, the "longest matches" are + * used where possible. If there are no strings, this will be the same as code points. + *

For example, in the string "abab": + *

    + *
  • spanning with [ab] will count four MIN_ELEMENTS.
  • + *
  • spanning with [{ab}] will count two MIN_ELEMENTS.
  • + *
  • spanning with [ab{ab}] will also count two MIN_ELEMENTS.
  • + *
+ */ + MIN_ELEMENTS, + // Note: could in the future have an additional option MAX_ELEMENTS + } + + /** + * Returns the number of matching characters found in a character sequence, counting by Quantifier.ELEMENT using SpanCondition.CONTAINED. + * + * @param sequence + * the sequence to count characters in + * @return the count. Zero if there are none. + */ + public int countIn(CharSequence sequence) { + return countIn(sequence, Quantifier.MIN_ELEMENTS, SpanCondition.CONTAINED); + } + + /** + * Returns the number of matching characters found in a character sequence, using SpanCondition.CONTAINED + * + * @param sequence + * the sequence to count characters in + * @return the count. Zero if there are none. + */ + public int countIn(CharSequence sequence, Quantifier quantifier) { + return countIn(sequence, quantifier, SpanCondition.CONTAINED); + } + + /** + * Returns the number of matching characters found in a character sequence. + * + * @param sequence + * the sequence to count characters in + * @param quantifier + * (optional) whether to treat the entire span as a match, or individual code points + * @param countSpan + * (optional) the spanCondition to use. CONTAINED means only count the code points in the CONTAINED span; + * NOT_CONTAINED is the reverse. + * @return the count. Zero if there are none. + */ + public int countIn(CharSequence sequence, Quantifier quantifier, SpanCondition countSpan) { + int count = 0; + int start = 0; + SpanCondition skipSpan = countSpan == SpanCondition.CONTAINED ? SpanCondition.NOT_CONTAINED + : SpanCondition.CONTAINED; + final int length = sequence.length(); + OutputInt spanCount = new OutputInt(); + while (start != length) { + int endNotContained = unicodeSet.span(sequence, start, skipSpan); + if (endNotContained == length) { + break; + } + start = unicodeSet.spanAndCount(sequence, endNotContained, countSpan, spanCount); + count += quantifier == Quantifier.SPAN ? 
1 : spanCount.value; + } + return count; + } + + /** + * Delete all the matching spans in sequence, using SpanCondition.CONTAINED + * + * @param sequence + * charsequence to replace matching spans in. + * @return modified string. + */ + public String deleteFrom(CharSequence sequence) { + return replaceFrom(sequence, "", Quantifier.SPAN, SpanCondition.CONTAINED); + } + + /** + * Delete all matching spans in sequence, according to the operations. + * + * @param sequence + * charsequence to replace matching spans in. + * @param modifySpan + * specify whether to modify the matching spans (CONTAINED) or the non-matching (NOT_CONTAINED) + * @return modified string. + */ + public String deleteFrom(CharSequence sequence, SpanCondition modifySpan) { + return replaceFrom(sequence, "", Quantifier.SPAN, modifySpan); + } + + /** + * Replace all matching spans in sequence by the replacement, + * counting by Quantifier.ELEMENT using SpanCondition.CONTAINED. + * + * @param sequence + * charsequence to replace matching spans in. + * @param replacement + * replacement sequence. To delete, use "" + * @return modified string. + */ + public String replaceFrom(CharSequence sequence, CharSequence replacement) { + return replaceFrom(sequence, replacement, Quantifier.MIN_ELEMENTS, SpanCondition.CONTAINED); + } + + /** + * Replace all matching spans in sequence by replacement, according to the Quantifier, using SpanCondition.CONTAINED. + * + * @param sequence + * charsequence to replace matching spans in. + * @param replacement + * replacement sequence. To delete, use "" + * @param quantifier + * whether to treat the entire span as a match, or individual code points + * @return modified string. 
+ */ + public String replaceFrom(CharSequence sequence, CharSequence replacement, Quantifier quantifier) { + return replaceFrom(sequence, replacement, quantifier, SpanCondition.CONTAINED); + } + + /** + * Replace all matching spans in sequence by replacement, according to the operations quantifier and modifySpan. + * + * @param sequence + * charsequence to replace matching spans in. + * @param replacement + * replacement sequence. To delete, use "" + * @param modifySpan + * (optional) specify whether to modify the matching spans (CONTAINED) or the non-matching + * (NOT_CONTAINED) + * @param quantifier + * (optional) specify whether to collapse or do codepoint by codepoint. + * @return modified string. + */ + public String replaceFrom(CharSequence sequence, CharSequence replacement, Quantifier quantifier, + SpanCondition modifySpan) { + SpanCondition copySpan = modifySpan == SpanCondition.CONTAINED ? SpanCondition.NOT_CONTAINED + : SpanCondition.CONTAINED; + final boolean remove = replacement.length() == 0; + StringBuilder result = new StringBuilder(); + // TODO, we can optimize this to + // avoid this allocation unless needed + + final int length = sequence.length(); + OutputInt spanCount = new OutputInt(); + for (int endCopy = 0; endCopy != length;) { + int endModify = unicodeSet.spanAndCount(sequence, endCopy, modifySpan, spanCount); + if (remove || endModify == 0) { + // do nothing + } else if (quantifier == Quantifier.SPAN) { + result.append(replacement); + } else { + for (int i = spanCount.value; i > 0; --i) { + result.append(replacement); + } + } + if (endModify == length) { + break; + } + endCopy = unicodeSet.span(sequence, endModify, copySpan); + result.append(sequence.subSequence(endModify, endCopy)); + } + return result.toString(); + } + + /** + * Options for the trim() method + * + */ + public enum TrimOption { + /** + * Trim leading spans (subject to INVERT). + * + */ + LEADING, + /** + * Trim leading and trailing spans (subject to INVERT). 
+ * + */ + BOTH, + /** + * Trim trailing spans (subject to INVERT). + * + */ + TRAILING; + } + + /** + * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching code points at the start or + * end of the string, using TrimOption.BOTH and SpanCondition.CONTAINED. For example: + * + *
+     * {@code
+     * 
+     *   new UnicodeSetSpanner(new UnicodeSet("[ab]")).trim("abacatbab")}
+     * 
+ * + * ... returns {@code "cat"}. + * + */ + public CharSequence trim(CharSequence sequence) { + return trim(sequence, TrimOption.BOTH, SpanCondition.CONTAINED); + } + + /** + * Returns a trimmed sequence (using CharSequence.subSequence()), that omits matching code points at the start or + * end of the string, using the trimOption and SpanCondition.CONTAINED. For example: + * + *
+     * {@code
+     * 
+     *   new UnicodeSetSpanner(new UnicodeSet("[ab]")).trim("abacatbab")}
+     * 
+ * + * ... returns {@code "cat"}. + * + */ + public CharSequence trim(CharSequence sequence, TrimOption trimOption) { + return trim(sequence, trimOption, SpanCondition.CONTAINED); + } + + /** + * Returns a trimmed sequence (using CharSequence.subSequence()), that omits matching code points at the start or + * end of the string, depending on the trimOption and modifySpan. For example: + * + *
+     * {@code
+     * 
+     *   new UnicodeSetSpanner(new UnicodeSet("[ab]")).trim("abacatbab")}
+     * 
+ * + * ... returns {@code "catbab"}. + * + * @param sequence + * the sequence to trim + * @param trimOption + * (optional) LEADING, TRAILING, or BOTH + * @param modifySpan + * (optional) CONTAINED or NOT_CONTAINED + * @return a subsequence + */ + public CharSequence trim(CharSequence sequence, TrimOption trimOption, SpanCondition modifySpan) { + int endLeadContained, startTrailContained; + final int length = sequence.length(); + if (trimOption != TrimOption.TRAILING) { + endLeadContained = unicodeSet.span(sequence, modifySpan); + if (endLeadContained == length) { + return ""; + } + } else { + endLeadContained = 0; + } + if (trimOption != TrimOption.LEADING) { + startTrailContained = unicodeSet.spanBack(sequence, modifySpan); + } else { + startTrailContained = length; + } + return endLeadContained == 0 && startTrailContained == length ? sequence : sequence.subSequence( + endLeadContained, startTrailContained); + } + +} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/util/OutputInt.java b/icu4j/main/classes/core/src/com/ibm/icu/util/OutputInt.java new file mode 100644 index 0000000000..adcd8d20c5 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/util/OutputInt.java @@ -0,0 +1,58 @@ +/* + ******************************************************************************* + * Copyright (C) 2014, International Business Machines Corporation and + * others. All Rights Reserved. + ******************************************************************************* + */ +package com.ibm.icu.util; + +/** + * Simple struct-like class for int output parameters. + * Like Output<Integer> but without auto-boxing. + * + * @internal but could become public + * @deprecated This API is ICU internal only. + */ +@Deprecated +public class OutputInt { + /** + * The value field. + * + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + public int value; + + /** + * Constructs an OutputInt with value 0. 
+ * + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + public OutputInt() { + } + + /** + * Constructs an OutputInt with the given value. + * + * @param value the initial value + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + public OutputInt(int value) { + this.value = value; + } + + /** + * {@inheritDoc} + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + public String toString() { + return Integer.toString(value); + } +} diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UTF16Test.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UTF16Test.java index e3ce28cdf7..c73f1b898a 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UTF16Test.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UTF16Test.java @@ -1,6 +1,6 @@ /* ******************************************************************************* -* Copyright (C) 1996-2010, International Business Machines Corporation and * +* Copyright (C) 1996-2014, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ @@ -13,6 +13,7 @@ import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.ReplaceableString; import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UTF16.StringComparator; /** * Testing class for UTF16 @@ -1560,6 +1561,39 @@ public final class UTF16Test extends TestFmwk } } + public void TestUtilities() { + String[] tests = { + "a", + "\uFFFF", + "😀", + "\uD800", + "\uDC00", + "\uDBFF\uDfff", + "", + "\u0000", + "\uDC00\uD800", + "ab", + "😀a", + null, + }; + StringComparator sc = new UTF16.StringComparator(true,false,0); + for (String item1 : tests) { + String nonNull1 = item1 == null ? "" : item1; + int count = UTF16.countCodePoint(nonNull1); + int expected = count == 0 || count > 1 ? 
-1 : nonNull1.codePointAt(0); + assertEquals("codepoint test " + Utility.hex(nonNull1), expected, UTF16.getSingleCodePoint(item1)); + if (expected == -1) { + continue; + } + for (String item2 : tests) { + String nonNull2 = item2 == null ? "" : item2; + int scValue = Integer.signum(sc.compare(nonNull1, nonNull2)); + int fValue = Integer.signum(UTF16.compareCodePoint(expected, item2)); + assertEquals("comparison " + Utility.hex(nonNull1) + ", " + Utility.hex(nonNull2), scValue, fValue); + } + } + } + public void TestNewString() { final int[] codePoints = { UCharacter.toCodePoint(UCharacter.MIN_HIGH_SURROGATE, UCharacter.MAX_LOW_SURROGATE), @@ -1568,6 +1602,7 @@ public final class UTF16Test extends TestFmwk 'A', -1, }; + final String cpString = "" + UCharacter.MIN_HIGH_SURROGATE + diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetStringSpanTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetStringSpanTest.java index 6d41a0fb5b..5b22564e48 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetStringSpanTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetStringSpanTest.java @@ -1,17 +1,19 @@ /* ******************************************************************************* - * Copyright (C) 2009-2011, International Business Machines Corporation and * - * others. All Rights Reserved. * + * Copyright (C) 2009-2014, International Business Machines Corporation and + * others. All Rights Reserved. 
******************************************************************************* */ package com.ibm.icu.dev.test.lang; +import java.util.Collection; + import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.impl.Utility; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSet.SpanCondition; -import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.OutputInt; /** * @test @@ -41,7 +43,7 @@ public class UnicodeSetStringSpanTest extends TestFmwk { } pos = set.span(string, 1, SpanCondition.SIMPLE); if (pos != 3) { - errln(String.format("FAIL: UnicodeSet(%s).span(%s) returns the wrong value pos %d (!= 3)", + errln(String.format("FAIL: UnicodeSet(%s).span(%s, 1) returns the wrong value pos %d (!= 3)", set.toString(), string, pos)); } } @@ -129,33 +131,15 @@ public class UnicodeSetStringSpanTest extends TestFmwk { // more complex test. -------------------------------------------------------- // Make the strings in a UnicodeSet easily accessible. - static class UnicodeSetWithStrings { - + private static class UnicodeSetWithStrings { private UnicodeSet set; - - private String strings[]; + private Collection setStrings; private int stringsLength; - private boolean hasSurrogates; public UnicodeSetWithStrings(final UnicodeSet normalSet) { set = normalSet; - stringsLength = 0; - hasSurrogates = false; - strings = new String[20]; - int size = set.size(); - if (size > 0 && set.charAt(size - 1) < 0) { - // If a set's last element is not a code point, then it must contain strings. - // Iterate over the set, skip all code point ranges, and cache the strings. - UnicodeSetIterator iter = new UnicodeSetIterator(set); - while (iter.nextRange() && stringsLength < strings.length) { - if (iter.codepoint == UnicodeSetIterator.IS_STRING) { - // Store the pointer to the set's string element - // which we happen to know is a stable pointer. 
- strings[stringsLength] = iter.getString(); - ++stringsLength; - } - } - } + setStrings = normalSet.strings(); + stringsLength = setStrings.size(); } public final UnicodeSet getSet() { @@ -166,34 +150,9 @@ public class UnicodeSetStringSpanTest extends TestFmwk { return (stringsLength > 0); } - public boolean hasStringsWithSurrogates() { - return hasSurrogates; + public Iterable strings() { + return setStrings; } - - } - - static class UnicodeSetWithStringsIterator { - - private UnicodeSetWithStrings fSet; - private int nextStringIndex; - - public UnicodeSetWithStringsIterator(final UnicodeSetWithStrings set) { - fSet = set; - nextStringIndex = 0; - } - - public void reset() { - nextStringIndex = 0; - } - - public final String nextString() { - if (nextStringIndex < fSet.stringsLength) { - return fSet.strings[nextStringIndex++]; - } else { - return null; - } - } - } // Compare 16-bit Unicode strings (which may be malformed UTF-16) @@ -231,7 +190,6 @@ public class UnicodeSetStringSpanTest extends TestFmwk { } return prev; } else if (spanCondition == SpanCondition.NOT_CONTAINED) { - UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set); int c; int start, next; for (start = next = 0; start < length;) { @@ -240,9 +198,7 @@ public class UnicodeSetStringSpanTest extends TestFmwk { if (realSet.contains(c)) { break; } - String str; - iter.reset(); - while ((str = iter.nextString()) != null) { + for (String str : set.strings()) { if (str.length() <= (length - start) && matches16CPB(s, start, length, str)) { // spanNeedsStrings=true; return start; @@ -252,7 +208,6 @@ public class UnicodeSetStringSpanTest extends TestFmwk { } return start; } else /* CONTAINED or SIMPLE */{ - UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set); int c; int start, next, maxSpanLimit = 0; for (start = next = 0; start < length;) { @@ -261,9 +216,7 @@ public class UnicodeSetStringSpanTest extends TestFmwk { if (!realSet.contains(c)) { next = start; // Do 
not span this single, not-contained code point. } - String str; - iter.reset(); - while ((str = iter.nextString()) != null) { + for (String str : set.strings()) { if (str.length() <= (length - start) && matches16CPB(s, start, length, str)) { // spanNeedsStrings=true; int matchLimit = start + str.length(); @@ -336,7 +289,6 @@ public class UnicodeSetStringSpanTest extends TestFmwk { } while (prev > 0); return prev; } else if (spanCondition == SpanCondition.NOT_CONTAINED) { - UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set); int c; int prev = length, length0 = length; do { @@ -344,9 +296,7 @@ public class UnicodeSetStringSpanTest extends TestFmwk { if (realSet.contains(c)) { break; } - String str; - iter.reset(); - while ((str = iter.nextString()) != null) { + for (String str : set.strings()) { if (str.length() <= prev && matches16CPB(s, prev - str.length(), length0, str)) { // spanNeedsStrings=true; return prev; @@ -356,7 +306,6 @@ public class UnicodeSetStringSpanTest extends TestFmwk { } while (prev > 0); return prev; } else /* SpanCondition.CONTAINED or SIMPLE */{ - UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set); int c; int prev = length, minSpanStart = length, length0 = length; do { @@ -365,9 +314,7 @@ public class UnicodeSetStringSpanTest extends TestFmwk { if (!realSet.contains(c)) { length = prev; // Do not span this single, not-contained code point. } - String str; - iter.reset(); - while ((str = iter.nextString()) != null) { + for (String str : set.strings()) { if (str.length() <= prev && matches16CPB(s, prev - str.length(), length0, str)) { // spanNeedsStrings=true; int matchStart = prev - str.length(); @@ -616,7 +563,7 @@ public class UnicodeSetStringSpanTest extends TestFmwk { * input expectCount<0). 
*/ void verifySpan(final UnicodeSetWithStrings sets[], final String s, int whichSpans, - int expectLimits[], int expectCount, // TODO + int expectLimits[], int expectCount, final String testName, int index) { int[] limits = new int[500]; int limitsCount; @@ -1129,4 +1076,54 @@ public class UnicodeSetStringSpanTest extends TestFmwk { } } + public void TestSpanAndCount() { + // a set with no strings + UnicodeSet abc = new UnicodeSet('a', 'c'); + // a set with an "irrelevant" string (fully contained in the code point set) + UnicodeSet crlf = new UnicodeSet().add('\n').add('\r').add("\r\n"); + // a set with no "irrelevant" string but some interesting overlaps + UnicodeSet ab_cd = new UnicodeSet().add('a').add("ab").add("abc").add("cd"); + String s = "ab\n\r\r\n" + UTF16.valueOf(0x50000) + "abcde"; + OutputInt count = new OutputInt(); + assertEquals("abc span[8, 11[", 11, + abc.spanAndCount(s, 8, SpanCondition.SIMPLE, count)); + assertEquals("abc count=3", 3, count.value); + assertEquals("no abc span[2, 8[", 8, + abc.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count)); + assertEquals("no abc count=5", 5, count.value); + assertEquals("line endings span[2, 6[", 6, + crlf.spanAndCount(s, 2, SpanCondition.CONTAINED, count)); + assertEquals("line endings count=3", 3, count.value); + assertEquals("no ab+cd span[2, 8[", 8, + ab_cd.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count)); + assertEquals("no ab+cd count=5", 5, count.value); + assertEquals("ab+cd span[8, 12[", 12, + ab_cd.spanAndCount(s, 8, SpanCondition.CONTAINED, count)); + assertEquals("ab+cd count=2", 2, count.value); + assertEquals("1x abc span[8, 11[", 11, + ab_cd.spanAndCount(s, 8, SpanCondition.SIMPLE, count)); + assertEquals("1x abc count=1", 1, count.value); + + abc.freeze(); + crlf.freeze(); + ab_cd.freeze(); + assertEquals("abc span[8, 11[ (frozen)", 11, + abc.spanAndCount(s, 8, SpanCondition.SIMPLE, count)); + assertEquals("abc count=3 (frozen)", 3, count.value); + assertEquals("no abc span[2, 8[ 
(frozen)", 8, + abc.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count)); + assertEquals("no abc count=5 (frozen)", 5, count.value); + assertEquals("line endings span[2, 6[ (frozen)", 6, + crlf.spanAndCount(s, 2, SpanCondition.CONTAINED, count)); + assertEquals("line endings count=3 (frozen)", 3, count.value); + assertEquals("no ab+cd span[2, 8[ (frozen)", 8, + ab_cd.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count)); + assertEquals("no ab+cd count=5 (frozen)", 5, count.value); + assertEquals("ab+cd span[8, 12[ (frozen)", 12, + ab_cd.spanAndCount(s, 8, SpanCondition.CONTAINED, count)); + assertEquals("ab+cd count=2 (frozen)", 2, count.value); + assertEquals("1x abc span[8, 11[ (frozen)", 11, + ab_cd.spanAndCount(s, 8, SpanCondition.SIMPLE, count)); + assertEquals("1x abc count=1 (frozen)", 1, count.value); + } } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java index e0389bf77a..e598f3985c 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java @@ -11,6 +11,7 @@ import java.text.ParsePosition; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; @@ -22,6 +23,7 @@ import java.util.SortedSet; import java.util.TreeSet; import com.ibm.icu.dev.test.TestFmwk; +import com.ibm.icu.dev.util.CollectionUtilities; import com.ibm.icu.impl.SortedSetRelation; import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.UCharacter; @@ -33,6 +35,11 @@ import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeMatcher; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSet.ComparisonStyle; +import com.ibm.icu.text.UnicodeSet.EntryRange; +import com.ibm.icu.text.UnicodeSetSpanner; +import 
com.ibm.icu.text.UnicodeSetSpanner.Quantifier; +import com.ibm.icu.text.UnicodeSet.SpanCondition; +import com.ibm.icu.text.UnicodeSetSpanner.TrimOption; import com.ibm.icu.text.UnicodeSetIterator; /** @@ -1256,10 +1263,10 @@ public class UnicodeSetTest extends TestFmwk { String pat = ""; try { String name = - (j==0) ? UScript.getName(i) : UScript.getShortName(i); - pat = "[:" + name + ":]"; - UnicodeSet set = new UnicodeSet(pat); - logln("Ok: " + pat + " -> " + set.toPattern(false)); + (j==0) ? UScript.getName(i) : UScript.getShortName(i); + pat = "[:" + name + ":]"; + UnicodeSet set = new UnicodeSet(pat); + logln("Ok: " + pat + " -> " + set.toPattern(false)); } catch (IllegalArgumentException e) { if (pat.length() == 0) { errln("FAIL (in UScript): No name for script " + i); @@ -1330,9 +1337,9 @@ public class UnicodeSetTest extends TestFmwk { // The following pattern must contain at least one range "c-d" // where c or d is a Pattern_White_Space. String pattern = - "[\\uFEFF \\u200E-\\u20FF \\uFFF9-\\uFFFC \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]"; + "[\\uFEFF \\u200E-\\u20FF \\uFFF9-\\uFFFC \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]"; String exp = - "[\\u200E-\\u20FF\\uFEFF\\uFFF9-\\uFFFC\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]"; + "[\\u200E-\\u20FF\\uFEFF\\uFFF9-\\uFFFC\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]"; // We test this with two passes; in the second pass we // pre-unescape the pattern. Since U+200E is Pattern_White_Space, // this fails -- which is what we expect. 
@@ -1563,7 +1570,7 @@ public class UnicodeSetTest extends TestFmwk { mod2 = new UnicodeSet(set1).retainAll(set2.addAllTo(new LinkedHashSet())); assertEquals("remove all", mod1, mod2); } - + public void TestComparison() { UnicodeSet set1 = new UnicodeSet("[a-b d-g {ch} {zh}]").freeze(); UnicodeSet set2 = new UnicodeSet("[c-e {ch}]").freeze(); @@ -1579,7 +1586,7 @@ public class UnicodeSetTest extends TestFmwk { List sorted = new ArrayList(new TreeSet(unsorted)); assertNotEquals("compareTo-shorter-first", unsorted, sorted); assertEquals("compareTo-shorter-first", goalShortest, sorted); - + TreeSet sorted1 = new TreeSet(new Comparator(){ public int compare(UnicodeSet o1, UnicodeSet o2) { // TODO Auto-generated method stub @@ -1616,34 +1623,34 @@ public class UnicodeSetTest extends TestFmwk { // now compare all the combinations. If any of them is a code point, use it. int maxErrorCount = 0; compare: - for (String last : target) { - for (String curr : target) { - int lastCount = Character.codePointCount(last, 0, last.length()); - int currCount = Character.codePointCount(curr, 0, curr.length()); - int comparison; - if (lastCount == 1) { - comparison = UnicodeSet.compare(last.codePointAt(0), curr); - } else if (currCount == 1) { - comparison = UnicodeSet.compare(last, curr.codePointAt(0)); - } else { - continue; - } - if (comparison != last.compareTo(curr)) { - // repeat for debugging + for (String last : target) { + for (String curr : target) { + int lastCount = Character.codePointCount(last, 0, last.length()); + int currCount = Character.codePointCount(curr, 0, curr.length()); + int comparison; if (lastCount == 1) { comparison = UnicodeSet.compare(last.codePointAt(0), curr); } else if (currCount == 1) { comparison = UnicodeSet.compare(last, curr.codePointAt(0)); + } else { + continue; } - if (maxErrorCount++ > 10) { - errln(maxErrorCount + " Failure in comparing " + last + " & " + curr + "\tOmitting others..."); - break compare; + if (comparison != last.compareTo(curr)) 
{ + // repeat for debugging + if (lastCount == 1) { + comparison = UnicodeSet.compare(last.codePointAt(0), curr); + } else if (currCount == 1) { + comparison = UnicodeSet.compare(last, curr.codePointAt(0)); + } + if (maxErrorCount++ > 10) { + errln(maxErrorCount + " Failure in comparing " + last + " & " + curr + "\tOmitting others..."); + break compare; + } + errln(maxErrorCount + " Failure in comparing " + last + " & " + curr); } - errln(maxErrorCount + " Failure in comparing " + last + " & " + curr); } } - } - + //compare(Iterable, Iterable) int max = 10; List test1 = new ArrayList(max); @@ -1669,7 +1676,7 @@ public class UnicodeSetTest extends TestFmwk { // check to make sure right exceptions are thrown Class expected = IllegalArgumentException.class; Class actual; - + try { actual = null; @SuppressWarnings("unused") @@ -1678,7 +1685,7 @@ public class UnicodeSetTest extends TestFmwk { actual = e.getClass(); } assertEquals("exception if odd", expected, actual); - + try { actual = null; @SuppressWarnings("unused") @@ -1687,7 +1694,7 @@ public class UnicodeSetTest extends TestFmwk { actual = e.getClass(); } assertEquals("exception for start/end problem", expected, actual); - + try { actual = null; @SuppressWarnings("unused") @@ -1696,7 +1703,7 @@ public class UnicodeSetTest extends TestFmwk { actual = e.getClass(); } assertEquals("exception for end/start problem", expected, actual); - + CheckRangeSpeed(10000, new UnicodeSet("[:whitespace:]")); CheckRangeSpeed(1000, new UnicodeSet("[:letter:]")); } @@ -1731,14 +1738,14 @@ public class UnicodeSetTest extends TestFmwk { double rangeConstructorTime = (middle - start)/iterations; double patternConstructorTime = (end - middle)/iterations; String message = "Range constructor:\t" + rangeConstructorTime + ";\tPattern constructor:\t" + patternConstructorTime + "\t\t" - + percent.format(rangeConstructorTime/patternConstructorTime-1); + + percent.format(rangeConstructorTime/patternConstructorTime-1); if (rangeConstructorTime < 
2*patternConstructorTime) { logln(message); } else { errln(message); } } - + NumberFormat percent = NumberFormat.getPercentInstance(); { percent.setMaximumFractionDigits(2); @@ -1806,69 +1813,69 @@ public class UnicodeSetTest extends TestFmwk { } } -// Following cod block is commented out to eliminate PrettyPrinter depenencies + // Following code block is commented out to eliminate PrettyPrinter dependencies -// String[] prettyData = { -// "[\\uD7DE-\\uD90C \\uDCB5-\\uDD9F]", // special case -// "[:any:]", -// "[:whitespace:]", -// "[:linebreak=AL:]", -// }; -// -// public void TestPrettyPrinting() { -// try{ -// PrettyPrinter pp = new PrettyPrinter(); -// -// int i = 0; -// for (; i < prettyData.length; ++i) { -// UnicodeSet test = new UnicodeSet(prettyData[i]); -// checkPrettySet(pp, i, test); -// } -// Random random = new Random(0); -// UnicodeSet test = new UnicodeSet(); -// -// // To keep runtimes under control, make the number of random test cases -// // to try depends on the test framework exhaustive setting. -// // params.inclusions = 5: default exhaustive value -// // params.inclusions = 10: max exhaustive value. -// int iterations = 50; -// if (params.inclusion > 5) { -// iterations = (params.inclusion-5) * 200; -// } -// for (; i < iterations; ++i) { -// double start = random.nextGaussian() * 0x10000; -// if (start < 0) start = - start; -// if (start > 0x10FFFF) { -// start = 0x10FFFF; -// } -// double end = random.nextGaussian() * 0x100; -// if (end < 0) end = -end; -// end = start + end; -// if (end > 0x10FFFF) { -// end = 0x10FFFF; -// } -// test.complement((int)start, (int)end); -// checkPrettySet(pp, i, test); -// } -// }catch(RuntimeException ex){ -// warnln("Could not load Collator"); -// } -// } -// -// private void checkPrettySet(PrettyPrinter pp, int i, UnicodeSet test) { -// String pretty = pp.toPattern(test); -// UnicodeSet retry = new UnicodeSet(pretty); -// if (!test.equals(retry)) { -// errln(i + ".
Failed test: " + test + " != " + pretty); -// } else { -// logln(i + ". Worked for " + truncate(test.toString()) + " => " + truncate(pretty)); -// } -// } -// -// private String truncate(String string) { -// if (string.length() <= 100) return string; -// return string.substring(0,97) + "..."; -// } + // String[] prettyData = { + // "[\\uD7DE-\\uD90C \\uDCB5-\\uDD9F]", // special case + // "[:any:]", + // "[:whitespace:]", + // "[:linebreak=AL:]", + // }; + // + // public void TestPrettyPrinting() { + // try{ + // PrettyPrinter pp = new PrettyPrinter(); + // + // int i = 0; + // for (; i < prettyData.length; ++i) { + // UnicodeSet test = new UnicodeSet(prettyData[i]); + // checkPrettySet(pp, i, test); + // } + // Random random = new Random(0); + // UnicodeSet test = new UnicodeSet(); + // + // // To keep runtimes under control, make the number of random test cases + // // to try depends on the test framework exhaustive setting. + // // params.inclusions = 5: default exhaustive value + // // params.inclusions = 10: max exhaustive value. + // int iterations = 50; + // if (params.inclusion > 5) { + // iterations = (params.inclusion-5) * 200; + // } + // for (; i < iterations; ++i) { + // double start = random.nextGaussian() * 0x10000; + // if (start < 0) start = - start; + // if (start > 0x10FFFF) { + // start = 0x10FFFF; + // } + // double end = random.nextGaussian() * 0x100; + // if (end < 0) end = -end; + // end = start + end; + // if (end > 0x10FFFF) { + // end = 0x10FFFF; + // } + // test.complement((int)start, (int)end); + // checkPrettySet(pp, i, test); + // } + // }catch(RuntimeException ex){ + // warnln("Could not load Collator"); + // } + // } + // + // private void checkPrettySet(PrettyPrinter pp, int i, UnicodeSet test) { + // String pretty = pp.toPattern(test); + // UnicodeSet retry = new UnicodeSet(pretty); + // if (!test.equals(retry)) { + // errln(i + ". Failed test: " + test + " != " + pretty); + // } else { + // logln(i + ". 
Worked for " + truncate(test.toString()) + " => " + truncate(pretty)); + // } + // } + // + // private String truncate(String string) { + // if (string.length() <= 100) return string; + // return string.substring(0,97) + "..."; + // } public class TokenSymbolTable implements SymbolTable { HashMap contents = new HashMap(); @@ -1944,7 +1951,7 @@ public class UnicodeSetTest extends TestFmwk { UnicodeSet set = new UnicodeSet(DATA[i]); expectContainment(set, CharsToUnicodeString("abc\\U00010000"), - "\uD800;\uDC00"); // split apart surrogate-pair + "\uD800;\uDC00"); // split apart surrogate-pair if (set.size() != 4) { errln(Utility.escape("FAIL: " + DATA[i] + ".size() == " + set.size() + ", expected 4")); @@ -2171,16 +2178,16 @@ public class UnicodeSetTest extends TestFmwk { // Now see if the expected relation is true int status = (minus12.size() != 0 ? 4 : 0) - | (intersection.size() != 0 ? 2 : 0) - | (minus21.size() != 0 ? 1 : 0); + | (intersection.size() != 0 ? 2 : 0) + | (minus21.size() != 0 ? 
1 : 0); if (status != relation) { errln("FAIL relation incorrect" + message + "; desired = " + RELATION_NAME[relation] - + "; found = " + RELATION_NAME[status] - + "; set1 = " + set1.toPattern(true) - + "; set2 = " + set2.toPattern(true) - ); + + "; found = " + RELATION_NAME[status] + + "; set1 = " + set1.toPattern(true) + + "; set2 = " + set2.toPattern(true) + ); } } @@ -2234,7 +2241,7 @@ public class UnicodeSetTest extends TestFmwk { errln("FAIL " + message + "; source = " + s.toPattern(true) + "; result = " + t.toPattern(true) - ); + ); return false; } return true; @@ -2379,7 +2386,7 @@ public class UnicodeSetTest extends TestFmwk { errln("UnicodeSetIterator.getSet() was not suppose to given an " + "an exception."); } } - + /* Tests the method public UnicodeSet add(Collection source) */ public void TestAddCollection() { UnicodeSet us = new UnicodeSet(); @@ -2390,9 +2397,99 @@ public class UnicodeSetTest extends TestFmwk { } catch (Exception e) { } } - + public void TestConstants() { assertEquals("Empty", new UnicodeSet(), UnicodeSet.EMPTY); assertEquals("All", new UnicodeSet(0,0x10FFFF), UnicodeSet.ALL_CODE_POINTS); } + + public void TestIteration() { + UnicodeSet us1 = new UnicodeSet("[abcM{xy}]"); + assertEquals("", "M, a-c", CollectionUtilities.join(us1.ranges(), ", ")); + + // Sample code + for (EntryRange range : us1.ranges()) { + // do something with code points between range.codepoint and range.codepointEnd; + } + for (String s : us1.strings()) { + // do something with each string; + } + + String[] tests = { + "[M-Qzab{XY}{ZW}]", + "[]", + "[a]", + "[a-c]", + "[{XY}]", + }; + for (String test : tests) { + UnicodeSet us = new UnicodeSet(test); + UnicodeSetIterator it = new UnicodeSetIterator(us); + for (EntryRange range : us.ranges()) { + final String title = range.toString(); + logln(title); + it.nextRange(); + assertEquals(title, it.codepoint, range.codepoint); + assertEquals(title, it.codepointEnd, range.codepointEnd); +// if (range.codepoint != -1)
{ +// } else { +// assertEquals(title, it.string, range.string); +// } + } + for (String s : us.strings()) { + it.nextRange(); + assertEquals("strings", it.string, s); + } + assertFalse("", it.next()); + } + } + + public void TestReplaceAndDelete() { + UnicodeSetSpanner m; + + m = new UnicodeSetSpanner(new UnicodeSet("[._]")); + assertEquals("", "abc", m.deleteFrom("_._a_._b_._c_._")); + assertEquals("", "_.__.__.__._", m.deleteFrom("_._a_._b_._c_._", SpanCondition.NOT_CONTAINED)); + + assertEquals("", "a_._b_._c", m.trim("_._a_._b_._c_._")); + assertEquals("", "a_._b_._c_._", m.trim("_._a_._b_._c_._", TrimOption.LEADING)); + assertEquals("", "_._a_._b_._c", m.trim("_._a_._b_._c_._", TrimOption.TRAILING)); + + assertEquals("", "a??b??c", m.replaceFrom("a_._b_._c", "??", Quantifier.SPAN)); + assertEquals("", "a??b??c", m.replaceFrom(m.trim("_._a_._b_._c_._"), "??", Quantifier.SPAN)); + assertEquals("", "XYXYXYaXYXYXYbXYXYXYcXYXYXY", m.replaceFrom("_._a_._b_._c_._", "XY")); + assertEquals("", "XYaXYbXYcXY", m.replaceFrom("_._a_._b_._c_._", "XY", Quantifier.SPAN)); + + m = new UnicodeSetSpanner(new UnicodeSet("\\p{uppercase}")); + assertEquals("", "TQBF", m.deleteFrom("The Quick Brown Fox.", SpanCondition.NOT_CONTAINED)); + + m = new UnicodeSetSpanner(m.getUnicodeSet().addAll(new UnicodeSet("\\p{lowercase}"))); + assertEquals("", "TheQuickBrownFox", m.deleteFrom("The Quick Brown Fox.", SpanCondition.NOT_CONTAINED)); + + m = new UnicodeSetSpanner(new UnicodeSet("[{ab}]")); + assertEquals("", "XXc acb", m.replaceFrom("ababc acb", "X")); + assertEquals("", "Xc acb", m.replaceFrom("ababc acb", "X", Quantifier.SPAN)); + } + + public void TestCodePoints() { + // test supplemental code points and strings clusters + checkCodePoints("x\u0308", "z\u0308", Quantifier.MIN_ELEMENTS, null, 1); + checkCodePoints("𣿡", "𣿢", Quantifier.MIN_ELEMENTS, null, 1); + checkCodePoints("👦", "👧", Quantifier.MIN_ELEMENTS, null, 1); + } + + private void checkCodePoints(String a, String b, 
Quantifier quantifier, String expectedReplaced, int expectedCount) { + final String ab = a+b; + UnicodeSetSpanner m = new UnicodeSetSpanner(new UnicodeSet("[{" + a + "}]")); + assertEquals("new UnicodeSetSpanner(\"[{" + a + "}]\").countIn(\"" + ab + "\")", + expectedCount, + m.countIn(ab, quantifier)); + + if (expectedReplaced == null) { + expectedReplaced = "-" + b; + } + assertEquals("new UnicodeSetSpanner(\"[{" + a + "}]\").replaceFrom(\"" + ab + "\", \"-\")", + expectedReplaced, m.replaceFrom(ab, "-", quantifier)); + } + }