From 8866e097b341e329930f8cebe2e8dec687c3d047 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Fri, 1 Sep 2017 19:32:35 +0000 Subject: [PATCH] ICU-11909 work in progress on FilteredBreakJ X-SVN-Rev: 40367 --- .../SimpleFilteredSentenceBreakIterator.java | 79 +++++++++----- .../ibm/icu/text/BreakIteratorFactory.java | 2 +- .../text/FilteredBreakIteratorBuilder.java | 101 +++++++++--------- .../icu/dev/test/rbbi/BreakIteratorTest.java | 28 ++--- 4 files changed, 119 insertions(+), 91 deletions(-) diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/SimpleFilteredSentenceBreakIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/SimpleFilteredSentenceBreakIterator.java index 52549364e0..375a318fbf 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/SimpleFilteredSentenceBreakIterator.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/SimpleFilteredSentenceBreakIterator.java @@ -10,11 +10,13 @@ package com.ibm.icu.impl; import java.text.CharacterIterator; import java.util.HashSet; +import java.util.Locale; import com.ibm.icu.impl.ICUResourceBundle.OpenType; import com.ibm.icu.text.BreakIterator; import com.ibm.icu.text.FilteredBreakIteratorBuilder; import com.ibm.icu.text.UCharacterIterator; +import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.BytesTrie; import com.ibm.icu.util.CharsTrie; import com.ibm.icu.util.CharsTrieBuilder; @@ -30,6 +32,7 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator { private UCharacterIterator text; // TODO(Tom): suffice to move into the local scope in next() ? private CharsTrie backwardsTrie; // i.e. ".srM" for Mrs. private CharsTrie forwardsPartialTrie; // Has ".a" for "a.M." + private UnicodeSet glueSet = null; /** * @param adoptBreakIterator @@ -38,12 +41,16 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator { * forward & partial char trie to adopt * @param backwardsTrie * backward trie to adopt + * @param glueSet the glue set to adopt */ public SimpleFilteredSentenceBreakIterator(BreakIterator adoptBreakIterator, CharsTrie forwardsPartialTrie, - CharsTrie backwardsTrie) { + CharsTrie backwardsTrie, UnicodeSet glueSet) { this.delegate = adoptBreakIterator; this.forwardsPartialTrie = forwardsPartialTrie; this.backwardsTrie = backwardsTrie; + if(!glueSet.isEmpty()) { + this.glueSet = new UnicodeSet(glueSet).freeze(); // copy + } } @@ -57,7 +64,7 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator { /** * Is there an exception at this point? * - * @param n + * @param n the location of the possible break * @return */ private final boolean breakExceptionAt(int n) { @@ -71,6 +78,8 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator { backwardsTrie.reset(); int uch; + + // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") if ((uch = text.previousCodePoint()) == ' ') { // TODO: skip a class of chars here?? // TODO only do this the 1st time? @@ -78,6 +87,17 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator { uch = text.nextCodePoint(); } + // Check for a glue character + if(this.glueSet != null && text.getIndex() filterSet = new HashSet(); + private HashSet filterSet = new HashSet(); static final int PARTIAL = (1 << 0); // < partial - need to run through forward trie static final int MATCH = (1 << 1); // < exact match - skip this one. static final int SuppressInReverse = (1 << 0); static final int AddToForward = (1 << 1); + private UnicodeSet glueSet = new UnicodeSet(); + + public Builder(Locale loc) { + this(ULocale.forLocale(loc)); + } /** * Create SimpleFilteredBreakIteratorBuilder using given locale * @param loc the locale to get filtered iterators @@ -300,28 +325,20 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator { * Create SimpleFilteredBreakIteratorBuilder with no exception */ public Builder() { - filterSet = new HashSet(); } @Override - public boolean suppressBreakAfter(String str) { - if (filterSet == null) { - filterSet = new HashSet(); - } + public boolean suppressBreakAfter(CharSequence str) { return filterSet.add(str); } @Override - public boolean unsuppressBreakAfter(String str) { - if (filterSet == null) { - return false; - } else { - return filterSet.remove(str); - } + public boolean unsuppressBreakAfter(CharSequence str) { + return filterSet.remove(str); } @Override - public BreakIterator build(BreakIterator adoptBreakIterator) { + public BreakIterator wrapIteratorWithFilter(BreakIterator adoptBreakIterator) { if( filterSet.isEmpty() ) { // Short circuit - nothing to except. return adoptBreakIterator; @@ -334,29 +351,30 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator { int fwdCount = 0; int subCount = filterSet.size(); - String[] ustrs = new String[subCount]; + CharSequence[] ustrs = new CharSequence[subCount]; int[] partials = new int[subCount]; CharsTrie backwardsTrie = null; // i.e. ".srM" for Mrs. CharsTrie forwardsPartialTrie = null; // Has ".a" for "a.M." int i = 0; - for (String s : filterSet) { + for (CharSequence s : filterSet) { ustrs[i] = s; // copy by value? partials[i] = 0; // default: no partial i++; } for (i = 0; i < subCount; i++) { - int nn = ustrs[i].indexOf('.'); // TODO: non-'.' abbreviations - if (nn > -1 && (nn + 1) != ustrs[i].length()) { + String thisStr = ustrs[i].toString(); // TODO: don't cast to String? + int nn = thisStr.indexOf('.'); // TODO: non-'.' abbreviations + if (nn > -1 && (nn + 1) != thisStr.length()) { // is partial. // is it unique? int sameAs = -1; for (int j = 0; j < subCount; j++) { if (j == i) continue; - if (ustrs[i].regionMatches(0, ustrs[j], 0, nn + 1)) { + if (thisStr.regionMatches(0, ustrs[j].toString() /* TODO */, 0, nn + 1)) { if (partials[j] == 0) { // hasn't been processed yet partials[j] = SuppressInReverse | AddToForward; } else if ((partials[j] & SuppressInReverse) != 0) { @@ -366,7 +384,7 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator { } if ((sameAs == -1) && (partials[i] == 0)) { - StringBuilder prefix = new StringBuilder(ustrs[i].substring(0, nn + 1)); + StringBuilder prefix = new StringBuilder(thisStr.substring(0, nn + 1)); // first one - add the prefix to the reverse table. prefix.reverse(); builder.add(prefix, PARTIAL); @@ -377,8 +395,9 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator { } for (i = 0; i < subCount; i++) { + final String thisStr = ustrs[i].toString(); // TODO if (partials[i] == 0) { - StringBuilder reversed = new StringBuilder(ustrs[i]).reverse(); + StringBuilder reversed = new StringBuilder(thisStr).reverse(); builder.add(reversed, MATCH); revCount++; } else { @@ -387,7 +406,7 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator { // forward, // instead of "Ph.D." since we already know the "Ph." part is a match. // would need the trie to be able to hold 0-length strings, though. - builder2.add(ustrs[i], MATCH); // forward + builder2.add(thisStr, MATCH); // forward fwdCount++; } } @@ -399,7 +418,19 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator { if (fwdCount > 0) { forwardsPartialTrie = builder2.build(StringTrieBuilder.Option.FAST); } - return new SimpleFilteredSentenceBreakIterator(adoptBreakIterator, forwardsPartialTrie, backwardsTrie); + return new SimpleFilteredSentenceBreakIterator(adoptBreakIterator, forwardsPartialTrie, backwardsTrie, glueSet); + } + + /* (non-Javadoc) + * @see com.ibm.icu.text.FilteredBreakIteratorBuilder#setGlueCharacters(com.ibm.icu.text.UnicodeSet) + * @internal + */ + public void setGlueCharacters(UnicodeSet set) { + if (set == null || set.isEmpty()) { + glueSet.clear(); + } else { + glueSet.set(set); + } } } } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java b/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java index e08e33594a..f46ede1100 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java @@ -168,7 +168,7 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim final String ssKeyword = locale.getKeywordValue("ss"); if (ssKeyword != null && ssKeyword.equals("standard")) { final ULocale base = new ULocale(locale.getBaseName()); - return FilteredBreakIteratorBuilder.createInstance(base).build(iter); + return FilteredBreakIteratorBuilder.getInstance(base).wrapIteratorWithFilter(iter); } } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/FilteredBreakIteratorBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/FilteredBreakIteratorBuilder.java index a265942a04..081ab393c3 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/FilteredBreakIteratorBuilder.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/FilteredBreakIteratorBuilder.java @@ -8,6 +8,8 @@ */ package com.ibm.icu.text; +import java.util.Locale; + import com.ibm.icu.impl.SimpleFilteredSentenceBreakIterator; import com.ibm.icu.util.ULocale; @@ -19,29 +21,16 @@ import com.ibm.icu.util.ULocale; * in the string "Mr. Smith" (resulting in two segments), * but with "Mr." as an exception, a filtered break iterator * would consider the string "Mr. Smith" to be a single segment. - * - *

Note: An instance of {@link BreakIterator} returned by this builder - * class currently does not support following operations in this technology preview - * version: - *

    - *
  • {@link BreakIterator#next(int) next(int n)}
  • - *
  • {@link BreakIterator#previous() previous()}
  • - *
  • {@link BreakIterator#following(int) following(int offset)}
  • - *
  • {@link BreakIterator#preceding(int) preceding(int offset)}
  • - *
- * When one of above methods is called, {@link UnsupportedOperationException} will be - * thrown. * - * @author tomzhang - * - * @internal ICU 54 technology preview - * @deprecated This API might change or be removed in a future release. + *

This class is not intended for public subclassing. + * + * @draft ICU 60 + * @provisional This API might change or be removed in a future release. */ -@Deprecated public abstract class FilteredBreakIteratorBuilder { /** - * Construct a FilteredBreakIteratorBuilder based on rules in a locale. + * Construct a FilteredBreakIteratorBuilder based on sentence break exception rules in a locale. * The rules are taken from CLDR exception data for the locale, * see http://www.unicode.org/reports/tr35/tr35-general.html#Segmentation_Exceptions * This is the equivalent of calling createInstance(UErrorCode&) @@ -49,26 +38,38 @@ public abstract class FilteredBreakIteratorBuilder { * of the CLDR exception data. * @param where the locale. * @return the new builder - * @internal ICU 54 technology preview - * @deprecated This API might change or be removed in a future release. + * @draft ICU 60 + * @provisional This API might change or be removed in a future release. */ - @Deprecated - public static FilteredBreakIteratorBuilder createInstance(ULocale where) { - FilteredBreakIteratorBuilder ret = new SimpleFilteredSentenceBreakIterator.Builder(where); - return ret; + public static final FilteredBreakIteratorBuilder getInstance(Locale where) { + return new SimpleFilteredSentenceBreakIterator.Builder(where); + } + + /** + * Construct a FilteredBreakIteratorBuilder based on sentence break exception rules in a locale. + * The rules are taken from CLDR exception data for the locale, + * see http://www.unicode.org/reports/tr35/tr35-general.html#Segmentation_Exceptions + * This is the equivalent of calling createInstance(UErrorCode&) + * and then repeatedly calling addNoBreakAfter(...) with the contents + * of the CLDR exception data. + * @param where the locale. + * @return the new builder + * @draft ICU 60 + * @provisional This API might change or be removed in a future release. + */ + public static final FilteredBreakIteratorBuilder getInstance(ULocale where) { + return new SimpleFilteredSentenceBreakIterator.Builder(where); } /** * Construct an empty FilteredBreakIteratorBuilder. * In this state, it will not suppress any segment boundaries. * @return the new builder - * @internal ICU 54 technology preview - * @deprecated This API might change or be removed in a future release. + * @draft ICU 60 + * @provisional This API might change or be removed in a future release. */ - @Deprecated - public static FilteredBreakIteratorBuilder createInstance() { - FilteredBreakIteratorBuilder ret = new SimpleFilteredSentenceBreakIterator.Builder(); - return ret; + public static final FilteredBreakIteratorBuilder getEmptyInstance() { + return new SimpleFilteredSentenceBreakIterator.Builder(); } /** @@ -76,13 +77,12 @@ public abstract class FilteredBreakIteratorBuilder { * For example, suppressing "Mr.", then segments ending in "Mr." will not be returned * by the iterator. * @param str the string to suppress, such as "Mr." - * @return returns true if the string was not present and now added, + * @return true if the string was not present and now added, * false if the call was a no-op because the string was already being suppressed. - * @internal ICU 54 technology preview - * @deprecated This API might change or be removed in a future release. + * @draft ICU 60 + * @provisional This API might change or be removed in a future release. */ - @Deprecated - public abstract boolean suppressBreakAfter(String str); + public abstract boolean suppressBreakAfter(CharSequence str); /** * Stop suppressing a certain string from being the end of the segment. @@ -90,34 +90,31 @@ public abstract class FilteredBreakIteratorBuilder { * the effect of earlier calls to suppressBreakAfter, or to un-do the effect of * locale data which may be suppressing certain strings. * @param str the str the string to unsuppress, such as "Mr." - * @return returns true if the string was present and now removed, + * @return true if the string was present and now removed, * false if the call was a no-op because the string was not being suppressed. - * @internal ICU 54 technology preview - * @deprecated This API might change or be removed in a future release. + * @draft ICU 60 + * @provisional This API might change or be removed in a future release. */ - @Deprecated - public abstract boolean unsuppressBreakAfter(String str); + public abstract boolean unsuppressBreakAfter(CharSequence str); /** * Wrap (adopt) an existing break iterator in a new filtered instance. - * The resulting BreakIterator is owned by the caller. - * The BreakIteratorFilter may be destroyed before the BreakIterator is destroyed. - * Note that the adoptBreakIterator is adopted by the new BreakIterator + * Note that the wrappedBreakIterator is adopted by the new BreakIterator * and should no longer be used by the caller. * The FilteredBreakIteratorBuilder may be reused. - * @param adoptBreakIterator the break iterator to adopt - * @return the new BreakIterator, owned by the caller. - * @internal ICU 54 technology preview - * @deprecated This API might change or be removed in a future release. + * @param wrappedBreakIterator the break iterator to wrap + * @return the new BreakIterator + * @draft ICU 60 + * @provisional This API might change or be removed in a future release. */ - @Deprecated - public abstract BreakIterator build(BreakIterator adoptBreakIterator); + public abstract BreakIterator wrapIteratorWithFilter(BreakIterator wrappedBreakIterator); /** * For subclass use - * @internal ICU 54 technology preview - * @deprecated This API might change or be removed in a future release. + * @internal + * @deprecated internal to ICU */ @Deprecated - protected FilteredBreakIteratorBuilder() {} + protected FilteredBreakIteratorBuilder() { + } } \ No newline at end of file diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java index 3e497ecf23..ce607c577f 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java @@ -427,8 +427,8 @@ public class BreakIteratorTest extends TestFmwk public void TestFilteredJapanese() { ULocale loc = ULocale.JAPANESE; BreakIterator brk = FilteredBreakIteratorBuilder - .createInstance(loc) - .build(BreakIterator.getSentenceInstance(loc)); + .getInstance(loc) + .wrapIteratorWithFilter(BreakIterator.getSentenceInstance(loc)); brk.setText("OKです。"); assertEquals("Starting point", 0, brk.current()); assertEquals("Next point", 5, brk.next()); @@ -513,20 +513,20 @@ public class BreakIteratorTest extends TestFmwk { logln("Constructing empty builder\n"); - builder = FilteredBreakIteratorBuilder.createInstance(); + builder = FilteredBreakIteratorBuilder.getEmptyInstance(); logln("Constructing base BI\n"); baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH); logln("Building new BI\n"); - filteredBI = builder.build(baseBI); + filteredBI = builder.wrapIteratorWithFilter(baseBI); assertDefaultBreakBehavior(filteredBI, text); } { logln("Constructing empty builder\n"); - builder = FilteredBreakIteratorBuilder.createInstance(); + builder = FilteredBreakIteratorBuilder.getEmptyInstance(); logln("Adding Mr. as an exception\n"); @@ -540,7 +540,7 @@ public class BreakIteratorTest extends TestFmwk baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH); logln("Building new BI\n"); - filteredBI = builder.build(baseBI); + filteredBI = builder.wrapIteratorWithFilter(baseBI); logln("Testing:"); filteredBI.setText(text); @@ -553,7 +553,7 @@ public class BreakIteratorTest extends TestFmwk { logln("Constructing empty builder\n"); - builder = FilteredBreakIteratorBuilder.createInstance(); + builder = FilteredBreakIteratorBuilder.getEmptyInstance(); logln("Adding Mr. and Capt as an exception\n"); assertEquals("3.1 suppressBreakAfter", true, builder.suppressBreakAfter(ABBR_MR)); @@ -563,7 +563,7 @@ public class BreakIteratorTest extends TestFmwk baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH); logln("Building new BI\n"); - filteredBI = builder.build(baseBI); + filteredBI = builder.wrapIteratorWithFilter(baseBI); logln("Testing:"); filteredBI.setText(text); @@ -574,7 +574,7 @@ public class BreakIteratorTest extends TestFmwk { logln("Constructing English builder\n"); - builder = FilteredBreakIteratorBuilder.createInstance(ULocale.ENGLISH); + builder = FilteredBreakIteratorBuilder.getInstance(ULocale.ENGLISH); logln("Constructing base BI\n"); baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH); @@ -583,7 +583,7 @@ public class BreakIteratorTest extends TestFmwk assertEquals("1st suppressBreakAfter", true, builder.unsuppressBreakAfter(ABBR_CAPT)); logln("Building new BI\n"); - filteredBI = builder.build(baseBI); + filteredBI = builder.wrapIteratorWithFilter(baseBI); if(filteredBI != null) { logln("Testing:"); @@ -597,13 +597,13 @@ public class BreakIteratorTest extends TestFmwk { logln("Constructing English builder\n"); - builder = FilteredBreakIteratorBuilder.createInstance(ULocale.ENGLISH); + builder = FilteredBreakIteratorBuilder.getInstance(ULocale.ENGLISH); logln("Constructing base BI\n"); baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH); logln("Building new BI\n"); - filteredBI = builder.build(baseBI); + filteredBI = builder.wrapIteratorWithFilter(baseBI); if(filteredBI != null) { assertEnglishBreakBehavior(filteredBI, text); @@ -641,13 +641,13 @@ public class BreakIteratorTest extends TestFmwk { logln("Constructing French builder"); - builder = FilteredBreakIteratorBuilder.createInstance(ULocale.FRENCH); + builder = FilteredBreakIteratorBuilder.getInstance(ULocale.FRENCH); logln("Constructing base BI\n"); baseBI = BreakIterator.getSentenceInstance(Locale.FRENCH); logln("Building new BI\n"); - filteredBI = builder.build(baseBI); + filteredBI = builder.wrapIteratorWithFilter(baseBI); if(filteredBI != null) { assertFrenchBreakBehavior(filteredBI, text);