diff --git a/icu4j/build.xml b/icu4j/build.xml
index cbe4adf313..6065ed5246 100644
--- a/icu4j/build.xml
+++ b/icu4j/build.xml
@@ -194,7 +194,7 @@
diff --git a/icu4j/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java b/icu4j/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java
index 4eeafb27f6..adc9b80eb5 100755
--- a/icu4j/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java
@@ -8,6 +8,7 @@ package com.ibm.icu.dev.test.rbbi;
import com.ibm.icu.dev.test.*;
import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator_Old;
import java.text.StringCharacterIterator;
import java.util.Locale;
import java.util.Vector;
@@ -385,85 +386,91 @@ public class BreakIteratorTest extends TestFmwk
//=========================================================================
public void TestWordBreak() {
- Vector wordSelectionData = new Vector();
-
- wordSelectionData.addElement("12,34");
-
- wordSelectionData.addElement(" ");
- wordSelectionData.addElement("\u00A2"); //cent sign
- wordSelectionData.addElement("\u00A3"); //pound sign
- wordSelectionData.addElement("\u00A4"); //currency sign
- wordSelectionData.addElement("\u00A5"); //yen sign
- wordSelectionData.addElement("alpha-beta-gamma");
- wordSelectionData.addElement(".");
- wordSelectionData.addElement(" ");
- wordSelectionData.addElement("Badges");
- wordSelectionData.addElement("?");
- wordSelectionData.addElement(" ");
- wordSelectionData.addElement("BADGES");
- wordSelectionData.addElement("!");
- wordSelectionData.addElement("?");
- wordSelectionData.addElement("!");
- wordSelectionData.addElement(" ");
- wordSelectionData.addElement("We");
- wordSelectionData.addElement(" ");
- wordSelectionData.addElement("don't");
- wordSelectionData.addElement(" ");
- wordSelectionData.addElement("need");
- wordSelectionData.addElement(" ");
- wordSelectionData.addElement("no");
- wordSelectionData.addElement(" ");
- wordSelectionData.addElement("STINKING");
- wordSelectionData.addElement(" ");
- wordSelectionData.addElement("BADGES");
- wordSelectionData.addElement("!");
- wordSelectionData.addElement("!");
- wordSelectionData.addElement("!");
-
- wordSelectionData.addElement("012.566,5");
- wordSelectionData.addElement(" ");
- wordSelectionData.addElement("123.3434,900");
- wordSelectionData.addElement(" ");
- wordSelectionData.addElement("1000,233,456.000");
- wordSelectionData.addElement(" ");
- wordSelectionData.addElement("1,23.322%");
- wordSelectionData.addElement(" ");
- wordSelectionData.addElement("123.1222");
-
- wordSelectionData.addElement(" ");
- wordSelectionData.addElement("\u0024123,000.20");
-
- wordSelectionData.addElement(" ");
- wordSelectionData.addElement("179.01\u0025");
-
- wordSelectionData.addElement("Hello");
- wordSelectionData.addElement(",");
- wordSelectionData.addElement(" ");
- wordSelectionData.addElement("how");
- wordSelectionData.addElement(" ");
- wordSelectionData.addElement("are");
- wordSelectionData.addElement(" ");
- wordSelectionData.addElement("you");
- wordSelectionData.addElement(" ");
- wordSelectionData.addElement("X");
- wordSelectionData.addElement(" ");
-
- wordSelectionData.addElement("Now");
- wordSelectionData.addElement("\r");
- wordSelectionData.addElement("is");
- wordSelectionData.addElement("\n");
- wordSelectionData.addElement("the");
- wordSelectionData.addElement("\r\n");
- wordSelectionData.addElement("time");
- wordSelectionData.addElement("\n");
- wordSelectionData.addElement("\r");
- wordSelectionData.addElement("for");
- wordSelectionData.addElement("\r");
- wordSelectionData.addElement("\r");
- wordSelectionData.addElement("all");
- wordSelectionData.addElement(" ");
-
- generalIteratorTest(wordBreak, wordSelectionData);
+ try {
+ RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)wordBreak;
+ Vector wordSelectionData = new Vector();
+
+ wordSelectionData.addElement("12,34");
+
+ wordSelectionData.addElement(" ");
+ wordSelectionData.addElement("\u00A2"); //cent sign
+ wordSelectionData.addElement("\u00A3"); //pound sign
+ wordSelectionData.addElement("\u00A4"); //currency sign
+ wordSelectionData.addElement("\u00A5"); //yen sign
+ wordSelectionData.addElement("alpha-beta-gamma");
+ wordSelectionData.addElement(".");
+ wordSelectionData.addElement(" ");
+ wordSelectionData.addElement("Badges");
+ wordSelectionData.addElement("?");
+ wordSelectionData.addElement(" ");
+ wordSelectionData.addElement("BADGES");
+ wordSelectionData.addElement("!");
+ wordSelectionData.addElement("?");
+ wordSelectionData.addElement("!");
+ wordSelectionData.addElement(" ");
+ wordSelectionData.addElement("We");
+ wordSelectionData.addElement(" ");
+ wordSelectionData.addElement("don't");
+ wordSelectionData.addElement(" ");
+ wordSelectionData.addElement("need");
+ wordSelectionData.addElement(" ");
+ wordSelectionData.addElement("no");
+ wordSelectionData.addElement(" ");
+ wordSelectionData.addElement("STINKING");
+ wordSelectionData.addElement(" ");
+ wordSelectionData.addElement("BADGES");
+ wordSelectionData.addElement("!");
+ wordSelectionData.addElement("!");
+ wordSelectionData.addElement("!");
+
+ wordSelectionData.addElement("012.566,5");
+ wordSelectionData.addElement(" ");
+ wordSelectionData.addElement("123.3434,900");
+ wordSelectionData.addElement(" ");
+ wordSelectionData.addElement("1000,233,456.000");
+ wordSelectionData.addElement(" ");
+ wordSelectionData.addElement("1,23.322%");
+ wordSelectionData.addElement(" ");
+ wordSelectionData.addElement("123.1222");
+
+ wordSelectionData.addElement(" ");
+ wordSelectionData.addElement("\u0024123,000.20");
+
+ wordSelectionData.addElement(" ");
+ wordSelectionData.addElement("179.01\u0025");
+
+ wordSelectionData.addElement("Hello");
+ wordSelectionData.addElement(",");
+ wordSelectionData.addElement(" ");
+ wordSelectionData.addElement("how");
+ wordSelectionData.addElement(" ");
+ wordSelectionData.addElement("are");
+ wordSelectionData.addElement(" ");
+ wordSelectionData.addElement("you");
+ wordSelectionData.addElement(" ");
+ wordSelectionData.addElement("X");
+ wordSelectionData.addElement(" ");
+
+ wordSelectionData.addElement("Now");
+ wordSelectionData.addElement("\r");
+ wordSelectionData.addElement("is");
+ wordSelectionData.addElement("\n");
+ wordSelectionData.addElement("the");
+ wordSelectionData.addElement("\r\n");
+ wordSelectionData.addElement("time");
+ wordSelectionData.addElement("\n");
+ wordSelectionData.addElement("\r");
+ wordSelectionData.addElement("for");
+ wordSelectionData.addElement("\r");
+ wordSelectionData.addElement("\r");
+ wordSelectionData.addElement("all");
+ wordSelectionData.addElement(" ");
+
+ generalIteratorTest(wordBreak, wordSelectionData);
+ }
+ catch (ClassCastException e) {
+ logln("New Break Iterator, skipping old test");
+ }
}
/**
@@ -514,62 +521,81 @@ public class BreakIteratorTest extends TestFmwk
* @bug 4117554
*/
public void TestBug4117554Words() {
- Vector wordSelectionData = new Vector();
-
- // this is a test for bug #4117554: the ideographic iteration mark (U+3005) should
- // count as a Kanji character for the purposes of word breaking
- wordSelectionData.addElement("abc");
- wordSelectionData.addElement("\u4e01\u4e02\u3005\u4e03\u4e03");
- wordSelectionData.addElement("abc");
-
- generalIteratorTest(wordBreak, wordSelectionData);
+ try {
+ RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)wordBreak;
+ Vector wordSelectionData = new Vector();
+
+ // this is a test for bug #4117554: the ideographic iteration mark (U+3005) should
+ // count as a Kanji character for the purposes of word breaking
+ wordSelectionData.addElement("abc");
+ wordSelectionData.addElement("\u4e01\u4e02\u3005\u4e03\u4e03");
+ wordSelectionData.addElement("abc");
+
+ generalIteratorTest(wordBreak, wordSelectionData);
+ }
+ catch (ClassCastException e) {
+ logln("New Break Iterator, skipping old test");
+ }
}
public void TestSentenceBreak() {
- Vector sentenceSelectionData = new Vector();
-
- sentenceSelectionData.addElement("This is a simple sample sentence. ");
- sentenceSelectionData.addElement("(This is it.) ");
- sentenceSelectionData.addElement("This is a simple sample sentence. ");
- sentenceSelectionData.addElement("\"This isn\'t it.\" ");
- sentenceSelectionData.addElement("Hi! ");
- sentenceSelectionData.addElement("This is a simple sample sentence. ");
- sentenceSelectionData.addElement("It does not have to make any sense as you can see. ");
- sentenceSelectionData.addElement("Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ");
- sentenceSelectionData.addElement("Che la dritta via aveo smarrita. ");
- sentenceSelectionData.addElement("He said, that I said, that you said!! ");
-
- sentenceSelectionData.addElement("Don't rock the boat.\u2029");
-
- sentenceSelectionData.addElement("Because I am the daddy, that is why. ");
- sentenceSelectionData.addElement("Not on my time (el timo.)! ");
-
- sentenceSelectionData.addElement("So what!!\u2029");
-
- sentenceSelectionData.addElement("\"But now,\" he said, \"I know!\" ");
- sentenceSelectionData.addElement("Harris thumbed down several, including \"Away We Go\" (which became the huge success Oklahoma!). ");
- sentenceSelectionData.addElement("One species, B. anthracis, is highly virulent.\n");
- sentenceSelectionData.addElement("Wolf said about Sounder:\"Beautifully thought-out and directed.\" ");
- sentenceSelectionData.addElement("Have you ever said, \"This is where \tI shall live\"? ");
- sentenceSelectionData.addElement("He answered, \"You may not!\" ");
- sentenceSelectionData.addElement("Another popular saying is: \"How do you do?\". ");
- sentenceSelectionData.addElement("Yet another popular saying is: \'I\'m fine thanks.\' ");
- sentenceSelectionData.addElement("What is the proper use of the abbreviation pp.? ");
- sentenceSelectionData.addElement("Yes, I am definatelly 12\" tall!!");
-
- generalIteratorTest(sentenceBreak, sentenceSelectionData);
- }
+ try {
+ RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)sentenceBreak;
+ Vector sentenceSelectionData = new Vector();
+
+ sentenceSelectionData.addElement("This is a simple sample sentence. ");
+ sentenceSelectionData.addElement("(This is it.) ");
+ sentenceSelectionData.addElement("This is a simple sample sentence. ");
+ sentenceSelectionData.addElement("\"This isn\'t it.\" ");
+ sentenceSelectionData.addElement("Hi! ");
+ sentenceSelectionData.addElement("This is a simple sample sentence. ");
+ sentenceSelectionData.addElement("It does not have to make any sense as you can see. ");
+ sentenceSelectionData.addElement("Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ");
+ sentenceSelectionData.addElement("Che la dritta via aveo smarrita. ");
+ sentenceSelectionData.addElement("He said, that I said, that you said!! ");
+
+ sentenceSelectionData.addElement("Don't rock the boat.\u2029");
+
+ sentenceSelectionData.addElement("Because I am the daddy, that is why. ");
+ sentenceSelectionData.addElement("Not on my time (el timo.)! ");
+
+ sentenceSelectionData.addElement("So what!!\u2029");
+
+ sentenceSelectionData.addElement("\"But now,\" he said, \"I know!\" ");
+ sentenceSelectionData.addElement("Harris thumbed down several, including \"Away We Go\" (which became the huge success Oklahoma!). ");
+ sentenceSelectionData.addElement("One species, B. anthracis, is highly virulent.\n");
+ sentenceSelectionData.addElement("Wolf said about Sounder:\"Beautifully thought-out and directed.\" ");
+ sentenceSelectionData.addElement("Have you ever said, \"This is where \tI shall live\"? ");
+ sentenceSelectionData.addElement("He answered, \"You may not!\" ");
+ sentenceSelectionData.addElement("Another popular saying is: \"How do you do?\". ");
+ sentenceSelectionData.addElement("Yet another popular saying is: \'I\'m fine thanks.\' ");
+ sentenceSelectionData.addElement("What is the proper use of the abbreviation pp.? ");
+ sentenceSelectionData.addElement("Yes, I am definatelly 12\" tall!!");
+
+ generalIteratorTest(sentenceBreak, sentenceSelectionData);
+ }
+ catch (ClassCastException e) {
+ logln("New Break Iterator, skipping old test");
+ }
+}
/**
* @bug 4113835
*/
public void TestBug4113835() {
- Vector sentenceSelectionData = new Vector();
-
- // test for bug #4113835: \n and \r count as spaces, not as paragraph breaks
- sentenceSelectionData.addElement("Now\ris\nthe\r\ntime\n\rfor\r\rall\u2029");
-
- generalIteratorTest(sentenceBreak, sentenceSelectionData);
+ try {
+ RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)sentenceBreak;
+
+ Vector sentenceSelectionData = new Vector();
+
+ // test for bug #4113835: \n and \r count as spaces, not as paragraph breaks
+ sentenceSelectionData.addElement("Now\ris\nthe\r\ntime\n\rfor\r\rall\u2029");
+
+ generalIteratorTest(sentenceBreak, sentenceSelectionData);
+ }
+ catch (ClassCastException e) {
+ logln("New Break Iterator, skipping old test");
+ }
}
/**
@@ -598,46 +624,58 @@ public class BreakIteratorTest extends TestFmwk
* @bug 4117554
*/
public void TestBug4117554Sentences() {
- Vector sentenceSelectionData = new Vector();
-
- // Treat fullwidth variants of .!? the same as their
- // normal counterparts
- sentenceSelectionData.addElement("I know I'm right\uff0e ");
- sentenceSelectionData.addElement("Right\uff1f ");
- sentenceSelectionData.addElement("Right\uff01 ");
-
- // Don't break sentences at boundary between CJK and digits
- sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8"
- + "\u97e48888\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0"
- + "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
-
- // Break sentence between a sentence terminator and
- // opening punctuation
- sentenceSelectionData.addElement("no?");
- sentenceSelectionData.addElement("(yes)");
-
- generalIteratorTest(sentenceBreak, sentenceSelectionData);
+ try {
+ RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)sentenceBreak;
+ Vector sentenceSelectionData = new Vector();
+
+ // Treat fullwidth variants of .!? the same as their
+ // normal counterparts
+ sentenceSelectionData.addElement("I know I'm right\uff0e ");
+ sentenceSelectionData.addElement("Right\uff1f ");
+ sentenceSelectionData.addElement("Right\uff01 ");
+
+ // Don't break sentences at boundary between CJK and digits
+ sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8"
+ + "\u97e48888\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0"
+ + "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
+
+ // Break sentence between a sentence terminator and
+ // opening punctuation
+ sentenceSelectionData.addElement("no?");
+ sentenceSelectionData.addElement("(yes)");
+
+ generalIteratorTest(sentenceBreak, sentenceSelectionData);
+ }
+ catch (ClassCastException e) {
+ logln("New Break Iterator, skipping old test");
+ }
}
/**
* @bug 4158381
*/
public void TestBug4158381() {
- Vector sentenceSelectionData = new Vector();
-
- // Don't break sentence after period if it isn't followed by a space
- sentenceSelectionData.addElement("Test Flags.Flag
class. ");
- sentenceSelectionData.addElement("Another test.\u2029");
-
- // No breaks when there are no terminators around
- sentenceSelectionData.addElement("
Provides a set of "
- + ""lightweight" (all-javaTM"
- + " language) components that, "
- + "to the maximum degree possible, work the same on all platforms. ");
- sentenceSelectionData.addElement("Another test.\u2029");
-
- generalIteratorTest(sentenceBreak, sentenceSelectionData);
- }
+ try {
+ RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)sentenceBreak;
+ Vector sentenceSelectionData = new Vector();
+
+ // Don't break sentence after period if it isn't followed by a space
+ sentenceSelectionData.addElement("Test Flags.Flag
class. ");
+ sentenceSelectionData.addElement("Another test.\u2029");
+
+ // No breaks when there are no terminators around
+ sentenceSelectionData.addElement("
Provides a set of "
+ + ""lightweight" (all-javaTM"
+ + " language) components that, "
+ + "to the maximum degree possible, work the same on all platforms. ");
+ sentenceSelectionData.addElement("Another test.\u2029");
+
+ generalIteratorTest(sentenceBreak, sentenceSelectionData);
+ }
+ catch (ClassCastException e) {
+ logln("New Break Iterator, skipping old test");
+ }
+}
/**
* @bug 4143071
@@ -767,7 +805,6 @@ public class BreakIteratorTest extends TestFmwk
lineSelectionData.addElement("mouse ");
lineSelectionData.addElement("(one)");
lineSelectionData.addElement("(two)\n");
-
generalIteratorTest(lineBreak, lineSelectionData);
}
@@ -775,40 +812,52 @@ public class BreakIteratorTest extends TestFmwk
* @bug 4035266
*/
public void TestBug4035266() {
- Vector lineSelectionData = new Vector();
-
- lineSelectionData.addElement("The ");
- lineSelectionData.addElement("balance ");
- lineSelectionData.addElement("is ");
- lineSelectionData.addElement("$-23,456.78, ");
- lineSelectionData.addElement("not ");
- lineSelectionData.addElement("-$32,456.78!\n");
-
- generalIteratorTest(lineBreak, lineSelectionData);
- }
+ try {
+ RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)lineBreak;
+ Vector lineSelectionData = new Vector();
+
+ lineSelectionData.addElement("The ");
+ lineSelectionData.addElement("balance ");
+ lineSelectionData.addElement("is ");
+ lineSelectionData.addElement("$-23,456.78, ");
+ lineSelectionData.addElement("not ");
+ lineSelectionData.addElement("-$32,456.78!\n");
+
+ generalIteratorTest(lineBreak, lineSelectionData);
+ }
+ catch (ClassCastException e) {
+ logln("New Break Iterator, skipping old test");
+ }
+}
/**
* @bug 4098467
*/
public void TestBug4098467Lines() {
- Vector lineSelectionData = new Vector();
-
- // What follows is a string of Korean characters (I found it in the Yellow Pages
- // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
- // it correctly), first as precomposed syllables, and then as conjoining jamo.
- // Both sequences should be semantically identical and break the same way.
- // precomposed syllables...
- lineSelectionData.addElement("\uc0c1\ud56d ");
- lineSelectionData.addElement("\ud55c\uc778 ");
- lineSelectionData.addElement("\uc5f0\ud569 ");
- lineSelectionData.addElement("\uc7a5\ub85c\uad50\ud68c ");
- // conjoining jamo...
- lineSelectionData.addElement("\u1109\u1161\u11bc\u1112\u1161\u11bc ");
- lineSelectionData.addElement("\u1112\u1161\u11ab\u110b\u1175\u11ab ");
- lineSelectionData.addElement("\u110b\u1167\u11ab\u1112\u1161\u11b8 ");
- lineSelectionData.addElement("\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c");
-
- generalIteratorTest(lineBreak, lineSelectionData);
+ try {
+ RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)lineBreak;
+ Vector lineSelectionData = new Vector();
+
+ // What follows is a string of Korean characters (I found it in the Yellow Pages
+ // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
+ // it correctly), first as precomposed syllables, and then as conjoining jamo.
+ // Both sequences should be semantically identical and break the same way.
+ // precomposed syllables...
+ lineSelectionData.addElement("\uc0c1\ud56d ");
+ lineSelectionData.addElement("\ud55c\uc778 ");
+ lineSelectionData.addElement("\uc5f0\ud569 ");
+ lineSelectionData.addElement("\uc7a5\ub85c\uad50\ud68c ");
+ // conjoining jamo...
+ lineSelectionData.addElement("\u1109\u1161\u11bc\u1112\u1161\u11bc ");
+ lineSelectionData.addElement("\u1112\u1161\u11ab\u110b\u1175\u11ab ");
+ lineSelectionData.addElement("\u110b\u1167\u11ab\u1112\u1161\u11b8 ");
+ lineSelectionData.addElement("\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c");
+
+ generalIteratorTest(lineBreak, lineSelectionData);
+ }
+ catch (ClassCastException e) {
+ logln("New Break Iterator, skipping old test");
+ }
}
public void TestThaiLineBreak() {
@@ -949,22 +998,28 @@ lineSelectionData.addElement("(\u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e44\u0e17\
* @bug 4217703
*/
public void TestBug4217703() {
- Vector lineSelectionData = new Vector();
-
- // There shouldn't be a line break between sentence-ending punctuation
- // and a closing quote
- lineSelectionData.addElement("He ");
- lineSelectionData.addElement("said ");
- lineSelectionData.addElement("\"Go!\" ");
- lineSelectionData.addElement("I ");
- lineSelectionData.addElement("went. ");
-
- lineSelectionData.addElement("Hashtable$Enumeration ");
- lineSelectionData.addElement("getText().");
- lineSelectionData.addElement("getIndex()");
-
- generalIteratorTest(lineBreak, lineSelectionData);
- }
+ try {
+ RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)lineBreak;
+ Vector lineSelectionData = new Vector();
+
+ // There shouldn't be a line break between sentence-ending punctuation
+ // and a closing quote
+ lineSelectionData.addElement("He ");
+ lineSelectionData.addElement("said ");
+ lineSelectionData.addElement("\"Go!\" ");
+ lineSelectionData.addElement("I ");
+ lineSelectionData.addElement("went. ");
+
+ lineSelectionData.addElement("Hashtable$Enumeration ");
+ lineSelectionData.addElement("getText().");
+ lineSelectionData.addElement("getIndex()");
+
+ generalIteratorTest(lineBreak, lineSelectionData);
+ }
+ catch (ClassCastException e) {
+ logln("New Break Iterator, skipping old test");
+ }
+}
private static final String graveS = "S\u0300";
private static final String acuteBelowI = "i\u0317";
@@ -1091,18 +1146,24 @@ lineSelectionData.addElement("(\u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e44\u0e17\
}
public void TestBug4146175Sentences() {
- Vector sentenceSelectionData = new Vector();
-
- // break between periods and opening punctuation even when there's no
- // intervening space
- sentenceSelectionData.addElement("end.");
- sentenceSelectionData.addElement("(This is\u2029");
-
- // treat the fullwidth period as an unambiguous sentence terminator
- sentenceSelectionData.addElement("\u7d42\u308f\u308a\uff0e");
- sentenceSelectionData.addElement("\u300c\u3053\u308c\u306f");
-
- generalIteratorTest(sentenceBreak, sentenceSelectionData);
+ try {
+ RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)sentenceBreak; // guard the iterator this test actually exercises (was lineBreak)
+ Vector sentenceSelectionData = new Vector();
+
+ // break between periods and opening punctuation even when there's no
+ // intervening space
+ sentenceSelectionData.addElement("end.");
+ sentenceSelectionData.addElement("(This is\u2029");
+
+ // treat the fullwidth period as an unambiguous sentence terminator
+ sentenceSelectionData.addElement("\u7d42\u308f\u308a\uff0e");
+ sentenceSelectionData.addElement("\u300c\u3053\u308c\u306f");
+
+ generalIteratorTest(sentenceBreak, sentenceSelectionData);
+ }
+ catch (ClassCastException e) {
+ logln("New Break Iterator, skipping old test");
+ }
}
public void TestBug4146175Lines() {
@@ -1116,14 +1177,20 @@ lineSelectionData.addElement("(\u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e44\u0e17\
}
public void TestBug4214367() {
- Vector wordSelectionData = new Vector();
-
- // the hiragana and katakana iteration marks and the long vowel mark
- // are not being treated correctly by the word-break iterator
- wordSelectionData.addElement("\u3042\u3044\u309d\u3042\u309e\u3042\u30fc\u3042");
- wordSelectionData.addElement("\u30a2\u30a4\u30fd\u30a2\u30fe\u30a2\u30fc\u30a2");
-
- generalIteratorTest(wordBreak, wordSelectionData);
+ try {
+ RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)wordBreak;
+ Vector wordSelectionData = new Vector();
+
+ // the hiragana and katakana iteration marks and the long vowel mark
+ // are not being treated correctly by the word-break iterator
+ wordSelectionData.addElement("\u3042\u3044\u309d\u3042\u309e\u3042\u30fc\u3042");
+ wordSelectionData.addElement("\u30a2\u30a4\u30fd\u30a2\u30fe\u30a2\u30fc\u30a2");
+
+ generalIteratorTest(wordBreak, wordSelectionData);
+ }
+ catch (ClassCastException e) {
+ logln("New Break Iterator, skipping old test");
+ }
}
private static final String cannedTestChars
@@ -1142,100 +1209,118 @@ lineSelectionData.addElement("(\u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e44\u0e17\
public void TestWordInvariants()
{
- BreakIterator e = BreakIterator.getWordInstance();
- doBreakInvariantTest(e, cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2"
- + "\u30a3\u4e00\u4e01\u4e02");
- doOtherInvariantTest(e, cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2"
- + "\u30a3\u4e00\u4e01\u4e02");
+ BreakIterator e = BreakIterator.getWordInstance();
+ try {
+ RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)e;
+ doBreakInvariantTest(e, cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2"
+ + "\u30a3\u4e00\u4e01\u4e02");
+ doOtherInvariantTest(e, cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2"
+ + "\u30a3\u4e00\u4e01\u4e02");
+ }
+ catch (ClassCastException ex) {
+ logln("New Break Iterator, skipping old test");
+ }
}
public void TestLineInvariants()
{
- BreakIterator e = BreakIterator.getLineInstance();
- String testChars = cannedTestChars + ".,;:\u3001\u3002\u3041\u3042\u3043\u3044\u3045"
- + "\u30a3\u4e00\u4e01\u4e02";
- doBreakInvariantTest(e, testChars);
- doOtherInvariantTest(e, testChars);
-
- int errorCount = 0;
-
- // in addition to the other invariants, a line-break iterator should make sure that:
- // it doesn't break around the non-breaking characters
- String noBreak = "\u00a0\u2007\u2011\ufeff";
- StringBuffer work = new StringBuffer("aaa");
- for (int i = 0; i < testChars.length(); i++) {
- char c = testChars.charAt(i);
- if (c == '\r' || c == '\n' || c == '\u2029' || c == '\u2028' || c == '\u0003')
- continue;
- work.setCharAt(0, c);
- for (int j = 0; j < noBreak.length(); j++) {
- work.setCharAt(1, noBreak.charAt(j));
- for (int k = 0; k < testChars.length(); k++) {
- work.setCharAt(2, testChars.charAt(k));
- e.setText(work.toString());
- for (int l = e.first(); l != BreakIterator.DONE; l = e.next())
- if (l == 1 || l == 2) {
- errln("Got break between U+" + Integer.toHexString((int)
- (work.charAt(l - 1))) + " and U+" + Integer.toHexString(
- (int)(work.charAt(l))));
- errorCount++;
- if (errorCount >= 75)
- return;
- }
- }
- }
- }
-
- // it does break after dashes (unless they're followed by a digit, a non-spacing mark,
- // a currency symbol, a space, a format-control character, a regular control character,
- // a line or paragraph separator, or another dash)
- String dashes = "-\u00ad\u2010\u2012\u2013\u2014";
- for (int i = 0; i < testChars.length(); i++) {
- work.setCharAt(0, testChars.charAt(i));
- for (int j = 0; j < dashes.length(); j++) {
- work.setCharAt(1, dashes.charAt(j));
- for (int k = 0; k < testChars.length(); k++) {
- char c = testChars.charAt(k);
- if (Character.getType(c) == Character.DECIMAL_DIGIT_NUMBER ||
- Character.getType(c) == Character.OTHER_NUMBER ||
- Character.getType(c) == Character.NON_SPACING_MARK ||
- Character.getType(c) == Character.ENCLOSING_MARK ||
- Character.getType(c) == Character.CURRENCY_SYMBOL ||
- Character.getType(c) == Character.DASH_PUNCTUATION ||
- Character.getType(c) == Character.SPACE_SEPARATOR ||
- Character.getType(c) == Character.FORMAT ||
- Character.getType(c) == Character.CONTROL ||
- c == '\n' || c == '\r' || c == '\u2028' || c == '\u2029' ||
- c == '\u0003' || c == '\u2007' || c == '\u2011' ||
- c == '\ufeff')
- continue;
- work.setCharAt(2, c);
- e.setText(work.toString());
- boolean saw2 = false;
- for (int l = e.first(); l != BreakIterator.DONE; l = e.next())
- if (l == 2)
- saw2 = true;
- if (!saw2) {
- errln("Didn't get break between U+" + Integer.toHexString((int)
- (work.charAt(1))) + " and U+" + Integer.toHexString(
- (int)(work.charAt(2))));
- errorCount++;
- if (errorCount >= 75)
- return;
- }
- }
- }
- }
+ BreakIterator e = BreakIterator.getLineInstance();
+ try {
+ RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)e;
+ String testChars = cannedTestChars + ".,;:\u3001\u3002\u3041\u3042\u3043\u3044\u3045"
+ + "\u30a3\u4e00\u4e01\u4e02";
+ doBreakInvariantTest(e, testChars);
+ doOtherInvariantTest(e, testChars);
+
+ int errorCount = 0;
+
+ // in addition to the other invariants, a line-break iterator should make sure that:
+ // it doesn't break around the non-breaking characters
+ String noBreak = "\u00a0\u2007\u2011\ufeff";
+ StringBuffer work = new StringBuffer("aaa");
+ for (int i = 0; i < testChars.length(); i++) {
+ char c = testChars.charAt(i);
+ if (c == '\r' || c == '\n' || c == '\u2029' || c == '\u2028' || c == '\u0003')
+ continue;
+ work.setCharAt(0, c);
+ for (int j = 0; j < noBreak.length(); j++) {
+ work.setCharAt(1, noBreak.charAt(j));
+ for (int k = 0; k < testChars.length(); k++) {
+ work.setCharAt(2, testChars.charAt(k));
+ e.setText(work.toString());
+ for (int l = e.first(); l != BreakIterator.DONE; l = e.next())
+ if (l == 1 || l == 2) {
+ errln("Got break between U+" + Integer.toHexString((int)
+ (work.charAt(l - 1))) + " and U+" + Integer.toHexString(
+ (int)(work.charAt(l))));
+ errorCount++;
+ if (errorCount >= 75)
+ return;
+ }
+ }
+ }
+ }
+
+ // it does break after dashes (unless they're followed by a digit, a non-spacing mark,
+ // a currency symbol, a space, a format-control character, a regular control character,
+ // a line or paragraph separator, or another dash)
+ String dashes = "-\u00ad\u2010\u2012\u2013\u2014";
+ for (int i = 0; i < testChars.length(); i++) {
+ work.setCharAt(0, testChars.charAt(i));
+ for (int j = 0; j < dashes.length(); j++) {
+ work.setCharAt(1, dashes.charAt(j));
+ for (int k = 0; k < testChars.length(); k++) {
+ char c = testChars.charAt(k);
+ if (Character.getType(c) == Character.DECIMAL_DIGIT_NUMBER ||
+ Character.getType(c) == Character.OTHER_NUMBER ||
+ Character.getType(c) == Character.NON_SPACING_MARK ||
+ Character.getType(c) == Character.ENCLOSING_MARK ||
+ Character.getType(c) == Character.CURRENCY_SYMBOL ||
+ Character.getType(c) == Character.DASH_PUNCTUATION ||
+ Character.getType(c) == Character.SPACE_SEPARATOR ||
+ Character.getType(c) == Character.FORMAT ||
+ Character.getType(c) == Character.CONTROL ||
+ c == '\n' || c == '\r' || c == '\u2028' || c == '\u2029' ||
+ c == '\u0003' || c == '\u2007' || c == '\u2011' ||
+ c == '\ufeff')
+ continue;
+ work.setCharAt(2, c);
+ e.setText(work.toString());
+ boolean saw2 = false;
+ for (int l = e.first(); l != BreakIterator.DONE; l = e.next())
+ if (l == 2)
+ saw2 = true;
+ if (!saw2) {
+ errln("Didn't get break between U+" + Integer.toHexString((int)
+ (work.charAt(1))) + " and U+" + Integer.toHexString(
+ (int)(work.charAt(2))));
+ errorCount++;
+ if (errorCount >= 75)
+ return;
+ }
+ }
+ }
+ }
+ }
+ catch (ClassCastException ex) {
+ logln("New Break Iterator, skipping old test");
+ }
}
public void TestCharacterInvariants()
- {
- BreakIterator e = BreakIterator.getCharacterInstance();
- doBreakInvariantTest(e, cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8"
- + "\u11a9\u11aa");
- doOtherInvariantTest(e, cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8"
- + "\u11a9\u11aa");
- }
+ {
+ BreakIterator e = BreakIterator.getCharacterInstance();
+ try {
+ RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)e;
+ doBreakInvariantTest(e, cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8"
+ + "\u11a9\u11aa");
+ doOtherInvariantTest(e, cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8"
+ + "\u11a9\u11aa");
+ }
+ catch (ClassCastException ex) {
+ logln("New Break Iterator, skipping old test");
+ }
+ }
public void TestEmptyString()
{
@@ -1260,132 +1345,144 @@ lineSelectionData.addElement("(\u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e44\u0e17\
*/
public void TestJapaneseLineBreak()
{
- StringBuffer testString = new StringBuffer("\u4e00x\u4e8c");
- String precedingChars = "([{\u00ab$\u00a5\u00a3\u00a4\u2018\u201a\u201c\u201e\u201b\u201f";
- String followingChars = ")]}\u00bb!%,.\u3001\u3002\u3063\u3083\u3085\u3087\u30c3\u30e3\u30e5\u30e7\u30fc:;\u309b\u309c\u3005\u309d\u309e\u30fd\u30fe\u2019\u201d\u00b0\u2032\u2033\u2034\u2030\u2031\u2103\u2109\u00a2\u0300\u0301\u0302";
- BreakIterator iter = BreakIterator.getLineInstance(Locale.JAPAN);
-
- for (int i = 0; i < precedingChars.length(); i++) {
- testString.setCharAt(1, precedingChars.charAt(i));
- iter.setText(testString.toString());
- int j = iter.first();
- if (j != 0)
- errln("ja line break failure: failed to start at 0");
- j = iter.next();
- if (j != 1)
- errln("ja line break failure: failed to stop before '" + precedingChars.charAt(i)
- + "' (" + ((int)(precedingChars.charAt(i))) + ")");
- j = iter.next();
- if (j != 3)
- errln("ja line break failure: failed to skip position after '" + precedingChars.charAt(i)
- + "' (" + ((int)(precedingChars.charAt(i))) + ")");
- }
-
- for (int i = 0; i < followingChars.length(); i++) {
- testString.setCharAt(1, followingChars.charAt(i));
- iter.setText(testString.toString());
- int j = iter.first();
- if (j != 0)
- errln("ja line break failure: failed to start at 0");
- j = iter.next();
- if (j != 2)
- errln("ja line break failure: failed to skip position before '" + followingChars.charAt(i)
- + "' (" + ((int)(followingChars.charAt(i))) + ")");
- j = iter.next();
- if (j != 3)
- errln("ja line break failure: failed to stop after '" + followingChars.charAt(i)
- + "' (" + ((int)(followingChars.charAt(i))) + ")");
- }
+ StringBuffer testString = new StringBuffer("\u4e00x\u4e8c");
+ String precedingChars = "([{\u00ab$\u00a5\u00a3\u00a4\u2018\u201a\u201c\u201e\u201b\u201f";
+ String followingChars = ")]}\u00bb!%,.\u3001\u3002\u3063\u3083\u3085\u3087\u30c3\u30e3\u30e5\u30e7\u30fc:;\u309b\u309c\u3005\u309d\u309e\u30fd\u30fe\u2019\u201d\u00b0\u2032\u2033\u2034\u2030\u2031\u2103\u2109\u00a2\u0300\u0301\u0302";
+ BreakIterator iter = BreakIterator.getLineInstance(Locale.JAPAN);
+ try {
+ RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)iter;
+
+ for (int i = 0; i < precedingChars.length(); i++) {
+ testString.setCharAt(1, precedingChars.charAt(i));
+ iter.setText(testString.toString());
+ int j = iter.first();
+ if (j != 0)
+ errln("ja line break failure: failed to start at 0");
+ j = iter.next();
+ if (j != 1)
+ errln("ja line break failure: failed to stop before '" + precedingChars.charAt(i)
+ + "' (" + ((int)(precedingChars.charAt(i))) + ")");
+ j = iter.next();
+ if (j != 3)
+ errln("ja line break failure: failed to skip position after '" + precedingChars.charAt(i)
+ + "' (" + ((int)(precedingChars.charAt(i))) + ")");
+ }
+
+ for (int i = 0; i < followingChars.length(); i++) {
+ testString.setCharAt(1, followingChars.charAt(i));
+ iter.setText(testString.toString());
+ int j = iter.first();
+ if (j != 0)
+ errln("ja line break failure: failed to start at 0");
+ j = iter.next();
+ if (j != 2)
+ errln("ja line break failure: failed to skip position before '" + followingChars.charAt(i)
+ + "' (" + ((int)(followingChars.charAt(i))) + ")");
+ j = iter.next();
+ if (j != 3)
+ errln("ja line break failure: failed to stop after '" + followingChars.charAt(i)
+ + "' (" + ((int)(followingChars.charAt(i))) + ")");
+ }
+ }
+ catch (ClassCastException e) {
+ logln("New Break Iterator, skipping old test");
+ }
}
/**
* Bug 4638433
*/
- public void TestLineBreakBasedOnUnicode3_0_0() {
- BreakIterator iter;
- int i;
-
- /* Latin Extend-B characters
- * 0x0218-0x0233 which have been added since Unicode 3.0.0.
- */
- iter = BreakIterator.getWordInstance(Locale.US);
- iter.setText("\u0216\u0217\u0218\u0219\u021A");
- i = iter.first();
- i = iter.next();
- if (i != 5) {
- errln("Word break failure: failed to stop at 5 and bounded at " + i);
+ public void TestLineBreakBasedOnUnicode3_0_0() {
+ BreakIterator iter;
+ int i;
+
+ /* Latin Extend-B characters
+ * 0x0218-0x0233 which have been added since Unicode 3.0.0.
+ */
+ iter = BreakIterator.getWordInstance(Locale.US);
+ try {
+ RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)iter;
+ iter.setText("\u0216\u0217\u0218\u0219\u021A");
+ i = iter.first();
+ i = iter.next();
+ if (i != 5) {
+ errln("Word break failure: failed to stop at 5 and bounded at " + i);
+ }
+
+
+ iter = BreakIterator.getLineInstance(Locale.US);
+
+ /*
+ * \u301f has changed its category from Ps to Pe since Unicode 2.1.
+ */
+ iter.setText("32\u301f1");
+ i = iter.first();
+ i = iter.next();
+ if (i != 3) {
+ errln("Line break failure: failed to skip before \\u301F(Pe) at 3 and bounded at " + i);
+ }
+
+ /* Mongolian
+ * which have been added since Unicode 3.0.0.
+ */
+ iter.setText("\u1820\u1806\u1821");
+ i = iter.first();
+ i = iter.next();
+ if (i != 2) {
+ errln("Mongolian line break failure: failed to skip position before \\u1806(Pd) at 2 and bounded at " + i);
+ }
+
+ /* Khmer which have
+ * been added since Unicode 3.0.0.
+ */
+ /*
+ * Richard: fail to pass, refer to #3550
+ iter.setText("\u17E0\u17DB\u17E1");
+ i = iter.first();
+ i = iter.next();
+ if (i != 1) {
+ errln("Khmer line break failure: failed to stop before \\u17DB(Sc) at 1 and bounded at " + i);
+ }
+ i = iter.next();
+ if (i != 3) {
+ errln("Khmer line break failure: failed to skip position after \\u17DB(Sc) at 3 and bounded at " + i);
+ }*/
+
+ /* Ogham which have
+ * been added since Unicode 3.0.0.
+ */
+ iter.setText("\u1692\u1680\u1696");
+ i = iter.first();
+ i = iter.next();
+ if (i != 2) {
+ errln("Ogham line break failure: failed to skip postion before \\u1680(Zs) at 2 and bounded at " + i);
+ }
+
+
+ // Confirm changes in BreakIteratorRules_th.java have been reflected.
+ iter = BreakIterator.getLineInstance(new Locale("th", ""));
+
+ /* Thai
+ *
+ *
+ *
+ *
+ */
+ iter.setText("\u0E57\u201C\u0E55\u201D\u0E53");
+ i = iter.first();
+ i = iter.next();
+ if (i != 1) {
+ errln("Thai line break failure: failed to stop before \\u201C(Pi) at 1 and bounded at " + i);
+ }
+ i = iter.next();
+ if (i != 4) {
+ errln("Thai line break failure: failed to stop after \\u201D(Pf) at 4 and bounded at " + i);
+ }
+ }
+ catch (ClassCastException e) {
+ logln("New Break Iterator, skipping old test");
+ }
}
-
-
- iter = BreakIterator.getLineInstance(Locale.US);
-
- /*
- * \u301f has changed its category from Ps to Pe since Unicode 2.1.
- */
- iter.setText("32\u301f1");
- i = iter.first();
- i = iter.next();
- if (i != 3) {
- errln("Line break failure: failed to skip before \\u301F(Pe) at 3 and bounded at " + i);
- }
-
- /* Mongolian
- * which have been added since Unicode 3.0.0.
- */
- iter.setText("\u1820\u1806\u1821");
- i = iter.first();
- i = iter.next();
- if (i != 2) {
- errln("Mongolian line break failure: failed to skip position before \\u1806(Pd) at 2 and bounded at " + i);
- }
-
- /* Khmer which have
- * been added since Unicode 3.0.0.
- */
- /*
- * Richard: fail to pass, refer to #3550
- iter.setText("\u17E0\u17DB\u17E1");
- i = iter.first();
- i = iter.next();
- if (i != 1) {
- errln("Khmer line break failure: failed to stop before \\u17DB(Sc) at 1 and bounded at " + i);
- }
- i = iter.next();
- if (i != 3) {
- errln("Khmer line break failure: failed to skip position after \\u17DB(Sc) at 3 and bounded at " + i);
- }*/
-
- /* Ogham which have
- * been added since Unicode 3.0.0.
- */
- iter.setText("\u1692\u1680\u1696");
- i = iter.first();
- i = iter.next();
- if (i != 2) {
- errln("Ogham line break failure: failed to skip postion before \\u1680(Zs) at 2 and bounded at " + i);
- }
-
-
- // Confirm changes in BreakIteratorRules_th.java have been reflected.
- iter = BreakIterator.getLineInstance(new Locale("th", ""));
-
- /* Thai
- *
- *
- *
- *
- */
- iter.setText("\u0E57\u201C\u0E55\u201D\u0E53");
- i = iter.first();
- i = iter.next();
- if (i != 1) {
- errln("Thai line break failure: failed to stop before \\u201C(Pi) at 1 and bounded at " + i);
- }
- i = iter.next();
- if (i != 4) {
- errln("Thai line break failure: failed to stop after \\u201D(Pf) at 4 and bounded at " + i);
- }
- }
/**
* @bug 4068137
diff --git a/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBIAPITest.java b/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBIAPITest.java
index f1a8684f29..6fdd8700d1 100755
--- a/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBIAPITest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBIAPITest.java
@@ -186,7 +186,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
errln("ERROR: next()/following() at last position returned #"
+ p + " and " + q + " instead of" + testString.length() + "\n");
RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getCharacterInstance(Locale.getDefault());
- testString = "Write hindi here. \u092d\u093e\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 \u0939\u094c\u0964";
+ testString = "Write hindi here. \u092d\u093e\u0930\u0301 \u0938\u0941\u0902\u0926\u0930 \u0939\u094c\u0964";
logln("testing char iter - string:- \"" + testString + "\"");
charIter1.setText(testString);
p = charIter1.first();
@@ -209,7 +209,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
// hindi starts here
p = q;
q = charIter1.next(4);
- doTest(testString, p, q, 22, " \u092d\u093e\u0930\u0924");
+ doTest(testString, p, q, 22, " \u092d\u093e\u0930\u0301"); // Nonsense, but compatible between old and new rules.
p = q;
q = charIter1.next(2);
doTest(testString, p, q, 26, " \u0938\u0941\u0902");
@@ -217,13 +217,13 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
q = charIter1.following(24);
doTest(testString, 24, q, 26, "\u0941\u0902");
q = charIter1.following(20);
- doTest(testString, 20, q, 21, "\u0930");
+ doTest(testString, 20, q, 22, "\u0930\u0301");
p = charIter1.following(charIter1.last());
q = charIter1.next(charIter1.last());
if (p != RuleBasedBreakIterator.DONE || q != RuleBasedBreakIterator.DONE)
errln("ERROR: following()/next() at last position returned #"
+ p + " and " + q + " instead of" + testString.length());
- testString = "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.";
+ testString = "Hello! how are you? I'am fine. Thankyou. How are you doing? This costs $20,00,000.";
RuleBasedBreakIterator sentIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getSentenceInstance(Locale.getDefault());
logln("testing sentence iter - String:- \"" + testString + "\"");
sentIter1.setText(testString);
@@ -243,7 +243,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
doTest(testString, p, q, 60, "how are you? I'am fine. Thankyou. How are you doing? ");
p = q;
q = sentIter1.next();
- doTest(testString, p, q, 83, "This\n costs $20,00,000.");
+ doTest(testString, p, q, 83, "This costs $20,00,000.");
q = sentIter1.following(1);
doTest(testString, 1, q, 7, "ello! ");
q = sentIter1.following(10);
@@ -324,7 +324,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
p = wordIter1.preceding(wordIter1.first());
if (p != RuleBasedBreakIterator.DONE)
errln("ERROR: preceding() at starting position returned #" + p + " instead of 0");
- testString = "Write hindi here. \u092d\u093e\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 \u0939\u094c\u0964";
+ testString = "Write hindi here. \u092d\u093e\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 \u0939\u0301\u0964";
logln("testing character iteration for string \" " + testString + "\" \n");
RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getCharacterInstance(Locale.getDefault());
charIter1.setText(testString);
@@ -335,7 +335,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
doTest(testString, p, q, 31, "\u0964");
p = q;
q = charIter1.previous();
- doTest(testString, p, q, 29, "\u0939\u094c");
+ doTest(testString, p, q, 29, "\u0939\u0301");
q = charIter1.preceding(26);
doTest(testString, 26, q, 23, "\u0938\u0941\u0902");
q = charIter1.preceding(16);
@@ -349,7 +349,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
if (p != RuleBasedBreakIterator.DONE || q != RuleBasedBreakIterator.DONE)
errln("ERROR: previous()/preceding() at starting position returned #"
+ p + " and " + q + " instead of 0\n");
- testString = "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.";
+ testString = "Hello! how are you? I'am fine. Thankyou. How are you doing? This costs $20,00,000.";
logln("testing sentence iter - String:- \"" + testString + "\"");
RuleBasedBreakIterator sentIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getSentenceInstance(Locale.getDefault());
sentIter1.setText(testString);
@@ -357,7 +357,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
if (p != testString.length())
errln("ERROR: last() returned" + p + "instead of " + testString.length());
q = sentIter1.previous();
- doTest(testString, p, q, 60, "This\n costs $20,00,000.");
+ doTest(testString, p, q, 60, "This costs $20,00,000.");
p = q;
q = sentIter1.previous();
doTest(testString, p, q, 41, "How are you doing? ");
@@ -399,7 +399,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
* Tests the method IsBoundary() of RuleBasedBreakIterator
**/
public void TestIsBoundary() {
- String testString1 = "Write here. \u092d\u093e\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 \u0939\u094c\u0964";
+ String testString1 = "Write here. \u092d\u0301\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 a\u0301u";
RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getCharacterInstance(Locale.getDefault());
charIter1.setText(testString1);
int bounds1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 20, 21, 22, 23, 25, 26};
diff --git a/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
index b46869951d..f868b60229 100755
--- a/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
@@ -9,6 +9,7 @@ package com.ibm.icu.dev.test.rbbi;
//Regression testing of RuleBasedBreakIterator
import com.ibm.icu.dev.test.*;
import com.ibm.icu.text.RuleBasedBreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator_Old;
import java.util.Vector;
public class RBBITest extends TestFmwk
@@ -43,6 +44,15 @@ public class RBBITest extends TestFmwk
public void TestDefaultRuleBasedCharacterIteration(){
RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getCharacterInstance();
logln("Testing the RBBI for character iteration by using default rules");
+ try {
+ RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi;
+ }
+ catch (ClassCastException e) {
+ // Bail out if using new RBBI implementation
+ logln("Test Skipped.");
+ return;
+ }
+
//fetch the rules used to create the above RuleBasedBreakIterator
String defaultRules=rbbi.toString();
@@ -172,6 +182,14 @@ public class RBBITest extends TestFmwk
public void TestDefaultRuleBasedWordIteration(){
logln("Testing the RBBI for word iteration using default rules");
RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getWordInstance();
+ try {
+ RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi;
+ }
+ catch (ClassCastException e) {
+ // Bail out if using new RBBI implementation
+ logln("Test Skipped.");
+ return;
+ }
//fetch the rules used to create the above RuleBasedBreakIterator
String defaultRules=rbbi.toString();
@@ -325,6 +343,14 @@ public class RBBITest extends TestFmwk
logln("Testing the RBBI for sentence iteration using default rules");
RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getSentenceInstance();
//fetch the rules used to create the above RuleBasedBreakIterator
+ try {
+ RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi;
+ }
+ catch (ClassCastException e) {
+ // Bail out if using new RBBI implementation
+ logln("Test Skipped.");
+ return;
+ }
String defaultRules=rbbi.toString();
RuleBasedBreakIterator sentIterDefault=null;
try{
@@ -418,16 +444,24 @@ public class RBBITest extends TestFmwk
}
public void TestDefaultRuleBasedLineIteration(){
- logln("Testing the RBBI for line iteration using default rules");
- RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getLineInstance();
- //fetch the rules used to create the above RuleBasedBreakIterator
- String defaultRules=rbbi.toString();
- RuleBasedBreakIterator lineIterDefault=null;
- try{
- lineIterDefault = new RuleBasedBreakIterator(defaultRules);
- }catch(IllegalArgumentException iae){
- errln("ERROR: failed construction in TestDefaultRuleBasedLineIteration()" + iae.toString());
- }
+ logln("Testing the RBBI for line iteration using default rules");
+ RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getLineInstance();
+ //fetch the rules used to create the above RuleBasedBreakIterator
+ try {
+ RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi;
+ }
+ catch (ClassCastException e) {
+ // Bail out if using new RBBI implementation
+ logln("Test Skipped.");
+ return;
+ }
+ String defaultRules=rbbi.toString();
+ RuleBasedBreakIterator lineIterDefault=null;
+ try{
+ lineIterDefault = new RuleBasedBreakIterator(defaultRules);
+ }catch(IllegalArgumentException iae){
+ errln("ERROR: failed construction in TestDefaultRuleBasedLineIteration()" + iae.toString());
+ }
Vector linedata = new Vector();
linedata.addElement("Multi-");
@@ -524,6 +558,15 @@ public class RBBITest extends TestFmwk
// get overridden.
rbbi.toString();
RuleBasedBreakIterator lineIter=null;
+ try {
+ RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi;
+ }
+ catch (ClassCastException e) {
+ // Bail out if using new RBBI implementation
+ logln("Test Skipped.");
+ return;
+ }
+
try{
lineIter = new RuleBasedBreakIterator(rules);
}catch(IllegalArgumentException iae){
@@ -651,7 +694,15 @@ public class RBBITest extends TestFmwk
public void TestAbbrRuleBasedWordIteration(){
logln("Testing the RBBI for word iteration by adding rules to support abbreviation");
RuleBasedBreakIterator rb =(RuleBasedBreakIterator)RuleBasedBreakIterator.getWordInstance();
-
+ try {
+ // This test won't work with the new break iterators. Cast will fail in this case.
+ RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old) rb;
+ }
+ catch (ClassCastException e) {
+ logln("Test skipped.");
+ return;
+ }
+
String wrules2="$abbr=((Mr.)|(Mrs.)|(Ms.)|(Dr.)|(U.S.));" + // abbreviations.
rb.toString() +
"($abbr$ws)*$word;";
@@ -701,6 +752,10 @@ public class RBBITest extends TestFmwk
buffer.append(text);
}
text = buffer.toString();
+ if (rbbi == null) {
+ errln("null iterator, test skipped.");
+ return;
+ }
rbbi.setText(text);
diff --git a/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules.java b/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules.java
index fa8bf4ac7a..feadf38e96 100755
--- a/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules.java
+++ b/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules.java
@@ -29,11 +29,11 @@ public class BreakIteratorRules extends ListResourceBundle {
// BreakIteratorClasses lists the class names to instantiate for each
// built-in type of BreakIterator
{ "BreakIteratorClasses",
- new String[] { "RuleBasedBreakIterator", // character-break iterator class
- "RuleBasedBreakIterator", // word-break iterator class
- "RuleBasedBreakIterator", // line-break iterator class
- "RuleBasedBreakIterator", // sentence-break iterator class
- "RuleBasedBreakIterator"} // Title-Case break iterator class
+ new String[] { "RuleBasedBreakIterator_New", // character-break iterator class
+ "RuleBasedBreakIterator_New", // word-break iterator class
+ "RuleBasedBreakIterator_New", // line-break iterator class
+ "RuleBasedBreakIterator_New", // sentence-break iterator class
+ "RuleBasedBreakIterator_New"} // Title-Case break iterator class
},
// rules describing how to break between logical characters
diff --git a/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules_th.java b/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules_th.java
index a834611126..15901398dc 100755
--- a/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules_th.java
+++ b/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules_th.java
@@ -27,10 +27,10 @@ public class BreakIteratorRules_th extends ListResourceBundle {
// iterator. Notice we're now using DictionaryBasedBreakIterator
// for word and line breaking.
{ "BreakIteratorClasses",
- new String[] { "RuleBasedBreakIterator", // character-break iterator class
+ new String[] { "RuleBasedBreakIterator_New", // character-break iterator class
"DictionaryBasedBreakIterator", // word-break iterator class
"DictionaryBasedBreakIterator", // line-break iterator class
- "RuleBasedBreakIterator" } // sentence-break iterator class
+ "RuleBasedBreakIterator_New" } // sentence-break iterator class
},
{ "WordBreakRules",
diff --git a/icu4j/src/com/ibm/icu/text/BreakIteratorFactory.java b/icu4j/src/com/ibm/icu/text/BreakIteratorFactory.java
index db4cb9abed..738e2b6e39 100644
--- a/icu4j/src/com/ibm/icu/text/BreakIteratorFactory.java
+++ b/icu4j/src/com/ibm/icu/text/BreakIteratorFactory.java
@@ -18,6 +18,7 @@ import com.ibm.icu.impl.ICULocaleService;
import com.ibm.icu.impl.ICUService;
import com.ibm.icu.impl.ICUService.Factory;
import com.ibm.icu.util.ULocale;
+import com.ibm.icu.util.VersionInfo;
/**
* @author Ram
@@ -76,10 +77,26 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
}
static final ICULocaleService service = new BFService();
+ // KIND_NAMES are used in synthesizing the name of the resource that holds the source
+ // break rules for old-style (ICU 2.8 and previous) break iterators.
+ // The resources are com.ibm.icu.impl.data.BreakIteratorRules, and have
+ // names like "CharacterBreakRules", where the "Character" part of the
+ // name comes from here (this array).
private static final String[] KIND_NAMES = {
"Character", "Word", "Line", "Sentence", "Title"
};
+ /** KIND_NAMES_2 are used in synthesizing the names for
+ * the precompiled break rules used with the new (ICU 3.0) RBBI.
+ * The fully assembled names look like icudt30b_char.brk, which is the
+ * file name of the brk file as produced by the ICU4C build.
+ * @internal
+ */
+ private static final String[] KIND_NAMES_2 = {
+ "char", "word", "line", "sent", "title"
+ };
+
+
private static BreakIterator createBreakInstance(Locale locale, int kind) {
String prefix = KIND_NAMES[kind];
return createBreakInstance(locale, kind,
@@ -97,8 +114,25 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
String[] classNames = bundle.getStringArray("BreakIteratorClasses");
String rules = bundle.getString(rulesName);
if (classNames[kind].equals("RuleBasedBreakIterator")) {
+ // Old style (2.8 and previous) Break Iterator.
+ // Not used by default, but if someone wants to specify the old class
+ // in some locale's resources, it should still work.
iter = new RuleBasedBreakIterator_Old(rules);
}
+ else if (classNames[kind].equals("RuleBasedBreakIterator_New")) {
+ try {
+ // Class for new RBBI engine.
+ // Set up path to precompiled rule data.
+ String rulesFileName =
+ "data/icudt" + VersionInfo.ICU_VERSION.getMajor() +
+ VersionInfo.ICU_VERSION.getMinor() + "b_" + KIND_NAMES_2[kind] + ".brk";
+ InputStream is = ICUData.getRequiredStream(rulesFileName);
+ iter = RuleBasedBreakIterator_New.getInstanceFromCompiledRules(is);
+ }
+ catch (IOException e) {
+ throw new IllegalArgumentException(e.toString());
+ }
+ }
else if (classNames[kind].equals("DictionaryBasedBreakIterator")) {
try {
InputStream dictionary = ICUData.getStream(bundle.getString(dictionaryName));
diff --git a/icu4j/src/com/ibm/icu/text/RBBIDataWrapper.java b/icu4j/src/com/ibm/icu/text/RBBIDataWrapper.java
index 01c316c247..4ac03dc974 100644
--- a/icu4j/src/com/ibm/icu/text/RBBIDataWrapper.java
+++ b/icu4j/src/com/ibm/icu/text/RBBIDataWrapper.java
@@ -8,16 +8,10 @@
package com.ibm.icu.text;
import java.io.InputStream;
-import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.IOException;
-import java.util.Locale;
-import com.ibm.icu.util.RangeValueIterator;
import com.ibm.icu.util.VersionInfo;
-import com.ibm.icu.lang.UCharacter;
-import com.ibm.icu.lang.UCharacterCategory;
-import com.ibm.icu.lang.UProperty;
import com.ibm.icu.impl.ICUData;
import com.ibm.icu.impl.Trie;
import com.ibm.icu.impl.CharTrie;
@@ -69,9 +63,13 @@ public class RBBIDataWrapper {
// Getters for fields from the state table header
//
- final static int getNumStates(int table[]) {
- return table[NUMSTATES]<<16 + (table[NUMSTATES+1]&0xffff);
- }
+ final static int getNumStates(short table[]) {
+ int hi = table[NUMSTATES];
+ int lo = table[NUMSTATES+1];
+ int val = (hi<<16) + (lo&0x0000ffff);
+ return val;
+ }
+
/**
* Data Header. A struct-like class with the fields from the RBBI data file header.
@@ -119,14 +117,14 @@ public class RBBIDataWrapper {
static class TrieFoldingFunc implements Trie.DataManipulate {
public int getFoldingOffset(int data) {
- if ((data & 0x8000) == 0) {
+ if ((data & 0x8000) != 0) {
return data & 0x7fff;
} else {
return 0;
}
}
};
- static TrieFoldingFunc fTrieFoldingFunc;
+ static TrieFoldingFunc fTrieFoldingFunc = new TrieFoldingFunc();
RBBIDataWrapper() {
@@ -299,19 +297,148 @@ public class RBBIDataWrapper {
/** Debug function to display the break iterator data. */
void dump() {
System.out.println("RBBI Data Wrapper dump ...");
+ System.out.println();
+ System.out.println("Forward State Table");
+ dumpTable(fFTable);
+ System.out.println("Reverse State Table");
+ dumpTable(fRTable);
+ System.out.println("Forward Safe Points Table");
+ dumpTable(fSFTable);
+ System.out.println("Reverse Safe Points Table");
+ dumpTable(fSRTable);
+
+ dumpCharCategories();
System.out.println("Source Rules: " + fRuleSource);
+
+ }
+
+ /** Fixed width int-to-string conversion.
+ * TODO: there must be an easy built-in way to do this */
+ private static String intToString(int n, int width) {
+ StringBuffer dest = new StringBuffer(width);
+ dest.append(n);
+ while (dest.length() < width) {
+ dest.insert(0, ' ');
+ }
+ return dest.toString();
+ }
+
+ /** Dump a state table. (A full set of RBBI rules has 4 state tables.) */
+ private void dumpTable(short table[]) {
+ int n;
+ int state;
+ String header = " Row Acc Look Tag";
+ for (n=0; n fHeader.fCatCount) {
+ System.out.println("Error, bad category " + Integer.toHexString(category) +
+ " for char " + Integer.toHexString(char32));
+ break;
+ }
+ if (category == lastCat ) {
+ rangeEnd = char32;
+ } else {
+ if (lastCat >= 0) {
+ if (catStrings[lastCat].length() > lastNewline[lastCat] + 70) {
+ lastNewline[lastCat] = catStrings[lastCat].length() + 10;
+ catStrings[lastCat] += "\n ";
+ }
+
+ catStrings[lastCat] += " " + Integer.toHexString(rangeStart);
+ if (rangeEnd != rangeStart) {
+ catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd);
+ }
+ }
+ lastCat = category;
+ rangeStart = rangeEnd = char32;
+ }
+ }
+ catStrings[lastCat] += " " + Integer.toHexString(rangeStart);
+ if (rangeEnd != rangeStart) {
+ catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd);
+ }
+
+ for (category = 0; category <= fHeader.fCatCount; category ++) {
+ System.out.println (intToString(category, 5) + " " + catStrings[category]);
+ }
+ System.out.println();
}
public static void main(String[] args) {
String s;
if (args.length == 0) {
- s = "icudt28b_char.brk";
+ s = "char";
} else {
s = args[0];
}
System.out.println("RBBIDataWrapper.main(" + s + ") ");
+
+ String versionedName =
+ "icudt" + VersionInfo.ICU_VERSION.getMajor() +
+ VersionInfo.ICU_VERSION.getMinor() + "b_" + s + ".brk";
+
try {
- RBBIDataWrapper This = RBBIDataWrapper.get(s);
+ RBBIDataWrapper This = RBBIDataWrapper.get(versionedName);
This.dump();
}
catch (Exception e) {
diff --git a/icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator_New.java b/icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator_New.java
index 8f48cc5005..a9034b33b4 100644
--- a/icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator_New.java
+++ b/icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator_New.java
@@ -7,7 +7,9 @@
package com.ibm.icu.text;
import java.text.CharacterIterator;
-import java.text.StringCharacterIterator;
+import java.io.IOException;
+import java.io.InputStream;
+
/**
* Rule Based Break Iterator implementation.
@@ -27,7 +29,7 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
* The rule data for this BreakIterator instance
* @internal
*/
- private RBBIDataWrapper fData;
+ private RBBIDataWrapper fRData;
/** Index of the Rule {tag} values for the most recent match.
* @internal
@@ -61,7 +63,9 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
public Object clone()
{
RuleBasedBreakIterator_New result = (RuleBasedBreakIterator_New) super.clone();
- // TODO: real clone code
+ if (fText != null) {
+ result.fText = (CharacterIterator)fText.clone(); // give the copy its own iterator; never reassign this.fText while cloning
+ }
return result;
}
@@ -71,8 +75,27 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
* @stable ICU 2.0
*/
public boolean equals(Object that) {
- return false; // TODO:
- }
+ // instanceof rejects both foreign types and null; a bare cast lets null through
+ // and the first field access below would then throw NullPointerException.
+ if (!(that instanceof RuleBasedBreakIterator_New)) {
+ return false;
+ }
+ RuleBasedBreakIterator_New other = (RuleBasedBreakIterator_New) that;
+ if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
+ return false;
+ }
+ if (fRData != null && other.fRData != null &&
+ (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
+ return false;
+ }
+ if (fText == null && other.fText == null) {
+ return true;
+ }
+ if (fText == null || other.fText == null) {
+ return false;
+ }
+ return fText.equals(other.fText);
+ }
/**
* Returns the description (rules) used to create this iterator.
@@ -81,8 +104,8 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
*/
public String toString() {
String retStr = null;
- if (fData != null) {
- retStr = fData.fRuleSource;
+ if (fRData != null) {
+ retStr = fRData.fRuleSource;
}
return retStr;
}
@@ -94,9 +117,23 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
*/
public int hashCode()
{
- return 0; // TODO
+ return (fRData == null) ? 0 : fRData.fRuleSource.hashCode(); // null-safe: equals() and toString() both allow a null fRData
}
+
+ //=======================================================================
+ // Constructors & Factories
+ //=======================================================================
+ public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
+ RuleBasedBreakIterator_New This = new RuleBasedBreakIterator_New();
+ This.fRData = RBBIDataWrapper.get(is);
+ This.fText = new java.text.StringCharacterIterator(""); // Note: some old tests fail if fText is null
+ // on a newly created instance.
+ return This;
+ }
+
+
+
//=======================================================================
// BreakIterator overrides
//=======================================================================
@@ -192,8 +229,8 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
return BreakIterator.DONE;
}
- if (fData.fSRTable != null || fData.fSFTable != null) {
- return handlePrevious(fData.fRTable);
+ if (fRData.fSRTable != null || fRData.fSFTable != null) {
+ return handlePrevious(fRData.fRTable);
}
// old rule syntax
@@ -266,7 +303,7 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
int result = 0;
- if (fData.fSRTable != null) {
+ if (fRData.fSRTable != null) {
// Safe Point Reverse rules exist.
// This allows us to use the optimum algorithm.
fText.setIndex(offset);
@@ -275,20 +312,20 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
// this handles offset being between a supplementary character
CINext32(fText);
// handlePrevious will move most of the time to < 1 boundary away
- handlePrevious(fData.fSRTable);
+ handlePrevious(fRData.fSRTable);
result = next();
while (result <= offset) {
result = next();
}
return result;
}
- if (fData.fSFTable != null) {
+ if (fRData.fSFTable != null) {
// No Safe point reverse table, but there is a safe pt forward table.
//
fText.setIndex(offset);
CIPrevious32(fText);
// handle next will give result >= offset
- handleNext(fData.fSFTable);
+ handleNext(fRData.fSFTable);
// previous will give result 0 or 1 boundary away from offset,
// most of the time
// we have to
@@ -352,7 +389,7 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
// to carry out this operation
int result;
- if (fData.fSFTable != null) {
+ if (fRData.fSFTable != null) {
/// todo synwee
// new rule syntax
fText.setIndex(offset);
@@ -360,19 +397,19 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
// safe point.
// this handles offset being between a supplementary character
CIPrevious32(fText);
- handleNext(fData.fSFTable);
+ handleNext(fRData.fSFTable);
result = previous();
while (result >= offset) {
result = previous();
}
return result;
}
- if (fData.fSRTable != null) {
+ if (fRData.fSRTable != null) {
// backup plan if forward safe table is not available
fText.setIndex(offset);
CINext32(fText);
// handle previous will give result <= offset
- handlePrevious(fData.fSRTable);
+ handlePrevious(fRData.fSRTable);
// next will give result 0 or 1 boundary away from offset,
// most of the time
@@ -397,6 +434,19 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
return previous();
}
+ /**
+ * Throw IllegalArgumentException unless begin <= offset <= end.
+ * TODO: subclassing interface from old RBBI is not really usable.
+ * What to do with old protected functions tagged as stable?
+ * @stable ICU 2.0
+ */
+ protected static final void checkOffset(int offset, CharacterIterator text) {
+ if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
+ throw new IllegalArgumentException("offset out of bounds");
+ }
+ }
+
+
/**
* Returns true if the specfied position is a boundary position. As a side
* effect, leaves the iterator pointing to the first boundary position at
@@ -406,8 +456,10 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
* @stable ICU 2.0
*/
public boolean isBoundary(int offset) {
+ checkOffset(offset, fText);
+
// the beginning index of the iterator is always a boundary position by definition
- if (fText == null || offset == fText.getBeginIndex()) {
+ if (offset == fText.getBeginIndex()) {
first(); // For side effects on current position, tag values.
return true;
}
@@ -502,8 +554,8 @@ public int getRuleStatus() {
// Status val N-1 <-- the value we need to return
// The status values are sorted in ascending order.
// This function returns the last (largest) of the array of status values.
- int idx = fLastRuleStatusIndex + fData.fStatusTable[fLastRuleStatusIndex];
- int tagVal = fData.fStatusTable[idx];
+ int idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex];
+ int tagVal = fRData.fStatusTable[idx];
return tagVal;
}
@@ -532,11 +584,11 @@ public int getRuleStatus() {
*/
public int getRuleStatusVec(int[] fillInArray) {
makeRuleStatusValid();
- int numStatusVals = fData.fStatusTable[fLastRuleStatusIndex];
+ int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
if (fillInArray != null) {
int numToCopy = Math.min(numStatusVals, fillInArray.length);
for (int i=0; i= ci.getEndIndex()) {
return false;
}
return true;
@@ -637,7 +688,7 @@ public int getRuleStatusVec(int[] fillInArray) {
* @internal
*/
private int handleNext() {
- return handleNext(fData.fFTable);
+ return handleNext(fRData.fFTable);
}
@@ -663,7 +714,7 @@ public int getRuleStatusVec(int[] fillInArray) {
int state = START_STATE;
short category;
int c = CICurrent32(fText);
- int row = fData.getRowIndex(state);
+ int row = fRData.getRowIndex(state);
int lookaheadStatus = 0;
int lookaheadTagIdx = 0;
@@ -671,7 +722,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// Character Category fetch for starting character.
// See comments on character category code within loop, below.
- category = (short)fData.fTrie.getCodePointValue(c);
+ category = (short)fRData.fTrie.getCodePointValue(c);
if ((category & 0x4000) != 0) {
// fDictionaryCharCount++;
category &= ~0x4000;
@@ -704,7 +755,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// look up the current character's character category, which tells us
// which column in the state table to look at.
//
- category = (short)fData.fTrie.getCodePointValue(c);
+ category = (short)fRData.fTrie.getCodePointValue(c);
// Clear the dictionary flag bit in the character's category.
// Note: not using the old style dictionary stuff in this Java engine.
@@ -725,7 +776,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// look up a state transition in the state table
// state = row->fNextState[category];
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
- row = fData.getRowIndex(state);
+ row = fRData.getRowIndex(state);
// Get the next character. Doing it here positions the iterator
// to the correct position for recording matches in the code that
@@ -793,15 +844,15 @@ public int getRuleStatusVec(int[] fillInArray) {
* handlePrevious
*/
private int handlePrevious() {
- if (fText == null || fData == null) {
+ if (fText == null || fRData == null) {
return 0;
}
- if (fData.fRTable == null) {
+ if (fRData.fRTable == null) {
fText.first();
return fText.getIndex();
}
- short stateTable[] = fData.fRTable;
+ short stateTable[] = fRData.fRTable;
int state = START_STATE;
int category;
int lastCategory = 0;
@@ -812,8 +863,8 @@ public int getRuleStatusVec(int[] fillInArray) {
int c = CICurrent32(fText);
int row;
- row = fData.getRowIndex(state);
- category = (short)fData.fTrie.getCodePointValue(c);
+ row = fRData.getRowIndex(state);
+ category = (short)fRData.fTrie.getCodePointValue(c);
category &= ~0x4000; // Clear the dictionary bit, just in case.
if (fTrace) {
@@ -829,7 +880,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// save the last character's category and look up the current
// character's category
lastCategory = category;
- category = (short)fData.fTrie.getCodePointValue(c);
+ category = (short)fRData.fTrie.getCodePointValue(c);
// Check the dictionary bit in the character's category.
// Don't exist in this Java engine implementation. Clear the bit.
@@ -848,7 +899,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// look up a state transition in the backwards state table
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
- row = fData.getRowIndex(state);
+ row = fRData.getRowIndex(state);
continueOn: {
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == 0 &&
@@ -942,9 +993,9 @@ public int getRuleStatusVec(int[] fillInArray) {
boolean lookAheadHardBreak =
(stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
- int row = fData.getRowIndex(state);
+ int row = fRData.getRowIndex(state);
- category = (short)fData.fTrie.getCodePointValue(c);
+ category = (short)fRData.fTrie.getCodePointValue(c);
category &= ~0x4000; // Mask off dictionary bit.
if (fTrace) {
@@ -965,7 +1016,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// save the last character's category and look up the current
// character's category
lastCategory = category;
- category = (short)fData.fTrie.getCodePointValue(c);
+ category = (short)fRData.fTrie.getCodePointValue(c);
category &= ~0x4000; // Clear the dictionary bit flag
// (Should be unused; holdover from old RBBI)
@@ -982,7 +1033,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// look up a state transition in the backwards state table
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
- row = fData.getRowIndex(state);
+ row = fRData.getRowIndex(state);
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
// Match found, common case, could have lookahead so we move on to check it