diff --git a/icu4j/build.xml b/icu4j/build.xml index cbe4adf313..6065ed5246 100644 --- a/icu4j/build.xml +++ b/icu4j/build.xml @@ -194,7 +194,7 @@ diff --git a/icu4j/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java b/icu4j/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java index 4eeafb27f6..adc9b80eb5 100755 --- a/icu4j/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java +++ b/icu4j/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java @@ -8,6 +8,7 @@ package com.ibm.icu.dev.test.rbbi; import com.ibm.icu.dev.test.*; import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.RuleBasedBreakIterator_Old; import java.text.StringCharacterIterator; import java.util.Locale; import java.util.Vector; @@ -385,85 +386,91 @@ public class BreakIteratorTest extends TestFmwk //========================================================================= public void TestWordBreak() { - Vector wordSelectionData = new Vector(); - - wordSelectionData.addElement("12,34"); - - wordSelectionData.addElement(" "); - wordSelectionData.addElement("\u00A2"); //cent sign - wordSelectionData.addElement("\u00A3"); //pound sign - wordSelectionData.addElement("\u00A4"); //currency sign - wordSelectionData.addElement("\u00A5"); //yen sign - wordSelectionData.addElement("alpha-beta-gamma"); - wordSelectionData.addElement("."); - wordSelectionData.addElement(" "); - wordSelectionData.addElement("Badges"); - wordSelectionData.addElement("?"); - wordSelectionData.addElement(" "); - wordSelectionData.addElement("BADGES"); - wordSelectionData.addElement("!"); - wordSelectionData.addElement("?"); - wordSelectionData.addElement("!"); - wordSelectionData.addElement(" "); - wordSelectionData.addElement("We"); - wordSelectionData.addElement(" "); - wordSelectionData.addElement("don't"); - wordSelectionData.addElement(" "); - wordSelectionData.addElement("need"); - wordSelectionData.addElement(" "); - wordSelectionData.addElement("no"); - wordSelectionData.addElement(" "); - wordSelectionData.addElement("STINKING"); - wordSelectionData.addElement(" "); - wordSelectionData.addElement("BADGES"); - wordSelectionData.addElement("!"); - wordSelectionData.addElement("!"); - wordSelectionData.addElement("!"); - - wordSelectionData.addElement("012.566,5"); - wordSelectionData.addElement(" "); - wordSelectionData.addElement("123.3434,900"); - wordSelectionData.addElement(" "); - wordSelectionData.addElement("1000,233,456.000"); - wordSelectionData.addElement(" "); - wordSelectionData.addElement("1,23.322%"); - wordSelectionData.addElement(" "); - wordSelectionData.addElement("123.1222"); - - wordSelectionData.addElement(" "); - wordSelectionData.addElement("\u0024123,000.20"); - - wordSelectionData.addElement(" "); - wordSelectionData.addElement("179.01\u0025"); - - wordSelectionData.addElement("Hello"); - wordSelectionData.addElement(","); - wordSelectionData.addElement(" "); - wordSelectionData.addElement("how"); - wordSelectionData.addElement(" "); - wordSelectionData.addElement("are"); - wordSelectionData.addElement(" "); - wordSelectionData.addElement("you"); - wordSelectionData.addElement(" "); - wordSelectionData.addElement("X"); - wordSelectionData.addElement(" "); - - wordSelectionData.addElement("Now"); - wordSelectionData.addElement("\r"); - wordSelectionData.addElement("is"); - wordSelectionData.addElement("\n"); - wordSelectionData.addElement("the"); - wordSelectionData.addElement("\r\n"); - wordSelectionData.addElement("time"); - wordSelectionData.addElement("\n"); - wordSelectionData.addElement("\r"); - wordSelectionData.addElement("for"); - wordSelectionData.addElement("\r"); - wordSelectionData.addElement("\r"); - wordSelectionData.addElement("all"); - wordSelectionData.addElement(" "); - - generalIteratorTest(wordBreak, wordSelectionData); + try { + RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)wordBreak; + Vector wordSelectionData = new Vector(); + + wordSelectionData.addElement("12,34"); + + wordSelectionData.addElement(" "); + wordSelectionData.addElement("\u00A2"); //cent sign + wordSelectionData.addElement("\u00A3"); //pound sign + wordSelectionData.addElement("\u00A4"); //currency sign + wordSelectionData.addElement("\u00A5"); //yen sign + wordSelectionData.addElement("alpha-beta-gamma"); + wordSelectionData.addElement("."); + wordSelectionData.addElement(" "); + wordSelectionData.addElement("Badges"); + wordSelectionData.addElement("?"); + wordSelectionData.addElement(" "); + wordSelectionData.addElement("BADGES"); + wordSelectionData.addElement("!"); + wordSelectionData.addElement("?"); + wordSelectionData.addElement("!"); + wordSelectionData.addElement(" "); + wordSelectionData.addElement("We"); + wordSelectionData.addElement(" "); + wordSelectionData.addElement("don't"); + wordSelectionData.addElement(" "); + wordSelectionData.addElement("need"); + wordSelectionData.addElement(" "); + wordSelectionData.addElement("no"); + wordSelectionData.addElement(" "); + wordSelectionData.addElement("STINKING"); + wordSelectionData.addElement(" "); + wordSelectionData.addElement("BADGES"); + wordSelectionData.addElement("!"); + wordSelectionData.addElement("!"); + wordSelectionData.addElement("!"); + + wordSelectionData.addElement("012.566,5"); + wordSelectionData.addElement(" "); + wordSelectionData.addElement("123.3434,900"); + wordSelectionData.addElement(" "); + wordSelectionData.addElement("1000,233,456.000"); + wordSelectionData.addElement(" "); + wordSelectionData.addElement("1,23.322%"); + wordSelectionData.addElement(" "); + wordSelectionData.addElement("123.1222"); + + wordSelectionData.addElement(" "); + wordSelectionData.addElement("\u0024123,000.20"); + + wordSelectionData.addElement(" "); + wordSelectionData.addElement("179.01\u0025"); + + wordSelectionData.addElement("Hello"); + wordSelectionData.addElement(","); + wordSelectionData.addElement(" "); + wordSelectionData.addElement("how"); + wordSelectionData.addElement(" "); + wordSelectionData.addElement("are"); + wordSelectionData.addElement(" "); + wordSelectionData.addElement("you"); + wordSelectionData.addElement(" "); + wordSelectionData.addElement("X"); + wordSelectionData.addElement(" "); + + wordSelectionData.addElement("Now"); + wordSelectionData.addElement("\r"); + wordSelectionData.addElement("is"); + wordSelectionData.addElement("\n"); + wordSelectionData.addElement("the"); + wordSelectionData.addElement("\r\n"); + wordSelectionData.addElement("time"); + wordSelectionData.addElement("\n"); + wordSelectionData.addElement("\r"); + wordSelectionData.addElement("for"); + wordSelectionData.addElement("\r"); + wordSelectionData.addElement("\r"); + wordSelectionData.addElement("all"); + wordSelectionData.addElement(" "); + + generalIteratorTest(wordBreak, wordSelectionData); + } + catch (ClassCastException e) { + logln("New Break Iterator, skipping old test"); + } } /** @@ -514,62 +521,81 @@ public class BreakIteratorTest extends TestFmwk * @bug 4117554 */ public void TestBug4117554Words() { - Vector wordSelectionData = new Vector(); - - // this is a test for bug #4117554: the ideographic iteration mark (U+3005) should - // count as a Kanji character for the purposes of word breaking - wordSelectionData.addElement("abc"); - wordSelectionData.addElement("\u4e01\u4e02\u3005\u4e03\u4e03"); - wordSelectionData.addElement("abc"); - - generalIteratorTest(wordBreak, wordSelectionData); + try { + RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)wordBreak; + Vector wordSelectionData = new Vector(); + + // this is a test for bug #4117554: the ideographic iteration mark (U+3005) should + // count as a Kanji character for the purposes of word breaking + wordSelectionData.addElement("abc"); + wordSelectionData.addElement("\u4e01\u4e02\u3005\u4e03\u4e03"); + wordSelectionData.addElement("abc"); + + generalIteratorTest(wordBreak, wordSelectionData); + } + catch (ClassCastException e) { + logln("New Break Iterator, skipping old test"); + } } public void TestSentenceBreak() { - Vector sentenceSelectionData = new Vector(); - - sentenceSelectionData.addElement("This is a simple sample sentence. "); - sentenceSelectionData.addElement("(This is it.) "); - sentenceSelectionData.addElement("This is a simple sample sentence. "); - sentenceSelectionData.addElement("\"This isn\'t it.\" "); - sentenceSelectionData.addElement("Hi! "); - sentenceSelectionData.addElement("This is a simple sample sentence. "); - sentenceSelectionData.addElement("It does not have to make any sense as you can see. "); - sentenceSelectionData.addElement("Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. "); - sentenceSelectionData.addElement("Che la dritta via aveo smarrita. "); - sentenceSelectionData.addElement("He said, that I said, that you said!! "); - - sentenceSelectionData.addElement("Don't rock the boat.\u2029"); - - sentenceSelectionData.addElement("Because I am the daddy, that is why. "); - sentenceSelectionData.addElement("Not on my time (el timo.)! "); - - sentenceSelectionData.addElement("So what!!\u2029"); - - sentenceSelectionData.addElement("\"But now,\" he said, \"I know!\" "); - sentenceSelectionData.addElement("Harris thumbed down several, including \"Away We Go\" (which became the huge success Oklahoma!). "); - sentenceSelectionData.addElement("One species, B. anthracis, is highly virulent.\n"); - sentenceSelectionData.addElement("Wolf said about Sounder:\"Beautifully thought-out and directed.\" "); - sentenceSelectionData.addElement("Have you ever said, \"This is where \tI shall live\"? "); - sentenceSelectionData.addElement("He answered, \"You may not!\" "); - sentenceSelectionData.addElement("Another popular saying is: \"How do you do?\". "); - sentenceSelectionData.addElement("Yet another popular saying is: \'I\'m fine thanks.\' "); - sentenceSelectionData.addElement("What is the proper use of the abbreviation pp.? "); - sentenceSelectionData.addElement("Yes, I am definatelly 12\" tall!!"); - - generalIteratorTest(sentenceBreak, sentenceSelectionData); - } + try { + RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)sentenceBreak; + Vector sentenceSelectionData = new Vector(); + + sentenceSelectionData.addElement("This is a simple sample sentence. "); + sentenceSelectionData.addElement("(This is it.) "); + sentenceSelectionData.addElement("This is a simple sample sentence. "); + sentenceSelectionData.addElement("\"This isn\'t it.\" "); + sentenceSelectionData.addElement("Hi! "); + sentenceSelectionData.addElement("This is a simple sample sentence. "); + sentenceSelectionData.addElement("It does not have to make any sense as you can see. "); + sentenceSelectionData.addElement("Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. "); + sentenceSelectionData.addElement("Che la dritta via aveo smarrita. "); + sentenceSelectionData.addElement("He said, that I said, that you said!! "); + + sentenceSelectionData.addElement("Don't rock the boat.\u2029"); + + sentenceSelectionData.addElement("Because I am the daddy, that is why. "); + sentenceSelectionData.addElement("Not on my time (el timo.)! "); + + sentenceSelectionData.addElement("So what!!\u2029"); + + sentenceSelectionData.addElement("\"But now,\" he said, \"I know!\" "); + sentenceSelectionData.addElement("Harris thumbed down several, including \"Away We Go\" (which became the huge success Oklahoma!). "); + sentenceSelectionData.addElement("One species, B. anthracis, is highly virulent.\n"); + sentenceSelectionData.addElement("Wolf said about Sounder:\"Beautifully thought-out and directed.\" "); + sentenceSelectionData.addElement("Have you ever said, \"This is where \tI shall live\"? "); + sentenceSelectionData.addElement("He answered, \"You may not!\" "); + sentenceSelectionData.addElement("Another popular saying is: \"How do you do?\". "); + sentenceSelectionData.addElement("Yet another popular saying is: \'I\'m fine thanks.\' "); + sentenceSelectionData.addElement("What is the proper use of the abbreviation pp.? "); + sentenceSelectionData.addElement("Yes, I am definatelly 12\" tall!!"); + + generalIteratorTest(sentenceBreak, sentenceSelectionData); + } + catch (ClassCastException e) { + logln("New Break Iterator, skipping old test"); + } +} /** * @bug 4113835 */ public void TestBug4113835() { - Vector sentenceSelectionData = new Vector(); - - // test for bug #4113835: \n and \r count as spaces, not as paragraph breaks - sentenceSelectionData.addElement("Now\ris\nthe\r\ntime\n\rfor\r\rall\u2029"); - - generalIteratorTest(sentenceBreak, sentenceSelectionData); + try { + RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)sentenceBreak; + + Vector sentenceSelectionData = new Vector(); + + // test for bug #4113835: \n and \r count as spaces, not as paragraph breaks + sentenceSelectionData.addElement("Now\ris\nthe\r\ntime\n\rfor\r\rall\u2029"); + + generalIteratorTest(sentenceBreak, sentenceSelectionData); + } + catch (ClassCastException e) { + logln("New Break Iterator, skipping old test"); + } } /** @@ -598,46 +624,58 @@ public class BreakIteratorTest extends TestFmwk * @bug 4117554 */ public void TestBug4117554Sentences() { - Vector sentenceSelectionData = new Vector(); - - // Treat fullwidth variants of .!? the same as their - // normal counterparts - sentenceSelectionData.addElement("I know I'm right\uff0e "); - sentenceSelectionData.addElement("Right\uff1f "); - sentenceSelectionData.addElement("Right\uff01 "); - - // Don't break sentences at boundary between CJK and digits - sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8" - + "\u97e48888\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0" - + "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029"); - - // Break sentence between a sentence terminator and - // opening punctuation - sentenceSelectionData.addElement("no?"); - sentenceSelectionData.addElement("(yes)"); - - generalIteratorTest(sentenceBreak, sentenceSelectionData); + try { + RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)sentenceBreak; + Vector sentenceSelectionData = new Vector(); + + // Treat fullwidth variants of .!? the same as their + // normal counterparts + sentenceSelectionData.addElement("I know I'm right\uff0e "); + sentenceSelectionData.addElement("Right\uff1f "); + sentenceSelectionData.addElement("Right\uff01 "); + + // Don't break sentences at boundary between CJK and digits + sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8" + + "\u97e48888\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0" + + "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029"); + + // Break sentence between a sentence terminator and + // opening punctuation + sentenceSelectionData.addElement("no?"); + sentenceSelectionData.addElement("(yes)"); + + generalIteratorTest(sentenceBreak, sentenceSelectionData); + } + catch (ClassCastException e) { + logln("New Break Iterator, skipping old test"); + } } /** * @bug 4158381 */ public void TestBug4158381() { - Vector sentenceSelectionData = new Vector(); - - // Don't break sentence after period if it isn't followed by a space - sentenceSelectionData.addElement("Test Flags.Flag class. "); - sentenceSelectionData.addElement("Another test.\u2029"); - - // No breaks when there are no terminators around - sentenceSelectionData.addElement("

Provides a set of " - + ""lightweight" (all-javaTM" - + " language) components that, " - + "to the maximum degree possible, work the same on all platforms. "); - sentenceSelectionData.addElement("Another test.\u2029"); - - generalIteratorTest(sentenceBreak, sentenceSelectionData); - } + try { + RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)sentenceBreak; + Vector sentenceSelectionData = new Vector(); + + // Don't break sentence after period if it isn't followed by a space + sentenceSelectionData.addElement("Test Flags.Flag class. "); + sentenceSelectionData.addElement("Another test.\u2029"); + + // No breaks when there are no terminators around + sentenceSelectionData.addElement("

Provides a set of " + + ""lightweight" (all-javaTM" + + " language) components that, " + + "to the maximum degree possible, work the same on all platforms. "); + sentenceSelectionData.addElement("Another test.\u2029"); + + generalIteratorTest(sentenceBreak, sentenceSelectionData); + } + catch (ClassCastException e) { + logln("New Break Iterator, skipping old test"); + } +} /** * @bug 4143071 @@ -767,7 +805,6 @@ public class BreakIteratorTest extends TestFmwk lineSelectionData.addElement("mouse "); lineSelectionData.addElement("(one)"); lineSelectionData.addElement("(two)\n"); - generalIteratorTest(lineBreak, lineSelectionData); } @@ -775,40 +812,52 @@ public class BreakIteratorTest extends TestFmwk * @bug 4035266 */ public void TestBug4035266() { - Vector lineSelectionData = new Vector(); - - lineSelectionData.addElement("The "); - lineSelectionData.addElement("balance "); - lineSelectionData.addElement("is "); - lineSelectionData.addElement("$-23,456.78, "); - lineSelectionData.addElement("not "); - lineSelectionData.addElement("-$32,456.78!\n"); - - generalIteratorTest(lineBreak, lineSelectionData); - } + try { + RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)lineBreak; + Vector lineSelectionData = new Vector(); + + lineSelectionData.addElement("The "); + lineSelectionData.addElement("balance "); + lineSelectionData.addElement("is "); + lineSelectionData.addElement("$-23,456.78, "); + lineSelectionData.addElement("not "); + lineSelectionData.addElement("-$32,456.78!\n"); + + generalIteratorTest(lineBreak, lineSelectionData); + } + catch (ClassCastException e) { + logln("New Break Iterator, skipping old test"); + } +} /** * @bug 4098467 */ public void TestBug4098467Lines() { - Vector lineSelectionData = new Vector(); - - // What follows is a string of Korean characters (I found it in the Yellow Pages - // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed - // it correctly), first as precomposed syllables, and then as conjoining jamo. - // Both sequences should be semantically identical and break the same way. - // precomposed syllables... - lineSelectionData.addElement("\uc0c1\ud56d "); - lineSelectionData.addElement("\ud55c\uc778 "); - lineSelectionData.addElement("\uc5f0\ud569 "); - lineSelectionData.addElement("\uc7a5\ub85c\uad50\ud68c "); - // conjoining jamo... - lineSelectionData.addElement("\u1109\u1161\u11bc\u1112\u1161\u11bc "); - lineSelectionData.addElement("\u1112\u1161\u11ab\u110b\u1175\u11ab "); - lineSelectionData.addElement("\u110b\u1167\u11ab\u1112\u1161\u11b8 "); - lineSelectionData.addElement("\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c"); - - generalIteratorTest(lineBreak, lineSelectionData); + try { + RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)lineBreak; + Vector lineSelectionData = new Vector(); + + // What follows is a string of Korean characters (I found it in the Yellow Pages + // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed + // it correctly), first as precomposed syllables, and then as conjoining jamo. + // Both sequences should be semantically identical and break the same way. + // precomposed syllables... + lineSelectionData.addElement("\uc0c1\ud56d "); + lineSelectionData.addElement("\ud55c\uc778 "); + lineSelectionData.addElement("\uc5f0\ud569 "); + lineSelectionData.addElement("\uc7a5\ub85c\uad50\ud68c "); + // conjoining jamo... + lineSelectionData.addElement("\u1109\u1161\u11bc\u1112\u1161\u11bc "); + lineSelectionData.addElement("\u1112\u1161\u11ab\u110b\u1175\u11ab "); + lineSelectionData.addElement("\u110b\u1167\u11ab\u1112\u1161\u11b8 "); + lineSelectionData.addElement("\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c"); + + generalIteratorTest(lineBreak, lineSelectionData); + } + catch (ClassCastException e) { + logln("New Break Iterator, skipping old test"); + } } public void TestThaiLineBreak() { @@ -949,22 +998,28 @@ lineSelectionData.addElement("(\u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e44\u0e17\ * @bug 4217703 */ public void TestBug4217703() { - Vector lineSelectionData = new Vector(); - - // There shouldn't be a line break between sentence-ending punctuation - // and a closing quote - lineSelectionData.addElement("He "); - lineSelectionData.addElement("said "); - lineSelectionData.addElement("\"Go!\" "); - lineSelectionData.addElement("I "); - lineSelectionData.addElement("went. "); - - lineSelectionData.addElement("Hashtable$Enumeration "); - lineSelectionData.addElement("getText()."); - lineSelectionData.addElement("getIndex()"); - - generalIteratorTest(lineBreak, lineSelectionData); - } + try { + RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)lineBreak; + Vector lineSelectionData = new Vector(); + + // There shouldn't be a line break between sentence-ending punctuation + // and a closing quote + lineSelectionData.addElement("He "); + lineSelectionData.addElement("said "); + lineSelectionData.addElement("\"Go!\" "); + lineSelectionData.addElement("I "); + lineSelectionData.addElement("went. "); + + lineSelectionData.addElement("Hashtable$Enumeration "); + lineSelectionData.addElement("getText()."); + lineSelectionData.addElement("getIndex()"); + + generalIteratorTest(lineBreak, lineSelectionData); + } + catch (ClassCastException e) { + logln("New Break Iterator, skipping old test"); + } +} private static final String graveS = "S\u0300"; private static final String acuteBelowI = "i\u0317"; @@ -1091,18 +1146,24 @@ lineSelectionData.addElement("(\u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e44\u0e17\ } public void TestBug4146175Sentences() { - Vector sentenceSelectionData = new Vector(); - - // break between periods and opening punctuation even when there's no - // intervening space - sentenceSelectionData.addElement("end."); - sentenceSelectionData.addElement("(This is\u2029"); - - // treat the fullwidth period as an unambiguous sentence terminator - sentenceSelectionData.addElement("\u7d42\u308f\u308a\uff0e"); - sentenceSelectionData.addElement("\u300c\u3053\u308c\u306f"); - - generalIteratorTest(sentenceBreak, sentenceSelectionData); + try { + RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)lineBreak; + Vector sentenceSelectionData = new Vector(); + + // break between periods and opening punctuation even when there's no + // intervening space + sentenceSelectionData.addElement("end."); + sentenceSelectionData.addElement("(This is\u2029"); + + // treat the fullwidth period as an unambiguous sentence terminator + sentenceSelectionData.addElement("\u7d42\u308f\u308a\uff0e"); + sentenceSelectionData.addElement("\u300c\u3053\u308c\u306f"); + + generalIteratorTest(sentenceBreak, sentenceSelectionData); + } + catch (ClassCastException e) { + logln("New Break Iterator, skipping old test"); + } } public void TestBug4146175Lines() { @@ -1116,14 +1177,20 @@ lineSelectionData.addElement("(\u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e44\u0e17\ } public void TestBug4214367() { - Vector wordSelectionData = new Vector(); - - // the hiragana and katakana iteration marks and the long vowel mark - // are not being treated correctly by the word-break iterator - wordSelectionData.addElement("\u3042\u3044\u309d\u3042\u309e\u3042\u30fc\u3042"); - wordSelectionData.addElement("\u30a2\u30a4\u30fd\u30a2\u30fe\u30a2\u30fc\u30a2"); - - generalIteratorTest(wordBreak, wordSelectionData); + try { + RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)wordBreak; + Vector wordSelectionData = new Vector(); + + // the hiragana and katakana iteration marks and the long vowel mark + // are not being treated correctly by the word-break iterator + wordSelectionData.addElement("\u3042\u3044\u309d\u3042\u309e\u3042\u30fc\u3042"); + wordSelectionData.addElement("\u30a2\u30a4\u30fd\u30a2\u30fe\u30a2\u30fc\u30a2"); + + generalIteratorTest(wordBreak, wordSelectionData); + } + catch (ClassCastException e) { + logln("New Break Iterator, skipping old test"); + } } private static final String cannedTestChars @@ -1142,100 +1209,118 @@ lineSelectionData.addElement("(\u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e44\u0e17\ public void TestWordInvariants() { - BreakIterator e = BreakIterator.getWordInstance(); - doBreakInvariantTest(e, cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2" - + "\u30a3\u4e00\u4e01\u4e02"); - doOtherInvariantTest(e, cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2" - + "\u30a3\u4e00\u4e01\u4e02"); + BreakIterator e = BreakIterator.getWordInstance(); + try { + RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)e; + doBreakInvariantTest(e, cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2" + + "\u30a3\u4e00\u4e01\u4e02"); + doOtherInvariantTest(e, cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2" + + "\u30a3\u4e00\u4e01\u4e02"); + } + catch (ClassCastException ex) { + logln("New Break Iterator, skipping old test"); + } } public void TestLineInvariants() { - BreakIterator e = BreakIterator.getLineInstance(); - String testChars = cannedTestChars + ".,;:\u3001\u3002\u3041\u3042\u3043\u3044\u3045" - + "\u30a3\u4e00\u4e01\u4e02"; - doBreakInvariantTest(e, testChars); - doOtherInvariantTest(e, testChars); - - int errorCount = 0; - - // in addition to the other invariants, a line-break iterator should make sure that: - // it doesn't break around the non-breaking characters - String noBreak = "\u00a0\u2007\u2011\ufeff"; - StringBuffer work = new StringBuffer("aaa"); - for (int i = 0; i < testChars.length(); i++) { - char c = testChars.charAt(i); - if (c == '\r' || c == '\n' || c == '\u2029' || c == '\u2028' || c == '\u0003') - continue; - work.setCharAt(0, c); - for (int j = 0; j < noBreak.length(); j++) { - work.setCharAt(1, noBreak.charAt(j)); - for (int k = 0; k < testChars.length(); k++) { - work.setCharAt(2, testChars.charAt(k)); - e.setText(work.toString()); - for (int l = e.first(); l != BreakIterator.DONE; l = e.next()) - if (l == 1 || l == 2) { - errln("Got break between U+" + Integer.toHexString((int) - (work.charAt(l - 1))) + " and U+" + Integer.toHexString( - (int)(work.charAt(l)))); - errorCount++; - if (errorCount >= 75) - return; - } - } - } - } - - // it does break after dashes (unless they're followed by a digit, a non-spacing mark, - // a currency symbol, a space, a format-control character, a regular control character, - // a line or paragraph separator, or another dash) - String dashes = "-\u00ad\u2010\u2012\u2013\u2014"; - for (int i = 0; i < testChars.length(); i++) { - work.setCharAt(0, testChars.charAt(i)); - for (int j = 0; j < dashes.length(); j++) { - work.setCharAt(1, dashes.charAt(j)); - for (int k = 0; k < testChars.length(); k++) { - char c = testChars.charAt(k); - if (Character.getType(c) == Character.DECIMAL_DIGIT_NUMBER || - Character.getType(c) == Character.OTHER_NUMBER || - Character.getType(c) == Character.NON_SPACING_MARK || - Character.getType(c) == Character.ENCLOSING_MARK || - Character.getType(c) == Character.CURRENCY_SYMBOL || - Character.getType(c) == Character.DASH_PUNCTUATION || - Character.getType(c) == Character.SPACE_SEPARATOR || - Character.getType(c) == Character.FORMAT || - Character.getType(c) == Character.CONTROL || - c == '\n' || c == '\r' || c == '\u2028' || c == '\u2029' || - c == '\u0003' || c == '\u2007' || c == '\u2011' || - c == '\ufeff') - continue; - work.setCharAt(2, c); - e.setText(work.toString()); - boolean saw2 = false; - for (int l = e.first(); l != BreakIterator.DONE; l = e.next()) - if (l == 2) - saw2 = true; - if (!saw2) { - errln("Didn't get break between U+" + Integer.toHexString((int) - (work.charAt(1))) + " and U+" + Integer.toHexString( - (int)(work.charAt(2)))); - errorCount++; - if (errorCount >= 75) - return; - } - } - } - } + BreakIterator e = BreakIterator.getLineInstance(); + try { + RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)e; + String testChars = cannedTestChars + ".,;:\u3001\u3002\u3041\u3042\u3043\u3044\u3045" + + "\u30a3\u4e00\u4e01\u4e02"; + doBreakInvariantTest(e, testChars); + doOtherInvariantTest(e, testChars); + + int errorCount = 0; + + // in addition to the other invariants, a line-break iterator should make sure that: + // it doesn't break around the non-breaking characters + String noBreak = "\u00a0\u2007\u2011\ufeff"; + StringBuffer work = new StringBuffer("aaa"); + for (int i = 0; i < testChars.length(); i++) { + char c = testChars.charAt(i); + if (c == '\r' || c == '\n' || c == '\u2029' || c == '\u2028' || c == '\u0003') + continue; + work.setCharAt(0, c); + for (int j = 0; j < noBreak.length(); j++) { + work.setCharAt(1, noBreak.charAt(j)); + for (int k = 0; k < testChars.length(); k++) { + work.setCharAt(2, testChars.charAt(k)); + e.setText(work.toString()); + for (int l = e.first(); l != BreakIterator.DONE; l = e.next()) + if (l == 1 || l == 2) { + errln("Got break between U+" + Integer.toHexString((int) + (work.charAt(l - 1))) + " and U+" + Integer.toHexString( + (int)(work.charAt(l)))); + errorCount++; + if (errorCount >= 75) + return; + } + } + } + } + + // it does break after dashes (unless they're followed by a digit, a non-spacing mark, + // a currency symbol, a space, a format-control character, a regular control character, + // a line or paragraph separator, or another dash) + String dashes = "-\u00ad\u2010\u2012\u2013\u2014"; + for (int i = 0; i < testChars.length(); i++) { + work.setCharAt(0, testChars.charAt(i)); + for (int j = 0; j < dashes.length(); j++) { + work.setCharAt(1, dashes.charAt(j)); + for (int k = 0; k < testChars.length(); k++) { + char c = testChars.charAt(k); + if (Character.getType(c) == Character.DECIMAL_DIGIT_NUMBER || + Character.getType(c) == Character.OTHER_NUMBER || + Character.getType(c) == Character.NON_SPACING_MARK || + Character.getType(c) == Character.ENCLOSING_MARK || + Character.getType(c) == Character.CURRENCY_SYMBOL || + Character.getType(c) == Character.DASH_PUNCTUATION || + Character.getType(c) == Character.SPACE_SEPARATOR || + Character.getType(c) == Character.FORMAT || + Character.getType(c) == Character.CONTROL || + c == '\n' || c == '\r' || c == '\u2028' || c == '\u2029' || + c == '\u0003' || c == '\u2007' || c == '\u2011' || + c == '\ufeff') + continue; + work.setCharAt(2, c); + e.setText(work.toString()); + boolean saw2 = false; + for (int l = e.first(); l != BreakIterator.DONE; l = e.next()) + if (l == 2) + saw2 = true; + if (!saw2) { + errln("Didn't get break between U+" + Integer.toHexString((int) + (work.charAt(1))) + " and U+" + Integer.toHexString( + (int)(work.charAt(2)))); + errorCount++; + if (errorCount >= 75) + return; + } + } + } + } + } + catch (ClassCastException ex) { + logln("New Break Iterator, skipping old test"); + } } public void TestCharacterInvariants() - { - BreakIterator e = BreakIterator.getCharacterInstance(); - doBreakInvariantTest(e, cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8" - + "\u11a9\u11aa"); - doOtherInvariantTest(e, cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8" - + "\u11a9\u11aa"); - } + { + BreakIterator e = BreakIterator.getCharacterInstance(); + try { + RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)e; + doBreakInvariantTest(e, cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8" + + "\u11a9\u11aa"); + doOtherInvariantTest(e, cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8" + + "\u11a9\u11aa"); + } + catch (ClassCastException ex) { + logln("New Break Iterator, skipping old test"); + } + } public void TestEmptyString() { @@ -1260,132 +1345,144 @@ lineSelectionData.addElement("(\u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e44\u0e17\ */ public void TestJapaneseLineBreak() { - StringBuffer testString = new StringBuffer("\u4e00x\u4e8c"); - String precedingChars = "([{\u00ab$\u00a5\u00a3\u00a4\u2018\u201a\u201c\u201e\u201b\u201f"; - String followingChars = ")]}\u00bb!%,.\u3001\u3002\u3063\u3083\u3085\u3087\u30c3\u30e3\u30e5\u30e7\u30fc:;\u309b\u309c\u3005\u309d\u309e\u30fd\u30fe\u2019\u201d\u00b0\u2032\u2033\u2034\u2030\u2031\u2103\u2109\u00a2\u0300\u0301\u0302"; - BreakIterator iter = BreakIterator.getLineInstance(Locale.JAPAN); - - for (int i = 0; i < precedingChars.length(); i++) { - testString.setCharAt(1, precedingChars.charAt(i)); - iter.setText(testString.toString()); - int j = iter.first(); - if (j != 0) - errln("ja line break failure: failed to start at 0"); - j = iter.next(); - if (j != 1) - errln("ja line break failure: failed to stop before '" + precedingChars.charAt(i) - + "' (" + ((int)(precedingChars.charAt(i))) + ")"); - j = iter.next(); - if (j != 3) - errln("ja line break failure: failed to skip position after '" + precedingChars.charAt(i) - + "' (" + ((int)(precedingChars.charAt(i))) + ")"); - } - - for (int i = 0; i < followingChars.length(); i++) { - testString.setCharAt(1, followingChars.charAt(i)); - iter.setText(testString.toString()); - int j = iter.first(); - if (j != 0) - errln("ja line break failure: failed to start at 0"); - j = iter.next(); - if (j != 2) - errln("ja line break failure: failed to skip position before '" + followingChars.charAt(i) - + "' (" + ((int)(followingChars.charAt(i))) + ")"); - j = iter.next(); - if (j != 3) - errln("ja line break failure: failed to stop after '" + followingChars.charAt(i) - + "' (" + ((int)(followingChars.charAt(i))) + ")"); - } + StringBuffer testString = new StringBuffer("\u4e00x\u4e8c"); + String precedingChars = "([{\u00ab$\u00a5\u00a3\u00a4\u2018\u201a\u201c\u201e\u201b\u201f"; + String followingChars = ")]}\u00bb!%,.\u3001\u3002\u3063\u3083\u3085\u3087\u30c3\u30e3\u30e5\u30e7\u30fc:;\u309b\u309c\u3005\u309d\u309e\u30fd\u30fe\u2019\u201d\u00b0\u2032\u2033\u2034\u2030\u2031\u2103\u2109\u00a2\u0300\u0301\u0302"; + BreakIterator iter = BreakIterator.getLineInstance(Locale.JAPAN); + try { + RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)iter; + + for (int i = 0; i < precedingChars.length(); i++) { + testString.setCharAt(1, precedingChars.charAt(i)); + iter.setText(testString.toString()); + int j = iter.first(); + if (j != 0) + errln("ja line break failure: failed to start at 0"); + j = iter.next(); + if (j != 1) + errln("ja line break failure: failed to stop before '" + precedingChars.charAt(i) + + "' (" + ((int)(precedingChars.charAt(i))) + ")"); + j = iter.next(); + if (j != 3) + errln("ja line break failure: failed to skip position after '" + precedingChars.charAt(i) + + "' (" + ((int)(precedingChars.charAt(i))) + ")"); + } + + for (int i = 0; i < followingChars.length(); i++) { + testString.setCharAt(1, followingChars.charAt(i)); + iter.setText(testString.toString()); + int j = iter.first(); + if (j != 0) + errln("ja line break failure: failed to start at 0"); + j = iter.next(); + if (j != 2) + errln("ja line break failure: failed to skip position before '" + followingChars.charAt(i) + + "' (" + ((int)(followingChars.charAt(i))) + ")"); + j = iter.next(); + if (j != 3) + errln("ja line break failure: failed to stop after '" + followingChars.charAt(i) + + "' (" + ((int)(followingChars.charAt(i))) + ")"); + } + } + catch (ClassCastException e) { + logln("New Break Iterator, skipping old test"); + } } /** * Bug 4638433 */ - public void TestLineBreakBasedOnUnicode3_0_0() { - BreakIterator iter; - int i; - - /* Latin Extend-B characters - * 0x0218-0x0233 which have been added since Unicode 3.0.0. - */ - iter = BreakIterator.getWordInstance(Locale.US); - iter.setText("\u0216\u0217\u0218\u0219\u021A"); - i = iter.first(); - i = iter.next(); - if (i != 5) { - errln("Word break failure: failed to stop at 5 and bounded at " + i); + public void TestLineBreakBasedOnUnicode3_0_0() { + BreakIterator iter; + int i; + + /* Latin Extend-B characters + * 0x0218-0x0233 which have been added since Unicode 3.0.0. + */ + iter = BreakIterator.getWordInstance(Locale.US); + try { + RuleBasedBreakIterator_Old olb = (RuleBasedBreakIterator_Old)iter; + iter.setText("\u0216\u0217\u0218\u0219\u021A"); + i = iter.first(); + i = iter.next(); + if (i != 5) { + errln("Word break failure: failed to stop at 5 and bounded at " + i); + } + + + iter = BreakIterator.getLineInstance(Locale.US); + + /* + * \u301f has changed its category from Ps to Pe since Unicode 2.1. + */ + iter.setText("32\u301f1"); + i = iter.first(); + i = iter.next(); + if (i != 3) { + errln("Line break failure: failed to skip before \\u301F(Pe) at 3 and bounded at " + i); + } + + /* Mongolian + * which have been added since Unicode 3.0.0. + */ + iter.setText("\u1820\u1806\u1821"); + i = iter.first(); + i = iter.next(); + if (i != 2) { + errln("Mongolian line break failure: failed to skip position before \\u1806(Pd) at 2 and bounded at " + i); + } + + /* Khmer which have + * been added since Unicode 3.0.0. + */ + /* + * Richard: fail to pass, refer to #3550 + iter.setText("\u17E0\u17DB\u17E1"); + i = iter.first(); + i = iter.next(); + if (i != 1) { + errln("Khmer line break failure: failed to stop before \\u17DB(Sc) at 1 and bounded at " + i); + } + i = iter.next(); + if (i != 3) { + errln("Khmer line break failure: failed to skip position after \\u17DB(Sc) at 3 and bounded at " + i); + }*/ + + /* Ogham which have + * been added since Unicode 3.0.0. + */ + iter.setText("\u1692\u1680\u1696"); + i = iter.first(); + i = iter.next(); + if (i != 2) { + errln("Ogham line break failure: failed to skip postion before \\u1680(Zs) at 2 and bounded at " + i); + } + + + // Confirm changes in BreakIteratorRules_th.java have been reflected. + iter = BreakIterator.getLineInstance(new Locale("th", "")); + + /* Thai + * + * + * + * + */ + iter.setText("\u0E57\u201C\u0E55\u201D\u0E53"); + i = iter.first(); + i = iter.next(); + if (i != 1) { + errln("Thai line break failure: failed to stop before \\u201C(Pi) at 1 and bounded at " + i); + } + i = iter.next(); + if (i != 4) { + errln("Thai line break failure: failed to stop after \\u201D(Pf) at 4 and bounded at " + i); + } + } + catch (ClassCastException e) { + logln("New Break Iterator, skipping old test"); + } } - - - iter = BreakIterator.getLineInstance(Locale.US); - - /* - * \u301f has changed its category from Ps to Pe since Unicode 2.1. - */ - iter.setText("32\u301f1"); - i = iter.first(); - i = iter.next(); - if (i != 3) { - errln("Line break failure: failed to skip before \\u301F(Pe) at 3 and bounded at " + i); - } - - /* Mongolian - * which have been added since Unicode 3.0.0. - */ - iter.setText("\u1820\u1806\u1821"); - i = iter.first(); - i = iter.next(); - if (i != 2) { - errln("Mongolian line break failure: failed to skip position before \\u1806(Pd) at 2 and bounded at " + i); - } - - /* Khmer which have - * been added since Unicode 3.0.0. - */ - /* - * Richard: fail to pass, refer to #3550 - iter.setText("\u17E0\u17DB\u17E1"); - i = iter.first(); - i = iter.next(); - if (i != 1) { - errln("Khmer line break failure: failed to stop before \\u17DB(Sc) at 1 and bounded at " + i); - } - i = iter.next(); - if (i != 3) { - errln("Khmer line break failure: failed to skip position after \\u17DB(Sc) at 3 and bounded at " + i); - }*/ - - /* Ogham which have - * been added since Unicode 3.0.0. - */ - iter.setText("\u1692\u1680\u1696"); - i = iter.first(); - i = iter.next(); - if (i != 2) { - errln("Ogham line break failure: failed to skip postion before \\u1680(Zs) at 2 and bounded at " + i); - } - - - // Confirm changes in BreakIteratorRules_th.java have been reflected. - iter = BreakIterator.getLineInstance(new Locale("th", "")); - - /* Thai - * - * - * - * - */ - iter.setText("\u0E57\u201C\u0E55\u201D\u0E53"); - i = iter.first(); - i = iter.next(); - if (i != 1) { - errln("Thai line break failure: failed to stop before \\u201C(Pi) at 1 and bounded at " + i); - } - i = iter.next(); - if (i != 4) { - errln("Thai line break failure: failed to stop after \\u201D(Pf) at 4 and bounded at " + i); - } - } /** * @bug 4068137 diff --git a/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBIAPITest.java b/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBIAPITest.java index f1a8684f29..6fdd8700d1 100755 --- a/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBIAPITest.java +++ b/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBIAPITest.java @@ -186,7 +186,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk { errln("ERROR: next()/following() at last position returned #" + p + " and " + q + " instead of" + testString.length() + "\n"); RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getCharacterInstance(Locale.getDefault()); - testString = "Write hindi here. \u092d\u093e\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 \u0939\u094c\u0964"; + testString = "Write hindi here. \u092d\u093e\u0930\u0301 \u0938\u0941\u0902\u0926\u0930 \u0939\u094c\u0964"; logln("testing char iter - string:- \"" + testString + "\""); charIter1.setText(testString); p = charIter1.first(); @@ -209,7 +209,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk { // hindi starts here p = q; q = charIter1.next(4); - doTest(testString, p, q, 22, " \u092d\u093e\u0930\u0924"); + doTest(testString, p, q, 22, " \u092d\u093e\u0930\u0301"); // Nonsense, but compatible between old and new rules. p = q; q = charIter1.next(2); doTest(testString, p, q, 26, " \u0938\u0941\u0902"); @@ -217,13 +217,13 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk { q = charIter1.following(24); doTest(testString, 24, q, 26, "\u0941\u0902"); q = charIter1.following(20); - doTest(testString, 20, q, 21, "\u0930"); + doTest(testString, 20, q, 22, "\u0930\u0301"); p = charIter1.following(charIter1.last()); q = charIter1.next(charIter1.last()); if (p != RuleBasedBreakIterator.DONE || q != RuleBasedBreakIterator.DONE) errln("ERROR: following()/next() at last position returned #" + p + " and " + q + " instead of" + testString.length()); - testString = "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000."; + testString = "Hello! how are you? I'am fine. Thankyou. How are you doing? This costs $20,00,000."; RuleBasedBreakIterator sentIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getSentenceInstance(Locale.getDefault()); logln("testing sentence iter - String:- \"" + testString + "\""); sentIter1.setText(testString); @@ -243,7 +243,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk { doTest(testString, p, q, 60, "how are you? I'am fine. Thankyou. How are you doing? "); p = q; q = sentIter1.next(); - doTest(testString, p, q, 83, "This\n costs $20,00,000."); + doTest(testString, p, q, 83, "This costs $20,00,000."); q = sentIter1.following(1); doTest(testString, 1, q, 7, "ello! "); q = sentIter1.following(10); @@ -324,7 +324,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk { p = wordIter1.preceding(wordIter1.first()); if (p != RuleBasedBreakIterator.DONE) errln("ERROR: preceding() at starting position returned #" + p + " instead of 0"); - testString = "Write hindi here. \u092d\u093e\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 \u0939\u094c\u0964"; + testString = "Write hindi here. \u092d\u093e\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 \u0939\u0301\u0964"; logln("testing character iteration for string \" " + testString + "\" \n"); RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getCharacterInstance(Locale.getDefault()); charIter1.setText(testString); @@ -335,7 +335,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk { doTest(testString, p, q, 31, "\u0964"); p = q; q = charIter1.previous(); - doTest(testString, p, q, 29, "\u0939\u094c"); + doTest(testString, p, q, 29, "\u0939\u0301"); q = charIter1.preceding(26); doTest(testString, 26, q, 23, "\u0938\u0941\u0902"); q = charIter1.preceding(16); @@ -349,7 +349,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk { if (p != RuleBasedBreakIterator.DONE || q != RuleBasedBreakIterator.DONE) errln("ERROR: previous()/preceding() at starting position returned #" + p + " and " + q + " instead of 0\n"); - testString = "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000."; + testString = "Hello! how are you? I'am fine. Thankyou. How are you doing? This costs $20,00,000."; logln("testing sentence iter - String:- \"" + testString + "\""); RuleBasedBreakIterator sentIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getSentenceInstance(Locale.getDefault()); sentIter1.setText(testString); @@ -357,7 +357,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk { if (p != testString.length()) errln("ERROR: last() returned" + p + "instead of " + testString.length()); q = sentIter1.previous(); - doTest(testString, p, q, 60, "This\n costs $20,00,000."); + doTest(testString, p, q, 60, "This costs $20,00,000."); p = q; q = sentIter1.previous(); doTest(testString, p, q, 41, "How are you doing? "); @@ -399,7 +399,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk { * Tests the method IsBoundary() of RuleBasedBreakIterator **/ public void TestIsBoundary() { - String testString1 = "Write here. \u092d\u093e\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 \u0939\u094c\u0964"; + String testString1 = "Write here. \u092d\u0301\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 a\u0301u"; RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getCharacterInstance(Locale.getDefault()); charIter1.setText(testString1); int bounds1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 20, 21, 22, 23, 25, 26}; diff --git a/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITest.java index b46869951d..f868b60229 100755 --- a/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITest.java +++ b/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITest.java @@ -9,6 +9,7 @@ package com.ibm.icu.dev.test.rbbi; //Regression testing of RuleBasedBreakIterator import com.ibm.icu.dev.test.*; import com.ibm.icu.text.RuleBasedBreakIterator; +import com.ibm.icu.text.RuleBasedBreakIterator_Old; import java.util.Vector; public class RBBITest extends TestFmwk @@ -43,6 +44,15 @@ public class RBBITest extends TestFmwk public void TestDefaultRuleBasedCharacterIteration(){ RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getCharacterInstance(); logln("Testing the RBBI for character iteration by using default rules"); + try { + RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi; + } + catch (ClassCastException e) { + // Bail out if using new RBBI implementation + logln("Test Skipped."); + return; + } + //fetch the rules used to create the above RuleBasedBreakIterator String defaultRules=rbbi.toString(); @@ -172,6 +182,14 @@ public class RBBITest extends TestFmwk public void TestDefaultRuleBasedWordIteration(){ logln("Testing the RBBI for word iteration using default rules"); RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getWordInstance(); + try { + RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi; + } + catch (ClassCastException e) { + // Bail out if using new RBBI implementation + logln("Test Skipped."); + return; + } //fetch the rules used to create the above RuleBasedBreakIterator String defaultRules=rbbi.toString(); @@ -325,6 +343,14 @@ public class RBBITest extends TestFmwk logln("Testing the RBBI for sentence iteration using default rules"); RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getSentenceInstance(); //fetch the rules used to create the above RuleBasedBreakIterator + try { + RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi; + } + catch (ClassCastException e) { + // Bail out if using new RBBI implementation + logln("Test Skipped."); + return; + } String defaultRules=rbbi.toString(); RuleBasedBreakIterator sentIterDefault=null; try{ @@ -418,16 +444,24 @@ public class RBBITest extends TestFmwk } public void TestDefaultRuleBasedLineIteration(){ - logln("Testing the RBBI for line iteration using default rules"); - RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getLineInstance(); - //fetch the rules used to create the above RuleBasedBreakIterator - String defaultRules=rbbi.toString(); - RuleBasedBreakIterator lineIterDefault=null; - try{ - lineIterDefault = new RuleBasedBreakIterator(defaultRules); - }catch(IllegalArgumentException iae){ - errln("ERROR: failed construction in TestDefaultRuleBasedLineIteration()" + iae.toString()); - } + logln("Testing the RBBI for line iteration using default rules"); + RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getLineInstance(); + //fetch the rules used to create the above RuleBasedBreakIterator + try { + RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi; + } + catch (ClassCastException e) { + // Bail out if using new RBBI implementation + logln("Test Skipped."); + return; + } + String defaultRules=rbbi.toString(); + RuleBasedBreakIterator lineIterDefault=null; + try{ + lineIterDefault = new RuleBasedBreakIterator(defaultRules); + }catch(IllegalArgumentException iae){ + errln("ERROR: failed construction in TestDefaultRuleBasedLineIteration()" + iae.toString()); + } Vector linedata = new Vector(); linedata.addElement("Multi-"); @@ -524,6 +558,15 @@ public class RBBITest extends TestFmwk // get overridden. rbbi.toString(); RuleBasedBreakIterator lineIter=null; + try { + RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi; + } + catch (ClassCastException e) { + // Bail out if using new RBBI implementation + logln("Test Skipped."); + return; + } + try{ lineIter = new RuleBasedBreakIterator(rules); }catch(IllegalArgumentException iae){ @@ -651,7 +694,15 @@ public class RBBITest extends TestFmwk public void TestAbbrRuleBasedWordIteration(){ logln("Testing the RBBI for word iteration by adding rules to support abbreviation"); RuleBasedBreakIterator rb =(RuleBasedBreakIterator)RuleBasedBreakIterator.getWordInstance(); - + try { + // This test won't work with the new break iterators. Cast will fail in this case. + RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old) rb; + } + catch (ClassCastException e) { + logln("Test skipped."); + return; + } + String wrules2="$abbr=((Mr.)|(Mrs.)|(Ms.)|(Dr.)|(U.S.));" + // abbreviations. rb.toString() + "($abbr$ws)*$word;"; @@ -701,6 +752,10 @@ public class RBBITest extends TestFmwk buffer.append(text); } text = buffer.toString(); + if (rbbi == null) { + errln("null iterator, test skipped."); + return; + } rbbi.setText(text); diff --git a/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules.java b/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules.java index fa8bf4ac7a..feadf38e96 100755 --- a/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules.java +++ b/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules.java @@ -29,11 +29,11 @@ public class BreakIteratorRules extends ListResourceBundle { // BreakIteratorClasses lists the class names to instantiate for each // built-in type of BreakIterator { "BreakIteratorClasses", - new String[] { "RuleBasedBreakIterator", // character-break iterator class - "RuleBasedBreakIterator", // word-break iterator class - "RuleBasedBreakIterator", // line-break iterator class - "RuleBasedBreakIterator", // sentence-break iterator class - "RuleBasedBreakIterator"} // Title-Case break iterator class + new String[] { "RuleBasedBreakIterator_New", // character-break iterator class + "RuleBasedBreakIterator_New", // word-break iterator class + "RuleBasedBreakIterator_New", // line-break iterator class + "RuleBasedBreakIterator_New", // sentence-break iterator class + "RuleBasedBreakIterator_New"} // Title-Case break iterator class }, // rules describing how to break between logical characters diff --git a/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules_th.java b/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules_th.java index a834611126..15901398dc 100755 --- a/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules_th.java +++ b/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules_th.java @@ -27,10 +27,10 @@ public class BreakIteratorRules_th extends ListResourceBundle { // iterator. Notice we're now using DictionaryBasedBreakIterator // for word and line breaking. { "BreakIteratorClasses", - new String[] { "RuleBasedBreakIterator", // character-break iterator class + new String[] { "RuleBasedBreakIterator_New", // character-break iterator class "DictionaryBasedBreakIterator", // word-break iterator class "DictionaryBasedBreakIterator", // line-break iterator class - "RuleBasedBreakIterator" } // sentence-break iterator class + "RuleBasedBreakIterator_New" } // sentence-break iterator class }, { "WordBreakRules", diff --git a/icu4j/src/com/ibm/icu/text/BreakIteratorFactory.java b/icu4j/src/com/ibm/icu/text/BreakIteratorFactory.java index db4cb9abed..738e2b6e39 100644 --- a/icu4j/src/com/ibm/icu/text/BreakIteratorFactory.java +++ b/icu4j/src/com/ibm/icu/text/BreakIteratorFactory.java @@ -18,6 +18,7 @@ import com.ibm.icu.impl.ICULocaleService; import com.ibm.icu.impl.ICUService; import com.ibm.icu.impl.ICUService.Factory; import com.ibm.icu.util.ULocale; +import com.ibm.icu.util.VersionInfo; /** * @author Ram @@ -76,10 +77,26 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim } static final ICULocaleService service = new BFService(); + // KIND_NAMES are used in synthesizing the resource name that holds the source + // break rules. For old-style (ICU 2.8 and previous) break iterators. + // The resources are com.ibm.icu.impl.data.BreakIteratorRules, and have + // names like "CharacterBreakRules", where the "Character" part of the + // name comes from here (this array). private static final String[] KIND_NAMES = { "Character", "Word", "Line", "Sentence", "Title" }; + /** KIND_NAMES_2 are used in synthesizing the names for + * the precompiled break rules used with the new (ICU 3.0) RBBI. + * The fully assembled names look like icudt30b_char.brk, which is the + * file name of the brk file as produced by the ICU4C build. + * @internal + */ + private static final String[] KIND_NAMES_2 = { + "char", "word", "line", "sent", "title" + }; + + private static BreakIterator createBreakInstance(Locale locale, int kind) { String prefix = KIND_NAMES[kind]; return createBreakInstance(locale, kind, @@ -97,8 +114,25 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim String[] classNames = bundle.getStringArray("BreakIteratorClasses"); String rules = bundle.getString(rulesName); if (classNames[kind].equals("RuleBasedBreakIterator")) { + // Old style (2.8 and previous) Break Iterator. + // Not used by default, but if someone wants to specify the old class + // in some locale's resources, it should still work. iter = new RuleBasedBreakIterator_Old(rules); } + else if (classNames[kind].equals("RuleBasedBreakIterator_New")) { + try { + // Class for new RBBI engine. + // Set up path to precompiled rule data. + String rulesFileName = + "data/icudt" + VersionInfo.ICU_VERSION.getMajor() + + VersionInfo.ICU_VERSION.getMinor() + "b_" + KIND_NAMES_2[kind] + ".brk"; + InputStream is = ICUData.getRequiredStream(rulesFileName); + iter = RuleBasedBreakIterator_New.getInstanceFromCompiledRules(is); + } + catch (IOException e) { + throw new IllegalArgumentException(e.toString()); + } + } else if (classNames[kind].equals("DictionaryBasedBreakIterator")) { try { InputStream dictionary = ICUData.getStream(bundle.getString(dictionaryName)); diff --git a/icu4j/src/com/ibm/icu/text/RBBIDataWrapper.java b/icu4j/src/com/ibm/icu/text/RBBIDataWrapper.java index 01c316c247..4ac03dc974 100644 --- a/icu4j/src/com/ibm/icu/text/RBBIDataWrapper.java +++ b/icu4j/src/com/ibm/icu/text/RBBIDataWrapper.java @@ -8,16 +8,10 @@ package com.ibm.icu.text; import java.io.InputStream; -import java.io.BufferedInputStream; import java.io.DataInputStream; import java.io.IOException; -import java.util.Locale; -import com.ibm.icu.util.RangeValueIterator; import com.ibm.icu.util.VersionInfo; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UCharacterCategory; -import com.ibm.icu.lang.UProperty; import com.ibm.icu.impl.ICUData; import com.ibm.icu.impl.Trie; import com.ibm.icu.impl.CharTrie; @@ -69,9 +63,13 @@ public class RBBIDataWrapper { // Getters for fields from the state table header // - final static int getNumStates(int table[]) { - return table[NUMSTATES]<<16 + (table[NUMSTATES+1]&0xffff); - } + final static int getNumStates(short table[]) { + int hi = table[NUMSTATES]; + int lo = table[NUMSTATES+1]; + int val = (hi<<16) + (lo&0x0000ffff); + return val; + } + /** * Data Header. A struct-like class with the fields from the RBBI data file header. @@ -119,14 +117,14 @@ public class RBBIDataWrapper { static class TrieFoldingFunc implements Trie.DataManipulate { public int getFoldingOffset(int data) { - if ((data & 0x8000) == 0) { + if ((data & 0x8000) != 0) { return data & 0x7fff; } else { return 0; } } }; - static TrieFoldingFunc fTrieFoldingFunc; + static TrieFoldingFunc fTrieFoldingFunc = new TrieFoldingFunc(); RBBIDataWrapper() { @@ -299,19 +297,148 @@ public class RBBIDataWrapper { /** Debug function to display the break iterator data. */ void dump() { System.out.println("RBBI Data Wrapper dump ..."); + System.out.println(); + System.out.println("Forward State Table"); + dumpTable(fFTable); + System.out.println("Reverse State Table"); + dumpTable(fRTable); + System.out.println("Forward Safe Points Table"); + dumpTable(fSFTable); + System.out.println("Reverse Safe Points Table"); + dumpTable(fSRTable); + + dumpCharCategories(); System.out.println("Source Rules: " + fRuleSource); + + } + + /** Fixed width int-to-string conversion. + * TODO: there must be easy built-in way to do this */ + private static String intToString(int n, int width) { + StringBuffer dest = new StringBuffer(width); + dest.append(n); + while (dest.length() < width) { + dest.insert(0, ' '); + } + return dest.toString(); + } + + /** Dump a state table. (A full set of RBBI rules has 4 state tables.) */ + private void dumpTable(short table[]) { + int n; + int state; + String header = " Row Acc Look Tag"; + for (n=0; n fHeader.fCatCount) { + System.out.println("Error, bad category " + Integer.toHexString(category) + + " for char " + Integer.toHexString(char32)); + break; + } + if (category == lastCat ) { + rangeEnd = char32; + } else { + if (lastCat >= 0) { + if (catStrings[lastCat].length() > lastNewline[lastCat] + 70) { + lastNewline[lastCat] = catStrings[lastCat].length() + 10; + catStrings[lastCat] += "\n "; + } + + catStrings[lastCat] += " " + Integer.toHexString(rangeStart); + if (rangeEnd != rangeStart) { + catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd); + } + } + lastCat = category; + rangeStart = rangeEnd = char32; + } + } + catStrings[lastCat] += " " + Integer.toHexString(rangeStart); + if (rangeEnd != rangeStart) { + catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd); + } + + for (category = 0; category <= fHeader.fCatCount; category ++) { + System.out.println (intToString(category, 5) + " " + catStrings[category]); + } + System.out.println(); } public static void main(String[] args) { String s; if (args.length == 0) { - s = "icudt28b_char.brk"; + s = "char"; } else { s = args[0]; } System.out.println("RBBIDataWrapper.main(" + s + ") "); + + String versionedName = + "icudt" + VersionInfo.ICU_VERSION.getMajor() + + VersionInfo.ICU_VERSION.getMinor() + "b_" + s + ".brk"; + try { - RBBIDataWrapper This = RBBIDataWrapper.get(s); + RBBIDataWrapper This = RBBIDataWrapper.get(versionedName); This.dump(); } catch (Exception e) { diff --git a/icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator_New.java b/icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator_New.java index 8f48cc5005..a9034b33b4 100644 --- a/icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator_New.java +++ b/icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator_New.java @@ -7,7 +7,9 @@ package com.ibm.icu.text; import java.text.CharacterIterator; -import java.text.StringCharacterIterator; +import java.io.IOException; +import java.io.InputStream; + /** * Rule Based Break Iterator implementation. @@ -27,7 +29,7 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator { * The rule data for this BreakIterator instance * @internal */ - private RBBIDataWrapper fData; + private RBBIDataWrapper fRData; /** Index of the Rule {tag} values for the most recent match. * @internal @@ -61,7 +63,9 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator { public Object clone() { RuleBasedBreakIterator_New result = (RuleBasedBreakIterator_New) super.clone(); - // TODO: real clone code + if (fText != null) { + fText = (CharacterIterator)fText.clone(); + } return result; } @@ -71,8 +75,27 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator { * @stable ICU 2.0 */ public boolean equals(Object that) { - return false; // TODO: - } + try { + RuleBasedBreakIterator_New other = (RuleBasedBreakIterator_New) that; + if (fRData != other.fRData && (fRData == null || other.fRData == null)) { + return false; + } + if (fRData != null && other.fRData != null && + (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) { + return false; + } + if (fText == null && other.fText == null) { + return true; + } + if (fText == null || other.fText == null) { + return false; + } + return fText.equals(other.fText); + } + catch(ClassCastException e) { + return false; + } + } /** * Returns the description (rules) used to create this iterator. @@ -81,8 +104,8 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator { */ public String toString() { String retStr = null; - if (fData != null) { - retStr = fData.fRuleSource; + if (fRData != null) { + retStr = fRData.fRuleSource; } return retStr; } @@ -94,9 +117,23 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator { */ public int hashCode() { - return 0; // TODO + return fRData.fRuleSource.hashCode(); } + + //======================================================================= + // Constructors & Factories + //======================================================================= + public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException { + RuleBasedBreakIterator_New This = new RuleBasedBreakIterator_New(); + This.fRData = RBBIDataWrapper.get(is); + This.fText = new java.text.StringCharacterIterator(""); // Note: some old tests fail if fText is null + // on a newly created instance. + return This; + } + + + //======================================================================= // BreakIterator overrides //======================================================================= @@ -192,8 +229,8 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator { return BreakIterator.DONE; } - if (fData.fSRTable != null || fData.fSFTable != null) { - return handlePrevious(fData.fRTable); + if (fRData.fSRTable != null || fRData.fSFTable != null) { + return handlePrevious(fRData.fRTable); } // old rule syntax @@ -266,7 +303,7 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator { int result = 0; - if (fData.fSRTable != null) { + if (fRData.fSRTable != null) { // Safe Point Reverse rules exist. // This allows us to use the optimum algorithm. fText.setIndex(offset); @@ -275,20 +312,20 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator { // this handles offset being between a supplementary character CINext32(fText); // handlePrevious will move most of the time to < 1 boundary away - handlePrevious(fData.fSRTable); + handlePrevious(fRData.fSRTable); result = next(); while (result <= offset) { result = next(); } return result; } - if (fData.fSFTable != null) { + if (fRData.fSFTable != null) { // No Safe point reverse table, but there is a safe pt forward table. // fText.setIndex(offset); CIPrevious32(fText); // handle next will give result >= offset - handleNext(fData.fSFTable); + handleNext(fRData.fSFTable); // previous will give result 0 or 1 boundary away from offset, // most of the time // we have to @@ -352,7 +389,7 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator { // to carry out this operation int result; - if (fData.fSFTable != null) { + if (fRData.fSFTable != null) { /// todo synwee // new rule syntax fText.setIndex(offset); @@ -360,19 +397,19 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator { // safe point. // this handles offset being between a supplementary character CIPrevious32(fText); - handleNext(fData.fSFTable); + handleNext(fRData.fSFTable); result = previous(); while (result >= offset) { result = previous(); } return result; } - if (fData.fSRTable != null) { + if (fRData.fSRTable != null) { // backup plan if forward safe table is not available fText.setIndex(offset); CINext32(fText); // handle previous will give result <= offset - handlePrevious(fData.fSRTable); + handlePrevious(fRData.fSRTable); // next will give result 0 or 1 boundary away from offset, // most of the time @@ -397,6 +434,19 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator { return previous(); } + /** + * Throw IllegalArgumentException unless begin <= offset < end. + * TODO: subclassing interface from old RBBI is not really usable. + * What to do with old protected functions tagged as stable? + * @stable ICU 2.0 + */ + protected static final void checkOffset(int offset, CharacterIterator text) { + if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { + throw new IllegalArgumentException("offset out of bounds"); + } + } + + /** * Returns true if the specfied position is a boundary position. As a side * effect, leaves the iterator pointing to the first boundary position at @@ -406,8 +456,10 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator { * @stable ICU 2.0 */ public boolean isBoundary(int offset) { + checkOffset(offset, fText); + // the beginning index of the iterator is always a boundary position by definition - if (fText == null || offset == fText.getBeginIndex()) { + if (offset == fText.getBeginIndex()) { first(); // For side effects on current position, tag values. return true; } @@ -502,8 +554,8 @@ public int getRuleStatus() { // Status val N-1 <-- the value we need to return // The status values are sorted in ascending order. // This function returns the last (largest) of the array of status values. - int idx = fLastRuleStatusIndex + fData.fStatusTable[fLastRuleStatusIndex]; - int tagVal = fData.fStatusTable[idx]; + int idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex]; + int tagVal = fRData.fStatusTable[idx]; return tagVal; } @@ -532,11 +584,11 @@ public int getRuleStatus() { */ public int getRuleStatusVec(int[] fillInArray) { makeRuleStatusValid(); - int numStatusVals = fData.fStatusTable[fLastRuleStatusIndex]; + int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex]; if (fillInArray != null) { int numToCopy = Math.min(numStatusVals, fillInArray.length); for (int i=0; i= ci.getEndIndex()) { return false; } return true; @@ -637,7 +688,7 @@ public int getRuleStatusVec(int[] fillInArray) { * @internal */ private int handleNext() { - return handleNext(fData.fFTable); + return handleNext(fRData.fFTable); } @@ -663,7 +714,7 @@ public int getRuleStatusVec(int[] fillInArray) { int state = START_STATE; short category; int c = CICurrent32(fText); - int row = fData.getRowIndex(state); + int row = fRData.getRowIndex(state); int lookaheadStatus = 0; int lookaheadTagIdx = 0; @@ -671,7 +722,7 @@ public int getRuleStatusVec(int[] fillInArray) { // Character Category fetch for starting character. // See comments on character category code within loop, below. - category = (short)fData.fTrie.getCodePointValue(c); + category = (short)fRData.fTrie.getCodePointValue(c); if ((category & 0x4000) != 0) { // fDictionaryCharCount++; category &= ~0x4000; @@ -704,7 +755,7 @@ public int getRuleStatusVec(int[] fillInArray) { // look up the current character's character category, which tells us // which column in the state table to look at. // - category = (short)fData.fTrie.getCodePointValue(c); + category = (short)fRData.fTrie.getCodePointValue(c); // Clear the dictionary flag bit in the character's category. // Note: not using the old style dictionary stuff in this Java engine. @@ -725,7 +776,7 @@ public int getRuleStatusVec(int[] fillInArray) { // look up a state transition in the state table // state = row->fNextState[category]; state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; - row = fData.getRowIndex(state); + row = fRData.getRowIndex(state); // Get the next character. Doing it here positions the iterator // to the correct position for recording matches in the code that @@ -793,15 +844,15 @@ public int getRuleStatusVec(int[] fillInArray) { * handlePrevious */ private int handlePrevious() { - if (fText == null || fData == null) { + if (fText == null || fRData == null) { return 0; } - if (fData.fRTable == null) { + if (fRData.fRTable == null) { fText.first(); return fText.getIndex(); } - short stateTable[] = fData.fRTable; + short stateTable[] = fRData.fRTable; int state = START_STATE; int category; int lastCategory = 0; @@ -812,8 +863,8 @@ public int getRuleStatusVec(int[] fillInArray) { int c = CICurrent32(fText); int row; - row = fData.getRowIndex(state); - category = (short)fData.fTrie.getCodePointValue(c); + row = fRData.getRowIndex(state); + category = (short)fRData.fTrie.getCodePointValue(c); category &= ~0x4000; // Clear the dictionary bit, just in case. if (fTrace) { @@ -829,7 +880,7 @@ public int getRuleStatusVec(int[] fillInArray) { // save the last character's category and look up the current // character's category lastCategory = category; - category = (short)fData.fTrie.getCodePointValue(c); + category = (short)fRData.fTrie.getCodePointValue(c); // Check the dictionary bit in the character's category. // Don't exist in this Java engine implementation. Clear the bit. @@ -848,7 +899,7 @@ public int getRuleStatusVec(int[] fillInArray) { // look up a state transition in the backwards state table state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; - row = fData.getRowIndex(state); + row = fRData.getRowIndex(state); continueOn: { if (stateTable[row + RBBIDataWrapper.ACCEPTING] == 0 && @@ -942,9 +993,9 @@ public int getRuleStatusVec(int[] fillInArray) { boolean lookAheadHardBreak = (stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0; - int row = fData.getRowIndex(state); + int row = fRData.getRowIndex(state); - category = (short)fData.fTrie.getCodePointValue(c); + category = (short)fRData.fTrie.getCodePointValue(c); category &= ~0x4000; // Mask off dictionary bit. if (fTrace) { @@ -965,7 +1016,7 @@ public int getRuleStatusVec(int[] fillInArray) { // save the last character's category and look up the current // character's category lastCategory = category; - category = (short)fData.fTrie.getCodePointValue(c); + category = (short)fRData.fTrie.getCodePointValue(c); category &= ~0x4000; // Clear the dictionary bit flag // (Should be unused; holdover from old RBBI) @@ -982,7 +1033,7 @@ public int getRuleStatusVec(int[] fillInArray) { // look up a state transition in the backwards state table state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; - row = fData.getRowIndex(state); + row = fRData.getRowIndex(state); if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) { // Match found, common case, could have lookahead so we move on to check it