ICU-3295 RBBI runtime port to Java

X-SVN-Rev: 15029
This commit is contained in:
Andy Heninger 2004-04-23 20:54:33 +00:00
parent 4514814ae6
commit b6b23502af
9 changed files with 881 additions and 517 deletions

View File

@ -194,7 +194,7 @@
<!-- Copies the core data files (transliterator rule text, .icu/.spp binaries and
     .brk break-iterator rule files) from the source tree into the build tree. -->
<target name ="coreData" depends="init">
    <copy todir="${build.dir}/com/ibm/icu/impl/data">
        <!-- A single includes list; an attribute may appear only once per element. -->
        <fileset dir="${src.dir}/com/ibm/icu/impl/data"
                 includes="Transliterator_*.txt,*.icu,*.spp,*.brk"
                 excludes="**/CVS/**/*,Transliterator_Han_Latin_*.txt"/>
    </copy>
</target>

File diff suppressed because it is too large Load Diff

View File

@ -186,7 +186,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
errln("ERROR: next()/following() at last position returned #"
+ p + " and " + q + " instead of" + testString.length() + "\n");
RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getCharacterInstance(Locale.getDefault());
testString = "Write hindi here. \u092d\u093e\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 \u0939\u094c\u0964";
testString = "Write hindi here. \u092d\u093e\u0930\u0301 \u0938\u0941\u0902\u0926\u0930 \u0939\u094c\u0964";
logln("testing char iter - string:- \"" + testString + "\"");
charIter1.setText(testString);
p = charIter1.first();
@ -209,7 +209,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
// hindi starts here
p = q;
q = charIter1.next(4);
doTest(testString, p, q, 22, " \u092d\u093e\u0930\u0924");
doTest(testString, p, q, 22, " \u092d\u093e\u0930\u0301"); // Nonsense, but compatible between old and new rules.
p = q;
q = charIter1.next(2);
doTest(testString, p, q, 26, " \u0938\u0941\u0902");
@ -217,13 +217,13 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
q = charIter1.following(24);
doTest(testString, 24, q, 26, "\u0941\u0902");
q = charIter1.following(20);
doTest(testString, 20, q, 21, "\u0930");
doTest(testString, 20, q, 22, "\u0930\u0301");
p = charIter1.following(charIter1.last());
q = charIter1.next(charIter1.last());
if (p != RuleBasedBreakIterator.DONE || q != RuleBasedBreakIterator.DONE)
errln("ERROR: following()/next() at last position returned #"
+ p + " and " + q + " instead of" + testString.length());
testString = "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.";
testString = "Hello! how are you? I'am fine. Thankyou. How are you doing? This costs $20,00,000.";
RuleBasedBreakIterator sentIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getSentenceInstance(Locale.getDefault());
logln("testing sentence iter - String:- \"" + testString + "\"");
sentIter1.setText(testString);
@ -243,7 +243,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
doTest(testString, p, q, 60, "how are you? I'am fine. Thankyou. How are you doing? ");
p = q;
q = sentIter1.next();
doTest(testString, p, q, 83, "This\n costs $20,00,000.");
doTest(testString, p, q, 83, "This costs $20,00,000.");
q = sentIter1.following(1);
doTest(testString, 1, q, 7, "ello! ");
q = sentIter1.following(10);
@ -324,7 +324,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
p = wordIter1.preceding(wordIter1.first());
if (p != RuleBasedBreakIterator.DONE)
errln("ERROR: preceding() at starting position returned #" + p + " instead of 0");
testString = "Write hindi here. \u092d\u093e\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 \u0939\u094c\u0964";
testString = "Write hindi here. \u092d\u093e\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 \u0939\u0301\u0964";
logln("testing character iteration for string \" " + testString + "\" \n");
RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getCharacterInstance(Locale.getDefault());
charIter1.setText(testString);
@ -335,7 +335,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
doTest(testString, p, q, 31, "\u0964");
p = q;
q = charIter1.previous();
doTest(testString, p, q, 29, "\u0939\u094c");
doTest(testString, p, q, 29, "\u0939\u0301");
q = charIter1.preceding(26);
doTest(testString, 26, q, 23, "\u0938\u0941\u0902");
q = charIter1.preceding(16);
@ -349,7 +349,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
if (p != RuleBasedBreakIterator.DONE || q != RuleBasedBreakIterator.DONE)
errln("ERROR: previous()/preceding() at starting position returned #"
+ p + " and " + q + " instead of 0\n");
testString = "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.";
testString = "Hello! how are you? I'am fine. Thankyou. How are you doing? This costs $20,00,000.";
logln("testing sentence iter - String:- \"" + testString + "\"");
RuleBasedBreakIterator sentIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getSentenceInstance(Locale.getDefault());
sentIter1.setText(testString);
@ -357,7 +357,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
if (p != testString.length())
errln("ERROR: last() returned" + p + "instead of " + testString.length());
q = sentIter1.previous();
doTest(testString, p, q, 60, "This\n costs $20,00,000.");
doTest(testString, p, q, 60, "This costs $20,00,000.");
p = q;
q = sentIter1.previous();
doTest(testString, p, q, 41, "How are you doing? ");
@ -399,7 +399,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
* Tests the method IsBoundary() of RuleBasedBreakIterator
**/
public void TestIsBoundary() {
String testString1 = "Write here. \u092d\u093e\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 \u0939\u094c\u0964";
String testString1 = "Write here. \u092d\u0301\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 a\u0301u";
RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getCharacterInstance(Locale.getDefault());
charIter1.setText(testString1);
int bounds1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 20, 21, 22, 23, 25, 26};

View File

@ -9,6 +9,7 @@ package com.ibm.icu.dev.test.rbbi;
//Regression testing of RuleBasedBreakIterator
import com.ibm.icu.dev.test.*;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator_Old;
import java.util.Vector;
public class RBBITest extends TestFmwk
@ -43,6 +44,15 @@ public class RBBITest extends TestFmwk
public void TestDefaultRuleBasedCharacterIteration(){
RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getCharacterInstance();
logln("Testing the RBBI for character iteration by using default rules");
try {
RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi;
}
catch (ClassCastException e) {
// Bail out if using new RBBI implementation
logln("Test Skipped.");
return;
}
//fetch the rules used to create the above RuleBasedBreakIterator
String defaultRules=rbbi.toString();
@ -172,6 +182,14 @@ public class RBBITest extends TestFmwk
public void TestDefaultRuleBasedWordIteration(){
logln("Testing the RBBI for word iteration using default rules");
RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getWordInstance();
try {
RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi;
}
catch (ClassCastException e) {
// Bail out if using new RBBI implementation
logln("Test Skipped.");
return;
}
//fetch the rules used to create the above RuleBasedBreakIterator
String defaultRules=rbbi.toString();
@ -325,6 +343,14 @@ public class RBBITest extends TestFmwk
logln("Testing the RBBI for sentence iteration using default rules");
RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getSentenceInstance();
//fetch the rules used to create the above RuleBasedBreakIterator
try {
RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi;
}
catch (ClassCastException e) {
// Bail out if using new RBBI implementation
logln("Test Skipped.");
return;
}
String defaultRules=rbbi.toString();
RuleBasedBreakIterator sentIterDefault=null;
try{
@ -418,16 +444,24 @@ public class RBBITest extends TestFmwk
}
public void TestDefaultRuleBasedLineIteration(){
logln("Testing the RBBI for line iteration using default rules");
RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getLineInstance();
//fetch the rules used to create the above RuleBasedBreakIterator
String defaultRules=rbbi.toString();
RuleBasedBreakIterator lineIterDefault=null;
try{
lineIterDefault = new RuleBasedBreakIterator(defaultRules);
}catch(IllegalArgumentException iae){
errln("ERROR: failed construction in TestDefaultRuleBasedLineIteration()" + iae.toString());
}
logln("Testing the RBBI for line iteration using default rules");
RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getLineInstance();
//fetch the rules used to create the above RuleBasedBreakIterator
try {
RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi;
}
catch (ClassCastException e) {
// Bail out if using new RBBI implementation
logln("Test Skipped.");
return;
}
String defaultRules=rbbi.toString();
RuleBasedBreakIterator lineIterDefault=null;
try{
lineIterDefault = new RuleBasedBreakIterator(defaultRules);
}catch(IllegalArgumentException iae){
errln("ERROR: failed construction in TestDefaultRuleBasedLineIteration()" + iae.toString());
}
Vector linedata = new Vector();
linedata.addElement("Multi-");
@ -524,6 +558,15 @@ public class RBBITest extends TestFmwk
// get overridden.
rbbi.toString();
RuleBasedBreakIterator lineIter=null;
try {
RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi;
}
catch (ClassCastException e) {
// Bail out if using new RBBI implementation
logln("Test Skipped.");
return;
}
try{
lineIter = new RuleBasedBreakIterator(rules);
}catch(IllegalArgumentException iae){
@ -651,7 +694,15 @@ public class RBBITest extends TestFmwk
public void TestAbbrRuleBasedWordIteration(){
logln("Testing the RBBI for word iteration by adding rules to support abbreviation");
RuleBasedBreakIterator rb =(RuleBasedBreakIterator)RuleBasedBreakIterator.getWordInstance();
try {
// This test won't work with the new break iterators. Cast will fail in this case.
RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old) rb;
}
catch (ClassCastException e) {
logln("Test skipped.");
return;
}
String wrules2="$abbr=((Mr.)|(Mrs.)|(Ms.)|(Dr.)|(U.S.));" + // abbreviations.
rb.toString() +
"($abbr$ws)*$word;";
@ -701,6 +752,10 @@ public class RBBITest extends TestFmwk
buffer.append(text);
}
text = buffer.toString();
if (rbbi == null) {
errln("null iterator, test skipped.");
return;
}
rbbi.setText(text);

View File

@ -29,11 +29,11 @@ public class BreakIteratorRules extends ListResourceBundle {
// BreakIteratorClasses lists the class names to instantiate for each
// built-in type of BreakIterator
{ "BreakIteratorClasses",
new String[] { "RuleBasedBreakIterator", // character-break iterator class
"RuleBasedBreakIterator", // word-break iterator class
"RuleBasedBreakIterator", // line-break iterator class
"RuleBasedBreakIterator", // sentence-break iterator class
"RuleBasedBreakIterator"} // Title-Case break iterator class
new String[] { "RuleBasedBreakIterator_New", // character-break iterator class
"RuleBasedBreakIterator_New", // word-break iterator class
"RuleBasedBreakIterator_New", // line-break iterator class
"RuleBasedBreakIterator_New", // sentence-break iterator class
"RuleBasedBreakIterator_New"} // Title-Case break iterator class
},
// rules describing how to break between logical characters

View File

@ -27,10 +27,10 @@ public class BreakIteratorRules_th extends ListResourceBundle {
// iterator. Notice we're now using DictionaryBasedBreakIterator
// for word and line breaking.
{ "BreakIteratorClasses",
new String[] { "RuleBasedBreakIterator", // character-break iterator class
new String[] { "RuleBasedBreakIterator_New", // character-break iterator class
"DictionaryBasedBreakIterator", // word-break iterator class
"DictionaryBasedBreakIterator", // line-break iterator class
"RuleBasedBreakIterator" } // sentence-break iterator class
"RuleBasedBreakIterator_New" } // sentence-break iterator class
},
{ "WordBreakRules",

View File

@ -18,6 +18,7 @@ import com.ibm.icu.impl.ICULocaleService;
import com.ibm.icu.impl.ICUService;
import com.ibm.icu.impl.ICUService.Factory;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.VersionInfo;
/**
* @author Ram
@ -76,10 +77,26 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
}
static final ICULocaleService service = new BFService();
// KIND_NAMES are used in synthesizing the resource name that holds the source
// break rules. For old-style (ICU 2.8 and previous) break iterators.
// The resources are com.ibm.icu.impl.data.BreakIteratorRules, and have
// names like "CharacterBreakRules", where the "Character" part of the
// name comes from here (this array).
private static final String[] KIND_NAMES = {
"Character", "Word", "Line", "Sentence", "Title"
};
/** KIND_NAMES_2 are used in synthesizing the names for
* the precompiled break rules used with the new (ICU 3.0) RBBI.
* The fully assembled names look like icudt30b_char.brk, which is the
* file name of the brk file as produced by the ICU4C build.
* @internal
*/
private static final String[] KIND_NAMES_2 = {
"char", "word", "line", "sent", "title"
};
private static BreakIterator createBreakInstance(Locale locale, int kind) {
String prefix = KIND_NAMES[kind];
return createBreakInstance(locale, kind,
@ -97,8 +114,25 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
String[] classNames = bundle.getStringArray("BreakIteratorClasses");
String rules = bundle.getString(rulesName);
if (classNames[kind].equals("RuleBasedBreakIterator")) {
// Old style (2.8 and previous) Break Iterator.
// Not used by default, but if someone wants to specify the old class
// in some locale's resources, it should still work.
iter = new RuleBasedBreakIterator_Old(rules);
}
else if (classNames[kind].equals("RuleBasedBreakIterator_New")) {
try {
// Class for new RBBI engine.
// Set up path to precompiled rule data.
String rulesFileName =
"data/icudt" + VersionInfo.ICU_VERSION.getMajor() +
VersionInfo.ICU_VERSION.getMinor() + "b_" + KIND_NAMES_2[kind] + ".brk";
InputStream is = ICUData.getRequiredStream(rulesFileName);
iter = RuleBasedBreakIterator_New.getInstanceFromCompiledRules(is);
}
catch (IOException e) {
throw new IllegalArgumentException(e.toString());
}
}
else if (classNames[kind].equals("DictionaryBasedBreakIterator")) {
try {
InputStream dictionary = ICUData.getStream(bundle.getString(dictionaryName));

View File

@ -8,16 +8,10 @@
package com.ibm.icu.text;
import java.io.InputStream;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.util.Locale;
import com.ibm.icu.util.RangeValueIterator;
import com.ibm.icu.util.VersionInfo;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.impl.ICUData;
import com.ibm.icu.impl.Trie;
import com.ibm.icu.impl.CharTrie;
@ -69,9 +63,13 @@ public class RBBIDataWrapper {
// Getters for fields from the state table header
//
/**
 * Reads the number of states from the header of a state table stored as ints.
 * The value is assembled from two consecutive header slots: the slot at
 * NUMSTATES supplies the upper half and the low 16 bits of the slot at
 * NUMSTATES+1 supply the lower half.
 *
 * @param table the state table whose header is read
 * @return the combined state count
 */
final static int getNumStates(int table[]) {
    // The shift must be parenthesized: '+' binds tighter than '<<', so the
    // unparenthesized form would shift by (16 + low) instead of adding low.
    return (table[NUMSTATES] << 16) + (table[NUMSTATES + 1] & 0xffff);
}
/**
 * Reads the number of states from the header of a state table stored as shorts.
 * The slot at NUMSTATES holds the upper 16 bits and the slot at NUMSTATES+1
 * holds the lower 16 bits of the count.
 *
 * @param table the state table whose header is read
 * @return the combined 32-bit state count
 */
final static int getNumStates(short table[]) {
    int upperHalf = table[NUMSTATES] << 16;
    // Mask the low slot so sign extension of a negative short cannot leak
    // into the upper half.
    int lowerHalf = table[NUMSTATES + 1] & 0x0000ffff;
    return upperHalf + lowerHalf;
}
/**
* Data Header. A struct-like class with the fields from the RBBI data file header.
@ -119,14 +117,14 @@ public class RBBIDataWrapper {
static class TrieFoldingFunc implements Trie.DataManipulate {
public int getFoldingOffset(int data) {
if ((data & 0x8000) == 0) {
if ((data & 0x8000) != 0) {
return data & 0x7fff;
} else {
return 0;
}
}
};
static TrieFoldingFunc fTrieFoldingFunc;
static TrieFoldingFunc fTrieFoldingFunc = new TrieFoldingFunc();
RBBIDataWrapper() {
@ -299,19 +297,148 @@ public class RBBIDataWrapper {
/**
 * Debug function: writes the break iterator data to System.out.
 * Prints, in order, the four state tables (forward, reverse, forward safe
 * points, reverse safe points), the character categories, and the rule source.
 */
void dump() {
    final String[] tableTitles = {
        "Forward State Table",
        "Reverse State Table",
        "Forward Safe Points Table",
        "Reverse Safe Points Table"
    };
    final short[][] tables = { fFTable, fRTable, fSFTable, fSRTable };

    System.out.println("RBBI Data Wrapper dump ...");
    System.out.println();
    for (int i = 0; i < tables.length; i++) {
        System.out.println(tableTitles[i]);
        dumpTable(tables[i]);
    }

    dumpCharCategories();
    System.out.println("Source Rules: " + fRuleSource);
}
/**
 * Fixed-width int-to-string conversion.
 * The decimal form of n is right-aligned in a field of the given width by
 * padding with spaces on the left; a value wider than the field is returned
 * unpadded and untruncated.
 *
 * @param n     the value to format
 * @param width the minimum length of the result
 * @return the padded decimal string
 */
private static String intToString(int n, int width) {
    StringBuffer padded = new StringBuffer(width);
    String digits = String.valueOf(n);
    for (int pad = width - digits.length(); pad > 0; pad--) {
        padded.append(' ');
    }
    padded.append(digits);
    return padded.toString();
}
/**
 * Dump a state table to System.out. (A full set of RBBI rules has 4 state tables.)
 * Writes a heading with one column number per character category, a dashed
 * underline of the same length, one line per state via dumpRow, and a
 * trailing blank line.
 *
 * @param table the state table to print
 */
private void dumpTable(short table[]) {
    StringBuffer heading = new StringBuffer(" Row Acc Look Tag");
    for (int cat = 0; cat < fHeader.fCatCount; cat++) {
        heading.append(intToString(cat, 5));
    }
    System.out.println(heading.toString());

    StringBuffer underline = new StringBuffer(heading.length());
    for (int i = 0; i < heading.length(); i++) {
        underline.append("-");
    }
    System.out.println(underline.toString());

    int stateCount = getNumStates(table);
    for (int state = 0; state < stateCount; state++) {
        dumpRow(table, state);
    }
    System.out.println();
}
/**
 * Dump (for debug) a single row of an RBBI state table to System.out.
 * The row is written as one line of fixed-width columns: the state number,
 * the accepting value, the lookahead value, the tag index, and then one
 * next-state entry per character category. An accepting or lookahead value
 * of zero is shown as blanks.
 *
 * @param table the state table containing the row
 * @param state the number of the state (row) to print
 * @internal
 */
private void dumpRow(short table[], int state) {
    StringBuffer dest = new StringBuffer(fHeader.fCatCount*5 + 20);
    dest.append(intToString(state, 4));
    int row = getRowIndex(state);

    // Blank fillers are five characters wide so they line up with the
    // five-wide numeric fields produced by intToString(..., 5).
    if (table[row+ACCEPTING] != 0) {
        dest.append(intToString(table[row+ACCEPTING], 5));
    } else {
        dest.append("     ");
    }
    if (table[row+LOOKAHEAD] != 0) {
        // Only append here; the complete row is printed once, at the end.
        dest.append(intToString(table[row+LOOKAHEAD], 5));
    } else {
        dest.append("     ");
    }
    dest.append(intToString(table[row+TAGIDX], 5));

    for (int col=0; col<fHeader.fCatCount; col++) {
        dest.append(intToString(table[row+NEXTSTATES+col], 5));
    }
    System.out.println(dest);
}
/**
 * Debug function: prints, for each character category, the code point ranges
 * that the trie maps to that category.
 * Every code point from 0 to 0x10ffff is looked up in fTrie; consecutive code
 * points with the same category are collapsed into one hex range. If a lookup
 * yields a category outside 0..fCatCount, an error line is printed and the
 * scan stops early; whatever was collected so far is still printed.
 */
private void dumpCharCategories() {
    int n = fHeader.fCatCount;
    String catStrings[] = new String[n+1];
    int rangeStart = 0;
    int rangeEnd = 0;
    int lastCat = -1;
    int char32;
    int category;
    // Per category: the string length at which the next line wrap is due.
    int lastNewline[] = new int[n+1];

    for (category = 0; category <= n; category++) {
        catStrings[category] = "";
    }
    System.out.println("\nCharacter Categories");
    System.out.println("--------------------");
    for (char32 = 0; char32<=0x10ffff; char32++) {
        category = fTrie.getCodePointValue(char32);
        category &= ~0x4000;            // Mask off dictionary bit.
        if (category < 0 || category > n) {
            System.out.println("Error, bad category " + Integer.toHexString(category) +
                    " for char " + Integer.toHexString(char32));
            break;
        }
        if (category == lastCat ) {
            rangeEnd = char32;
        } else {
            // Category changed: flush the range that just ended.
            if (lastCat >= 0) {
                if (catStrings[lastCat].length() > lastNewline[lastCat] + 70) {
                    lastNewline[lastCat] = catStrings[lastCat].length() + 10;
                    catStrings[lastCat] += "\n ";
                }
                catStrings[lastCat] += " " + Integer.toHexString(rangeStart);
                if (rangeEnd != rangeStart) {
                    catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd);
                }
            }
            lastCat = category;
            rangeStart = rangeEnd = char32;
        }
    }

    // Flush the final open range. lastCat is still -1 when the scan aborted on
    // the very first code point, in which case there is nothing to flush.
    if (lastCat >= 0) {
        catStrings[lastCat] += " " + Integer.toHexString(rangeStart);
        if (rangeEnd != rangeStart) {
            catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd);
        }
    }

    for (category = 0; category <= n; category++) {
        System.out.println (intToString(category, 5) + " " + catStrings[category]);
    }
    System.out.println();
}
public static void main(String[] args) {
String s;
if (args.length == 0) {
s = "icudt28b_char.brk";
s = "char";
} else {
s = args[0];
}
System.out.println("RBBIDataWrapper.main(" + s + ") ");
String versionedName =
"icudt" + VersionInfo.ICU_VERSION.getMajor() +
VersionInfo.ICU_VERSION.getMinor() + "b_" + s + ".brk";
try {
RBBIDataWrapper This = RBBIDataWrapper.get(s);
RBBIDataWrapper This = RBBIDataWrapper.get(versionedName);
This.dump();
}
catch (Exception e) {

View File

@ -7,7 +7,9 @@
package com.ibm.icu.text;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.io.IOException;
import java.io.InputStream;
/**
* Rule Based Break Iterator implementation.
@ -27,7 +29,7 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
* The rule data for this BreakIterator instance
* @internal
*/
private RBBIDataWrapper fData;
private RBBIDataWrapper fRData;
/** Index of the Rule {tag} values for the most recent match.
* @internal
@ -61,7 +63,9 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
public Object clone()
{
    RuleBasedBreakIterator_New result = (RuleBasedBreakIterator_New) super.clone();
    // Give the copy its own text iterator. The assignment must target the
    // copy: writing to this.fText would replace the receiver's iterator as a
    // side effect of clone() and leave the copy holding the original one.
    if (fText != null) {
        result.fText = (CharacterIterator)fText.clone();
    }
    return result;
}
@ -71,8 +75,27 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
* @stable ICU 2.0
*/
public boolean equals(Object that) {
return false; // TODO:
}
try {
RuleBasedBreakIterator_New other = (RuleBasedBreakIterator_New) that;
if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
return false;
}
if (fRData != null && other.fRData != null &&
(!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
return false;
}
if (fText == null && other.fText == null) {
return true;
}
if (fText == null || other.fText == null) {
return false;
}
return fText.equals(other.fText);
}
catch(ClassCastException e) {
return false;
}
}
/**
* Returns the description (rules) used to create this iterator.
@ -81,8 +104,8 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
*/
public String toString() {
String retStr = null;
if (fData != null) {
retStr = fData.fRuleSource;
if (fRData != null) {
retStr = fRData.fRuleSource;
}
return retStr;
}
@ -94,9 +117,23 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
*/
public int hashCode()
{
    // Hash on the rule source, the same property equals() compares.
    // Iterators without rule data hash to 0 instead of throwing, matching the
    // null tolerance of equals() and toString().
    if (fRData == null || fRData.fRuleSource == null) {
        return 0;
    }
    return fRData.fRuleSource.hashCode();
}
//=======================================================================
// Constructors & Factories
//=======================================================================
/**
 * Creates a break iterator from precompiled rule data.
 * The rule data is read from the stream through RBBIDataWrapper.get, and the
 * new iterator starts out with an empty text.
 *
 * @param is the stream supplying the compiled rule data
 * @return the newly created iterator
 * @throws IOException declared for failures while reading the rule data
 */
public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
    RuleBasedBreakIterator_New iterator = new RuleBasedBreakIterator_New();
    iterator.fRData = RBBIDataWrapper.get(is);
    // Note: some old tests fail if fText is null on a newly created instance,
    // so start with an empty text instead.
    iterator.fText = new java.text.StringCharacterIterator("");
    return iterator;
}
//=======================================================================
// BreakIterator overrides
//=======================================================================
@ -192,8 +229,8 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
return BreakIterator.DONE;
}
if (fData.fSRTable != null || fData.fSFTable != null) {
return handlePrevious(fData.fRTable);
if (fRData.fSRTable != null || fRData.fSFTable != null) {
return handlePrevious(fRData.fRTable);
}
// old rule syntax
@ -266,7 +303,7 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
int result = 0;
if (fData.fSRTable != null) {
if (fRData.fSRTable != null) {
// Safe Point Reverse rules exist.
// This allows us to use the optimum algorithm.
fText.setIndex(offset);
@ -275,20 +312,20 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
// this handles offset being between a supplementary character
CINext32(fText);
// handlePrevious will move most of the time to < 1 boundary away
handlePrevious(fData.fSRTable);
handlePrevious(fRData.fSRTable);
result = next();
while (result <= offset) {
result = next();
}
return result;
}
if (fData.fSFTable != null) {
if (fRData.fSFTable != null) {
// No Safe point reverse table, but there is a safe pt forward table.
//
fText.setIndex(offset);
CIPrevious32(fText);
// handle next will give result >= offset
handleNext(fData.fSFTable);
handleNext(fRData.fSFTable);
// previous will give result 0 or 1 boundary away from offset,
// most of the time
// we have to
@ -352,7 +389,7 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
// to carry out this operation
int result;
if (fData.fSFTable != null) {
if (fRData.fSFTable != null) {
/// todo synwee
// new rule syntax
fText.setIndex(offset);
@ -360,19 +397,19 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
// safe point.
// this handles offset being between a supplementary character
CIPrevious32(fText);
handleNext(fData.fSFTable);
handleNext(fRData.fSFTable);
result = previous();
while (result >= offset) {
result = previous();
}
return result;
}
if (fData.fSRTable != null) {
if (fRData.fSRTable != null) {
// backup plan if forward safe table is not available
fText.setIndex(offset);
CINext32(fText);
// handle previous will give result <= offset
handlePrevious(fData.fSRTable);
handlePrevious(fRData.fSRTable);
// next will give result 0 or 1 boundary away from offset,
// most of the time
@ -397,6 +434,19 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
return previous();
}
/**
 * Throws IllegalArgumentException unless begin &lt;= offset &lt;= end, where
 * begin and end are the begin and end indexes of the supplied text. The end
 * index itself is accepted.
 * TODO: subclassing interface from old RBBI is not really usable.
 * What to do with old protected functions tagged as stable?
 *
 * @param offset the position to validate
 * @param text   the text whose index range the offset must fall within
 * @stable ICU 2.0
 */
protected static final void checkOffset(int offset, CharacterIterator text) {
    if (offset >= text.getBeginIndex() && offset <= text.getEndIndex()) {
        return;
    }
    throw new IllegalArgumentException("offset out of bounds");
}
/**
* Returns true if the specified position is a boundary position. As a side
* effect, leaves the iterator pointing to the first boundary position at
@ -406,8 +456,10 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
* @stable ICU 2.0
*/
public boolean isBoundary(int offset) {
checkOffset(offset, fText);
// the beginning index of the iterator is always a boundary position by definition
if (fText == null || offset == fText.getBeginIndex()) {
if (offset == fText.getBeginIndex()) {
first(); // For side effects on current position, tag values.
return true;
}
@ -502,8 +554,8 @@ public int getRuleStatus() {
// Status val N-1 <-- the value we need to return
// The status values are sorted in ascending order.
// This function returns the last (largest) of the array of status values.
int idx = fLastRuleStatusIndex + fData.fStatusTable[fLastRuleStatusIndex];
int tagVal = fData.fStatusTable[idx];
int idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex];
int tagVal = fRData.fStatusTable[idx];
return tagVal;
}
@ -532,11 +584,11 @@ public int getRuleStatus() {
*/
public int getRuleStatusVec(int[] fillInArray) {
makeRuleStatusValid();
int numStatusVals = fData.fStatusTable[fLastRuleStatusIndex];
int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
if (fillInArray != null) {
int numToCopy = Math.min(numStatusVals, fillInArray.length);
for (int i=0; i<numToCopy; i++) {
fillInArray[i] = fData.fStatusTable[fLastRuleStatusIndex + i + 1];
fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1];
}
}
return numStatusVals;
@ -618,8 +670,7 @@ public int getRuleStatusVec(int[] fillInArray) {
if (ci == null) {
return false;
}
int end = ci.getEndIndex();
if (end == 0 || ci.getIndex() < end) {
if (ci.getIndex() >= ci.getEndIndex()) {
return false;
}
return true;
@ -637,7 +688,7 @@ public int getRuleStatusVec(int[] fillInArray) {
* @internal
*/
private int handleNext() {
    // Run the forward state table held in fRData, the rule data field that
    // the compiled-rules factory populates.
    return handleNext(fRData.fFTable);
}
@ -663,7 +714,7 @@ public int getRuleStatusVec(int[] fillInArray) {
int state = START_STATE;
short category;
int c = CICurrent32(fText);
int row = fData.getRowIndex(state);
int row = fRData.getRowIndex(state);
int lookaheadStatus = 0;
int lookaheadTagIdx = 0;
@ -671,7 +722,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// Character Category fetch for starting character.
// See comments on character category code within loop, below.
category = (short)fData.fTrie.getCodePointValue(c);
category = (short)fRData.fTrie.getCodePointValue(c);
if ((category & 0x4000) != 0) {
// fDictionaryCharCount++;
category &= ~0x4000;
@ -704,7 +755,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// look up the current character's character category, which tells us
// which column in the state table to look at.
//
category = (short)fData.fTrie.getCodePointValue(c);
category = (short)fRData.fTrie.getCodePointValue(c);
// Clear the dictionary flag bit in the character's category.
// Note: not using the old style dictionary stuff in this Java engine.
@ -725,7 +776,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// look up a state transition in the state table
// state = row->fNextState[category];
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
row = fData.getRowIndex(state);
row = fRData.getRowIndex(state);
// Get the next character. Doing it here positions the iterator
// to the correct position for recording matches in the code that
@ -793,15 +844,15 @@ public int getRuleStatusVec(int[] fillInArray) {
* handlePrevious
*/
private int handlePrevious() {
if (fText == null || fData == null) {
if (fText == null || fRData == null) {
return 0;
}
if (fData.fRTable == null) {
if (fRData.fRTable == null) {
fText.first();
return fText.getIndex();
}
short stateTable[] = fData.fRTable;
short stateTable[] = fRData.fRTable;
int state = START_STATE;
int category;
int lastCategory = 0;
@ -812,8 +863,8 @@ public int getRuleStatusVec(int[] fillInArray) {
int c = CICurrent32(fText);
int row;
row = fData.getRowIndex(state);
category = (short)fData.fTrie.getCodePointValue(c);
row = fRData.getRowIndex(state);
category = (short)fRData.fTrie.getCodePointValue(c);
category &= ~0x4000; // Clear the dictionary bit, just in case.
if (fTrace) {
@ -829,7 +880,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// save the last character's category and look up the current
// character's category
lastCategory = category;
category = (short)fData.fTrie.getCodePointValue(c);
category = (short)fRData.fTrie.getCodePointValue(c);
// Check the dictionary bit in the character's category.
// Don't exist in this Java engine implementation. Clear the bit.
@ -848,7 +899,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// look up a state transition in the backwards state table
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
row = fData.getRowIndex(state);
row = fRData.getRowIndex(state);
continueOn: {
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == 0 &&
@ -942,9 +993,9 @@ public int getRuleStatusVec(int[] fillInArray) {
boolean lookAheadHardBreak =
(stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
int row = fData.getRowIndex(state);
int row = fRData.getRowIndex(state);
category = (short)fData.fTrie.getCodePointValue(c);
category = (short)fRData.fTrie.getCodePointValue(c);
category &= ~0x4000; // Mask off dictionary bit.
if (fTrace) {
@ -965,7 +1016,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// save the last character's category and look up the current
// character's category
lastCategory = category;
category = (short)fData.fTrie.getCodePointValue(c);
category = (short)fRData.fTrie.getCodePointValue(c);
category &= ~0x4000; // Clear the dictionary bit flag
// (Should be unused; holdover from old RBBI)
@ -982,7 +1033,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// look up a state transition in the backwards state table
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
row = fData.getRowIndex(state);
row = fRData.getRowIndex(state);
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
// Match found, common case, could have lookahead so we move on to check it