ICU-3295 RBBI runtime port to Java

X-SVN-Rev: 15029
This commit is contained in:
Andy Heninger 2004-04-23 20:54:33 +00:00
parent 4514814ae6
commit b6b23502af
9 changed files with 881 additions and 517 deletions

View File

@ -194,7 +194,7 @@
<target name ="coreData" depends="init"> <target name ="coreData" depends="init">
<copy todir="${build.dir}/com/ibm/icu/impl/data"> <copy todir="${build.dir}/com/ibm/icu/impl/data">
<fileset dir="${src.dir}/com/ibm/icu/impl/data" <fileset dir="${src.dir}/com/ibm/icu/impl/data"
includes="Transliterator_*.txt,*.icu,*.spp" includes="Transliterator_*.txt,*.icu,*.spp,*.brk"
excludes="**/CVS/**/*,Transliterator_Han_Latin_*.txt"/> excludes="**/CVS/**/*,Transliterator_Han_Latin_*.txt"/>
</copy> </copy>
</target> </target>

File diff suppressed because it is too large Load Diff

View File

@ -186,7 +186,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
errln("ERROR: next()/following() at last position returned #" errln("ERROR: next()/following() at last position returned #"
+ p + " and " + q + " instead of" + testString.length() + "\n"); + p + " and " + q + " instead of" + testString.length() + "\n");
RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getCharacterInstance(Locale.getDefault()); RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getCharacterInstance(Locale.getDefault());
testString = "Write hindi here. \u092d\u093e\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 \u0939\u094c\u0964"; testString = "Write hindi here. \u092d\u093e\u0930\u0301 \u0938\u0941\u0902\u0926\u0930 \u0939\u094c\u0964";
logln("testing char iter - string:- \"" + testString + "\""); logln("testing char iter - string:- \"" + testString + "\"");
charIter1.setText(testString); charIter1.setText(testString);
p = charIter1.first(); p = charIter1.first();
@ -209,7 +209,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
// hindi starts here // hindi starts here
p = q; p = q;
q = charIter1.next(4); q = charIter1.next(4);
doTest(testString, p, q, 22, " \u092d\u093e\u0930\u0924"); doTest(testString, p, q, 22, " \u092d\u093e\u0930\u0301"); // Nonsense, but compatible between old and new rules.
p = q; p = q;
q = charIter1.next(2); q = charIter1.next(2);
doTest(testString, p, q, 26, " \u0938\u0941\u0902"); doTest(testString, p, q, 26, " \u0938\u0941\u0902");
@ -217,13 +217,13 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
q = charIter1.following(24); q = charIter1.following(24);
doTest(testString, 24, q, 26, "\u0941\u0902"); doTest(testString, 24, q, 26, "\u0941\u0902");
q = charIter1.following(20); q = charIter1.following(20);
doTest(testString, 20, q, 21, "\u0930"); doTest(testString, 20, q, 22, "\u0930\u0301");
p = charIter1.following(charIter1.last()); p = charIter1.following(charIter1.last());
q = charIter1.next(charIter1.last()); q = charIter1.next(charIter1.last());
if (p != RuleBasedBreakIterator.DONE || q != RuleBasedBreakIterator.DONE) if (p != RuleBasedBreakIterator.DONE || q != RuleBasedBreakIterator.DONE)
errln("ERROR: following()/next() at last position returned #" errln("ERROR: following()/next() at last position returned #"
+ p + " and " + q + " instead of" + testString.length()); + p + " and " + q + " instead of" + testString.length());
testString = "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000."; testString = "Hello! how are you? I'am fine. Thankyou. How are you doing? This costs $20,00,000.";
RuleBasedBreakIterator sentIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getSentenceInstance(Locale.getDefault()); RuleBasedBreakIterator sentIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getSentenceInstance(Locale.getDefault());
logln("testing sentence iter - String:- \"" + testString + "\""); logln("testing sentence iter - String:- \"" + testString + "\"");
sentIter1.setText(testString); sentIter1.setText(testString);
@ -243,7 +243,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
doTest(testString, p, q, 60, "how are you? I'am fine. Thankyou. How are you doing? "); doTest(testString, p, q, 60, "how are you? I'am fine. Thankyou. How are you doing? ");
p = q; p = q;
q = sentIter1.next(); q = sentIter1.next();
doTest(testString, p, q, 83, "This\n costs $20,00,000."); doTest(testString, p, q, 83, "This costs $20,00,000.");
q = sentIter1.following(1); q = sentIter1.following(1);
doTest(testString, 1, q, 7, "ello! "); doTest(testString, 1, q, 7, "ello! ");
q = sentIter1.following(10); q = sentIter1.following(10);
@ -324,7 +324,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
p = wordIter1.preceding(wordIter1.first()); p = wordIter1.preceding(wordIter1.first());
if (p != RuleBasedBreakIterator.DONE) if (p != RuleBasedBreakIterator.DONE)
errln("ERROR: preceding() at starting position returned #" + p + " instead of 0"); errln("ERROR: preceding() at starting position returned #" + p + " instead of 0");
testString = "Write hindi here. \u092d\u093e\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 \u0939\u094c\u0964"; testString = "Write hindi here. \u092d\u093e\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 \u0939\u0301\u0964";
logln("testing character iteration for string \" " + testString + "\" \n"); logln("testing character iteration for string \" " + testString + "\" \n");
RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getCharacterInstance(Locale.getDefault()); RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getCharacterInstance(Locale.getDefault());
charIter1.setText(testString); charIter1.setText(testString);
@ -335,7 +335,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
doTest(testString, p, q, 31, "\u0964"); doTest(testString, p, q, 31, "\u0964");
p = q; p = q;
q = charIter1.previous(); q = charIter1.previous();
doTest(testString, p, q, 29, "\u0939\u094c"); doTest(testString, p, q, 29, "\u0939\u0301");
q = charIter1.preceding(26); q = charIter1.preceding(26);
doTest(testString, 26, q, 23, "\u0938\u0941\u0902"); doTest(testString, 26, q, 23, "\u0938\u0941\u0902");
q = charIter1.preceding(16); q = charIter1.preceding(16);
@ -349,7 +349,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
if (p != RuleBasedBreakIterator.DONE || q != RuleBasedBreakIterator.DONE) if (p != RuleBasedBreakIterator.DONE || q != RuleBasedBreakIterator.DONE)
errln("ERROR: previous()/preceding() at starting position returned #" errln("ERROR: previous()/preceding() at starting position returned #"
+ p + " and " + q + " instead of 0\n"); + p + " and " + q + " instead of 0\n");
testString = "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000."; testString = "Hello! how are you? I'am fine. Thankyou. How are you doing? This costs $20,00,000.";
logln("testing sentence iter - String:- \"" + testString + "\""); logln("testing sentence iter - String:- \"" + testString + "\"");
RuleBasedBreakIterator sentIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getSentenceInstance(Locale.getDefault()); RuleBasedBreakIterator sentIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getSentenceInstance(Locale.getDefault());
sentIter1.setText(testString); sentIter1.setText(testString);
@ -357,7 +357,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
if (p != testString.length()) if (p != testString.length())
errln("ERROR: last() returned" + p + "instead of " + testString.length()); errln("ERROR: last() returned" + p + "instead of " + testString.length());
q = sentIter1.previous(); q = sentIter1.previous();
doTest(testString, p, q, 60, "This\n costs $20,00,000."); doTest(testString, p, q, 60, "This costs $20,00,000.");
p = q; p = q;
q = sentIter1.previous(); q = sentIter1.previous();
doTest(testString, p, q, 41, "How are you doing? "); doTest(testString, p, q, 41, "How are you doing? ");
@ -399,7 +399,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
* Tests the method IsBoundary() of RuleBasedBreakIterator * Tests the method IsBoundary() of RuleBasedBreakIterator
**/ **/
public void TestIsBoundary() { public void TestIsBoundary() {
String testString1 = "Write here. \u092d\u093e\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 \u0939\u094c\u0964"; String testString1 = "Write here. \u092d\u0301\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 a\u0301u";
RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getCharacterInstance(Locale.getDefault()); RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getCharacterInstance(Locale.getDefault());
charIter1.setText(testString1); charIter1.setText(testString1);
int bounds1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 20, 21, 22, 23, 25, 26}; int bounds1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 20, 21, 22, 23, 25, 26};

View File

@ -9,6 +9,7 @@ package com.ibm.icu.dev.test.rbbi;
//Regression testing of RuleBasedBreakIterator //Regression testing of RuleBasedBreakIterator
import com.ibm.icu.dev.test.*; import com.ibm.icu.dev.test.*;
import com.ibm.icu.text.RuleBasedBreakIterator; import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator_Old;
import java.util.Vector; import java.util.Vector;
public class RBBITest extends TestFmwk public class RBBITest extends TestFmwk
@ -43,6 +44,15 @@ public class RBBITest extends TestFmwk
public void TestDefaultRuleBasedCharacterIteration(){ public void TestDefaultRuleBasedCharacterIteration(){
RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getCharacterInstance(); RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getCharacterInstance();
logln("Testing the RBBI for character iteration by using default rules"); logln("Testing the RBBI for character iteration by using default rules");
try {
RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi;
}
catch (ClassCastException e) {
// Bail out if using new RBBI implementation
logln("Test Skipped.");
return;
}
//fetch the rules used to create the above RuleBasedBreakIterator //fetch the rules used to create the above RuleBasedBreakIterator
String defaultRules=rbbi.toString(); String defaultRules=rbbi.toString();
@ -172,6 +182,14 @@ public class RBBITest extends TestFmwk
public void TestDefaultRuleBasedWordIteration(){ public void TestDefaultRuleBasedWordIteration(){
logln("Testing the RBBI for word iteration using default rules"); logln("Testing the RBBI for word iteration using default rules");
RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getWordInstance(); RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getWordInstance();
try {
RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi;
}
catch (ClassCastException e) {
// Bail out if using new RBBI implementation
logln("Test Skipped.");
return;
}
//fetch the rules used to create the above RuleBasedBreakIterator //fetch the rules used to create the above RuleBasedBreakIterator
String defaultRules=rbbi.toString(); String defaultRules=rbbi.toString();
@ -325,6 +343,14 @@ public class RBBITest extends TestFmwk
logln("Testing the RBBI for sentence iteration using default rules"); logln("Testing the RBBI for sentence iteration using default rules");
RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getSentenceInstance(); RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getSentenceInstance();
//fetch the rules used to create the above RuleBasedBreakIterator //fetch the rules used to create the above RuleBasedBreakIterator
try {
RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi;
}
catch (ClassCastException e) {
// Bail out if using new RBBI implementation
logln("Test Skipped.");
return;
}
String defaultRules=rbbi.toString(); String defaultRules=rbbi.toString();
RuleBasedBreakIterator sentIterDefault=null; RuleBasedBreakIterator sentIterDefault=null;
try{ try{
@ -418,16 +444,24 @@ public class RBBITest extends TestFmwk
} }
public void TestDefaultRuleBasedLineIteration(){ public void TestDefaultRuleBasedLineIteration(){
logln("Testing the RBBI for line iteration using default rules"); logln("Testing the RBBI for line iteration using default rules");
RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getLineInstance(); RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getLineInstance();
//fetch the rules used to create the above RuleBasedBreakIterator //fetch the rules used to create the above RuleBasedBreakIterator
String defaultRules=rbbi.toString(); try {
RuleBasedBreakIterator lineIterDefault=null; RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi;
try{ }
lineIterDefault = new RuleBasedBreakIterator(defaultRules); catch (ClassCastException e) {
}catch(IllegalArgumentException iae){ // Bail out if using new RBBI implementation
errln("ERROR: failed construction in TestDefaultRuleBasedLineIteration()" + iae.toString()); logln("Test Skipped.");
} return;
}
String defaultRules=rbbi.toString();
RuleBasedBreakIterator lineIterDefault=null;
try{
lineIterDefault = new RuleBasedBreakIterator(defaultRules);
}catch(IllegalArgumentException iae){
errln("ERROR: failed construction in TestDefaultRuleBasedLineIteration()" + iae.toString());
}
Vector linedata = new Vector(); Vector linedata = new Vector();
linedata.addElement("Multi-"); linedata.addElement("Multi-");
@ -524,6 +558,15 @@ public class RBBITest extends TestFmwk
// get overridden. // get overridden.
rbbi.toString(); rbbi.toString();
RuleBasedBreakIterator lineIter=null; RuleBasedBreakIterator lineIter=null;
try {
RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi;
}
catch (ClassCastException e) {
// Bail out if using new RBBI implementation
logln("Test Skipped.");
return;
}
try{ try{
lineIter = new RuleBasedBreakIterator(rules); lineIter = new RuleBasedBreakIterator(rules);
}catch(IllegalArgumentException iae){ }catch(IllegalArgumentException iae){
@ -651,7 +694,15 @@ public class RBBITest extends TestFmwk
public void TestAbbrRuleBasedWordIteration(){ public void TestAbbrRuleBasedWordIteration(){
logln("Testing the RBBI for word iteration by adding rules to support abbreviation"); logln("Testing the RBBI for word iteration by adding rules to support abbreviation");
RuleBasedBreakIterator rb =(RuleBasedBreakIterator)RuleBasedBreakIterator.getWordInstance(); RuleBasedBreakIterator rb =(RuleBasedBreakIterator)RuleBasedBreakIterator.getWordInstance();
try {
// This test won't work with the new break iterators. Cast will fail in this case.
RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old) rb;
}
catch (ClassCastException e) {
logln("Test skipped.");
return;
}
String wrules2="$abbr=((Mr.)|(Mrs.)|(Ms.)|(Dr.)|(U.S.));" + // abbreviations. String wrules2="$abbr=((Mr.)|(Mrs.)|(Ms.)|(Dr.)|(U.S.));" + // abbreviations.
rb.toString() + rb.toString() +
"($abbr$ws)*$word;"; "($abbr$ws)*$word;";
@ -701,6 +752,10 @@ public class RBBITest extends TestFmwk
buffer.append(text); buffer.append(text);
} }
text = buffer.toString(); text = buffer.toString();
if (rbbi == null) {
errln("null iterator, test skipped.");
return;
}
rbbi.setText(text); rbbi.setText(text);

View File

@ -29,11 +29,11 @@ public class BreakIteratorRules extends ListResourceBundle {
// BreakIteratorClasses lists the class names to instantiate for each // BreakIteratorClasses lists the class names to instantiate for each
// built-in type of BreakIterator // built-in type of BreakIterator
{ "BreakIteratorClasses", { "BreakIteratorClasses",
new String[] { "RuleBasedBreakIterator", // character-break iterator class new String[] { "RuleBasedBreakIterator_New", // character-break iterator class
"RuleBasedBreakIterator", // word-break iterator class "RuleBasedBreakIterator_New", // word-break iterator class
"RuleBasedBreakIterator", // line-break iterator class "RuleBasedBreakIterator_New", // line-break iterator class
"RuleBasedBreakIterator", // sentence-break iterator class "RuleBasedBreakIterator_New", // sentence-break iterator class
"RuleBasedBreakIterator"} // Title-Case break iterator class "RuleBasedBreakIterator_New"} // Title-Case break iterator class
}, },
// rules describing how to break between logical characters // rules describing how to break between logical characters

View File

@ -27,10 +27,10 @@ public class BreakIteratorRules_th extends ListResourceBundle {
// iterator. Notice we're now using DictionaryBasedBreakIterator // iterator. Notice we're now using DictionaryBasedBreakIterator
// for word and line breaking. // for word and line breaking.
{ "BreakIteratorClasses", { "BreakIteratorClasses",
new String[] { "RuleBasedBreakIterator", // character-break iterator class new String[] { "RuleBasedBreakIterator_New", // character-break iterator class
"DictionaryBasedBreakIterator", // word-break iterator class "DictionaryBasedBreakIterator", // word-break iterator class
"DictionaryBasedBreakIterator", // line-break iterator class "DictionaryBasedBreakIterator", // line-break iterator class
"RuleBasedBreakIterator" } // sentence-break iterator class "RuleBasedBreakIterator_New" } // sentence-break iterator class
}, },
{ "WordBreakRules", { "WordBreakRules",

View File

@ -18,6 +18,7 @@ import com.ibm.icu.impl.ICULocaleService;
import com.ibm.icu.impl.ICUService; import com.ibm.icu.impl.ICUService;
import com.ibm.icu.impl.ICUService.Factory; import com.ibm.icu.impl.ICUService.Factory;
import com.ibm.icu.util.ULocale; import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.VersionInfo;
/** /**
* @author Ram * @author Ram
@ -76,10 +77,26 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
} }
static final ICULocaleService service = new BFService(); static final ICULocaleService service = new BFService();
// KIND_NAMES are used in synthesizing the name of the resource that holds the
// source break rules for old-style (ICU 2.8 and previous) break iterators.
// The resources are in com.ibm.icu.impl.data.BreakIteratorRules, and have
// names like "CharacterBreakRules", where the "Character" part of the
// name comes from this array.
private static final String[] KIND_NAMES = { private static final String[] KIND_NAMES = {
"Character", "Word", "Line", "Sentence", "Title" "Character", "Word", "Line", "Sentence", "Title"
}; };
/** KIND_NAMES_2 are used in synthesizing the names for
* the precompiled break rules used with the new (ICU 3.0) RBBI.
* The fully assembled names look like icudt30b_char.brk, which is the
* file name of the brk file as produced by the ICU4C build.
* @internal
*/
private static final String[] KIND_NAMES_2 = {
"char", "word", "line", "sent", "title"
};
private static BreakIterator createBreakInstance(Locale locale, int kind) { private static BreakIterator createBreakInstance(Locale locale, int kind) {
String prefix = KIND_NAMES[kind]; String prefix = KIND_NAMES[kind];
return createBreakInstance(locale, kind, return createBreakInstance(locale, kind,
@ -97,8 +114,25 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
String[] classNames = bundle.getStringArray("BreakIteratorClasses"); String[] classNames = bundle.getStringArray("BreakIteratorClasses");
String rules = bundle.getString(rulesName); String rules = bundle.getString(rulesName);
if (classNames[kind].equals("RuleBasedBreakIterator")) { if (classNames[kind].equals("RuleBasedBreakIterator")) {
// Old style (2.8 and previous) Break Iterator.
// Not used by default, but if someone wants to specify the old class
// in some locale's resources, it should still work.
iter = new RuleBasedBreakIterator_Old(rules); iter = new RuleBasedBreakIterator_Old(rules);
} }
else if (classNames[kind].equals("RuleBasedBreakIterator_New")) {
try {
// Class for new RBBI engine.
// Set up path to precompiled rule data.
String rulesFileName =
"data/icudt" + VersionInfo.ICU_VERSION.getMajor() +
VersionInfo.ICU_VERSION.getMinor() + "b_" + KIND_NAMES_2[kind] + ".brk";
InputStream is = ICUData.getRequiredStream(rulesFileName);
iter = RuleBasedBreakIterator_New.getInstanceFromCompiledRules(is);
}
catch (IOException e) {
throw new IllegalArgumentException(e.toString());
}
}
else if (classNames[kind].equals("DictionaryBasedBreakIterator")) { else if (classNames[kind].equals("DictionaryBasedBreakIterator")) {
try { try {
InputStream dictionary = ICUData.getStream(bundle.getString(dictionaryName)); InputStream dictionary = ICUData.getStream(bundle.getString(dictionaryName));

View File

@ -8,16 +8,10 @@
package com.ibm.icu.text; package com.ibm.icu.text;
import java.io.InputStream; import java.io.InputStream;
import java.io.BufferedInputStream;
import java.io.DataInputStream; import java.io.DataInputStream;
import java.io.IOException; import java.io.IOException;
import java.util.Locale;
import com.ibm.icu.util.RangeValueIterator;
import com.ibm.icu.util.VersionInfo; import com.ibm.icu.util.VersionInfo;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.impl.ICUData; import com.ibm.icu.impl.ICUData;
import com.ibm.icu.impl.Trie; import com.ibm.icu.impl.Trie;
import com.ibm.icu.impl.CharTrie; import com.ibm.icu.impl.CharTrie;
@ -69,9 +63,13 @@ public class RBBIDataWrapper {
// Getters for fields from the state table header // Getters for fields from the state table header
// //
final static int getNumStates(int table[]) { final static int getNumStates(short table[]) {
return table[NUMSTATES]<<16 + (table[NUMSTATES+1]&0xffff); int hi = table[NUMSTATES];
} int lo = table[NUMSTATES+1];
int val = (hi<<16) + (lo&0x0000ffff);
return val;
}
/** /**
* Data Header. A struct-like class with the fields from the RBBI data file header. * Data Header. A struct-like class with the fields from the RBBI data file header.
@ -119,14 +117,14 @@ public class RBBIDataWrapper {
static class TrieFoldingFunc implements Trie.DataManipulate { static class TrieFoldingFunc implements Trie.DataManipulate {
public int getFoldingOffset(int data) { public int getFoldingOffset(int data) {
if ((data & 0x8000) == 0) { if ((data & 0x8000) != 0) {
return data & 0x7fff; return data & 0x7fff;
} else { } else {
return 0; return 0;
} }
} }
}; };
static TrieFoldingFunc fTrieFoldingFunc; static TrieFoldingFunc fTrieFoldingFunc = new TrieFoldingFunc();
RBBIDataWrapper() { RBBIDataWrapper() {
@ -299,19 +297,148 @@ public class RBBIDataWrapper {
/** Debug function to display the break iterator data. */ /** Debug function to display the break iterator data. */
void dump() { void dump() {
System.out.println("RBBI Data Wrapper dump ..."); System.out.println("RBBI Data Wrapper dump ...");
System.out.println();
System.out.println("Forward State Table");
dumpTable(fFTable);
System.out.println("Reverse State Table");
dumpTable(fRTable);
System.out.println("Forward Safe Points Table");
dumpTable(fSFTable);
System.out.println("Reverse Safe Points Table");
dumpTable(fSRTable);
dumpCharCategories();
System.out.println("Source Rules: " + fRuleSource); System.out.println("Source Rules: " + fRuleSource);
}
/** Fixed width int-to-string conversion.
 *  The decimal form of n is right-justified in a field of width characters,
 *  padded on the left with spaces.  A value that needs more than width
 *  characters is returned in full, never truncated.
 *  @param n      the value to convert
 *  @param width  the minimum length of the result
 *  @return       the padded decimal string
 */
private static String intToString(int n, int width) {
    String digits = Integer.toString(n);
    StringBuffer padded = new StringBuffer(width);
    for (int pad = width - digits.length(); pad > 0; pad--) {
        padded.append(' ');
    }
    padded.append(digits);
    return padded.toString();
}
/** Dump a state table. (A full set of RBBI rules has 4 state tables.)
 *  Prints a heading line with one column per character category, a dashed
 *  underline of the same length, then one row per state via dumpRow().
 *  @param table  the state table to print
 */
private void dumpTable(short table[]) {
    // Heading: the fixed columns followed by one numbered column per category.
    StringBuffer heading = new StringBuffer(" Row Acc Look Tag");
    for (int cat = 0; cat < fHeader.fCatCount; cat++) {
        heading.append(intToString(cat, 5));
    }
    String header = heading.toString();
    System.out.println(header);

    // Underline exactly as wide as the heading.
    StringBuffer underline = new StringBuffer(header.length());
    for (int i = header.length(); i > 0; i--) {
        underline.append("-");
    }
    System.out.println(underline.toString());

    int stateCount = getNumStates(table);
    int state = 0;
    while (state < stateCount) {
        dumpRow(table, state);
        state++;
    }
    System.out.println();
}
/**
 * Dump (for debug) a single row of an RBBI state table.
 * The row is printed on one line: the state number, the accepting and
 * lookahead values (left blank when zero), the tag index, and then the
 * next-state entry for every character category.
 * @param table  the state table containing the row
 * @param state  the number of the state (row) to print
 * @internal
 */
private void dumpRow(short table[], int state) {
    StringBuffer dest = new StringBuffer(fHeader.fCatCount*5 + 20);
    dest.append(intToString(state, 4));
    int row = getRowIndex(state);

    // Zero-valued cells are shown as blanks.  The blank must be as wide as
    // the 5-character numeric cell or the following columns shift left.
    if (table[row+ACCEPTING] != 0) {
        dest.append(intToString(table[row+ACCEPTING], 5));
    } else {
        dest.append("     ");
    }
    if (table[row+LOOKAHEAD] != 0) {
        dest.append(intToString(table[row+LOOKAHEAD], 5));
    } else {
        dest.append("     ");
    }
    dest.append(intToString(table[row+TAGIDX], 5));

    for (int col=0; col<fHeader.fCatCount; col++) {
        dest.append(intToString(table[row+NEXTSTATES+col], 5));
    }

    // The row is printed once, only after it is complete.
    System.out.println(dest);
}
private void dumpCharCategories() {
int n = fHeader.fCatCount;
String catStrings[] = new String[n+1];
int rangeStart = 0;
int rangeEnd = 0;
int lastCat = -1;
int char32;
int category;
int lastNewline[] = new int[n+1];
for (category = 0; category <= fHeader.fCatCount; category ++) {
catStrings[category] = "";
}
System.out.println("\nCharacter Categories");
System.out.println("--------------------");
for (char32 = 0; char32<=0x10ffff; char32++) {
category = fTrie.getCodePointValue(char32);
category &= ~0x4000; // Mask off dictionary bit.
if (category < 0 || category > fHeader.fCatCount) {
System.out.println("Error, bad category " + Integer.toHexString(category) +
" for char " + Integer.toHexString(char32));
break;
}
if (category == lastCat ) {
rangeEnd = char32;
} else {
if (lastCat >= 0) {
if (catStrings[lastCat].length() > lastNewline[lastCat] + 70) {
lastNewline[lastCat] = catStrings[lastCat].length() + 10;
catStrings[lastCat] += "\n ";
}
catStrings[lastCat] += " " + Integer.toHexString(rangeStart);
if (rangeEnd != rangeStart) {
catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd);
}
}
lastCat = category;
rangeStart = rangeEnd = char32;
}
}
catStrings[lastCat] += " " + Integer.toHexString(rangeStart);
if (rangeEnd != rangeStart) {
catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd);
}
for (category = 0; category <= fHeader.fCatCount; category ++) {
System.out.println (intToString(category, 5) + " " + catStrings[category]);
}
System.out.println();
} }
public static void main(String[] args) { public static void main(String[] args) {
String s; String s;
if (args.length == 0) { if (args.length == 0) {
s = "icudt28b_char.brk"; s = "char";
} else { } else {
s = args[0]; s = args[0];
} }
System.out.println("RBBIDataWrapper.main(" + s + ") "); System.out.println("RBBIDataWrapper.main(" + s + ") ");
String versionedName =
"icudt" + VersionInfo.ICU_VERSION.getMajor() +
VersionInfo.ICU_VERSION.getMinor() + "b_" + s + ".brk";
try { try {
RBBIDataWrapper This = RBBIDataWrapper.get(s); RBBIDataWrapper This = RBBIDataWrapper.get(versionedName);
This.dump(); This.dump();
} }
catch (Exception e) { catch (Exception e) {

View File

@ -7,7 +7,9 @@
package com.ibm.icu.text; package com.ibm.icu.text;
import java.text.CharacterIterator; import java.text.CharacterIterator;
import java.text.StringCharacterIterator; import java.io.IOException;
import java.io.InputStream;
/** /**
* Rule Based Break Iterator implementation. * Rule Based Break Iterator implementation.
@ -27,7 +29,7 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
* The rule data for this BreakIterator instance * The rule data for this BreakIterator instance
* @internal * @internal
*/ */
private RBBIDataWrapper fData; private RBBIDataWrapper fRData;
/** Index of the Rule {tag} values for the most recent match. /** Index of the Rule {tag} values for the most recent match.
* @internal * @internal
@ -61,7 +63,9 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
public Object clone() public Object clone()
{ {
RuleBasedBreakIterator_New result = (RuleBasedBreakIterator_New) super.clone(); RuleBasedBreakIterator_New result = (RuleBasedBreakIterator_New) super.clone();
// TODO: real clone code if (fText != null) {
fText = (CharacterIterator)fText.clone();
}
return result; return result;
} }
@ -71,8 +75,27 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
* @stable ICU 2.0 * @stable ICU 2.0
*/ */
public boolean equals(Object that) { public boolean equals(Object that) {
return false; // TODO: try {
} RuleBasedBreakIterator_New other = (RuleBasedBreakIterator_New) that;
if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
return false;
}
if (fRData != null && other.fRData != null &&
(!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
return false;
}
if (fText == null && other.fText == null) {
return true;
}
if (fText == null || other.fText == null) {
return false;
}
return fText.equals(other.fText);
}
catch(ClassCastException e) {
return false;
}
}
/** /**
* Returns the description (rules) used to create this iterator. * Returns the description (rules) used to create this iterator.
@ -81,8 +104,8 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
*/ */
public String toString() { public String toString() {
String retStr = null; String retStr = null;
if (fData != null) { if (fRData != null) {
retStr = fData.fRuleSource; retStr = fRData.fRuleSource;
} }
return retStr; return retStr;
} }
@ -94,9 +117,23 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
*/ */
public int hashCode() public int hashCode()
{ {
return 0; // TODO return fRData.fRuleSource.hashCode();
} }
//=======================================================================
// Constructors & Factories
//=======================================================================
/**
 * Create a break iterator from pre-compiled rule data.
 * @param is  the stream handed to RBBIDataWrapper.get() to load the rule data
 * @return    a new iterator whose text is initially the empty string
 * @throws IOException  as declared; propagated from loading the rule data
 */
public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
    RuleBasedBreakIterator_New iter = new RuleBasedBreakIterator_New();
    // Note: some old tests fail if fText is null on a newly created
    //       instance, so start out with empty text.
    iter.fText = new java.text.StringCharacterIterator("");
    iter.fRData = RBBIDataWrapper.get(is);
    return iter;
}
//======================================================================= //=======================================================================
// BreakIterator overrides // BreakIterator overrides
//======================================================================= //=======================================================================
@ -192,8 +229,8 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
return BreakIterator.DONE; return BreakIterator.DONE;
} }
if (fData.fSRTable != null || fData.fSFTable != null) { if (fRData.fSRTable != null || fRData.fSFTable != null) {
return handlePrevious(fData.fRTable); return handlePrevious(fRData.fRTable);
} }
// old rule syntax // old rule syntax
@ -266,7 +303,7 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
int result = 0; int result = 0;
if (fData.fSRTable != null) { if (fRData.fSRTable != null) {
// Safe Point Reverse rules exist. // Safe Point Reverse rules exist.
// This allows us to use the optimum algorithm. // This allows us to use the optimum algorithm.
fText.setIndex(offset); fText.setIndex(offset);
@ -275,20 +312,20 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
// this handles offset being between a supplementary character // this handles offset being between a supplementary character
CINext32(fText); CINext32(fText);
// handlePrevious will move most of the time to < 1 boundary away // handlePrevious will move most of the time to < 1 boundary away
handlePrevious(fData.fSRTable); handlePrevious(fRData.fSRTable);
result = next(); result = next();
while (result <= offset) { while (result <= offset) {
result = next(); result = next();
} }
return result; return result;
} }
if (fData.fSFTable != null) { if (fRData.fSFTable != null) {
// No Safe point reverse table, but there is a safe pt forward table. // No Safe point reverse table, but there is a safe pt forward table.
// //
fText.setIndex(offset); fText.setIndex(offset);
CIPrevious32(fText); CIPrevious32(fText);
// handle next will give result >= offset // handle next will give result >= offset
handleNext(fData.fSFTable); handleNext(fRData.fSFTable);
// previous will give result 0 or 1 boundary away from offset, // previous will give result 0 or 1 boundary away from offset,
// most of the time // most of the time
// we have to // we have to
@ -352,7 +389,7 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
// to carry out this operation // to carry out this operation
int result; int result;
if (fData.fSFTable != null) { if (fRData.fSFTable != null) {
/// todo synwee /// todo synwee
// new rule syntax // new rule syntax
fText.setIndex(offset); fText.setIndex(offset);
@ -360,19 +397,19 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
// safe point. // safe point.
// this handles offset being between a supplementary character // this handles offset being between a supplementary character
CIPrevious32(fText); CIPrevious32(fText);
handleNext(fData.fSFTable); handleNext(fRData.fSFTable);
result = previous(); result = previous();
while (result >= offset) { while (result >= offset) {
result = previous(); result = previous();
} }
return result; return result;
} }
if (fData.fSRTable != null) { if (fRData.fSRTable != null) {
// backup plan if forward safe table is not available // backup plan if forward safe table is not available
fText.setIndex(offset); fText.setIndex(offset);
CINext32(fText); CINext32(fText);
// handle previous will give result <= offset // handle previous will give result <= offset
handlePrevious(fData.fSRTable); handlePrevious(fRData.fSRTable);
// next will give result 0 or 1 boundary away from offset, // next will give result 0 or 1 boundary away from offset,
// most of the time // most of the time
@ -397,6 +434,19 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
return previous(); return previous();
} }
/**
 * Throw IllegalArgumentException unless
 * text.getBeginIndex() <= offset <= text.getEndIndex().
 * Both ends of the range are inclusive: an offset equal to the end index
 * of the text is accepted.
 * TODO: subclassing interface from old RBBI is not really usable.
 *       What to do with old protected functions tagged as stable?
 *
 * @param offset the text offset to validate
 * @param text   the iterator whose begin and end indexes bound the valid range
 * @throws IllegalArgumentException if offset is less than the begin index or
 *         greater than the end index of text
 * @stable ICU 2.0
 */
protected static final void checkOffset(int offset, CharacterIterator text) {
    // Only offsets strictly past the end index are rejected; end itself is valid.
    if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
        throw new IllegalArgumentException("offset out of bounds");
    }
}
/** /**
* Returns true if the specified position is a boundary position. As a side * Returns true if the specified position is a boundary position. As a side
* effect, leaves the iterator pointing to the first boundary position at * effect, leaves the iterator pointing to the first boundary position at
@ -406,8 +456,10 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
* @stable ICU 2.0 * @stable ICU 2.0
*/ */
public boolean isBoundary(int offset) { public boolean isBoundary(int offset) {
checkOffset(offset, fText);
// the beginning index of the iterator is always a boundary position by definition // the beginning index of the iterator is always a boundary position by definition
if (fText == null || offset == fText.getBeginIndex()) { if (offset == fText.getBeginIndex()) {
first(); // For side effects on current position, tag values. first(); // For side effects on current position, tag values.
return true; return true;
} }
@ -502,8 +554,8 @@ public int getRuleStatus() {
// Status val N-1 <-- the value we need to return // Status val N-1 <-- the value we need to return
// The status values are sorted in ascending order. // The status values are sorted in ascending order.
// This function returns the last (largest) of the array of status values. // This function returns the last (largest) of the array of status values.
int idx = fLastRuleStatusIndex + fData.fStatusTable[fLastRuleStatusIndex]; int idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex];
int tagVal = fData.fStatusTable[idx]; int tagVal = fRData.fStatusTable[idx];
return tagVal; return tagVal;
} }
@ -532,11 +584,11 @@ public int getRuleStatus() {
*/ */
public int getRuleStatusVec(int[] fillInArray) { public int getRuleStatusVec(int[] fillInArray) {
makeRuleStatusValid(); makeRuleStatusValid();
int numStatusVals = fData.fStatusTable[fLastRuleStatusIndex]; int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
if (fillInArray != null) { if (fillInArray != null) {
int numToCopy = Math.min(numStatusVals, fillInArray.length); int numToCopy = Math.min(numStatusVals, fillInArray.length);
for (int i=0; i<numToCopy; i++) { for (int i=0; i<numToCopy; i++) {
fillInArray[i] = fData.fStatusTable[fLastRuleStatusIndex + i + 1]; fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1];
} }
} }
return numStatusVals; return numStatusVals;
@ -618,8 +670,7 @@ public int getRuleStatusVec(int[] fillInArray) {
if (ci == null) { if (ci == null) {
return false; return false;
} }
int end = ci.getEndIndex(); if (ci.getIndex() >= ci.getEndIndex()) {
if (end == 0 || ci.getIndex() < end) {
return false; return false;
} }
return true; return true;
@ -637,7 +688,7 @@ public int getRuleStatusVec(int[] fillInArray) {
* @internal * @internal
*/ */
private int handleNext() { private int handleNext() {
return handleNext(fData.fFTable); return handleNext(fRData.fFTable);
} }
@ -663,7 +714,7 @@ public int getRuleStatusVec(int[] fillInArray) {
int state = START_STATE; int state = START_STATE;
short category; short category;
int c = CICurrent32(fText); int c = CICurrent32(fText);
int row = fData.getRowIndex(state); int row = fRData.getRowIndex(state);
int lookaheadStatus = 0; int lookaheadStatus = 0;
int lookaheadTagIdx = 0; int lookaheadTagIdx = 0;
@ -671,7 +722,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// Character Category fetch for starting character. // Character Category fetch for starting character.
// See comments on character category code within loop, below. // See comments on character category code within loop, below.
category = (short)fData.fTrie.getCodePointValue(c); category = (short)fRData.fTrie.getCodePointValue(c);
if ((category & 0x4000) != 0) { if ((category & 0x4000) != 0) {
// fDictionaryCharCount++; // fDictionaryCharCount++;
category &= ~0x4000; category &= ~0x4000;
@ -704,7 +755,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// look up the current character's character category, which tells us // look up the current character's character category, which tells us
// which column in the state table to look at. // which column in the state table to look at.
// //
category = (short)fData.fTrie.getCodePointValue(c); category = (short)fRData.fTrie.getCodePointValue(c);
// Clear the dictionary flag bit in the character's category. // Clear the dictionary flag bit in the character's category.
// Note: not using the old style dictionary stuff in this Java engine. // Note: not using the old style dictionary stuff in this Java engine.
@ -725,7 +776,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// look up a state transition in the state table // look up a state transition in the state table
// state = row->fNextState[category]; // state = row->fNextState[category];
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
row = fData.getRowIndex(state); row = fRData.getRowIndex(state);
// Get the next character. Doing it here positions the iterator // Get the next character. Doing it here positions the iterator
// to the correct position for recording matches in the code that // to the correct position for recording matches in the code that
@ -793,15 +844,15 @@ public int getRuleStatusVec(int[] fillInArray) {
* handlePrevious * handlePrevious
*/ */
private int handlePrevious() { private int handlePrevious() {
if (fText == null || fData == null) { if (fText == null || fRData == null) {
return 0; return 0;
} }
if (fData.fRTable == null) { if (fRData.fRTable == null) {
fText.first(); fText.first();
return fText.getIndex(); return fText.getIndex();
} }
short stateTable[] = fData.fRTable; short stateTable[] = fRData.fRTable;
int state = START_STATE; int state = START_STATE;
int category; int category;
int lastCategory = 0; int lastCategory = 0;
@ -812,8 +863,8 @@ public int getRuleStatusVec(int[] fillInArray) {
int c = CICurrent32(fText); int c = CICurrent32(fText);
int row; int row;
row = fData.getRowIndex(state); row = fRData.getRowIndex(state);
category = (short)fData.fTrie.getCodePointValue(c); category = (short)fRData.fTrie.getCodePointValue(c);
category &= ~0x4000; // Clear the dictionary bit, just in case. category &= ~0x4000; // Clear the dictionary bit, just in case.
if (fTrace) { if (fTrace) {
@ -829,7 +880,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// save the last character's category and look up the current // save the last character's category and look up the current
// character's category // character's category
lastCategory = category; lastCategory = category;
category = (short)fData.fTrie.getCodePointValue(c); category = (short)fRData.fTrie.getCodePointValue(c);
// Check the dictionary bit in the character's category. // Check the dictionary bit in the character's category.
// Don't exist in this Java engine implementation. Clear the bit. // Don't exist in this Java engine implementation. Clear the bit.
@ -848,7 +899,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// look up a state transition in the backwards state table // look up a state transition in the backwards state table
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
row = fData.getRowIndex(state); row = fRData.getRowIndex(state);
continueOn: { continueOn: {
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == 0 && if (stateTable[row + RBBIDataWrapper.ACCEPTING] == 0 &&
@ -942,9 +993,9 @@ public int getRuleStatusVec(int[] fillInArray) {
boolean lookAheadHardBreak = boolean lookAheadHardBreak =
(stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0; (stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
int row = fData.getRowIndex(state); int row = fRData.getRowIndex(state);
category = (short)fData.fTrie.getCodePointValue(c); category = (short)fRData.fTrie.getCodePointValue(c);
category &= ~0x4000; // Mask off dictionary bit. category &= ~0x4000; // Mask off dictionary bit.
if (fTrace) { if (fTrace) {
@ -965,7 +1016,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// save the last character's category and look up the current // save the last character's category and look up the current
// character's category // character's category
lastCategory = category; lastCategory = category;
category = (short)fData.fTrie.getCodePointValue(c); category = (short)fRData.fTrie.getCodePointValue(c);
category &= ~0x4000; // Clear the dictionary bit flag category &= ~0x4000; // Clear the dictionary bit flag
// (Should be unused; holdover from old RBBI) // (Should be unused; holdover from old RBBI)
@ -982,7 +1033,7 @@ public int getRuleStatusVec(int[] fillInArray) {
// look up a state transition in the backwards state table // look up a state transition in the backwards state table
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
row = fData.getRowIndex(state); row = fRData.getRowIndex(state);
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) { if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
// Match found, common case, could have lookahead so we move on to check it // Match found, common case, could have lookahead so we move on to check it