ICU-3295 RBBI runtime port to Java
X-SVN-Rev: 15029
This commit is contained in:
parent
4514814ae6
commit
b6b23502af
@ -194,7 +194,7 @@
|
||||
<target name ="coreData" depends="init">
|
||||
<copy todir="${build.dir}/com/ibm/icu/impl/data">
|
||||
<fileset dir="${src.dir}/com/ibm/icu/impl/data"
|
||||
includes="Transliterator_*.txt,*.icu,*.spp"
|
||||
includes="Transliterator_*.txt,*.icu,*.spp,*.brk"
|
||||
excludes="**/CVS/**/*,Transliterator_Han_Latin_*.txt"/>
|
||||
</copy>
|
||||
</target>
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -186,7 +186,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
|
||||
errln("ERROR: next()/following() at last position returned #"
|
||||
+ p + " and " + q + " instead of" + testString.length() + "\n");
|
||||
RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getCharacterInstance(Locale.getDefault());
|
||||
testString = "Write hindi here. \u092d\u093e\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 \u0939\u094c\u0964";
|
||||
testString = "Write hindi here. \u092d\u093e\u0930\u0301 \u0938\u0941\u0902\u0926\u0930 \u0939\u094c\u0964";
|
||||
logln("testing char iter - string:- \"" + testString + "\"");
|
||||
charIter1.setText(testString);
|
||||
p = charIter1.first();
|
||||
@ -209,7 +209,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
|
||||
// hindi starts here
|
||||
p = q;
|
||||
q = charIter1.next(4);
|
||||
doTest(testString, p, q, 22, " \u092d\u093e\u0930\u0924");
|
||||
doTest(testString, p, q, 22, " \u092d\u093e\u0930\u0301"); // Nonsense, but compatible between old and new rules.
|
||||
p = q;
|
||||
q = charIter1.next(2);
|
||||
doTest(testString, p, q, 26, " \u0938\u0941\u0902");
|
||||
@ -217,13 +217,13 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
|
||||
q = charIter1.following(24);
|
||||
doTest(testString, 24, q, 26, "\u0941\u0902");
|
||||
q = charIter1.following(20);
|
||||
doTest(testString, 20, q, 21, "\u0930");
|
||||
doTest(testString, 20, q, 22, "\u0930\u0301");
|
||||
p = charIter1.following(charIter1.last());
|
||||
q = charIter1.next(charIter1.last());
|
||||
if (p != RuleBasedBreakIterator.DONE || q != RuleBasedBreakIterator.DONE)
|
||||
errln("ERROR: following()/next() at last position returned #"
|
||||
+ p + " and " + q + " instead of" + testString.length());
|
||||
testString = "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.";
|
||||
testString = "Hello! how are you? I'am fine. Thankyou. How are you doing? This costs $20,00,000.";
|
||||
RuleBasedBreakIterator sentIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getSentenceInstance(Locale.getDefault());
|
||||
logln("testing sentence iter - String:- \"" + testString + "\"");
|
||||
sentIter1.setText(testString);
|
||||
@ -243,7 +243,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
|
||||
doTest(testString, p, q, 60, "how are you? I'am fine. Thankyou. How are you doing? ");
|
||||
p = q;
|
||||
q = sentIter1.next();
|
||||
doTest(testString, p, q, 83, "This\n costs $20,00,000.");
|
||||
doTest(testString, p, q, 83, "This costs $20,00,000.");
|
||||
q = sentIter1.following(1);
|
||||
doTest(testString, 1, q, 7, "ello! ");
|
||||
q = sentIter1.following(10);
|
||||
@ -324,7 +324,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
|
||||
p = wordIter1.preceding(wordIter1.first());
|
||||
if (p != RuleBasedBreakIterator.DONE)
|
||||
errln("ERROR: preceding() at starting position returned #" + p + " instead of 0");
|
||||
testString = "Write hindi here. \u092d\u093e\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 \u0939\u094c\u0964";
|
||||
testString = "Write hindi here. \u092d\u093e\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 \u0939\u0301\u0964";
|
||||
logln("testing character iteration for string \" " + testString + "\" \n");
|
||||
RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getCharacterInstance(Locale.getDefault());
|
||||
charIter1.setText(testString);
|
||||
@ -335,7 +335,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
|
||||
doTest(testString, p, q, 31, "\u0964");
|
||||
p = q;
|
||||
q = charIter1.previous();
|
||||
doTest(testString, p, q, 29, "\u0939\u094c");
|
||||
doTest(testString, p, q, 29, "\u0939\u0301");
|
||||
q = charIter1.preceding(26);
|
||||
doTest(testString, 26, q, 23, "\u0938\u0941\u0902");
|
||||
q = charIter1.preceding(16);
|
||||
@ -349,7 +349,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
|
||||
if (p != RuleBasedBreakIterator.DONE || q != RuleBasedBreakIterator.DONE)
|
||||
errln("ERROR: previous()/preceding() at starting position returned #"
|
||||
+ p + " and " + q + " instead of 0\n");
|
||||
testString = "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.";
|
||||
testString = "Hello! how are you? I'am fine. Thankyou. How are you doing? This costs $20,00,000.";
|
||||
logln("testing sentence iter - String:- \"" + testString + "\"");
|
||||
RuleBasedBreakIterator sentIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getSentenceInstance(Locale.getDefault());
|
||||
sentIter1.setText(testString);
|
||||
@ -357,7 +357,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
|
||||
if (p != testString.length())
|
||||
errln("ERROR: last() returned" + p + "instead of " + testString.length());
|
||||
q = sentIter1.previous();
|
||||
doTest(testString, p, q, 60, "This\n costs $20,00,000.");
|
||||
doTest(testString, p, q, 60, "This costs $20,00,000.");
|
||||
p = q;
|
||||
q = sentIter1.previous();
|
||||
doTest(testString, p, q, 41, "How are you doing? ");
|
||||
@ -399,7 +399,7 @@ public class RBBIAPITest extends com.ibm.icu.dev.test.TestFmwk {
|
||||
* Tests the method IsBoundary() of RuleBasedBreakIterator
|
||||
**/
|
||||
public void TestIsBoundary() {
|
||||
String testString1 = "Write here. \u092d\u093e\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 \u0939\u094c\u0964";
|
||||
String testString1 = "Write here. \u092d\u0301\u0930\u0924 \u0938\u0941\u0902\u0926\u0930 a\u0301u";
|
||||
RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) RuleBasedBreakIterator.getCharacterInstance(Locale.getDefault());
|
||||
charIter1.setText(testString1);
|
||||
int bounds1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 20, 21, 22, 23, 25, 26};
|
||||
|
@ -9,6 +9,7 @@ package com.ibm.icu.dev.test.rbbi;
|
||||
//Regression testing of RuleBasedBreakIterator
|
||||
import com.ibm.icu.dev.test.*;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator_Old;
|
||||
import java.util.Vector;
|
||||
|
||||
public class RBBITest extends TestFmwk
|
||||
@ -43,6 +44,15 @@ public class RBBITest extends TestFmwk
|
||||
public void TestDefaultRuleBasedCharacterIteration(){
|
||||
RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getCharacterInstance();
|
||||
logln("Testing the RBBI for character iteration by using default rules");
|
||||
try {
|
||||
RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi;
|
||||
}
|
||||
catch (ClassCastException e) {
|
||||
// Bail out if using new RBBI implementation
|
||||
logln("Test Skipped.");
|
||||
return;
|
||||
}
|
||||
|
||||
//fetch the rules used to create the above RuleBasedBreakIterator
|
||||
String defaultRules=rbbi.toString();
|
||||
|
||||
@ -172,6 +182,14 @@ public class RBBITest extends TestFmwk
|
||||
public void TestDefaultRuleBasedWordIteration(){
|
||||
logln("Testing the RBBI for word iteration using default rules");
|
||||
RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getWordInstance();
|
||||
try {
|
||||
RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi;
|
||||
}
|
||||
catch (ClassCastException e) {
|
||||
// Bail out if using new RBBI implementation
|
||||
logln("Test Skipped.");
|
||||
return;
|
||||
}
|
||||
//fetch the rules used to create the above RuleBasedBreakIterator
|
||||
String defaultRules=rbbi.toString();
|
||||
|
||||
@ -325,6 +343,14 @@ public class RBBITest extends TestFmwk
|
||||
logln("Testing the RBBI for sentence iteration using default rules");
|
||||
RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getSentenceInstance();
|
||||
//fetch the rules used to create the above RuleBasedBreakIterator
|
||||
try {
|
||||
RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi;
|
||||
}
|
||||
catch (ClassCastException e) {
|
||||
// Bail out if using new RBBI implementation
|
||||
logln("Test Skipped.");
|
||||
return;
|
||||
}
|
||||
String defaultRules=rbbi.toString();
|
||||
RuleBasedBreakIterator sentIterDefault=null;
|
||||
try{
|
||||
@ -418,16 +444,24 @@ public class RBBITest extends TestFmwk
|
||||
}
|
||||
|
||||
public void TestDefaultRuleBasedLineIteration(){
|
||||
logln("Testing the RBBI for line iteration using default rules");
|
||||
RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getLineInstance();
|
||||
//fetch the rules used to create the above RuleBasedBreakIterator
|
||||
String defaultRules=rbbi.toString();
|
||||
RuleBasedBreakIterator lineIterDefault=null;
|
||||
try{
|
||||
lineIterDefault = new RuleBasedBreakIterator(defaultRules);
|
||||
}catch(IllegalArgumentException iae){
|
||||
errln("ERROR: failed construction in TestDefaultRuleBasedLineIteration()" + iae.toString());
|
||||
}
|
||||
logln("Testing the RBBI for line iteration using default rules");
|
||||
RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getLineInstance();
|
||||
//fetch the rules used to create the above RuleBasedBreakIterator
|
||||
try {
|
||||
RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi;
|
||||
}
|
||||
catch (ClassCastException e) {
|
||||
// Bail out if using new RBBI implementation
|
||||
logln("Test Skipped.");
|
||||
return;
|
||||
}
|
||||
String defaultRules=rbbi.toString();
|
||||
RuleBasedBreakIterator lineIterDefault=null;
|
||||
try{
|
||||
lineIterDefault = new RuleBasedBreakIterator(defaultRules);
|
||||
}catch(IllegalArgumentException iae){
|
||||
errln("ERROR: failed construction in TestDefaultRuleBasedLineIteration()" + iae.toString());
|
||||
}
|
||||
|
||||
Vector linedata = new Vector();
|
||||
linedata.addElement("Multi-");
|
||||
@ -524,6 +558,15 @@ public class RBBITest extends TestFmwk
|
||||
// get overridden.
|
||||
rbbi.toString();
|
||||
RuleBasedBreakIterator lineIter=null;
|
||||
try {
|
||||
RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old)rbbi;
|
||||
}
|
||||
catch (ClassCastException e) {
|
||||
// Bail out if using new RBBI implementation
|
||||
logln("Test Skipped.");
|
||||
return;
|
||||
}
|
||||
|
||||
try{
|
||||
lineIter = new RuleBasedBreakIterator(rules);
|
||||
}catch(IllegalArgumentException iae){
|
||||
@ -651,7 +694,15 @@ public class RBBITest extends TestFmwk
|
||||
public void TestAbbrRuleBasedWordIteration(){
|
||||
logln("Testing the RBBI for word iteration by adding rules to support abbreviation");
|
||||
RuleBasedBreakIterator rb =(RuleBasedBreakIterator)RuleBasedBreakIterator.getWordInstance();
|
||||
|
||||
try {
|
||||
// This test won't work with the new break iterators. Cast will fail in this case.
|
||||
RuleBasedBreakIterator_Old obi = (RuleBasedBreakIterator_Old) rb;
|
||||
}
|
||||
catch (ClassCastException e) {
|
||||
logln("Test skipped.");
|
||||
return;
|
||||
}
|
||||
|
||||
String wrules2="$abbr=((Mr.)|(Mrs.)|(Ms.)|(Dr.)|(U.S.));" + // abbreviations.
|
||||
rb.toString() +
|
||||
"($abbr$ws)*$word;";
|
||||
@ -701,6 +752,10 @@ public class RBBITest extends TestFmwk
|
||||
buffer.append(text);
|
||||
}
|
||||
text = buffer.toString();
|
||||
if (rbbi == null) {
|
||||
errln("null iterator, test skipped.");
|
||||
return;
|
||||
}
|
||||
|
||||
rbbi.setText(text);
|
||||
|
||||
|
@ -29,11 +29,11 @@ public class BreakIteratorRules extends ListResourceBundle {
|
||||
// BreakIteratorClasses lists the class names to instantiate for each
|
||||
// built-in type of BreakIterator
|
||||
{ "BreakIteratorClasses",
|
||||
new String[] { "RuleBasedBreakIterator", // character-break iterator class
|
||||
"RuleBasedBreakIterator", // word-break iterator class
|
||||
"RuleBasedBreakIterator", // line-break iterator class
|
||||
"RuleBasedBreakIterator", // sentence-break iterator class
|
||||
"RuleBasedBreakIterator"} // Title-Case break iterator class
|
||||
new String[] { "RuleBasedBreakIterator_New", // character-break iterator class
|
||||
"RuleBasedBreakIterator_New", // word-break iterator class
|
||||
"RuleBasedBreakIterator_New", // line-break iterator class
|
||||
"RuleBasedBreakIterator_New", // sentence-break iterator class
|
||||
"RuleBasedBreakIterator_New"} // Title-Case break iterator class
|
||||
},
|
||||
|
||||
// rules describing how to break between logical characters
|
||||
|
@ -27,10 +27,10 @@ public class BreakIteratorRules_th extends ListResourceBundle {
|
||||
// iterator. Notice we're now using DictionaryBasedBreakIterator
|
||||
// for word and line breaking.
|
||||
{ "BreakIteratorClasses",
|
||||
new String[] { "RuleBasedBreakIterator", // character-break iterator class
|
||||
new String[] { "RuleBasedBreakIterator_New", // character-break iterator class
|
||||
"DictionaryBasedBreakIterator", // word-break iterator class
|
||||
"DictionaryBasedBreakIterator", // line-break iterator class
|
||||
"RuleBasedBreakIterator" } // sentence-break iterator class
|
||||
"RuleBasedBreakIterator_New" } // sentence-break iterator class
|
||||
},
|
||||
|
||||
{ "WordBreakRules",
|
||||
|
@ -18,6 +18,7 @@ import com.ibm.icu.impl.ICULocaleService;
|
||||
import com.ibm.icu.impl.ICUService;
|
||||
import com.ibm.icu.impl.ICUService.Factory;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import com.ibm.icu.util.VersionInfo;
|
||||
|
||||
/**
|
||||
* @author Ram
|
||||
@ -76,10 +77,26 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
|
||||
}
|
||||
static final ICULocaleService service = new BFService();
|
||||
|
||||
// KIND_NAMES are used in synthesizing the resource name that holds the source
|
||||
// break rules. For old-style (ICU 2.8 and previous) break iterators.
|
||||
// The resources are com.ibm.icu.impl.data.BreakIteratorRules, and have
|
||||
// names like "CharacterBreakRules", where the "Character" part of the
|
||||
// name comes from here (this array).
|
||||
private static final String[] KIND_NAMES = {
|
||||
"Character", "Word", "Line", "Sentence", "Title"
|
||||
};
|
||||
|
||||
/** KIND_NAMES_2 are used in synthesizing the names for
|
||||
* the precompiled break rules used with the new (ICU 3.0) RBBI.
|
||||
* The fully assembled names look like icudt30b_char.brk, which is the
|
||||
* file name of the brk file as produced by the ICU4C build.
|
||||
* @internal
|
||||
*/
|
||||
private static final String[] KIND_NAMES_2 = {
|
||||
"char", "word", "line", "sent", "title"
|
||||
};
|
||||
|
||||
|
||||
private static BreakIterator createBreakInstance(Locale locale, int kind) {
|
||||
String prefix = KIND_NAMES[kind];
|
||||
return createBreakInstance(locale, kind,
|
||||
@ -97,8 +114,25 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
|
||||
String[] classNames = bundle.getStringArray("BreakIteratorClasses");
|
||||
String rules = bundle.getString(rulesName);
|
||||
if (classNames[kind].equals("RuleBasedBreakIterator")) {
|
||||
// Old style (2.8 and previous) Break Iterator.
|
||||
// Not used by default, but if someone wants to specify the old class
|
||||
// in some locale's resources, it should still work.
|
||||
iter = new RuleBasedBreakIterator_Old(rules);
|
||||
}
|
||||
else if (classNames[kind].equals("RuleBasedBreakIterator_New")) {
|
||||
try {
|
||||
// Class for new RBBI engine.
|
||||
// Set up path to precompiled rule data.
|
||||
String rulesFileName =
|
||||
"data/icudt" + VersionInfo.ICU_VERSION.getMajor() +
|
||||
VersionInfo.ICU_VERSION.getMinor() + "b_" + KIND_NAMES_2[kind] + ".brk";
|
||||
InputStream is = ICUData.getRequiredStream(rulesFileName);
|
||||
iter = RuleBasedBreakIterator_New.getInstanceFromCompiledRules(is);
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new IllegalArgumentException(e.toString());
|
||||
}
|
||||
}
|
||||
else if (classNames[kind].equals("DictionaryBasedBreakIterator")) {
|
||||
try {
|
||||
InputStream dictionary = ICUData.getStream(bundle.getString(dictionaryName));
|
||||
|
@ -8,16 +8,10 @@
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Locale;
|
||||
|
||||
import com.ibm.icu.util.RangeValueIterator;
|
||||
import com.ibm.icu.util.VersionInfo;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UCharacterCategory;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.impl.ICUData;
|
||||
import com.ibm.icu.impl.Trie;
|
||||
import com.ibm.icu.impl.CharTrie;
|
||||
@ -69,9 +63,13 @@ public class RBBIDataWrapper {
|
||||
|
||||
// Getters for fields from the state table header
|
||||
//
|
||||
final static int getNumStates(int table[]) {
|
||||
return table[NUMSTATES]<<16 + (table[NUMSTATES+1]&0xffff);
|
||||
}
|
||||
final static int getNumStates(short table[]) {
|
||||
int hi = table[NUMSTATES];
|
||||
int lo = table[NUMSTATES+1];
|
||||
int val = (hi<<16) + (lo&0x0000ffff);
|
||||
return val;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Data Header. A struct-like class with the fields from the RBBI data file header.
|
||||
@ -119,14 +117,14 @@ public class RBBIDataWrapper {
|
||||
|
||||
static class TrieFoldingFunc implements Trie.DataManipulate {
|
||||
public int getFoldingOffset(int data) {
|
||||
if ((data & 0x8000) == 0) {
|
||||
if ((data & 0x8000) != 0) {
|
||||
return data & 0x7fff;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
};
|
||||
static TrieFoldingFunc fTrieFoldingFunc;
|
||||
static TrieFoldingFunc fTrieFoldingFunc = new TrieFoldingFunc();
|
||||
|
||||
|
||||
RBBIDataWrapper() {
|
||||
@ -299,19 +297,148 @@ public class RBBIDataWrapper {
|
||||
/** Debug function to display the break iterator data. */
|
||||
void dump() {
|
||||
System.out.println("RBBI Data Wrapper dump ...");
|
||||
System.out.println();
|
||||
System.out.println("Forward State Table");
|
||||
dumpTable(fFTable);
|
||||
System.out.println("Reverse State Table");
|
||||
dumpTable(fRTable);
|
||||
System.out.println("Forward Safe Points Table");
|
||||
dumpTable(fSFTable);
|
||||
System.out.println("Reverse Safe Points Table");
|
||||
dumpTable(fSRTable);
|
||||
|
||||
dumpCharCategories();
|
||||
System.out.println("Source Rules: " + fRuleSource);
|
||||
|
||||
}
|
||||
|
||||
/** Fixed width int-to-string conversion.
|
||||
* TODO: there must be easy built-in way to do this */
|
||||
private static String intToString(int n, int width) {
|
||||
StringBuffer dest = new StringBuffer(width);
|
||||
dest.append(n);
|
||||
while (dest.length() < width) {
|
||||
dest.insert(0, ' ');
|
||||
}
|
||||
return dest.toString();
|
||||
}
|
||||
|
||||
/** Dump a state table. (A full set of RBBI rules has 4 state tables.) */
|
||||
private void dumpTable(short table[]) {
|
||||
int n;
|
||||
int state;
|
||||
String header = " Row Acc Look Tag";
|
||||
for (n=0; n<fHeader.fCatCount; n++) {
|
||||
header += intToString(n, 5);
|
||||
}
|
||||
System.out.println(header);
|
||||
for (n=0; n<header.length(); n++) {
|
||||
System.out.print("-");
|
||||
}
|
||||
System.out.println();
|
||||
for (state=0; state< getNumStates(table); state++) {
|
||||
dumpRow(table, state);
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
/**
|
||||
* Dump (for debug) a single row of an RBBI state table
|
||||
* @param table
|
||||
* @param state
|
||||
* @internal
|
||||
*/
|
||||
private void dumpRow(short table[], int state) {
|
||||
StringBuffer dest = new StringBuffer(fHeader.fCatCount*5 + 20);
|
||||
dest.append(intToString(state, 4));
|
||||
int row = getRowIndex(state);
|
||||
if (table[row+ACCEPTING] != 0) {
|
||||
dest.append(intToString(table[row+ACCEPTING], 5));
|
||||
}else {
|
||||
dest.append(" ");
|
||||
}
|
||||
if (table[row+LOOKAHEAD] != 0) {
|
||||
System.out.println(dest);
|
||||
dest.append(intToString(table[row+LOOKAHEAD], 5));
|
||||
}else {
|
||||
dest.append(" ");
|
||||
}
|
||||
dest.append(intToString(table[row+TAGIDX], 5));
|
||||
|
||||
for (int col=0; col<fHeader.fCatCount; col++) {
|
||||
dest.append(intToString(table[row+NEXTSTATES+col], 5));
|
||||
}
|
||||
|
||||
System.out.println(dest);
|
||||
}
|
||||
|
||||
private void dumpCharCategories() {
|
||||
int n = fHeader.fCatCount;
|
||||
String catStrings[] = new String[n+1];
|
||||
int rangeStart = 0;
|
||||
int rangeEnd = 0;
|
||||
int lastCat = -1;
|
||||
int char32;
|
||||
int category;
|
||||
int lastNewline[] = new int[n+1];
|
||||
|
||||
for (category = 0; category <= fHeader.fCatCount; category ++) {
|
||||
catStrings[category] = "";
|
||||
}
|
||||
System.out.println("\nCharacter Categories");
|
||||
System.out.println("--------------------");
|
||||
for (char32 = 0; char32<=0x10ffff; char32++) {
|
||||
category = fTrie.getCodePointValue(char32);
|
||||
category &= ~0x4000; // Mask off dictionary bit.
|
||||
if (category < 0 || category > fHeader.fCatCount) {
|
||||
System.out.println("Error, bad category " + Integer.toHexString(category) +
|
||||
" for char " + Integer.toHexString(char32));
|
||||
break;
|
||||
}
|
||||
if (category == lastCat ) {
|
||||
rangeEnd = char32;
|
||||
} else {
|
||||
if (lastCat >= 0) {
|
||||
if (catStrings[lastCat].length() > lastNewline[lastCat] + 70) {
|
||||
lastNewline[lastCat] = catStrings[lastCat].length() + 10;
|
||||
catStrings[lastCat] += "\n ";
|
||||
}
|
||||
|
||||
catStrings[lastCat] += " " + Integer.toHexString(rangeStart);
|
||||
if (rangeEnd != rangeStart) {
|
||||
catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd);
|
||||
}
|
||||
}
|
||||
lastCat = category;
|
||||
rangeStart = rangeEnd = char32;
|
||||
}
|
||||
}
|
||||
catStrings[lastCat] += " " + Integer.toHexString(rangeStart);
|
||||
if (rangeEnd != rangeStart) {
|
||||
catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd);
|
||||
}
|
||||
|
||||
for (category = 0; category <= fHeader.fCatCount; category ++) {
|
||||
System.out.println (intToString(category, 5) + " " + catStrings[category]);
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
String s;
|
||||
if (args.length == 0) {
|
||||
s = "icudt28b_char.brk";
|
||||
s = "char";
|
||||
} else {
|
||||
s = args[0];
|
||||
}
|
||||
System.out.println("RBBIDataWrapper.main(" + s + ") ");
|
||||
|
||||
String versionedName =
|
||||
"icudt" + VersionInfo.ICU_VERSION.getMajor() +
|
||||
VersionInfo.ICU_VERSION.getMinor() + "b_" + s + ".brk";
|
||||
|
||||
try {
|
||||
RBBIDataWrapper This = RBBIDataWrapper.get(s);
|
||||
RBBIDataWrapper This = RBBIDataWrapper.get(versionedName);
|
||||
This.dump();
|
||||
}
|
||||
catch (Exception e) {
|
||||
|
@ -7,7 +7,9 @@
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
import java.text.StringCharacterIterator;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
|
||||
/**
|
||||
* Rule Based Break Iterator implementation.
|
||||
@ -27,7 +29,7 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
|
||||
* The rule data for this BreakIterator instance
|
||||
* @internal
|
||||
*/
|
||||
private RBBIDataWrapper fData;
|
||||
private RBBIDataWrapper fRData;
|
||||
|
||||
/** Index of the Rule {tag} values for the most recent match.
|
||||
* @internal
|
||||
@ -61,7 +63,9 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
|
||||
public Object clone()
|
||||
{
|
||||
RuleBasedBreakIterator_New result = (RuleBasedBreakIterator_New) super.clone();
|
||||
// TODO: real clone code
|
||||
if (fText != null) {
|
||||
fText = (CharacterIterator)fText.clone();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -71,8 +75,27 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public boolean equals(Object that) {
|
||||
return false; // TODO:
|
||||
}
|
||||
try {
|
||||
RuleBasedBreakIterator_New other = (RuleBasedBreakIterator_New) that;
|
||||
if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
|
||||
return false;
|
||||
}
|
||||
if (fRData != null && other.fRData != null &&
|
||||
(!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
|
||||
return false;
|
||||
}
|
||||
if (fText == null && other.fText == null) {
|
||||
return true;
|
||||
}
|
||||
if (fText == null || other.fText == null) {
|
||||
return false;
|
||||
}
|
||||
return fText.equals(other.fText);
|
||||
}
|
||||
catch(ClassCastException e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the description (rules) used to create this iterator.
|
||||
@ -81,8 +104,8 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
|
||||
*/
|
||||
public String toString() {
|
||||
String retStr = null;
|
||||
if (fData != null) {
|
||||
retStr = fData.fRuleSource;
|
||||
if (fRData != null) {
|
||||
retStr = fRData.fRuleSource;
|
||||
}
|
||||
return retStr;
|
||||
}
|
||||
@ -94,9 +117,23 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
|
||||
*/
|
||||
public int hashCode()
|
||||
{
|
||||
return 0; // TODO
|
||||
return fRData.fRuleSource.hashCode();
|
||||
}
|
||||
|
||||
|
||||
//=======================================================================
|
||||
// Constructors & Factories
|
||||
//=======================================================================
|
||||
public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
|
||||
RuleBasedBreakIterator_New This = new RuleBasedBreakIterator_New();
|
||||
This.fRData = RBBIDataWrapper.get(is);
|
||||
This.fText = new java.text.StringCharacterIterator(""); // Note: some old tests fail if fText is null
|
||||
// on a newly created instance.
|
||||
return This;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//=======================================================================
|
||||
// BreakIterator overrides
|
||||
//=======================================================================
|
||||
@ -192,8 +229,8 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
|
||||
return BreakIterator.DONE;
|
||||
}
|
||||
|
||||
if (fData.fSRTable != null || fData.fSFTable != null) {
|
||||
return handlePrevious(fData.fRTable);
|
||||
if (fRData.fSRTable != null || fRData.fSFTable != null) {
|
||||
return handlePrevious(fRData.fRTable);
|
||||
}
|
||||
|
||||
// old rule syntax
|
||||
@ -266,7 +303,7 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
|
||||
|
||||
int result = 0;
|
||||
|
||||
if (fData.fSRTable != null) {
|
||||
if (fRData.fSRTable != null) {
|
||||
// Safe Point Reverse rules exist.
|
||||
// This allows us to use the optimum algorithm.
|
||||
fText.setIndex(offset);
|
||||
@ -275,20 +312,20 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
|
||||
// this handles offset being between a supplementary character
|
||||
CINext32(fText);
|
||||
// handlePrevious will move most of the time to < 1 boundary away
|
||||
handlePrevious(fData.fSRTable);
|
||||
handlePrevious(fRData.fSRTable);
|
||||
result = next();
|
||||
while (result <= offset) {
|
||||
result = next();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
if (fData.fSFTable != null) {
|
||||
if (fRData.fSFTable != null) {
|
||||
// No Safe point reverse table, but there is a safe pt forward table.
|
||||
//
|
||||
fText.setIndex(offset);
|
||||
CIPrevious32(fText);
|
||||
// handle next will give result >= offset
|
||||
handleNext(fData.fSFTable);
|
||||
handleNext(fRData.fSFTable);
|
||||
// previous will give result 0 or 1 boundary away from offset,
|
||||
// most of the time
|
||||
// we have to
|
||||
@ -352,7 +389,7 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
|
||||
// to carry out this operation
|
||||
|
||||
int result;
|
||||
if (fData.fSFTable != null) {
|
||||
if (fRData.fSFTable != null) {
|
||||
/// todo synwee
|
||||
// new rule syntax
|
||||
fText.setIndex(offset);
|
||||
@ -360,19 +397,19 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
|
||||
// safe point.
|
||||
// this handles offset being between a supplementary character
|
||||
CIPrevious32(fText);
|
||||
handleNext(fData.fSFTable);
|
||||
handleNext(fRData.fSFTable);
|
||||
result = previous();
|
||||
while (result >= offset) {
|
||||
result = previous();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
if (fData.fSRTable != null) {
|
||||
if (fRData.fSRTable != null) {
|
||||
// backup plan if forward safe table is not available
|
||||
fText.setIndex(offset);
|
||||
CINext32(fText);
|
||||
// handle previous will give result <= offset
|
||||
handlePrevious(fData.fSRTable);
|
||||
handlePrevious(fRData.fSRTable);
|
||||
|
||||
// next will give result 0 or 1 boundary away from offset,
|
||||
// most of the time
|
||||
@ -397,6 +434,19 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
|
||||
return previous();
|
||||
}
|
||||
|
||||
/**
|
||||
* Throw IllegalArgumentException unless begin <= offset < end.
|
||||
* TODO: subclassing interface from old RBBI is not really usable.
|
||||
* What to do with old protected functions tagged as stable?
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
protected static final void checkOffset(int offset, CharacterIterator text) {
|
||||
if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
|
||||
throw new IllegalArgumentException("offset out of bounds");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns true if the specified position is a boundary position. As a side
|
||||
* effect, leaves the iterator pointing to the first boundary position at
|
||||
@ -406,8 +456,10 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public boolean isBoundary(int offset) {
|
||||
checkOffset(offset, fText);
|
||||
|
||||
// the beginning index of the iterator is always a boundary position by definition
|
||||
if (fText == null || offset == fText.getBeginIndex()) {
|
||||
if (offset == fText.getBeginIndex()) {
|
||||
first(); // For side effects on current position, tag values.
|
||||
return true;
|
||||
}
|
||||
@ -502,8 +554,8 @@ public int getRuleStatus() {
|
||||
// Status val N-1 <-- the value we need to return
|
||||
// The status values are sorted in ascending order.
|
||||
// This function returns the last (largest) of the array of status values.
|
||||
int idx = fLastRuleStatusIndex + fData.fStatusTable[fLastRuleStatusIndex];
|
||||
int tagVal = fData.fStatusTable[idx];
|
||||
int idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex];
|
||||
int tagVal = fRData.fStatusTable[idx];
|
||||
|
||||
return tagVal;
|
||||
}
|
||||
@ -532,11 +584,11 @@ public int getRuleStatus() {
|
||||
*/
|
||||
public int getRuleStatusVec(int[] fillInArray) {
|
||||
makeRuleStatusValid();
|
||||
int numStatusVals = fData.fStatusTable[fLastRuleStatusIndex];
|
||||
int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
|
||||
if (fillInArray != null) {
|
||||
int numToCopy = Math.min(numStatusVals, fillInArray.length);
|
||||
for (int i=0; i<numToCopy; i++) {
|
||||
fillInArray[i] = fData.fStatusTable[fLastRuleStatusIndex + i + 1];
|
||||
fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1];
|
||||
}
|
||||
}
|
||||
return numStatusVals;
|
||||
@ -618,8 +670,7 @@ public int getRuleStatusVec(int[] fillInArray) {
|
||||
if (ci == null) {
|
||||
return false;
|
||||
}
|
||||
int end = ci.getEndIndex();
|
||||
if (end == 0 || ci.getIndex() < end) {
|
||||
if (ci.getIndex() >= ci.getEndIndex()) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
@ -637,7 +688,7 @@ public int getRuleStatusVec(int[] fillInArray) {
|
||||
* @internal
|
||||
*/
|
||||
private int handleNext() {
|
||||
return handleNext(fData.fFTable);
|
||||
return handleNext(fRData.fFTable);
|
||||
}
|
||||
|
||||
|
||||
@ -663,7 +714,7 @@ public int getRuleStatusVec(int[] fillInArray) {
|
||||
int state = START_STATE;
|
||||
short category;
|
||||
int c = CICurrent32(fText);
|
||||
int row = fData.getRowIndex(state);
|
||||
int row = fRData.getRowIndex(state);
|
||||
int lookaheadStatus = 0;
|
||||
int lookaheadTagIdx = 0;
|
||||
|
||||
@ -671,7 +722,7 @@ public int getRuleStatusVec(int[] fillInArray) {
|
||||
|
||||
// Character Category fetch for starting character.
|
||||
// See comments on character category code within loop, below.
|
||||
category = (short)fData.fTrie.getCodePointValue(c);
|
||||
category = (short)fRData.fTrie.getCodePointValue(c);
|
||||
if ((category & 0x4000) != 0) {
|
||||
// fDictionaryCharCount++;
|
||||
category &= ~0x4000;
|
||||
@ -704,7 +755,7 @@ public int getRuleStatusVec(int[] fillInArray) {
|
||||
// look up the current character's character category, which tells us
|
||||
// which column in the state table to look at.
|
||||
//
|
||||
category = (short)fData.fTrie.getCodePointValue(c);
|
||||
category = (short)fRData.fTrie.getCodePointValue(c);
|
||||
|
||||
// Clear the dictionary flag bit in the character's category.
|
||||
// Note: not using the old style dictionary stuff in this Java engine.
|
||||
@ -725,7 +776,7 @@ public int getRuleStatusVec(int[] fillInArray) {
|
||||
// look up a state transition in the state table
|
||||
// state = row->fNextState[category];
|
||||
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
|
||||
row = fData.getRowIndex(state);
|
||||
row = fRData.getRowIndex(state);
|
||||
|
||||
// Get the next character. Doing it here positions the iterator
|
||||
// to the correct position for recording matches in the code that
|
||||
@ -793,15 +844,15 @@ public int getRuleStatusVec(int[] fillInArray) {
|
||||
* handlePrevious
|
||||
*/
|
||||
private int handlePrevious() {
|
||||
if (fText == null || fData == null) {
|
||||
if (fText == null || fRData == null) {
|
||||
return 0;
|
||||
}
|
||||
if (fData.fRTable == null) {
|
||||
if (fRData.fRTable == null) {
|
||||
fText.first();
|
||||
return fText.getIndex();
|
||||
}
|
||||
|
||||
short stateTable[] = fData.fRTable;
|
||||
short stateTable[] = fRData.fRTable;
|
||||
int state = START_STATE;
|
||||
int category;
|
||||
int lastCategory = 0;
|
||||
@ -812,8 +863,8 @@ public int getRuleStatusVec(int[] fillInArray) {
|
||||
int c = CICurrent32(fText);
|
||||
int row;
|
||||
|
||||
row = fData.getRowIndex(state);
|
||||
category = (short)fData.fTrie.getCodePointValue(c);
|
||||
row = fRData.getRowIndex(state);
|
||||
category = (short)fRData.fTrie.getCodePointValue(c);
|
||||
category &= ~0x4000; // Clear the dictionary bit, just in case.
|
||||
|
||||
if (fTrace) {
|
||||
@ -829,7 +880,7 @@ public int getRuleStatusVec(int[] fillInArray) {
|
||||
// save the last character's category and look up the current
|
||||
// character's category
|
||||
lastCategory = category;
|
||||
category = (short)fData.fTrie.getCodePointValue(c);
|
||||
category = (short)fRData.fTrie.getCodePointValue(c);
|
||||
|
||||
// Check the dictionary bit in the character's category.
|
||||
// Don't exist in this Java engine implementation. Clear the bit.
|
||||
@ -848,7 +899,7 @@ public int getRuleStatusVec(int[] fillInArray) {
|
||||
|
||||
// look up a state transition in the backwards state table
|
||||
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
|
||||
row = fData.getRowIndex(state);
|
||||
row = fRData.getRowIndex(state);
|
||||
|
||||
continueOn: {
|
||||
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == 0 &&
|
||||
@ -942,9 +993,9 @@ public int getRuleStatusVec(int[] fillInArray) {
|
||||
boolean lookAheadHardBreak =
|
||||
(stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
|
||||
|
||||
int row = fData.getRowIndex(state);
|
||||
int row = fRData.getRowIndex(state);
|
||||
|
||||
category = (short)fData.fTrie.getCodePointValue(c);
|
||||
category = (short)fRData.fTrie.getCodePointValue(c);
|
||||
category &= ~0x4000; // Mask off dictionary bit.
|
||||
|
||||
if (fTrace) {
|
||||
@ -965,7 +1016,7 @@ public int getRuleStatusVec(int[] fillInArray) {
|
||||
// save the last character's category and look up the current
|
||||
// character's category
|
||||
lastCategory = category;
|
||||
category = (short)fData.fTrie.getCodePointValue(c);
|
||||
category = (short)fRData.fTrie.getCodePointValue(c);
|
||||
|
||||
category &= ~0x4000; // Clear the dictionary bit flag
|
||||
// (Should be unused; holdover from old RBBI)
|
||||
@ -982,7 +1033,7 @@ public int getRuleStatusVec(int[] fillInArray) {
|
||||
|
||||
// look up a state transition in the backwards state table
|
||||
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
|
||||
row = fData.getRowIndex(state);
|
||||
row = fRData.getRowIndex(state);
|
||||
|
||||
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
|
||||
// Match found, common case, could have lookahead so we move on to check it
|
||||
|
Loading…
Reference in New Issue
Block a user