ICU-3295 rbbi rt port to Java, word break monkey test

X-SVN-Rev: 15150
This commit is contained in:
Andy Heninger 2004-05-05 01:17:24 +00:00
parent b57d64d5c6
commit 43ee8a9c4d
7 changed files with 256 additions and 41 deletions

View File

@ -7,13 +7,9 @@ package com.ibm.icu.dev.test.rbbi;
// Monkey testing of RuleBasedBreakIterator
import com.ibm.icu.dev.test.*;
import com.ibm.icu.text.RuleBasedBreakIterator_New;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.UCharacterIterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.impl.StringUCharacterIterator;
import com.ibm.icu.text.UnicodeSet;
import java.text.CharacterIterator;
import java.util.List;
import java.util.Arrays;
import java.util.ArrayList;
@ -49,7 +45,7 @@ public class RBBITestMonkey extends TestFmwk {
abstract List charClasses();
// Set the test text on which subsequent calls to next() will operate
abstract void setText(String text);
abstract void setText(StringBuffer text);
// Find the next break position, starting from the specified position.
// Return -1 after reaching end of string.
@ -69,7 +65,7 @@ public class RBBITestMonkey extends TestFmwk {
UnicodeSet fHangulSet;
UnicodeSet fAnySet;
String fText;
StringBuffer fText;
RBBICharMonkey() {
@ -91,7 +87,7 @@ public class RBBITestMonkey extends TestFmwk {
};
void setText(String s) {
void setText(StringBuffer s) {
fText = s;
}
@ -105,18 +101,206 @@ public class RBBITestMonkey extends TestFmwk {
}
/**
*
* Word Monkey Test Class
*
*
*
*/
static class RBBIWordMonkey extends RBBIMonkeyKind {
List fSets;
StringBuffer fText;
UnicodeSet fKatakanaSet;
UnicodeSet fALetterSet;
UnicodeSet fMidLetterSet;
UnicodeSet fMidNumLetSet;
UnicodeSet fMidNumSet;
UnicodeSet fNumericSet;
UnicodeSet fFormatSet;
UnicodeSet fExtendSet;
UnicodeSet fOtherSet;
/**
 * Builds the character-class sets used by the word-break reference
 * implementation, and the list of sets handed back by charClasses().
 */
RBBIWordMonkey() {
    // Katakana, plus the prolonged-sound and half-width voicing marks.
    fKatakanaSet = new UnicodeSet("[\\p{script=KATAKANA}\\u30fc\\uff70\\uff9e\\uff9f]");

    // ALetter: alphabetic characters minus the scripts that are excluded below.
    final String aLetterPattern =
            "[[\\p{Alphabetic}\\u05f3]-[\\p{Ideographic}]-[\\p{Script=Thai}]"
            + "-[\\p{Script=Lao}]-[\\p{Script=Hiragana}]-"
            + "[\\p{script=KATAKANA}\\u30fc\\uff70\\uff9e\\uff9f]]";
    fALetterSet = new UnicodeSet(aLetterPattern);

    fMidLetterSet = new UnicodeSet("[\\u0027\\u00b7\\u05f4\\u2019\\u2027]");
    fMidNumLetSet = new UnicodeSet("[\\u002e\\u003a]");
    fMidNumSet    = new UnicodeSet("[\\p{Line_Break=Infix_Numeric}]");
    fNumericSet   = new UnicodeSet("[\\p{Line_Break=Numeric}]");
    fFormatSet    = new UnicodeSet("[\\p{Format}-\\p{Grapheme_Extend}]");
    fExtendSet    = new UnicodeSet("[\\p{Grapheme_Extend}]");

    // "Other" starts as every code point, then loses the classes listed here.
    // NOTE(review): fFormatSet and fExtendSet are not subtracted, so they
    // overlap fOtherSet -- confirm that this is intended.
    fOtherSet = new UnicodeSet();
    fOtherSet.complement();
    final UnicodeSet[] claimed = {
        fKatakanaSet, fALetterSet, fMidLetterSet,
        fMidNumLetSet, fMidNumSet, fNumericSet
    };
    for (int k = 0; k < claimed.length; k++) {
        fOtherSet.removeAll(claimed[k]);
    }

    // Sets exposed through charClasses(), in this exact order.
    // NOTE(review): fKatakanaSet and fExtendSet are not listed -- confirm
    // whether they were left out on purpose.
    final UnicodeSet[] exposed = {
        fALetterSet, fMidLetterSet, fMidNumLetSet, fMidNumSet,
        fNumericSet, fFormatSet, fOtherSet
    };
    fSets = new ArrayList();
    for (int k = 0; k < exposed.length; k++) {
        fSets.add(exposed[k]);
    }
}
List charClasses() {
return null; // TODO:
return fSets;
}
void setText(String text) { // TODO:
void setText(StringBuffer s) {
fText = s;
}
int next(int i) { // TODO:
return 0;
/**
 * Reference implementation of "find the next word break": starting from a
 * known break at prevPos, walks forward through fText applying the numbered
 * word-boundary rules below until a position is found where no rule
 * suppresses a break.
 *
 * @param prevPos index into fText of the previous break
 * @return the index of the next break, or -1 if prevPos is already at or
 *         past the end of fText
 */
int next(int prevPos) {
int p0, p1, p2, p3; // Indices of the significant code points around the
// break position being tested. The candidate break
// location is before p2.
// NOTE(review): p0 is assigned below but never read.
int breakPos = -1;
int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
// Previous break was at the end of the string: nothing left, return -1 (DONE).
if (prevPos >= fText.length()) {
return -1;
}
// All four positions start at the previous break; c3 holds the code point
// there, and the loop below shifts the window forward.
p0 = p1 = p2 = p3 = prevPos;
c3 = UTF16.charAt(fText, prevPos);
c0 = c1 = c2 = 0;
// Format char after prev break? Special case, see last Note for Word Boundaries TR.
// break immediately after the format char.
// NOTE(review): breakPos is still -1 here (it is only assigned after the
// loop), so this condition can never be true and the block never runs.
// Confirm whether prevPos was intended, or whether the block should go.
if (breakPos >= 0 && fFormatSet.contains(c3) && breakPos < (fText.length() -1)) {
breakPos = UTF16.moveCodePointOffset(fText, breakPos, 1);
return breakPos;
}
// Loop runs once per "significant" character position in the input text.
for (;;) {
// Move all of the positions forward in the input string.
p0 = p1; c0 = c1;
p1 = p2; c1 = c2;
p2 = p3; c2 = c3;
// Advance p3 by (GC Format*) Rules 3, 4
// i.e. over one grapheme cluster and then over any format characters
// that follow it. c3 == 0 stands for "end of text".
p3 = nextGC(fText, p3);
if (p3 == -1 || p3 >= fText.length()) {
p3 = fText.length();
c3 = 0;
} else {
c3 = UTF16.charAt(fText, p3);
while (fFormatSet.contains(c3)) {
p3 = moveIndex32(fText, p3, 1);
c3 = 0;
if (p3 < fText.length()) {
c3 = UTF16.charAt(fText, p3);
}
}
}
// True on the first pass, when p1 and p2 both still equal prevPos.
if (p1 == p2) {
// Still warming up the loop. (won't work with zero length strings, but we don't care)
continue;
}
if (p2 == fText.length()) {
// Reached end of string. Always a break position.
break;
}
// Rule (5). ALetter x ALetter
if (fALetterSet.contains(c1) &&
fALetterSet.contains(c2)) {
continue;
}
// Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
//
// This only suppresses the break before the Mid character. The break
// before the terminating ALetter is suppressed by the Rule (7) check
// on the next pass of the loop; nothing is skipped ahead here.
if ( fALetterSet.contains(c1) &&
(fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2)) &&
fALetterSet.contains(c3)) {
continue;
}
// Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
if (fALetterSet.contains(c0) &&
(fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1) ) &&
fALetterSet.contains(c2)) {
continue;
}
// Rule (8) Numeric x Numeric
if (fNumericSet.contains(c1) &&
fNumericSet.contains(c2)) {
continue;
}
// Rule (9) ALetter x Numeric
if (fALetterSet.contains(c1) &&
fNumericSet.contains(c2)) {
continue;
}
// Rule (10) Numeric x ALetter
if (fNumericSet.contains(c1) &&
fALetterSet.contains(c2)) {
continue;
}
// Rule (11) Numeric (MidNum | MidNumLet) x Numeric
if ( fNumericSet.contains(c0) &&
(fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1)) &&
fNumericSet.contains(c2)) {
continue;
}
// Rule (12) Numeric x (MidNum | MidNumLet) Numeric
if (fNumericSet.contains(c1) &&
(fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2)) &&
fNumericSet.contains(c3)) {
continue;
}
// Rule (13) Katakana x Katakana
if (fKatakanaSet.contains(c1) &&
fKatakanaSet.contains(c2)) {
continue;
}
// Rule 14. Break found here.
break;
}
// Rule 4 fixup, back up before any trailing
// format characters at the end of the word.
// p2 sits after any format characters that were skipped while advancing,
// so prefer the end of the grapheme cluster that starts at p1. The
// (t > p1) test also rejects a -1 result from nextGC.
breakPos = p2;
int t = nextGC(fText, p1);
if (t > p1) {
breakPos = t;
}
return breakPos;
}
}
@ -125,7 +309,7 @@ public class RBBITestMonkey extends TestFmwk {
return null; // TODO:
}
void setText(String text) { // TODO:
void setText(StringBuffer text) { // TODO:
}
int next(int i) { // TODO:
@ -134,13 +318,33 @@ public class RBBITestMonkey extends TestFmwk {
}
/**
 * Advance an index into a string by a number of code points.
 * A thin guard around UTF16.moveCodePointOffset: a starting index that lies
 * outside the string is handed back untouched instead of being passed on.
 * @param s   the text
 * @param i   the starting code unit index into the text
 * @param amt the number of code points to move by
 * @return i itself when i is negative or not less than s.length();
 *         otherwise whatever UTF16.moveCodePointOffset(s, i, amt) returns.
 *         NOTE(review): whether that result is pinned to the string's length
 *         depends on UTF16.moveCodePointOffset -- confirm against its docs.
 */
static int moveIndex32(StringBuffer s, int i, int amt) {
    final boolean insideText = (0 <= i) && (i < s.length());
    return insideText ? UTF16.moveCodePointOffset(s, i, amt) : i;
}
/**
* return the index of the next code point in the input text.
* @param i the preceding index
* @return
* @internal
*/
static int nextCP(String s, int i) {
static int nextCP(StringBuffer s, int i) {
if (i == -1) {
// End of Input indication. Continue to return end value.
return -1;
@ -188,7 +392,7 @@ public class RBBITestMonkey extends TestFmwk {
* @return The index of the first code point following the grapheme cluster
* @internal
*/
private static int nextGC(String s, int i) {
private static int nextGC(StringBuffer s, int i) {
if (i >= s.length() || i == -1 ) {
return -1;
}
@ -368,7 +572,7 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int
// Debugging settings. Comment out everything in the following block for normal operation
//
//--------------------------------------------------------------------------------------------
// numIterations = -1;
// numIterations = 20;
//RuleBasedBreakIterator_New.fTrace = true;
//m_seed = -1324359431;
// TESTSTRINGLEN = 50;
@ -426,7 +630,7 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int
Arrays.fill(isBoundaryBreaks, false);
// Calculate the expected results for this test string.
mk.setText(testText.toString());
mk.setText(testText);
expectedCount = 0;
expectedBreaks[0] = true;
expected[expectedCount ++] = 0;
@ -527,8 +731,7 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int
String hexChars = "0123456789abcdef";
int c; // Char from test data
int bn;
String testData = testText.toString();
for (ci = startContext; ci <= endContext && ci != -1; ci = nextCP(testData, ci)) {
for (ci = startContext; ci <= endContext && ci != -1; ci = nextCP(testText, ci)) {
if (ci == i) {
// This is the location of the error.
errorText.append("<?>");
@ -536,8 +739,8 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int
// This is a non-error expected break position.
errorText.append("<>");
}
if (ci < testData.length()) {
c = UTF16.charAt(testData, ci);
if (ci < testText.length()) {
c = UTF16.charAt(testText, ci);
if (c < 0x10000) {
errorText.append("\\u");
for (bn=12; bn>=0; bn-=4) {
@ -551,7 +754,7 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int
}
}
}
if (ci == testData.length() && ci != -1) {
if (ci == testText.length() && ci != -1) {
errorText.append("<>");
}
errorText.append("</data>\n");
@ -573,7 +776,6 @@ public void TestCharMonkey() {
int loopCount = 500;
int seed = 1;
String breakType = "all";
if (params.inclusion >= 9) {
loopCount = 10000;
@ -588,7 +790,6 @@ public void TestWordMonkey() {
int loopCount = 500;
int seed = 1;
String breakType = "all";
if (params.inclusion >= 9) {
loopCount = 10000;
@ -597,7 +798,7 @@ public void TestWordMonkey() {
logln("Word Break Monkey Test");
RBBIWordMonkey m = new RBBIWordMonkey();
BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
//RunMonkey(bi, m, "word", seed, loopCount);
RunMonkey(bi, m, "word", seed, loopCount);
}
public void TestLineMonkey() {

View File

@ -120,18 +120,32 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
iter = new RuleBasedBreakIterator_Old(rules);
}
else if (classNames[kind].equals("RuleBasedBreakIterator_New")) {
try {
// Class for new RBBI engine.
// Set up path to precompiled rule data.
String rulesFileName =
"data/icudt" + VersionInfo.ICU_VERSION.getMajor() +
VersionInfo.ICU_VERSION.getMinor() + "b_" + KIND_NAMES_2[kind] + ".brk";
InputStream is = ICUData.getRequiredStream(rulesFileName);
iter = RuleBasedBreakIterator_New.getInstanceFromCompiledRules(is);
}
catch (IOException e) {
throw new IllegalArgumentException(e.toString());
}
try {
// Class for new RBBI engine.
// Open a stream to the .brk file. Path to the brk files has this form:
// data/icudt30b/line.brk (30 is version number)
String rulesFileName =
"data/icudt" + VersionInfo.ICU_VERSION.getMajor() +
VersionInfo.ICU_VERSION.getMinor() + "b/" +
KIND_NAMES_2[kind] + ".brk";
InputStream is = ICUData.getStream(rulesFileName);
if (is == null) {
// Temporary!!! Try again with break files named data/icudt28b_char.brk
// (or word, line, etc.) This was a temporary location
// used during development, this code can be removed once
// the data is in the data directory, above. TODO: remove
// the following code, make this catch turn around and throw.
rulesFileName =
"data/icudt" + VersionInfo.ICU_VERSION.getMajor() +
VersionInfo.ICU_VERSION.getMinor() + "b_" +
KIND_NAMES_2[kind] + ".brk";
is = ICUData.getRequiredStream(rulesFileName);
}
iter = RuleBasedBreakIterator_New.getInstanceFromCompiledRules(is);
}
catch (IOException e) {
throw new IllegalArgumentException(e.toString());
}
}
else if (classNames[kind].equals("DictionaryBasedBreakIterator")) {
try {

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Attic/RuleBasedBreakIterator_Old.java,v $
* $Date: 2004/04/12 22:08:32 $
* $Revision: 1.1 $
* $Date: 2004/05/05 01:17:23 $
* $Revision: 1.2 $
*
*****************************************************************************************
*/
@ -229,8 +229,8 @@ import java.io.*;
* &nbsp; For examples, see the resource data (which is annotated).</p>
*
* @author Richard Gillam
* @stable ICU 2.0
* $RCSfile: RuleBasedBreakIterator_Old.java,v $ $Revision: 1.1 $ $Date: 2004/04/12 22:08:32 $
* @internal ICU 2.0
* $RCSfile: RuleBasedBreakIterator_Old.java,v $ $Revision: 1.2 $ $Date: 2004/05/05 01:17:23 $
*/
public class RuleBasedBreakIterator_Old extends RuleBasedBreakIterator {