ICU-9715 Improve RBBI next performance by about 5% (35% over ICU4J 50.1 for this ticket).

X-SVN-Rev: 32797
This commit is contained in:
George Rhoten 2012-11-12 00:49:43 +00:00
parent bc31ae8173
commit e076e3a5de
4 changed files with 40 additions and 46 deletions
icu4j/main
classes/core/src/com/ibm/icu
tests/core/src/com/ibm/icu/dev/test/rbbi

View File

@ -17,7 +17,7 @@ public final class CharacterIteration {
// 32-bit char value returned when an iterator has run out of range.
// Positive value so fast case (not end, not surrogate) can be checked
// with a single test.
public static int DONE32 = 0x7fffffff;
public static final int DONE32 = 0x7fffffff;
/**
* Move the iterator forward to the next code point, and return that code point,

View File

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 1996-2011, International Business Machines Corporation and *
* Copyright (C) 1996-2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@ -335,6 +335,10 @@ final class RBBIDataWrapper {
///CLOVER:OFF
/* Debug function to display the break iterator data. */
void dump() {
if (fFTable.length == 0) {
// There is no table. Fail early for testing purposes.
throw new NullPointerException();
}
System.out.println("RBBI Data Wrapper dump ...");
System.out.println();
System.out.println("Forward State Table");

View File

@ -24,6 +24,7 @@ import java.util.Set;
import java.util.Stack;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.CharTrie;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
@ -44,7 +45,10 @@ public class RuleBasedBreakIterator extends BreakIterator {
* @internal
* @deprecated This API is ICU internal only.
*/
public RuleBasedBreakIterator() {
private RuleBasedBreakIterator() {
fLastStatusIndexValid = true;
fDictionaryCharCount = 0;
fBreakEngines.add(fUnhandledBreakEngine);
}
/**
@ -74,7 +78,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
* @stable ICU 2.2
*/
public RuleBasedBreakIterator(String rules) {
init();
this();
try {
ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
compileRules(rules, ruleOS);
@ -248,7 +252,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// are. The state machine only fetches user text input while in RUN mode.
private static final int RBBI_START = 0;
private static final int RBBI_RUN = 1;
private static final int RBBI_END = 2;
private static final int RBBI_END = 2;
/*
* The character iterator through which this BreakIterator accesses the text.
@ -260,7 +264,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
* @internal
* @deprecated This API is ICU internal only.
*/
RBBIDataWrapper fRData;
RBBIDataWrapper fRData;
/*
* Index of the Rule {tag} values for the most recent match.
@ -344,12 +348,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
this.fRData.dump();
}
/**
* Puts the iterator's bookkeeping fields into their starting state:
* marks the last rule-status index as valid, zeroes the dictionary
* character count, and registers fUnhandledBreakEngine in fBreakEngines.
*/
private void init() {
fLastStatusIndexValid = true;
fDictionaryCharCount = 0;
// Appends on every call; this method does not check for an existing entry.
fBreakEngines.add(fUnhandledBreakEngine);
}
/**
* Compile a set of source break rules into the binary state tables used
* by the break iterator engine. Creating a break iterator from precompiled
@ -1005,7 +1003,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
* @internal
* @deprecated This API is ICU internal only.
*/
protected LanguageBreakEngine getEngineFor(int c) {
private LanguageBreakEngine getEngineFor(int c) {
if (c == DONE32 || !fUseDictionary) {
return null;
}
@ -1063,7 +1061,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// handleNext(void) All forward iteration vectors through this function.
//
//-----------------------------------------------------------------------------------
int handleNext() {
private int handleNext() {
// if there are no cached break positions, or if we've just moved
// off the end of the range covered by the cache, we have to dump
// and possibly regenerate the cache
@ -1087,12 +1085,13 @@ public class RuleBasedBreakIterator extends BreakIterator {
Stack<Integer> breaks = new Stack<Integer>();
e.findBreaks(fText, startPos, result, false, getBreakType(), breaks);
fCachedBreakPositions = new int[breaks.size() + 2];
int breaksSize = breaks.size();
fCachedBreakPositions = new int[breaksSize + 2];
fCachedBreakPositions[0] = startPos;
for (int i = 0; i < breaks.size(); i++) {
for (int i = 0; i < breaksSize; i++) {
fCachedBreakPositions[i + 1] = breaks.elementAt(i).intValue();
}
fCachedBreakPositions[breaks.size() + 1] = result;
fCachedBreakPositions[breaksSize + 1] = result;
fPositionInCache = 0;
} else {
@ -1148,18 +1147,11 @@ public class RuleBasedBreakIterator extends BreakIterator {
fLastStatusIndexValid = true;
fLastRuleStatusIndex = 0;
// if we're already at the end of the text, return DONE.
if (fText == null) {
return BreakIterator.DONE;
}
// caches for quicker access
CharacterIterator text = fText;
short flagsState = stateTable[RBBIDataWrapper.FLAGS+1];
CharTrie trie = fRData.fTrie;
// Set up the starting char
int initialPosition = text.getIndex();
int result = initialPosition;
int c = text.current();
if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
c = nextTrail32(text, c);
@ -1167,11 +1159,14 @@ public class RuleBasedBreakIterator extends BreakIterator {
return BreakIterator.DONE;
}
}
int initialPosition = text.getIndex();
int result = initialPosition;
// Set the initial state for the state machine
int state = START_STATE;
int row = fRData.getRowIndex(state);
short category = 3;
short flagsState = stateTable[RBBIDataWrapper.FLAGS+1];
int mode = RBBI_RUN;
if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
category = 2;
@ -1185,8 +1180,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
int lookaheadStatus = 0;
int lookaheadTagIdx = 0;
int lookaheadResult = 0;
boolean lookAheadHardBreak =
(flagsState & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
// loop until we reach the end of the text or transition to state 0
while (state != STOP_STATE) {
@ -1205,11 +1198,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
// the match at the / position from the look-ahead rule.
result = lookaheadResult;
fLastRuleStatusIndex = lookaheadTagIdx;
} else if (result == initialPosition) {
// Ran off end, no match found.
// move forward one
text.setIndex(initialPosition);
next32(text);
}
break;
}
@ -1226,7 +1214,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// look up the current character's character category, which tells us
// which column in the state table to look at.
//
category = (short) fRData.fTrie.getCodePointValue(c);
category = (short) trie.getCodePointValue(c);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators (subclasses).
@ -1283,7 +1271,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
fLastRuleStatusIndex = lookaheadTagIdx;
lookaheadStatus = 0;
// TODO: make a standalone hard break in a rule work.
if (lookAheadHardBreak) {
if ((flagsState & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0) {
text.setIndex(result);
return result;
}
@ -1312,22 +1300,24 @@ public class RuleBasedBreakIterator extends BreakIterator {
// The state machine is done. Check whether it found a match...
// If the iterator failed to advance in the match engine, force it ahead by one.
// (This really indicates a defect in the break rules. They should always match
// at least one character.)
// If c == DONE32 we ran off the end as normal, no match found. Move forward one.
// If the iterator failed to advance in the match engine when c != DONE32,
// force it ahead by one. (This second condition really indicates a defect
// in the break rules. They should always match at least one character.)
if (result == initialPosition) {
if (TRACE) {
if (TRACE && c != DONE32) {
System.out.println("Iterator did not move. Advancing by 1.");
}
result = text.setIndex(initialPosition);
text.setIndex(initialPosition);
next32(text);
result = text.getIndex();
}
// Leave the iterator at our result position.
// (we may have advanced beyond the last accepting position chasing after
// longer matches that never completed.)
text.setIndex(result);
else {
// Leave the iterator at our result position.
// (we may have advanced beyond the last accepting position chasing after
// longer matches that never completed.)
text.setIndex(result);
}
if (TRACE) {
System.out.println("result = " + result);
}

View File

@ -694,7 +694,7 @@ public class RBBITest extends TestFmwk {
/* Tests the method public Object clone() */
public void TestClone() {
RuleBasedBreakIterator rbbi = new RuleBasedBreakIterator();
RuleBasedBreakIterator rbbi = new RuleBasedBreakIterator("");
try {
rbbi.setText((CharacterIterator) null);
if (((RuleBasedBreakIterator) rbbi.clone()).getText() != null)
@ -745,7 +745,7 @@ public class RBBITest extends TestFmwk {
* Tests the method public void dump()
*/
public void TestDump() {
RuleBasedBreakIterator rbbi = new RuleBasedBreakIterator();
RuleBasedBreakIterator rbbi = new RuleBasedBreakIterator("");
try {
rbbi.dump();
errln("RuleBasedBreakIterator.dump() was suppose to return "