ICU-1560 implement smart rollback

X-SVN-Rev: 7155
This commit is contained in:
Alan Liu 2001-11-29 01:01:31 +00:00
parent 8eff706d84
commit adeb7ac274
7 changed files with 700 additions and 418 deletions

View File

@ -178,7 +178,7 @@ int32_t Transliterator::transliterate(Replaceable& text,
offsets.contextLimit = limit;
offsets.start = start;
offsets.limit = limit;
filteredTransliterate(text, offsets, FALSE);
filteredTransliterate(text, offsets, FALSE, TRUE);
return offsets.limit;
@ -317,7 +317,7 @@ void Transliterator::finishTransliteration(Replaceable& text,
filteredTransliterate(text, index, FALSE);
filteredTransliterate(text, index, FALSE, TRUE);
@ -356,7 +356,7 @@ void Transliterator::_transliterate(Replaceable& text,
filteredTransliterate(text, index, TRUE);
filteredTransliterate(text, index, TRUE, TRUE);
#if 0
@ -393,18 +393,6 @@ void Transliterator::_transliterate(Replaceable& text,
* Rollback makes global filters and compound transliterators very
* bulletproof, but it also makes some transliterators completely
* non-incremental -- that is, for some transliterators, rollback
* is always triggered, until finishTransliteration() is called.
* Since this eliminates most of the usefulness of incremental
* mode, rollback should usually be disabled.
* This is used by Transliterator and CompoundTransliterator.
* This method breaks up the input text into runs of unfiltered
* characters. It passes each such run to
@ -415,13 +403,33 @@ void Transliterator::_transliterate(Replaceable& text,
void Transliterator::filteredTransliterate(Replaceable& text,
UTransPosition& index,
UBool incremental) const {
if (filter == 0) {
// Short circuit path for transliterators with no filter
UBool incremental,
UBool rollback) const {
// Short circuit path for transliterators with no filter when
// rollback has not been requested.
if (filter == 0 && !rollback) {
handleTransliterate(text, index, incremental);
// This method processes text in two groupings:
// RUNS -- A run is a contiguous group of characters which are contained
// in the filter for this transliterator (filter.contains(ch) == true).
// Text outside of runs may appear as context but it is not modified.
// The start and limit Position values are narrowed to each run.
// PASSES (incremental only) -- To make incremental mode work correctly,
// each run is broken up into n passes, where n is the length (in code
// points) of the run. Each pass contains the first n characters. If a
// pass is completely transliterated, it is committed, and further passes
// include characters after the committed text. If a pass is blocked,
// and does not transliterate completely, then this method rolls back
// the changes made during the pass, extends the pass by one code point,
// and tries again.
// globalLimit is the limit value for the entire operation. We
// set index.limit to the end of each unfiltered run before
// calling handleTransliterate(), so we need to maintain the real
@ -429,33 +437,36 @@ void Transliterator::filteredTransliterate(Replaceable& text,
// update globalLimit for insertions or deletions that have
// happened.
int32_t globalLimit = index.limit;
// Break the input text up. Say the input text has the form:
// If there is a non-null filter, then break the input text up. Say the
// input text has the form:
// xxxabcxxdefxx
// where 'x' represents a filtered character. Then we break this
// up into:
// where 'x' represents a filtered character (filter.contains('x') ==
// false). Then we break this up into:
// xxxabc xxdef xx
// Each pass through the loop consumes a run of filtered
// characters (which are ignored) and a subsequent run of
// unfiltered characters (which are transliterated). If, at any
// point, we fail to consume our entire segment, we stop.
// unfiltered characters (which are transliterated).
for (;;) {
// Narrow the range to be transliterated to the first segment
// of unfiltered characters at or after index.start.
UChar32 c;
if (filter != NULL) {
// Narrow the range to be transliterated to the first segment
// of unfiltered characters at or after index.start.
// Advance compoundStart past filtered chars
while (index.start < globalLimit &&
!filter->contains(c=text.char32At(index.start))) {
index.start += UTF_CHAR_LENGTH(c);
// Advance past filtered chars
UChar32 c;
while (index.start < globalLimit &&
!filter->contains(c=text.char32At(index.start))) {
index.start += UTF_CHAR_LENGTH(c);
// Find the end of this run of unfiltered chars
index.limit = index.start;
while (index.limit < globalLimit &&
filter->contains(c=text.char32At(index.limit))) {
index.limit += UTF_CHAR_LENGTH(c);
// Find the end of this run of unfiltered chars
index.limit = index.start;
while (index.limit < globalLimit &&
filter->contains(c=text.char32At(index.limit))) {
index.limit += UTF_CHAR_LENGTH(c);
// Check to see if the unfiltered run is empty. This only
@ -466,15 +477,15 @@ void Transliterator::filteredTransliterate(Replaceable& text,
int32_t limit = index.limit;
// Is this segment incremental? If there is additional
// Is this run incremental? If there is additional
// filtered text (if limit < globalLimit) then we pass in
// an incremental value of FALSE to force the subclass to
// complete the transliteration for this segment.
UBool isIncrementalSegment =
(limit < globalLimit ? FALSE : incremental);
// complete the transliteration for this run.
UBool isIncrementalRun =
(index.limit < globalLimit ? FALSE : incremental);
int32_t delta;
// Implement rollback. To understand the need for rollback,
// consider the following transliterator:
@ -495,88 +506,147 @@ void Transliterator::filteredTransliterate(Replaceable& text,
// transformation in incremental mode into characters outside its
// filter.
// There are three solutions. The first is to add two new index
// values to the position structure, a filteredStart and a
// filteredLimit. Then filteredTransliterate() can set and read
// these, and avoid filtering partially transliterated results. A
// variant of this solution is to retain an internal state object
// with the filtered range that is indexed by the text pointer and
// the position object pointer, in analogy to strtok(). The third
// solution involves no change to the API and no internal state
// cache. It is to roll back any partially transliterated results
// if (a) there is a filter, and (b) the transliteration is
// incremental. This is the solution implemented here.
int32_t rollbackStart = 0;
int32_t rollbackCopy = 0;
if (isIncrementalSegment) {
// To handle this, when in incremental mode we supply characters to
// handleTransliterate() in several passes. Each pass adds one more
// input character to the input text. That is, for input "ABCD", we
// first try "A", then "AB", then "ABC", and finally "ABCD". If at
// any point we block (upon return, start < limit) then we roll
// back. If at any point we complete the run (upon return start ==
// limit) then we commit that run.
if (rollback && isIncrementalRun) {
int32_t runStart = index.start;
int32_t runLimit = index.limit;
int32_t runLength = runLimit - runStart;
// Make a rollback copy at the end of the string
rollbackStart = index.start;
rollbackCopy = text.length();
text.copy(rollbackStart, limit, rollbackCopy);
// Delegate to subclass for actual transliteration.
handleTransliterate(text, index, isIncrementalSegment);
int32_t delta = index.limit - limit; // change in length
// Adjust overall limit for insertions/deletions. Don't need
// to worry about contextLimit because handleTransliterate()
// maintains that.
globalLimit += delta;
int32_t rollbackOrigin = text.length();
text.copy(runStart, runLimit, rollbackOrigin);
// If we failed to complete transliterate this segment,
// then we are done. If rollback is required, then do so.
if (index.start != index.limit) {
if (isIncrementalSegment) {
// Replace [rollbackStart, limit) -- this is the
// original filtered segment -- with
// [rollbackCopy, text.length()), the rollback
// copy, then delete the rollback copy.
rollbackCopy += delta;
int32_t rollbackLen = text.length() - rollbackCopy;
// Variables reflecting the commitment of completely
// transliterated text. passStart is the runStart, advanced
// past committed text. rollbackStart is the rollbackOrigin,
// advanced past rollback text that corresponds to committed
// text.
int32_t passStart = runStart;
int32_t rollbackStart = rollbackOrigin;
// Delete the partially transliterated segment
rollbackCopy -= index.limit - rollbackStart;
text.handleReplaceBetween(rollbackStart, index.limit, EMPTY);
// The limit for each pass; we advance by one code point with
// each iteration.
int32_t passLimit = index.start;
// Copy the rollback copy back
text.copy(rollbackCopy, text.length(), rollbackStart);
// Delete the rollback copy
rollbackCopy += rollbackLen;
text.handleReplaceBetween(rollbackCopy, text.length(), EMPTY);
// Restore indices
index.start = rollbackStart;
index.limit = limit;
index.contextLimit -= delta;
globalLimit -= delta;
// Total length, in 16-bit code units, of uncommitted text.
// This is the length to be rolled back.
int32_t uncommittedLength = 0;
// Total delta (change in length) for all passes
int32_t totalDelta = 0;
// PASS MAIN LOOP -- Start with a single character, and extend
// the text by one character at a time. Roll back partial
// transliterations and commit complete transliterations.
for (;;) {
// Length of additional code point, either one or two
int32_t charLength =
passLimit += charLength;
if (passLimit > runLimit) {
uncommittedLength += charLength;
index.limit = passLimit;
// Delegate to subclass for actual transliteration. Upon
// return, start will be updated to point after the
// transliterated text, and limit and contextLimit will be
// adjusted for length changes.
handleTransliterate(text, index, true);
delta = index.limit - passLimit; // change in length
// We failed to completely transliterate this pass.
// Roll back the text. Indices remain unchanged; reset
// them where necessary.
if (index.start != index.limit) {
// Find the rollbackStart, adjusted for length changes
// and the deletion of partially transliterated text.
int32_t rs = rollbackStart + delta - (index.limit - passStart);
// Delete the partially transliterated text
text.handleReplaceBetween(passStart, index.limit, EMPTY);
// Copy the rollback text back
text.copy(rs, rs + uncommittedLength, passStart);
// Restore indices to their original values
index.start = passStart;
index.limit = passLimit;
index.contextLimit -= delta;
// We did completely transliterate this pass. Update the
// commit indices to record how far we got. Adjust indices
// for length change.
else {
// Move the pass indices past the committed text.
passStart = passLimit = index.start;
// Adjust the rollbackStart for length changes and move
// it past the committed text. All characters we've
// processed to this point are committed now, so zero
// out the uncommittedLength.
rollbackStart += delta + uncommittedLength;
uncommittedLength = 0;
// Adjust indices for length changes.
runLimit += delta;
totalDelta += delta;
} else if (isIncrementalSegment) {
// We finished this segment; delete the rollback copy
rollbackCopy += delta;
text.handleReplaceBetween(rollbackCopy, text.length(), EMPTY);
// Adjust overall limit and rollbackOrigin for insertions and
// deletions. Don't need to worry about contextLimit because
// handleTransliterate() maintains that.
rollbackOrigin += totalDelta;
globalLimit += totalDelta;
// Delete the rollback copy
text.handleReplaceBetween(rollbackOrigin, rollbackOrigin + runLength, EMPTY);
// If we failed to complete transliterate this segment,
else {
// Delegate to subclass for actual transliteration.
int32_t limit = index.limit;
handleTransliterate(text, index, isIncrementalRun);
delta = index.limit - limit; // change in length
// Adjust overall limit for insertions/deletions. Don't need
// to worry about contextLimit because handleTransliterate()
// maintains that.
globalLimit += delta;
// If we failed to completely transliterate this run,
// then we are done.
if (index.start != index.limit) {
// If we did completely transliterate this
// segment, then repeat with the next unfiltered segment.
// run, then repeat with the next unfiltered run.
// Start is valid where it is. Limit needs to be put back where
// it was, modulo adjustments for deletions/insertions.
index.limit = globalLimit;
index.limit = globalLimit;
void Transliterator::filteredTransliterate(Replaceable& text,
UTransPosition& index,
UBool incremental) const {
filteredTransliterate(text, index, incremental, FALSE);

View File

@ -573,22 +573,54 @@ protected:
UBool incremental) const = 0;
* Core transliteration method called by all other methods in
* Transliterator. This method splits up the input text into
* segments of unfiltered text and passes those to
* handleTransliterate(). For most subclasses this is convenient
* and efficient. Subclasses that can more efficiently handle the
* filter logic on their own (rare) can override
* filteredTransliterate(). Such subclasses must still implement
* handleTransliterate() but they can do so with an empty body,
* since filteredTransliterate() is the only method that calls
* handleTransliterate().
* Transliterate a substring of text, as specified by index, taking filters
* into account. This method is for subclasses that need to delegate to
* another transliterator, such as CompoundTransliterator.
* @param text the text to be transliterated
* @param index the position indices
* @param incremental if TRUE, then assume more characters may be inserted
* at index.limit, and postpone processing to accommodate future incoming
* characters
virtual void filteredTransliterate(Replaceable& text,
UTransPosition& index,
UBool incremental) const;
friend class CompoundTransliterator; // for filteredTransliterate
friend class CompoundTransliterator; // for filteredTransliterate()
* Top-level transliteration method, handling filtering, incremental and
* non-incremental transliteration, and rollback. All transliteration
* public API methods eventually call this method with a rollback argument
* of TRUE. Other entities may call this method but rollback should be
* <p>If this transliterator has a filter, break up the input text into runs
* of unfiltered characters. Pass each run to
* <subclass>.handleTransliterate().
* <p>In incremental mode, if rollback is TRUE, perform a special
* incremental procedure in which several passes are made over the input
* text, adding one character at a time, and committing successful
* transliterations as they occur. Unsuccessful transliterations are rolled
* back and retried with additional characters to give correct results.
* @param text the text to be transliterated
* @param index the position indices
* @param incremental if TRUE, then assume more characters may be inserted
* at index.limit, and postpone processing to accommodate future incoming
* characters
* @param rollback if TRUE and if incremental is TRUE, then perform special
* incremental processing, as described above, and undo partial
* transliterations where necessary. If incremental is FALSE then this
* parameter is ignored.
virtual void filteredTransliterate(Replaceable& text,
UTransPosition& index,
UBool incremental,
UBool rollback) const;

View File

@ -394,12 +394,12 @@ void TransliteratorTest::TestKeyboard2(void) {
// insertion, buffer
"a", "A",
"p", "Ap",
"s", "Ay",
"c", "Ayc",
"s", "Aps", // modified for rollback - "Ay",
"c", "Apsc", // modified for rollback - "Ayc",
"a", "AycA",
"p", "AycAp",
"s", "AycAy",
"c", "AycAyc",
"s", "AycAps", // modified for rollback - "AycAy",
"c", "AycApsc", // modified for rollback - "AycAyc",
"h", "AycAY",
0, "AycAY", // null means finishKeyboardTransliteration
@ -423,9 +423,9 @@ void TransliteratorTest::TestKeyboard3(void) {
// keyboard xliteration.
"a", "a",
"b", "ab",
"t", "aby",
"t", "abt", // modified for rollback - "aby",
"c", "abyc",
"t", "abycy",
"t", "abyct", // modified for rollback - "abycy",
"h", "abycz",
0, "abycz", // null means finishKeyboardTransliteration
@ -2110,7 +2110,7 @@ void TransliteratorTest::TestNewEngine() {
delete t;
#if 0
#if 1
// This test will only work if Transliterator.ROLLBACK is
// true. Otherwise, this test will fail, revealing a
// limitation of global filters in incremental mode.

View File

@ -5,8 +5,8 @@
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/,v $
* $Date: 2001/11/28 17:40:40 $
* $Revision: 1.82 $
* $Date: 2001/11/29 01:00:29 $
* $Revision: 1.83 $
@ -267,12 +267,12 @@ public class TransliteratorTest extends TestFmwk {
// insertion, buffer
"a", "A",
"p", "Ap",
"s", "Ay",
"c", "Ayc",
"s", "Aps", // modified for rollback - "Ay",
"c", "Apsc", // modified for rollback - "Ayc",
"a", "AycA",
"p", "AycAp",
"s", "AycAy",
"c", "AycAyc",
"s", "AycAps", // modified for rollback - "AycAy",
"c", "AycApsc", // modified for rollback - "AycAyc",
"h", "AycAY",
null, "AycAY", // null means finishKeyboardTransliteration
@ -298,9 +298,9 @@ public class TransliteratorTest extends TestFmwk {
// keyboard xliteration.
"a", "a",
"b", "ab",
"t", "aby",
"t", "abt", // modified for rollback - "aby",
"c", "abyc",
"t", "abycy",
"t", "abyct", // modified for rollback - "abycy",
"h", "abycz",
null, "abycz", // null means finishKeyboardTransliteration
@ -323,15 +323,8 @@ public class TransliteratorTest extends TestFmwk {
log = new StringBuffer(s.toString() + " => ");
t.finishTransliteration(s, index);
String str = s.toString();
// Show the start index '{' and the cursor '|'
log.append(str.substring(0, index.contextStart)).
if (str.equals(DATA[i+1])) {
formatInput(log, s, index);
if (s.toString().equals(DATA[i+1])) {
} else {
errln("FAIL: " + log.toString() + ", expected " + DATA[i+1]);
@ -1547,7 +1540,7 @@ public class TransliteratorTest extends TestFmwk {
// Katakana should be untouched
expect(t, "a\u3042\u30A2", "\u3042\u3042\u30A2");
if (false) {
if (true) {
// This test will only work if Transliterator.ROLLBACK is
// true. Otherwise, this test will fail, revealing a
// limitation of global filters in incremental mode.

View File

@ -5,8 +5,8 @@
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/,v $
* $Date: 2001/11/21 22:21:45 $
* $Revision: 1.65 $
* $Date: 2001/11/29 01:00:10 $
* $Revision: 1.66 $
@ -242,7 +242,7 @@ import;
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
* @author Alan Liu
* @version $RCSfile:,v $ $Revision: 1.65 $ $Date: 2001/11/21 22:21:45 $
* @version $RCSfile:,v $ $Revision: 1.66 $ $Date: 2001/11/29 01:00:10 $
public abstract class Transliterator {
@ -464,7 +464,7 @@ public abstract class Transliterator {
Position pos = new Position(start, limit, start);
filteredTransliterate(text, pos, false);
filteredTransliterate(text, pos, false, true);
return pos.limit;
@ -555,7 +555,7 @@ public abstract class Transliterator {
filteredTransliterate(text, index, true);
filteredTransliterate(text, index, true, true);
// This doesn't work once we add quantifier support. Need to rewrite
@ -613,7 +613,7 @@ public abstract class Transliterator {
public final void finishTransliteration(Replaceable text,
Position index) {
filteredTransliterate(text, index, false);
filteredTransliterate(text, index, false, true);
@ -657,34 +657,61 @@ public abstract class Transliterator {
Position pos, boolean incremental);
* Rollback makes global filters and compound transliterators very
* bulletproof, but it also makes some transliterators completely
* non-incremental -- that is, for some transliterators, rollback
* is always triggered, until finishTransliteration() is called.
* Since this eliminates most of the usefulness of incremental
* mode, rollback should usually be disabled.
* Top-level transliteration method, handling filtering, incremental and
* non-incremental transliteration, and rollback. All transliteration
* public API methods eventually call this method with a rollback argument
* of TRUE. Other entities may call this method but rollback should be
* <p>If this transliterator has a filter, break up the input text into runs
* of unfiltered characters. Pass each run to
* <subclass>.handleTransliterate().
* This is used by Transliterator and CompoundTransliterator.
static final boolean ROLLBACK = false;
* This method breaks up the input text into runs of unfiltered
* characters. It passes each such run to
* <subclass>.handleTransliterate(). Subclasses that can handle the
* filter logic more efficiently themselves may override this method.
* <p>In incremental mode, if rollback is TRUE, perform a special
* incremental procedure in which several passes are made over the input
* text, adding one character at a time, and committing successful
* transliterations as they occur. Unsuccessful transliterations are rolled
* back and retried with additional characters to give correct results.
* All transliteration calls in this class go through this method.
* @param text the text to be transliterated
* @param index the position indices
* @param incremental if TRUE, then assume more characters may be inserted
* at index.limit, and postpone processing to accommodate future incoming
* characters
* @param rollback if TRUE and if incremental is TRUE, then perform special
* incremental processing, as described above, and undo partial
* transliterations where necessary. If incremental is FALSE then this
* parameter is ignored.
protected void filteredTransliterate(Replaceable text,
Position index,
boolean incremental) {
if (filter == null) {
// Short circuit path for transliterators with no filter
private void filteredTransliterate(Replaceable text,
Position index,
boolean incremental,
boolean rollback) {
// Short circuit path for transliterators with no filter when
// rollback has not been requested.
if (filter == null && !rollback) {
handleTransliterate(text, index, incremental);
// This method processes text in two groupings:
// RUNS -- A run is a contiguous group of characters which are contained
// in the filter for this transliterator (filter.contains(ch) == true).
// Text outside of runs may appear as context but it is not modified.
// The start and limit Position values are narrowed to each run.
// PASSES (incremental only) -- To make incremental mode work correctly,
// each run is broken up into n passes, where n is the length (in code
// points) of the run. Each pass contains the first n characters. If a
// pass is completely transliterated, it is committed, and further passes
// include characters after the committed text. If a pass is blocked,
// and does not transliterate completely, then this method rolls back
// the changes made during the pass, extends the pass by one code point,
// and tries again.
// globalLimit is the limit value for the entire operation. We
// set index.limit to the end of each unfiltered run before
// calling handleTransliterate(), so we need to maintain the real
@ -693,32 +720,35 @@ public abstract class Transliterator {
// happened.
int globalLimit = index.limit;
// Break the input text up. Say the input text has the form:
// If there is a non-null filter, then break the input text up. Say the
// input text has the form:
// xxxabcxxdefxx
// where 'x' represents a filtered character. Then we break this
// up into:
// where 'x' represents a filtered character (filter.contains('x') ==
// false). Then we break this up into:
// xxxabc xxdef xx
// Each pass through the loop consumes a run of filtered
// characters (which are ignored) and a subsequent run of
// unfiltered characters (which are transliterated). If, at any
// point, we fail to consume our entire segment, we stop.
// unfiltered characters (which are transliterated).
for (;;) {
// Narrow the range to be transliterated to the first segment
// of unfiltered characters at or after index.start.
int c;
// Advance compoundStart past filtered chars
while (index.start < globalLimit &&
!filter.contains(c=UTF16.charAt(text, index.start))) {
index.start += UTF16.getCharCount(c);
// Find the end of this run of unfiltered chars
index.limit = index.start;
while (index.limit < globalLimit &&
filter.contains(c=UTF16.charAt(text, index.limit))) {
index.limit += UTF16.getCharCount(c);
if (filter != null) {
// Narrow the range to be transliterated to the first run
// of unfiltered characters at or after index.start.
// Advance past filtered chars
int c;
while (index.start < globalLimit &&
!filter.contains(c=UTF16.charAt(text, index.start))) {
index.start += UTF16.getCharCount(c);
// Find the end of this run of unfiltered chars
index.limit = index.start;
while (index.limit < globalLimit &&
filter.contains(c=UTF16.charAt(text, index.limit))) {
index.limit += UTF16.getCharCount(c);
// Check to see if the unfiltered run is empty. This only
@ -729,14 +759,14 @@ public abstract class Transliterator {
int limit = index.limit;
// Is this segment incremental? If there is additional
// Is this run incremental? If there is additional
// filtered text (if limit < globalLimit) then we pass in
// an incremental value of FALSE to force the subclass to
// complete the transliteration for this segment.
boolean isIncrementalSegment =
(limit < globalLimit ? false : incremental);
// complete the transliteration for this run.
boolean isIncrementalRun =
(index.limit < globalLimit ? false : incremental);
int delta;
// Implement rollback. To understand the need for rollback,
// consider the following transliterator:
@ -745,96 +775,149 @@ public abstract class Transliterator {
// "u" is "A > b;"
// "v" is a compound of "t; NFD; u" with a filter [:Ll:]
// Now apply "c" to the input text "a". The result is "b". But if
// Now apply "v" to the input text "a". The result is "b". But if
// the transliteration is done incrementally, then the NFD holds
// things up after "t" has already transformed "a" to "A". When
// finishTransliteration() is called, "A" is _not_ processed because
// it gets excluded by the [:Ll:] filter, and the end result is "A"
// -- incorrect. The problem is that the filter is applied to a
// partially-transliterated result, when we only want it to apply to
// input text. Although this example hinges on a compound
// input text. Although this example describes a compound
// transliterator containing NFD and a specific filter, it can
// actually happen with any transliterator which may do a partial
// happen with any transliterator which does a partial
// transformation in incremental mode into characters outside its
// filter.
// There are three solutions. The first is to add two new index
// values to the position structure, a filteredStart and a
// filteredLimit. Then filteredTransliterate() can set and read
// these, and avoid filtering partially transliterated results. A
// variant of this solution is to retain an internal state object
// with the filtered range that is indexed by the text pointer and
// the position object pointer, in analogy to strtok(). The third
// solution involves no change to the API and no internal state
// cache. It is to roll back any partially transliterated results
// if (a) there is a filter, and (b) the transliteration is
// incremental. This is the solution implemented here.
int rollbackStart = 0;
int rollbackCopy = 0;
if (isIncrementalSegment) {
// Make a rollback copy at the end of the string
rollbackStart = index.start;
rollbackCopy = text.length();
text.copy(rollbackStart, limit, rollbackCopy);
// To handle this, when in incremental mode we supply characters to
// handleTransliterate() in several passes. Each pass adds one more
// input character to the input text. That is, for input "ABCD", we
// first try "A", then "AB", then "ABC", and finally "ABCD". If at
// any point we block (upon return, start < limit) then we roll
// back. If at any point we complete the run (upon return start ==
// limit) then we commit that run.
// Delegate to subclass for actual transliteration.
handleTransliterate(text, index, isIncrementalSegment);
if (rollback && isIncrementalRun) {
int delta = index.limit - limit; // change in length
int runStart = index.start;
int runLimit = index.limit;
int runLength = runLimit - runStart;
// Adjust overall limit for insertions/deletions. Don't need
// to worry about contextLimit because handleTransliterate()
// maintains that.
globalLimit += delta;
// Make a rollback copy at the end of the string
int rollbackOrigin = text.length();
text.copy(runStart, runLimit, rollbackOrigin);
// If we failed to complete transliterate this segment,
// then we are done. If rollback is required, then do so.
if (index.start != index.limit) {
if (isIncrementalSegment) {
// Replace [rollbackStart, limit) -- this is the
// original filtered segment -- with
// [rollbackCopy, text.length()), the rollback
// copy, then delete the rollback copy.
rollbackCopy += delta;
int rollbackLen = text.length() - rollbackCopy;
// Delete the partially transliterated segment
rollbackCopy -= index.limit - rollbackStart;
text.replace(rollbackStart, index.limit, "");
// Copy the rollback copy back
text.copy(rollbackCopy, text.length(), rollbackStart);
// Delete the rollback copy
rollbackCopy += rollbackLen;
text.replace(rollbackCopy, text.length(), "");
// Restore indices
index.start = rollbackStart;
index.limit = limit;
index.contextLimit -= delta;
globalLimit -= delta;
// Variables reflecting the commitment of completely
// transliterated text. passStart is the runStart, advanced
// past committed text. rollbackStart is the rollbackOrigin,
// advanced past rollback text that corresponds to committed
// text.
int passStart = runStart;
int rollbackStart = rollbackOrigin;
// The limit for each pass; we advance by one code point with
// each iteration.
int passLimit = index.start;
// Total length, in 16-bit code units, of uncommitted text.
// This is the length to be rolled back.
int uncommittedLength = 0;
// Total delta (change in length) for all passes
int totalDelta = 0;
// PASS MAIN LOOP -- Start with a single character, and extend
// the text by one character at a time. Roll back partial
// transliterations and commit complete transliterations.
for (;;) {
// Length of additional code point, either one or two
int charLength =
UTF16.getCharCount(UTF16.charAt(text, passLimit));
passLimit += charLength;
if (passLimit > runLimit) {
uncommittedLength += charLength;
index.limit = passLimit;
// Delegate to subclass for actual transliteration. Upon
// return, start will be updated to point after the
// transliterated text, and limit and contextLimit will be
// adjusted for length changes.
handleTransliterate(text, index, true);
delta = index.limit - passLimit; // change in length
// We failed to completely transliterate this pass.
// Roll back the text. Indices remain unchanged; reset
// them where necessary.
if (index.start != index.limit) {
// Find the rollbackStart, adjusted for length changes
// and the deletion of partially transliterated text.
int rs = rollbackStart + delta - (index.limit - passStart);
// Delete the partially transliterated text
text.replace(passStart, index.limit, "");
// Copy the rollback text back
text.copy(rs, rs + uncommittedLength, passStart);
// Restore indices to their original values
index.start = passStart;
index.limit = passLimit;
index.contextLimit -= delta;
// We did completely transliterate this pass. Update the
// commit indices to record how far we got. Adjust indices
// for length change.
else {
// Move the pass indices past the committed text.
passStart = passLimit = index.start;
// Adjust the rollbackStart for length changes and move
// it past the committed text. All characters we've
// processed to this point are committed now, so zero
// out the uncommittedLength.
rollbackStart += delta + uncommittedLength;
uncommittedLength = 0;
// Adjust indices for length changes.
runLimit += delta;
totalDelta += delta;
} else if (isIncrementalSegment) {
// We finished this segment; delete the rollback copy
rollbackCopy += delta;
text.replace(rollbackCopy, text.length(), "");
} else {
// If we failed to complete transliterate this segment,
// then we are done.
if (index.start != index.limit) {
// Adjust overall limit and rollbackOrigin for insertions and
// deletions. Don't need to worry about contextLimit because
// handleTransliterate() maintains that.
rollbackOrigin += totalDelta;
globalLimit += totalDelta;
// Delete the rollback copy
text.replace(rollbackOrigin, rollbackOrigin + runLength, "");
else {
// Delegate to subclass for actual transliteration.
int limit = index.limit;
handleTransliterate(text, index, isIncrementalRun);
delta = index.limit - limit; // change in length
// Adjust overall limit for insertions/deletions. Don't need
// to worry about contextLimit because handleTransliterate()
// maintains that.
globalLimit += delta;
// If we failed to completely transliterate this run,
// then we are done.
if (index.start != index.limit) {
// If we did completely transliterate this
// segment, then repeat with the next unfiltered segment.
// run, then repeat with the next unfiltered run.
// Start is valid where it is. Limit needs to be put back where
@ -842,6 +925,22 @@ public abstract class Transliterator {
index.limit = globalLimit;
* Transliterate a substring of text, as specified by index, taking filters
* into account. This method is for subclasses that need to delegate to
* another transliterator, such as CompoundTransliterator.
* @param text the text to be transliterated
* @param index the position indices
* @param incremental if TRUE, then assume more characters may be inserted
* at index.limit, and postpone processing to accommodate future incoming
* characters
protected void filteredTransliterate(Replaceable text,
Position index,
boolean incremental) {
filteredTransliterate(text, index, incremental, false);
* Returns the length of the longest context required by this transliterator.
* This is <em>preceding</em> context. The default value is zero, but
@ -1726,9 +1825,7 @@ public abstract class Transliterator {
* <p><b>Note:</b> Most subclasses that implement
* handleTransliterate() will <em>not</em> want to use this
* method, since characters they see are already filtered. Only
* subclasses with special requirements, such as those overriding
* filteredTransliterate(), should need this method.
* method, since characters they see are already filtered.
* @deprecated the new architecture provides filtering at the top
* level. This method will be removed Dec 31 2001.

View File

@ -5,8 +5,8 @@
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/,v $
* $Date: 2001/11/28 17:40:40 $
* $Revision: 1.82 $
* $Date: 2001/11/29 01:00:29 $
* $Revision: 1.83 $
@ -267,12 +267,12 @@ public class TransliteratorTest extends TestFmwk {
// insertion, buffer
"a", "A",
"p", "Ap",
"s", "Ay",
"c", "Ayc",
"s", "Aps", // modified for rollback - "Ay",
"c", "Apsc", // modified for rollback - "Ayc",
"a", "AycA",
"p", "AycAp",
"s", "AycAy",
"c", "AycAyc",
"s", "AycAps", // modified for rollback - "AycAy",
"c", "AycApsc", // modified for rollback - "AycAyc",
"h", "AycAY",
null, "AycAY", // null means finishKeyboardTransliteration
@ -298,9 +298,9 @@ public class TransliteratorTest extends TestFmwk {
// keyboard xliteration.
"a", "a",
"b", "ab",
"t", "aby",
"t", "abt", // modified for rollback - "aby",
"c", "abyc",
"t", "abycy",
"t", "abyct", // modified for rollback - "abycy",
"h", "abycz",
null, "abycz", // null means finishKeyboardTransliteration
@ -323,15 +323,8 @@ public class TransliteratorTest extends TestFmwk {
log = new StringBuffer(s.toString() + " => ");
t.finishTransliteration(s, index);
String str = s.toString();
// Show the start index '{' and the cursor '|'
log.append(str.substring(0, index.contextStart)).
if (str.equals(DATA[i+1])) {
formatInput(log, s, index);
if (s.toString().equals(DATA[i+1])) {
} else {
errln("FAIL: " + log.toString() + ", expected " + DATA[i+1]);
@ -1547,7 +1540,7 @@ public class TransliteratorTest extends TestFmwk {
// Katakana should be untouched
expect(t, "a\u3042\u30A2", "\u3042\u3042\u30A2");
if (false) {
if (true) {
// This test will only work if Transliterator.ROLLBACK is
// true. Otherwise, this test will fail, revealing a
// limitation of global filters in incremental mode.

View File

@ -5,8 +5,8 @@
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/,v $
* $Date: 2001/11/21 22:21:45 $
* $Revision: 1.65 $
* $Date: 2001/11/29 01:00:10 $
* $Revision: 1.66 $
@ -242,7 +242,7 @@ import;
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
* @author Alan Liu
* @version $RCSfile:,v $ $Revision: 1.65 $ $Date: 2001/11/21 22:21:45 $
* @version $RCSfile:,v $ $Revision: 1.66 $ $Date: 2001/11/29 01:00:10 $
public abstract class Transliterator {
@ -464,7 +464,7 @@ public abstract class Transliterator {
Position pos = new Position(start, limit, start);
filteredTransliterate(text, pos, false);
filteredTransliterate(text, pos, false, true);
return pos.limit;
@ -555,7 +555,7 @@ public abstract class Transliterator {
filteredTransliterate(text, index, true);
filteredTransliterate(text, index, true, true);
// This doesn't work once we add quantifier support. Need to rewrite
@ -613,7 +613,7 @@ public abstract class Transliterator {
public final void finishTransliteration(Replaceable text,
Position index) {
filteredTransliterate(text, index, false);
filteredTransliterate(text, index, false, true);
@ -657,34 +657,61 @@ public abstract class Transliterator {
Position pos, boolean incremental);
* Rollback makes global filters and compound transliterators very
* bulletproof, but it also makes some transliterators completely
* non-incremental -- that is, for some transliterators, rollback
* is always triggered, until finishTransliteration() is called.
* Since this eliminates most of the usefulness of incremental
* mode, rollback should usually be disabled.
* Top-level transliteration method, handling filtering, incremental and
* non-incremental transliteration, and rollback. All transliteration
* public API methods eventually call this method with a rollback argument
* of TRUE. Other entities may call this method but rollback should be FALSE.
* <p>If this transliterator has a filter, break up the input text into runs
* of unfiltered characters. Pass each run to
* <subclass>.handleTransliterate().
* This is used by Transliterator and CompoundTransliterator.
static final boolean ROLLBACK = false;
* This method breaks up the input text into runs of unfiltered
* characters. It passes each such run to
* <subclass>.handleTransliterate(). Subclasses that can handle the
* filter logic more efficiently themselves may override this method.
* <p>In incremental mode, if rollback is TRUE, perform a special
* incremental procedure in which several passes are made over the input
* text, adding one character at a time, and committing successful
* transliterations as they occur. Unsuccessful transliterations are rolled
* back and retried with additional characters to give correct results.
* All transliteration calls in this class go through this method.
* @param text the text to be transliterated
* @param index the position indices
* @param incremental if TRUE, then assume more characters may be inserted
* at index.limit, and postpone processing to accommodate future incoming
* characters
* @param rollback if TRUE and if incremental is TRUE, then perform special
* incremental processing, as described above, and undo partial
* transliterations where necessary. If incremental is FALSE then this
* parameter is ignored.
protected void filteredTransliterate(Replaceable text,
Position index,
boolean incremental) {
if (filter == null) {
// Short circuit path for transliterators with no filter
private void filteredTransliterate(Replaceable text,
Position index,
boolean incremental,
boolean rollback) {
// Short circuit path for transliterators with no filter in
// non-incremental mode.
if (filter == null && !rollback) {
handleTransliterate(text, index, incremental);
// This method processes text in two groupings:
// RUNS -- A run is a contiguous group of characters which are contained
// in the filter for this transliterator (filter.contains(ch) == true).
// Text outside of runs may appear as context but it is not modified.
// The start and limit Position values are narrowed to each run.
// PASSES (incremental only) -- To make incremental mode work correctly,
// each run is broken up into n passes, where n is the length (in code
// points) of the run. Each pass contains the first n characters. If a
// pass is completely transliterated, it is committed, and further passes
// include characters after the committed text. If a pass is blocked,
// and does not transliterate completely, then this method rolls back
// the changes made during the pass, extends the pass by one code point,
// and tries again.
// globalLimit is the limit value for the entire operation. We
// set index.limit to the end of each unfiltered run before
// calling handleTransliterate(), so we need to maintain the real
@ -693,32 +720,35 @@ public abstract class Transliterator {
// happened.
int globalLimit = index.limit;
// Break the input text up. Say the input text has the form:
// If there is a non-null filter, then break the input text up. Say the
// input text has the form:
// xxxabcxxdefxx
// where 'x' represents a filtered character. Then we break this
// up into:
// where 'x' represents a filtered character (filter.contains('x') ==
// false). Then we break this up into:
// xxxabc xxdef xx
// Each pass through the loop consumes a run of filtered
// characters (which are ignored) and a subsequent run of
// unfiltered characters (which are transliterated). If, at any
// point, we fail to consume our entire segment, we stop.
// unfiltered characters (which are transliterated).
for (;;) {
// Narrow the range to be transliterated to the first segment
// of unfiltered characters at or after index.start.
int c;
// Advance compoundStart past filtered chars
while (index.start < globalLimit &&
!filter.contains(c=UTF16.charAt(text, index.start))) {
index.start += UTF16.getCharCount(c);
// Find the end of this run of unfiltered chars
index.limit = index.start;
while (index.limit < globalLimit &&
filter.contains(c=UTF16.charAt(text, index.limit))) {
index.limit += UTF16.getCharCount(c);
if (filter != null) {
// Narrow the range to be transliterated to the first run
// of unfiltered characters at or after index.start.
// Advance past filtered chars
int c;
while (index.start < globalLimit &&
!filter.contains(c=UTF16.charAt(text, index.start))) {
index.start += UTF16.getCharCount(c);
// Find the end of this run of unfiltered chars
index.limit = index.start;
while (index.limit < globalLimit &&
filter.contains(c=UTF16.charAt(text, index.limit))) {
index.limit += UTF16.getCharCount(c);
// Check to see if the unfiltered run is empty. This only
@ -729,14 +759,14 @@ public abstract class Transliterator {
int limit = index.limit;
// Is this segment incremental? If there is additional
// Is this run incremental? If there is additional
// filtered text (if limit < globalLimit) then we pass in
// an incremental value of FALSE to force the subclass to
// complete the transliteration for this segment.
boolean isIncrementalSegment =
(limit < globalLimit ? false : incremental);
// complete the transliteration for this run.
boolean isIncrementalRun =
(index.limit < globalLimit ? false : incremental);
int delta;
// Implement rollback. To understand the need for rollback,
// consider the following transliterator:
@ -745,96 +775,149 @@ public abstract class Transliterator {
// "u" is "A > b;"
// "v" is a compound of "t; NFD; u" with a filter [:Ll:]
// Now apply "c" to the input text "a". The result is "b". But if
// Now apply "v" to the input text "a". The result is "b". But if
// the transliteration is done incrementally, then the NFD holds
// things up after "t" has already transformed "a" to "A". When
// finishTransliterate() is called, "A" is _not_ processed because
// it gets excluded by the [:Ll:] filter, and the end result is "A"
// -- incorrect. The problem is that the filter is applied to a
// partially-transliterated result, when we only want it to apply to
// input text. Although this example hinges on a compound
// input text. Although this example describes a compound
// transliterator containing NFD and a specific filter, it can
// actually happen with any transliterator which may do a partial
// happen with any transliterator which does a partial
// transformation in incremental mode into characters outside its
// filter.
// There are two solutions. The first is to add two new index
// values to the position structure, a filteredStart and a
// filteredLimit. Then filteredTransliterate() can set and read
// these, and avoid filtering partially transliterated results. A
// variant of this solution is to retain an internal state object
// with the filtered range that is indexed by the text pointer and
// the position object pointer, in analogy to strtok(). The second
// solution involves no change to the API and no internal state
// cache. It is to roll back any partially transliterated results
// if (a) there is a filter, and (b) the transliteration is
// incremental. This is the solution implemented here.
int rollbackStart = 0;
int rollbackCopy = 0;
if (isIncrementalSegment) {
// Make a rollback copy at the end of the string
rollbackStart = index.start;
rollbackCopy = text.length();
text.copy(rollbackStart, limit, rollbackCopy);
// To handle this, when in incremental mode we supply characters to
// handleTransliterate() in several passes. Each pass adds one more
// input character to the input text. That is, for input "ABCD", we
// first try "A", then "AB", then "ABC", and finally "ABCD". If at
// any point we block (upon return, start < limit) then we roll
// back. If at any point we complete the run (upon return start ==
// limit) then we commit that run.
// Delegate to subclass for actual transliteration.
handleTransliterate(text, index, isIncrementalSegment);
if (rollback && isIncrementalRun) {
int delta = index.limit - limit; // change in length
int runStart = index.start;
int runLimit = index.limit;
int runLength = runLimit - runStart;
// Adjust overall limit for insertions/deletions. Don't need
// to worry about contextLimit because handleTransliterate()
// maintains that.
globalLimit += delta;
// Make a rollback copy at the end of the string
int rollbackOrigin = text.length();
text.copy(runStart, runLimit, rollbackOrigin);
// If we failed to complete transliterate this segment,
// then we are done. If rollback is required, then do so.
if (index.start != index.limit) {
if (isIncrementalSegment) {
// Replace [rollbackStart, limit) -- this is the
// original filtered segment -- with
// [rollbackCopy, text.length()), the rollback
// copy, then delete the rollback copy.
rollbackCopy += delta;
int rollbackLen = text.length() - rollbackCopy;
// Delete the partially transliterated segment
rollbackCopy -= index.limit - rollbackStart;
text.replace(rollbackStart, index.limit, "");
// Copy the rollback copy back
text.copy(rollbackCopy, text.length(), rollbackStart);
// Delete the rollback copy
rollbackCopy += rollbackLen;
text.replace(rollbackCopy, text.length(), "");
// Restore indices
index.start = rollbackStart;
index.limit = limit;
index.contextLimit -= delta;
globalLimit -= delta;
// Variables reflecting the commitment of completely
// transliterated text. passStart is the runStart, advanced
// past committed text. rollbackStart is the rollbackOrigin,
// advanced past rollback text that corresponds to committed
// text.
int passStart = runStart;
int rollbackStart = rollbackOrigin;
// The limit for each pass; we advance by one code point with
// each iteration.
int passLimit = index.start;
// Total length, in 16-bit code units, of uncommitted text.
// This is the length to be rolled back.
int uncommittedLength = 0;
// Total delta (change in length) for all passes
int totalDelta = 0;
// PASS MAIN LOOP -- Start with a single character, and extend
// the text by one character at a time. Roll back partial
// transliterations and commit complete transliterations.
for (;;) {
// Length of additional code point, either one or two
int charLength =
UTF16.getCharCount(UTF16.charAt(text, passLimit));
passLimit += charLength;
if (passLimit > runLimit) {
uncommittedLength += charLength;
index.limit = passLimit;
// Delegate to subclass for actual transliteration. Upon
// return, start will be updated to point after the
// transliterated text, and limit and contextLimit will be
// adjusted for length changes.
handleTransliterate(text, index, true);
delta = index.limit - passLimit; // change in length
// We failed to completely transliterate this pass.
// Roll back the text. Indices remain unchanged; reset
// them where necessary.
if (index.start != index.limit) {
// Find the rollbackStart, adjusted for length changes
// and the deletion of partially transliterated text.
int rs = rollbackStart + delta - (index.limit - passStart);
// Delete the partially transliterated text
text.replace(passStart, index.limit, "");
// Copy the rollback text back
text.copy(rs, rs + uncommittedLength, passStart);
// Restore indices to their original values
index.start = passStart;
index.limit = passLimit;
index.contextLimit -= delta;
// We did completely transliterate this pass. Update the
// commit indices to record how far we got. Adjust indices
// for length change.
else {
// Move the pass indices past the committed text.
passStart = passLimit = index.start;
// Adjust the rollbackStart for length changes and move
// it past the committed text. All characters we've
// processed to this point are committed now, so zero
// out the uncommittedLength.
rollbackStart += delta + uncommittedLength;
uncommittedLength = 0;
// Adjust indices for length changes.
runLimit += delta;
totalDelta += delta;
} else if (isIncrementalSegment) {
// We finished this segment; delete the rollback copy
rollbackCopy += delta;
text.replace(rollbackCopy, text.length(), "");
} else {
// If we failed to completely transliterate this segment,
// then we are done.
if (index.start != index.limit) {
// Adjust overall limit and rollbackOrigin for insertions and
// deletions. Don't need to worry about contextLimit because
// handleTransliterate() maintains that.
rollbackOrigin += totalDelta;
globalLimit += totalDelta;
// Delete the rollback copy
text.replace(rollbackOrigin, rollbackOrigin + runLength, "");
else {
// Delegate to subclass for actual transliteration.
int limit = index.limit;
handleTransliterate(text, index, isIncrementalRun);
delta = index.limit - limit; // change in length
// Adjust overall limit for insertions/deletions. Don't need
// to worry about contextLimit because handleTransliterate()
// maintains that.
globalLimit += delta;
// If we failed to completely transliterate this run,
// then we are done.
if (index.start != index.limit) {
// If we did completely transliterate this
// segment, then repeat with the next unfiltered segment.
// run, then repeat with the next unfiltered run.
// Start is valid where it is. Limit needs to be put back where
@ -842,6 +925,22 @@ public abstract class Transliterator {
index.limit = globalLimit;
* Transliterate a substring of text, as specified by index, taking filters
* into account. This method is for subclasses that need to delegate to
* another transliterator, such as CompoundTransliterator.
* @param text the text to be transliterated
* @param index the position indices
* @param incremental if TRUE, then assume more characters may be inserted
* at index.limit, and postpone processing to accommodate future incoming
* characters
protected void filteredTransliterate(Replaceable text,
Position index,
boolean incremental) {
filteredTransliterate(text, index, incremental, false);
* Returns the length of the longest context required by this transliterator.
* This is <em>preceding</em> context. The default value is zero, but
@ -1726,9 +1825,7 @@ public abstract class Transliterator {
* <p><b>Note:</b> Most subclasses that implement
* handleTransliterate() will <em>not</em> want to use this
* method, since characters they see are already filtered. Only
* subclasses with special requirements, such as those overriding
* filteredTransliterate(), should need this method.
* method, since characters they see are already filtered.
* @deprecated the new architecture provides filtering at the top
* level. This method will be removed Dec 31 2001.