ICU-13346 add Edits.Iterator.previous() for mapping near-earlier indexes; compress some repeated m:n replacements even when m!=n

X-SVN-Rev: 40426
This commit is contained in:
Markus Scherer 2017-09-18 22:59:48 +00:00
parent 68ef77118b
commit b966a7e403
2 changed files with 286 additions and 69 deletions

View File

@ -18,10 +18,10 @@ public final class Edits {
private static final int MAX_UNCHANGED_LENGTH = 0x1000;
private static final int MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1;
// 0wwwcccccccccccc with w=1..6 records ccc+1 replacements of w:w text units.
// No length change.
private static final int MAX_SHORT_WIDTH = 6;
private static final int MAX_SHORT_CHANGE_LENGTH = 0xfff;
// 0mmmnnnccccccccc with m=1..6 records ccc+1 replacements of m:n text units.
private static final int MAX_SHORT_CHANGE_OLD_LENGTH = 6;
private static final int MAX_SHORT_CHANGE_NEW_LENGTH = 7;
private static final int SHORT_CHANGE_NUM_MASK = 0x1ff;
private static final int MAX_SHORT_CHANGE = 0x6fff;
// 0111mmmmmmnnnnnn records a replacement of m text units with n.
@ -103,20 +103,6 @@ public final class Edits {
* @provisional This API might change or be removed in a future release.
*/
public void addReplace(int oldLength, int newLength) {
if(oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) {
// Replacement of short oldLength text units by same-length new text.
// Merge into previous short-replacement record, if any.
++numChanges;
int last = lastUnit();
if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&
(last >> 12) == oldLength && (last & 0xfff) < MAX_SHORT_CHANGE_LENGTH) {
setLastUnit(last + 1);
return;
}
append(oldLength << 12);
return;
}
if(oldLength < 0 || newLength < 0) {
throw new IllegalArgumentException(
"addReplace(" + oldLength + ", " + newLength +
@ -136,6 +122,21 @@ public final class Edits {
delta += newDelta;
}
if(0 < oldLength && oldLength <= MAX_SHORT_CHANGE_OLD_LENGTH &&
newLength <= MAX_SHORT_CHANGE_NEW_LENGTH) {
// Merge into previous same-lengths short-replacement record, if any.
int u = (oldLength << 12) | (newLength << 9);
int last = lastUnit();
if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&
(last & ~SHORT_CHANGE_NUM_MASK) == u &&
(last & SHORT_CHANGE_NUM_MASK) < SHORT_CHANGE_NUM_MASK) {
setLastUnit(last + 1);
return;
}
append(u);
return;
}
int head = 0x7000;
if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) {
head |= oldLength << 6;
@ -225,9 +226,14 @@ public final class Edits {
private final char[] array;
private int index;
private final int length;
/**
* 0 if we are not within compressed equal-length changes.
* Otherwise the number of remaining changes, including the current one.
*/
private int remaining;
private final boolean onlyChanges_, coarse;
private int dir; // iteration direction: back(<0), initial(0), forward(>0)
private boolean changed;
private int oldLength_, newLength_;
private int srcIndex, replIndex, destIndex;
@ -258,7 +264,7 @@ public final class Edits {
}
}
private void updateIndexes() {
private void updateNextIndexes() {
srcIndex += oldLength_;
if (changed) {
replIndex += newLength_;
@ -266,8 +272,17 @@ public final class Edits {
destIndex += newLength_;
}
private void updatePreviousIndexes() {
srcIndex -= oldLength_;
if (changed) {
replIndex -= newLength_;
}
destIndex -= newLength_;
}
private boolean noNext() {
// No change beyond the string.
// No change before or beyond the string.
dir = 0;
changed = false;
oldLength_ = newLength_ = 0;
return false;
@ -284,13 +299,32 @@ public final class Edits {
}
private boolean next(boolean onlyChanges) {
// We have an errorCode in case we need to start guarding against integer overflows.
// It is also convenient for caller loops if we bail out when an error was set elsewhere.
updateIndexes();
if (remaining > 0) {
// Fine-grained iterator: Continue a sequence of equal-length changes.
--remaining;
return true;
// Forward iteration: Update the string indexes to the limit of the current span,
// and post-increment-read array units to assemble a new span.
// Leaves the array index one after the last unit of that span.
if (dir > 0) {
updateNextIndexes();
} else {
if (dir < 0) {
// Turn around from previous() to next().
// Post-increment-read the same span again.
if (remaining > 0) {
// Fine-grained iterator:
// Stay on the current one of a sequence of compressed changes.
++index; // next() rests on the index after the sequence unit.
dir = 1;
return true;
}
}
dir = 1;
}
if (remaining >= 1) {
// Fine-grained iterator: Continue a sequence of compressed changes.
if (remaining > 1) {
--remaining;
return true;
}
remaining = 0;
}
if (index >= length) {
return noNext();
@ -306,7 +340,7 @@ public final class Edits {
}
newLength_ = oldLength_;
if (onlyChanges) {
updateIndexes();
updateNextIndexes();
if (index >= length) {
return noNext();
}
@ -318,14 +352,19 @@ public final class Edits {
}
changed = true;
if (u <= MAX_SHORT_CHANGE) {
int oldLen = u >> 12;
int newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH;
int num = (u & SHORT_CHANGE_NUM_MASK) + 1;
if (coarse) {
int w = u >> 12;
int len = (u & 0xfff) + 1;
oldLength_ = newLength_ = len * w;
oldLength_ = num * oldLen;
newLength_ = num * newLen;
} else {
// Split a sequence of equal-length changes that was compressed into one unit.
oldLength_ = newLength_ = u >> 12;
remaining = u & 0xfff;
// Split a sequence of changes that was compressed into one unit.
oldLength_ = oldLen;
newLength_ = newLen;
if (num > 1) {
remaining = num; // This is the first of two or more changes.
}
return true;
}
} else {
@ -340,22 +379,124 @@ public final class Edits {
while (index < length && (u = array[index]) > MAX_UNCHANGED) {
++index;
if (u <= MAX_SHORT_CHANGE) {
int w = u >> 12;
int len = (u & 0xfff) + 1;
len = len * w;
oldLength_ += len;
newLength_ += len;
int num = (u & SHORT_CHANGE_NUM_MASK) + 1;
oldLength_ += (u >> 12) * num;
newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num;
} else {
assert(u <= 0x7fff);
int oldLen = readLength((u >> 6) & 0x3f);
int newLen = readLength(u & 0x3f);
oldLength_ += oldLen;
newLength_ += newLen;
oldLength_ += readLength((u >> 6) & 0x3f);
newLength_ += readLength(u & 0x3f);
}
}
return true;
}
private boolean previous() {
// Backward iteration: Pre-decrement-read array units to assemble a new span,
// then update the string indexes to the start of that span.
// Leaves the array index on the head unit of that span.
if (dir >= 0) {
if (dir > 0) {
// Turn around from next() to previous().
// Set the string indexes to the span limit and
// pre-decrement-read the same span again.
if (remaining > 0) {
// Fine-grained iterator:
// Stay on the current one of a sequence of compressed changes.
--index; // previous() rests on the sequence unit.
dir = -1;
return true;
}
updateNextIndexes();
}
dir = -1;
}
if (remaining > 0) {
// Fine-grained iterator: Continue a sequence of compressed changes.
int u = array[index];
assert(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE);
if (remaining <= (u & SHORT_CHANGE_NUM_MASK)) {
++remaining;
updatePreviousIndexes();
return true;
}
remaining = 0;
}
if (index <= 0) {
return noNext();
}
int u = array[--index];
if (u <= MAX_UNCHANGED) {
// Combine adjacent unchanged ranges.
changed = false;
oldLength_ = u + 1;
while (index > 0 && (u = array[index - 1]) <= MAX_UNCHANGED) {
--index;
oldLength_ += u + 1;
}
newLength_ = oldLength_;
// No need to handle onlyChanges as long as previous() is called only from findIndex().
updatePreviousIndexes();
return true;
}
changed = true;
if (u <= MAX_SHORT_CHANGE) {
int oldLen = u >> 12;
int newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH;
int num = (u & SHORT_CHANGE_NUM_MASK) + 1;
if (coarse) {
oldLength_ = num * oldLen;
newLength_ = num * newLen;
} else {
// Split a sequence of changes that was compressed into one unit.
oldLength_ = oldLen;
newLength_ = newLen;
if (num > 1) {
remaining = 1; // This is the last of two or more changes.
}
updatePreviousIndexes();
return true;
}
} else {
if (u <= 0x7fff) {
// The change is encoded in u alone.
oldLength_ = readLength((u >> 6) & 0x3f);
newLength_ = readLength(u & 0x3f);
} else {
// Back up to the head of the change, read the lengths,
// and reset the index to the head again.
assert(index > 0);
while ((u = array[--index]) > 0x7fff) {}
assert(u > MAX_SHORT_CHANGE);
int headIndex = index++;
oldLength_ = readLength((u >> 6) & 0x3f);
newLength_ = readLength(u & 0x3f);
index = headIndex;
}
if (!coarse) {
updatePreviousIndexes();
return true;
}
}
// Combine adjacent changes.
while (index > 0 && (u = array[index - 1]) > MAX_UNCHANGED) {
--index;
if (u <= MAX_SHORT_CHANGE) {
int num = (u & SHORT_CHANGE_NUM_MASK) + 1;
oldLength_ += (u >> 12) * num;
newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num;
} else if (u <= 0x7fff) {
// Read the lengths, and reset the index to the head again.
int headIndex = index++;
oldLength_ += readLength((u >> 6) & 0x3f);
newLength_ += readLength(u & 0x3f);
index = headIndex;
}
}
updatePreviousIndexes();
return true;
}
/**
* Finds the edit that contains the source index.
* The source index may be found in a non-change
@ -410,7 +551,43 @@ public final class Edits {
spanLength = newLength_;
}
if (i < spanStart) {
if (i >= (spanStart / 2)) {
// Search backwards.
for (;;) {
boolean hasPrevious = previous();
assert(hasPrevious); // because i>=0 and the first span starts at 0
spanStart = findSource ? srcIndex : destIndex;
if (i >= spanStart) {
// The index is in the current span.
return 0;
}
if (remaining > 0) {
// Is the index in one of the remaining compressed edits?
// spanStart is the start of the current span, first of the remaining ones.
spanLength = findSource ? oldLength_ : newLength_;
int u = array[index];
assert(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE);
int num = (u & SHORT_CHANGE_NUM_MASK) + 1 - remaining;
int len = num * spanLength;
if (i >= (spanStart - len)) {
int n = ((spanStart - i - 1) / spanLength) + 1;
// 1 <= n <= num
srcIndex -= n * oldLength_;
replIndex -= n * newLength_;
destIndex -= n * newLength_;
remaining += n;
return 0;
}
// Skip all of these edits at once.
srcIndex -= num * oldLength_;
replIndex -= num * newLength_;
destIndex -= num * newLength_;
remaining = 0;
}
}
}
// Reset the iterator to the start.
dir = 0;
index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;
} else if (i < (spanStart + spanLength)) {
// The index is in the current span.
@ -428,21 +605,21 @@ public final class Edits {
// The index is in the current span.
return 0;
}
if (remaining > 0) {
if (remaining > 1) {
// Is the index in one of the remaining compressed edits?
// spanStart is the start of the current span, before the remaining ones.
int len = (remaining + 1) * spanLength;
// spanStart is the start of the current span, first of the remaining ones.
int len = remaining * spanLength;
if (i < (spanStart + len)) {
int n = (i - spanStart) / spanLength; // 1 <= n <= remaining
len = n * spanLength;
srcIndex += len;
replIndex += len;
destIndex += len;
int n = (i - spanStart) / spanLength; // 1 <= n <= remaining - 1
srcIndex += n * oldLength_;
replIndex += n * newLength_;
destIndex += n * newLength_;
remaining -= n;
return 0;
}
// Make next() skip all of these edits at once.
oldLength_ = newLength_ = len;
oldLength_ *= remaining;
newLength_ *= remaining;
remaining = 0;
}
}

View File

@ -951,12 +951,18 @@ public final class UCharacterCaseTest extends TestFmwk
srcIndexes.add(srcIndex);
if (expected[i].oldLength > 1) {
srcIndexes.add(srcIndex + 1);
if (expected[i].oldLength > 2) {
srcIndexes.add(srcIndex + expected[i].oldLength - 1);
}
}
}
if (expected[i].newLength > 0) {
destIndexes.add(destIndex);
if (expected[i].newLength > 0) {
if (expected[i].newLength > 1) {
destIndexes.add(destIndex + 1);
if (expected[i].newLength > 2) {
destIndexes.add(destIndex + expected[i].newLength - 1);
}
}
}
srcIndex += expected[i].oldLength;
@ -967,18 +973,31 @@ public final class UCharacterCaseTest extends TestFmwk
srcIndexes.add(srcLength + 1);
destIndexes.add(destLength + 1);
Collections.reverse(destIndexes);
for (int i : srcIndexes) {
assertEquals(name + " destIndexFromSrc(" + i + "):",
destIndexFromSrc(expected, srcLength, destLength, i),
ei2.destinationIndexFromSourceIndex(i));
// Zig-zag across the indexes to stress next() <-> previous().
for (int i = 0; i < srcIndexes.size(); ++i) {
for (int j : ZIG_ZAG) {
if ((i + j) < srcIndexes.size()) {
int si = srcIndexes.get(i + j);
assertEquals(name + " destIndexFromSrc(" + si + "):",
destIndexFromSrc(expected, srcLength, destLength, si),
ei2.destinationIndexFromSourceIndex(si));
}
}
}
for (int i : destIndexes) {
assertEquals(name + " srcIndexFromDest(" + i + "):",
srcIndexFromDest(expected, srcLength, destLength, i),
ei2.sourceIndexFromDestinationIndex(i));
for (int i = 0; i < destIndexes.size(); ++i) {
for (int j : ZIG_ZAG) {
if ((i + j) < destIndexes.size()) {
int di = destIndexes.get(i + j);
assertEquals(name + " srcIndexFromDest(" + di + "):",
srcIndexFromDest(expected, srcLength, destLength, di),
ei2.sourceIndexFromDestinationIndex(di));
}
}
}
}
private static final int[] ZIG_ZAG = { 0, 1, 2, 3, 2, 1 };
@Test
public void TestEdits() {
Edits edits = new Edits();
@ -992,21 +1011,21 @@ public final class UCharacterCaseTest extends TestFmwk
assertFalse("unchanged 10003 hasChanges", edits.hasChanges());
assertEquals("unchanged 10003 numberOfChanges", 0, edits.numberOfChanges());
assertEquals("unchanged 10003", 0, edits.lengthDelta());
edits.addReplace(1, 1); // multiple short equal-length edits are compressed
edits.addReplace(2, 1); // multiple short equal-lengths edits are compressed
edits.addUnchanged(0);
edits.addReplace(1, 1);
edits.addReplace(1, 1);
edits.addReplace(2, 1);
edits.addReplace(2, 1);
edits.addReplace(0, 10);
edits.addReplace(100, 0);
edits.addReplace(3000, 4000); // variable-length encoding
edits.addReplace(100000, 100000);
assertTrue("some edits hasChanges", edits.hasChanges());
assertEquals("some edits numberOfChanges", 7, edits.numberOfChanges());
assertEquals("some edits", 10 - 100 + 1000, edits.lengthDelta());
assertEquals("some edits", -3 + 10 - 100 + 1000, edits.lengthDelta());
EditChange[] coarseExpectedChanges = new EditChange[] {
new EditChange(false, 10003, 10003),
new EditChange(true, 103103, 104013)
new EditChange(true, 103106, 104013)
};
checkEditsIter("coarse",
edits.getCoarseIterator(), edits.getCoarseIterator(),
@ -1017,9 +1036,9 @@ public final class UCharacterCaseTest extends TestFmwk
EditChange[] fineExpectedChanges = new EditChange[] {
new EditChange(false, 10003, 10003),
new EditChange(true, 1, 1),
new EditChange(true, 1, 1),
new EditChange(true, 1, 1),
new EditChange(true, 2, 1),
new EditChange(true, 2, 1),
new EditChange(true, 2, 1),
new EditChange(true, 0, 10),
new EditChange(true, 100, 0),
new EditChange(true, 3000, 4000),
@ -1040,6 +1059,27 @@ public final class UCharacterCaseTest extends TestFmwk
assertFalse("reset then iterator", ei.next());
}
@Test
public void TestEditsFindFwdBwd() {
// Some users need index mappings to be efficient when they are out of order.
// The most interesting failure case for this test is it taking a very long time.
Edits e = new Edits();
int N = 200000;
for (int i = 0; i < N; ++i) {
e.addUnchanged(1);
e.addReplace(3, 1);
}
Edits.Iterator iter = e.getFineIterator();
for (int i = 0; i <= N; i += 2) {
assertEquals("ascending", i * 2, iter.sourceIndexFromDestinationIndex(i));
assertEquals("ascending", i * 2 + 1, iter.sourceIndexFromDestinationIndex(i + 1));
}
for (int i = N; i >= 0; i -= 2) {
assertEquals("descending", i * 2 + 1, iter.sourceIndexFromDestinationIndex(i + 1));
assertEquals("descending", i * 2, iter.sourceIndexFromDestinationIndex(i));
}
}
@Test
public void TestMergeEdits() {
Edits ab = new Edits(), bc = new Edits(), ac = new Edits(), expected_ac = new Edits();