ICU-1046 reimplement filter on a CompoundTransliterator to work correctly

X-SVN-Rev: 5247
This commit is contained in:
Alan Liu 2001-07-17 00:16:02 +00:00
parent 5ed909951b
commit 589dbddcbe
4 changed files with 138 additions and 165 deletions

View File

@ -31,7 +31,7 @@ CompoundTransliterator::CompoundTransliterator(
int32_t transliteratorCount,
UnicodeFilter* adoptedFilter) :
Transliterator(joinIDs(transliterators, transliteratorCount), adoptedFilter),
trans(0), filters(0), count(0), compoundRBTIndex(-1) {
trans(0), count(0), compoundRBTIndex(-1) {
setTransliterators(transliterators, transliteratorCount);
}
@ -46,16 +46,16 @@ CompoundTransliterator::CompoundTransliterator(const UnicodeString& id,
UTransDirection direction,
UnicodeFilter* adoptedFilter,
UErrorCode& status) :
Transliterator(id, 0), // set filter to 0 here!
trans(0), filters(0), compoundRBTIndex(-1) {
init(id, direction, adoptedFilter, -1, 0, TRUE, status);
Transliterator(id, adoptedFilter),
trans(0), compoundRBTIndex(-1) {
init(id, direction, -1, 0, TRUE, status);
}
CompoundTransliterator::CompoundTransliterator(const UnicodeString& id,
UErrorCode& status) :
Transliterator(id, 0), // set filter to 0 here!
trans(0), filters(0), compoundRBTIndex(-1) {
init(id, UTRANS_FORWARD, 0, -1, 0, TRUE, status);
trans(0), compoundRBTIndex(-1) {
init(id, UTRANS_FORWARD, -1, 0, TRUE, status);
}
/**
@ -69,8 +69,8 @@ CompoundTransliterator::CompoundTransliterator(const UnicodeString& ID,
Transliterator *adoptedTrans,
UErrorCode& status) :
Transliterator(ID, 0),
trans(0), filters(0), compoundRBTIndex(-1) {
init(idBlock, UTRANS_FORWARD, 0, idSplitPoint, adoptedTrans, FALSE, status);
trans(0), compoundRBTIndex(-1) {
init(idBlock, UTRANS_FORWARD, idSplitPoint, adoptedTrans, FALSE, status);
}
/**
@ -83,8 +83,8 @@ CompoundTransliterator::CompoundTransliterator(UTransDirection dir,
UVector& list,
UErrorCode& status) :
Transliterator(UnicodeString("", ""), 0),
trans(0), filters(0), compoundRBTIndex(-1) {
init(list, dir, 0, FALSE, status);
trans(0), compoundRBTIndex(-1) {
init(list, dir, FALSE, status);
// assume caller will fixup ID
}
@ -93,8 +93,6 @@ CompoundTransliterator::CompoundTransliterator(UTransDirection dir,
* constructors. Before calling init(), set trans and filter to NULL.
* @param id the id containing ';'-separated entries
* @param direction either FORWARD or REVERSE
* @param adoptedFilter a filter object to be owned by this transliterator.
* May be NULL.
* @param idSplitPoint the index into id at which the
* adoptedSplitTransliterator should be inserted, if there is one, or
* -1 if there is none.
@ -108,16 +106,13 @@ CompoundTransliterator::CompoundTransliterator(UTransDirection dir,
*/
void CompoundTransliterator::init(const UnicodeString& id,
UTransDirection direction,
UnicodeFilter* adoptedFilter,
int32_t idSplitPoint,
Transliterator *adoptedSplitTrans,
UBool fixReverseID,
UErrorCode& status) {
// assert(trans == 0);
// assert(filters == 0);
if (U_FAILURE(status)) {
delete adoptedFilter;
delete adoptedSplitTrans;
return;
}
@ -129,7 +124,7 @@ void CompoundTransliterator::init(const UnicodeString& id,
list, compoundRBTIndex,
NULL, status);
init(list, direction, adoptedFilter, fixReverseID, status);
init(list, direction, fixReverseID, status);
}
/**
@ -140,8 +135,6 @@ void CompoundTransliterator::init(const UnicodeString& id,
* is, it should be in the FORWARD order; if direction is REVERSE then
* the list order will be reversed.
* @param direction either FORWARD or REVERSE
* @param adoptedFilter a filter object to be owned by this transliterator.
* May be NULL.
* @param fixReverseID if TRUE, then reconstruct the ID of reverse
* entries by calling getID() of component entries. Some constructors
* do not require this because they apply a facade ID anyway.
@ -149,11 +142,9 @@ void CompoundTransliterator::init(const UnicodeString& id,
*/
void CompoundTransliterator::init(UVector& list,
UTransDirection direction,
UnicodeFilter* adoptedFilter,
UBool fixReverseID,
UErrorCode& status) {
// assert(trans == 0);
// assert(filters == 0);
// Allocate array
if (U_SUCCESS(status)) {
@ -162,8 +153,7 @@ void CompoundTransliterator::init(UVector& list,
}
if (U_FAILURE(status) || trans == 0) {
delete adoptedFilter;
// assert(trans == 0);
// assert(trans == 0);
return;
}
@ -194,7 +184,6 @@ void CompoundTransliterator::init(UVector& list,
}
computeMaximumContextLength();
adoptFilter(adoptedFilter);
}
/**
@ -214,41 +203,11 @@ UnicodeString CompoundTransliterator::joinIDs(Transliterator* const transliterat
return id; // Return temporary
}
///**
// * Splits a string, as in JavaScript
// */
//UnicodeString* CompoundTransliterator::split(const UnicodeString& s,
// UChar divider,
// int32_t* countPtr) {
// // changed MED
// // see how many there are
// *countPtr = 1;
// int32_t i;
// for (i = 0; i < s.length(); ++i) {
// if (s.charAt(i) == divider)
// ++(*countPtr);
// }
//
// // make an array with them
// UnicodeString* result = new UnicodeString[*countPtr];
// int32_t last = 0;
// int32_t current = 0;
//
// for (i = 0; i < s.length(); ++i) {
// if (s.charAt(i) == divider) {
// s.extractBetween(last, i, result[current++]);
// last = i+1;
// }
// }
// s.extractBetween(last, i, result[current]);
// return result;
//}
/**
* Copy constructor.
*/
CompoundTransliterator::CompoundTransliterator(const CompoundTransliterator& t) :
Transliterator(t), trans(0), filters(0), count(0), compoundRBTIndex(-1) {
Transliterator(t), trans(0), count(0), compoundRBTIndex(-1) {
*this = t;
}
@ -264,14 +223,9 @@ void CompoundTransliterator::freeTransliterators(void) {
if (trans != 0) {
delete trans[i];
}
if (filters != 0) {
delete filters[i];
}
}
delete[] trans;
delete[] filters;
trans = 0;
filters = 0;
count = 0;
}
@ -285,23 +239,14 @@ CompoundTransliterator& CompoundTransliterator::operator=(
for (i=0; i<count; ++i) {
delete trans[i];
trans[i] = 0;
if (filters != 0) {
delete filters[i];
filters[i] = 0;
}
}
if (t.count > count) {
delete[] trans;
trans = new Transliterator*[t.count];
delete[] filters;
filters = (t.filter == 0) ? 0 : new UnicodeFilter*[t.count];
}
count = t.count;
for (i=0; i<count; ++i) {
trans[i] = t.trans[i]->clone();
if (t.filters != 0) {
filters[i] = t.filters[i]->clone();
}
}
compoundRBTIndex = t.compoundRBTIndex;
return *this;
@ -343,8 +288,7 @@ void CompoundTransliterator::setTransliterators(Transliterator* const transliter
void CompoundTransliterator::adoptTransliterators(Transliterator* adoptedTransliterators[],
int32_t transCount) {
// First free trans[] and set count to zero. Once this is done,
// orphan the filter. Set up the new trans[], and call
// adoptFilter() to fix up the filters in trans[].
// orphan the filter. Set up the new trans[].
freeTransliterators();
UnicodeFilter *f = orphanFilter();
trans = adoptedTransliterators;
@ -354,55 +298,6 @@ void CompoundTransliterator::adoptTransliterators(Transliterator* adoptedTransli
setID(joinIDs(trans, count));
}
/**
* Override Transliterator. Modify the transliterators that make up
* this compound transliterator so their filters are the logical AND
* of this transliterator's filter and their own. Original filters
* are kept in the filters array.
*/
void CompoundTransliterator::adoptFilter(UnicodeFilter* f) {
/**
* If there is a filter F for the compound transliterator as a
* whole, then we need to modify every non-null filter f in
* the chain to be f' = F & f.
*
* There are two possible states:
* 1. getFilter() != 0
* original filters in filters[]
* createAnd() filters in trans[]
* 2. getFilter() == 0
* filters[] either unallocated or empty
* original filters in trans[]
* This method must insure that we stay in one of these states.
*/
if (count > 0) {
if (f == 0) {
// Restore original filters
if (getFilter() != 0 && filters != 0) {
for (int32_t i=0; i<count; ++i) {
trans[i]->adoptFilter(filters[i]);
filters[i] = 0;
}
}
} else {
// If the previous filter is 0, then the component filters
// are in trans[i], and need to be pulled out into filters[].
if (getFilter() == 0) {
if (filters == 0) {
filters = new UnicodeFilter*[count];
}
for (int32_t i=0; i<count; ++i) {
filters[i] = trans[i]->orphanFilter();
}
}
for (int32_t i=0; i<count; ++i) {
trans[i]->adoptFilter(UnicodeFilterLogic::createAnd(f, filters[i]));
}
}
}
Transliterator::adoptFilter(f);
}
UnicodeString& CompoundTransliterator::toRules(UnicodeString& rulesSource,
UBool escapeUnprintable) const {
// We do NOT call toRules() on our component transliterators, in
@ -496,29 +391,88 @@ void CompoundTransliterator::handleTransliterate(Replaceable& text, UTransPositi
return; // Short circuit for empty compound transliterators
}
int32_t i;
int32_t start = index.start;
int32_t globalLimit = index.limit;
/* globalLimit is the overall limit. We keep track of this
* since we overwrite index.limit with the previous
* index.start. After each transliteration, we update
* globalLimit for insertions or deletions that have happened.
*/
for (i=0; i<count; ++i) {
index.start = start; // Reset start
int32_t limit = index.limit;
trans[i]->handleTransliterate(text, index, incremental);
// Adjust overall limit for insertions/deletions
globalLimit += index.limit - limit;
index.limit = index.start; // Move limit to end of committed text
}
// Start is good where it is -- where the last
// transliterator left it. Limit needs to be put back
// where it was, modulo adjustments for deletions/insertions.
index.limit = globalLimit;
const UnicodeFilter *filter = getFilter();
// compoundLimit is the limit value for the entire compound
// operation. We overwrite index.limit with the previous
// index.start. After each transliteration, we update
// compoundLimit for insertions or deletions that have happened.
int32_t compoundLimit = index.limit;
// For compounds with filters, the limit of each unfiltered
// segment. If filter == 0 then this is not used.
int32_t filteredLimit = 0;
// If we have a compound filter (a filter on this object, as
// opposed to filters on trans[i]), then we break the input text
// up. Say the input text has the form:
// xxxabcxxdefxx
// where 'x' represents a filtered character. Then we break this
// up into:
// xxxabc xxdef xx
// Each pass through the loop consumes a run of filtered
// characters (which are ignored) and a subsequent run of
// unfiltered characters. If, at any point, we fail to consume
// our entire segment, we stop.
do {
// compoundStart is the start for the entire compound
// operation.
int32_t compoundStart = index.start;
// If there is a compound filter, then narrow the range to be
// transliterated to the first segment of unfiltered
// characters at or after index.start.
if (filter != 0) {
int32_t l;
// Advance compoundStart past filtered chars
while (compoundStart < compoundLimit &&
!filter->contains(text.charAt(compoundStart))) {
++compoundStart;
}
l = compoundStart;
// Find the end of this run of unfiltered chars
while (l < compoundLimit &&
filter->contains(text.charAt(l))) {
++l;
}
// Check to see if the unfiltered run is empty. This only
// happens at the end of the string when all the remaining
// characters are filtered.
if (l == compoundStart) {
// assert(compoundStart == compoundLimit);
index.start = compoundStart;
break;
}
// Keep track of the end of the unfiltered run in
// filteredLimit to determine if we processed the run
// completely.
index.limit = filteredLimit = l;
}
// Give each transliterator a crack at the run of characters.
// See comments at the top of the method for more detail.
for (int32_t i=0; i<count; ++i) {
index.start = compoundStart; // Reset start
int32_t limit = index.limit;
trans[i]->handleTransliterate(text, index, incremental);
// Adjust overall limit for insertions/deletions
compoundLimit += index.limit - limit;
index.limit = index.start; // Move limit to end of committed text
}
// If there is no filter then we are done. If there is a
// filter and we failed to completely transliterate this
// segment, then we are done. If we did completely
// transliterate this segment, then look for another
// unfiltered segment by looping back up to the top.
} while (filter != 0 && index.start == filteredLimit);
// Start is good where it is -- where the last transliterator left
// it. Limit needs to be put back where it was, modulo
// adjustments for deletions/insertions.
index.limit = compoundLimit;
}
/**

View File

@ -34,18 +34,13 @@ class U_I18N_API UVector;
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
*
* @author Alan Liu
* @version $RCSfile: cpdtrans.h,v $ $Revision: 1.14 $ $Date: 2001/07/16 20:48:26 $
* @version $RCSfile: cpdtrans.h,v $ $Revision: 1.15 $ $Date: 2001/07/17 00:15:49 $
* @draft
*/
class U_I18N_API CompoundTransliterator : public Transliterator {
Transliterator** trans;
/**
* Array of original filters associated with transliterators.
*/
UnicodeFilter** filters;
int32_t count;
/**
@ -149,14 +144,6 @@ public:
void adoptTransliterators(Transliterator* adoptedTransliterators[],
int32_t count);
/**
* Override Transliterator. Modify the transliterators that make up
* this compound transliterator so their filters are the logical AND
* of this transliterator's filter and their own. Original filters
* are kept in the filters array.
*/
virtual void adoptFilter(UnicodeFilter* f);
/**
* Override Transliterator:
* Create a rule string that can be passed to createFromRules()
@ -202,7 +189,6 @@ private:
void init(const UnicodeString& id,
UTransDirection direction,
UnicodeFilter* adoptedFilter,
int32_t idSplitPoint,
Transliterator *adoptedRbt,
UBool fixReverseID,
@ -210,7 +196,6 @@ private:
void init(UVector& list,
UTransDirection direction,
UnicodeFilter* adoptedFilter,
UBool fixReverseID,
UErrorCode& status);
@ -222,13 +207,6 @@ private:
UnicodeString joinIDs(Transliterator* const transliterators[],
int32_t transCount);
/**
* Splits a string, as in JavaScript
*/
//UnicodeString* split(const UnicodeString& s,
// UChar divider,
// int32_t* countPtr);
void freeTransliterators(void);
void computeMaximumContextLength(void);

View File

@ -63,6 +63,7 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
TESTCASE(27,TestCreateInstance);
TESTCASE(28,TestNormalizationTransliterator);
TESTCASE(29,TestCompoundRBT);
TESTCASE(30,TestCompoundFilter);
default: name = ""; break;
}
}
@ -1266,6 +1267,44 @@ void TransliteratorTest::TestCompoundRBT(void) {
delete u;
}
/**
* Compound filter semantics were orginially not implemented
* correctly. Originally, each component filter f(i) is replaced by
* f'(i) = f(i) && g, where g is the filter for the compound
* transliterator.
*
* From Mark:
*
* Suppose and I have a transliterator X. Internally X is
* "Greek-Latin; Latin-Cyrillic; Any-Lower". I use a filter [^A].
*
* The compound should convert all greek characters (through latin) to
* cyrillic, then lowercase the result. The filter should say "don't
* touch 'A' in the original". But because an intermediate result
* happens to go through "A", the Greek Alpha gets hung up.
*/
void TransliteratorTest::TestCompoundFilter(void) {
Transliterator *t = Transliterator::createInstance
("Greek-Latin; Latin-Cyrillic; Lower");
if (t == 0) {
errln("FAIL: createInstance failed");
return;
}
UErrorCode status = U_ZERO_ERROR;
t->adoptFilter(new UnicodeSet("[^A]", status));
if (U_FAILURE(status)) {
errln("FAIL: UnicodeSet ct failed");
delete t;
return;
}
// Only the 'A' at index 1 should remain unchanged
expect(*t,
CharsToUnicodeString("CA\\u039A\\u0391"),
CharsToUnicodeString("\\u043AA\\u043A\\u0430"));
delete t;
}
//======================================================================
// Support methods
//======================================================================

View File

@ -161,6 +161,8 @@ class TransliteratorTest : public IntlTest {
void TestCompoundRBT(void);
void TestCompoundFilter(void);
//======================================================================
// Support methods
//======================================================================