ICU-1434 initial implementation of source/target set API

X-SVN-Rev: 8971
This commit is contained in:
Alan Liu 2002-06-28 21:13:54 +00:00
parent 59164f02ca
commit d1773b2571
27 changed files with 416 additions and 3 deletions

View File

@ -95,6 +95,13 @@ public:
*/
virtual void setData(const TransliterationRuleData*) {}
/**
* Stubbed out implementation of UnicodeMatcher API. The stub
* definition has an empty body, so toUnionTo is left unchanged
* unless a subclass overrides this method. Nothing is returned.
* @param toUnionTo the set into which to union the source characters
*/
virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
protected:
UnicodeFilter();

View File

@ -14,6 +14,7 @@ U_NAMESPACE_BEGIN
class Replaceable;
class UnicodeString;
class UnicodeSet;
/**
* Constants returned by <code>UnicodeMatcher::matches()</code>
@ -128,6 +129,13 @@ public:
* indexing.
*/
virtual UBool matchesIndexValue(uint8_t v) const = 0;
/**
* Union the set of all characters that may be matched by this object
* into the given set. This method is pure virtual; every concrete
* matcher must supply its own implementation.
* @param toUnionTo the set into which to union the source characters;
* it is modified in place
*/
virtual void addMatchSetTo(UnicodeSet& toUnionTo) const = 0;
};
U_NAMESPACE_END

View File

@ -536,6 +536,15 @@ public:
const UnicodeString& s);
public:
/**
* Implementation of UnicodeMatcher API. Union the set of all
* characters that may be matched by this object into the given
* set.
* @param toUnionTo the set into which to union the source characters;
* it is modified in place
*/
void addMatchSetTo(UnicodeSet& toUnionTo) const;
/**
* Returns the index of the given character within this set, where
* the set is ordered by ascending code point. If the character

View File

@ -62,6 +62,10 @@ UBool UnicodeFilter::matchesIndexValue(uint8_t v) const {
return FALSE;
}
/**
 * Default addMatchSetTo() for filters that do not provide their own.
 * Deliberately contributes nothing: toUnionTo is left exactly as the
 * caller passed it.
 * @param toUnionTo the set into which a real implementation would
 * union its matched characters; untouched here
 */
void UnicodeFilter::addMatchSetTo(UnicodeSet& toUnionTo) const {
    (void) toUnionTo; // intentionally unused by the stub
}
U_NAMESPACE_END
//eof

View File

@ -900,6 +900,13 @@ int32_t UnicodeSet::matchRest(const Replaceable& text,
return maxLen;
}
/**
 * Implementation of the UnicodeMatcher API: unions the contents of
 * this set into toUnionTo.
 * @param toUnionTo the set that receives this set's contents
 */
void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
    const UnicodeSet& self = *this;
    toUnionTo.addAll(self);
}
/**
* Returns the index of the given character within this set, where
* the set is ordered by ascending code point. If the character

View File

@ -366,6 +366,41 @@ UnicodeString& CompoundTransliterator::toRules(UnicodeString& rulesSource,
return rulesSource;
}
/**
 * Transliterator framework hook: computes the unfiltered source set
 * of this compound transliterator into result.
 * @param result receives the source set; previous contents are lost
 */
void CompoundTransliterator::handleGetSourceSet(UnicodeSet& result) const {
    // Heuristic, not 100% reliable: report the first non-empty source
    // set found among the components, scanning them in order.  Take
    // Hiragana-Latin, which is really Hiragana-Katakana followed by
    // Katakana-Latin.  The component source sets are roughly
    // [:Hiragana:] and [:Katakana:], but the source set of the whole
    // chain is actually [:Hiragana:] ONLY.
    result.clear();
    UnicodeSet componentSet;
    int32_t index = 0;
    while (index < count && result.isEmpty()) {
        result.addAll(trans[index]->getSourceSet(componentSet));
        ++index;
    }
}
/**
 * Overrides the Transliterator framework method: sets result to the
 * union of the target sets reported by every component.
 * This is a heuristic, and not 100% reliable.
 * @param result receives the target set; previous contents are lost
 * @return a reference to result
 */
UnicodeSet& CompoundTransliterator::getTargetSet(UnicodeSet& result) const {
    result.clear();
    UnicodeSet componentSet;
    int32_t index = 0;
    while (index < count) {
        const UnicodeSet& targets = trans[index]->getTargetSet(componentSet);
        result.addAll(targets);
        ++index;
    }
    return result;
}
/**
* Implements {@link Transliterator#handleTransliterate}.
*/

View File

@ -9,6 +9,7 @@
*/
#include "funcrepl.h"
#include "unicode/translit.h"
#include "unicode/uniset.h"
static const UChar AMPERSAND = 38; // '&'
static const UChar OPEN[] = {40,32,0}; // "( "
@ -91,6 +92,14 @@ UnicodeString& FunctionReplacer::toReplacerPattern(UnicodeString& rule,
return rule;
}
/**
 * Implements UnicodeReplacer: unions the target set reported by the
 * wrapped transliterator into toUnionTo.
 * @param toUnionTo the set into which to union the output characters
 */
void FunctionReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
    UnicodeSet scratch;
    const UnicodeSet& targets = translit->getTargetSet(scratch);
    toUnionTo.addAll(targets);
}
/**
* UnicodeFunctor API
*/

View File

@ -81,6 +81,11 @@ class FunctionReplacer : public UnicodeFunctor, public UnicodeReplacer {
virtual UnicodeString& toReplacerPattern(UnicodeString& rule,
UBool escapeUnprintable) const;
/**
* Implement UnicodeReplacer. Union the set of all characters that
* may be output by this object into the given set.
* @param toUnionTo the set into which to union the output characters
*/
virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
/**
* UnicodeFunctor API
*/

View File

@ -114,6 +114,15 @@ UBool Quantifier::matchesIndexValue(uint8_t v) const {
return (minCount == 0) || matcher->toMatcher()->matchesIndexValue(v);
}
/**
 * Implements UnicodeMatcher: forwards to the quantified matcher,
 * unless maxCount is not positive, in which case nothing is added.
 * @param toUnionTo the set into which to union the matched characters
 */
void Quantifier::addMatchSetTo(UnicodeSet& toUnionTo) const {
    if (!(maxCount > 0)) {
        return;
    }
    matcher->toMatcher()->addMatchSetTo(toUnionTo);
}
/**
* Implement UnicodeFunctor
*/

View File

@ -56,6 +56,11 @@ class Quantifier : public UnicodeFunctor, public UnicodeMatcher {
*/
virtual UBool matchesIndexValue(uint8_t v) const;
/**
* Implement UnicodeMatcher. Union the set of all characters that
* may be matched by this object into the given set.
* @param toUnionTo the set into which to union the source characters
*/
virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
/**
* UnicodeFunctor API
*/

View File

@ -151,5 +151,19 @@ UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource,
return data->ruleSet.toRules(rulesSource, escapeUnprintable);
}
/**
 * Transliterator framework hook: asks the rule set for its source
 * set (getTarget == FALSE) and stores it in result.
 * @param result receives the source set
 */
void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const {
    // The reference returned by the rule set is not needed here.
    (void) data->ruleSet.getSourceTargetSet(result, FALSE);
}
/**
 * Overrides the Transliterator framework method: asks the rule set
 * for its target set (getTarget == TRUE) and stores it in result.
 * @param result receives the target set
 * @return a reference to result
 */
UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const {
    // getSourceTargetSet() fills in and returns its first argument.
    data->ruleSet.getSourceTargetSet(result, TRUE);
    return result;
}
U_NAMESPACE_END

View File

@ -494,6 +494,32 @@ void TransliterationRule::setData(const TransliterationRuleData* d) {
// Don't have to do segments since they are in the context or key
}
/**
 * Union the set of all characters that may be modified by this rule
 * into the given set.
 * @param toUnionTo the set into which to union the characters
 */
void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const {
    // Only the key portion of the pattern is scanned: it starts right
    // after the ante context and spans keyLength positions.
    int32_t pos = anteContextLength;
    const int32_t keyEnd = anteContextLength + keyLength;
    while (pos < keyEnd) {
        const UChar32 cp = pattern.char32At(pos);
        pos += UTF_CHAR_LENGTH(cp);
        const UnicodeMatcher* standIn = data->lookupMatcher(cp);
        if (standIn != NULL) {
            // A matcher is registered for cp: add whatever it can match.
            standIn->addMatchSetTo(toUnionTo);
        } else {
            // No matcher registered: cp is added as itself.
            toUnionTo.add(cp);
        }
    }
}
/**
 * Union the set of all characters that may be emitted by this rule
 * into the given set.  The work is delegated to the replacer
 * obtained from this rule's output object.
 * @param toUnionTo the set into which to union the characters
 */
void TransliterationRule::addTargetSetTo(UnicodeSet& toUnionTo) const {
    output
        ->toReplacer()
        ->addReplacementSetTo(toUnionTo);
}
U_NAMESPACE_END
//eof

View File

@ -268,6 +268,19 @@ public:
*/
virtual UnicodeString& toRule(UnicodeString& pat,
UBool escapeUnprintable) const;
/**
* Union the set of all characters that may be modified by this rule
* into the given set.
* @param toUnionTo the set into which to union the characters
*/
void addSourceSetTo(UnicodeSet& toUnionTo) const;
/**
* Union the set of all characters that may be emitted by this rule
* into the given set.
* @param toUnionTo the set into which to union the characters
*/
void addTargetSetTo(UnicodeSet& toUnionTo) const;
private:
friend class StringMatcher;

View File

@ -10,6 +10,7 @@
#include "rbt_set.h"
#include "rbt_rule.h"
#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "cmemory.h"
U_CDECL_BEGIN
@ -404,4 +405,24 @@ UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource,
return ruleSource;
}
/**
 * Return the set of all characters that may be modified
 * (getTarget=false) or emitted (getTarget=true) by this set.
 * @param result receives the computed set; it is cleared first
 * @param getTarget selects the target set when true, the source set
 * when false
 * @return a reference to result
 */
UnicodeSet& TransliterationRuleSet::getSourceTargetSet(UnicodeSet& result,
                                                       UBool getTarget) const {
    result.clear();
    const int32_t ruleCount = ruleVector->size();
    for (int32_t idx = 0; idx < ruleCount; ++idx) {
        const TransliterationRule* rule =
            static_cast<TransliterationRule*>(ruleVector->elementAt(idx));
        if (!getTarget) {
            rule->addSourceSetTo(result);
        } else {
            rule->addTargetSetTo(result);
        }
    }
    return result;
}
U_NAMESPACE_END

View File

@ -20,6 +20,7 @@ class TransliterationRule;
class TransliterationRuleData;
class UnicodeFilter;
class UnicodeString;
class UnicodeSet;
/**
* A set of rules for a <code>RuleBasedTransliterator</code>.
@ -132,6 +133,13 @@ public:
*/
virtual UnicodeString& toRules(UnicodeString& result,
UBool escapeUnprintable) const;
/**
* Return the set of all characters that may be modified
* (getTarget=false) or emitted (getTarget=true) by this set.
* @param result receives the computed set; previous contents lost
* @param getTarget true to compute the target set, false to compute
* the source set
* @return a reference to result
*/
UnicodeSet& getSourceTargetSet(UnicodeSet& result,
UBool getTarget) const;
};
U_NAMESPACE_END

View File

@ -9,6 +9,7 @@
#include "strmatch.h"
#include "rbt_data.h"
#include "util.h"
#include "unicode/uniset.h"
U_NAMESPACE_BEGIN
@ -180,6 +181,22 @@ UBool StringMatcher::matchesIndexValue(uint8_t v) const {
return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
}
/**
 * Implements UnicodeMatcher: walks the pattern one code point at a
 * time and unions what each position can match into toUnionTo.
 * @param toUnionTo the set into which to union the matched characters
 */
void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
    const int32_t patternLength = pattern.length();
    int32_t pos = 0;
    while (pos < patternLength) {
        const UChar32 cp = pattern.char32At(pos);
        pos += UTF_CHAR_LENGTH(cp);
        const UnicodeMatcher* standIn = data->lookupMatcher(cp);
        if (standIn != NULL) {
            // A matcher is registered for cp: add whatever it can match.
            standIn->addMatchSetTo(toUnionTo);
        } else {
            // No matcher registered: cp is added as itself.
            toUnionTo.add(cp);
        }
    }
}
/**
* UnicodeReplacer API
*/
@ -226,6 +243,19 @@ UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
matchStart = matchLimit = -1;
}
/**
 * Union the set of all characters that may be output by this object
 * into the given set.  This implementation adds nothing.
 * @param toUnionTo the set into which to union the output characters;
 * left unchanged by this implementation
 */
void StringMatcher::addReplacementSetTo(UnicodeSet& toUnionTo) const {
    // What this replacer emits is the source text lying between
    // matchStart and matchLimit, which depends on the input text and
    // so cannot be computed here.  The choices are to add nothing or
    // to add ALL characters; adding nothing is probably more useful.
    (void) toUnionTo;
}
/**
* Implement UnicodeFunctor
*/

View File

@ -96,6 +96,11 @@ class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public Unico
*/
virtual UBool matchesIndexValue(uint8_t v) const;
/**
* Implement UnicodeMatcher. Union the set of all characters that
* may be matched by this object into the given set.
* @param toUnionTo the set into which to union the source characters
*/
virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
/**
* Implement UnicodeFunctor
*/
@ -145,6 +150,13 @@ class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public Unico
*/
void resetMatch();
/**
* Union the set of all characters that may be output by this object
* into the given set.
* @param toUnionTo the set into which to union the output characters
*/
virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
private:
/**

View File

@ -11,6 +11,7 @@
#include "strrepl.h"
#include "rbt_data.h"
#include "util.h"
#include "unicode/uniset.h"
U_NAMESPACE_BEGIN
@ -257,6 +258,22 @@ UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
return rule;
}
/**
 * Implements UnicodeReplacer: walks the output string one code point
 * at a time and unions what each position can emit into toUnionTo.
 * @param toUnionTo the set into which to union the output characters
 */
void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
    const int32_t outputLength = output.length();
    int32_t pos = 0;
    while (pos < outputLength) {
        const UChar32 cp = output.char32At(pos);
        pos += UTF_CHAR_LENGTH(cp);
        UnicodeReplacer* nested = data->lookupReplacer(cp);
        if (nested != NULL) {
            // A replacer is registered for cp: add whatever it can emit.
            nested->addReplacementSetTo(toUnionTo);
        } else {
            // No replacer registered: cp is emitted as itself.
            toUnionTo.add(cp);
        }
    }
}
/**
* UnicodeFunctor API
*/

View File

@ -127,6 +127,11 @@ class StringReplacer : public UnicodeFunctor, public UnicodeReplacer {
virtual UnicodeString& toReplacerPattern(UnicodeString& result,
UBool escapeUnprintable) const;
/**
* Implement UnicodeReplacer. Union the set of all characters that
* may be output by this object into the given set.
* @param toUnionTo the set into which to union the output characters
*/
virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
/**
* UnicodeFunctor API
*/

View File

@ -1057,6 +1057,36 @@ UnicodeString& Transliterator::toRules(UnicodeString& rulesSource,
return rulesSource;
}
/**
 * Computes the source set of this transliterator with its filter
 * applied.  handleGetSourceSet() supplies the unfiltered set; if a
 * filter is installed, result is then intersected with the set of
 * characters the filter matches.
 * @param result receives the source set; previous contents are lost
 * @return a reference to result
 */
UnicodeSet& Transliterator::getSourceSet(UnicodeSet& result) const {
    handleGetSourceSet(result);
    if (filter == NULL) {
        return result;
    }
    // Most, but not all filters will be UnicodeSets.  Optimize for
    // the high-runner case by intersecting with the filter directly.
    if (filter->getDynamicClassID() == UnicodeSet::getStaticClassID()) {
        result.retainAll(*static_cast<const UnicodeSet*>(filter));
    } else {
        // Build the filter's match set in an automatic object rather
        // than on the heap: there is no allocation whose failure would
        // have to be checked, and nothing to delete on the way out.
        // NOTE(review): a filter that relies on the empty
        // UnicodeFilter::addMatchSetTo() stub leaves filterSet empty,
        // which empties result -- confirm that this is intended.
        UnicodeSet filterSet;
        filter->addMatchSetTo(filterSet);
        result.retainAll(filterSet);
    }
    return result;
}
/**
 * Base-class framework hook for getSourceSet(): reports an empty
 * source set by clearing result.
 * @param result receives the (empty) set; previous contents are lost
 */
void Transliterator::handleGetSourceSet(UnicodeSet& result) const {
    // The value returned by clear() is not needed.
    (void) result.clear();
}
/**
 * Base-class implementation: reports an empty target set by
 * clearing result.
 * @param result receives the (empty) set; previous contents are lost
 * @return a reference to result
 */
UnicodeSet& Transliterator::getTargetSet(UnicodeSet& result) const {
    result.clear();
    return result;
}
// For public consumption
void Transliterator::registerFactory(const UnicodeString& id,
Transliterator::Factory factory,

View File

@ -159,6 +159,24 @@ public:
virtual UnicodeString& toRules(UnicodeString& result,
UBool escapeUnprintable) const;
protected:
/**
* Implement Transliterator framework. Computes the set of characters
* that this transliterator may modify, ignoring any filter.
* @param result receives result set; previous contents lost
*/
virtual void handleGetSourceSet(UnicodeSet& result) const;
public:
/**
* Override Transliterator framework. Computes the set of characters
* that this transliterator may generate as replacement text.
* @param result receives result set; previous contents lost
* @return a reference to result
*/
virtual UnicodeSet& getTargetSet(UnicodeSet& result) const;
// handleTransliterate should be protected, but was declared public before ICU 2.2.
// We do not have a separate deprecation date for this method since the entire class
// will become internal after 2002-sep-30.
#ifndef U_USE_DEPRECATED_TRANSLITERATOR_API
protected:
#endif
/**
* Implements {@link Transliterator#handleTransliterate}.
* @deprecated To be removed after 2002-sep-30.

View File

@ -382,6 +382,7 @@ public:
*/
Transliterator* clone(void) const;
protected:
/**
* Implements {@link Transliterator#handleTransliterate}.
* @deprecated To be removed after 2002-sep-30.
@ -389,6 +390,7 @@ public:
virtual void handleTransliterate(Replaceable& text, UTransPosition& offsets,
UBool isIncremental) const;
public:
/**
* Return a representation of this transliterator as source rules.
* These rules will produce an equivalent transliterator if used
@ -404,6 +406,18 @@ public:
virtual UnicodeString& toRules(UnicodeString& result,
UBool escapeUnprintable) const;
protected:
/**
* Implement Transliterator framework. Computes the set of characters
* that this transliterator may modify, ignoring any filter.
* @param result receives result set; previous contents lost
*/
virtual void handleGetSourceSet(UnicodeSet& result) const;
public:
/**
* Override Transliterator framework. Computes the set of characters
* that this transliterator may generate as replacement text.
* @param result receives result set; previous contents lost
* @return a reference to result
*/
virtual UnicodeSet& getTargetSet(UnicodeSet& result) const;
/**
* Return the class ID for this class. This is useful only for
* comparing to a return value from getDynamicClassID(). For example:

View File

@ -780,6 +780,51 @@ public:
virtual UnicodeString& toRules(UnicodeString& result,
UBool escapeUnprintable) const;
/**
* Returns the set of all characters that may be modified in the
* input text by this Transliterator. This incorporates this
* object's current filter; if the filter is changed, the return
* value of this function will change. The unfiltered set is
* obtained from {@link #handleGetSourceSet}, whose default
* implementation produces an empty set; subclasses may override
* it to return a more precise result. The return result is
* approximate in any case and is intended for use by tests,
* tools, or utilities.
* @param result receives result set; previous contents lost
* @return a reference to result
* @see #getTargetSet
* @see #handleGetSourceSet
*/
UnicodeSet& getSourceSet(UnicodeSet& result) const;
/**
* Framework method that computes the set of all characters that
* may be modified in the input text by this Transliterator,
* ignoring the effect of this object's filter. The base class
* implementation produces the empty set. Subclasses that wish to
* implement this should override this method. Nothing is
* returned; the set is delivered through the result parameter.
* @param result receives the set of characters that this
* transliterator may modify; previous contents lost
* @see #getSourceSet
* @see #getTargetSet
*/
virtual void handleGetSourceSet(UnicodeSet& result) const;
/**
* Returns the set of all characters that may be generated as
* replacement text by this transliterator. The default
* implementation returns the empty set. Some subclasses may
* override this method to return a more precise result. The
* return result is approximate in any case and is intended for
* use by tests, tools, or utilities requiring such
* meta-information.
* @param result receives result set; previous contents lost
* @return a reference to result
* @see #getSourceSet
*/
virtual UnicodeSet& getTargetSet(UnicodeSet& result) const;
public:
/**

View File

@ -16,6 +16,7 @@ U_NAMESPACE_BEGIN
class Replaceable;
class UnicodeString;
class UnicodeSet;
/**
* <code>UnicodeReplacer</code> defines a protocol for objects that
@ -67,6 +68,13 @@ class U_I18N_API UnicodeReplacer /* not : public UObject because this is an inte
*/
virtual UnicodeString& toReplacerPattern(UnicodeString& result,
UBool escapeUnprintable) const = 0;
/**
* Union the set of all characters that may be output by this object
* into the given set. This method is pure virtual; every concrete
* replacer must supply its own implementation.
* @param toUnionTo the set into which to union the output characters
*/
virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const = 0;
};
U_NAMESPACE_END

View File

@ -331,6 +331,9 @@ void CompoundTransliteratorTest::TestTransliterate(){
if(U_FAILURE(status)){
errln("CompoundTransliterator construction failed");
}else {
#if 0
// handleTransliterate is a protected method that was erroneously made
// public. It is not public API that needs to be tested.
UnicodeString s("abcabc");
expect(*ct1, s, s);
UTransPosition index = { 0, 0, 0, 0 };
@ -343,7 +346,7 @@ void CompoundTransliteratorTest::TestTransliterate(){
UnicodeString rsource3(s);
ct1->handleTransliterate(rsource3, index, TRUE);
expectAux(ct1->getID() + ":String, index(1,2,3), incremental=TRUE", rsource3 + "->" + rsource3, rsource3==expectedResult, expectedResult);
#endif
}
delete ct1;
UnicodeString Data[]={
@ -391,7 +394,7 @@ void CompoundTransliteratorTest::expect(const CompoundTransliterator& t,
t.transliterate(rsource);
expectAux(t.getID() + ":Replaceable", source + "->" + rsource, rsource==expectedResult, expectedResult);
// Test handleTransliterate (incremental) transliteration --
// Test transliterate (incremental) transliteration --
rsource.remove();
rsource.append(source);
UTransPosition index;
@ -399,7 +402,8 @@ void CompoundTransliteratorTest::expect(const CompoundTransliterator& t,
index.contextLimit = source.length();
index.start = 0;
index.limit = source.length();
t.handleTransliterate(rsource, index, TRUE);
UErrorCode ec = U_ZERO_ERROR;
t.transliterate(rsource, index, ec);
t.finishTransliteration(rsource,index);
expectAux(t.getID() + ":handleTransliterate ", source + "->" + rsource, rsource==expectedResult, expectedResult);

View File

@ -159,6 +159,7 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
TESTCASE(69,TestMulticharStringSet);
TESTCASE(70,TestUserFunction);
TESTCASE(71,TestAnyX);
TESTCASE(72,TestSourceTargetSet);
default: name = ""; break;
}
@ -3570,6 +3571,53 @@ void TransliteratorTest::TestAnyX(void) {
delete anyLatin;
}
/**
* Test the source and target set API. These are only implemented
* for RBT and CompoundTransliterator at this time.
*/
void TransliteratorTest::TestSourceTargetSet() {
UErrorCode ec = U_ZERO_ERROR;
// Rules
const char* r =
"a > b; "
"r [x{lu}] > q;";
// Expected source
UnicodeSet expSrc("[arx{lu}]", ec);
// Expected target
UnicodeSet expTrg("[bq]", ec);
UParseError pe;
Transliterator* t = Transliterator::createFromRules("test", r, UTRANS_FORWARD, pe, ec);
if (U_FAILURE(ec)) {
delete t;
errln("FAIL: Couldn't set up test");
return;
}
UnicodeSet src; t->getSourceSet(src);
UnicodeSet trg; t->getTargetSet(trg);
if (src == expSrc && trg == expTrg) {
UnicodeString a, b;
logln((UnicodeString)"Ok: " +
r + " => source = " + src.toPattern(a, TRUE) +
", target = " + trg.toPattern(b, TRUE));
} else {
UnicodeString a, b, c, d;
errln((UnicodeString)"FAIL: " +
r + " => source = " + src.toPattern(a, TRUE) +
", expected " + expSrc.toPattern(b, TRUE) +
"; target = " + trg.toPattern(c, TRUE) +
", expected " + expTrg.toPattern(d, TRUE));
}
delete t;
}
//======================================================================
// Support methods
//======================================================================

View File

@ -328,6 +328,8 @@ private:
void TestAnyX(void);
// Test of the Transliterator source and target set API
// (getSourceSet / getTargetSet).
void TestSourceTargetSet(void);
//======================================================================
// Support methods
//======================================================================