ICU-1048 allow ::ID blocks in rules

X-SVN-Rev: 5233
This commit is contained in:
Alan Liu 2001-07-13 21:09:41 +00:00
parent f35b88f4eb
commit 9805ae16d9
9 changed files with 946 additions and 348 deletions

View File

@ -10,6 +10,7 @@
#include "unicode/cpdtrans.h"
#include "unicode/unifilt.h"
#include "unicode/unifltlg.h"
#include "uvector.h"
/**
* Constructs a new compound transliterator given an array of
@ -30,7 +31,7 @@ CompoundTransliterator::CompoundTransliterator(
int32_t transliteratorCount,
UnicodeFilter* adoptedFilter) :
Transliterator(joinIDs(transliterators, transliteratorCount), adoptedFilter),
trans(0), filters(0), count(0) {
trans(0), filters(0), count(0), compoundRBTIndex(-1) {
setTransliterators(transliterators, transliteratorCount);
}
@ -46,44 +47,142 @@ CompoundTransliterator::CompoundTransliterator(const UnicodeString& id,
UnicodeFilter* adoptedFilter,
UErrorCode& status) :
Transliterator(id, 0), // set filter to 0 here!
trans(0), filters(0) {
init(id, direction, adoptedFilter, status);
trans(0), filters(0), compoundRBTIndex(-1) {
init(id, direction, adoptedFilter, -1, 0, TRUE, status);
}
CompoundTransliterator::CompoundTransliterator(const UnicodeString& id,
UErrorCode& status) :
Transliterator(id, 0), // set filter to 0 here!
trans(0), filters(0) {
init(id, UTRANS_FORWARD, 0, status);
trans(0), filters(0), compoundRBTIndex(-1) {
init(id, UTRANS_FORWARD, 0, -1, 0, TRUE, status);
}
/**
 * Private constructor for compound RBTs.  Builds a compound
 * transliterator from the ';'-separated entries of idBlock, with
 * adoptedTrans inserted at offset idSplitPoint within idBlock.
 * @param ID the facade ID given to this object.  Because it is applied
 * as-is, init() is told not to rebuild the ID (fixReverseID is FALSE).
 * @param idBlock the ::id block to instantiate; always parsed in the
 * FORWARD direction here.
 * @param idSplitPoint offset into idBlock at which adoptedTrans is
 * inserted.
 * @param adoptedTrans the transliterator to insert; it is handed to
 * init(), which takes responsibility for it.
 * @param status the error code indicating success or failure.
 */
CompoundTransliterator::CompoundTransliterator(const UnicodeString& ID,
                                               const UnicodeString& idBlock,
                                               int32_t idSplitPoint,
                                               Transliterator *adoptedTrans,
                                               UErrorCode& status) :
    Transliterator(ID, 0),
    // count must start at 0, as in the array constructor: init() returns
    // before assigning it when status is already a failure.
    trans(0), filters(0), count(0), compoundRBTIndex(-1) {
    init(idBlock, UTRANS_FORWARD, 0, idSplitPoint, adoptedTrans, FALSE, status);
}
/**
 * Private constructor for Transliterator from a vector of
 * transliterators.  The vector order is FORWARD, so if dir is REVERSE
 * then the vector order will be reversed.
 * @param ID the ID given to this object; init() is asked to rebuild
 * it for REVERSE (fixReverseID is TRUE).
 * @param dir either FORWARD or REVERSE.
 * @param list the transliterators to adopt, in FORWARD order.
 * @param status the error code indicating success or failure.
 */
CompoundTransliterator::CompoundTransliterator(const UnicodeString& ID,
                                               UTransDirection dir,
                                               UVector& list,
                                               UErrorCode& status) :
    Transliterator(ID, 0),
    // count must start at 0, as in the array constructor: init() only
    // assigns it when status is a success on entry.
    trans(0), filters(0), count(0), compoundRBTIndex(-1) {
    init(list, dir, 0, TRUE, status);
}
/**
* Finish constructing a transliterator: only to be called by
* constructors. Before calling init(), set trans and filter to NULL.
* @param id the id containing ';'-separated entries
* @param direction either FORWARD or REVERSE
* @param adoptedFilter a filter object to be owned by this transliterator.
* May be NULL.
* @param idSplitPoint the index into id at which the
* adoptedSplitTransliterator should be inserted, if there is one, or
* -1 if there is none.
* @param adoptedSplitTransliterator a transliterator to be inserted
* before the entry at offset idSplitPoint in the id string. May be
* NULL to insert no entry.
* @param fixReverseID if TRUE, then reconstruct the ID of reverse
* entries by calling getID() of component entries. Some constructors
* do not require this because they apply a facade ID anyway.
* @param status the error code indicating success or failure
*/
void CompoundTransliterator::init(const UnicodeString& id,
UTransDirection direction,
UnicodeFilter* adoptedFilter,
int32_t idSplitPoint,
Transliterator *adoptedSplitTrans,
UBool fixReverseID,
UErrorCode& status) {
if (U_FAILURE(status))
return;
UnicodeString* list = split(id, ID_DELIM, &count);
trans = new Transliterator*[count];
for (int32_t i = 0; i < count; ++i) {
trans[i] = createInstance(list[direction==UTRANS_FORWARD ? i : (count-1-i)],
direction);
if (trans[i] == NULL) {
while (++i < count)
trans[i] = 0;
status = U_ILLEGAL_ARGUMENT_ERROR;
delete[] list;
delete adoptedFilter;
return;
}
}
delete[] list;
// assert(trans == 0);
// assert(filters == 0);
// If the direction is UTRANS_REVERSE then we need to fix
// the ID.
if (direction == UTRANS_REVERSE) {
if (U_FAILURE(status)) {
delete adoptedFilter;
delete adoptedSplitTrans;
return;
}
UVector list;
Transliterator::parseCompoundID(id, direction,
idSplitPoint, adoptedSplitTrans,
list, compoundRBTIndex,
NULL, status);
init(list, direction, adoptedFilter, fixReverseID, status);
}
/**
* Finish constructing a transliterator: only to be called by
* constructors. Before calling init(), set trans and filter to NULL.
* @param list a vector of transliterator objects to be adopted. It
* should NOT be empty. The list should be in declared order. That
* is, it should be in the FORWARD order; if direction is REVERSE then
* the list order will be reversed.
* @param direction either FORWARD or REVERSE
* @param adoptedFilter a filter object to be owned by this transliterator.
* May be NULL.
* @param fixReverseID if TRUE, then reconstruct the ID of reverse
* entries by calling getID() of component entries. Some constructors
* do not require this because they apply a facade ID anyway.
* @param status the error code indicating success or failure
*/
void CompoundTransliterator::init(UVector& list,
UTransDirection direction,
UnicodeFilter* adoptedFilter,
UBool fixReverseID,
UErrorCode& status) {
// assert(trans == 0);
// assert(filters == 0);
// Allocate array
if (U_SUCCESS(status)) {
count = list.size();
trans = new Transliterator*[count];
}
if (U_FAILURE(status) || trans == 0) {
delete adoptedFilter;
// assert(trans == 0);
return;
}
// Move the transliterators from the vector into an array.
// Reverse the order if necessary.
int32_t i;
for (i=0; i<count; ++i) {
int32_t j = (direction == UTRANS_FORWARD) ? i : count - 1 - i;
trans[i] = (Transliterator*) list.elementAt(j);
}
// Fix compoundRBTIndex for REVERSE transliterators
if (compoundRBTIndex >= 0 && direction == UTRANS_REVERSE) {
compoundRBTIndex = count - 1 - compoundRBTIndex;
}
// If the direction is UTRANS_REVERSE then we may need to fix the
// ID.
if (direction == UTRANS_REVERSE && fixReverseID) {
UnicodeString newID;
for (int32_t i=0; i<count; ++i) {
for (i=0; i<count; ++i) {
if (i > 0) {
newID.append(ID_DELIM);
}
@ -113,35 +212,35 @@ UnicodeString CompoundTransliterator::joinIDs(Transliterator* const transliterat
return id; // Return temporary
}
/**
* Splits a string, as in JavaScript
*/
UnicodeString* CompoundTransliterator::split(const UnicodeString& s,
UChar divider,
int32_t* countPtr) {
// changed MED
// see how many there are
*countPtr = 1;
int32_t i;
for (i = 0; i < s.length(); ++i) {
if (s.charAt(i) == divider)
++(*countPtr);
}
// make an array with them
UnicodeString* result = new UnicodeString[*countPtr];
int32_t last = 0;
int32_t current = 0;
for (i = 0; i < s.length(); ++i) {
if (s.charAt(i) == divider) {
s.extractBetween(last, i, result[current++]);
last = i+1;
}
}
s.extractBetween(last, i, result[current]);
return result;
}
///**
// * Splits a string, as in JavaScript
// */
//UnicodeString* CompoundTransliterator::split(const UnicodeString& s,
// UChar divider,
// int32_t* countPtr) {
// // changed MED
// // see how many there are
// *countPtr = 1;
// int32_t i;
// for (i = 0; i < s.length(); ++i) {
// if (s.charAt(i) == divider)
// ++(*countPtr);
// }
//
// // make an array with them
// UnicodeString* result = new UnicodeString[*countPtr];
// int32_t last = 0;
// int32_t current = 0;
//
// for (i = 0; i < s.length(); ++i) {
// if (s.charAt(i) == divider) {
// s.extractBetween(last, i, result[current++]);
// last = i+1;
// }
// }
// s.extractBetween(last, i, result[current]);
// return result;
//}
/**
* Copy constructor.
@ -301,73 +400,102 @@ void CompoundTransliterator::adoptFilter(UnicodeFilter* f) {
Transliterator::adoptFilter(f);
}
/**
 * Writes a rule-source form of this compound transliterator into
 * rulesSource, replacing its previous contents, and returns it.
 *
 * Component transliterators contribute the base-class
 * Transliterator::toRules() form rather than their own rules;
 * concatenating the rules of several rule-based components is not
 * what we want.  The one exception is the component at
 * compoundRBTIndex (when >= 0), whose own toRules() is called.
 *
 * @param rulesSource output string; truncated, then filled in.
 * @param escapeUnprintable passed through to each component.
 * @return rulesSource
 */
UnicodeString& CompoundTransliterator::toRules(UnicodeString& rulesSource,
                                               UBool escapeUnprintable) const {
    const UChar NEWLINE = 10;

    rulesSource.truncate(0);

    int32_t idx = 0;
    while (idx < count) {
        // Text contributed by this component
        UnicodeString piece;
        if (idx != compoundRBTIndex) {
            trans[idx]->Transliterator::toRules(piece, escapeUnprintable);
        } else {
            trans[idx]->toRules(piece, escapeUnprintable);
        }

        // Start each entry on its own line
        int32_t len = rulesSource.length();
        if (len != 0 && rulesSource.charAt(len - 1) != NEWLINE) {
            rulesSource.append(NEWLINE);
        }

        rulesSource.append(piece);

        // Terminate each entry with the ID delimiter
        len = rulesSource.length();
        if (len != 0 && rulesSource.charAt(len - 1) != ID_DELIM) {
            rulesSource.append(ID_DELIM);
        }

        ++idx;
    }
    return rulesSource;
}
/**
* Implements {@link Transliterator#handleTransliterate}.
*/
void CompoundTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
UBool incremental) const {
/* Call each transliterator with the same start value and
* initial cursor index, but with the limit index as modified
* by preceding transliterators. The cursor index must be
/* Call each transliterator with the same contextStart and
* start, but with the limit as modified
* by preceding transliterators. The start index must be
* reset for each transliterator to give each a chance to
* transliterate the text. The initial cursor index is known
* transliterate the text. The initial contextStart index is known
* to still point to the same place after each transliterator
* is called because each transliterator will not change the
* text between start and the initial value of cursor.
* text between contextStart and the initial start index.
*
* IMPORTANT: After the first transliterator, each subsequent
* transliterator only gets to transliterate text committed by
* preceding transliterators; that is, the cursor (output
* preceding transliterators; that is, the start (output
* value) of transliterator i becomes the limit (input value)
* of transliterator i+1. Finally, the overall limit is fixed
* up before we return.
*
* Assumptions we make here:
* (1) start <= cursor <= limit ;cursor valid on entry
* (2) cursor <= cursor' <= limit' ;cursor doesn't move back
* (3) cursor <= limit' ;text before cursor unchanged
* - cursor' is the value of cursor after calling handleKT
* (1) contextStart <= start <= limit ;cursor valid on entry
* (2) start <= start' <= limit' ;cursor doesn't move back
* (3) start <= limit' ;text before start unchanged
* - start' is the value of start after calling handleKT
* - limit' is the value of limit after calling handleKT
*/
/**
* Example: 3 transliterators. This example illustrates the
* mechanics we need to implement. S, C, and L are the start,
* cursor, and limit. gl is the globalLimit.
* mechanics we need to implement. C, S, and L are the contextStart,
* start, and limit. gl is the globalLimit.
*
* 1. h-u, changes hex to Unicode
*
* 4 7 a d 0 4 7 a
* abc/u0061/u => abca/u
* S C L S C L gl=f->a
* C S L C S L gl=f->a
*
* 2. upup, changes "x" to "XX"
*
* 4 7 a 4 7 a
* abca/u => abcAA/u
* S CL S C
* C SL C S
* L gl=a->b
* 3. u-h, changes Unicode to hex
*
* 4 7 a 4 7 a d 0 3
* abcAA/u => abc/u0041/u0041/u
* S C L S C
* C S L C S
* L gl=b->15
* 4. return
*
* 4 7 a d 0 3
* abc/u0041/u0041/u
* S C L
* C S L
*/
if (count < 1) {
index.start = index.limit;
return; // Short circuit for empty compound transliterators
}
int32_t i;
int32_t cursor = index.start;
int32_t limit = index.limit;
int32_t globalLimit = limit;
int32_t start = index.start;
int32_t globalLimit = index.limit;
/* globalLimit is the overall limit. We keep track of this
* since we overwrite index.limit with the previous
* index.start. After each transliteration, we update
@ -375,16 +503,16 @@ void CompoundTransliterator::handleTransliterate(Replaceable& text, UTransPositi
*/
for (i=0; i<count; ++i) {
index.start = cursor; // Reset cursor
index.limit = limit;
index.start = start; // Reset start
int32_t limit = index.limit;
trans[i]->handleTransliterate(text, index, incremental);
// Adjust overall limit for insertions/deletions
globalLimit += index.limit - limit;
limit = index.start; // Move limit to end of committed text
index.limit = index.start; // Move limit to end of committed text
}
// Cursor is good where it is -- where the last
// Start is good where it is -- where the last
// transliterator left it. Limit needs to be put back
// where it was, modulo adjustments for deletions/insertions.
index.limit = globalLimit;

View File

@ -15,10 +15,10 @@
*/
void NormalizationTransliterator::registerIDs() {
UErrorCode status = U_ZERO_ERROR;
Transliterator::_registerFactory(UnicodeString("NFC", ""), _createNFC, status);
Transliterator::_registerFactory(UnicodeString("NFKC", ""), _createNFKC, status);
Transliterator::_registerFactory(UnicodeString("NFD", ""), _createNFD, status);
Transliterator::_registerFactory(UnicodeString("NFKD", ""), _createNFKD, status);
Transliterator::_registerFactory(UnicodeString("Any-NFC", ""), _createNFC, status);
Transliterator::_registerFactory(UnicodeString("Any-NFKC", ""), _createNFKC, status);
Transliterator::_registerFactory(UnicodeString("Any-NFD", ""), _createNFD, status);
Transliterator::_registerFactory(UnicodeString("Any-NFKD", ""), _createNFKD, status);
}
/**

View File

@ -22,7 +22,7 @@ void RuleBasedTransliterator::_construct(const UnicodeString& rules,
data = 0;
isDataOwned = TRUE;
if (U_SUCCESS(status)) {
data = TransliterationRuleParser::parse(rules, direction, parseError);
data = TransliteratorParser::parse(rules, direction, parseError);
if (data == 0) {
status = U_ILLEGAL_ARGUMENT_ERROR;
} else {
@ -40,6 +40,18 @@ RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
setMaximumContextLength(data->ruleSet.getMaximumContextLength());
}
/**
* Internal constructor.  Wraps an already-parsed rule data object
* instead of parsing a rule string.
* @param id the ID passed to the Transliterator base class; no filter
* is set (the base is constructed with a 0 filter).
* @param theData the rule data to use.  Must not be NULL: it is
* dereferenced here to obtain the maximum context length.
* @param isDataAdopted stored as isDataOwned; presumably TRUE means
* this object takes ownership of theData -- confirm against the
* destructor.
*/
RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
TransliterationRuleData* theData,
UBool isDataAdopted) :
Transliterator(id, 0),
data(theData),
isDataOwned(isDataAdopted) {
// Take the maximum context length from the rule set
setMaximumContextLength(data->ruleSet.getMaximumContextLength());
}
/**
* Copy constructor. Since the data object is immutable, we can share
* it with other objects -- no need to clone it.

View File

@ -48,7 +48,11 @@
// trailing SymbolTable.SYMBOL_REF character.
// private static final char ANCHOR_END = '$';
const UnicodeString TransliterationRuleParser::gOPERATORS = OPERATORS;
const UnicodeString TransliteratorParser::gOPERATORS = OPERATORS;
// These are also used in Transliterator::toRules()
static const int32_t ID_TOKEN_LEN = 2;
static const UChar ID_TOKEN[] = { 0x3A, 0x3A }; // ':', ':'
//----------------------------------------------------------------------
// BEGIN ParseData
@ -167,14 +171,14 @@ public:
UBool anchorStart;
UBool anchorEnd;
TransliterationRuleParser& parser;
TransliteratorParser& parser;
static const UnicodeString gOperators;
//--------------------------------------------------
// Methods
RuleHalf(TransliterationRuleParser& parser);
RuleHalf(TransliteratorParser& parser);
~RuleHalf();
/**
@ -220,7 +224,7 @@ inline int32_t _voidPtr_to_int32(void* x) {
const UnicodeString RuleHalf::gOperators = OPERATORS;
RuleHalf::RuleHalf(TransliterationRuleParser& p) : parser(p) {
RuleHalf::RuleHalf(TransliteratorParser& p) : parser(p) {
cursor = -1;
ante = -1;
post = -1;
@ -487,24 +491,62 @@ int32_t* RuleHalf::createSegments() const {
//----------------------------------------------------------------------
TransliterationRuleData*
TransliterationRuleParser::parse(const UnicodeString& rules,
UTransDirection direction,
UParseError* parseError) {
TransliterationRuleParser parser(rules, direction, parseError);
parser.parseRules();
if (U_FAILURE(parser.status)) {
TransliteratorParser::parse(const UnicodeString& rules,
UTransDirection direction,
UParseError* parseError) {
TransliteratorParser parser(rules, direction, parseError);
UnicodeString idBlock;
int32_t idSplitPoint, count;
parser.parseRules(idBlock, idSplitPoint, count);
if (U_FAILURE(parser.status) || idBlock.length() != 0) {
delete parser.data;
parser.data = 0;
}
return parser.data;
}
/**
* Parse a given set of rules. Return up to three pieces of
* parsed data. These are the header ::id block, the rule block,
* and the footer ::id block. Any or all of these may be empty.
* If the ::id blocks are empty, their corresponding parameters
* are returned as the empty string. If there are no rules, the
* TransliterationRuleData result is 0.
* @param ruleDataResult caller owns the pointer stored here.
* May be NULL.
* @param headerRule string including semicolons for the header
* ::id block. May be empty.
* @param footerRule string including semicolons for the footer
* ::id block. May be empty.
*/
void TransliteratorParser::parse(const UnicodeString& rules,
UTransDirection direction,
TransliterationRuleData*& ruleDataResult,
UnicodeString& idBlockResult,
int32_t& idSplitPointResult,
UParseError* parseError,
UErrorCode& ec) {
if (U_FAILURE(ec)) {
ruleDataResult = 0;
return;
}
TransliteratorParser parser(rules, direction, parseError);
int32_t count;
parser.parseRules(idBlockResult, idSplitPointResult, count);
if (U_FAILURE(parser.status) || count == 0) {
delete parser.data;
parser.data = 0;
}
ruleDataResult = parser.data;
ec = parser.status;
}
/**
* @param rules list of rules, separated by newline characters
* @exception IllegalArgumentException if there is a syntax error in the
* rules
*/
TransliterationRuleParser::TransliterationRuleParser(
TransliteratorParser::TransliteratorParser(
const UnicodeString& theRules,
UTransDirection theDirection,
UParseError* theParseError) :
@ -515,7 +557,7 @@ TransliterationRuleParser::TransliterationRuleParser(
/**
* Destructor.
*/
TransliterationRuleParser::~TransliterationRuleParser() {
TransliteratorParser::~TransliteratorParser() {
delete parseData;
}
@ -527,8 +569,11 @@ TransliterationRuleParser::~TransliterationRuleParser() {
* @exception IllegalArgumentException if there is a syntax error in the
* rules
*/
void TransliterationRuleParser::parseRules(void) {
void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
int32_t& idSplitPointResult,
int32_t& ruleCount) {
status = U_ZERO_ERROR;
ruleCount = 0;
delete data;
data = new TransliterationRuleData(status);
@ -543,14 +588,21 @@ void TransliterationRuleParser::parseRules(void) {
}
determineVariableRange();
UnicodeString str; // scratch
idBlockResult.truncate(0);
idSplitPointResult = -1;
int32_t pos = 0;
int32_t limit = rules.length();
// The mode marks whether we are in the header ::id block, the
// rule block, or the footer ::id block.
// mode == 0: start: rule->1, ::id->0
// mode == 1: in rules: rule->1, ::id->2
// mode == 2: in footer ::id block: rule->ERROR, ::id->2
int32_t mode = 0;
while (pos < limit && U_SUCCESS(status)) {
UChar c = rules.charAt(pos++);
if (Unicode::isWhitespace(c)) {
// Ignore leading whitespace. Note that this is not
// Unicode spaces, but Java spaces -- a subset,
// representing whitespace likely to be seen in code.
// Ignore leading whitespace.
continue;
}
// Skip lines starting with the comment character
@ -561,10 +613,50 @@ void TransliterationRuleParser::parseRules(void) {
}
continue; // Either fall out or restart with next line
}
// We've found the start of a rule. c is its first
// character, and pos points past c. Lexically parse the
// rule into component pieces.
pos = parseRule(--pos, limit);
// We've found the start of a rule or ID. c is its first
// character, and pos points past c.
--pos;
// Look for an ID token. Must have at least ID_TOKEN_LEN + 1
// chars left.
if ((pos + ID_TOKEN_LEN + 1) <= limit &&
rules.compare(pos, ID_TOKEN_LEN, ID_TOKEN) == 0) {
pos += ID_TOKEN_LEN;
c = rules.charAt(pos);
while (Unicode::isWhitespace(c) && pos < limit) {
++pos;
c = rules.charAt(pos);
}
int32_t p = pos;
UBool sawDelim;
Transliterator::parseID(rules, p, sawDelim, direction, NULL, FALSE);
if (p == pos) {
// Invalid ::id
status = U_ILLEGAL_ARGUMENT_ERROR;
} else {
if (mode == 1) {
mode = 2;
idSplitPointResult = idBlockResult.length();
}
rules.extractBetween(pos, p, str);
idBlockResult.append(str);
if (!sawDelim) {
idBlockResult.append((UChar)0x003B /*;*/);
}
pos = p;
}
} else {
// Parse a rule
pos = parseRule(pos, limit);
if (U_SUCCESS(status)) {
++ruleCount;
if (mode == 2) {
// Rule in illegal position (because it
// occurred after the ::id footer block)
status = U_ILLEGAL_ARGUMENT_ERROR;
}
}
mode = 1;
}
}
// Convert the set vector to an array
@ -573,7 +665,8 @@ void TransliterationRuleParser::parseRules(void) {
// orphanElement removes the given element and shifts all other
// elements down. For performance (and code clarity) we work from
// the end back to index 0.
for (int32_t i=data->setVariablesLength; i>0; ) {
int32_t i;
for (i=data->setVariablesLength; i>0; ) {
--i;
data->setVariables[i] =
(UnicodeSet*) setVariablesVector.orphanElementAt(i);
@ -582,6 +675,9 @@ void TransliterationRuleParser::parseRules(void) {
// Index the rules
if (U_SUCCESS(status)) {
data->ruleSet.freeze(*data, status);
if (idSplitPointResult < 0) {
idSplitPointResult = idBlockResult.length();
}
}
}
@ -598,7 +694,7 @@ void TransliterationRuleParser::parseRules(void) {
* indicators. Once it does a lexical breakdown of the rule at pos, it
* creates a rule object and adds it to our rule list.
*/
int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
int32_t TransliteratorParser::parseRule(int32_t pos, int32_t limit) {
// Locate the left side, operator, and right side
int32_t start = pos;
UChar op = 0;
@ -759,7 +855,7 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
* @param rule pattern string
* @param start position of first character of current rule
*/
int32_t TransliterationRuleParser::syntaxError(int32_t parseErrorCode,
int32_t TransliteratorParser::syntaxError(int32_t parseErrorCode,
const UnicodeString& rule,
int32_t start) {
if (parseError != 0) {
@ -786,7 +882,7 @@ int32_t TransliterationRuleParser::syntaxError(int32_t parseErrorCode,
* Parse a UnicodeSet out, store it, and return the stand-in character
* used to represent it.
*/
UChar TransliterationRuleParser::parseSet(const UnicodeString& rule,
UChar TransliteratorParser::parseSet(const UnicodeString& rule,
ParsePosition& pos) {
UnicodeSet* set = new UnicodeSet(rule, pos, *parseData, status);
if (variableNext >= variableLimit) {
@ -804,7 +900,7 @@ UChar TransliterationRuleParser::parseSet(const UnicodeString& rule,
* Append the value of the given variable name to the given
* UnicodeString.
*/
void TransliterationRuleParser::appendVariableDef(const UnicodeString& name,
void TransliteratorParser::appendVariableDef(const UnicodeString& name,
UnicodeString& buf) {
const UnicodeString* s = (const UnicodeString*) data->variableNames->get(name);
if (s == NULL) {
@ -839,7 +935,7 @@ void TransliterationRuleParser::appendVariableDef(const UnicodeString& name,
* When done, everything not in the hash is available for use. In practice,
* this method may employ some other algorithm for improved speed.
*/
void TransliterationRuleParser::determineVariableRange(void) {
void TransliteratorParser::determineVariableRange(void) {
UnicodeRange privateUse(0xE000, 0x1900); // Private use area
UnicodeRange* r = privateUse.largestUnusedSubrange(rules);
@ -864,7 +960,7 @@ void TransliterationRuleParser::determineVariableRange(void) {
* For example, in the string "abc'hide'h", the 'h' in "hide" will not be
* found by a search for 'h'.
*/
int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text,
int32_t TransliteratorParser::quotedIndexOf(const UnicodeString& text,
int32_t start, int32_t limit,
UChar charToFind) {
for (int32_t i=start; i<limit; ++i) {

View File

@ -18,7 +18,7 @@ class ParseData;
class RuleHalf;
class ParsePosition;
class TransliterationRuleParser {
class TransliteratorParser {
/**
* This is a reference to external data we don't own. This works because
@ -87,6 +87,28 @@ public:
UTransDirection direction,
UParseError* parseError = 0);
/**
* Parse a given set of rules. Return up to three pieces of
* parsed data. These are the header ::id block, the rule block,
* and the footer ::id block. Any or all of these may be empty.
* If the ::id blocks are empty, their corresponding parameters
* are returned as the empty string. If there are no rules, the
* TransliterationRuleData result is 0.
* @param ruleDataResult caller owns the pointer stored here.
* May be NULL.
* @param idBlockResult receives the header and footer ::id entries,
* concatenated, each terminated by a semicolon. May be empty.
* @param idSplitPointResult receives the offset into idBlockResult
* at which the footer ::id entries begin (where the rule block falls).
*/
static void parse(const UnicodeString& rules,
UTransDirection direction,
TransliterationRuleData*& ruleDataResult,
UnicodeString& idBlockResult,
int32_t& idSplitPointResult,
UParseError* parseError,
UErrorCode& ec);
private:
/**
@ -94,14 +116,14 @@ private:
* @exception IllegalArgumentException if there is a syntax error in the
* rules
*/
TransliterationRuleParser(const UnicodeString& rules,
TransliteratorParser(const UnicodeString& rules,
UTransDirection direction,
UParseError* parseError = 0);
/**
* Destructor.
*/
~TransliterationRuleParser();
~TransliteratorParser();
/**
* Parse the given string as a sequence of rules, separated by newline
@ -111,7 +133,8 @@ private:
* @exception IllegalArgumentException if there is a syntax error in the
* rules
*/
void parseRules(void);
void parseRules(UnicodeString& idBlockResult, int32_t& idSplitPointResult,
int32_t& ruleCount);
/**
* MAIN PARSER. Parse the next rule in the given rule string, starting
@ -139,13 +162,6 @@ private:
*/
int32_t syntaxError(int32_t parseErrorCode, const UnicodeString&, int32_t start);
/**
* Allocate a private-use substitution character for the given set,
* register it in the setVariables hash, and return the substitution
* character.
*/
//UChar registerSet(UnicodeSet* adoptedSet);
/**
* Parse a UnicodeSet out, store it, and return the stand-in character
* used to represent it.
@ -189,8 +205,8 @@ private:
friend class RuleHalf;
// Disallowed methods; no impl.
TransliterationRuleParser(const TransliterationRuleParser&);
TransliterationRuleParser& operator=(const TransliterationRuleParser&);
TransliteratorParser(const TransliteratorParser&);
TransliteratorParser& operator=(const TransliteratorParser&);
};
#endif

View File

@ -9,8 +9,7 @@
*/
#include "unicode/remtrans.h"
//const UnicodeString RemoveTransliterator::ID = UnicodeString("Remove", "");
const UChar RemoveTransliterator::ID[] = {0x52, 0x65, 0x6D, 0x6F, 0x76, 0x65, 0x00}; /* "Remove" */
const UChar RemoveTransliterator::ID[] = {65, 110, 121, 45, 0x52, 0x65, 0x6D, 0x6F, 0x76, 0x65, 0x00}; /* "Any-Remove" */
Transliterator* RemoveTransliterator::clone(void) const {
return new RemoveTransliterator();

View File

@ -565,91 +565,347 @@ Transliterator* Transliterator::createInverse(void) const {
Transliterator* Transliterator::createInstance(const UnicodeString& ID,
UTransDirection dir,
UParseError* parseError) {
Transliterator* t = 0;
if (ID.indexOf(ID_DELIM) >= 0) {
UErrorCode status = U_ZERO_ERROR;
t = new CompoundTransliterator(ID, dir, 0, status);
if (U_FAILURE(status)) {
delete t;
t = 0;
UErrorCode status = U_ZERO_ERROR;
return createInstance(ID, dir, -1, NULL, parseError, status);
}
/**
 * Create a transliterator given a compound ID (possibly degenerate,
 * with no ID_DELIM).  If idSplitPoint >= 0 and adoptedSplitTrans != 0,
 * then adoptedSplitTrans is inserted in the compound ID at offset
 * idSplitPoint.  Otherwise idSplitPoint should be -1 and
 * adoptedSplitTrans should be 0.
 *
 * The result depends on how many entries the ID parses to:
 * none yields a NullTransliterator, exactly one yields that atomic
 * (non-compound) transliterator, and more than one yields a compound
 * transliterator.
 *
 * @return the new transliterator, or 0 if status is a failure on
 * entry or after parsing the ID.
 */
Transliterator* Transliterator::createInstance(const UnicodeString& ID,
                                               UTransDirection dir,
                                               int32_t idSplitPoint,
                                               Transliterator *adoptedSplitTrans,
                                               UParseError* parseError,
                                               UErrorCode& status) {
    if (U_FAILURE(status)) {
        return 0;
    }

    UVector list;
    int32_t splitIndex; // required by parseCompoundID(); not used here
    parseCompoundID(ID, dir, idSplitPoint, adoptedSplitTrans,
                    list, splitIndex, parseError, status);
    if (U_FAILURE(status)) {
        return 0;
    }

    const int32_t entries = list.size();
    if (entries == 0) {
        return new NullTransliterator();
    }
    if (entries == 1) {
        return (Transliterator*) list.elementAt(0);
    }
    return new CompoundTransliterator(ID, dir, list, status);
}
/**
* Returns a <code>Transliterator</code> object constructed from
* the given rule string. This will be a RuleBasedTransliterator,
* if the rule string contains only rules, or a
* CompoundTransliterator, if it contains ID blocks, or a
* NullTransliterator, if it contains ID blocks which parse as
* empty for the given direction.
*/
Transliterator* Transliterator::createFromRules(const UnicodeString& ID,
const UnicodeString& rules,
UTransDirection dir,
UParseError* parseError) {
UnicodeString idBlock;
int32_t idSplitPoint = -1;
TransliterationRuleData *data = 0;
UErrorCode status = U_ZERO_ERROR;
TransliteratorParser::parse(rules, dir, data,
idBlock, idSplitPoint,
parseError, status);
if (U_FAILURE(status)) {
delete data;
return 0;
}
// NOTE: The logic here matches that in _createInstance().
if (idBlock.length() == 0) {
if (data == 0) {
// No idBlock, no data -- this is just an
// alias for Null
return new NullTransliterator();
} else {
// No idBlock, data != 0 -- this is an
// ordinary RBT_DATA.
return new RuleBasedTransliterator(ID, data, TRUE); // TRUE == adopt data object
}
} else {
// 'id' is the ID with the filter pattern removed and with
// whitespace deleted.
UnicodeString id(ID);
// Look for embedded filter pattern
UnicodeSet *filter = 0;
int32_t setStart = id.indexOf((UChar)0x005B /*[*/);
int32_t setLimit;
if (setStart >= 0) {
UErrorCode status = U_ZERO_ERROR;
ParsePosition pos(setStart);
filter = new UnicodeSet();
filter->applyPattern(id, pos, 0, status);
if (data == 0) {
// idBlock, no data -- this is an alias
Transliterator *t = createInstance(idBlock, dir, parseError);
if (t != 0) {
t->setID(ID);
}
return t;
} else {
// idBlock and data -- this is a compound
// RBT
UnicodeString id("_", "");
Transliterator *t = new RuleBasedTransliterator(id, data, TRUE); // TRUE == adopt data object
t = new CompoundTransliterator(ID, idBlock, idSplitPoint,
t, status);
if (U_FAILURE(status)) {
// There was a parse failure in the filter pattern
delete filter;
return 0;
delete t;
t = 0;
}
setLimit = pos.getIndex();
id.removeBetween(setStart, setLimit);
}
// Delete whitespace
int32_t i;
for (i=0; i<id.length(); ++i) {
if (Unicode::isWhitespace(id.charAt(i))) {
id.remove(i, 1);
--i;
}
}
// Fix the id, if necessary, by reversing it (A-B => B-A).
// Record the position of the separator. Detect the special
// case of Null, whose inverse is itself. Given an ID with no
// separator "Foo", an abbreviation for "Any-Foo", consider
// the inverse to be "Foo-Any".
int32_t sep = id.indexOf(ID_SEP);
if (id.caseCompare(NullTransliterator::ID,
U_FOLD_CASE_DEFAULT) == 0) {
sep = id.length();
} else if (dir == UTRANS_REVERSE) {
UnicodeString left;
if (sep >= 0) {
id.extractBetween(0, sep, left);
id.removeBetween(0, sep+1);
} else {
left = UnicodeString("Any", "");
}
sep = id.length();
id.append(ID_SEP).append(left);
} else if (sep < 0) {
sep = id.length();
}
// The 'alias' parameter is non-empty if _createInstance()
// finds that the given ID refers to an alias. The reason
// _createInstance() doesn't call createInstance() (this
// method) directly is to avoid deadlock. There are other
// ways to do this but this is one of the more efficient ways.
UnicodeString alias;
t = _createInstance(id, alias, parseError);
if (alias.length() > 0) { // assert(t==0)
t = createInstance(alias);
}
if (t != 0) {
if (filter != 0) {
t->adoptFilter(filter);
id.insert(sep, ID, setStart, setLimit-setStart);
}
t->setID(id);
return t;
}
}
}
/**
 * Base class implementation of toRules(): produces this
 * transliterator's ID in ::id form.  That is: foo => ::foo
 * @param rulesSource output string; its previous contents are replaced.
 * @param escapeUnprintable not used by this implementation.
 * @return rulesSource
 */
UnicodeString& Transliterator::toRules(UnicodeString& rulesSource,
                                       UBool escapeUnprintable) const {
    // KEEP in sync with rbt_pars
    UnicodeString prefixed("::", "");
    prefixed.append(getID());
    rulesSource = prefixed;
    return rulesSource;
}
/**
 * Parse a compound ID (possibly a degenerate one, containing no
 * ID_DELIM).  If idSplitPoint >= 0 and adoptedSplitTrans != 0, then
 * insert adoptedSplitTrans in the compound ID at offset idSplitPoint.
 * Otherwise idSplitPoint should be -1 and adoptedSplitTrans should be
 * 0.  Return in the result vector the instantiated transliterator
 * objects (one of these will be adoptedSplitTrans, if the latter was
 * specified).  These will be in order of id, so if dir is REVERSE,
 * then the caller will have to reverse the order.
 *
 * If parsing fails, status is set to U_ILLEGAL_ARGUMENT_ERROR, every
 * element of 'result' is deleted, 'result' is emptied,
 * adoptedSplitTrans is deleted, and splitTransIndex is -1.
 *
 * NOTE(review): if status already indicates failure on entry, this
 * function returns immediately; adoptedSplitTrans is not deleted and
 * splitTransIndex is not written in that case.  Confirm that callers
 * account for this.
 *
 * @param id the compound ID to parse
 * @param dir direction passed through to parseID() for each element
 * @param idSplitPoint offset in id at which adoptedSplitTrans is to
 * be inserted, or -1
 * @param adoptedSplitTrans transliterator to insert, or 0
 * @param result vector that receives the transliterator objects
 * @param splitTransIndex output parameter to receive the index in
 * 'result' at which the adoptedSplitTrans is stored, or -1 if
 * adoptedSplitTrans == 0 or if parsing failed
 * @param parseError passed through to parseID()
 * @param status input-output error code
 */
void Transliterator::parseCompoundID(const UnicodeString& id,
                                     UTransDirection dir,
                                     int32_t idSplitPoint,
                                     Transliterator *adoptedSplitTrans,
                                     UVector& result,
                                     int32_t& splitTransIndex,
                                     UParseError* parseError,
                                     UErrorCode& status) {
    if (U_FAILURE(status)) {
        return;
    }

    splitTransIndex = -1;
    int32_t pos = 0;
    int32_t i;

    while (pos < id.length()) {
        // We compare (pos >= split), not (pos == split), so we can
        // skip over whitespace (see below).
        if (pos >= idSplitPoint && adoptedSplitTrans != 0) {
            splitTransIndex = result.size();
            result.addElement(adoptedSplitTrans);
            adoptedSplitTrans = 0;
        }

        int32_t p = pos;
        UBool sawDelimiter; // We ignore this
        Transliterator *t =
            parseID(id, p, sawDelimiter, dir, parseError, TRUE);

        // parseID() signals an error by leaving the position unchanged.
        if (p == pos) {
            delete t;
            status = U_ILLEGAL_ARGUMENT_ERROR;
            break;
        }
        pos = p;

        // The return value may be NULL when, for instance, creating a
        // REVERSE transliterator of ID "Latin-Greek()".
        if (t != 0) {
            result.addElement(t);
        }
    }

    // Handle case of idSplitPoint == id.length()
    if (pos >= idSplitPoint && adoptedSplitTrans != 0) {
        splitTransIndex = result.size();
        result.addElement(adoptedSplitTrans);
        adoptedSplitTrans = 0;
    }

    if (U_FAILURE(status)) {
        // Release everything collected so far.  If adoptedSplitTrans
        // was already stored in 'result' it is deleted by the loop and
        // the local pointer is 0, so the final delete is a no-op.
        for (i=0; i<result.size(); ++i) {
            delete (Transliterator*)result.elementAt(i);
        }
        result.removeAllElements();
        delete adoptedSplitTrans;

        // 'result' is empty now; do not hand back an index into it.
        splitTransIndex = -1;
    }
}
/**
 * Parse a single ID, possibly including an inline filter, and return
 * the resultant transliterator object.  NOTE: If 'create' is FALSE,
 * then the amount of syntax checking is limited.  However, the 'pos'
 * parameter will be updated correctly, assuming the input string is
 * valid.  (An inline filter pattern is still parsed when 'create' is
 * FALSE; a malformed pattern leaves 'pos' unchanged.)
 *
 * A trailing /;? \s* / is skipped.  The parameter sawDelimiter
 * indicates whether the ';' was seen or not.  Upon return, if pos is
 * advanced, it will either point to a non-whitespace character past
 * the trailing ';', if any, or be equal to length().
 *
 * On return one of the following will be true:
 *  pos unchanged: sawDelimiter meaningless
 *  pos == ID.length(): sawDelimiter TRUE or FALSE
 *  pos < ID.length(): sawDelimiter always TRUE
 *
 * @param ID the ID string
 * @param pos INPUT-OUTPUT parameter.  On input, the position of the
 * first character to parse.  On output, the position after the last
 * character parsed, including the trailing ';' and any whitespace
 * that follows it; this may be ID.length().  In the case of an error
 * this value will be unchanged.
 * @param sawDelimiter OUTPUT parameter; see above
 * @param dir if UTRANS_REVERSE, the source-target ID is reversed
 * (A-B => B-A) before it is looked up
 * @param parseError passed through to _createInstance() and
 * createInstance()
 * @param create if TRUE, create and return the result.  If FALSE,
 * only scan the ID, and return NULL.
 * @return a newly created transliterator, or NULL.  NULL is returned
 * in all cases if create is FALSE.  If create is TRUE, then NULL is
 * returned on error, or if the ID is effectively empty.
 * E.g. "Latin-Greek()" with dir == REVERSE.  Do NOT check for NULL to
 * determine if there was an error.  Instead, check to see if pos
 * moved.
 * NOTE(review): in this function body pos is advanced only when
 * create is FALSE or when a transliterator was actually produced.
 */
Transliterator* Transliterator::parseID(const UnicodeString& ID,
                                        int32_t& pos,
                                        UBool& sawDelimiter,
                                        UTransDirection dir,
                                        UParseError* parseError,
                                        UBool create) {
    Transliterator* t = 0;
    UnicodeString str; // scratch

    // Look for embedded filter pattern by looking for ';' and
    // '[' and seeing which comes first.
    UnicodeSet *filter = 0;
    int32_t limit = ID.indexOf(ID_DELIM, pos);
    sawDelimiter = limit >= 0;
    if (!sawDelimiter) {
        limit = ID.length();
    }
    int32_t setStart = ID.indexOf((UChar)0x005B /*[*/, pos);
    int32_t setLimit;
    if (setStart >= 0 && setStart < limit) {
        UErrorCode status = U_ZERO_ERROR;
        ParsePosition ppos(setStart);
        filter = new UnicodeSet();
        filter->applyPattern(ID, ppos, 0, status);
        if (U_FAILURE(status)) {
            // There was a parse failure in the filter pattern
            delete filter;
            return 0;
        }
        setLimit = ppos.getIndex();
        // The first ';' found was inside the set pattern; look for
        // the real delimiter after the end of the pattern.
        if (limit < setLimit) {
            limit = ID.indexOf(ID_DELIM, setLimit);
            sawDelimiter = limit >= 0;
            if (!sawDelimiter) {
                limit = ID.length();
            }
        }
    } else {
        // No filter in this ID: use an empty [setStart, setLimit) range
        setStart = setLimit = pos;
    }

    // Advance limit past /;?\s*/
    int32_t idLimit = limit; // limit before separator
    if (sawDelimiter) {
        // assert(limit < ID.length() && ID.charAt(limit) == ID_DELIM);
        ++limit;
    }
    while (limit < ID.length() && u_isspace(ID.charAt(limit))) {
        ++limit;
    }

    if (!create) {
        // TODO Improve performance by scanning the UnicodeSet pattern
        // without actually constructing it, if create is FALSE.  That
        // is, create a method like this one for UnicodeSet.
        delete filter;
        pos = limit;
        return 0;
    }

    // 'id' is the ID with the filter pattern removed and with
    // whitespace deleted.
    UnicodeString id;
    ID.extractBetween(pos, setStart, id);
    ID.extractBetween(setLimit, idLimit, str);
    id.append(str);

    // Delete whitespace
    int32_t i;
    for (i=0; i<id.length(); ++i) {
        if (Unicode::isWhitespace(id.charAt(i))) {
            id.remove(i, 1);
            --i;
        }
    }

    // Fix the id, if necessary, by reversing it (A-B => B-A).
    // Record the position of the separator.  Detect the special
    // case of Null, whose inverse is itself.  Given an ID with no
    // separator "Foo", an abbreviation for "Any-Foo", consider
    // the inverse to be "Foo-Any".
    int32_t sep = id.indexOf(ID_SEP);
    if (sep < 0 && id.caseCompare(NullTransliterator::ID,
                                  U_FOLD_CASE_DEFAULT) == 0) {
        sep = id.length();
    } else if (dir == UTRANS_REVERSE) {
        if (sep >= 0) {
            id.extractBetween(0, sep, str);
            id.removeBetween(0, sep+1);
        } else {
            str = UnicodeString("Any", "");
        }
        sep = id.length();
        id.append(ID_SEP).append(str);
    } else if (sep < 0) {
        str = UnicodeString("Any-", "");
        sep = str.length();
        id.insert(0, str);
    }

    // The 'alias' parameter is non-empty if _createInstance()
    // finds that the given ID refers to an alias.  The reason
    // _createInstance() doesn't call createInstance() (this
    // method) directly is to avoid deadlock.  There are other
    // ways to do this but this is one of the more efficient ways.
    str.truncate(0);
    t = _createInstance(id, str /*alias*/, parseError);
    if (str.length() > 0) {
        // assert(t==0);
        t = createInstance(str, UTRANS_FORWARD, parseError);
    }

    if (t != 0) {
        if (filter != 0) {
            // t takes ownership of the filter; put the filter pattern
            // text back into the ID at the separator position.
            t->adoptFilter(filter);
            id.insert(sep, ID, setStart, setLimit-setStart);
        }
        t->setID(id);
        pos = limit;
    } else {
        // Nothing adopted the inline filter, so release it here;
        // otherwise it would be leaked.  (Deleting 0 is harmless.)
        delete filter;
    }

    return t;
}
/**
* Returns a transliterator object given its ID. Unlike getInstance(),
@ -661,8 +917,6 @@ Transliterator* Transliterator::createInstance(const UnicodeString& ID,
Transliterator* Transliterator::_createInstance(const UnicodeString& ID,
UnicodeString& aliasReturn,
UParseError* parseError) {
UErrorCode status = U_ZERO_ERROR;
if (!cacheInitialized) {
initializeCache();
}
@ -672,46 +926,55 @@ Transliterator* Transliterator::_createInstance(const UnicodeString& ID,
CacheEntry* entry = (CacheEntry*) cache->get(ID);
if (entry == 0) {
entry = (CacheEntry*) internalCache->get(ID);
if (entry == 0) {
return 0; // out of memory
}
}
TransliterationRuleData* data = 0;
UErrorCode status = U_ZERO_ERROR;
if (entry == 0) {
return 0;
}
for (;;) {
if (entry->entryType == CacheEntry::RBT_DATA) {
return new RuleBasedTransliterator(ID, entry->u.data);
} else if (entry->entryType == CacheEntry::PROTOTYPE) {
return entry->u.prototype->clone();
} else if (entry->entryType == CacheEntry::ALIAS) {
// We can't call createInstance() here because of deadlock.
aliasReturn = entry->stringArg;
return 0;
} else if (entry->entryType == CacheEntry::FACTORY) {
return entry->u.factory();
} else if (entry->entryType == CacheEntry::COMPOUND_RBT) {
UnicodeString id("_", "");
Transliterator *t = new RuleBasedTransliterator(id, entry->u.data);
t = new CompoundTransliterator(ID, entry->stringArg,
entry->intArg, t, status);
if (U_FAILURE(status)) {
delete t;
t = 0;
_unregister(ID);
}
return t;
}
if (entry->entryType == CacheEntry::RBT_DATA) {
data = entry->u.data;
// Fall through to construct transliterator from cached Data object.
} else if (entry->entryType == CacheEntry::PROTOTYPE) {
return entry->u.prototype->clone();
} else if (entry->entryType == CacheEntry::ALIAS) {
// We can't call createInstance() here because of deadlock.
aliasReturn = entry->stringArg;
return 0;
} else if (entry->entryType == CacheEntry::FACTORY) {
return entry->u.factory();
} else {
// At this point entry type must be either RULES_FORWARD
// or RULES_REVERSE
// At this point entry type must be either RULES_FORWARD or
// RULES_REVERSE. We process the rule data into a
// TransliteratorRuleData object, and possibly also into an
// ::id header and/or footer. Then we modify the cache with
// the parsed data and retry.
UBool isReverse = (entry->entryType == CacheEntry::RULES_REVERSE);
// We use the file name, taken from another resource bundle
// 2-d array at static init time, as a locale language. We're
// just using the locale mechanism to map through to a file
// name; this in no way represents an actual locale.
char *ch;
ch = new char[entry->stringArg.length() + 1];
char *ch = new char[entry->stringArg.length() + 1];
ch[entry->stringArg.extract(0, 0x7fffffff, ch, "")] = 0;
Locale fakeLocale(ch);
delete [] ch;
ResourceBundle bundle((char *)0,
fakeLocale, status);
// Call RBT to parse the rules from the resource bundle
ResourceBundle bundle((char *)0, fakeLocale, status);
UnicodeString rules = bundle.getStringEx(RB_RULE, status);
// If the status indicates a failure, then we don't have any
@ -719,42 +982,54 @@ Transliterator* Transliterator::_createInstance(const UnicodeString& ID,
// in the root locale should correspond to all the installed
// transliterators; if it lists something that's not
// installed, we'll get an error from ResourceBundle.
if (U_SUCCESS(status)) {
data = TransliterationRuleParser::parse(rules, isReverse
? UTRANS_REVERSE
: UTRANS_FORWARD,
parseError);
TransliteratorParser::parse(rules, isReverse ?
UTRANS_REVERSE : UTRANS_FORWARD,
entry->u.data,
entry->stringArg,
entry->intArg,
parseError,
status);
// Double check to see if someone has modified the entry
// since we last looked at it.
if (entry->entryType != CacheEntry::RBT_DATA) {
entry->entryType = CacheEntry::RBT_DATA;
entry->u.data = data;
if (U_FAILURE(status)) {
// We have a failure of some kind. Remove the ID from the
// cache so we don't keep trying. NOTE: This will throw off
// anyone who is, at the moment, trying to iterate over the
// available IDs. That's acceptable since we should never
// really get here except under installation, configuration,
// or unrecoverable run time memory failures.
_unregister(ID);
break;
}
// Reset entry->entryType to something that we process at the
// top of the loop, then loop back to the top. As long as we
// do this, we only loop through twice at most.
// NOTE: The logic here matches that in createFromRules().
if (entry->stringArg.length() == 0) {
if (entry->u.data == 0) {
// No idBlock, no data -- this is just an
// alias for Null
entry->entryType = CacheEntry::ALIAS;
entry->stringArg = NullTransliterator::ID;
} else {
// Oops! Another thread has updated this cache entry
// already to point to a data object. Discard the
// one we just created and use the one in the cache
// instead.
delete data;
data = entry->u.data;
// No idBlock, data != 0 -- this is an
// ordinary RBT_DATA
entry->entryType = CacheEntry::RBT_DATA;
}
} else {
if (entry->u.data == 0) {
// idBlock, no data -- this is an alias
entry->entryType = CacheEntry::ALIAS;
} else {
// idBlock and data -- this is a compound
// RBT
entry->entryType = CacheEntry::COMPOUND_RBT;
}
}
}
if (data != 0) {
return new RuleBasedTransliterator(ID, data);
} else {
// We have a failure of some kind. Remove the ID from the
// cache so we don't keep trying. NOTE: This will throw off
// anyone who is, at the moment, trying to iterate over the
// available IDs. That's acceptable since we should never
// really get here except under installation, configuration,
// or unrecoverable run time memory failures.
_unregister(ID);
}
return 0;
return 0; // failed
}
// For public consumption
@ -907,10 +1182,11 @@ UChar Transliterator::filteredCharAt(const Replaceable& text, int32_t i) const {
(localFilter->contains(c = text.charAt(i)) ? c : (UChar)0xFFFE);
}
// TODO Move this into the class
// NO This should remain a C function for os/390 and Solaris Workshop [grhoten]
/**
* Comparison function for UVector.
*
* Do not make this a class static: This should remain a C function
* for os/390 and Solaris Workshop [grhoten]
*/
U_CDECL_BEGIN
static UBool U_CALLCONV

View File

@ -17,6 +17,7 @@
#include "unicode/hextouni.h"
#include "unicode/unitohex.h"
#include "unicode/unicode.h"
#include "unicode/uniset.h"
#include "unicode/ucnv.h"
#include "unicode/ucnv_err.h"
@ -61,6 +62,7 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
TESTCASE(26,TestLiberalizedID);
TESTCASE(27,TestCreateInstance);
TESTCASE(28,TestNormalizationTransliterator);
TESTCASE(29,TestCompoundRBT);
default: name = ""; break;
}
}
@ -1053,93 +1055,6 @@ void TransliteratorTest::TestLiberalizedID(void) {
}
}
//======================================================================
// Support methods
//======================================================================
void TransliteratorTest::expect(const UnicodeString& rules,
const UnicodeString& source,
const UnicodeString& expectedResult) {
UErrorCode status = U_ZERO_ERROR;
Transliterator *t = new RuleBasedTransliterator("<ID>", rules, status);
if (U_FAILURE(status)) {
errln("FAIL: Transliterator constructor failed");
} else {
expect(*t, source, expectedResult);
}
delete t;
}
/**
 * Check a transliterator together with its reverse: t must map source
 * to expectedResult, and reverseTransliterator must map
 * expectedResult back to source.
 */
void TransliteratorTest::expect(const Transliterator& t,
                                const UnicodeString& source,
                                const UnicodeString& expectedResult,
                                const Transliterator& reverseTransliterator) {
    // Forward direction first ...
    expect(t, source, expectedResult);
    // ... then the round trip back to the original text.
    expect(reverseTransliterator, expectedResult, source);
}
void TransliteratorTest::expect(const Transliterator& t,
const UnicodeString& source,
const UnicodeString& expectedResult) {
UnicodeString result(source);
t.transliterate(result);
expectAux(t.getID() + ":String", source, result, expectedResult);
UnicodeString rsource(source);
t.transliterate(rsource);
expectAux(t.getID() + ":Replaceable", source, rsource, expectedResult);
// Test keyboard (incremental) transliteration -- this result
// must be the same after we finalize (see below).
rsource.remove();
UTransPosition index={0, 0, 0, 0};
UnicodeString log;
for (int32_t i=0; i<source.length(); ++i) {
if (i != 0) {
log.append(" + ");
}
log.append(source.charAt(i)).append(" -> ");
UErrorCode status = U_ZERO_ERROR;
t.transliterate(rsource, index, source.charAt(i), status);
// Append the string buffer with a vertical bar '|' where
// the committed index is.
UnicodeString left, right;
rsource.extractBetween(0, index.start, left);
rsource.extractBetween(index.start, rsource.length(), right);
log.append(left).append((UChar)PIPE).append(right);
}
// As a final step in keyboard transliteration, we must call
// transliterate to finish off any pending partial matches that
// were waiting for more input.
t.finishTransliteration(rsource, index);
log.append(" => ").append(rsource);
expectAux(t.getID() + ":Keyboard", log,
rsource == expectedResult,
expectedResult);
}
void TransliteratorTest::expectAux(const UnicodeString& tag,
const UnicodeString& source,
const UnicodeString& result,
const UnicodeString& expectedResult) {
expectAux(tag, source + " -> " + result,
result == expectedResult,
expectedResult);
}
/**
 * Report one check.  When pass is true the tagged summary is logged;
 * otherwise a FAIL line with the summary and the expected result is
 * reported as an error.
 */
void TransliteratorTest::expectAux(const UnicodeString& tag,
                                   const UnicodeString& summary, UBool pass,
                                   const UnicodeString& expectedResult) {
    if (!pass) {
        errln(UnicodeString("FAIL: (")+tag+") "
              + prettify(summary)
              + ", expected " + prettify(expectedResult));
        return;
    }
    logln(UnicodeString("(")+tag+") " + prettify(summary));
}
/* test for Jitterbug 912 */
void TransliteratorTest::TestCreateInstance(){
UParseError *err = 0;
@ -1248,3 +1163,157 @@ void TransliteratorTest::TestNormalizationTransliterator() {
delete NFKD;
delete NFKC;
}
/**
* Test compound RBT rules.
*/
void TransliteratorTest::TestCompoundRBT(void) {
// Careful with spacing and ';' here: Phrase this exactly
// as toRules() is going to return it. If toRules() changes
// with regard to spacing or ';', then adjust this string.
UnicodeString rule("::Hex-Unicode;\n"
"::Any-Lower;\n"
"a > '.A.';\n"
"b > '.B.';\n"
"::Any[^t]-Upper;", "");
Transliterator *t = Transliterator::createFromRules("Test", rule);
if (t == 0) {
errln("FAIL: createFromRules failed");
return;
}
expect(*t, "\\u0043at in the hat, bat on the mat",
"C.A.t IN tHE H.A.t, .B..A.t ON tHE M.A.t");
UnicodeString r;
t->toRules(r, TRUE);
if (r == rule) {
logln((UnicodeString)"OK: toRules() => " + r);
} else {
errln((UnicodeString)"FAIL: toRules() => " + r +
", expected " + rule);
}
delete t;
// Now test toRules
t = Transliterator::createInstance("Greek-Latin; Latin-Cyrillic");
if (t == 0) {
errln("FAIL: createInstance failed");
return;
}
UnicodeString exp("::Greek-Latin;\n::Latin-Cyrillic;");
t->toRules(r, TRUE);
if (r != exp) {
errln((UnicodeString)"FAIL: toRules() => " + r +
", expected " + exp);
} else {
logln((UnicodeString)"OK: toRules() => " + r);
}
delete t;
// Round trip the result of toRules
t = Transliterator::createFromRules("Test", r);
if (t == 0) {
errln("FAIL: createFromRules #2 failed");
return;
} else {
logln((UnicodeString)"OK: createFromRules(" + r + ") succeeded");
}
// Test toRules again
t->toRules(r, TRUE);
if (r != exp) {
errln((UnicodeString)"FAIL: toRules() => " + r +
", expected " + exp);
} else {
logln((UnicodeString)"OK: toRules() => " + r);
}
delete t;
}
//======================================================================
// Support methods
//======================================================================
void TransliteratorTest::expect(const UnicodeString& rules,
const UnicodeString& source,
const UnicodeString& expectedResult) {
UErrorCode status = U_ZERO_ERROR;
Transliterator *t = new RuleBasedTransliterator("<ID>", rules, status);
if (U_FAILURE(status)) {
errln("FAIL: Transliterator constructor failed");
} else {
expect(*t, source, expectedResult);
}
delete t;
}
/**
 * Check a transliterator together with its reverse: t must map source
 * to expectedResult, and reverseTransliterator must map
 * expectedResult back to source.
 */
void TransliteratorTest::expect(const Transliterator& t,
                                const UnicodeString& source,
                                const UnicodeString& expectedResult,
                                const Transliterator& reverseTransliterator) {
    // Forward direction first ...
    expect(t, source, expectedResult);
    // ... then the round trip back to the original text.
    expect(reverseTransliterator, expectedResult, source);
}
void TransliteratorTest::expect(const Transliterator& t,
const UnicodeString& source,
const UnicodeString& expectedResult) {
UnicodeString result(source);
t.transliterate(result);
expectAux(t.getID() + ":String", source, result, expectedResult);
UnicodeString rsource(source);
t.transliterate(rsource);
expectAux(t.getID() + ":Replaceable", source, rsource, expectedResult);
// Test keyboard (incremental) transliteration -- this result
// must be the same after we finalize (see below).
rsource.remove();
UTransPosition index={0, 0, 0, 0};
UnicodeString log;
for (int32_t i=0; i<source.length(); ++i) {
if (i != 0) {
log.append(" + ");
}
log.append(source.charAt(i)).append(" -> ");
UErrorCode status = U_ZERO_ERROR;
t.transliterate(rsource, index, source.charAt(i), status);
// Append the string buffer with a vertical bar '|' where
// the committed index is.
UnicodeString left, right;
rsource.extractBetween(0, index.start, left);
rsource.extractBetween(index.start, rsource.length(), right);
log.append(left).append((UChar)PIPE).append(right);
}
// As a final step in keyboard transliteration, we must call
// transliterate to finish off any pending partial matches that
// were waiting for more input.
t.finishTransliteration(rsource, index);
log.append(" => ").append(rsource);
expectAux(t.getID() + ":Keyboard", log,
rsource == expectedResult,
expectedResult);
}
void TransliteratorTest::expectAux(const UnicodeString& tag,
const UnicodeString& source,
const UnicodeString& result,
const UnicodeString& expectedResult) {
expectAux(tag, source + " -> " + result,
result == expectedResult,
expectedResult);
}
/**
 * Report one check.  When pass is true the tagged summary is logged;
 * otherwise a FAIL line with the summary and the expected result is
 * reported as an error.
 */
void TransliteratorTest::expectAux(const UnicodeString& tag,
                                   const UnicodeString& summary, UBool pass,
                                   const UnicodeString& expectedResult) {
    if (!pass) {
        errln(UnicodeString("FAIL: (")+tag+") "
              + prettify(summary)
              + ", expected " + prettify(expectedResult));
        return;
    }
    logln(UnicodeString("(")+tag+") " + prettify(summary));
}

View File

@ -159,6 +159,8 @@ class TransliteratorTest : public IntlTest {
void TestNormalizationTransliterator(void);
void TestCompoundRBT(void);
//======================================================================
// Support methods
//======================================================================