ICU-1048 allow ::ID blocks in rules
X-SVN-Rev: 5233
This commit is contained in:
parent
f35b88f4eb
commit
9805ae16d9
@ -10,6 +10,7 @@
|
||||
#include "unicode/cpdtrans.h"
|
||||
#include "unicode/unifilt.h"
|
||||
#include "unicode/unifltlg.h"
|
||||
#include "uvector.h"
|
||||
|
||||
/**
|
||||
* Constructs a new compound transliterator given an array of
|
||||
@ -30,7 +31,7 @@ CompoundTransliterator::CompoundTransliterator(
|
||||
int32_t transliteratorCount,
|
||||
UnicodeFilter* adoptedFilter) :
|
||||
Transliterator(joinIDs(transliterators, transliteratorCount), adoptedFilter),
|
||||
trans(0), filters(0), count(0) {
|
||||
trans(0), filters(0), count(0), compoundRBTIndex(-1) {
|
||||
setTransliterators(transliterators, transliteratorCount);
|
||||
}
|
||||
|
||||
@ -46,44 +47,142 @@ CompoundTransliterator::CompoundTransliterator(const UnicodeString& id,
|
||||
UnicodeFilter* adoptedFilter,
|
||||
UErrorCode& status) :
|
||||
Transliterator(id, 0), // set filter to 0 here!
|
||||
trans(0), filters(0) {
|
||||
init(id, direction, adoptedFilter, status);
|
||||
trans(0), filters(0), compoundRBTIndex(-1) {
|
||||
init(id, direction, adoptedFilter, -1, 0, TRUE, status);
|
||||
}
|
||||
|
||||
CompoundTransliterator::CompoundTransliterator(const UnicodeString& id,
|
||||
UErrorCode& status) :
|
||||
Transliterator(id, 0), // set filter to 0 here!
|
||||
trans(0), filters(0) {
|
||||
init(id, UTRANS_FORWARD, 0, status);
|
||||
trans(0), filters(0), compoundRBTIndex(-1) {
|
||||
init(id, UTRANS_FORWARD, 0, -1, 0, TRUE, status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Private constructor for compound RBTs. Construct a compound
|
||||
* transliterator using the given idBlock, with the adoptedTrans
|
||||
* inserted at the idSplitPoint.
|
||||
*/
|
||||
CompoundTransliterator::CompoundTransliterator(const UnicodeString& ID,
|
||||
const UnicodeString& idBlock,
|
||||
int32_t idSplitPoint,
|
||||
Transliterator *adoptedTrans,
|
||||
UErrorCode& status) :
|
||||
Transliterator(ID, 0),
|
||||
trans(0), filters(0), compoundRBTIndex(-1) {
|
||||
init(idBlock, UTRANS_FORWARD, 0, idSplitPoint, adoptedTrans, FALSE, status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Private constructor for Transliterator from a vector of
|
||||
* transliterators. The vector order is FORWARD, so if dir is REVERSE
|
||||
* then the vector order will be reversed.
|
||||
*/
|
||||
CompoundTransliterator::CompoundTransliterator(const UnicodeString& ID,
|
||||
UTransDirection dir,
|
||||
UVector& list,
|
||||
UErrorCode& status) :
|
||||
Transliterator(ID, 0),
|
||||
trans(0), filters(0), compoundRBTIndex(-1) {
|
||||
init(list, dir, 0, TRUE, status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Finish constructing a transliterator: only to be called by
|
||||
* constructors. Before calling init(), set trans and filters to NULL.
|
||||
* @param id the id containing ';'-separated entries
|
||||
* @param direction either FORWARD or REVERSE
|
||||
* @param adoptedFilter a filter object to be owned by this transliterator.
|
||||
* May be NULL.
|
||||
* @param idSplitPoint the index into id at which the
|
||||
* adoptedSplitTransliterator should be inserted, if there is one, or
|
||||
* -1 if there is none.
|
||||
* @param adoptedSplitTransliterator a transliterator to be inserted
|
||||
* before the entry at offset idSplitPoint in the id string. May be
|
||||
* NULL to insert no entry.
|
||||
* @param fixReverseID if TRUE, then reconstruct the ID of reverse
|
||||
* entries by calling getID() of component entries. Some constructors
|
||||
* do not require this because they apply a facade ID anyway.
|
||||
* @param status the error code indicating success or failure
|
||||
*/
|
||||
void CompoundTransliterator::init(const UnicodeString& id,
|
||||
UTransDirection direction,
|
||||
UnicodeFilter* adoptedFilter,
|
||||
int32_t idSplitPoint,
|
||||
Transliterator *adoptedSplitTrans,
|
||||
UBool fixReverseID,
|
||||
UErrorCode& status) {
|
||||
if (U_FAILURE(status))
|
||||
return;
|
||||
UnicodeString* list = split(id, ID_DELIM, &count);
|
||||
trans = new Transliterator*[count];
|
||||
for (int32_t i = 0; i < count; ++i) {
|
||||
trans[i] = createInstance(list[direction==UTRANS_FORWARD ? i : (count-1-i)],
|
||||
direction);
|
||||
if (trans[i] == NULL) {
|
||||
while (++i < count)
|
||||
trans[i] = 0;
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
delete[] list;
|
||||
delete adoptedFilter;
|
||||
return;
|
||||
}
|
||||
}
|
||||
delete[] list;
|
||||
// assert(trans == 0);
|
||||
// assert(filters == 0);
|
||||
|
||||
// If the direction is UTRANS_REVERSE then we need to fix
|
||||
// the ID.
|
||||
if (direction == UTRANS_REVERSE) {
|
||||
if (U_FAILURE(status)) {
|
||||
delete adoptedFilter;
|
||||
delete adoptedSplitTrans;
|
||||
return;
|
||||
}
|
||||
|
||||
UVector list;
|
||||
Transliterator::parseCompoundID(id, direction,
|
||||
idSplitPoint, adoptedSplitTrans,
|
||||
list, compoundRBTIndex,
|
||||
NULL, status);
|
||||
|
||||
init(list, direction, adoptedFilter, fixReverseID, status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Finish constructing a transliterator: only to be called by
|
||||
* constructors. Before calling init(), set trans and filters to NULL.
|
||||
* @param list a vector of transliterator objects to be adopted. It
|
||||
* should NOT be empty. The list should be in declared order. That
|
||||
* is, it should be in the FORWARD order; if direction is REVERSE then
|
||||
* the list order will be reversed.
|
||||
* @param direction either FORWARD or REVERSE
|
||||
* @param adoptedFilter a filter object to be owned by this transliterator.
|
||||
* May be NULL.
|
||||
* @param fixReverseID if TRUE, then reconstruct the ID of reverse
|
||||
* entries by calling getID() of component entries. Some constructors
|
||||
* do not require this because they apply a facade ID anyway.
|
||||
* @param status the error code indicating success or failure
|
||||
*/
|
||||
void CompoundTransliterator::init(UVector& list,
|
||||
UTransDirection direction,
|
||||
UnicodeFilter* adoptedFilter,
|
||||
UBool fixReverseID,
|
||||
UErrorCode& status) {
|
||||
// assert(trans == 0);
|
||||
// assert(filters == 0);
|
||||
|
||||
// Allocate array
|
||||
if (U_SUCCESS(status)) {
|
||||
count = list.size();
|
||||
trans = new Transliterator*[count];
|
||||
}
|
||||
|
||||
if (U_FAILURE(status) || trans == 0) {
|
||||
delete adoptedFilter;
|
||||
// assert(trans == 0);
|
||||
return;
|
||||
}
|
||||
|
||||
// Move the transliterators from the vector into an array.
|
||||
// Reverse the order if necessary.
|
||||
int32_t i;
|
||||
for (i=0; i<count; ++i) {
|
||||
int32_t j = (direction == UTRANS_FORWARD) ? i : count - 1 - i;
|
||||
trans[i] = (Transliterator*) list.elementAt(j);
|
||||
}
|
||||
|
||||
// Fix compoundRBTIndex for REVERSE transliterators
|
||||
if (compoundRBTIndex >= 0 && direction == UTRANS_REVERSE) {
|
||||
compoundRBTIndex = count - 1 - compoundRBTIndex;
|
||||
}
|
||||
|
||||
// If the direction is UTRANS_REVERSE then we may need to fix the
|
||||
// ID.
|
||||
if (direction == UTRANS_REVERSE && fixReverseID) {
|
||||
UnicodeString newID;
|
||||
for (int32_t i=0; i<count; ++i) {
|
||||
for (i=0; i<count; ++i) {
|
||||
if (i > 0) {
|
||||
newID.append(ID_DELIM);
|
||||
}
|
||||
@ -113,35 +212,35 @@ UnicodeString CompoundTransliterator::joinIDs(Transliterator* const transliterat
|
||||
return id; // Return temporary
|
||||
}
|
||||
|
||||
/**
|
||||
* Splits a string, as in JavaScript
|
||||
*/
|
||||
UnicodeString* CompoundTransliterator::split(const UnicodeString& s,
|
||||
UChar divider,
|
||||
int32_t* countPtr) {
|
||||
// changed MED
|
||||
// see how many there are
|
||||
*countPtr = 1;
|
||||
int32_t i;
|
||||
for (i = 0; i < s.length(); ++i) {
|
||||
if (s.charAt(i) == divider)
|
||||
++(*countPtr);
|
||||
}
|
||||
|
||||
// make an array with them
|
||||
UnicodeString* result = new UnicodeString[*countPtr];
|
||||
int32_t last = 0;
|
||||
int32_t current = 0;
|
||||
|
||||
for (i = 0; i < s.length(); ++i) {
|
||||
if (s.charAt(i) == divider) {
|
||||
s.extractBetween(last, i, result[current++]);
|
||||
last = i+1;
|
||||
}
|
||||
}
|
||||
s.extractBetween(last, i, result[current]);
|
||||
return result;
|
||||
}
|
||||
///**
|
||||
// * Splits a string, as in JavaScript
|
||||
// */
|
||||
//UnicodeString* CompoundTransliterator::split(const UnicodeString& s,
|
||||
// UChar divider,
|
||||
// int32_t* countPtr) {
|
||||
// // changed MED
|
||||
// // see how many there are
|
||||
// *countPtr = 1;
|
||||
// int32_t i;
|
||||
// for (i = 0; i < s.length(); ++i) {
|
||||
// if (s.charAt(i) == divider)
|
||||
// ++(*countPtr);
|
||||
// }
|
||||
//
|
||||
// // make an array with them
|
||||
// UnicodeString* result = new UnicodeString[*countPtr];
|
||||
// int32_t last = 0;
|
||||
// int32_t current = 0;
|
||||
//
|
||||
// for (i = 0; i < s.length(); ++i) {
|
||||
// if (s.charAt(i) == divider) {
|
||||
// s.extractBetween(last, i, result[current++]);
|
||||
// last = i+1;
|
||||
// }
|
||||
// }
|
||||
// s.extractBetween(last, i, result[current]);
|
||||
// return result;
|
||||
//}
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
@ -301,73 +400,102 @@ void CompoundTransliterator::adoptFilter(UnicodeFilter* f) {
|
||||
Transliterator::adoptFilter(f);
|
||||
}
|
||||
|
||||
UnicodeString& CompoundTransliterator::toRules(UnicodeString& rulesSource,
|
||||
UBool escapeUnprintable) const {
|
||||
// We do NOT call toRules() on our component transliterators, in
|
||||
// general. If we have several rule-based transliterators, this
|
||||
// yields a concatenation of the rules -- not what we want. We do
|
||||
// handle compound RBT transliterators specially -- those for which
|
||||
// compoundRBTIndex >= 0. For the transliterator at compoundRBTIndex,
|
||||
// we do call toRules() recursively.
|
||||
rulesSource.truncate(0);
|
||||
for (int32_t i=0; i<count; ++i) {
|
||||
UnicodeString rule;
|
||||
if (i == compoundRBTIndex) {
|
||||
trans[i]->toRules(rule, escapeUnprintable);
|
||||
} else {
|
||||
trans[i]->Transliterator::toRules(rule, escapeUnprintable);
|
||||
}
|
||||
if (rulesSource.length() &&
|
||||
rulesSource.charAt(rulesSource.length() - 1) != 10) {
|
||||
rulesSource.append((UChar)10);
|
||||
}
|
||||
rulesSource.append(rule);
|
||||
if (rulesSource.length() &&
|
||||
rulesSource.charAt(rulesSource.length() - 1) != ID_DELIM) {
|
||||
rulesSource.append(ID_DELIM);
|
||||
}
|
||||
}
|
||||
return rulesSource;
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleTransliterate}.
|
||||
*/
|
||||
void CompoundTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
|
||||
UBool incremental) const {
|
||||
/* Call each transliterator with the same start value and
|
||||
* initial cursor index, but with the limit index as modified
|
||||
* by preceding transliterators. The cursor index must be
|
||||
/* Call each transliterator with the same contextStart and
|
||||
* start, but with the limit as modified
|
||||
* by preceding transliterators. The start index must be
|
||||
* reset for each transliterator to give each a chance to
|
||||
* transliterate the text. The initial cursor index is known
|
||||
* transliterate the text. The initial contextStart index is known
|
||||
* to still point to the same place after each transliterator
|
||||
* is called because each transliterator will not change the
|
||||
* text between start and the initial value of cursor.
|
||||
* text between contextStart and the initial start index.
|
||||
*
|
||||
* IMPORTANT: After the first transliterator, each subsequent
|
||||
* transliterator only gets to transliterate text committed by
|
||||
* preceding transliterators; that is, the cursor (output
|
||||
* preceding transliterators; that is, the start (output
|
||||
* value) of transliterator i becomes the limit (input value)
|
||||
* of transliterator i+1. Finally, the overall limit is fixed
|
||||
* up before we return.
|
||||
*
|
||||
* Assumptions we make here:
|
||||
* (1) start <= cursor <= limit ;cursor valid on entry
|
||||
* (2) cursor <= cursor' <= limit' ;cursor doesn't move back
|
||||
* (3) cursor <= limit' ;text before cursor unchanged
|
||||
* - cursor' is the value of cursor after calling handleKT
|
||||
* (1) contextStart <= start <= limit ;start valid on entry
|
||||
* (2) start <= start' <= limit' ;start doesn't move back
|
||||
* (3) start <= limit' ;text before start unchanged
|
||||
* - start' is the value of start after calling handleKT
|
||||
* - limit' is the value of limit after calling handleKT
|
||||
*/
|
||||
|
||||
/**
|
||||
* Example: 3 transliterators. This example illustrates the
|
||||
* mechanics we need to implement. S, C, and L are the start,
|
||||
* cursor, and limit. gl is the globalLimit.
|
||||
* mechanics we need to implement. C, S, and L are the contextStart,
|
||||
* start, and limit. gl is the globalLimit.
|
||||
*
|
||||
* 1. h-u, changes hex to Unicode
|
||||
*
|
||||
* 4 7 a d 0 4 7 a
|
||||
* abc/u0061/u => abca/u
|
||||
* S C L S C L gl=f->a
|
||||
* C S L C S L gl=f->a
|
||||
*
|
||||
* 2. upup, changes "x" to "XX"
|
||||
*
|
||||
* 4 7 a 4 7 a
|
||||
* abca/u => abcAA/u
|
||||
* S CL S C
|
||||
* C SL C S
|
||||
* L gl=a->b
|
||||
* 3. u-h, changes Unicode to hex
|
||||
*
|
||||
* 4 7 a 4 7 a d 0 3
|
||||
* abcAA/u => abc/u0041/u0041/u
|
||||
* S C L S C
|
||||
* C S L C S
|
||||
* L gl=b->15
|
||||
* 4. return
|
||||
*
|
||||
* 4 7 a d 0 3
|
||||
* abc/u0041/u0041/u
|
||||
* S C L
|
||||
* C S L
|
||||
*/
|
||||
|
||||
if (count < 1) {
|
||||
index.start = index.limit;
|
||||
return; // Short circuit for empty compound transliterators
|
||||
}
|
||||
|
||||
int32_t i;
|
||||
int32_t cursor = index.start;
|
||||
int32_t limit = index.limit;
|
||||
int32_t globalLimit = limit;
|
||||
int32_t start = index.start;
|
||||
int32_t globalLimit = index.limit;
|
||||
/* globalLimit is the overall limit. We keep track of this
|
||||
* since we overwrite index.limit with the previous
|
||||
* index.start. After each transliteration, we update
|
||||
@ -375,16 +503,16 @@ void CompoundTransliterator::handleTransliterate(Replaceable& text, UTransPositi
|
||||
*/
|
||||
|
||||
for (i=0; i<count; ++i) {
|
||||
index.start = cursor; // Reset cursor
|
||||
index.limit = limit;
|
||||
index.start = start; // Reset start
|
||||
int32_t limit = index.limit;
|
||||
|
||||
trans[i]->handleTransliterate(text, index, incremental);
|
||||
|
||||
// Adjust overall limit for insertions/deletions
|
||||
globalLimit += index.limit - limit;
|
||||
limit = index.start; // Move limit to end of committed text
|
||||
index.limit = index.start; // Move limit to end of committed text
|
||||
}
|
||||
// Cursor is good where it is -- where the last
|
||||
// Start is good where it is -- where the last
|
||||
// transliterator left it. Limit needs to be put back
|
||||
// where it was, modulo adjustments for deletions/insertions.
|
||||
index.limit = globalLimit;
|
||||
|
@ -15,10 +15,10 @@
|
||||
*/
|
||||
void NormalizationTransliterator::registerIDs() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
Transliterator::_registerFactory(UnicodeString("NFC", ""), _createNFC, status);
|
||||
Transliterator::_registerFactory(UnicodeString("NFKC", ""), _createNFKC, status);
|
||||
Transliterator::_registerFactory(UnicodeString("NFD", ""), _createNFD, status);
|
||||
Transliterator::_registerFactory(UnicodeString("NFKD", ""), _createNFKD, status);
|
||||
Transliterator::_registerFactory(UnicodeString("Any-NFC", ""), _createNFC, status);
|
||||
Transliterator::_registerFactory(UnicodeString("Any-NFKC", ""), _createNFKC, status);
|
||||
Transliterator::_registerFactory(UnicodeString("Any-NFD", ""), _createNFD, status);
|
||||
Transliterator::_registerFactory(UnicodeString("Any-NFKD", ""), _createNFKD, status);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -22,7 +22,7 @@ void RuleBasedTransliterator::_construct(const UnicodeString& rules,
|
||||
data = 0;
|
||||
isDataOwned = TRUE;
|
||||
if (U_SUCCESS(status)) {
|
||||
data = TransliterationRuleParser::parse(rules, direction, parseError);
|
||||
data = TransliteratorParser::parse(rules, direction, parseError);
|
||||
if (data == 0) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
} else {
|
||||
@ -40,6 +40,18 @@ RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
|
||||
setMaximumContextLength(data->ruleSet.getMaximumContextLength());
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal constructor.
|
||||
*/
|
||||
RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
|
||||
TransliterationRuleData* theData,
|
||||
UBool isDataAdopted) :
|
||||
Transliterator(id, 0),
|
||||
data(theData),
|
||||
isDataOwned(isDataAdopted) {
|
||||
setMaximumContextLength(data->ruleSet.getMaximumContextLength());
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy constructor. Since the data object is immutable, we can share
|
||||
* it with other objects -- no need to clone it.
|
||||
|
@ -48,7 +48,11 @@
|
||||
// trailing SymbolTable.SYMBOL_REF character.
|
||||
// private static final char ANCHOR_END = '$';
|
||||
|
||||
const UnicodeString TransliterationRuleParser::gOPERATORS = OPERATORS;
|
||||
const UnicodeString TransliteratorParser::gOPERATORS = OPERATORS;
|
||||
|
||||
// These are also used in Transliterator::toRules()
|
||||
static const int32_t ID_TOKEN_LEN = 2;
|
||||
static const UChar ID_TOKEN[] = { 0x3A, 0x3A }; // ':', ':'
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// BEGIN ParseData
|
||||
@ -167,14 +171,14 @@ public:
|
||||
UBool anchorStart;
|
||||
UBool anchorEnd;
|
||||
|
||||
TransliterationRuleParser& parser;
|
||||
TransliteratorParser& parser;
|
||||
|
||||
static const UnicodeString gOperators;
|
||||
|
||||
//--------------------------------------------------
|
||||
// Methods
|
||||
|
||||
RuleHalf(TransliterationRuleParser& parser);
|
||||
RuleHalf(TransliteratorParser& parser);
|
||||
~RuleHalf();
|
||||
|
||||
/**
|
||||
@ -220,7 +224,7 @@ inline int32_t _voidPtr_to_int32(void* x) {
|
||||
|
||||
const UnicodeString RuleHalf::gOperators = OPERATORS;
|
||||
|
||||
RuleHalf::RuleHalf(TransliterationRuleParser& p) : parser(p) {
|
||||
RuleHalf::RuleHalf(TransliteratorParser& p) : parser(p) {
|
||||
cursor = -1;
|
||||
ante = -1;
|
||||
post = -1;
|
||||
@ -487,24 +491,62 @@ int32_t* RuleHalf::createSegments() const {
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
TransliterationRuleData*
|
||||
TransliterationRuleParser::parse(const UnicodeString& rules,
|
||||
UTransDirection direction,
|
||||
UParseError* parseError) {
|
||||
TransliterationRuleParser parser(rules, direction, parseError);
|
||||
parser.parseRules();
|
||||
if (U_FAILURE(parser.status)) {
|
||||
TransliteratorParser::parse(const UnicodeString& rules,
|
||||
UTransDirection direction,
|
||||
UParseError* parseError) {
|
||||
TransliteratorParser parser(rules, direction, parseError);
|
||||
UnicodeString idBlock;
|
||||
int32_t idSplitPoint, count;
|
||||
parser.parseRules(idBlock, idSplitPoint, count);
|
||||
if (U_FAILURE(parser.status) || idBlock.length() != 0) {
|
||||
delete parser.data;
|
||||
parser.data = 0;
|
||||
}
|
||||
return parser.data;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a given set of rules. Return up to three pieces of
|
||||
* parsed data. These are the header ::id block, the rule block,
|
||||
* and the footer ::id block. Any or all of these may be empty.
|
||||
* If the ::id blocks are empty, their corresponding parameters
|
||||
* are returned as the empty string. If there are no rules, the
|
||||
* TransliterationRuleData result is 0.
|
||||
* @param ruleDataResult caller owns the pointer stored here.
|
||||
* May be NULL.
|
||||
* @param idBlockResult receives the ::id entries of the header and
|
||||
* footer blocks, concatenated and ';'-delimited. May be empty.
|
||||
* @param idSplitPointResult receives the offset into idBlockResult at
|
||||
* which the footer ::id block, if any, starts.
|
||||
*/
|
||||
void TransliteratorParser::parse(const UnicodeString& rules,
|
||||
UTransDirection direction,
|
||||
TransliterationRuleData*& ruleDataResult,
|
||||
UnicodeString& idBlockResult,
|
||||
int32_t& idSplitPointResult,
|
||||
UParseError* parseError,
|
||||
UErrorCode& ec) {
|
||||
if (U_FAILURE(ec)) {
|
||||
ruleDataResult = 0;
|
||||
return;
|
||||
}
|
||||
TransliteratorParser parser(rules, direction, parseError);
|
||||
int32_t count;
|
||||
parser.parseRules(idBlockResult, idSplitPointResult, count);
|
||||
if (U_FAILURE(parser.status) || count == 0) {
|
||||
delete parser.data;
|
||||
parser.data = 0;
|
||||
}
|
||||
ruleDataResult = parser.data;
|
||||
ec = parser.status;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param rules list of rules, separated by newline characters
|
||||
* @exception IllegalArgumentException if there is a syntax error in the
|
||||
* rules
|
||||
*/
|
||||
TransliterationRuleParser::TransliterationRuleParser(
|
||||
TransliteratorParser::TransliteratorParser(
|
||||
const UnicodeString& theRules,
|
||||
UTransDirection theDirection,
|
||||
UParseError* theParseError) :
|
||||
@ -515,7 +557,7 @@ TransliterationRuleParser::TransliterationRuleParser(
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
TransliterationRuleParser::~TransliterationRuleParser() {
|
||||
TransliteratorParser::~TransliteratorParser() {
|
||||
delete parseData;
|
||||
}
|
||||
|
||||
@ -527,8 +569,11 @@ TransliterationRuleParser::~TransliterationRuleParser() {
|
||||
* @exception IllegalArgumentException if there is a syntax error in the
|
||||
* rules
|
||||
*/
|
||||
void TransliterationRuleParser::parseRules(void) {
|
||||
void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
|
||||
int32_t& idSplitPointResult,
|
||||
int32_t& ruleCount) {
|
||||
status = U_ZERO_ERROR;
|
||||
ruleCount = 0;
|
||||
|
||||
delete data;
|
||||
data = new TransliterationRuleData(status);
|
||||
@ -543,14 +588,21 @@ void TransliterationRuleParser::parseRules(void) {
|
||||
}
|
||||
determineVariableRange();
|
||||
|
||||
UnicodeString str; // scratch
|
||||
idBlockResult.truncate(0);
|
||||
idSplitPointResult = -1;
|
||||
int32_t pos = 0;
|
||||
int32_t limit = rules.length();
|
||||
// The mode marks whether we are in the header ::id block, the
|
||||
// rule block, or the footer ::id block.
|
||||
// mode == 0: start: rule->1, ::id->0
|
||||
// mode == 1: in rules: rule->1, ::id->2
|
||||
// mode == 2: in footer ::id block: rule->ERROR, ::id->2
|
||||
int32_t mode = 0;
|
||||
while (pos < limit && U_SUCCESS(status)) {
|
||||
UChar c = rules.charAt(pos++);
|
||||
if (Unicode::isWhitespace(c)) {
|
||||
// Ignore leading whitespace. Note that this is not
|
||||
// Unicode spaces, but Java spaces -- a subset,
|
||||
// representing whitespace likely to be seen in code.
|
||||
// Ignore leading whitespace.
|
||||
continue;
|
||||
}
|
||||
// Skip lines starting with the comment character
|
||||
@ -561,10 +613,50 @@ void TransliterationRuleParser::parseRules(void) {
|
||||
}
|
||||
continue; // Either fall out or restart with next line
|
||||
}
|
||||
// We've found the start of a rule. c is its first
|
||||
// character, and pos points past c. Lexically parse the
|
||||
// rule into component pieces.
|
||||
pos = parseRule(--pos, limit);
|
||||
// We've found the start of a rule or ID. c is its first
|
||||
// character, and pos points past c.
|
||||
--pos;
|
||||
// Look for an ID token. Must have at least ID_TOKEN_LEN + 1
|
||||
// chars left.
|
||||
if ((pos + ID_TOKEN_LEN + 1) <= limit &&
|
||||
rules.compare(pos, ID_TOKEN_LEN, ID_TOKEN) == 0) {
|
||||
pos += ID_TOKEN_LEN;
|
||||
c = rules.charAt(pos);
|
||||
while (Unicode::isWhitespace(c) && pos < limit) {
|
||||
++pos;
|
||||
c = rules.charAt(pos);
|
||||
}
|
||||
int32_t p = pos;
|
||||
UBool sawDelim;
|
||||
Transliterator::parseID(rules, p, sawDelim, direction, NULL, FALSE);
|
||||
if (p == pos) {
|
||||
// Invalid ::id
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
} else {
|
||||
if (mode == 1) {
|
||||
mode = 2;
|
||||
idSplitPointResult = idBlockResult.length();
|
||||
}
|
||||
rules.extractBetween(pos, p, str);
|
||||
idBlockResult.append(str);
|
||||
if (!sawDelim) {
|
||||
idBlockResult.append((UChar)0x003B /*;*/);
|
||||
}
|
||||
pos = p;
|
||||
}
|
||||
} else {
|
||||
// Parse a rule
|
||||
pos = parseRule(pos, limit);
|
||||
if (U_SUCCESS(status)) {
|
||||
++ruleCount;
|
||||
if (mode == 2) {
|
||||
// ::id in illegal position (because a rule
|
||||
// occurred after the ::id footer block)
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
}
|
||||
mode = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Convert the set vector to an array
|
||||
@ -573,7 +665,8 @@ void TransliterationRuleParser::parseRules(void) {
|
||||
// orphanElement removes the given element and shifts all other
|
||||
// elements down. For performance (and code clarity) we work from
|
||||
// the end back to index 0.
|
||||
for (int32_t i=data->setVariablesLength; i>0; ) {
|
||||
int32_t i;
|
||||
for (i=data->setVariablesLength; i>0; ) {
|
||||
--i;
|
||||
data->setVariables[i] =
|
||||
(UnicodeSet*) setVariablesVector.orphanElementAt(i);
|
||||
@ -582,6 +675,9 @@ void TransliterationRuleParser::parseRules(void) {
|
||||
// Index the rules
|
||||
if (U_SUCCESS(status)) {
|
||||
data->ruleSet.freeze(*data, status);
|
||||
if (idSplitPointResult < 0) {
|
||||
idSplitPointResult = idBlockResult.length();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -598,7 +694,7 @@ void TransliterationRuleParser::parseRules(void) {
|
||||
* indicators. Once it does a lexical breakdown of the rule at pos, it
|
||||
* creates a rule object and adds it to our rule list.
|
||||
*/
|
||||
int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
|
||||
int32_t TransliteratorParser::parseRule(int32_t pos, int32_t limit) {
|
||||
// Locate the left side, operator, and right side
|
||||
int32_t start = pos;
|
||||
UChar op = 0;
|
||||
@ -759,7 +855,7 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
|
||||
* @param rule pattern string
|
||||
* @param start position of first character of current rule
|
||||
*/
|
||||
int32_t TransliterationRuleParser::syntaxError(int32_t parseErrorCode,
|
||||
int32_t TransliteratorParser::syntaxError(int32_t parseErrorCode,
|
||||
const UnicodeString& rule,
|
||||
int32_t start) {
|
||||
if (parseError != 0) {
|
||||
@ -786,7 +882,7 @@ int32_t TransliterationRuleParser::syntaxError(int32_t parseErrorCode,
|
||||
* Parse a UnicodeSet out, store it, and return the stand-in character
|
||||
* used to represent it.
|
||||
*/
|
||||
UChar TransliterationRuleParser::parseSet(const UnicodeString& rule,
|
||||
UChar TransliteratorParser::parseSet(const UnicodeString& rule,
|
||||
ParsePosition& pos) {
|
||||
UnicodeSet* set = new UnicodeSet(rule, pos, *parseData, status);
|
||||
if (variableNext >= variableLimit) {
|
||||
@ -804,7 +900,7 @@ UChar TransliterationRuleParser::parseSet(const UnicodeString& rule,
|
||||
* Append the value of the given variable name to the given
|
||||
* UnicodeString.
|
||||
*/
|
||||
void TransliterationRuleParser::appendVariableDef(const UnicodeString& name,
|
||||
void TransliteratorParser::appendVariableDef(const UnicodeString& name,
|
||||
UnicodeString& buf) {
|
||||
const UnicodeString* s = (const UnicodeString*) data->variableNames->get(name);
|
||||
if (s == NULL) {
|
||||
@ -839,7 +935,7 @@ void TransliterationRuleParser::appendVariableDef(const UnicodeString& name,
|
||||
* When done, everything not in the hash is available for use. In practice,
|
||||
* this method may employ some other algorithm for improved speed.
|
||||
*/
|
||||
void TransliterationRuleParser::determineVariableRange(void) {
|
||||
void TransliteratorParser::determineVariableRange(void) {
|
||||
UnicodeRange privateUse(0xE000, 0x1900); // Private use area
|
||||
|
||||
UnicodeRange* r = privateUse.largestUnusedSubrange(rules);
|
||||
@ -864,7 +960,7 @@ void TransliterationRuleParser::determineVariableRange(void) {
|
||||
* For example, in the string "abc'hide'h", the 'h' in "hide" will not be
|
||||
* found by a search for 'h'.
|
||||
*/
|
||||
int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text,
|
||||
int32_t TransliteratorParser::quotedIndexOf(const UnicodeString& text,
|
||||
int32_t start, int32_t limit,
|
||||
UChar charToFind) {
|
||||
for (int32_t i=start; i<limit; ++i) {
|
||||
|
@ -18,7 +18,7 @@ class ParseData;
|
||||
class RuleHalf;
|
||||
class ParsePosition;
|
||||
|
||||
class TransliterationRuleParser {
|
||||
class TransliteratorParser {
|
||||
|
||||
/**
|
||||
* This is a reference to external data we don't own. This works because
|
||||
@ -87,6 +87,28 @@ public:
|
||||
UTransDirection direction,
|
||||
UParseError* parseError = 0);
|
||||
|
||||
/**
|
||||
* Parse a given set of rules. Return up to three pieces of
|
||||
* parsed data. These are the header ::id block, the rule block,
|
||||
* and the footer ::id block. Any or all of these may be empty.
|
||||
* If the ::id blocks are empty, their corresponding parameters
|
||||
* are returned as the empty string. If there are no rules, the
|
||||
* TransliterationRuleData result is 0.
|
||||
* @param ruleDataResult caller owns the pointer stored here.
|
||||
* May be NULL.
|
||||
* @param idBlockResult receives the ::id entries of the header and
|
||||
* footer blocks, concatenated and ';'-delimited. May be empty.
|
||||
* @param idSplitPointResult receives the offset into idBlockResult at
|
||||
* which the footer ::id block, if any, starts.
|
||||
*/
|
||||
static void parse(const UnicodeString& rules,
|
||||
UTransDirection direction,
|
||||
TransliterationRuleData*& ruleDataResult,
|
||||
UnicodeString& idBlockResult,
|
||||
int32_t& idSplitPointResult,
|
||||
UParseError* parseError,
|
||||
UErrorCode& ec);
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
@ -94,14 +116,14 @@ private:
|
||||
* @exception IllegalArgumentException if there is a syntax error in the
|
||||
* rules
|
||||
*/
|
||||
TransliterationRuleParser(const UnicodeString& rules,
|
||||
TransliteratorParser(const UnicodeString& rules,
|
||||
UTransDirection direction,
|
||||
UParseError* parseError = 0);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
~TransliterationRuleParser();
|
||||
~TransliteratorParser();
|
||||
|
||||
/**
|
||||
* Parse the given string as a sequence of rules, separated by newline
|
||||
@ -111,7 +133,8 @@ private:
|
||||
* @exception IllegalArgumentException if there is a syntax error in the
|
||||
* rules
|
||||
*/
|
||||
void parseRules(void);
|
||||
void parseRules(UnicodeString& idBlockResult, int32_t& idSplitPointResult,
|
||||
int32_t& ruleCount);
|
||||
|
||||
/**
|
||||
* MAIN PARSER. Parse the next rule in the given rule string, starting
|
||||
@ -139,13 +162,6 @@ private:
|
||||
*/
|
||||
int32_t syntaxError(int32_t parseErrorCode, const UnicodeString&, int32_t start);
|
||||
|
||||
/**
|
||||
* Allocate a private-use substitution character for the given set,
|
||||
* register it in the setVariables hash, and return the substitution
|
||||
* character.
|
||||
*/
|
||||
//UChar registerSet(UnicodeSet* adoptedSet);
|
||||
|
||||
/**
|
||||
* Parse a UnicodeSet out, store it, and return the stand-in character
|
||||
* used to represent it.
|
||||
@ -189,8 +205,8 @@ private:
|
||||
friend class RuleHalf;
|
||||
|
||||
// Disallowed methods; no impl.
|
||||
TransliterationRuleParser(const TransliterationRuleParser&);
|
||||
TransliterationRuleParser& operator=(const TransliterationRuleParser&);
|
||||
TransliteratorParser(const TransliteratorParser&);
|
||||
TransliteratorParser& operator=(const TransliteratorParser&);
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -9,8 +9,7 @@
|
||||
*/
|
||||
#include "unicode/remtrans.h"
|
||||
|
||||
//const UnicodeString RemoveTransliterator::ID = UnicodeString("Remove", "");
|
||||
const UChar RemoveTransliterator::ID[] = {0x52, 0x65, 0x6D, 0x6F, 0x76, 0x65, 0x00}; /* "Remove" */
|
||||
const UChar RemoveTransliterator::ID[] = {65, 110, 121, 45, 0x52, 0x65, 0x6D, 0x6F, 0x76, 0x65, 0x00}; /* "Any-Remove" */
|
||||
|
||||
Transliterator* RemoveTransliterator::clone(void) const {
|
||||
return new RemoveTransliterator();
|
||||
|
@ -565,91 +565,347 @@ Transliterator* Transliterator::createInverse(void) const {
|
||||
Transliterator* Transliterator::createInstance(const UnicodeString& ID,
|
||||
UTransDirection dir,
|
||||
UParseError* parseError) {
|
||||
Transliterator* t = 0;
|
||||
if (ID.indexOf(ID_DELIM) >= 0) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
t = new CompoundTransliterator(ID, dir, 0, status);
|
||||
if (U_FAILURE(status)) {
|
||||
delete t;
|
||||
t = 0;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
return createInstance(ID, dir, -1, NULL, parseError, status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a transliterator given a compound ID (possibly degenerate,
|
||||
* with no ID_DELIM). If idSplitPoint >= 0 and adoptedSplitTrans !=
|
||||
* 0, then insert adoptedSplitTrans in the compound ID at offset
|
||||
* idSplitPoint. Otherwise idSplitPoint should be -1 and
|
||||
* adoptedSplitTrans should be 0. The resultant transliterator will
|
||||
* be an atomic (non-compound) transliterator if this is indicated by
|
||||
* ID. Otherwise it will be a compound transliterator.
|
||||
*/
|
||||
Transliterator* Transliterator::createInstance(const UnicodeString& ID,
                                               UTransDirection dir,
                                               int32_t idSplitPoint,
                                               Transliterator *adoptedSplitTrans,
                                               UParseError* parseError,
                                               UErrorCode& status) {
    // Nothing is attempted when the caller already carries a failure.
    if (U_FAILURE(status)) {
        return 0;
    }

    // Instantiate every element named by the (possibly compound) ID.
    // The index at which adoptedSplitTrans lands is not needed here.
    UVector components;
    int32_t splitIndexUnused;
    parseCompoundID(ID, dir, idSplitPoint, adoptedSplitTrans,
                    components, splitIndexUnused, parseError, status);
    if (U_FAILURE(status)) {
        return 0;
    }

    const int32_t n = components.size();
    if (n == 0) {
        // The ID parsed to no transliterators at all.
        return new NullTransliterator();
    }
    if (n == 1) {
        // Exactly one element: hand it back directly instead of
        // wrapping it in a compound.
        return (Transliterator*) components.elementAt(0);
    }
    // Two or more elements: wrap them in a compound transliterator.
    return new CompoundTransliterator(ID, dir, components, status);
}
|
||||
|
||||
/**
|
||||
* Returns a <code>Transliterator</code> object constructed from
|
||||
* the given rule string. This will be a RuleBasedTransliterator,
|
||||
* if the rule string contains only rules, or a
|
||||
* CompoundTransliterator, if it contains ID blocks, or a
|
||||
* NullTransliterator, if it contains ID blocks which parse as
|
||||
* empty for the given direction.
|
||||
*/
|
||||
Transliterator* Transliterator::createFromRules(const UnicodeString& ID,
|
||||
const UnicodeString& rules,
|
||||
UTransDirection dir,
|
||||
UParseError* parseError) {
|
||||
UnicodeString idBlock;
|
||||
int32_t idSplitPoint = -1;
|
||||
TransliterationRuleData *data = 0;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
TransliteratorParser::parse(rules, dir, data,
|
||||
idBlock, idSplitPoint,
|
||||
parseError, status);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
delete data;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// NOTE: The logic here matches that in _createInstance().
|
||||
if (idBlock.length() == 0) {
|
||||
if (data == 0) {
|
||||
// No idBlock, no data -- this is just an
|
||||
// alias for Null
|
||||
return new NullTransliterator();
|
||||
} else {
|
||||
// No idBlock, data != 0 -- this is an
|
||||
// ordinary RBT_DATA.
|
||||
return new RuleBasedTransliterator(ID, data, TRUE); // TRUE == adopt data object
|
||||
}
|
||||
} else {
|
||||
// 'id' is the ID with the filter pattern removed and with
|
||||
// whitespace deleted.
|
||||
UnicodeString id(ID);
|
||||
|
||||
// Look for embedded filter pattern
|
||||
UnicodeSet *filter = 0;
|
||||
int32_t setStart = id.indexOf((UChar)0x005B /*[*/);
|
||||
int32_t setLimit;
|
||||
if (setStart >= 0) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
ParsePosition pos(setStart);
|
||||
filter = new UnicodeSet();
|
||||
filter->applyPattern(id, pos, 0, status);
|
||||
if (data == 0) {
|
||||
// idBlock, no data -- this is an alias
|
||||
Transliterator *t = createInstance(idBlock, dir, parseError);
|
||||
if (t != 0) {
|
||||
t->setID(ID);
|
||||
}
|
||||
return t;
|
||||
} else {
|
||||
// idBlock and data -- this is a compound
|
||||
// RBT
|
||||
UnicodeString id("_", "");
|
||||
Transliterator *t = new RuleBasedTransliterator(id, data, TRUE); // TRUE == adopt data object
|
||||
t = new CompoundTransliterator(ID, idBlock, idSplitPoint,
|
||||
t, status);
|
||||
if (U_FAILURE(status)) {
|
||||
// There was a parse failure in the filter pattern
|
||||
delete filter;
|
||||
return 0;
|
||||
delete t;
|
||||
t = 0;
|
||||
}
|
||||
setLimit = pos.getIndex();
|
||||
id.removeBetween(setStart, setLimit);
|
||||
}
|
||||
|
||||
// Delete whitespace
|
||||
int32_t i;
|
||||
for (i=0; i<id.length(); ++i) {
|
||||
if (Unicode::isWhitespace(id.charAt(i))) {
|
||||
id.remove(i, 1);
|
||||
--i;
|
||||
}
|
||||
}
|
||||
|
||||
// Fix the id, if necessary, by reversing it (A-B => B-A).
|
||||
// Record the position of the separator. Detect the special
|
||||
// case of Null, whose inverse is itself. Given an ID with no
|
||||
// separator "Foo", an abbreviation for "Any-Foo", consider
|
||||
// the inverse to be "Foo-Any".
|
||||
int32_t sep = id.indexOf(ID_SEP);
|
||||
if (id.caseCompare(NullTransliterator::ID,
|
||||
U_FOLD_CASE_DEFAULT) == 0) {
|
||||
sep = id.length();
|
||||
} else if (dir == UTRANS_REVERSE) {
|
||||
UnicodeString left;
|
||||
if (sep >= 0) {
|
||||
id.extractBetween(0, sep, left);
|
||||
id.removeBetween(0, sep+1);
|
||||
} else {
|
||||
left = UnicodeString("Any", "");
|
||||
}
|
||||
sep = id.length();
|
||||
id.append(ID_SEP).append(left);
|
||||
} else if (sep < 0) {
|
||||
sep = id.length();
|
||||
}
|
||||
|
||||
// The 'alias' parameter is non-empty if _createInstance()
|
||||
// finds that the given ID refers to an alias. The reason
|
||||
// _createInstance() doesn't call createInstance() (this
|
||||
// method) directly is to avoid deadlock. There are other
|
||||
// ways to do this but this is one of the more efficient ways.
|
||||
UnicodeString alias;
|
||||
t = _createInstance(id, alias, parseError);
|
||||
|
||||
if (alias.length() > 0) { // assert(t==0)
|
||||
t = createInstance(alias);
|
||||
}
|
||||
|
||||
if (t != 0) {
|
||||
if (filter != 0) {
|
||||
t->adoptFilter(filter);
|
||||
id.insert(sep, ID, setStart, setLimit-setStart);
|
||||
}
|
||||
t->setID(id);
|
||||
return t;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
UnicodeString& Transliterator::toRules(UnicodeString& rulesSource,
                                       UBool escapeUnprintable) const {
    // Base-class behavior: express this transliterator as a single
    // "::ID" rule line, i.e. foo => ::foo.  The escapeUnprintable
    // argument is not consulted by this implementation.
    // KEEP the "::" prefix in sync with rbt_pars.
    UnicodeString rule("::", "");
    rule.append(getID());
    rulesSource = rule;
    return rulesSource;
}
|
||||
|
||||
/**
|
||||
* Parse a compound ID (possibly a degenerate one, containing no
|
||||
* ID_DELIM). If idSplitPoint >= 0 and adoptedSplitTrans != 0, then
|
||||
* insert adoptedSplitTrans in the compound ID at offset idSplitPoint.
|
||||
* Otherwise idSplitPoint should be -1 and adoptedSplitTrans should be
|
||||
* 0. Return in the result vector the instantiated transliterator
|
||||
* objects (one of these will be adoptedSplitTrans, if the latter was
|
||||
* specified). These will be in order of id, so if dir is REVERSE,
|
||||
* then the caller will have to reverse the order.
|
||||
*
|
||||
* @param splitTransIndex output parameter to receive the index in
|
||||
* 'result' at which the adoptedSplitTrans is stored, or -1 if
|
||||
* adoptedSplitTrans == 0
|
||||
*/
|
||||
void Transliterator::parseCompoundID(const UnicodeString& id,
|
||||
UTransDirection dir,
|
||||
int32_t idSplitPoint,
|
||||
Transliterator *adoptedSplitTrans,
|
||||
UVector& result,
|
||||
int32_t& splitTransIndex,
|
||||
UParseError* parseError,
|
||||
UErrorCode& status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
splitTransIndex = -1;
|
||||
int32_t pos = 0;
|
||||
int32_t i;
|
||||
while (pos < id.length()) {
|
||||
// We compare (pos >= split), not (pos == split), so we can
|
||||
// skip over whitespace (see below).
|
||||
if (pos >= idSplitPoint && adoptedSplitTrans != 0) {
|
||||
splitTransIndex = result.size();
|
||||
result.addElement(adoptedSplitTrans);
|
||||
adoptedSplitTrans = 0;
|
||||
}
|
||||
int32_t p = pos;
|
||||
UBool sawDelimiter; // We ignore this
|
||||
Transliterator *t =
|
||||
parseID(id, p, sawDelimiter, dir, parseError, TRUE);
|
||||
if (p == pos) {
|
||||
delete t;
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
break;
|
||||
}
|
||||
pos = p;
|
||||
// The return value may be NULL when, for instance, creating a
|
||||
// REVERSE transliterator of ID "Latin-Greek()".
|
||||
if (t != 0) {
|
||||
result.addElement(t);
|
||||
}
|
||||
}
|
||||
|
||||
// Handle case of idSplitPoint == id.length()
|
||||
if (pos >= idSplitPoint && adoptedSplitTrans != 0) {
|
||||
splitTransIndex = result.size();
|
||||
result.addElement(adoptedSplitTrans);
|
||||
adoptedSplitTrans = 0;
|
||||
}
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
for (i=0; i<result.size(); ++i) {
|
||||
delete (Transliterator*)result.elementAt(i);
|
||||
}
|
||||
result.removeAllElements();
|
||||
delete adoptedSplitTrans;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a single ID, possibly including an inline filter, and return
|
||||
* the resultant transliterator object. NOTE: If 'create' is FALSE,
|
||||
* then the amount of syntax checking is limited. However, the 'pos'
|
||||
* parameter will be updated correctly, assuming the input string is
|
||||
* valid.
|
||||
*
|
||||
* A trailing /;? \s* / is skipped. The parameter sawDelimiter
|
||||
* indicates whether the ';' was seen or not. Upon return, if pos is
|
||||
* advanced, it will either point to a non-whitespace character past
|
||||
* the trailing ';', if any, or be equal to length().
|
||||
*
|
||||
* On return one of the following will be true:
|
||||
* pos unchanged: sawDelimiter meaningless
|
||||
* pos == ID.length(): sawDelimiter TRUE or FALSE
|
||||
* pos < ID.length(): sawDelimiter always TRUE
|
||||
*
|
||||
* @param ID the ID string
|
||||
* @param pos INPUT-OUTPUT parameter. On input, the position of the
|
||||
* first character to parse. On output, the position after the last
|
||||
* character parsed. This will be a semicolon or ID.length(). In the
|
||||
* case of an error this value will be unchanged.
|
||||
* @param create if TRUE, create and return the result. If FALSE,
|
||||
* only scan the ID, and return NULL.
|
||||
* @return a newly created transliterator, or NULL. NULL is returned
|
||||
* in all cases if create is FALSE. If create is TRUE, then NULL is
|
||||
* returned on error, or if the ID is effectively empty.
|
||||
* E.g. "Latin-Greek()" with dir == REVERSE. Do NOT check for NULL to
|
||||
* determine if there was an error. Instead, check to see if pos
|
||||
* moved.
|
||||
*/
|
||||
Transliterator* Transliterator::parseID(const UnicodeString& ID,
                                        int32_t& pos,
                                        UBool& sawDelimiter,
                                        UTransDirection dir,
                                        UParseError* parseError,
                                        UBool create) {
    // Ownership note: 'filter' is heap-allocated below and must be either
    // adopted by the returned transliterator or deleted on every path
    // that does not hand it off.
    Transliterator* t = 0;
    UnicodeString str; // scratch

    // Look for embedded filter pattern by looking for ';' and
    // '[' and seeing which comes first.
    UnicodeSet *filter = 0;
    int32_t limit = ID.indexOf(ID_DELIM, pos);
    sawDelimiter = limit >= 0;
    if (!sawDelimiter) {
        limit = ID.length();
    }
    int32_t setStart = ID.indexOf((UChar)0x005B /*[*/, pos);
    int32_t setLimit;
    if (setStart >= 0 && setStart < limit) {
        UErrorCode status = U_ZERO_ERROR;
        ParsePosition ppos(setStart);
        filter = new UnicodeSet();
        if (filter == 0) {
            // Allocation failed; report an error by leaving pos unchanged.
            return 0;
        }
        filter->applyPattern(ID, ppos, 0, status);
        if (U_FAILURE(status)) {
            // There was a parse failure in the filter pattern
            delete filter;
            return 0;
        }
        setLimit = ppos.getIndex();
        if (limit < setLimit) {
            // The ';' found above lies inside the set pattern; search
            // again starting after the end of the pattern.
            limit = ID.indexOf(ID_DELIM, setLimit);
            sawDelimiter = limit >= 0;
            if (!sawDelimiter) {
                limit = ID.length();
            }
        }
    } else {
        // No filter in this ID element: use an empty range at pos so the
        // extraction below copies the whole element.
        setStart = setLimit = pos;
    }

    // Advance limit past /;?\s*/
    int32_t idLimit = limit; // limit before separator
    if (sawDelimiter) {
        // assert(limit < ID.length() && ID.charAt(limit) == ID_DELIM);
        ++limit;
    }
    while (limit < ID.length() && u_isspace(ID.charAt(limit))) {
        ++limit;
    }

    if (!create) {
        // TODO Improve performance by scanning the UnicodeSet pattern
        // without actually constructing it, if create is FALSE.  That
        // is, create a method like this one for UnicodeSet.
        delete filter;
        pos = limit;
        return 0;
    }

    // 'id' is the ID with the filter pattern removed and with
    // whitespace deleted.
    UnicodeString id;
    ID.extractBetween(pos, setStart, id);
    ID.extractBetween(setLimit, idLimit, str);
    id.append(str);

    // Delete whitespace
    int32_t i;
    for (i=0; i<id.length(); ++i) {
        if (Unicode::isWhitespace(id.charAt(i))) {
            id.remove(i, 1);
            --i;
        }
    }

    // Fix the id, if necessary, by reversing it (A-B => B-A).
    // Record the position of the separator.  Detect the special
    // case of Null, whose inverse is itself.  Given an ID with no
    // separator "Foo", an abbreviation for "Any-Foo", consider
    // the inverse to be "Foo-Any".
    int32_t sep = id.indexOf(ID_SEP);
    if (sep < 0 && id.caseCompare(NullTransliterator::ID,
                                  U_FOLD_CASE_DEFAULT) == 0) {
        sep = id.length();
    } else if (dir == UTRANS_REVERSE) {
        if (sep >= 0) {
            id.extractBetween(0, sep, str);
            id.removeBetween(0, sep+1);
        } else {
            str = UnicodeString("Any", "");
        }
        sep = id.length();
        id.append(ID_SEP).append(str);
    } else if (sep < 0) {
        str = UnicodeString("Any-", "");
        sep = str.length();
        id.insert(0, str);
    }

    // The 'alias' parameter is non-empty if _createInstance()
    // finds that the given ID refers to an alias.  The reason
    // _createInstance() doesn't call createInstance() (this
    // method) directly is to avoid deadlock.  There are other
    // ways to do this but this is one of the more efficient ways.
    str.truncate(0);
    t = _createInstance(id, str /*alias*/, parseError);

    if (str.length() > 0) {
        // assert(t==0);
        t = createInstance(str, UTRANS_FORWARD, parseError);
    }

    if (t != 0) {
        if (filter != 0) {
            t->adoptFilter(filter);
            // Put the filter pattern text back into the ID at the
            // recorded separator position.
            id.insert(sep, ID, setStart, setLimit-setStart);
        }
        t->setID(id);
        pos = limit;
    } else {
        // No transliterator was produced, so nothing adopted the
        // filter; release it here instead of leaking it.  pos is left
        // unchanged, which is how the caller detects the failure.
        delete filter;
    }

    return t;
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns a transliterator object given its ID. Unlike getInstance(),
|
||||
@ -661,8 +917,6 @@ Transliterator* Transliterator::createInstance(const UnicodeString& ID,
|
||||
Transliterator* Transliterator::_createInstance(const UnicodeString& ID,
|
||||
UnicodeString& aliasReturn,
|
||||
UParseError* parseError) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
if (!cacheInitialized) {
|
||||
initializeCache();
|
||||
}
|
||||
@ -672,46 +926,55 @@ Transliterator* Transliterator::_createInstance(const UnicodeString& ID,
|
||||
CacheEntry* entry = (CacheEntry*) cache->get(ID);
|
||||
if (entry == 0) {
|
||||
entry = (CacheEntry*) internalCache->get(ID);
|
||||
if (entry == 0) {
|
||||
return 0; // out of memory
|
||||
}
|
||||
}
|
||||
|
||||
TransliterationRuleData* data = 0;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
if (entry == 0) {
|
||||
return 0;
|
||||
}
|
||||
for (;;) {
|
||||
if (entry->entryType == CacheEntry::RBT_DATA) {
|
||||
return new RuleBasedTransliterator(ID, entry->u.data);
|
||||
} else if (entry->entryType == CacheEntry::PROTOTYPE) {
|
||||
return entry->u.prototype->clone();
|
||||
} else if (entry->entryType == CacheEntry::ALIAS) {
|
||||
// We can't call createInstance() here because of deadlock.
|
||||
aliasReturn = entry->stringArg;
|
||||
return 0;
|
||||
} else if (entry->entryType == CacheEntry::FACTORY) {
|
||||
return entry->u.factory();
|
||||
} else if (entry->entryType == CacheEntry::COMPOUND_RBT) {
|
||||
UnicodeString id("_", "");
|
||||
Transliterator *t = new RuleBasedTransliterator(id, entry->u.data);
|
||||
t = new CompoundTransliterator(ID, entry->stringArg,
|
||||
entry->intArg, t, status);
|
||||
if (U_FAILURE(status)) {
|
||||
delete t;
|
||||
t = 0;
|
||||
_unregister(ID);
|
||||
}
|
||||
return t;
|
||||
}
|
||||
|
||||
if (entry->entryType == CacheEntry::RBT_DATA) {
|
||||
data = entry->u.data;
|
||||
// Fall through to construct transliterator from cached Data object.
|
||||
} else if (entry->entryType == CacheEntry::PROTOTYPE) {
|
||||
return entry->u.prototype->clone();
|
||||
} else if (entry->entryType == CacheEntry::ALIAS) {
|
||||
// We can't call createInstance() here because of deadlock.
|
||||
aliasReturn = entry->stringArg;
|
||||
return 0;
|
||||
} else if (entry->entryType == CacheEntry::FACTORY) {
|
||||
return entry->u.factory();
|
||||
} else {
|
||||
// At this point entry type must be either RULES_FORWARD
|
||||
// or RULES_REVERSE
|
||||
// At this point entry type must be either RULES_FORWARD or
|
||||
// RULES_REVERSE. We process the rule data into a
|
||||
// TransliterationRuleData object, and possibly also into an
|
||||
// ::id header and/or footer. Then we modify the cache with
|
||||
// the parsed data and retry.
|
||||
UBool isReverse = (entry->entryType == CacheEntry::RULES_REVERSE);
|
||||
|
||||
|
||||
// We use the file name, taken from another resource bundle
|
||||
// 2-d array at static init time, as a locale language. We're
|
||||
// just using the locale mechanism to map through to a file
|
||||
// name; this in no way represents an actual locale.
|
||||
|
||||
char *ch;
|
||||
ch = new char[entry->stringArg.length() + 1];
|
||||
char *ch = new char[entry->stringArg.length() + 1];
|
||||
ch[entry->stringArg.extract(0, 0x7fffffff, ch, "")] = 0;
|
||||
Locale fakeLocale(ch);
|
||||
delete [] ch;
|
||||
|
||||
ResourceBundle bundle((char *)0,
|
||||
fakeLocale, status);
|
||||
|
||||
// Call RBT to parse the rules from the resource bundle
|
||||
|
||||
ResourceBundle bundle((char *)0, fakeLocale, status);
|
||||
UnicodeString rules = bundle.getStringEx(RB_RULE, status);
|
||||
|
||||
// If the status indicates a failure, then we don't have any
|
||||
@ -719,42 +982,54 @@ Transliterator* Transliterator::_createInstance(const UnicodeString& ID,
|
||||
// in the root locale should correspond to all the installed
|
||||
// transliterators; if it lists something that's not
|
||||
// installed, we'll get an error from ResourceBundle.
|
||||
if (U_SUCCESS(status)) {
|
||||
|
||||
data = TransliterationRuleParser::parse(rules, isReverse
|
||||
? UTRANS_REVERSE
|
||||
: UTRANS_FORWARD,
|
||||
parseError);
|
||||
TransliteratorParser::parse(rules, isReverse ?
|
||||
UTRANS_REVERSE : UTRANS_FORWARD,
|
||||
entry->u.data,
|
||||
entry->stringArg,
|
||||
entry->intArg,
|
||||
parseError,
|
||||
status);
|
||||
|
||||
// Double check to see if someone has modified the entry
|
||||
// since we last looked at it.
|
||||
if (entry->entryType != CacheEntry::RBT_DATA) {
|
||||
entry->entryType = CacheEntry::RBT_DATA;
|
||||
entry->u.data = data;
|
||||
if (U_FAILURE(status)) {
|
||||
// We have a failure of some kind. Remove the ID from the
|
||||
// cache so we don't keep trying. NOTE: This will throw off
|
||||
// anyone who is, at the moment, trying to iterate over the
|
||||
// available IDs. That's acceptable since we should never
|
||||
// really get here except under installation, configuration,
|
||||
// or unrecoverable run time memory failures.
|
||||
_unregister(ID);
|
||||
break;
|
||||
}
|
||||
|
||||
// Reset entry->entryType to something that we process at the
|
||||
// top of the loop, then loop back to the top. As long as we
|
||||
// do this, we only loop through twice at most.
|
||||
// NOTE: The logic here matches that in createFromRules().
|
||||
if (entry->stringArg.length() == 0) {
|
||||
if (entry->u.data == 0) {
|
||||
// No idBlock, no data -- this is just an
|
||||
// alias for Null
|
||||
entry->entryType = CacheEntry::ALIAS;
|
||||
entry->stringArg = NullTransliterator::ID;
|
||||
} else {
|
||||
// Oops! Another thread has updated this cache entry
|
||||
// already to point to a data object. Discard the
|
||||
// one we just created and use the one in the cache
|
||||
// instead.
|
||||
delete data;
|
||||
data = entry->u.data;
|
||||
// No idBlock, data != 0 -- this is an
|
||||
// ordinary RBT_DATA
|
||||
entry->entryType = CacheEntry::RBT_DATA;
|
||||
}
|
||||
} else {
|
||||
if (entry->u.data == 0) {
|
||||
// idBlock, no data -- this is an alias
|
||||
entry->entryType = CacheEntry::ALIAS;
|
||||
} else {
|
||||
// idBlock and data -- this is a compound
|
||||
// RBT
|
||||
entry->entryType = CacheEntry::COMPOUND_RBT;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (data != 0) {
|
||||
return new RuleBasedTransliterator(ID, data);
|
||||
} else {
|
||||
// We have a failure of some kind. Remove the ID from the
|
||||
// cache so we don't keep trying. NOTE: This will throw off
|
||||
// anyone who is, at the moment, trying to iterate over the
|
||||
// available IDs. That's acceptable since we should never
|
||||
// really get here except under installation, configuration,
|
||||
// or unrecoverable run time memory failures.
|
||||
_unregister(ID);
|
||||
}
|
||||
|
||||
return 0;
|
||||
return 0; // failed
|
||||
}
|
||||
|
||||
// For public consumption
|
||||
@ -907,10 +1182,11 @@ UChar Transliterator::filteredCharAt(const Replaceable& text, int32_t i) const {
|
||||
(localFilter->contains(c = text.charAt(i)) ? c : (UChar)0xFFFE);
|
||||
}
|
||||
|
||||
// TODO Move this into the class
|
||||
// NO This should remain a C function for os/390 and Solaris Workshop [grhoten]
|
||||
/**
|
||||
* Comparison function for UVector.
|
||||
*
|
||||
* Do not make this a class static: This should remain a C function
|
||||
* for os/390 and Solaris Workshop [grhoten]
|
||||
*/
|
||||
U_CDECL_BEGIN
|
||||
static UBool U_CALLCONV
|
||||
|
@ -17,6 +17,7 @@
|
||||
#include "unicode/hextouni.h"
|
||||
#include "unicode/unitohex.h"
|
||||
#include "unicode/unicode.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/ucnv.h"
|
||||
#include "unicode/ucnv_err.h"
|
||||
|
||||
@ -61,6 +62,7 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
|
||||
TESTCASE(26,TestLiberalizedID);
|
||||
TESTCASE(27,TestCreateInstance);
|
||||
TESTCASE(28,TestNormalizationTransliterator);
|
||||
TESTCASE(29,TestCompoundRBT);
|
||||
default: name = ""; break;
|
||||
}
|
||||
}
|
||||
@ -1053,93 +1055,6 @@ void TransliteratorTest::TestLiberalizedID(void) {
|
||||
}
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
void TransliteratorTest::expect(const UnicodeString& rules,
|
||||
const UnicodeString& source,
|
||||
const UnicodeString& expectedResult) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
Transliterator *t = new RuleBasedTransliterator("<ID>", rules, status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("FAIL: Transliterator constructor failed");
|
||||
} else {
|
||||
expect(*t, source, expectedResult);
|
||||
}
|
||||
delete t;
|
||||
}
|
||||
|
||||
void TransliteratorTest::expect(const Transliterator& t,
|
||||
const UnicodeString& source,
|
||||
const UnicodeString& expectedResult,
|
||||
const Transliterator& reverseTransliterator) {
|
||||
expect(t, source, expectedResult);
|
||||
expect(reverseTransliterator, expectedResult, source);
|
||||
}
|
||||
|
||||
void TransliteratorTest::expect(const Transliterator& t,
|
||||
const UnicodeString& source,
|
||||
const UnicodeString& expectedResult) {
|
||||
UnicodeString result(source);
|
||||
t.transliterate(result);
|
||||
expectAux(t.getID() + ":String", source, result, expectedResult);
|
||||
|
||||
UnicodeString rsource(source);
|
||||
t.transliterate(rsource);
|
||||
expectAux(t.getID() + ":Replaceable", source, rsource, expectedResult);
|
||||
|
||||
// Test keyboard (incremental) transliteration -- this result
|
||||
// must be the same after we finalize (see below).
|
||||
rsource.remove();
|
||||
UTransPosition index={0, 0, 0, 0};
|
||||
UnicodeString log;
|
||||
|
||||
for (int32_t i=0; i<source.length(); ++i) {
|
||||
if (i != 0) {
|
||||
log.append(" + ");
|
||||
}
|
||||
log.append(source.charAt(i)).append(" -> ");
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
t.transliterate(rsource, index, source.charAt(i), status);
|
||||
// Append the string buffer with a vertical bar '|' where
|
||||
// the committed index is.
|
||||
UnicodeString left, right;
|
||||
rsource.extractBetween(0, index.start, left);
|
||||
rsource.extractBetween(index.start, rsource.length(), right);
|
||||
log.append(left).append((UChar)PIPE).append(right);
|
||||
}
|
||||
|
||||
// As a final step in keyboard transliteration, we must call
|
||||
// transliterate to finish off any pending partial matches that
|
||||
// were waiting for more input.
|
||||
t.finishTransliteration(rsource, index);
|
||||
log.append(" => ").append(rsource);
|
||||
|
||||
expectAux(t.getID() + ":Keyboard", log,
|
||||
rsource == expectedResult,
|
||||
expectedResult);
|
||||
}
|
||||
|
||||
void TransliteratorTest::expectAux(const UnicodeString& tag,
|
||||
const UnicodeString& source,
|
||||
const UnicodeString& result,
|
||||
const UnicodeString& expectedResult) {
|
||||
expectAux(tag, source + " -> " + result,
|
||||
result == expectedResult,
|
||||
expectedResult);
|
||||
}
|
||||
|
||||
void TransliteratorTest::expectAux(const UnicodeString& tag,
|
||||
const UnicodeString& summary, UBool pass,
|
||||
const UnicodeString& expectedResult) {
|
||||
if (pass) {
|
||||
logln(UnicodeString("(")+tag+") " + prettify(summary));
|
||||
} else {
|
||||
errln(UnicodeString("FAIL: (")+tag+") "
|
||||
+ prettify(summary)
|
||||
+ ", expected " + prettify(expectedResult));
|
||||
}
|
||||
}
|
||||
/* test for Jitterbug 912 */
|
||||
void TransliteratorTest::TestCreateInstance(){
|
||||
UParseError *err = 0;
|
||||
@ -1248,3 +1163,157 @@ void TransliteratorTest::TestNormalizationTransliterator() {
|
||||
delete NFKD;
|
||||
delete NFKC;
|
||||
}
|
||||
|
||||
/**
|
||||
* Test compound RBT rules.
|
||||
*/
|
||||
void TransliteratorTest::TestCompoundRBT(void) {
|
||||
// Careful with spacing and ';' here: Phrase this exactly
|
||||
// as toRules() is going to return it. If toRules() changes
|
||||
// with regard to spacing or ';', then adjust this string.
|
||||
UnicodeString rule("::Hex-Unicode;\n"
|
||||
"::Any-Lower;\n"
|
||||
"a > '.A.';\n"
|
||||
"b > '.B.';\n"
|
||||
"::Any[^t]-Upper;", "");
|
||||
Transliterator *t = Transliterator::createFromRules("Test", rule);
|
||||
if (t == 0) {
|
||||
errln("FAIL: createFromRules failed");
|
||||
return;
|
||||
}
|
||||
expect(*t, "\\u0043at in the hat, bat on the mat",
|
||||
"C.A.t IN tHE H.A.t, .B..A.t ON tHE M.A.t");
|
||||
UnicodeString r;
|
||||
t->toRules(r, TRUE);
|
||||
if (r == rule) {
|
||||
logln((UnicodeString)"OK: toRules() => " + r);
|
||||
} else {
|
||||
errln((UnicodeString)"FAIL: toRules() => " + r +
|
||||
", expected " + rule);
|
||||
}
|
||||
delete t;
|
||||
|
||||
// Now test toRules
|
||||
t = Transliterator::createInstance("Greek-Latin; Latin-Cyrillic");
|
||||
if (t == 0) {
|
||||
errln("FAIL: createInstance failed");
|
||||
return;
|
||||
}
|
||||
UnicodeString exp("::Greek-Latin;\n::Latin-Cyrillic;");
|
||||
t->toRules(r, TRUE);
|
||||
if (r != exp) {
|
||||
errln((UnicodeString)"FAIL: toRules() => " + r +
|
||||
", expected " + exp);
|
||||
} else {
|
||||
logln((UnicodeString)"OK: toRules() => " + r);
|
||||
}
|
||||
delete t;
|
||||
|
||||
// Round trip the result of toRules
|
||||
t = Transliterator::createFromRules("Test", r);
|
||||
if (t == 0) {
|
||||
errln("FAIL: createFromRules #2 failed");
|
||||
return;
|
||||
} else {
|
||||
logln((UnicodeString)"OK: createFromRules(" + r + ") succeeded");
|
||||
}
|
||||
|
||||
// Test toRules again
|
||||
t->toRules(r, TRUE);
|
||||
if (r != exp) {
|
||||
errln((UnicodeString)"FAIL: toRules() => " + r +
|
||||
", expected " + exp);
|
||||
} else {
|
||||
logln((UnicodeString)"OK: toRules() => " + r);
|
||||
}
|
||||
|
||||
delete t;
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
void TransliteratorTest::expect(const UnicodeString& rules,
|
||||
const UnicodeString& source,
|
||||
const UnicodeString& expectedResult) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
Transliterator *t = new RuleBasedTransliterator("<ID>", rules, status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("FAIL: Transliterator constructor failed");
|
||||
} else {
|
||||
expect(*t, source, expectedResult);
|
||||
}
|
||||
delete t;
|
||||
}
|
||||
|
||||
void TransliteratorTest::expect(const Transliterator& t,
|
||||
const UnicodeString& source,
|
||||
const UnicodeString& expectedResult,
|
||||
const Transliterator& reverseTransliterator) {
|
||||
expect(t, source, expectedResult);
|
||||
expect(reverseTransliterator, expectedResult, source);
|
||||
}
|
||||
|
||||
void TransliteratorTest::expect(const Transliterator& t,
|
||||
const UnicodeString& source,
|
||||
const UnicodeString& expectedResult) {
|
||||
UnicodeString result(source);
|
||||
t.transliterate(result);
|
||||
expectAux(t.getID() + ":String", source, result, expectedResult);
|
||||
|
||||
UnicodeString rsource(source);
|
||||
t.transliterate(rsource);
|
||||
expectAux(t.getID() + ":Replaceable", source, rsource, expectedResult);
|
||||
|
||||
// Test keyboard (incremental) transliteration -- this result
|
||||
// must be the same after we finalize (see below).
|
||||
rsource.remove();
|
||||
UTransPosition index={0, 0, 0, 0};
|
||||
UnicodeString log;
|
||||
|
||||
for (int32_t i=0; i<source.length(); ++i) {
|
||||
if (i != 0) {
|
||||
log.append(" + ");
|
||||
}
|
||||
log.append(source.charAt(i)).append(" -> ");
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
t.transliterate(rsource, index, source.charAt(i), status);
|
||||
// Append the string buffer with a vertical bar '|' where
|
||||
// the committed index is.
|
||||
UnicodeString left, right;
|
||||
rsource.extractBetween(0, index.start, left);
|
||||
rsource.extractBetween(index.start, rsource.length(), right);
|
||||
log.append(left).append((UChar)PIPE).append(right);
|
||||
}
|
||||
|
||||
// As a final step in keyboard transliteration, we must call
|
||||
// transliterate to finish off any pending partial matches that
|
||||
// were waiting for more input.
|
||||
t.finishTransliteration(rsource, index);
|
||||
log.append(" => ").append(rsource);
|
||||
|
||||
expectAux(t.getID() + ":Keyboard", log,
|
||||
rsource == expectedResult,
|
||||
expectedResult);
|
||||
}
|
||||
|
||||
void TransliteratorTest::expectAux(const UnicodeString& tag,
|
||||
const UnicodeString& source,
|
||||
const UnicodeString& result,
|
||||
const UnicodeString& expectedResult) {
|
||||
expectAux(tag, source + " -> " + result,
|
||||
result == expectedResult,
|
||||
expectedResult);
|
||||
}
|
||||
|
||||
void TransliteratorTest::expectAux(const UnicodeString& tag,
|
||||
const UnicodeString& summary, UBool pass,
|
||||
const UnicodeString& expectedResult) {
|
||||
if (pass) {
|
||||
logln(UnicodeString("(")+tag+") " + prettify(summary));
|
||||
} else {
|
||||
errln(UnicodeString("FAIL: (")+tag+") "
|
||||
+ prettify(summary)
|
||||
+ ", expected " + prettify(expectedResult));
|
||||
}
|
||||
}
|
||||
|
@ -159,6 +159,8 @@ class TransliteratorTest : public IntlTest {
|
||||
|
||||
void TestNormalizationTransliterator(void);
|
||||
|
||||
void TestCompoundRBT(void);
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
|
Loading…
Reference in New Issue
Block a user