ICU-4539 Added ability to put ::ID rules into the middle of Transliterator rule sets.

X-SVN-Rev: 17844
This commit is contained in:
Richard Gillam 2005-06-09 17:30:48 +00:00
parent 950b3b0bc3
commit fb164eba5a
18 changed files with 852 additions and 410 deletions

View File

@ -1,5 +1,5 @@
#--------------------------------------------------------------------
# Copyright (c) 1999-2004, International Business Machines
# Copyright (c) 1999-2005, International Business Machines
# Corporation and others. All Rights Reserved.
#--------------------------------------------------------------------
@ -27,7 +27,7 @@ e {($tone) r} > r &tone-digit($1);
# The following backs up until it finds the right vowel, then deposits the tone
$vowel = [aAeEiIoOuUüÜ];
$vowel = [aAeEiIoOuUüÜ {u\u0308} {U\u0308} ];
$consonant = [[a-z A-Z] - [$vowel]];
$digit = [1-5];
$1 &digit-tone($3) $2 < ([aAeE]) ($vowel* $consonant*) ($digit);

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1999-2004, International Business Machines
* Copyright (C) 1999-2005, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
@ -51,7 +51,7 @@ CompoundTransliterator::CompoundTransliterator(
int32_t transliteratorCount,
UnicodeFilter* adoptedFilter) :
Transliterator(joinIDs(transliterators, transliteratorCount), adoptedFilter),
trans(0), count(0), compoundRBTIndex(-1) {
trans(0), count(0), numAnonymousRBTs(0) {
setTransliterators(transliterators, transliteratorCount);
}
@ -68,20 +68,36 @@ CompoundTransliterator::CompoundTransliterator(const UnicodeString& id,
UParseError& /*parseError*/,
UErrorCode& status) :
Transliterator(id, adoptedFilter),
trans(0), compoundRBTIndex(-1) {
trans(0), numAnonymousRBTs(0) {
// TODO add code for parseError...currently unused, but
// later may be used by parsing code...
init(id, direction, -1, 0, TRUE, status);
init(id, direction, TRUE, status);
}
CompoundTransliterator::CompoundTransliterator(const UnicodeString& id,
UParseError& /*parseError*/,
UErrorCode& status) :
Transliterator(id, 0), // set filter to 0 here!
trans(0), compoundRBTIndex(-1) {
trans(0), numAnonymousRBTs(0) {
// TODO add code for parseError...currently unused, but
// later may be used by parsing code...
init(id, UTRANS_FORWARD, -1, 0, TRUE, status);
init(id, UTRANS_FORWARD, TRUE, status);
}
/**
* Private constructor for use of TransliteratorAlias.
*
* Builds a compound transliterator from an already-assembled list of
* child transliterators rather than from an ID string.
*
* @param ID            the ID passed to the Transliterator base class
* @param list          the child transliterators; handed to
*                      init(list, ...), always in the forward direction
*                      and with fixReverseID == FALSE
* @param adoptedFilter the filter passed to the Transliterator base class
* @param anonymousRBTs stored in numAnonymousRBTs, which toRules()
*                      consults when deciding whether to emit the global
*                      filter and "::Null;" separators
* @param parseError    currently unused
* @param status        error code passed through to init()
*/
CompoundTransliterator::CompoundTransliterator(const UnicodeString& ID,
UVector& list,
UnicodeFilter* adoptedFilter,
int32_t anonymousRBTs,
UParseError& /*parseError*/,
UErrorCode& status) :
Transliterator(ID, adoptedFilter),
trans(0), numAnonymousRBTs(anonymousRBTs)
{
// The list is already in forward order, so no reverse-ID fixup is requested.
init(list, UTRANS_FORWARD, FALSE, status);
}
/**
@ -93,7 +109,7 @@ CompoundTransliterator::CompoundTransliterator(UVector& list,
UParseError& /*parseError*/,
UErrorCode& status) :
Transliterator(EMPTY, NULL),
trans(0), compoundRBTIndex(-1)
trans(0), numAnonymousRBTs(0)
{
// TODO add code for parseError...currently unused, but
// later may be used by parsing code...
@ -101,20 +117,14 @@ CompoundTransliterator::CompoundTransliterator(UVector& list,
// assume caller will fixup ID
}
/**
* Private constructor for compound RBTs. Construct a compound
* transliterator using the given idBlock, with the adoptedTrans
* inserted at the idSplitPoint.
*/
CompoundTransliterator::CompoundTransliterator(const UnicodeString& newID,
const UnicodeString& idBlock,
int32_t idSplitPoint,
Transliterator *adoptedTrans,
CompoundTransliterator::CompoundTransliterator(UVector& list,
int32_t anonymousRBTs,
UParseError& /*parseError*/,
UErrorCode& status) :
Transliterator(newID, 0),
trans(0), compoundRBTIndex(-1)
Transliterator(EMPTY, NULL),
trans(0), numAnonymousRBTs(anonymousRBTs)
{
init(idBlock, UTRANS_FORWARD, idSplitPoint, adoptedTrans, FALSE, status);
init(list, UTRANS_FORWARD, FALSE, status);
}
/**
@ -135,14 +145,11 @@ CompoundTransliterator::CompoundTransliterator(const UnicodeString& newID,
*/
void CompoundTransliterator::init(const UnicodeString& id,
UTransDirection direction,
int32_t idSplitPoint,
Transliterator *adoptedSplitTrans,
UBool fixReverseID,
UErrorCode& status) {
// assert(trans == 0);
if (U_FAILURE(status)) {
delete adoptedSplitTrans;
return;
}
@ -152,12 +159,11 @@ void CompoundTransliterator::init(const UnicodeString& id,
if (!TransliteratorIDParser::parseCompoundID(id, direction,
regenID, list, compoundFilter)) {
status = U_INVALID_ID;
delete adoptedSplitTrans;
delete compoundFilter;
return;
}
compoundRBTIndex = TransliteratorIDParser::instantiateList(list, adoptedSplitTrans, idSplitPoint, status);
TransliteratorIDParser::instantiateList(list, status);
init(list, direction, fixReverseID, status);
@ -209,11 +215,6 @@ void CompoundTransliterator::init(UVector& list,
trans[i] = (Transliterator*) list.elementAt(j);
}
// Fix compoundRBTIndex for REVERSE transliterators
if (compoundRBTIndex >= 0 && direction == UTRANS_REVERSE) {
compoundRBTIndex = count - 1 - compoundRBTIndex;
}
// If the direction is UTRANS_REVERSE then we may need to fix the
// ID.
if (direction == UTRANS_REVERSE && fixReverseID) {
@ -251,7 +252,7 @@ UnicodeString CompoundTransliterator::joinIDs(Transliterator* const transliterat
* Copy constructor.
*/
CompoundTransliterator::CompoundTransliterator(const CompoundTransliterator& t) :
Transliterator(t), trans(0), count(0), compoundRBTIndex(-1) {
Transliterator(t), trans(0), count(0), numAnonymousRBTs(-1) {
*this = t;
}
@ -292,7 +293,7 @@ CompoundTransliterator& CompoundTransliterator::operator=(
for (i=0; i<count; ++i) {
trans[i] = t.trans[i]->clone();
}
compoundRBTIndex = t.compoundRBTIndex;
numAnonymousRBTs = t.numAnonymousRBTs;
return *this;
}
@ -359,7 +360,7 @@ UnicodeString& CompoundTransliterator::toRules(UnicodeString& rulesSource,
// numAnonymousRBTs >= 1. For the anonymous RBTs (those whose IDs begin
// with "%Pass"), we do call toRules() recursively.
rulesSource.truncate(0);
if (compoundRBTIndex >= 0 && getFilter() != NULL) {
if (numAnonymousRBTs >= 1 && getFilter() != NULL) {
// If we are a compound RBT and if we have a global
// filter, then emit it at the top.
UnicodeString pat;
@ -367,8 +368,24 @@ UnicodeString& CompoundTransliterator::toRules(UnicodeString& rulesSource,
}
for (int32_t i=0; i<count; ++i) {
UnicodeString rule;
if (i == compoundRBTIndex) {
// Anonymous RuleBasedTransliterators (inline rules and
// ::BEGIN/::END blocks) are given IDs that begin with
// "%Pass": use toRules() to write all the rules to the output
// (and insert "::Null;" if we have two in a row)
if (trans[i]->getID().startsWith("%Pass")) {
trans[i]->toRules(rule, escapeUnprintable);
if (numAnonymousRBTs > 1 && i > 0 && trans[i - 1]->getID().startsWith("%Pass"))
rule = "::Null;" + rule;
// we also use toRules() on CompoundTransliterators (which we
// check for by looking for a semicolon in the ID)-- this gets
// the list of their child transliterators output in the right
// format
} else if (trans[i]->getID().indexOf(';') >= 0) {
trans[i]->toRules(rule, escapeUnprintable);
// for everything else, use Transliterator::toRules()
} else {
trans[i]->Transliterator::toRules(rule, escapeUnprintable);
}

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1999-2004, International Business Machines
* Copyright (C) 1999-2005, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
@ -42,13 +42,7 @@ class U_I18N_API CompoundTransliterator : public Transliterator {
int32_t count;
/**
* For compound RBTs (those with an ::id block before and/or after
* the main rule block) we record the index of the RBT here.
* Otherwise, this should have a value of -1. We need this
* information to implement toRules().
*/
int32_t compoundRBTIndex;
int32_t numAnonymousRBTs;
public:
@ -202,28 +196,27 @@ private:
friend class Transliterator;
friend class TransliteratorAlias; // to access private ct
/**
* Private constructor for compound RBTs. Construct a compound
* transliterator using the given idBlock, with the adoptedTrans
* inserted at the idSplitPoint.
*/
CompoundTransliterator(const UnicodeString& ID,
const UnicodeString& idBlock,
int32_t idSplitPoint,
Transliterator *adoptedTrans,
UErrorCode& status);
/**
* Private constructor for Transliterator.
*/
CompoundTransliterator(const UnicodeString& ID,
UVector& list,
UnicodeFilter* adoptedFilter,
int32_t numAnonymousRBTs,
UParseError& parseError,
UErrorCode& status);
CompoundTransliterator(UVector& list,
UParseError& parseError,
UErrorCode& status);
CompoundTransliterator(UVector& list,
int32_t anonymousRBTs,
UParseError& parseError,
UErrorCode& status);
void init(const UnicodeString& id,
UTransDirection direction,
int32_t idSplitPoint,
Transliterator *adoptedRbt,
UBool fixReverseID,
UErrorCode& status);

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1999-2004, International Business Machines
* Copyright (C) 1999-2005, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
@ -43,13 +43,13 @@ void RuleBasedTransliterator::_construct(const UnicodeString& rules,
return;
}
if (parser.idBlock.length() != 0 ||
if (parser.idBlockVector->size() != 0 ||
parser.compoundFilter != NULL) {
status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
return;
}
fData = parser.orphanData();
fData = (TransliterationRuleData*)parser.dataVector->orphanElementAt(0);
setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
}

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1999-2004, International Business Machines
* Copyright (C) 1999-2005, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
@ -23,7 +23,7 @@ U_NAMESPACE_BEGIN
TransliterationRuleData::TransliterationRuleData(UErrorCode& status)
: UMemory(), ruleSet(status),
variableNames(0), variables(0)
variableNames(0), variables(0), variablesAreOwned(TRUE)
{
if (U_FAILURE(status)) {
return;
@ -44,7 +44,8 @@ TransliterationRuleData::TransliterationRuleData(UErrorCode& status)
TransliterationRuleData::TransliterationRuleData(const TransliterationRuleData& other) :
UMemory(other), ruleSet(other.ruleSet),
variablesBase(other.variablesBase),
variablesLength(other.variablesLength)
variablesLength(other.variablesLength),
variablesAreOwned(TRUE)
{
UErrorCode status = U_ZERO_ERROR;
variableNames = new Hashtable(status);
@ -78,12 +79,12 @@ TransliterationRuleData::TransliterationRuleData(const TransliterationRuleData&
TransliterationRuleData::~TransliterationRuleData() {
delete variableNames;
if (variables != 0) {
if (variablesAreOwned && variables != 0) {
for (int32_t i=0; i<variablesLength; ++i) {
delete variables[i];
}
uprv_free(variables);
}
uprv_free(variables);
}
UnicodeFunctor*

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 1999-2004, International Business Machines Corporation and others. All Rights Reserved.
* Copyright (C) 1999-2005, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
@ -74,6 +74,15 @@ public:
*/
UnicodeFunctor** variables;
/**
* Flag that indicates whether this object owns the objects pointed to
* by the elements of the variables array. When a single call to
* Transliterator::createFromRules() produces a CompoundTransliterator
* with more than one RuleBasedTransliterator as children, all of the
* children refer to the same variable objects, so only the first one
* is considered to own them. The destructor deletes the individual
* variable objects only when this flag is set.
* NOTE(review): declared as bool although the surrounding code uses
* UBool with TRUE/FALSE -- confirm whether UBool was intended.
*/
bool variablesAreOwned;
/**
* The character that represents variables[0]. Characters
* variablesBase through variablesBase +

View File

@ -32,6 +32,7 @@
#include "unicode/symtable.h"
#include "tridpars.h"
#include "uvector.h"
#include "hash.h"
#include "util.h"
#include "cmemory.h"
#include "uprops.h"
@ -108,6 +109,15 @@ static const UChar HALF_ENDERS[] = { // "=><;"
static const int32_t ID_TOKEN_LEN = 2;
static const UChar ID_TOKEN[] = { 0x3A, 0x3A }; // ':', ':'
/*
commented out until we do real ::BEGIN/::END functionality
static const int32_t BEGIN_TOKEN_LEN = 5;
static const UChar BEGIN_TOKEN[] = { 0x42, 0x45, 0x47, 0x49, 0x4e }; // 'BEGIN'
static const int32_t END_TOKEN_LEN = 3;
static const UChar END_TOKEN[] = { 0x45, 0x4e, 0x44 }; // 'END'
*/
U_NAMESPACE_BEGIN
//----------------------------------------------------------------------
@ -126,8 +136,11 @@ public:
const UVector* variablesVector; // alias
const Hashtable* variableNames; // alias
ParseData(const TransliterationRuleData* data = 0,
const UVector* variablesVector = 0);
const UVector* variablesVector = 0,
const Hashtable* variableNames = 0);
virtual const UnicodeString* lookup(const UnicodeString& s) const;
@ -153,14 +166,15 @@ private:
};
ParseData::ParseData(const TransliterationRuleData* d,
const UVector* sets) :
data(d), variablesVector(sets) {}
const UVector* sets,
const Hashtable* vNames) :
data(d), variablesVector(sets), variableNames(vNames) {}
/**
* Implement SymbolTable API.
*/
const UnicodeString* ParseData::lookup(const UnicodeString& name) const {
return (const UnicodeString*) data->variableNames->get(name);
return (const UnicodeString*) variableNames->get(name);
}
/**
@ -516,7 +530,7 @@ int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t l
// standin for that matcher.
StringMatcher* m =
new StringMatcher(buf, bufSegStart, buf.length(),
segmentNumber, *parser.data);
segmentNumber, *parser.curData);
// Record and associate object and segment number
parser.setSegmentObject(segmentNumber, m);
@ -554,7 +568,7 @@ int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t l
UnicodeString output;
buf.extractBetween(bufSegStart, buf.length(), output);
FunctionReplacer *r =
new FunctionReplacer(t, new StringReplacer(output, parser.data));
new FunctionReplacer(t, new StringReplacer(output, parser.curData));
// Replace the buffer contents with a stand-in
buf.truncate(bufSegStart);
@ -645,7 +659,7 @@ int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t l
}
UnicodeFunctor *m =
new StringMatcher(buf, qstart, qlimit, 0, *parser.data);
new StringMatcher(buf, qstart, qlimit, 0, *parser.curData);
int32_t min = 0;
int32_t max = Quantifier::MAX;
switch (c) {
@ -794,10 +808,13 @@ UBool RuleHalf::isValidInput(TransliteratorParser& transParser) {
* Constructor.
*/
TransliteratorParser::TransliteratorParser() {
data = NULL;
dataVector = NULL;
idBlockVector = NULL;
curData = NULL;
compoundFilter = NULL;
parseData = NULL;
variablesVector = NULL;
variableNames = NULL;
segmentObjects = NULL;
}
@ -805,10 +822,16 @@ TransliteratorParser::TransliteratorParser() {
* Destructor.
*/
TransliteratorParser::~TransliteratorParser() {
delete data;
while (dataVector != NULL && !dataVector->isEmpty())
delete (TransliterationRuleData*)(dataVector->orphanElementAt(0));
delete dataVector;
delete idBlockVector;
delete compoundFilter;
delete parseData;
while (variablesVector != NULL && !variablesVector->isEmpty())
delete (UnicodeFunctor*)variablesVector->orphanElementAt(0);
delete variablesVector;
delete variableNames;
delete segmentObjects;
}
@ -833,15 +856,6 @@ UnicodeSet* TransliteratorParser::orphanCompoundFilter() {
return f;
}
/**
* Return the data object parsed by parse(). Caller owns result.
*/
TransliterationRuleData* TransliteratorParser::orphanData() {
TransliterationRuleData* d = data;
data = NULL;
return d;
}
//----------------------------------------------------------------------
// Private implementation
//----------------------------------------------------------------------
@ -861,12 +875,31 @@ void TransliteratorParser::parseRules(const UnicodeString& rule,
parseError.preContext[0] = parseError.postContext[0] = (UChar)0;
status = U_ZERO_ERROR;
delete data;
data = new TransliterationRuleData(status);
UBool parsingIDs = TRUE;
UBool inBeginEndBlock = FALSE;
int32_t ruleCount = 0;
if (dataVector == NULL)
dataVector = new UVector(status);
else {
while (!dataVector->isEmpty())
delete (TransliterationRuleData*)(dataVector->orphanElementAt(0));
}
if (U_FAILURE(status)) {
return;
}
if (idBlockVector == NULL) {
idBlockVector = new UVector(status);
idBlockVector->setDeleter(uhash_deleteUnicodeString);
}
else
idBlockVector->removeAllElements();
if (U_FAILURE(status)) {
return;
}
curData = NULL;
direction = theDirection;
ruleCount = 0;
@ -876,34 +909,27 @@ void TransliteratorParser::parseRules(const UnicodeString& rule,
if (variablesVector == NULL) {
variablesVector = new UVector(status);
} else {
variablesVector->removeAllElements();
while (!variablesVector->isEmpty())
delete (UnicodeFunctor*)variablesVector->orphanElementAt(0);
}
parseData = new ParseData(0, variablesVector);
if (variableNames == NULL) {
variableNames = new Hashtable(status);
variableNames->setValueDeleter(uhash_deleteUnicodeString);
} else {
variableNames->removeAll();
}
parseData = new ParseData(0, variablesVector, variableNames);
if (parseData == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
parseData->data = data;
// By default, rules use part of the private use area
// E000..F8FF for variables and other stand-ins. Currently
// the range F000..F8FF is typically sufficient. The 'use
// variable range' pragma allows rule sets to modify this.
setVariableRange(0xF000, 0xF8FF);
dotStandIn = (UChar) -1;
UnicodeString str; // scratch
idBlock.truncate(0);
idSplitPoint = -1;
UnicodeString idBlockResult;
int32_t pos = 0;
int32_t limit = rule.length();
// The mode marks whether we are in the header ::id block, the
// rule block, or the footer ::id block.
// mode == 0: start: rule->1, ::id->0
// mode == 1: in rules: rule->1, ::id->2
// mode == 2: in footer rule block: rule->ERROR, ::id->2
int32_t mode = 0;
// The compound filter offset is an index into idBlockResult.
// If it is 0, then the compound filter occurred at the start,
@ -913,9 +939,6 @@ void TransliteratorParser::parseRules(const UnicodeString& rule,
compoundFilter = NULL;
int32_t compoundFilterOffset = -1;
// The number of ::ID block entries we have parsed
int32_t idBlockCount = 0;
while (pos < limit && U_SUCCESS(status)) {
UChar c = rule.charAt(pos++);
if (uprv_isRuleWhiteSpace(c)) {
@ -930,13 +953,21 @@ void TransliteratorParser::parseRules(const UnicodeString& rule,
}
continue; // Either fall out or restart with next line
}
// skip empty rules
if (c == END_OF_RULE)
continue;
// keep track of how many rules we've seen
++ruleCount;
// We've found the start of a rule or ID. c is its first
// character, and pos points past c.
--pos;
// Look for an ID token. Must have at least ID_TOKEN_LEN + 1
// chars left.
if ((pos + ID_TOKEN_LEN + 1) <= limit &&
rule.compare(pos, ID_TOKEN_LEN, ID_TOKEN) == 0) {
rule.compare(pos, ID_TOKEN_LEN, ID_TOKEN) == 0) {
pos += ID_TOKEN_LEN;
c = rule.charAt(pos);
while (uprv_isRuleWhiteSpace(c) && pos < limit) {
@ -944,33 +975,35 @@ void TransliteratorParser::parseRules(const UnicodeString& rule,
c = rule.charAt(pos);
}
if (mode == 1) {
// We have just entered the footer ::ID block
mode = 2;
// In the forward direction add elements at the end.
// In the reverse direction add elements at the start.
idSplitPoint = idBlockCount;
}
int32_t p = pos;
if (!parsingIDs) {
if (curData != NULL) {
if (direction == UTRANS_FORWARD)
dataVector->addElement(curData, status);
else
dataVector->insertElementAt(curData, 0, status);
curData = NULL;
}
parsingIDs = TRUE;
}
TransliteratorIDParser::SingleID* id =
TransliteratorIDParser::parseSingleID(rule, p, direction, status);
if (p != pos && ICU_Utility::parseChar(rule, p, END_OF_RULE)) {
// Successful ::ID parse.
if (direction == UTRANS_FORWARD) {
idBlock.append(id->canonID).append(END_OF_RULE);
idBlockResult.append(id->canonID).append(END_OF_RULE);
} else {
idBlock.insert(0, END_OF_RULE);
idBlock.insert(0, id->canonID);
idBlockResult.insert(0, END_OF_RULE);
idBlockResult.insert(0, id->canonID);
}
++idBlockCount;
} else {
// Couldn't parse an ID. Try to parse a global filter
int32_t withParens = -1;
UnicodeSet* f = TransliteratorIDParser::parseGlobalFilter(rule, p, direction, withParens, &idBlock);
UnicodeSet* f = TransliteratorIDParser::parseGlobalFilter(rule, p, direction, withParens, NULL);
if (f != NULL) {
if (ICU_Utility::parseChar(rule, p, END_OF_RULE)
&& (direction == UTRANS_FORWARD) == (withParens == 0))
@ -981,7 +1014,7 @@ void TransliteratorParser::parseRules(const UnicodeString& rule,
delete f;
} else {
compoundFilter = f;
compoundFilterOffset = idBlockCount;
compoundFilterOffset = ruleCount;
}
} else {
delete f;
@ -993,78 +1026,93 @@ void TransliteratorParser::parseRules(const UnicodeString& rule,
}
}
delete id;
pos = p;
} else if (resemblesPragma(rule, pos, limit)) {
int32_t ppp = parsePragma(rule, pos, limit);
if (ppp < 0) {
syntaxError(U_MALFORMED_PRAGMA, rule, pos);
}
pos = ppp;
} else {
// Parse a rule
pos = parseRule(rule, pos, limit);
if (U_SUCCESS(status)) {
++ruleCount;
if (mode == 2) {
// ::id in illegal position (because a rule
// occurred after the ::id footer block)
syntaxError(U_ILLEGAL_ARGUMENT_ERROR,rule,pos);
}
}else{
syntaxError(status,rule,pos);
if (parsingIDs) {
if (direction == UTRANS_FORWARD)
idBlockVector->addElement(new UnicodeString(idBlockResult), status);
else
idBlockVector->insertElementAt(new UnicodeString(idBlockResult), 0, status);
idBlockResult.remove();
parsingIDs = FALSE;
curData = new TransliterationRuleData(status);
parseData->data = curData;
// By default, rules use part of the private use area
// E000..F8FF for variables and other stand-ins. Currently
// the range F000..F8FF is typically sufficient. The 'use
// variable range' pragma allows rule sets to modify this.
setVariableRange(0xF000, 0xF8FF);
}
if (resemblesPragma(rule, pos, limit)) {
int32_t ppp = parsePragma(rule, pos, limit);
if (ppp < 0) {
syntaxError(U_MALFORMED_PRAGMA, rule, pos);
}
pos = ppp;
// Parse a rule
} else {
pos = parseRule(rule, pos, limit);
}
mode = 1;
}
}
if (idSplitPoint < 0) {
idSplitPoint = idBlockCount;
if (parsingIDs && idBlockResult.length() > 0) {
if (direction == UTRANS_FORWARD)
idBlockVector->addElement(new UnicodeString(idBlockResult), status);
else
idBlockVector->insertElementAt(new UnicodeString(idBlockResult), 0, status);
}
else if (!parsingIDs && curData != NULL) {
if (direction == UTRANS_FORWARD)
dataVector->addElement(curData, status);
else
dataVector->insertElementAt(curData, 0, status);
}
if (direction == UTRANS_REVERSE) {
idSplitPoint = idBlockCount - idSplitPoint;
}
// Convert the set vector to an array
data->variablesLength = variablesVector->size();
if(data->variablesLength == 0) {
data->variables = 0;
} else {
data->variables = (UnicodeFunctor **)uprv_malloc(data->variablesLength * sizeof(UnicodeFunctor *));
}
// orphanElement removes the given element and shifts all other
// elements down. For performance (and code clarity) we work from
// the end back to index 0.
int32_t i;
for (i=data->variablesLength; i>0; ) {
--i;
data->variables[i] =
(UnicodeSet*) variablesVector->orphanElementAt(i);
}
// Index the rules
if (U_SUCCESS(status)) {
// Convert the set vector to an array
for (int32_t i = 0; i < dataVector->size(); i++) {
TransliterationRuleData* data = (TransliterationRuleData*)dataVector->elementAt(i);
data->variablesLength = variablesVector->size();
if (data->variablesLength == 0) {
data->variables = 0;
} else {
data->variables = (UnicodeFunctor**)uprv_malloc(data->variablesLength * sizeof(UnicodeFunctor*));
data->variablesAreOwned = (i == 0);
}
for (int32_t j = 0; j < data->variablesLength; j++) {
data->variables[j] =
((UnicodeSet*)variablesVector->elementAt(j));
}
data->variableNames->removeAll();
int32_t pos = -1;
const UHashElement* he = variableNames->nextElement(pos);
while (he != NULL) {
data->variableNames->put(*((UnicodeString*)(he->key.pointer)),
((UnicodeString*)(he->value.pointer))->clone(), status);
he = variableNames->nextElement(pos);
}
}
variablesVector->removeAllElements(); // keeps them from getting deleted when we succeed
// Index the rules
if (compoundFilter != NULL) {
if ((direction == UTRANS_FORWARD &&
compoundFilterOffset != 0) ||
(direction == UTRANS_REVERSE &&
compoundFilterOffset != idBlockCount)) {
if ((direction == UTRANS_FORWARD && compoundFilterOffset != 1) ||
(direction == UTRANS_REVERSE && compoundFilterOffset != ruleCount)) {
status = U_MISPLACED_COMPOUND_FILTER;
}
}
data->ruleSet.freeze(parseError,status);
if (idSplitPoint < 0) {
idSplitPoint = idBlock.length();
}
if (ruleCount == 0) {
delete data;
data = NULL;
for (int32_t i = 0; i < dataVector->size(); i++) {
TransliterationRuleData* data = (TransliterationRuleData*)dataVector->elementAt(i);
data->ruleSet.freeze(parseError, status);
}
if (idBlockVector->size() == 1 && ((UnicodeString*)idBlockVector->elementAt(0))->isEmpty())
idBlockVector->removeElementAt(0);
}
}
@ -1077,8 +1125,11 @@ void TransliteratorParser::setVariableRange(int32_t start, int32_t end) {
return;
}
data->variablesBase = variableNext = (UChar) start; // first private use
variableLimit = (UChar) (end + 1);
curData->variablesBase = (UChar) start;
if (dataVector->size() == 0) {
variableNext = (UChar) start;
variableLimit = (UChar) (end + 1);
}
}
/**
@ -1087,7 +1138,7 @@ void TransliteratorParser::setVariableRange(int32_t start, int32_t end) {
* variable range does not overlap characters used in a rule.
*/
UBool TransliteratorParser::checkVariableRange(UChar32 ch) const {
return !(ch >= data->variablesBase && ch < variableLimit);
return !(ch >= curData->variablesBase && ch < variableLimit);
}
/**
@ -1276,7 +1327,7 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
}
// We allow anything on the right, including an empty string.
UnicodeString* value = new UnicodeString(right->text);
data->variableNames->put(undefinedVariableName, value, status);
variableNames->put(undefinedVariableName, value, status);
++variableLimit;
return pos;
}
@ -1363,13 +1414,13 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
segmentObjects->toArray((void**) segmentsArray);
}
data->ruleSet.addRule(new TransliterationRule(
curData->ruleSet.addRule(new TransliterationRule(
left->text, left->ante, left->post,
right->text, right->cursor, right->cursorOffset,
segmentsArray,
segmentObjects->size(),
left->anchorStart, left->anchorEnd,
data,
curData,
status), status);
return pos;
@ -1434,7 +1485,7 @@ UChar TransliteratorParser::generateStandInFor(UnicodeFunctor* adopted) {
// (typical n is 0, 1, or 2); linear search is optimal.
for (int32_t i=0; i<variablesVector->size(); ++i) {
if (variablesVector->elementAt(i) == adopted) { // [sic] pointer comparison
return (UChar) (data->variablesBase + i);
return (UChar) (curData->variablesBase + i);
}
}
@ -1452,7 +1503,7 @@ UChar TransliteratorParser::generateStandInFor(UnicodeFunctor* adopted) {
*/
UChar TransliteratorParser::getSegmentStandin(int32_t seg) {
// Special character used to indicate an empty spot
UChar empty = data->variablesBase - 1;
UChar empty = curData->variablesBase - 1;
while (segmentStandins.length() < seg) {
segmentStandins.append(empty);
}
@ -1483,7 +1534,7 @@ void TransliteratorParser::setSegmentObject(int32_t seg, StringMatcher* adopted)
if (segmentObjects->size() < seg) {
segmentObjects->setSize(seg);
}
int32_t index = getSegmentStandin(seg) - data->variablesBase;
int32_t index = getSegmentStandin(seg) - curData->variablesBase;
if (segmentObjects->elementAt(seg-1) != NULL ||
variablesVector->elementAt(index) != NULL) {
// should never happen
@ -1511,7 +1562,7 @@ UChar TransliteratorParser::getDotStandIn() {
*/
void TransliteratorParser::appendVariableDef(const UnicodeString& name,
UnicodeString& buf) {
const UnicodeString* s = (const UnicodeString*) data->variableNames->get(name);
const UnicodeString* s = (const UnicodeString*) variableNames->get(name);
if (s == NULL) {
// We allow one undefined variable so that variable definition
// statements work. For the first undefined variable we return

View File

@ -26,6 +26,7 @@ class ParseData;
class RuleHalf;
class ParsePosition;
class UVector;
class Hashtable;
class StringMatcher;
class TransliteratorParser : public UMemory {
@ -33,27 +34,16 @@ class TransliteratorParser : public UMemory {
public:
/**
* PUBLIC data member containing the parsed data object, or null if
* there were no rules.
* A Vector of TransliterationRuleData objects, one for each discrete group
* of rules in the rule set
*/
TransliterationRuleData* data;
UVector* dataVector;
/**
* PUBLIC data member.
* The block of ::IDs, both at the top and at the bottom.
* Inserted into these may be additional rules at the
* idSplitPoint.
* A Vector of UnicodeStrings containing all of the ID blocks in the rule set
*/
UnicodeString idBlock;
/**
* PUBLIC data member.
* In a compound RBT, the index at which the RBT rules are
* inserted into the ID block. Index 0 means before any IDs
* in the block. Index idBlock.length() means after all IDs
* in the block. Index is a string index.
*/
int32_t idSplitPoint;
UVector* idBlockVector;
/**
* PUBLIC data member containing the parsed compound filter, if any.
@ -62,10 +52,10 @@ class TransliteratorParser : public UMemory {
private:
// The number of rules parsed. This tells us if there were
// any actual transliterator rules, or if there were just ::ID
// block IDs.
int32_t ruleCount;
/**
* The current data object for which we are parsing rules
*/
TransliterationRuleData* curData;
UTransDirection direction;
@ -92,6 +82,12 @@ class TransliteratorParser : public UMemory {
*/
UVector* variablesVector;
/**
* Temporary table of variable names, shared by all of the rule groups
* parsed from one rule set. When parsing is complete, its entries are
* copied into the variableNames table of each TransliterationRuleData
* object in dataVector.
*/
Hashtable* variableNames;
/**
* String of standins for segments. Used during the parsing of a single
* rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds
@ -177,12 +173,6 @@ public:
*/
UnicodeSet* orphanCompoundFilter();
/**
* Return the data object parsed by parse(). Caller owns result.
* @return the data object parsed by parse().
*/
TransliterationRuleData* orphanData();
private:
/**

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1999-2004, International Business Machines
* Copyright (C) 1999-2005, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
@ -120,7 +120,7 @@ inline void _debugOut(const char* msg, TransliterationRule* rule,
UnicodeString esc;
_escape(buf, esc);
CharString cbuf(esc);
printf("%s\n", (char*) cbuf);
printf("%s\n", (const char*) cbuf);
}
#else

View File

@ -921,30 +921,26 @@ Transliterator::createInstance(const UnicodeString& ID,
return NULL;
}
TransliteratorIDParser::instantiateList(list, NULL, -1, status);
TransliteratorIDParser::instantiateList(list, status);
if (U_FAILURE(status)) {
return NULL;
}
U_ASSERT(list.size() > 0);
Transliterator* t = NULL;
switch (list.size()) {
case 1:
t = (Transliterator*) list.elementAt(0);
break;
default:
if (list.size() > 1 || canonID.indexOf(";") >= 0) {
// [NOTE: If it's a compoundID, we instantiate a CompoundTransliterator even if it only
// has one child transliterator. This is so that toRules() will return the right thing
// (without any inactive ID), but our main ID still comes out correct. That is, if we
// instantiate "(Lower);Latin-Greek;", we want the rules to come out as "::Latin-Greek;"
// even though the ID is "(Lower);Latin-Greek;".]
t = new CompoundTransliterator(list, parseError, status);
/* test for NULL */
if (t == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
if (U_FAILURE(status)) {
delete t;
return NULL;
}
break;
}
else {
t = (Transliterator*)list.elementAt(0);
}
t->setID(canonID);
if (globalFilter != NULL) {
t->adoptFilter(globalFilter);
@ -1053,59 +1049,61 @@ Transliterator::createFromRules(const UnicodeString& ID,
}
// NOTE: The logic here matches that in TransliteratorRegistry.
if (parser.idBlock.length() == 0) {
if (parser.data == NULL) {
// No idBlock, no data -- this is just an
// alias for Null
t = new NullTransliterator();
} else {
// No idBlock, data != 0 -- this is an
// ordinary RBT_DATA.
t = new RuleBasedTransliterator(ID, parser.orphanData(), TRUE); // TRUE == adopt data object
if (parser.idBlockVector->size() == 0 && parser.dataVector->size() == 0) {
t = new NullTransliterator();
}
else if (parser.idBlockVector->size() == 0 && parser.dataVector->size() == 1) {
t = new RuleBasedTransliterator(ID, (TransliterationRuleData*)parser.dataVector->orphanElementAt(0), TRUE);
}
else if (parser.idBlockVector->size() == 1 && parser.dataVector->size() == 0) {
// idBlock, no data -- this is an alias. The ID has
// been munged from reverse into forward mode, if
// necessary, so instantiate the ID in the forward
// direction.
if (parser.compoundFilter != NULL) {
UnicodeString filterPattern;
parser.compoundFilter->toPattern(filterPattern, FALSE);
t = createInstance(filterPattern + ";"
+ *((UnicodeString*)parser.idBlockVector->elementAt(0)), UTRANS_FORWARD, parseError, status);
}
/* test for NULL */
if (t == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
} else {
if (parser.data == NULL) {
// idBlock, no data -- this is an alias. The ID has
// been munged from reverse into forward mode, if
// necessary, so instantiate the ID in the forward
// direction.
t = createInstance(parser.idBlock, UTRANS_FORWARD, parseError, status);
if (t != NULL) {
t->setID(ID);
}
} else {
// idBlock and data -- this is a compound
// RBT
UnicodeString id((UChar)0x005F); // '_'
t = new RuleBasedTransliterator(id, parser.orphanData(), TRUE); // TRUE == adopt data object
/* test for NULL */
if (t == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
t = new CompoundTransliterator(ID, parser.idBlock, parser.idSplitPoint,
t, status);
/* test for NULL */
if (t == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
if (U_FAILURE(status)) {
delete t;
t = 0;
}
if (parser.compoundFilter != NULL) {
t->adoptFilter(parser.orphanCompoundFilter());
}
return t;
else
t = createInstance(*((UnicodeString*)parser.idBlockVector->elementAt(0)), UTRANS_FORWARD, parseError, status);
if (t != NULL) {
t->setID(ID);
}
}
else {
UVector transliterators(status);
int32_t passNumber = 1;
int32_t limit = parser.idBlockVector->size();
if (parser.dataVector->size() > limit)
limit = parser.dataVector->size();
for (int32_t i = 0; i < limit; i++) {
if (i < parser.idBlockVector->size()) {
UnicodeString* idBlock = (UnicodeString*)parser.idBlockVector->elementAt(i);
if (!idBlock->isEmpty()) {
Transliterator* temp = createInstance(*idBlock, UTRANS_FORWARD, parseError, status);
if (temp != NULL && temp->getDynamicClassID() != NullTransliterator::getStaticClassID())
transliterators.addElement(temp, status);
else
delete temp;
}
}
if (!parser.dataVector->isEmpty()) {
TransliterationRuleData* data = (TransliterationRuleData*)parser.dataVector->orphanElementAt(0);
transliterators.addElement(new RuleBasedTransliterator((UnicodeString)"%Pass" + (passNumber++),
data, TRUE), status);
}
}
t = new CompoundTransliterator(transliterators, passNumber - 1, parseError, status);
t->setID(ID);
t->adoptFilter(parser.orphanCompoundFilter());
}
return t;
}

View File

@ -58,25 +58,25 @@ U_NAMESPACE_BEGIN
// Alias
//------------------------------------------------------------------
TransliteratorAlias::TransliteratorAlias(const UnicodeString& theAliasID) :
TransliteratorAlias::TransliteratorAlias(const UnicodeString& theAliasID,
const UnicodeSet* cpdFilter) :
ID(),
aliasID(theAliasID),
trans(0),
compoundFilter(0),
idSplitPoint(-1),
aliasesOrRules(theAliasID),
transes(0),
compoundFilter(cpdFilter),
direction(UTRANS_FORWARD),
type(TransliteratorAlias::SIMPLE) {
}
TransliteratorAlias::TransliteratorAlias(const UnicodeString& theID,
const UnicodeString& idBlock,
Transliterator* adopted,
int32_t theIDSplitPoint,
const UnicodeString& idBlocks,
UVector* adoptedTransliterators,
const UnicodeSet* cpdFilter) :
ID(theID),
aliasID(idBlock),
trans(adopted),
aliasesOrRules(idBlocks),
transes(adoptedTransliterators),
compoundFilter(cpdFilter),
idSplitPoint(theIDSplitPoint),
direction(UTRANS_FORWARD),
type(TransliteratorAlias::COMPOUND) {
}
@ -84,15 +84,15 @@ TransliteratorAlias::TransliteratorAlias(const UnicodeString& theID,
const UnicodeString& rules,
UTransDirection dir) :
ID(theID),
aliasID(rules), // bad name -- rename aliasID!
trans(0),
aliasesOrRules(rules),
transes(0),
compoundFilter(0),
idSplitPoint((int32_t) dir), // bad name -- rename idSplitPoint!
direction(dir),
type(TransliteratorAlias::RULES) {
}
TransliteratorAlias::~TransliteratorAlias() {
delete trans;
delete transes;
}
@ -104,23 +104,60 @@ Transliterator* TransliteratorAlias::create(UParseError& pe,
Transliterator *t = NULL;
switch (type) {
case SIMPLE:
t = Transliterator::createInstance(aliasID, UTRANS_FORWARD, pe, ec);
t = Transliterator::createInstance(aliasesOrRules, UTRANS_FORWARD, pe, ec);
if (compoundFilter != 0)
t->adoptFilter((UnicodeSet*)compoundFilter->clone());
break;
case COMPOUND:
t = new CompoundTransliterator(ID, aliasID, idSplitPoint,
trans, ec);
/* test for NULL */
if (t == 0) {
ec = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
trans = 0; // so we don't delete it later
if (compoundFilter) {
// TODO: Is this right? Are we leaking memory here?
// I'm suspicious because of the "trans = 0" line above;
// doesn't seem to fit the cloning here. Don't have time
// to track this down right now. [alan 3.0]
t->adoptFilter((UnicodeSet*) compoundFilter->clone());
{
// the total number of transliterators in the compound is the total number of anonymous transliterators
// plus the total number of ID blocks-- we start by assuming the list begins and ends with an ID
// block and that each pair of anonymous transliterators has an ID block between them. Then we go back
// to see whether there really are ID blocks at the beginning and end (by looking for U+FFFF, which
// marks the position where an anonymous transliterator goes) and adjust accordingly
int32_t anonymousRBTs = transes->size();
int32_t transCount = anonymousRBTs * 2 + 1;
if (!aliasesOrRules.isEmpty() && aliasesOrRules[0] == (UChar)(0xffff))
--transCount;
if (aliasesOrRules.length() >= 2 && aliasesOrRules[aliasesOrRules.length() - 1] == (UChar)(0xffff))
--transCount;
UnicodeString noIDBlock((UChar)(0xffff));
noIDBlock += ((UChar)(0xffff));
int32_t pos = aliasesOrRules.indexOf(noIDBlock);
while (pos >= 0) {
--transCount;
pos = aliasesOrRules.indexOf(noIDBlock, pos + 1);
}
UVector transliterators(ec);
UnicodeString idBlock;
int32_t blockSeparatorPos = aliasesOrRules.indexOf((UChar)(0xffff));
while (blockSeparatorPos >= 0) {
aliasesOrRules.extract(0, blockSeparatorPos, idBlock);
aliasesOrRules.remove(0, blockSeparatorPos + 1);
if (!idBlock.isEmpty())
transliterators.addElement(Transliterator::createInstance(idBlock, UTRANS_FORWARD, pe, ec), ec);
if (!transes->isEmpty())
transliterators.addElement(transes->orphanElementAt(0), ec);
blockSeparatorPos = aliasesOrRules.indexOf((UChar)(0xffff));
}
if (!aliasesOrRules.isEmpty())
transliterators.addElement(Transliterator::createInstance(aliasesOrRules, UTRANS_FORWARD, pe, ec), ec);
while (!transes->isEmpty())
transliterators.addElement(transes->orphanElementAt(0), ec);
if (U_SUCCESS(ec)) {
t = new CompoundTransliterator(ID, transliterators,
(compoundFilter ? (UnicodeSet*)(compoundFilter->clone()) : 0),
anonymousRBTs, pe, ec);
if (t == 0) {
ec = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
} else {
for (int32_t i = 0; i < transliterators.size(); i++)
delete (Transliterator*)(transliterators.elementAt(i));
}
}
break;
case RULES:
@ -141,9 +178,7 @@ void TransliteratorAlias::parse(TransliteratorParser& parser,
return;
}
// aliasID is really rules -- rename it!
// idSplitPoint is really UTransDirection -- rename it!
parser.parse(aliasID, (UTransDirection) idSplitPoint, pe, ec);
parser.parse(aliasesOrRules, direction, pe, ec);
}
//----------------------------------------------------------------------
@ -399,7 +434,8 @@ public:
UnicodeSet* compoundFilter; // For COMPOUND_RBT
union {
Transliterator* prototype; // For PROTOTYPE
TransliterationRuleData* data; // For RBT_DATA, COMPOUND_RBT
TransliterationRuleData* data; // For RBT_DATA
UVector* dataVector; // For COMPOUND_RBT
struct {
Transliterator::Factory function;
Transliterator::Token context;
@ -428,12 +464,16 @@ Entry::~Entry() {
DEBUG_delEntry(this);
if (entryType == PROTOTYPE) {
delete u.prototype;
} else if (entryType == RBT_DATA || entryType == COMPOUND_RBT) {
} else if (entryType == RBT_DATA) {
// The data object is shared between instances of RBT. The
// entry object owns it. It should only be deleted when the
// transliterator component is being cleaned up. Doing so
// invalidates any RBTs that the user has instantiated.
delete u.data;
} else if (entryType == COMPOUND_RBT) {
while (u.dataVector != NULL && !u.dataVector->isEmpty())
delete (TransliterationRuleData*)u.dataVector->orphanElementAt(0);
delete u.dataVector;
}
delete compoundFilter;
}
@ -522,39 +562,41 @@ Transliterator* TransliteratorRegistry::reget(const UnicodeString& ID,
entry->entryType == Entry::RULES_REVERSE ||
entry->entryType == Entry::LOCALE_RULES) {
entry->u.data = parser.orphanData();
entry->stringArg = parser.idBlock;
entry->intArg = parser.idSplitPoint;
entry->compoundFilter = parser.orphanCompoundFilter();
if (parser.idBlockVector->isEmpty() && parser.dataVector->isEmpty()) {
entry->u.data = 0;
entry->entryType = Entry::ALIAS;
entry->stringArg = UNICODE_STRING_SIMPLE("Any-NULL");
}
else if (parser.idBlockVector->isEmpty() && parser.dataVector->size() == 1) {
entry->u.data = (TransliterationRuleData*)parser.dataVector->orphanElementAt(0);
entry->entryType = Entry::RBT_DATA;
}
else if (parser.idBlockVector->size() == 1 && parser.dataVector->isEmpty()) {
entry->stringArg = *(UnicodeString*)(parser.idBlockVector->elementAt(0));
entry->compoundFilter = parser.orphanCompoundFilter();
entry->entryType = Entry::ALIAS;
}
else {
entry->entryType = Entry::COMPOUND_RBT;
entry->compoundFilter = parser.orphanCompoundFilter();
entry->u.dataVector = new UVector(status);
entry->stringArg.remove();
// Reset entry->entryType to encapsulate the parsed data. The
// next time we instantiate this ID (including this very next
// time, at the end of this function) we won't have to parse
// again.
// NOTE: The logic here matches that in
// Transliterator::createFromRules().
if (entry->stringArg.length() == 0) {
if (entry->u.data == 0) {
// No idBlock, no data -- this is just an
// alias for Null
entry->entryType = Entry::ALIAS;
entry->stringArg = UNICODE_STRING_SIMPLE("Any-Null");
} else {
// No idBlock, data != 0 -- this is an
// ordinary RBT_DATA
entry->entryType = Entry::RBT_DATA;
}
} else {
if (entry->u.data == 0) {
// idBlock, no data -- this is an alias. The ID has
// been munged from reverse into forward mode, if
// necessary, so instantiate the ID in the forward
// direction.
entry->entryType = Entry::ALIAS;
} else {
// idBlock and data -- this is a compound
// RBT
entry->entryType = Entry::COMPOUND_RBT;
int32_t limit = parser.idBlockVector->size();
if (parser.dataVector->size() > limit)
limit = parser.dataVector->size();
for (int32_t i = 0; i < limit; i++) {
if (i < parser.idBlockVector->size()) {
UnicodeString* idBlock = (UnicodeString*)parser.idBlockVector->elementAt(i);
if (!idBlock->isEmpty())
entry->stringArg += *idBlock;
}
if (!parser.dataVector->isEmpty()) {
TransliterationRuleData* data = (TransliterationRuleData*)parser.dataVector->orphanElementAt(0);
entry->u.dataVector->addElement(data, status);
entry->stringArg += (UChar)0xffff; // use U+FFFF to mark position of RBTs in ID block
}
}
}
}
@ -1165,7 +1207,7 @@ Transliterator* TransliteratorRegistry::instantiateEntry(const UnicodeString& ID
}
return t;
case Entry::ALIAS:
aliasReturn = new TransliteratorAlias(entry->stringArg);
aliasReturn = new TransliteratorAlias(entry->stringArg, entry->compoundFilter);
if (aliasReturn == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
}
@ -1178,13 +1220,19 @@ Transliterator* TransliteratorRegistry::instantiateEntry(const UnicodeString& ID
return t;
case Entry::COMPOUND_RBT:
{
UnicodeString id((UChar)0x005F); /* "_" */
Transliterator *t = new RuleBasedTransliterator(id, entry->u.data);
if (t == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
return 0;
UVector* rbts = new UVector(status);
int32_t passNumber = 1;
for (int32_t i = 0; U_SUCCESS(status) && i < entry->u.dataVector->size(); i++) {
Transliterator* t = new RuleBasedTransliterator((UnicodeString)"%Pass" + (passNumber++),
(TransliterationRuleData*)(entry->u.dataVector->elementAt(i)), FALSE);
if (t == 0)
status = U_MEMORY_ALLOCATION_ERROR;
else
rbts->addElement(t, status);
}
aliasReturn = new TransliteratorAlias(ID, entry->stringArg, t, entry->intArg, entry->compoundFilter);
if (U_FAILURE(status))
return 0;
aliasReturn = new TransliteratorAlias(ID, entry->stringArg, rbts, entry->compoundFilter);
}
if (aliasReturn == 0) {
status = U_MEMORY_ALLOCATION_ERROR;

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (c) 2001-2004, International Business Machines
* Copyright (c) 2001-2005, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
@ -44,13 +44,13 @@ class TransliteratorAlias : public UMemory {
* Construct a simple alias (type == SIMPLE)
* @param aliasID the given id.
*/
TransliteratorAlias(const UnicodeString& aliasID);
TransliteratorAlias(const UnicodeString& aliasID, const UnicodeSet* compoundFilter);
/**
* Construct a compound RBT alias (type == COMPOUND)
*/
TransliteratorAlias(const UnicodeString& ID, const UnicodeString& idBlock,
Transliterator* adopted, int32_t idSplitPoint,
TransliteratorAlias(const UnicodeString& ID, const UnicodeString& idBlocks,
UVector* adoptedTransliterators,
const UnicodeSet* compoundFilter);
/**
@ -108,10 +108,10 @@ class TransliteratorAlias : public UMemory {
// Here ID is the ID, aliasesOrRules is the rules string.
// direction is the UTransDirection.
UnicodeString ID;
UnicodeString aliasID; // rename! holds rules for RULES type
Transliterator* trans; // owned
UnicodeString aliasesOrRules;
UVector* transes; // owned
const UnicodeSet* compoundFilter; // alias
int32_t idSplitPoint; // rename! holds UTransDirection for RULES type
UTransDirection direction;
enum { SIMPLE, COMPOUND, RULES } type;
TransliteratorAlias(const TransliteratorAlias &other); // forbid copying of this class

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (c) 2002-2004, International Business Machines Corporation
* Copyright (c) 2002-2005, International Business Machines Corporation
* and others. All Rights Reserved.
**********************************************************************
* Date Name Description
@ -437,22 +437,13 @@ UBool TransliteratorIDParser::parseCompoundID(const UnicodeString& id, int32_t d
* the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert
* SingleID entries to actual transliterators.
*
* Also, optionally, insert the given transliterator at the given
* position. This effectively happens before anything else.
*
* @param list vector of SingleID objects. On exit, vector
* of one or more Transliterators.
* @param insert Transliterator to insert, or NULL if none.
* Adopted.
* @param insertIndex index from 0..list.size()-1, at which
* to place 'insert', or -1 if none.
* @return new value of insertIndex. The index will shift if
* there are empty items, like "(Lower)", with indices less than
* insertIndex.
*/
int32_t TransliteratorIDParser::instantiateList(UVector& list,
Transliterator* insert,
int32_t insertIndex,
void TransliteratorIDParser::instantiateList(UVector& list,
UErrorCode& ec) {
UVector tlist(ec);
if (U_FAILURE(ec)) {
@ -463,15 +454,6 @@ int32_t TransliteratorIDParser::instantiateList(UVector& list,
Transliterator* t;
int32_t i;
for (i=0; i<=list.size(); ++i) { // [sic]: i<=list.size()
if (insertIndex == i) {
insertIndex = tlist.size();
tlist.addElement(insert, ec);
if (U_FAILURE(ec)) {
goto RETURN;
}
insert = NULL;
}
// We run the loop too long by one, so we can
// do an insert after the last element
if (i==list.size()) {
@ -525,9 +507,7 @@ int32_t TransliteratorIDParser::instantiateList(UVector& list,
}
}
delete insert; // Clean up in case of failure
list.setDeleter(save);
return insertIndex;
}
/**

View File

@ -1,6 +1,6 @@
/*
**************************************************************************
* Copyright (c) 2002-2004, International Business Machines Corporation *
* Copyright (c) 2002-2005, International Business Machines Corporation *
* and others. All Rights Reserved. *
**************************************************************************
* Date Name Description *
@ -202,23 +202,15 @@ class TransliteratorIDParser /* not : public UObject because all methods are sta
* the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert
* SingleID entries to actual transliterators.
*
* Also, optionally, insert the given transliterator at the given
* position. This effectively happens before anything else.
*
* @param list vector of SingleID objects. On exit, vector
* of one or more Transliterators.
* @param insert Transliterator to insert, or null if none.
* @param insertIndex index from 0..list.size()-1, at which
* to place 'insert', or -1 if none.
* @param ec Output param to receive a success or an error code.
* @return new value of insertIndex. The index will shift if
* there are empty items, like "(Lower)", with indices less than
* insertIndex.
*/
static int32_t instantiateList(UVector& list,
Transliterator* insert,
int32_t insertIndex,
UErrorCode& ec);
static void instantiateList(UVector& list,
UErrorCode& ec);
/**
* Parse an ID into pieces. Take IDs of the form T, T/V, S-T,

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1999-2004, International Business Machines
* Copyright (C) 1999-2005, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
@ -343,6 +343,7 @@ protected:
friend class TransliteratorParser; // for parseID()
friend class TransliteratorIDParser; // for createBasicInstance()
friend class TransliteratorAlias; // for setID()
public:

View File

@ -1101,19 +1101,19 @@ void TransliteratorRoundTripTest::TestHan() {
pn->transliterate(target2);
// verify that there are no marks
Transliterator *nfc = Transliterator::createInstance("nfc", UTRANS_FORWARD, status);
Transliterator *nfd = Transliterator::createInstance("nfd", UTRANS_FORWARD, status);
ASSERT_SUCCESS(status);
UnicodeString nfced = target2;
nfc->transliterate(nfced);
UnicodeSet allMarks("[:mark:]", status);
UnicodeString nfded = target2;
nfd->transliterate(nfded);
UnicodeSet allMarks("[\\u0304\\u0301\\u030C\\u0300\\u0306]", status); // look only for Pinyin tone marks, not all marks (there are some others in there)
ASSERT_SUCCESS(status);
assertFalse("NumericPinyin must contain no marks", allMarks.containsSome(nfced));
assertFalse("NumericPinyin must contain no marks", allMarks.containsSome(nfded));
// verify roundtrip
Transliterator *np = pn->createInverse(status);
ASSERT_SUCCESS(status);
UnicodeString target3 = target;
UnicodeString target3 = target2;
np->transliterate(target3);
UBool roundtripOK = (target3.compare(target) == 0);
assertTrue("NumericPinyin must roundtrip", roundtripOK);
@ -1125,13 +1125,15 @@ void TransliteratorRoundTripTest::TestHan() {
writeStringInU8(out, target);
fprintf(out, "\nPinyin-Numeric-Pinyin: ");
writeStringInU8(out, target2);
fprintf(out, "\nNumeric-Pinyin-Pinyin: ");
writeStringInU8(out, target3);
fprintf(out, "\n");
fclose(out);
}
delete hanTL;
delete pn;
delete nfc;
delete nfd;
delete np;
uset_close(USetExemplars);
}

View File

@ -183,6 +183,8 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
TESTCASE(75,TestAllCodepoints);
TESTCASE(76,TestBoilerplate);
TESTCASE(77,TestAlternateSyntax);
TESTCASE(78,TestBeginEnd);
TESTCASE(79,TestBeginEndToRules);
default: name = ""; break;
}
}
@ -776,7 +778,7 @@ void TransliteratorTest::TestJ277(void) {
// Transliterate the Greek locale data
Locale el("el");
DateFormatSymbols syms(el, status);
if (U_FAILURE(status)) { errln("FAIL: DateFormatSymbols constructor failed. Error: " + UnicodeString(u_errorName(status))); return; }
if (U_FAILURE(status)) { errln("FAIL: Transliterator constructor failed"); return; }
int32_t i, count;
const UnicodeString* data = syms.getMonths(count);
for (i=0; i<count; ++i) {
@ -3972,6 +3974,332 @@ void TransliteratorTest::TestAlternateSyntax() {
"<=\\N{LEFTWARDS ARROW}; >=\\N{RIGHTWARDS ARROW}; <>=\\N{LEFT RIGHT ARROW}; &=\\N{INCREMENT}");
}
// Rule sets shared by TestBeginEnd() and TestBeginEndToRules().
//
// Each array element is one complete rule set, written as adjacent string
// literals that the compiler concatenates. The "// [n]" markers give the
// index of the element that follows. Rule sets that rely on ::BEGIN/::END
// are commented out and replaced by an empty string so that the indexes of
// the remaining entries (which BEGIN_END_TEST_CASES and the tests refer to
// by number) do not shift.
static const char* BEGIN_END_RULES[] = {
// [0]
"abc > xy;"
"aba > z;",
// [1]
/*
"::BEGIN;"
"abc > xy;"
"::END;"
"::BEGIN;"
"aba > z;"
"::END;",
*/
"", // test case commented out below, this is here to keep from messing up the indexes
// [2]
/*
"abc > xy;"
"::BEGIN;"
"aba > z;"
"::END;",
*/
"", // test case commented out below, this is here to keep from messing up the indexes
// [3]
/*
"::BEGIN;"
"abc > xy;"
"::END;"
"aba > z;",
*/
"", // test case commented out below, this is here to keep from messing up the indexes
// [4]
"abc > xy;"
"::Null;"
"aba > z;",
// [5]
"::Upper;"
"ABC > xy;"
"AB > x;"
"C > z;"
"::Upper;"
"XYZ > p;"
"XY > q;"
"Z > r;"
"::Upper;",
// [6]
"$ws = [[:Separator:][\\u0009-\\u000C]$];"
"$delim = [\\-$ws];"
"$ws $delim* > ' ';"
"'-' $delim* > '-';",
// [7]
"::Null;"
"$ws = [[:Separator:][\\u0009-\\u000C]$];"
"$delim = [\\-$ws];"
"$ws $delim* > ' ';"
"'-' $delim* > '-';",
// [8]
"$ws = [[:Separator:][\\u0009-\\u000C]$];"
"$delim = [\\-$ws];"
"$ws $delim* > ' ';"
"'-' $delim* > '-';"
"::Null;",
// [9]
"$ws = [[:Separator:][\\u0009-\\u000C]$];"
"$delim = [\\-$ws];"
"::Null;"
"$ws $delim* > ' ';"
"'-' $delim* > '-';",
// [10]
/*
"::BEGIN;"
"$ws = [[:Separator:][\\u0009-\\u000C]$];"
"$delim = [\\-$ws];"
"::END;"
"$ws $delim* > ' ';"
"'-' $delim* > '-';",
*/
"", // test case commented out below, this is here to keep from messing up the indexes
// [11]
/*
"$ws = [[:Separator:][\\u0009-\\u000C]$];"
"$delim = [\\-$ws];"
"::BEGIN;"
"$ws $delim* > ' ';"
"'-' $delim* > '-';"
"::END;",
*/
"", // test case commented out below, this is here to keep from messing up the indexes
// [12]
/*
"$ws = [[:Separator:][\\u0009-\\u000C]$];"
"$delim = [\\-$ws];"
"$ab = [ab];"
"::BEGIN;"
"$ws $delim* > ' ';"
"'-' $delim* > '-';"
"::END;"
"::BEGIN;"
"$ab { ' ' } $ab > '-';"
"c { ' ' > ;"
"::END;"
"::BEGIN;"
"'a-a' > a\\%|a;"
"::END;",
*/
"", // test case commented out below, this is here to keep from messing up the indexes
// [13]
"$ws = [[:Separator:][\\u0009-\\u000C]$];"
"$delim = [\\-$ws];"
"$ab = [ab];"
"::Null;"
"$ws $delim* > ' ';"
"'-' $delim* > '-';"
"::Null;"
"$ab { ' ' } $ab > '-';"
"c { ' ' > ;"
"::Null;"
"'a-a' > a\\%|a;",
// [14]
/*
"::[abc];"
"::BEGIN;"
"abc > xy;"
"::END;"
"::BEGIN;"
"aba > yz;"
"::END;"
"::Upper;",
*/
"", // test case commented out below, this is here to keep from messing up the indexes
// [15]
"::[abc];"
"abc > xy;"
"::Null;"
"aba > yz;"
"::Upper;",
// [16]
/*
"::[abc];"
"::BEGIN;"
"abc <> xy;"
"::END;"
"::BEGIN;"
"aba <> yz;"
"::END;"
"::Upper(Lower);"
"::([XYZ]);"
*/
"", // test case commented out below, this is here to keep from messing up the indexes
// [17]
// This is the one reversible rule set (it uses "<>" rules); TestBeginEnd()
// and TestBeginEndToRules() also instantiate it with UTRANS_REVERSE.
"::[abc];"
"abc <> xy;"
"::Null;"
"aba <> yz;"
"::Upper(Lower);"
"::([XYZ]);"
};
// Number of elements in BEGIN_END_RULES, including the empty placeholders.
static const int32_t BEGIN_END_RULES_length = (int32_t)(sizeof(BEGIN_END_RULES) / sizeof(BEGIN_END_RULES[0]));
/*
(This entire test is commented out below and will need some heavy revision when we re-add
the ::BEGIN/::END stuff)
static const char* BOGUS_BEGIN_END_RULES[] = {
// [7]
"::BEGIN;"
"abc > xy;"
"::BEGIN;"
"aba > z;"
"::END;"
"::END;",
// [8]
"abc > xy;"
" aba > z;"
"::END;",
// [9]
"::BEGIN;"
"::Upper;"
"::END;"
};
static const int32_t BOGUS_BEGIN_END_RULES_length = (int32_t)(sizeof(BOGUS_BEGIN_END_RULES) / sizeof(BOGUS_BEGIN_END_RULES[0]));
*/
// Test-case table for TestBeginEnd() and TestBeginEndToRules().
//
// The array is flat: every test case occupies three consecutive elements
// (rule set, input text, expected output), and the tests step through it
// three elements at a time. Rows for rule sets that depend on ::BEGIN/::END
// are commented out together with the corresponding BEGIN_END_RULES entries.
static const char* BEGIN_END_TEST_CASES[] = {
// rules input expected output
BEGIN_END_RULES[0], "abc ababc aba", "xy zbc z",
// BEGIN_END_RULES[1], "abc ababc aba", "xy abxy z",
// BEGIN_END_RULES[2], "abc ababc aba", "xy abxy z",
// BEGIN_END_RULES[3], "abc ababc aba", "xy abxy z",
BEGIN_END_RULES[4], "abc ababc aba", "xy abxy z",
BEGIN_END_RULES[5], "abccabaacababcbc", "PXAARXQBR",
BEGIN_END_RULES[6], "e e - e---e- e", "e e e-e-e",
BEGIN_END_RULES[7], "e e - e---e- e", "e e e-e-e",
BEGIN_END_RULES[8], "e e - e---e- e", "e e e-e-e",
BEGIN_END_RULES[9], "e e - e---e- e", "e e e-e-e",
// BEGIN_END_RULES[10], "e e - e---e- e", "e e e-e-e",
// BEGIN_END_RULES[11], "e e - e---e- e", "e e e-e-e",
// BEGIN_END_RULES[12], "e e - e---e- e", "e e e-e-e",
// BEGIN_END_RULES[12], "a a a a", "a%a%a%a",
// BEGIN_END_RULES[12], "a a-b c b a", "a%a-b cb-a",
BEGIN_END_RULES[13], "e e - e---e- e", "e e e-e-e",
BEGIN_END_RULES[13], "a a a a", "a%a%a%a",
BEGIN_END_RULES[13], "a a-b c b a", "a%a-b cb-a",
// BEGIN_END_RULES[14], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
BEGIN_END_RULES[15], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
// BEGIN_END_RULES[16], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
BEGIN_END_RULES[17], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ"
};
// Total number of array elements (three per test case), not the number of cases.
static const int32_t BEGIN_END_TEST_CASES_length = (int32_t)(sizeof(BEGIN_END_TEST_CASES) / sizeof(BEGIN_END_TEST_CASES[0]));
void TransliteratorTest::TestBeginEnd() {
// run through the list of test cases above
int32_t i = 0;
for (i = 0; i < BEGIN_END_TEST_CASES_length; i += 3) {
expect((UnicodeString)"Test case #" + (i / 3),
UnicodeString(BEGIN_END_TEST_CASES[i]),
UnicodeString(BEGIN_END_TEST_CASES[i + 1]),
UnicodeString(BEGIN_END_TEST_CASES[i + 2]));
}
// instantiate the one reversible rule set in the reverse direction and make sure it does the right thing
UParseError parseError;
UErrorCode status = U_ZERO_ERROR;
Transliterator* reversed = Transliterator::createFromRules("Reversed", UnicodeString(BEGIN_END_RULES[17]),
UTRANS_REVERSE, parseError, status);
if (reversed == 0 || U_FAILURE(status)) {
reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator"), parseError, status);
} else {
expect(*reversed, UnicodeString("xy XY XYZ yz YZ"), UnicodeString("xy abc xaba yz aba"));
}
delete reversed;
// finally, run through the list of syntactically-ill-formed rule sets above and make sure
// that all of them cause errors
/*
(commented out until we have the real ::BEGIN/::END stuff in place
for (i = 0; i < BOGUS_BEGIN_END_RULES_length; i++) {
UParseError parseError;
UErrorCode status = U_ZERO_ERROR;
Transliterator* t = Transliterator::createFromRules("foo", UnicodeString(BOGUS_BEGIN_END_RULES[i]),
UTRANS_FORWARD, parseError, status);
if (!U_FAILURE(status)) {
delete t;
errln((UnicodeString)"Should have gotten syntax error from " + BOGUS_BEGIN_END_RULES[i]);
}
}
*/
}
/**
 * Round-trips the BEGIN_END_TEST_CASES rule sets through toRules().
 *
 * For each test case a Transliterator is built from the rules, toRules() is
 * called on it, a second Transliterator is built from the generated rules,
 * and that second one must produce the expected output. This checks that the
 * generated rule set is semantically equivalent to (i.e., does the same thing
 * as) the original rule set. The same is then done for the reversible rule
 * set, BEGIN_END_RULES[17], instantiated with UTRANS_REVERSE.
 *
 * Every transliterator returned by createFromRules() is deleted on all paths,
 * including the failure paths, and is checked for NULL before it is used.
 */
void TransliteratorTest::TestBeginEndToRules() {
    for (int32_t i = 0; i < BEGIN_END_TEST_CASES_length; i += 3) {
        UParseError parseError;
        UErrorCode status = U_ZERO_ERROR;
        Transliterator* t = Transliterator::createFromRules("--", UnicodeString(BEGIN_END_TEST_CASES[i]),
                                                            UTRANS_FORWARD, parseError, status);
        if (t == 0 || U_FAILURE(status)) {
            reportParseError(UnicodeString("FAIL: Couldn't create transliterator"), parseError, status);
        } else {
            UnicodeString rules;
            t->toRules(rules, TRUE);
            Transliterator* t2 = Transliterator::createFromRules((UnicodeString)"Test case #" + (i / 3), rules,
                                                                 UTRANS_FORWARD, parseError, status);
            if (t2 == 0 || U_FAILURE(status)) {
                reportParseError(UnicodeString("FAIL: Couldn't create transliterator from generated rules"),
                                 parseError, status);
            } else {
                expect(*t2,
                       UnicodeString(BEGIN_END_TEST_CASES[i + 1]),
                       UnicodeString(BEGIN_END_TEST_CASES[i + 2]));
            }
            // Safe when t2 is NULL; avoids leaking an object returned alongside a failure status.
            delete t2;
        }
        // Same reasoning as for t2: delete whatever createFromRules() handed back.
        delete t;
    }

    // do the same thing for the reversible test case
    UParseError parseError;
    UErrorCode status = U_ZERO_ERROR;
    Transliterator* reversed = Transliterator::createFromRules("Reversed", UnicodeString(BEGIN_END_RULES[17]),
                                                               UTRANS_REVERSE, parseError, status);
    if (reversed == 0 || U_FAILURE(status)) {
        reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator"), parseError, status);
    } else {
        UnicodeString rules;
        reversed->toRules(rules, FALSE);
        // The generated rules already describe the reversed behavior, so they are
        // instantiated in the forward direction.
        Transliterator* reversed2 = Transliterator::createFromRules("Reversed", rules, UTRANS_FORWARD,
                                                                    parseError, status);
        if (reversed2 == 0 || U_FAILURE(status)) {
            reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator from generated rules"),
                             parseError, status);
        } else {
            expect(*reversed2,
                   UnicodeString("xy XY XYZ yz YZ"),
                   UnicodeString("xy abc xaba yz aba"));
        }
        delete reversed2;
    }
    delete reversed;
}
//======================================================================
// Support methods
//======================================================================
@ -3990,14 +4318,35 @@ void TransliteratorTest::expectT(const UnicodeString& id,
delete t;
}
void TransliteratorTest::reportParseError(const UnicodeString& message,
const UParseError& parseError,
const UErrorCode& status) {
errln(message +
/*", parse error " + parseError.code +*/
", line " + parseError.line +
", offset " + parseError.offset +
", pre-context " + prettify(parseError.preContext, TRUE) +
", post-context " + prettify(parseError.postContext,TRUE) +
", Error: " + u_errorName(status));
}
void TransliteratorTest::expect(const UnicodeString& rules,
const UnicodeString& source,
const UnicodeString& expectedResult,
UTransPosition *pos) {
expect("<ID>", rules, source, expectedResult, pos);
}
void TransliteratorTest::expect(const UnicodeString& id,
const UnicodeString& rules,
const UnicodeString& source,
const UnicodeString& expectedResult,
UTransPosition *pos) {
UErrorCode status = U_ZERO_ERROR;
Transliterator *t = new RuleBasedTransliterator("<ID>", rules, status);
UParseError parseError;
Transliterator* t = Transliterator::createFromRules(id, rules, UTRANS_FORWARD, parseError, status);
if (U_FAILURE(status)) {
errln("FAIL: Transliterator constructor failed");
reportParseError(UnicodeString("Couldn't create transliterator from ") + rules, parseError, status);
} else {
expect(*t, source, expectedResult, pos);
}
@ -4021,7 +4370,6 @@ void TransliteratorTest::expect(const Transliterator& t,
t.transliterate(result);
expectAux(t.getID() + ":String", source, result, expectedResult);
}
UTransPosition index={0, 0, 0, 0};
if (pos != 0) {
index = *pos;

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1999-2003, International Business Machines
* Copyright (C) 1999-2005, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
@ -344,6 +344,10 @@ private:
void TestAlternateSyntax(void);
void TestBeginEnd(void);
void TestBeginEndToRules(void);
//======================================================================
// Support methods
//======================================================================
@ -357,6 +361,12 @@ private:
const UnicodeString& expectedResult,
UTransPosition *pos=0);
void expect(const UnicodeString& id,
const UnicodeString& rules,
const UnicodeString& source,
const UnicodeString& expectedResult,
UTransPosition *pos=0);
void expect(const Transliterator& t,
const UnicodeString& source,
const UnicodeString& expectedResult,
@ -385,6 +395,8 @@ private:
void CheckIncrementalAux(const Transliterator* t,
const UnicodeString& input);
void reportParseError(const UnicodeString& message, const UParseError& parseError, const UErrorCode& status);
const UnicodeString DESERET_DEE;
const UnicodeString DESERET_dee;