ICU-4539 Added ability to put ::ID rules into the middle of Transliterator rule sets.
X-SVN-Rev: 17844
This commit is contained in:
parent
950b3b0bc3
commit
fb164eba5a
@ -1,5 +1,5 @@
|
||||
#--------------------------------------------------------------------
|
||||
# Copyright (c) 1999-2004, International Business Machines
|
||||
# Copyright (c) 1999-2005, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#--------------------------------------------------------------------
|
||||
|
||||
@ -27,7 +27,7 @@ e {($tone) r} > r &tone-digit($1);
|
||||
|
||||
# The following backs up until it finds the right vowel, then deposits the tone
|
||||
|
||||
$vowel = [aAeEiIoOuUüÜ];
|
||||
$vowel = [aAeEiIoOuUüÜ {u\u0308} {U\u0308} ];
|
||||
$consonant = [[a-z A-Z] - [$vowel]];
|
||||
$digit = [1-5];
|
||||
$1 &digit-tone($3) $2 < ([aAeE]) ($vowel* $consonant*) ($digit);
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2004, International Business Machines
|
||||
* Copyright (C) 1999-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
@ -51,7 +51,7 @@ CompoundTransliterator::CompoundTransliterator(
|
||||
int32_t transliteratorCount,
|
||||
UnicodeFilter* adoptedFilter) :
|
||||
Transliterator(joinIDs(transliterators, transliteratorCount), adoptedFilter),
|
||||
trans(0), count(0), compoundRBTIndex(-1) {
|
||||
trans(0), count(0), numAnonymousRBTs(0) {
|
||||
setTransliterators(transliterators, transliteratorCount);
|
||||
}
|
||||
|
||||
@ -68,20 +68,36 @@ CompoundTransliterator::CompoundTransliterator(const UnicodeString& id,
|
||||
UParseError& /*parseError*/,
|
||||
UErrorCode& status) :
|
||||
Transliterator(id, adoptedFilter),
|
||||
trans(0), compoundRBTIndex(-1) {
|
||||
trans(0), numAnonymousRBTs(0) {
|
||||
// TODO add code for parseError...currently unused, but
|
||||
// later may be used by parsing code...
|
||||
init(id, direction, -1, 0, TRUE, status);
|
||||
init(id, direction, TRUE, status);
|
||||
}
|
||||
|
||||
CompoundTransliterator::CompoundTransliterator(const UnicodeString& id,
|
||||
UParseError& /*parseError*/,
|
||||
UErrorCode& status) :
|
||||
Transliterator(id, 0), // set filter to 0 here!
|
||||
trans(0), compoundRBTIndex(-1) {
|
||||
trans(0), numAnonymousRBTs(0) {
|
||||
// TODO add code for parseError...currently unused, but
|
||||
// later may be used by parsing code...
|
||||
init(id, UTRANS_FORWARD, -1, 0, TRUE, status);
|
||||
init(id, UTRANS_FORWARD, TRUE, status);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Private constructor for use of TransliteratorAlias
|
||||
*/
|
||||
CompoundTransliterator::CompoundTransliterator(const UnicodeString& ID,
|
||||
UVector& list,
|
||||
UnicodeFilter* adoptedFilter,
|
||||
int32_t anonymousRBTs,
|
||||
UParseError& /*parseError*/,
|
||||
UErrorCode& status) :
|
||||
Transliterator(ID, adoptedFilter),
|
||||
trans(0), numAnonymousRBTs(anonymousRBTs)
|
||||
{
|
||||
init(list, UTRANS_FORWARD, FALSE, status);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -93,7 +109,7 @@ CompoundTransliterator::CompoundTransliterator(UVector& list,
|
||||
UParseError& /*parseError*/,
|
||||
UErrorCode& status) :
|
||||
Transliterator(EMPTY, NULL),
|
||||
trans(0), compoundRBTIndex(-1)
|
||||
trans(0), numAnonymousRBTs(0)
|
||||
{
|
||||
// TODO add code for parseError...currently unused, but
|
||||
// later may be used by parsing code...
|
||||
@ -101,20 +117,14 @@ CompoundTransliterator::CompoundTransliterator(UVector& list,
|
||||
// assume caller will fixup ID
|
||||
}
|
||||
|
||||
/**
|
||||
* Private constructor for compound RBTs. Construct a compound
|
||||
* transliterator using the given idBlock, with the adoptedTrans
|
||||
* inserted at the idSplitPoint.
|
||||
*/
|
||||
CompoundTransliterator::CompoundTransliterator(const UnicodeString& newID,
|
||||
const UnicodeString& idBlock,
|
||||
int32_t idSplitPoint,
|
||||
Transliterator *adoptedTrans,
|
||||
CompoundTransliterator::CompoundTransliterator(UVector& list,
|
||||
int32_t anonymousRBTs,
|
||||
UParseError& /*parseError*/,
|
||||
UErrorCode& status) :
|
||||
Transliterator(newID, 0),
|
||||
trans(0), compoundRBTIndex(-1)
|
||||
Transliterator(EMPTY, NULL),
|
||||
trans(0), numAnonymousRBTs(anonymousRBTs)
|
||||
{
|
||||
init(idBlock, UTRANS_FORWARD, idSplitPoint, adoptedTrans, FALSE, status);
|
||||
init(list, UTRANS_FORWARD, FALSE, status);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -135,14 +145,11 @@ CompoundTransliterator::CompoundTransliterator(const UnicodeString& newID,
|
||||
*/
|
||||
void CompoundTransliterator::init(const UnicodeString& id,
|
||||
UTransDirection direction,
|
||||
int32_t idSplitPoint,
|
||||
Transliterator *adoptedSplitTrans,
|
||||
UBool fixReverseID,
|
||||
UErrorCode& status) {
|
||||
// assert(trans == 0);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
delete adoptedSplitTrans;
|
||||
return;
|
||||
}
|
||||
|
||||
@ -152,12 +159,11 @@ void CompoundTransliterator::init(const UnicodeString& id,
|
||||
if (!TransliteratorIDParser::parseCompoundID(id, direction,
|
||||
regenID, list, compoundFilter)) {
|
||||
status = U_INVALID_ID;
|
||||
delete adoptedSplitTrans;
|
||||
delete compoundFilter;
|
||||
return;
|
||||
}
|
||||
|
||||
compoundRBTIndex = TransliteratorIDParser::instantiateList(list, adoptedSplitTrans, idSplitPoint, status);
|
||||
TransliteratorIDParser::instantiateList(list, status);
|
||||
|
||||
init(list, direction, fixReverseID, status);
|
||||
|
||||
@ -209,11 +215,6 @@ void CompoundTransliterator::init(UVector& list,
|
||||
trans[i] = (Transliterator*) list.elementAt(j);
|
||||
}
|
||||
|
||||
// Fix compoundRBTIndex for REVERSE transliterators
|
||||
if (compoundRBTIndex >= 0 && direction == UTRANS_REVERSE) {
|
||||
compoundRBTIndex = count - 1 - compoundRBTIndex;
|
||||
}
|
||||
|
||||
// If the direction is UTRANS_REVERSE then we may need to fix the
|
||||
// ID.
|
||||
if (direction == UTRANS_REVERSE && fixReverseID) {
|
||||
@ -251,7 +252,7 @@ UnicodeString CompoundTransliterator::joinIDs(Transliterator* const transliterat
|
||||
* Copy constructor.
|
||||
*/
|
||||
CompoundTransliterator::CompoundTransliterator(const CompoundTransliterator& t) :
|
||||
Transliterator(t), trans(0), count(0), compoundRBTIndex(-1) {
|
||||
Transliterator(t), trans(0), count(0), numAnonymousRBTs(-1) {
|
||||
*this = t;
|
||||
}
|
||||
|
||||
@ -292,7 +293,7 @@ CompoundTransliterator& CompoundTransliterator::operator=(
|
||||
for (i=0; i<count; ++i) {
|
||||
trans[i] = t.trans[i]->clone();
|
||||
}
|
||||
compoundRBTIndex = t.compoundRBTIndex;
|
||||
numAnonymousRBTs = t.numAnonymousRBTs;
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -359,7 +360,7 @@ UnicodeString& CompoundTransliterator::toRules(UnicodeString& rulesSource,
|
||||
// numAnonymousRBTs >= 1. For anonymous RBTs (IDs beginning with "%Pass")
|
||||
// and nested compound transliterators, we do call toRules() recursively.
|
||||
rulesSource.truncate(0);
|
||||
if (compoundRBTIndex >= 0 && getFilter() != NULL) {
|
||||
if (numAnonymousRBTs >= 1 && getFilter() != NULL) {
|
||||
// If we are a compound RBT and if we have a global
|
||||
// filter, then emit it at the top.
|
||||
UnicodeString pat;
|
||||
@ -367,8 +368,24 @@ UnicodeString& CompoundTransliterator::toRules(UnicodeString& rulesSource,
|
||||
}
|
||||
for (int32_t i=0; i<count; ++i) {
|
||||
UnicodeString rule;
|
||||
if (i == compoundRBTIndex) {
|
||||
|
||||
// Anonymous RuleBasedTransliterators (inline rules and
|
||||
// ::BEGIN/::END blocks) are given IDs that begin with
|
||||
// "%Pass": use toRules() to write all the rules to the output
|
||||
// (and insert "::Null;" if we have two in a row)
|
||||
if (trans[i]->getID().startsWith("%Pass")) {
|
||||
trans[i]->toRules(rule, escapeUnprintable);
|
||||
if (numAnonymousRBTs > 1 && i > 0 && trans[i - 1]->getID().startsWith("%Pass"))
|
||||
rule = "::Null;" + rule;
|
||||
|
||||
// we also use toRules() on CompoundTransliterators (which we
|
||||
// check for by looking for a semicolon in the ID)-- this gets
|
||||
// the list of their child transliterators output in the right
|
||||
// format
|
||||
} else if (trans[i]->getID().indexOf(';') >= 0) {
|
||||
trans[i]->toRules(rule, escapeUnprintable);
|
||||
|
||||
// for everything else, use Transliterator::toRules()
|
||||
} else {
|
||||
trans[i]->Transliterator::toRules(rule, escapeUnprintable);
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2004, International Business Machines
|
||||
* Copyright (C) 1999-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
@ -42,13 +42,7 @@ class U_I18N_API CompoundTransliterator : public Transliterator {
|
||||
|
||||
int32_t count;
|
||||
|
||||
/**
|
||||
* For compound RBTs (those with an ::id block before and/or after
|
||||
* the main rule block) we record the index of the RBT here.
|
||||
* Otherwise, this should have a value of -1. We need this
|
||||
* information to implement toRules().
|
||||
*/
|
||||
int32_t compoundRBTIndex;
|
||||
int32_t numAnonymousRBTs;
|
||||
|
||||
public:
|
||||
|
||||
@ -202,28 +196,27 @@ private:
|
||||
friend class Transliterator;
|
||||
friend class TransliteratorAlias; // to access private ct
|
||||
|
||||
/**
|
||||
* Private constructor for compound RBTs. Construct a compound
|
||||
* transliterator using the given idBlock, with the adoptedTrans
|
||||
* inserted at the idSplitPoint.
|
||||
*/
|
||||
CompoundTransliterator(const UnicodeString& ID,
|
||||
const UnicodeString& idBlock,
|
||||
int32_t idSplitPoint,
|
||||
Transliterator *adoptedTrans,
|
||||
UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Private constructor for Transliterator.
|
||||
*/
|
||||
CompoundTransliterator(const UnicodeString& ID,
|
||||
UVector& list,
|
||||
UnicodeFilter* adoptedFilter,
|
||||
int32_t numAnonymousRBTs,
|
||||
UParseError& parseError,
|
||||
UErrorCode& status);
|
||||
|
||||
CompoundTransliterator(UVector& list,
|
||||
UParseError& parseError,
|
||||
UErrorCode& status);
|
||||
|
||||
CompoundTransliterator(UVector& list,
|
||||
int32_t anonymousRBTs,
|
||||
UParseError& parseError,
|
||||
UErrorCode& status);
|
||||
|
||||
void init(const UnicodeString& id,
|
||||
UTransDirection direction,
|
||||
int32_t idSplitPoint,
|
||||
Transliterator *adoptedRbt,
|
||||
UBool fixReverseID,
|
||||
UErrorCode& status);
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2004, International Business Machines
|
||||
* Copyright (C) 1999-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
@ -43,13 +43,13 @@ void RuleBasedTransliterator::_construct(const UnicodeString& rules,
|
||||
return;
|
||||
}
|
||||
|
||||
if (parser.idBlock.length() != 0 ||
|
||||
if (parser.idBlockVector->size() != 0 ||
|
||||
parser.compoundFilter != NULL) {
|
||||
status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
|
||||
return;
|
||||
}
|
||||
|
||||
fData = parser.orphanData();
|
||||
fData = (TransliterationRuleData*)parser.dataVector->orphanElementAt(0);
|
||||
setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2004, International Business Machines
|
||||
* Copyright (C) 1999-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
@ -23,7 +23,7 @@ U_NAMESPACE_BEGIN
|
||||
|
||||
TransliterationRuleData::TransliterationRuleData(UErrorCode& status)
|
||||
: UMemory(), ruleSet(status),
|
||||
variableNames(0), variables(0)
|
||||
variableNames(0), variables(0), variablesAreOwned(TRUE)
|
||||
{
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
@ -44,7 +44,8 @@ TransliterationRuleData::TransliterationRuleData(UErrorCode& status)
|
||||
TransliterationRuleData::TransliterationRuleData(const TransliterationRuleData& other) :
|
||||
UMemory(other), ruleSet(other.ruleSet),
|
||||
variablesBase(other.variablesBase),
|
||||
variablesLength(other.variablesLength)
|
||||
variablesLength(other.variablesLength),
|
||||
variablesAreOwned(TRUE)
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
variableNames = new Hashtable(status);
|
||||
@ -78,12 +79,12 @@ TransliterationRuleData::TransliterationRuleData(const TransliterationRuleData&
|
||||
|
||||
TransliterationRuleData::~TransliterationRuleData() {
|
||||
delete variableNames;
|
||||
if (variables != 0) {
|
||||
if (variablesAreOwned && variables != 0) {
|
||||
for (int32_t i=0; i<variablesLength; ++i) {
|
||||
delete variables[i];
|
||||
}
|
||||
uprv_free(variables);
|
||||
}
|
||||
uprv_free(variables);
|
||||
}
|
||||
|
||||
UnicodeFunctor*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 1999-2004, International Business Machines Corporation and others. All Rights Reserved.
|
||||
* Copyright (C) 1999-2005, International Business Machines Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
@ -74,6 +74,15 @@ public:
|
||||
*/
|
||||
UnicodeFunctor** variables;
|
||||
|
||||
/**
|
||||
* Flag that indicates whether the variables are owned (if a single
|
||||
* call to Transliterator::createFromRules() produces a CompoundTransliterator
|
||||
* with more than one RuleBasedTransliterator as children, they all share
|
||||
* the same variables list, so only the first one is considered to own
|
||||
* the variables)
|
||||
*/
|
||||
bool variablesAreOwned;
|
||||
|
||||
/**
|
||||
* The character that represents variables[0]. Characters
|
||||
* variablesBase through variablesBase +
|
||||
|
@ -32,6 +32,7 @@
|
||||
#include "unicode/symtable.h"
|
||||
#include "tridpars.h"
|
||||
#include "uvector.h"
|
||||
#include "hash.h"
|
||||
#include "util.h"
|
||||
#include "cmemory.h"
|
||||
#include "uprops.h"
|
||||
@ -108,6 +109,15 @@ static const UChar HALF_ENDERS[] = { // "=><;"
|
||||
static const int32_t ID_TOKEN_LEN = 2;
|
||||
static const UChar ID_TOKEN[] = { 0x3A, 0x3A }; // ':', ':'
|
||||
|
||||
/*
|
||||
commented out until we do real ::BEGIN/::END functionality
|
||||
static const int32_t BEGIN_TOKEN_LEN = 5;
|
||||
static const UChar BEGIN_TOKEN[] = { 0x42, 0x45, 0x47, 0x49, 0x4e }; // 'BEGIN'
|
||||
|
||||
static const int32_t END_TOKEN_LEN = 3;
|
||||
static const UChar END_TOKEN[] = { 0x45, 0x4e, 0x44 }; // 'END'
|
||||
*/
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
@ -126,8 +136,11 @@ public:
|
||||
|
||||
const UVector* variablesVector; // alias
|
||||
|
||||
const Hashtable* variableNames; // alias
|
||||
|
||||
ParseData(const TransliterationRuleData* data = 0,
|
||||
const UVector* variablesVector = 0);
|
||||
const UVector* variablesVector = 0,
|
||||
const Hashtable* variableNames = 0);
|
||||
|
||||
virtual const UnicodeString* lookup(const UnicodeString& s) const;
|
||||
|
||||
@ -153,14 +166,15 @@ private:
|
||||
};
|
||||
|
||||
ParseData::ParseData(const TransliterationRuleData* d,
|
||||
const UVector* sets) :
|
||||
data(d), variablesVector(sets) {}
|
||||
const UVector* sets,
|
||||
const Hashtable* vNames) :
|
||||
data(d), variablesVector(sets), variableNames(vNames) {}
|
||||
|
||||
/**
|
||||
* Implement SymbolTable API.
|
||||
*/
|
||||
const UnicodeString* ParseData::lookup(const UnicodeString& name) const {
|
||||
return (const UnicodeString*) data->variableNames->get(name);
|
||||
return (const UnicodeString*) variableNames->get(name);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -516,7 +530,7 @@ int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t l
|
||||
// standin for that matcher.
|
||||
StringMatcher* m =
|
||||
new StringMatcher(buf, bufSegStart, buf.length(),
|
||||
segmentNumber, *parser.data);
|
||||
segmentNumber, *parser.curData);
|
||||
|
||||
// Record and associate object and segment number
|
||||
parser.setSegmentObject(segmentNumber, m);
|
||||
@ -554,7 +568,7 @@ int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t l
|
||||
UnicodeString output;
|
||||
buf.extractBetween(bufSegStart, buf.length(), output);
|
||||
FunctionReplacer *r =
|
||||
new FunctionReplacer(t, new StringReplacer(output, parser.data));
|
||||
new FunctionReplacer(t, new StringReplacer(output, parser.curData));
|
||||
|
||||
// Replace the buffer contents with a stand-in
|
||||
buf.truncate(bufSegStart);
|
||||
@ -645,7 +659,7 @@ int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t l
|
||||
}
|
||||
|
||||
UnicodeFunctor *m =
|
||||
new StringMatcher(buf, qstart, qlimit, 0, *parser.data);
|
||||
new StringMatcher(buf, qstart, qlimit, 0, *parser.curData);
|
||||
int32_t min = 0;
|
||||
int32_t max = Quantifier::MAX;
|
||||
switch (c) {
|
||||
@ -794,10 +808,13 @@ UBool RuleHalf::isValidInput(TransliteratorParser& transParser) {
|
||||
* Constructor.
|
||||
*/
|
||||
TransliteratorParser::TransliteratorParser() {
|
||||
data = NULL;
|
||||
dataVector = NULL;
|
||||
idBlockVector = NULL;
|
||||
curData = NULL;
|
||||
compoundFilter = NULL;
|
||||
parseData = NULL;
|
||||
variablesVector = NULL;
|
||||
variableNames = NULL;
|
||||
segmentObjects = NULL;
|
||||
}
|
||||
|
||||
@ -805,10 +822,16 @@ TransliteratorParser::TransliteratorParser() {
|
||||
* Destructor.
|
||||
*/
|
||||
TransliteratorParser::~TransliteratorParser() {
|
||||
delete data;
|
||||
while (dataVector != NULL && !dataVector->isEmpty())
|
||||
delete (TransliterationRuleData*)(dataVector->orphanElementAt(0));
|
||||
delete dataVector;
|
||||
delete idBlockVector;
|
||||
delete compoundFilter;
|
||||
delete parseData;
|
||||
while (variablesVector != NULL && !variablesVector->isEmpty())
|
||||
delete (UnicodeFunctor*)variablesVector->orphanElementAt(0);
|
||||
delete variablesVector;
|
||||
delete variableNames;
|
||||
delete segmentObjects;
|
||||
}
|
||||
|
||||
@ -833,15 +856,6 @@ UnicodeSet* TransliteratorParser::orphanCompoundFilter() {
|
||||
return f;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the data object parsed by parse(). Caller owns result.
|
||||
*/
|
||||
TransliterationRuleData* TransliteratorParser::orphanData() {
|
||||
TransliterationRuleData* d = data;
|
||||
data = NULL;
|
||||
return d;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// Private implementation
|
||||
//----------------------------------------------------------------------
|
||||
@ -861,12 +875,31 @@ void TransliteratorParser::parseRules(const UnicodeString& rule,
|
||||
parseError.preContext[0] = parseError.postContext[0] = (UChar)0;
|
||||
status = U_ZERO_ERROR;
|
||||
|
||||
delete data;
|
||||
data = new TransliterationRuleData(status);
|
||||
UBool parsingIDs = TRUE;
|
||||
UBool inBeginEndBlock = FALSE;
|
||||
int32_t ruleCount = 0;
|
||||
|
||||
if (dataVector == NULL)
|
||||
dataVector = new UVector(status);
|
||||
else {
|
||||
while (!dataVector->isEmpty())
|
||||
delete (TransliterationRuleData*)(dataVector->orphanElementAt(0));
|
||||
}
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (idBlockVector == NULL) {
|
||||
idBlockVector = new UVector(status);
|
||||
idBlockVector->setDeleter(uhash_deleteUnicodeString);
|
||||
}
|
||||
else
|
||||
idBlockVector->removeAllElements();
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
curData = NULL;
|
||||
|
||||
direction = theDirection;
|
||||
ruleCount = 0;
|
||||
|
||||
@ -876,34 +909,27 @@ void TransliteratorParser::parseRules(const UnicodeString& rule,
|
||||
if (variablesVector == NULL) {
|
||||
variablesVector = new UVector(status);
|
||||
} else {
|
||||
variablesVector->removeAllElements();
|
||||
while (!variablesVector->isEmpty())
|
||||
delete (UnicodeFunctor*)variablesVector->orphanElementAt(0);
|
||||
}
|
||||
parseData = new ParseData(0, variablesVector);
|
||||
if (variableNames == NULL) {
|
||||
variableNames = new Hashtable(status);
|
||||
variableNames->setValueDeleter(uhash_deleteUnicodeString);
|
||||
} else {
|
||||
variableNames->removeAll();
|
||||
}
|
||||
parseData = new ParseData(0, variablesVector, variableNames);
|
||||
if (parseData == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
parseData->data = data;
|
||||
|
||||
// By default, rules use part of the private use area
|
||||
// E000..F8FF for variables and other stand-ins. Currently
|
||||
// the range F000..F8FF is typically sufficient. The 'use
|
||||
// variable range' pragma allows rule sets to modify this.
|
||||
setVariableRange(0xF000, 0xF8FF);
|
||||
|
||||
dotStandIn = (UChar) -1;
|
||||
|
||||
UnicodeString str; // scratch
|
||||
idBlock.truncate(0);
|
||||
idSplitPoint = -1;
|
||||
UnicodeString idBlockResult;
|
||||
int32_t pos = 0;
|
||||
int32_t limit = rule.length();
|
||||
// The mode marks whether we are in the header ::id block, the
|
||||
// rule block, or the footer ::id block.
|
||||
// mode == 0: start: rule->1, ::id->0
|
||||
// mode == 1: in rules: rule->1, ::id->2
|
||||
// mode == 2: in footer ::id block: rule->ERROR, ::id->2
|
||||
int32_t mode = 0;
|
||||
|
||||
// The compound filter offset is an index into idBlockResult.
|
||||
// If it is 0, then the compound filter occurred at the start,
|
||||
@ -913,9 +939,6 @@ void TransliteratorParser::parseRules(const UnicodeString& rule,
|
||||
compoundFilter = NULL;
|
||||
int32_t compoundFilterOffset = -1;
|
||||
|
||||
// The number of ::ID block entries we have parsed
|
||||
int32_t idBlockCount = 0;
|
||||
|
||||
while (pos < limit && U_SUCCESS(status)) {
|
||||
UChar c = rule.charAt(pos++);
|
||||
if (uprv_isRuleWhiteSpace(c)) {
|
||||
@ -930,13 +953,21 @@ void TransliteratorParser::parseRules(const UnicodeString& rule,
|
||||
}
|
||||
continue; // Either fall out or restart with next line
|
||||
}
|
||||
|
||||
// skip empty rules
|
||||
if (c == END_OF_RULE)
|
||||
continue;
|
||||
|
||||
// keep track of how many rules we've seen
|
||||
++ruleCount;
|
||||
|
||||
// We've found the start of a rule or ID. c is its first
|
||||
// character, and pos points past c.
|
||||
--pos;
|
||||
// Look for an ID token. Must have at least ID_TOKEN_LEN + 1
|
||||
// chars left.
|
||||
if ((pos + ID_TOKEN_LEN + 1) <= limit &&
|
||||
rule.compare(pos, ID_TOKEN_LEN, ID_TOKEN) == 0) {
|
||||
rule.compare(pos, ID_TOKEN_LEN, ID_TOKEN) == 0) {
|
||||
pos += ID_TOKEN_LEN;
|
||||
c = rule.charAt(pos);
|
||||
while (uprv_isRuleWhiteSpace(c) && pos < limit) {
|
||||
@ -944,33 +975,35 @@ void TransliteratorParser::parseRules(const UnicodeString& rule,
|
||||
c = rule.charAt(pos);
|
||||
}
|
||||
|
||||
if (mode == 1) {
|
||||
// We have just entered the footer ::ID block
|
||||
mode = 2;
|
||||
// In the forward direction add elements at the end.
|
||||
// In the reverse direction add elements at the start.
|
||||
idSplitPoint = idBlockCount;
|
||||
}
|
||||
int32_t p = pos;
|
||||
|
||||
if (!parsingIDs) {
|
||||
if (curData != NULL) {
|
||||
if (direction == UTRANS_FORWARD)
|
||||
dataVector->addElement(curData, status);
|
||||
else
|
||||
dataVector->insertElementAt(curData, 0, status);
|
||||
curData = NULL;
|
||||
}
|
||||
parsingIDs = TRUE;
|
||||
}
|
||||
|
||||
TransliteratorIDParser::SingleID* id =
|
||||
TransliteratorIDParser::parseSingleID(rule, p, direction, status);
|
||||
if (p != pos && ICU_Utility::parseChar(rule, p, END_OF_RULE)) {
|
||||
// Successful ::ID parse.
|
||||
|
||||
|
||||
if (direction == UTRANS_FORWARD) {
|
||||
idBlock.append(id->canonID).append(END_OF_RULE);
|
||||
idBlockResult.append(id->canonID).append(END_OF_RULE);
|
||||
} else {
|
||||
idBlock.insert(0, END_OF_RULE);
|
||||
idBlock.insert(0, id->canonID);
|
||||
idBlockResult.insert(0, END_OF_RULE);
|
||||
idBlockResult.insert(0, id->canonID);
|
||||
}
|
||||
|
||||
++idBlockCount;
|
||||
|
||||
|
||||
} else {
|
||||
// Couldn't parse an ID. Try to parse a global filter
|
||||
int32_t withParens = -1;
|
||||
UnicodeSet* f = TransliteratorIDParser::parseGlobalFilter(rule, p, direction, withParens, &idBlock);
|
||||
UnicodeSet* f = TransliteratorIDParser::parseGlobalFilter(rule, p, direction, withParens, NULL);
|
||||
if (f != NULL) {
|
||||
if (ICU_Utility::parseChar(rule, p, END_OF_RULE)
|
||||
&& (direction == UTRANS_FORWARD) == (withParens == 0))
|
||||
@ -981,7 +1014,7 @@ void TransliteratorParser::parseRules(const UnicodeString& rule,
|
||||
delete f;
|
||||
} else {
|
||||
compoundFilter = f;
|
||||
compoundFilterOffset = idBlockCount;
|
||||
compoundFilterOffset = ruleCount;
|
||||
}
|
||||
} else {
|
||||
delete f;
|
||||
@ -993,78 +1026,93 @@ void TransliteratorParser::parseRules(const UnicodeString& rule,
|
||||
}
|
||||
}
|
||||
delete id;
|
||||
|
||||
pos = p;
|
||||
} else if (resemblesPragma(rule, pos, limit)) {
|
||||
int32_t ppp = parsePragma(rule, pos, limit);
|
||||
if (ppp < 0) {
|
||||
syntaxError(U_MALFORMED_PRAGMA, rule, pos);
|
||||
}
|
||||
pos = ppp;
|
||||
} else {
|
||||
// Parse a rule
|
||||
pos = parseRule(rule, pos, limit);
|
||||
if (U_SUCCESS(status)) {
|
||||
++ruleCount;
|
||||
if (mode == 2) {
|
||||
// ::id in illegal position (because a rule
|
||||
// occurred after the ::id footer block)
|
||||
syntaxError(U_ILLEGAL_ARGUMENT_ERROR,rule,pos);
|
||||
}
|
||||
}else{
|
||||
syntaxError(status,rule,pos);
|
||||
if (parsingIDs) {
|
||||
if (direction == UTRANS_FORWARD)
|
||||
idBlockVector->addElement(new UnicodeString(idBlockResult), status);
|
||||
else
|
||||
idBlockVector->insertElementAt(new UnicodeString(idBlockResult), 0, status);
|
||||
idBlockResult.remove();
|
||||
parsingIDs = FALSE;
|
||||
curData = new TransliterationRuleData(status);
|
||||
parseData->data = curData;
|
||||
|
||||
// By default, rules use part of the private use area
|
||||
// E000..F8FF for variables and other stand-ins. Currently
|
||||
// the range F000..F8FF is typically sufficient. The 'use
|
||||
// variable range' pragma allows rule sets to modify this.
|
||||
setVariableRange(0xF000, 0xF8FF);
|
||||
}
|
||||
|
||||
if (resemblesPragma(rule, pos, limit)) {
|
||||
int32_t ppp = parsePragma(rule, pos, limit);
|
||||
if (ppp < 0) {
|
||||
syntaxError(U_MALFORMED_PRAGMA, rule, pos);
|
||||
}
|
||||
pos = ppp;
|
||||
// Parse a rule
|
||||
} else {
|
||||
pos = parseRule(rule, pos, limit);
|
||||
}
|
||||
mode = 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (idSplitPoint < 0) {
|
||||
idSplitPoint = idBlockCount;
|
||||
|
||||
if (parsingIDs && idBlockResult.length() > 0) {
|
||||
if (direction == UTRANS_FORWARD)
|
||||
idBlockVector->addElement(new UnicodeString(idBlockResult), status);
|
||||
else
|
||||
idBlockVector->insertElementAt(new UnicodeString(idBlockResult), 0, status);
|
||||
}
|
||||
else if (!parsingIDs && curData != NULL) {
|
||||
if (direction == UTRANS_FORWARD)
|
||||
dataVector->addElement(curData, status);
|
||||
else
|
||||
dataVector->insertElementAt(curData, 0, status);
|
||||
}
|
||||
|
||||
if (direction == UTRANS_REVERSE) {
|
||||
idSplitPoint = idBlockCount - idSplitPoint;
|
||||
}
|
||||
|
||||
// Convert the set vector to an array
|
||||
data->variablesLength = variablesVector->size();
|
||||
if(data->variablesLength == 0) {
|
||||
data->variables = 0;
|
||||
} else {
|
||||
data->variables = (UnicodeFunctor **)uprv_malloc(data->variablesLength * sizeof(UnicodeFunctor *));
|
||||
}
|
||||
|
||||
// orphanElement removes the given element and shifts all other
|
||||
// elements down. For performance (and code clarity) we work from
|
||||
// the end back to index 0.
|
||||
int32_t i;
|
||||
for (i=data->variablesLength; i>0; ) {
|
||||
--i;
|
||||
data->variables[i] =
|
||||
(UnicodeSet*) variablesVector->orphanElementAt(i);
|
||||
}
|
||||
|
||||
// Index the rules
|
||||
if (U_SUCCESS(status)) {
|
||||
// Convert the set vector to an array
|
||||
for (int32_t i = 0; i < dataVector->size(); i++) {
|
||||
TransliterationRuleData* data = (TransliterationRuleData*)dataVector->elementAt(i);
|
||||
data->variablesLength = variablesVector->size();
|
||||
if (data->variablesLength == 0) {
|
||||
data->variables = 0;
|
||||
} else {
|
||||
data->variables = (UnicodeFunctor**)uprv_malloc(data->variablesLength * sizeof(UnicodeFunctor*));
|
||||
data->variablesAreOwned = (i == 0);
|
||||
}
|
||||
|
||||
for (int32_t j = 0; j < data->variablesLength; j++) {
|
||||
data->variables[j] =
|
||||
((UnicodeSet*)variablesVector->elementAt(j));
|
||||
}
|
||||
|
||||
data->variableNames->removeAll();
|
||||
int32_t pos = -1;
|
||||
const UHashElement* he = variableNames->nextElement(pos);
|
||||
while (he != NULL) {
|
||||
data->variableNames->put(*((UnicodeString*)(he->key.pointer)),
|
||||
((UnicodeString*)(he->value.pointer))->clone(), status);
|
||||
he = variableNames->nextElement(pos);
|
||||
}
|
||||
}
|
||||
variablesVector->removeAllElements(); // keeps them from getting deleted when we succeed
|
||||
|
||||
// Index the rules
|
||||
if (compoundFilter != NULL) {
|
||||
if ((direction == UTRANS_FORWARD &&
|
||||
compoundFilterOffset != 0) ||
|
||||
(direction == UTRANS_REVERSE &&
|
||||
compoundFilterOffset != idBlockCount)) {
|
||||
if ((direction == UTRANS_FORWARD && compoundFilterOffset != 1) ||
|
||||
(direction == UTRANS_REVERSE && compoundFilterOffset != ruleCount)) {
|
||||
status = U_MISPLACED_COMPOUND_FILTER;
|
||||
}
|
||||
}
|
||||
|
||||
data->ruleSet.freeze(parseError,status);
|
||||
|
||||
if (idSplitPoint < 0) {
|
||||
idSplitPoint = idBlock.length();
|
||||
}
|
||||
|
||||
if (ruleCount == 0) {
|
||||
delete data;
|
||||
data = NULL;
|
||||
for (int32_t i = 0; i < dataVector->size(); i++) {
|
||||
TransliterationRuleData* data = (TransliterationRuleData*)dataVector->elementAt(i);
|
||||
data->ruleSet.freeze(parseError, status);
|
||||
}
|
||||
if (idBlockVector->size() == 1 && ((UnicodeString*)idBlockVector->elementAt(0))->isEmpty())
|
||||
idBlockVector->removeElementAt(0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1077,8 +1125,11 @@ void TransliteratorParser::setVariableRange(int32_t start, int32_t end) {
|
||||
return;
|
||||
}
|
||||
|
||||
data->variablesBase = variableNext = (UChar) start; // first private use
|
||||
variableLimit = (UChar) (end + 1);
|
||||
curData->variablesBase = (UChar) start;
|
||||
if (dataVector->size() == 0) {
|
||||
variableNext = (UChar) start;
|
||||
variableLimit = (UChar) (end + 1);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1087,7 +1138,7 @@ void TransliteratorParser::setVariableRange(int32_t start, int32_t end) {
|
||||
* variable range does not overlap characters used in a rule.
|
||||
*/
|
||||
UBool TransliteratorParser::checkVariableRange(UChar32 ch) const {
|
||||
return !(ch >= data->variablesBase && ch < variableLimit);
|
||||
return !(ch >= curData->variablesBase && ch < variableLimit);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1276,7 +1327,7 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
|
||||
}
|
||||
// We allow anything on the right, including an empty string.
|
||||
UnicodeString* value = new UnicodeString(right->text);
|
||||
data->variableNames->put(undefinedVariableName, value, status);
|
||||
variableNames->put(undefinedVariableName, value, status);
|
||||
++variableLimit;
|
||||
return pos;
|
||||
}
|
||||
@ -1363,13 +1414,13 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
|
||||
segmentObjects->toArray((void**) segmentsArray);
|
||||
}
|
||||
|
||||
data->ruleSet.addRule(new TransliterationRule(
|
||||
curData->ruleSet.addRule(new TransliterationRule(
|
||||
left->text, left->ante, left->post,
|
||||
right->text, right->cursor, right->cursorOffset,
|
||||
segmentsArray,
|
||||
segmentObjects->size(),
|
||||
left->anchorStart, left->anchorEnd,
|
||||
data,
|
||||
curData,
|
||||
status), status);
|
||||
|
||||
return pos;
|
||||
@ -1434,7 +1485,7 @@ UChar TransliteratorParser::generateStandInFor(UnicodeFunctor* adopted) {
|
||||
// (typical n is 0, 1, or 2); linear search is optimal.
|
||||
for (int32_t i=0; i<variablesVector->size(); ++i) {
|
||||
if (variablesVector->elementAt(i) == adopted) { // [sic] pointer comparison
|
||||
return (UChar) (data->variablesBase + i);
|
||||
return (UChar) (curData->variablesBase + i);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1452,7 +1503,7 @@ UChar TransliteratorParser::generateStandInFor(UnicodeFunctor* adopted) {
|
||||
*/
|
||||
UChar TransliteratorParser::getSegmentStandin(int32_t seg) {
|
||||
// Special character used to indicate an empty spot
|
||||
UChar empty = data->variablesBase - 1;
|
||||
UChar empty = curData->variablesBase - 1;
|
||||
while (segmentStandins.length() < seg) {
|
||||
segmentStandins.append(empty);
|
||||
}
|
||||
@ -1483,7 +1534,7 @@ void TransliteratorParser::setSegmentObject(int32_t seg, StringMatcher* adopted)
|
||||
if (segmentObjects->size() < seg) {
|
||||
segmentObjects->setSize(seg);
|
||||
}
|
||||
int32_t index = getSegmentStandin(seg) - data->variablesBase;
|
||||
int32_t index = getSegmentStandin(seg) - curData->variablesBase;
|
||||
if (segmentObjects->elementAt(seg-1) != NULL ||
|
||||
variablesVector->elementAt(index) != NULL) {
|
||||
// should never happen
|
||||
@ -1511,7 +1562,7 @@ UChar TransliteratorParser::getDotStandIn() {
|
||||
*/
|
||||
void TransliteratorParser::appendVariableDef(const UnicodeString& name,
|
||||
UnicodeString& buf) {
|
||||
const UnicodeString* s = (const UnicodeString*) data->variableNames->get(name);
|
||||
const UnicodeString* s = (const UnicodeString*) variableNames->get(name);
|
||||
if (s == NULL) {
|
||||
// We allow one undefined variable so that variable definition
|
||||
// statements work. For the first undefined variable we return
|
||||
|
@ -26,6 +26,7 @@ class ParseData;
|
||||
class RuleHalf;
|
||||
class ParsePosition;
|
||||
class UVector;
|
||||
class Hashtable;
|
||||
class StringMatcher;
|
||||
|
||||
class TransliteratorParser : public UMemory {
|
||||
@ -33,27 +34,16 @@ class TransliteratorParser : public UMemory {
|
||||
public:
|
||||
|
||||
/**
|
||||
* PUBLIC data member containing the parsed data object, or null if
|
||||
* there were no rules.
|
||||
* A Vector of TransliterationRuleData objects, one for each discrete group
|
||||
* of rules in the rule set
|
||||
*/
|
||||
TransliterationRuleData* data;
|
||||
UVector* dataVector;
|
||||
|
||||
/**
|
||||
* PUBLIC data member.
|
||||
* The block of ::IDs, both at the top and at the bottom.
|
||||
* Inserted into these may be additional rules at the
|
||||
* idSplitPoint.
|
||||
* A Vector of UnicodeStrings containing all of the ID blocks in the rule set
|
||||
*/
|
||||
UnicodeString idBlock;
|
||||
|
||||
/**
|
||||
* PUBLIC data member.
|
||||
* In a compound RBT, the index at which the RBT rules are
|
||||
* inserted into the ID block. Index 0 means before any IDs
|
||||
* in the block. Index idBlock.length() means after all IDs
|
||||
* in the block. Index is a string index.
|
||||
*/
|
||||
int32_t idSplitPoint;
|
||||
UVector* idBlockVector;
|
||||
|
||||
/**
|
||||
* PUBLIC data member containing the parsed compound filter, if any.
|
||||
@ -62,10 +52,10 @@ class TransliteratorParser : public UMemory {
|
||||
|
||||
private:
|
||||
|
||||
// The number of rules parsed. This tells us if there were
|
||||
// any actual transliterator rules, or if there were just ::ID
|
||||
// block IDs.
|
||||
int32_t ruleCount;
|
||||
/**
|
||||
* The current data object for which we are parsing rules
|
||||
*/
|
||||
TransliterationRuleData* curData;
|
||||
|
||||
UTransDirection direction;
|
||||
|
||||
@ -92,6 +82,12 @@ class TransliteratorParser : public UMemory {
|
||||
*/
|
||||
UVector* variablesVector;
|
||||
|
||||
/**
|
||||
* Temporary table of variable names. When parsing is complete, this is
|
||||
* copied into data.variableNames.
|
||||
*/
|
||||
Hashtable* variableNames;
|
||||
|
||||
/**
|
||||
* String of standins for segments. Used during the parsing of a single
|
||||
* rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds
|
||||
@ -177,12 +173,6 @@ public:
|
||||
*/
|
||||
UnicodeSet* orphanCompoundFilter();
|
||||
|
||||
/**
|
||||
* Return the data object parsed by parse(). Caller owns result.
|
||||
* @return the data object parsed by parse().
|
||||
*/
|
||||
TransliterationRuleData* orphanData();
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2004, International Business Machines
|
||||
* Copyright (C) 1999-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
@ -120,7 +120,7 @@ inline void _debugOut(const char* msg, TransliterationRule* rule,
|
||||
UnicodeString esc;
|
||||
_escape(buf, esc);
|
||||
CharString cbuf(esc);
|
||||
printf("%s\n", (char*) cbuf);
|
||||
printf("%s\n", (const char*) cbuf);
|
||||
}
|
||||
|
||||
#else
|
||||
|
@ -921,30 +921,26 @@ Transliterator::createInstance(const UnicodeString& ID,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
TransliteratorIDParser::instantiateList(list, NULL, -1, status);
|
||||
TransliteratorIDParser::instantiateList(list, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
U_ASSERT(list.size() > 0);
|
||||
Transliterator* t = NULL;
|
||||
switch (list.size()) {
|
||||
case 1:
|
||||
t = (Transliterator*) list.elementAt(0);
|
||||
break;
|
||||
default:
|
||||
|
||||
if (list.size() > 1 || canonID.indexOf(";") >= 0) {
|
||||
// [NOTE: If it's a compoundID, we instantiate a CompoundTransliterator even if it only
|
||||
// has one child transliterator. This is so that toRules() will return the right thing
|
||||
// (without any inactive ID), but our main ID still comes out correct. That is, if we
|
||||
// instantiate "(Lower);Latin-Greek;", we want the rules to come out as "::Latin-Greek;"
|
||||
// even though the ID is "(Lower);Latin-Greek;".
|
||||
t = new CompoundTransliterator(list, parseError, status);
|
||||
/* test for NULL */
|
||||
if (t == 0) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return 0;
|
||||
}
|
||||
if (U_FAILURE(status)) {
|
||||
delete t;
|
||||
return NULL;
|
||||
}
|
||||
break;
|
||||
}
|
||||
else {
|
||||
t = (Transliterator*)list.elementAt(0);
|
||||
}
|
||||
|
||||
t->setID(canonID);
|
||||
if (globalFilter != NULL) {
|
||||
t->adoptFilter(globalFilter);
|
||||
@ -1053,59 +1049,61 @@ Transliterator::createFromRules(const UnicodeString& ID,
|
||||
}
|
||||
|
||||
// NOTE: The logic here matches that in TransliteratorRegistry.
|
||||
if (parser.idBlock.length() == 0) {
|
||||
if (parser.data == NULL) {
|
||||
// No idBlock, no data -- this is just an
|
||||
// alias for Null
|
||||
t = new NullTransliterator();
|
||||
} else {
|
||||
// No idBlock, data != 0 -- this is an
|
||||
// ordinary RBT_DATA.
|
||||
t = new RuleBasedTransliterator(ID, parser.orphanData(), TRUE); // TRUE == adopt data object
|
||||
if (parser.idBlockVector->size() == 0 && parser.dataVector->size() == 0) {
|
||||
t = new NullTransliterator();
|
||||
}
|
||||
else if (parser.idBlockVector->size() == 0 && parser.dataVector->size() == 1) {
|
||||
t = new RuleBasedTransliterator(ID, (TransliterationRuleData*)parser.dataVector->orphanElementAt(0), TRUE);
|
||||
}
|
||||
else if (parser.idBlockVector->size() == 1 && parser.dataVector->size() == 0) {
|
||||
// idBlock, no data -- this is an alias. The ID has
|
||||
// been munged from reverse into forward mode, if
|
||||
// necessary, so instantiate the ID in the forward
|
||||
// direction.
|
||||
if (parser.compoundFilter != NULL) {
|
||||
UnicodeString filterPattern;
|
||||
parser.compoundFilter->toPattern(filterPattern, FALSE);
|
||||
t = createInstance(filterPattern + ";"
|
||||
+ *((UnicodeString*)parser.idBlockVector->elementAt(0)), UTRANS_FORWARD, parseError, status);
|
||||
}
|
||||
/* test for NULL */
|
||||
if (t == 0) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
if (parser.data == NULL) {
|
||||
// idBlock, no data -- this is an alias. The ID has
|
||||
// been munged from reverse into forward mode, if
|
||||
// necessary, so instantiate the ID in the forward
|
||||
// direction.
|
||||
t = createInstance(parser.idBlock, UTRANS_FORWARD, parseError, status);
|
||||
if (t != NULL) {
|
||||
t->setID(ID);
|
||||
}
|
||||
} else {
|
||||
// idBlock and data -- this is a compound
|
||||
// RBT
|
||||
UnicodeString id((UChar)0x005F); // '_'
|
||||
t = new RuleBasedTransliterator(id, parser.orphanData(), TRUE); // TRUE == adopt data object
|
||||
/* test for NULL */
|
||||
if (t == 0) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return 0;
|
||||
}
|
||||
t = new CompoundTransliterator(ID, parser.idBlock, parser.idSplitPoint,
|
||||
t, status);
|
||||
/* test for NULL */
|
||||
if (t == 0) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return 0;
|
||||
}
|
||||
if (U_FAILURE(status)) {
|
||||
delete t;
|
||||
t = 0;
|
||||
}
|
||||
if (parser.compoundFilter != NULL) {
|
||||
t->adoptFilter(parser.orphanCompoundFilter());
|
||||
}
|
||||
return t;
|
||||
else
|
||||
t = createInstance(*((UnicodeString*)parser.idBlockVector->elementAt(0)), UTRANS_FORWARD, parseError, status);
|
||||
|
||||
|
||||
if (t != NULL) {
|
||||
t->setID(ID);
|
||||
}
|
||||
}
|
||||
else {
|
||||
UVector transliterators(status);
|
||||
int32_t passNumber = 1;
|
||||
|
||||
int32_t limit = parser.idBlockVector->size();
|
||||
if (parser.dataVector->size() > limit)
|
||||
limit = parser.dataVector->size();
|
||||
|
||||
for (int32_t i = 0; i < limit; i++) {
|
||||
if (i < parser.idBlockVector->size()) {
|
||||
UnicodeString* idBlock = (UnicodeString*)parser.idBlockVector->elementAt(i);
|
||||
if (!idBlock->isEmpty()) {
|
||||
Transliterator* temp = createInstance(*idBlock, UTRANS_FORWARD, parseError, status);
|
||||
if (temp != NULL && temp->getDynamicClassID() != NullTransliterator::getStaticClassID())
|
||||
transliterators.addElement(temp, status);
|
||||
else
|
||||
delete temp;
|
||||
}
|
||||
}
|
||||
if (!parser.dataVector->isEmpty()) {
|
||||
TransliterationRuleData* data = (TransliterationRuleData*)parser.dataVector->orphanElementAt(0);
|
||||
transliterators.addElement(new RuleBasedTransliterator((UnicodeString)"%Pass" + (passNumber++),
|
||||
data, TRUE), status);
|
||||
}
|
||||
}
|
||||
|
||||
t = new CompoundTransliterator(transliterators, passNumber - 1, parseError, status);
|
||||
t->setID(ID);
|
||||
t->adoptFilter(parser.orphanCompoundFilter());
|
||||
}
|
||||
return t;
|
||||
}
|
||||
|
||||
|
@ -58,25 +58,25 @@ U_NAMESPACE_BEGIN
|
||||
// Alias
|
||||
//------------------------------------------------------------------
|
||||
|
||||
TransliteratorAlias::TransliteratorAlias(const UnicodeString& theAliasID) :
|
||||
TransliteratorAlias::TransliteratorAlias(const UnicodeString& theAliasID,
|
||||
const UnicodeSet* cpdFilter) :
|
||||
ID(),
|
||||
aliasID(theAliasID),
|
||||
trans(0),
|
||||
compoundFilter(0),
|
||||
idSplitPoint(-1),
|
||||
aliasesOrRules(theAliasID),
|
||||
transes(0),
|
||||
compoundFilter(cpdFilter),
|
||||
direction(UTRANS_FORWARD),
|
||||
type(TransliteratorAlias::SIMPLE) {
|
||||
}
|
||||
|
||||
TransliteratorAlias::TransliteratorAlias(const UnicodeString& theID,
|
||||
const UnicodeString& idBlock,
|
||||
Transliterator* adopted,
|
||||
int32_t theIDSplitPoint,
|
||||
const UnicodeString& idBlocks,
|
||||
UVector* adoptedTransliterators,
|
||||
const UnicodeSet* cpdFilter) :
|
||||
ID(theID),
|
||||
aliasID(idBlock),
|
||||
trans(adopted),
|
||||
aliasesOrRules(idBlocks),
|
||||
transes(adoptedTransliterators),
|
||||
compoundFilter(cpdFilter),
|
||||
idSplitPoint(theIDSplitPoint),
|
||||
direction(UTRANS_FORWARD),
|
||||
type(TransliteratorAlias::COMPOUND) {
|
||||
}
|
||||
|
||||
@ -84,15 +84,15 @@ TransliteratorAlias::TransliteratorAlias(const UnicodeString& theID,
|
||||
const UnicodeString& rules,
|
||||
UTransDirection dir) :
|
||||
ID(theID),
|
||||
aliasID(rules), // bad name -- rename aliasID!
|
||||
trans(0),
|
||||
aliasesOrRules(rules),
|
||||
transes(0),
|
||||
compoundFilter(0),
|
||||
idSplitPoint((int32_t) dir), // bad name -- rename idSplitPoint!
|
||||
direction(dir),
|
||||
type(TransliteratorAlias::RULES) {
|
||||
}
|
||||
|
||||
TransliteratorAlias::~TransliteratorAlias() {
|
||||
delete trans;
|
||||
delete transes;
|
||||
}
|
||||
|
||||
|
||||
@ -104,23 +104,60 @@ Transliterator* TransliteratorAlias::create(UParseError& pe,
|
||||
Transliterator *t = NULL;
|
||||
switch (type) {
|
||||
case SIMPLE:
|
||||
t = Transliterator::createInstance(aliasID, UTRANS_FORWARD, pe, ec);
|
||||
t = Transliterator::createInstance(aliasesOrRules, UTRANS_FORWARD, pe, ec);
|
||||
if (compoundFilter != 0)
|
||||
t->adoptFilter((UnicodeSet*)compoundFilter->clone());
|
||||
break;
|
||||
case COMPOUND:
|
||||
t = new CompoundTransliterator(ID, aliasID, idSplitPoint,
|
||||
trans, ec);
|
||||
/* test for NULL */
|
||||
if (t == 0) {
|
||||
ec = U_MEMORY_ALLOCATION_ERROR;
|
||||
return 0;
|
||||
}
|
||||
trans = 0; // so we don't delete it later
|
||||
if (compoundFilter) {
|
||||
// TODO: Is this right? Are we leaking memory here?
|
||||
// I'm suspicious because of the "trans = 0" line above;
|
||||
// doesn't seem to fit the cloning here. Don't have time
|
||||
// to track this down right now. [alan 3.0]
|
||||
t->adoptFilter((UnicodeSet*) compoundFilter->clone());
|
||||
{
|
||||
// the total number of transliterators in the compound is the total number of anonymous transliterators
|
||||
// plus the total number of ID blocks-- we start by assuming the list begins and ends with an ID
|
||||
// block and that each pair of anonymous transliterators has an ID block between them. Then we go back
|
||||
// to see whether there really are ID blocks at the beginning and end (by looking for U+FFFF, which
|
||||
// marks the position where an anonymous transliterator goes) and adjust accordingly
|
||||
int32_t anonymousRBTs = transes->size();
|
||||
int32_t transCount = anonymousRBTs * 2 + 1;
|
||||
if (!aliasesOrRules.isEmpty() && aliasesOrRules[0] == (UChar)(0xffff))
|
||||
--transCount;
|
||||
if (aliasesOrRules.length() >= 2 && aliasesOrRules[aliasesOrRules.length() - 1] == (UChar)(0xffff))
|
||||
--transCount;
|
||||
UnicodeString noIDBlock((UChar)(0xffff));
|
||||
noIDBlock += ((UChar)(0xffff));
|
||||
int32_t pos = aliasesOrRules.indexOf(noIDBlock);
|
||||
while (pos >= 0) {
|
||||
--transCount;
|
||||
pos = aliasesOrRules.indexOf(noIDBlock, pos + 1);
|
||||
}
|
||||
|
||||
UVector transliterators(ec);
|
||||
UnicodeString idBlock;
|
||||
int32_t blockSeparatorPos = aliasesOrRules.indexOf((UChar)(0xffff));
|
||||
while (blockSeparatorPos >= 0) {
|
||||
aliasesOrRules.extract(0, blockSeparatorPos, idBlock);
|
||||
aliasesOrRules.remove(0, blockSeparatorPos + 1);
|
||||
if (!idBlock.isEmpty())
|
||||
transliterators.addElement(Transliterator::createInstance(idBlock, UTRANS_FORWARD, pe, ec), ec);
|
||||
if (!transes->isEmpty())
|
||||
transliterators.addElement(transes->orphanElementAt(0), ec);
|
||||
blockSeparatorPos = aliasesOrRules.indexOf((UChar)(0xffff));
|
||||
}
|
||||
if (!aliasesOrRules.isEmpty())
|
||||
transliterators.addElement(Transliterator::createInstance(aliasesOrRules, UTRANS_FORWARD, pe, ec), ec);
|
||||
while (!transes->isEmpty())
|
||||
transliterators.addElement(transes->orphanElementAt(0), ec);
|
||||
|
||||
if (U_SUCCESS(ec)) {
|
||||
t = new CompoundTransliterator(ID, transliterators,
|
||||
(compoundFilter ? (UnicodeSet*)(compoundFilter->clone()) : 0),
|
||||
anonymousRBTs, pe, ec);
|
||||
if (t == 0) {
|
||||
ec = U_MEMORY_ALLOCATION_ERROR;
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
for (int32_t i = 0; i < transliterators.size(); i++)
|
||||
delete (Transliterator*)(transliterators.elementAt(i));
|
||||
}
|
||||
}
|
||||
break;
|
||||
case RULES:
|
||||
@ -141,9 +178,7 @@ void TransliteratorAlias::parse(TransliteratorParser& parser,
|
||||
return;
|
||||
}
|
||||
|
||||
// aliasID is really rules -- rename it!
|
||||
// idSplitPoint is really UTransDirection -- rename it!
|
||||
parser.parse(aliasID, (UTransDirection) idSplitPoint, pe, ec);
|
||||
parser.parse(aliasesOrRules, direction, pe, ec);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
@ -399,7 +434,8 @@ public:
|
||||
UnicodeSet* compoundFilter; // For COMPOUND_RBT
|
||||
union {
|
||||
Transliterator* prototype; // For PROTOTYPE
|
||||
TransliterationRuleData* data; // For RBT_DATA, COMPOUND_RBT
|
||||
TransliterationRuleData* data; // For RBT_DATA
|
||||
UVector* dataVector; // For COMPOUND_RBT
|
||||
struct {
|
||||
Transliterator::Factory function;
|
||||
Transliterator::Token context;
|
||||
@ -428,12 +464,16 @@ Entry::~Entry() {
|
||||
DEBUG_delEntry(this);
|
||||
if (entryType == PROTOTYPE) {
|
||||
delete u.prototype;
|
||||
} else if (entryType == RBT_DATA || entryType == COMPOUND_RBT) {
|
||||
} else if (entryType == RBT_DATA) {
|
||||
// The data object is shared between instances of RBT. The
|
||||
// entry object owns it. It should only be deleted when the
|
||||
// transliterator component is being cleaned up. Doing so
|
||||
// invalidates any RBTs that the user has instantiated.
|
||||
delete u.data;
|
||||
} else if (entryType == COMPOUND_RBT) {
|
||||
while (u.dataVector != NULL && !u.dataVector->isEmpty())
|
||||
delete (TransliterationRuleData*)u.dataVector->orphanElementAt(0);
|
||||
delete u.dataVector;
|
||||
}
|
||||
delete compoundFilter;
|
||||
}
|
||||
@ -522,39 +562,41 @@ Transliterator* TransliteratorRegistry::reget(const UnicodeString& ID,
|
||||
entry->entryType == Entry::RULES_REVERSE ||
|
||||
entry->entryType == Entry::LOCALE_RULES) {
|
||||
|
||||
entry->u.data = parser.orphanData();
|
||||
entry->stringArg = parser.idBlock;
|
||||
entry->intArg = parser.idSplitPoint;
|
||||
entry->compoundFilter = parser.orphanCompoundFilter();
|
||||
if (parser.idBlockVector->isEmpty() && parser.dataVector->isEmpty()) {
|
||||
entry->u.data = 0;
|
||||
entry->entryType = Entry::ALIAS;
|
||||
entry->stringArg = UNICODE_STRING_SIMPLE("Any-NULL");
|
||||
}
|
||||
else if (parser.idBlockVector->isEmpty() && parser.dataVector->size() == 1) {
|
||||
entry->u.data = (TransliterationRuleData*)parser.dataVector->orphanElementAt(0);
|
||||
entry->entryType = Entry::RBT_DATA;
|
||||
}
|
||||
else if (parser.idBlockVector->size() == 1 && parser.dataVector->isEmpty()) {
|
||||
entry->stringArg = *(UnicodeString*)(parser.idBlockVector->elementAt(0));
|
||||
entry->compoundFilter = parser.orphanCompoundFilter();
|
||||
entry->entryType = Entry::ALIAS;
|
||||
}
|
||||
else {
|
||||
entry->entryType = Entry::COMPOUND_RBT;
|
||||
entry->compoundFilter = parser.orphanCompoundFilter();
|
||||
entry->u.dataVector = new UVector(status);
|
||||
entry->stringArg.remove();
|
||||
|
||||
// Reset entry->entryType to encapsulate the parsed data. The
|
||||
// next time we instantiate this ID (including this very next
|
||||
// time, at the end of this function) we won't have to parse
|
||||
// again.
|
||||
// NOTE: The logic here matches that in
|
||||
// Transliterator::createFromRules().
|
||||
if (entry->stringArg.length() == 0) {
|
||||
if (entry->u.data == 0) {
|
||||
// No idBlock, no data -- this is just an
|
||||
// alias for Null
|
||||
entry->entryType = Entry::ALIAS;
|
||||
entry->stringArg = UNICODE_STRING_SIMPLE("Any-Null");
|
||||
} else {
|
||||
// No idBlock, data != 0 -- this is an
|
||||
// ordinary RBT_DATA
|
||||
entry->entryType = Entry::RBT_DATA;
|
||||
}
|
||||
} else {
|
||||
if (entry->u.data == 0) {
|
||||
// idBlock, no data -- this is an alias. The ID has
|
||||
// been munged from reverse into forward mode, if
|
||||
// necessary, so instantiate the ID in the forward
|
||||
// direction.
|
||||
entry->entryType = Entry::ALIAS;
|
||||
} else {
|
||||
// idBlock and data -- this is a compound
|
||||
// RBT
|
||||
entry->entryType = Entry::COMPOUND_RBT;
|
||||
int32_t limit = parser.idBlockVector->size();
|
||||
if (parser.dataVector->size() > limit)
|
||||
limit = parser.dataVector->size();
|
||||
|
||||
for (int32_t i = 0; i < limit; i++) {
|
||||
if (i < parser.idBlockVector->size()) {
|
||||
UnicodeString* idBlock = (UnicodeString*)parser.idBlockVector->elementAt(i);
|
||||
if (!idBlock->isEmpty())
|
||||
entry->stringArg += *idBlock;
|
||||
}
|
||||
if (!parser.dataVector->isEmpty()) {
|
||||
TransliterationRuleData* data = (TransliterationRuleData*)parser.dataVector->orphanElementAt(0);
|
||||
entry->u.dataVector->addElement(data, status);
|
||||
entry->stringArg += (UChar)0xffff; // use U+FFFF to mark position of RBTs in ID block
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1165,7 +1207,7 @@ Transliterator* TransliteratorRegistry::instantiateEntry(const UnicodeString& ID
|
||||
}
|
||||
return t;
|
||||
case Entry::ALIAS:
|
||||
aliasReturn = new TransliteratorAlias(entry->stringArg);
|
||||
aliasReturn = new TransliteratorAlias(entry->stringArg, entry->compoundFilter);
|
||||
if (aliasReturn == 0) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
@ -1178,13 +1220,19 @@ Transliterator* TransliteratorRegistry::instantiateEntry(const UnicodeString& ID
|
||||
return t;
|
||||
case Entry::COMPOUND_RBT:
|
||||
{
|
||||
UnicodeString id((UChar)0x005F); /* "_" */
|
||||
Transliterator *t = new RuleBasedTransliterator(id, entry->u.data);
|
||||
if (t == 0) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return 0;
|
||||
UVector* rbts = new UVector(status);
|
||||
int32_t passNumber = 1;
|
||||
for (int32_t i = 0; U_SUCCESS(status) && i < entry->u.dataVector->size(); i++) {
|
||||
Transliterator* t = new RuleBasedTransliterator((UnicodeString)"%Pass" + (passNumber++),
|
||||
(TransliterationRuleData*)(entry->u.dataVector->elementAt(i)), FALSE);
|
||||
if (t == 0)
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
else
|
||||
rbts->addElement(t, status);
|
||||
}
|
||||
aliasReturn = new TransliteratorAlias(ID, entry->stringArg, t, entry->intArg, entry->compoundFilter);
|
||||
if (U_FAILURE(status))
|
||||
return 0;
|
||||
aliasReturn = new TransliteratorAlias(ID, entry->stringArg, rbts, entry->compoundFilter);
|
||||
}
|
||||
if (aliasReturn == 0) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2001-2004, International Business Machines
|
||||
* Copyright (c) 2001-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
@ -44,13 +44,13 @@ class TransliteratorAlias : public UMemory {
|
||||
* Construct a simple alias (type == SIMPLE)
|
||||
* @param aliasID the given id.
|
||||
*/
|
||||
TransliteratorAlias(const UnicodeString& aliasID);
|
||||
TransliteratorAlias(const UnicodeString& aliasID, const UnicodeSet* compoundFilter);
|
||||
|
||||
/**
|
||||
* Construct a compound RBT alias (type == COMPOUND)
|
||||
*/
|
||||
TransliteratorAlias(const UnicodeString& ID, const UnicodeString& idBlock,
|
||||
Transliterator* adopted, int32_t idSplitPoint,
|
||||
TransliteratorAlias(const UnicodeString& ID, const UnicodeString& idBlocks,
|
||||
UVector* adoptedTransliterators,
|
||||
const UnicodeSet* compoundFilter);
|
||||
|
||||
/**
|
||||
@ -108,10 +108,10 @@ class TransliteratorAlias : public UMemory {
|
||||
// Here ID is the ID, aliasesOrRules is the rules string.
|
||||
// direction is the UTransDirection.
|
||||
UnicodeString ID;
|
||||
UnicodeString aliasID; // rename! holds rules for RULES type
|
||||
Transliterator* trans; // owned
|
||||
UnicodeString aliasesOrRules;
|
||||
UVector* transes; // owned
|
||||
const UnicodeSet* compoundFilter; // alias
|
||||
int32_t idSplitPoint; // rename! holds UTransDirection for RULES type
|
||||
UTransDirection direction;
|
||||
enum { SIMPLE, COMPOUND, RULES } type;
|
||||
|
||||
TransliteratorAlias(const TransliteratorAlias &other); // forbid copying of this class
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2002-2004, International Business Machines Corporation
|
||||
* Copyright (c) 2002-2005, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
@ -437,22 +437,13 @@ UBool TransliteratorIDParser::parseCompoundID(const UnicodeString& id, int32_t d
|
||||
* the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert
|
||||
* SingleID entries to actual transliterators.
|
||||
*
|
||||
* Also, optionally, insert the given transliterator at the given
|
||||
* position. This effectively happens before anything else.
|
||||
*
|
||||
* @param list vector of SingleID objects. On exit, vector
|
||||
* of one or more Transliterators.
|
||||
* @param insert Transliterator to insert, or NULL if none.
|
||||
* Adopted.
|
||||
* @param insertIndex index from 0..list.size()-1, at which
|
||||
* to place 'insert', or -1 if none.
|
||||
* @return new value of insertIndex. The index will shift if
|
||||
* there are empty items, like "(Lower)", with indices less than
|
||||
* insertIndex.
|
||||
*/
|
||||
int32_t TransliteratorIDParser::instantiateList(UVector& list,
|
||||
Transliterator* insert,
|
||||
int32_t insertIndex,
|
||||
void TransliteratorIDParser::instantiateList(UVector& list,
|
||||
UErrorCode& ec) {
|
||||
UVector tlist(ec);
|
||||
if (U_FAILURE(ec)) {
|
||||
@ -463,15 +454,6 @@ int32_t TransliteratorIDParser::instantiateList(UVector& list,
|
||||
Transliterator* t;
|
||||
int32_t i;
|
||||
for (i=0; i<=list.size(); ++i) { // [sic]: i<=list.size()
|
||||
if (insertIndex == i) {
|
||||
insertIndex = tlist.size();
|
||||
tlist.addElement(insert, ec);
|
||||
if (U_FAILURE(ec)) {
|
||||
goto RETURN;
|
||||
}
|
||||
insert = NULL;
|
||||
}
|
||||
|
||||
// We run the loop too long by one, so we can
|
||||
// do an insert after the last element
|
||||
if (i==list.size()) {
|
||||
@ -525,9 +507,7 @@ int32_t TransliteratorIDParser::instantiateList(UVector& list,
|
||||
}
|
||||
}
|
||||
|
||||
delete insert; // Clean up in case of failure
|
||||
list.setDeleter(save);
|
||||
return insertIndex;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**************************************************************************
|
||||
* Copyright (c) 2002-2004, International Business Machines Corporation *
|
||||
* Copyright (c) 2002-2005, International Business Machines Corporation *
|
||||
* and others. All Rights Reserved. *
|
||||
**************************************************************************
|
||||
* Date Name Description *
|
||||
@ -202,23 +202,15 @@ class TransliteratorIDParser /* not : public UObject because all methods are sta
|
||||
* the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert
|
||||
* SingleID entries to actual transliterators.
|
||||
*
|
||||
* Also, optionally, insert the given transliterator at the given
|
||||
* position. This effectively happens before anything else.
|
||||
*
|
||||
* @param list vector of SingleID objects. On exit, vector
|
||||
* of one or more Transliterators.
|
||||
* @param insert Transliterator to insert, or null if none.
|
||||
* @param insertIndex index from 0..list.size()-1, at which
|
||||
* to place 'insert', or -1 if none.
|
||||
* @param ec Output param to receive a success or an error code.
|
||||
* @return new value of insertIndex. The index will shift if
|
||||
* there are empty items, like "(Lower)", with indices less than
|
||||
* insertIndex.
|
||||
*/
|
||||
static int32_t instantiateList(UVector& list,
|
||||
Transliterator* insert,
|
||||
int32_t insertIndex,
|
||||
UErrorCode& ec);
|
||||
static void instantiateList(UVector& list,
|
||||
UErrorCode& ec);
|
||||
|
||||
/**
|
||||
* Parse an ID into pieces. Take IDs of the form T, T/V, S-T,
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2004, International Business Machines
|
||||
* Copyright (C) 1999-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
@ -343,6 +343,7 @@ protected:
|
||||
|
||||
friend class TransliteratorParser; // for parseID()
|
||||
friend class TransliteratorIDParser; // for createBasicInstance()
|
||||
friend class TransliteratorAlias; // for setID()
|
||||
|
||||
public:
|
||||
|
||||
|
@ -1101,19 +1101,19 @@ void TransliteratorRoundTripTest::TestHan() {
|
||||
pn->transliterate(target2);
|
||||
|
||||
// verify that there are no marks
|
||||
Transliterator *nfc = Transliterator::createInstance("nfc", UTRANS_FORWARD, status);
|
||||
Transliterator *nfd = Transliterator::createInstance("nfd", UTRANS_FORWARD, status);
|
||||
ASSERT_SUCCESS(status);
|
||||
|
||||
UnicodeString nfced = target2;
|
||||
nfc->transliterate(nfced);
|
||||
UnicodeSet allMarks("[:mark:]", status);
|
||||
UnicodeString nfded = target2;
|
||||
nfd->transliterate(nfded);
|
||||
UnicodeSet allMarks("[\\u0304\\u0301\\u030C\\u0300\\u0306]", status); // look only for Pinyin tone marks, not all marks (there are some others in there)
|
||||
ASSERT_SUCCESS(status);
|
||||
assertFalse("NumericPinyin must contain no marks", allMarks.containsSome(nfced));
|
||||
assertFalse("NumericPinyin must contain no marks", allMarks.containsSome(nfded));
|
||||
|
||||
// verify roundtrip
|
||||
Transliterator *np = pn->createInverse(status);
|
||||
ASSERT_SUCCESS(status);
|
||||
UnicodeString target3 = target;
|
||||
UnicodeString target3 = target2;
|
||||
np->transliterate(target3);
|
||||
UBool roundtripOK = (target3.compare(target) == 0);
|
||||
assertTrue("NumericPinyin must roundtrip", roundtripOK);
|
||||
@ -1125,13 +1125,15 @@ void TransliteratorRoundTripTest::TestHan() {
|
||||
writeStringInU8(out, target);
|
||||
fprintf(out, "\nPinyin-Numeric-Pinyin: ");
|
||||
writeStringInU8(out, target2);
|
||||
fprintf(out, "\nNumeric-Pinyin-Pinyin: ");
|
||||
writeStringInU8(out, target3);
|
||||
fprintf(out, "\n");
|
||||
fclose(out);
|
||||
}
|
||||
|
||||
delete hanTL;
|
||||
delete pn;
|
||||
delete nfc;
|
||||
delete nfd;
|
||||
delete np;
|
||||
uset_close(USetExemplars);
|
||||
}
|
||||
|
@ -183,6 +183,8 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
|
||||
TESTCASE(75,TestAllCodepoints);
|
||||
TESTCASE(76,TestBoilerplate);
|
||||
TESTCASE(77,TestAlternateSyntax);
|
||||
TESTCASE(78,TestBeginEnd);
|
||||
TESTCASE(79,TestBeginEndToRules);
|
||||
default: name = ""; break;
|
||||
}
|
||||
}
|
||||
@ -776,7 +778,7 @@ void TransliteratorTest::TestJ277(void) {
|
||||
// Transliterate the Greek locale data
|
||||
Locale el("el");
|
||||
DateFormatSymbols syms(el, status);
|
||||
if (U_FAILURE(status)) { errln("FAIL: DateFormatSymbols constructor failed. Error: " + UnicodeString(u_errorName(status))); return; }
|
||||
if (U_FAILURE(status)) { errln("FAIL: Transliterator constructor failed"); return; }
|
||||
int32_t i, count;
|
||||
const UnicodeString* data = syms.getMonths(count);
|
||||
for (i=0; i<count; ++i) {
|
||||
@ -3972,6 +3974,332 @@ void TransliteratorTest::TestAlternateSyntax() {
|
||||
"<=\\N{LEFTWARDS ARROW}; >=\\N{RIGHTWARDS ARROW}; <>=\\N{LEFT RIGHT ARROW}; &=\\N{INCREMENT}");
|
||||
}
|
||||
|
||||
// Rule sets exercised by TestBeginEnd() and TestBeginEndToRules().
//
// Each entry is one complete rule set, written as adjacent string literals
// that the compiler concatenates into a single string.  Entries whose rule
// set relies on ::BEGIN/::END are disabled (kept below as line comments) and
// replaced by an empty string so that the remaining entries keep their index.
static const char* BEGIN_END_RULES[] = {
    // [0]: plain rules, no ::ID rule anywhere
    "abc > xy;"
    "aba > z;",

    // [1]: disabled (uses ::BEGIN/::END)
    //   "::BEGIN;"
    //   "abc > xy;"
    //   "::END;"
    //   "::BEGIN;"
    //   "aba > z;"
    //   "::END;",
    "",  // placeholder so later indexes do not shift

    // [2]: disabled (uses ::BEGIN/::END)
    //   "abc > xy;"
    //   "::BEGIN;"
    //   "aba > z;"
    //   "::END;",
    "",  // placeholder so later indexes do not shift

    // [3]: disabled (uses ::BEGIN/::END)
    //   "::BEGIN;"
    //   "abc > xy;"
    //   "::END;"
    //   "aba > z;",
    "",  // placeholder so later indexes do not shift

    // [4]: two rules separated by ::Null
    "abc > xy;"
    "::Null;"
    "aba > z;",

    // [5]: ::Upper before, between and after two groups of rules
    "::Upper;"
    "ABC > xy;"
    "AB > x;"
    "C > z;"
    "::Upper;"
    "XYZ > p;"
    "XY > q;"
    "Z > r;"
    "::Upper;",

    // [6]: variable definitions followed by rules, no ::ID rule
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",

    // [7]: same as [6] with ::Null in front
    "::Null;"
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",

    // [8]: same as [6] with ::Null at the end
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';"
    "::Null;",

    // [9]: same as [6] with ::Null between the definitions and the rules
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "::Null;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",

    // [10]: disabled (uses ::BEGIN/::END)
    //   "::BEGIN;"
    //   "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    //   "$delim = [\\-$ws];"
    //   "::END;"
    //   "$ws $delim* > ' ';"
    //   "'-' $delim* > '-';",
    "",  // placeholder so later indexes do not shift

    // [11]: disabled (uses ::BEGIN/::END)
    //   "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    //   "$delim = [\\-$ws];"
    //   "::BEGIN;"
    //   "$ws $delim* > ' ';"
    //   "'-' $delim* > '-';"
    //   "::END;",
    "",  // placeholder so later indexes do not shift

    // [12]: disabled (uses ::BEGIN/::END)
    //   "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    //   "$delim = [\\-$ws];"
    //   "$ab = [ab];"
    //   "::BEGIN;"
    //   "$ws $delim* > ' ';"
    //   "'-' $delim* > '-';"
    //   "::END;"
    //   "::BEGIN;"
    //   "$ab { ' ' } $ab > '-';"
    //   "c { ' ' > ;"
    //   "::END;"
    //   "::BEGIN;"
    //   "'a-a' > a\\%|a;"
    //   "::END;",
    "",  // placeholder so later indexes do not shift

    // [13]: three groups of rules, each preceded by ::Null
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ab = [ab];"
    "::Null;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';"
    "::Null;"
    "$ab { ' ' } $ab > '-';"
    "c { ' ' > ;"
    "::Null;"
    "'a-a' > a\\%|a;",

    // [14]: disabled (uses ::BEGIN/::END)
    //   "::[abc];"
    //   "::BEGIN;"
    //   "abc > xy;"
    //   "::END;"
    //   "::BEGIN;"
    //   "aba > yz;"
    //   "::END;"
    //   "::Upper;",
    "",  // placeholder so later indexes do not shift

    // [15]: leading ::[abc], rules split by ::Null, trailing ::Upper
    "::[abc];"
    "abc > xy;"
    "::Null;"
    "aba > yz;"
    "::Upper;",

    // [16]: disabled (uses ::BEGIN/::END)
    //   "::[abc];"
    //   "::BEGIN;"
    //   "abc <> xy;"
    //   "::END;"
    //   "::BEGIN;"
    //   "aba <> yz;"
    //   "::END;"
    //   "::Upper(Lower);"
    //   "::([XYZ]);"
    "",  // placeholder so later indexes do not shift

    // [17]: like [15] but with <> rules; the tests also build this entry
    //       with UTRANS_REVERSE
    "::[abc];"
    "abc <> xy;"
    "::Null;"
    "aba <> yz;"
    "::Upper(Lower);"
    "::([XYZ]);"
};

// Number of entries in BEGIN_END_RULES.
static const int32_t BEGIN_END_RULES_length =
    (int32_t)(sizeof(BEGIN_END_RULES) / sizeof(BEGIN_END_RULES[0]));
|
||||
|
||||
/*
|
||||
(This entire test is commented out below and will need some heavy revision when we re-add
|
||||
the ::BEGIN/::END stuff)
|
||||
static const char* BOGUS_BEGIN_END_RULES[] = {
|
||||
// [7]
|
||||
"::BEGIN;"
|
||||
"abc > xy;"
|
||||
"::BEGIN;"
|
||||
"aba > z;"
|
||||
"::END;"
|
||||
"::END;",
|
||||
|
||||
// [8]
|
||||
"abc > xy;"
|
||||
" aba > z;"
|
||||
"::END;",
|
||||
|
||||
// [9]
|
||||
"::BEGIN;"
|
||||
"::Upper;"
|
||||
"::END;"
|
||||
};
|
||||
static const int32_t BOGUS_BEGIN_END_RULES_length = (int32_t)(sizeof(BOGUS_BEGIN_END_RULES) / sizeof(BOGUS_BEGIN_END_RULES[0]));
|
||||
*/
|
||||
|
||||
// Test cases shared by TestBeginEnd() and TestBeginEndToRules().
//
// The table is flat: every test case takes three consecutive slots, in the
// order { rules, input, expected output }.  Cases whose rule set is disabled
// in BEGIN_END_RULES are listed as line comments so the intended coverage
// stays visible.
static const char* BEGIN_END_TEST_CASES[] = {
    //  rules                  input                    expected output
    BEGIN_END_RULES[0],        "abc ababc aba",         "xy zbc z",
    // BEGIN_END_RULES[1],     "abc ababc aba",         "xy abxy z",
    // BEGIN_END_RULES[2],     "abc ababc aba",         "xy abxy z",
    // BEGIN_END_RULES[3],     "abc ababc aba",         "xy abxy z",
    BEGIN_END_RULES[4],        "abc ababc aba",         "xy abxy z",
    BEGIN_END_RULES[5],        "abccabaacababcbc",      "PXAARXQBR",

    BEGIN_END_RULES[6],        "e e - e---e- e",        "e e e-e-e",
    BEGIN_END_RULES[7],        "e e - e---e- e",        "e e e-e-e",
    BEGIN_END_RULES[8],        "e e - e---e- e",        "e e e-e-e",
    BEGIN_END_RULES[9],        "e e - e---e- e",        "e e e-e-e",
    // BEGIN_END_RULES[10],    "e e - e---e- e",        "e e e-e-e",
    // BEGIN_END_RULES[11],    "e e - e---e- e",        "e e e-e-e",
    // BEGIN_END_RULES[12],    "e e - e---e- e",        "e e e-e-e",
    // BEGIN_END_RULES[12],    "a a a a",               "a%a%a%a",
    // BEGIN_END_RULES[12],    "a a-b c b a",           "a%a-b cb-a",
    BEGIN_END_RULES[13],       "e e - e---e- e",        "e e e-e-e",
    BEGIN_END_RULES[13],       "a a a a",               "a%a%a%a",
    BEGIN_END_RULES[13],       "a a-b c b a",           "a%a-b cb-a",

    // BEGIN_END_RULES[14],    "abc xy ababc xyz aba",  "XY xy ABXY xyz YZ",
    BEGIN_END_RULES[15],       "abc xy ababc xyz aba",  "XY xy ABXY xyz YZ",
    // BEGIN_END_RULES[16],    "abc xy ababc xyz aba",  "XY xy ABXY xyz YZ",
    BEGIN_END_RULES[17],       "abc xy ababc xyz aba",  "XY xy ABXY xyz YZ"
};

// Number of slots (not test cases) in BEGIN_END_TEST_CASES; callers step through it by 3.
static const int32_t BEGIN_END_TEST_CASES_length =
    (int32_t)(sizeof(BEGIN_END_TEST_CASES) / sizeof(BEGIN_END_TEST_CASES[0]));
|
||||
|
||||
void TransliteratorTest::TestBeginEnd() {
|
||||
// run through the list of test cases above
|
||||
int32_t i = 0;
|
||||
for (i = 0; i < BEGIN_END_TEST_CASES_length; i += 3) {
|
||||
expect((UnicodeString)"Test case #" + (i / 3),
|
||||
UnicodeString(BEGIN_END_TEST_CASES[i]),
|
||||
UnicodeString(BEGIN_END_TEST_CASES[i + 1]),
|
||||
UnicodeString(BEGIN_END_TEST_CASES[i + 2]));
|
||||
}
|
||||
|
||||
// instantiate the one reversible rule set in the reverse direction and make sure it does the right thing
|
||||
UParseError parseError;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
Transliterator* reversed = Transliterator::createFromRules("Reversed", UnicodeString(BEGIN_END_RULES[17]),
|
||||
UTRANS_REVERSE, parseError, status);
|
||||
if (reversed == 0 || U_FAILURE(status)) {
|
||||
reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator"), parseError, status);
|
||||
} else {
|
||||
expect(*reversed, UnicodeString("xy XY XYZ yz YZ"), UnicodeString("xy abc xaba yz aba"));
|
||||
}
|
||||
delete reversed;
|
||||
|
||||
// finally, run through the list of syntactically-ill-formed rule sets above and make sure
|
||||
// that all of them cause errors
|
||||
/*
|
||||
(commented out until we have the real ::BEGIN/::END stuff in place
|
||||
for (i = 0; i < BOGUS_BEGIN_END_RULES_length; i++) {
|
||||
UParseError parseError;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
Transliterator* t = Transliterator::createFromRules("foo", UnicodeString(BOGUS_BEGIN_END_RULES[i]),
|
||||
UTRANS_FORWARD, parseError, status);
|
||||
if (!U_FAILURE(status)) {
|
||||
delete t;
|
||||
errln((UnicodeString)"Should have gotten syntax error from " + BOGUS_BEGIN_END_RULES[i]);
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
void TransliteratorTest::TestBeginEndToRules() {
|
||||
// run through the same list of test cases we used above, but this time, instead of just
|
||||
// instantiating a Transliterator from the rules and running the test against it, we instantiate
|
||||
// a Transliterator from the rules, do toRules() on it, instantiate a Transliterator from
|
||||
// the resulting set of rules, and make sure that the generated rule set is semantically equivalent
|
||||
// to (i.e., does the same thing as) the original rule set
|
||||
for (int32_t i = 0; i < BEGIN_END_TEST_CASES_length; i += 3) {
|
||||
UParseError parseError;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
Transliterator* t = Transliterator::createFromRules("--", UnicodeString(BEGIN_END_TEST_CASES[i]),
|
||||
UTRANS_FORWARD, parseError, status);
|
||||
if (U_FAILURE(status)) {
|
||||
reportParseError(UnicodeString("FAIL: Couldn't create transliterator"), parseError, status);
|
||||
} else {
|
||||
UnicodeString rules;
|
||||
t->toRules(rules, TRUE);
|
||||
Transliterator* t2 = Transliterator::createFromRules((UnicodeString)"Test case #" + (i / 3), rules,
|
||||
UTRANS_FORWARD, parseError, status);
|
||||
if (U_FAILURE(status)) {
|
||||
reportParseError(UnicodeString("FAIL: Couldn't create transliterator from generated rules"),
|
||||
parseError, status);
|
||||
delete t;
|
||||
} else {
|
||||
expect(*t2,
|
||||
UnicodeString(BEGIN_END_TEST_CASES[i + 1]),
|
||||
UnicodeString(BEGIN_END_TEST_CASES[i + 2]));
|
||||
delete t;
|
||||
delete t2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// do the same thing for the reversible test case
|
||||
UParseError parseError;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
Transliterator* reversed = Transliterator::createFromRules("Reversed", UnicodeString(BEGIN_END_RULES[17]),
|
||||
UTRANS_REVERSE, parseError, status);
|
||||
if (U_FAILURE(status)) {
|
||||
reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator"), parseError, status);
|
||||
} else {
|
||||
UnicodeString rules;
|
||||
reversed->toRules(rules, FALSE);
|
||||
Transliterator* reversed2 = Transliterator::createFromRules("Reversed", rules, UTRANS_FORWARD,
|
||||
parseError, status);
|
||||
if (U_FAILURE(status)) {
|
||||
reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator from generated rules"),
|
||||
parseError, status);
|
||||
delete reversed;
|
||||
} else {
|
||||
expect(*reversed2,
|
||||
UnicodeString("xy XY XYZ yz YZ"),
|
||||
UnicodeString("xy abc xaba yz aba"));
|
||||
delete reversed;
|
||||
delete reversed2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
@ -3990,14 +4318,35 @@ void TransliteratorTest::expectT(const UnicodeString& id,
|
||||
delete t;
|
||||
}
|
||||
|
||||
void TransliteratorTest::reportParseError(const UnicodeString& message,
|
||||
const UParseError& parseError,
|
||||
const UErrorCode& status) {
|
||||
errln(message +
|
||||
/*", parse error " + parseError.code +*/
|
||||
", line " + parseError.line +
|
||||
", offset " + parseError.offset +
|
||||
", pre-context " + prettify(parseError.preContext, TRUE) +
|
||||
", post-context " + prettify(parseError.postContext,TRUE) +
|
||||
", Error: " + u_errorName(status));
|
||||
}
|
||||
|
||||
void TransliteratorTest::expect(const UnicodeString& rules,
|
||||
const UnicodeString& source,
|
||||
const UnicodeString& expectedResult,
|
||||
UTransPosition *pos) {
|
||||
expect("<ID>", rules, source, expectedResult, pos);
|
||||
}
|
||||
|
||||
void TransliteratorTest::expect(const UnicodeString& id,
|
||||
const UnicodeString& rules,
|
||||
const UnicodeString& source,
|
||||
const UnicodeString& expectedResult,
|
||||
UTransPosition *pos) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
Transliterator *t = new RuleBasedTransliterator("<ID>", rules, status);
|
||||
UParseError parseError;
|
||||
Transliterator* t = Transliterator::createFromRules(id, rules, UTRANS_FORWARD, parseError, status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("FAIL: Transliterator constructor failed");
|
||||
reportParseError(UnicodeString("Couldn't create transliterator from ") + rules, parseError, status);
|
||||
} else {
|
||||
expect(*t, source, expectedResult, pos);
|
||||
}
|
||||
@ -4021,7 +4370,6 @@ void TransliteratorTest::expect(const Transliterator& t,
|
||||
t.transliterate(result);
|
||||
expectAux(t.getID() + ":String", source, result, expectedResult);
|
||||
}
|
||||
|
||||
UTransPosition index={0, 0, 0, 0};
|
||||
if (pos != 0) {
|
||||
index = *pos;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2003, International Business Machines
|
||||
* Copyright (C) 1999-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
@ -344,6 +344,10 @@ private:
|
||||
|
||||
void TestAlternateSyntax(void);
|
||||
|
||||
void TestBeginEnd(void);
|
||||
|
||||
void TestBeginEndToRules(void);
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
@ -357,6 +361,12 @@ private:
|
||||
const UnicodeString& expectedResult,
|
||||
UTransPosition *pos=0);
|
||||
|
||||
void expect(const UnicodeString& id,
|
||||
const UnicodeString& rules,
|
||||
const UnicodeString& source,
|
||||
const UnicodeString& expectedResult,
|
||||
UTransPosition *pos=0);
|
||||
|
||||
void expect(const Transliterator& t,
|
||||
const UnicodeString& source,
|
||||
const UnicodeString& expectedResult,
|
||||
@ -385,6 +395,8 @@ private:
|
||||
void CheckIncrementalAux(const Transliterator* t,
|
||||
const UnicodeString& input);
|
||||
|
||||
void reportParseError(const UnicodeString& message, const UParseError& parseError, const UErrorCode& status);
|
||||
|
||||
|
||||
const UnicodeString DESERET_DEE;
|
||||
const UnicodeString DESERET_dee;
|
||||
|
Loading…
Reference in New Issue
Block a user