d656cb42f9
X-SVN-Rev: 6947
156 lines
5.0 KiB
C++
156 lines
5.0 KiB
C++
/*
**********************************************************************
*   Copyright (C) 1999, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/17/99    aliu        Creation.
**********************************************************************
*/
|
|
#include "unicode/rbt.h"
|
|
#include "rbt_pars.h"
|
|
#include "rbt_data.h"
|
|
#include "rbt_rule.h"
|
|
#include "unicode/rep.h"
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
// Storage for the class ID static member. The stored value is never
// meaningful (see the original note: "Value is irrelevant").
const char RuleBasedTransliterator::fgClassID = '\0';
|
|
|
|
void RuleBasedTransliterator::_construct(const UnicodeString& rules,
|
|
UTransDirection direction,
|
|
UParseError& parseError,
|
|
UErrorCode& status) {
|
|
data = 0;
|
|
isDataOwned = TRUE;
|
|
if (U_FAILURE(status)) {
|
|
return;
|
|
}
|
|
|
|
TransliteratorParser parser;
|
|
parser.parse(rules, direction, parseError, status);
|
|
if (U_FAILURE(status)) {
|
|
return;
|
|
}
|
|
|
|
if (parser.idBlock.length() != 0 ||
|
|
parser.compoundFilter != NULL) {
|
|
status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
|
|
return;
|
|
}
|
|
|
|
data = parser.orphanData();
|
|
setMaximumContextLength(data->ruleSet.getMaximumContextLength());
|
|
}
|
|
|
|
/**
 * Constructs a transliterator around rule data that this object does NOT
 * own: isDataOwned is set to FALSE, so the destructor will not delete it.
 *
 * @param id            ID forwarded to the Transliterator base class
 * @param theData       rule data to reference; must not be null, since it
 *                      is dereferenced immediately
 * @param adoptedFilter filter forwarded to the Transliterator base class
 */
RuleBasedTransliterator::RuleBasedTransliterator(
        const UnicodeString& id,
        const TransliterationRuleData* theData,
        UnicodeFilter* adoptedFilter)
    : Transliterator(id, adoptedFilter),
      // The member pointer is non-const, so constness is dropped here.
      data(const_cast<TransliterationRuleData*>(theData)),
      isDataOwned(FALSE)
{
    setMaximumContextLength(data->ruleSet.getMaximumContextLength());
}
|
|
|
|
/**
|
|
* Internal constructor.
|
|
*/
|
|
RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
|
|
TransliterationRuleData* theData,
|
|
UBool isDataAdopted) :
|
|
Transliterator(id, 0),
|
|
data(theData),
|
|
isDataOwned(isDataAdopted) {
|
|
setMaximumContextLength(data->ruleSet.getMaximumContextLength());
|
|
}
|
|
|
|
/**
|
|
* Copy constructor.
|
|
*/
|
|
RuleBasedTransliterator::RuleBasedTransliterator(
|
|
const RuleBasedTransliterator& other) :
|
|
Transliterator(other), data(other.data),
|
|
isDataOwned(other.isDataOwned) {
|
|
|
|
// The data object may or may not be owned. If it is not owned we
|
|
// share it; it is invariant. If it is owned, it's still
|
|
// invariant, but we need to copy it to prevent double-deletion.
|
|
// If this becomes a performance issue (if people do a lot of RBT
|
|
// copying -- unlikely) we can reference count the data object.
|
|
|
|
// Only do a deep copy if this is owned data, that is, data that
|
|
// will be later deleted. System transliterators contain
|
|
// non-owned data.
|
|
if (isDataOwned) {
|
|
data = new TransliterationRuleData(*other.data);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Destructor.
|
|
*/
|
|
RuleBasedTransliterator::~RuleBasedTransliterator() {
|
|
// Delete the data object only if we own it.
|
|
if (isDataOwned) {
|
|
delete data;
|
|
}
|
|
}
|
|
|
|
/**
 * Returns a newly allocated copy of this object, produced with the copy
 * constructor.
 */
Transliterator* // Covariant return NOT ALLOWED (for portability)
RuleBasedTransliterator::clone(void) const {
    RuleBasedTransliterator* duplicate = new RuleBasedTransliterator(*this);
    return duplicate;
}
|
|
|
|
/**
|
|
* Implements {@link Transliterator#handleTransliterate}.
|
|
*/
|
|
void
|
|
RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
|
|
UBool isIncremental) const {
|
|
/* We keep contextStart and contextLimit fixed the entire time,
|
|
* relative to the text -- contextLimit may move numerically if
|
|
* text is inserted or removed. The start offset moves toward
|
|
* limit, with replacements happening under it.
|
|
*
|
|
* Example: rules 1. ab>x|y
|
|
* 2. yc>z
|
|
*
|
|
* |eabcd begin - no match, advance start
|
|
* e|abcd match rule 1 - change text & adjust start
|
|
* ex|ycd match rule 2 - change text & adjust start
|
|
* exz|d no match, advance start
|
|
* exzd| done
|
|
*/
|
|
|
|
/* A rule like
|
|
* a>b|a
|
|
* creates an infinite loop. To prevent that, we put an arbitrary
|
|
* limit on the number of iterations that we take, one that is
|
|
* high enough that any reasonable rules are ok, but low enough to
|
|
* prevent a server from hanging. The limit is 16 times the
|
|
* number of characters n, unless n is so large that 16n exceeds a
|
|
* uint32_t.
|
|
*/
|
|
uint32_t loopCount = 0;
|
|
uint32_t loopLimit = index.limit - index.start;
|
|
if (loopLimit >= 0x10000000) {
|
|
loopLimit = 0xFFFFFFFF;
|
|
} else {
|
|
loopLimit <<= 4;
|
|
}
|
|
|
|
while (index.start < index.limit &&
|
|
loopCount <= loopLimit &&
|
|
data->ruleSet.transliterate(text, index, isIncremental)) {
|
|
++loopCount;
|
|
}
|
|
}
|
|
|
|
/**
 * Produces the rule source for this transliterator by delegating to the
 * rule set's toRules().
 *
 * @param rulesSource       the string handed to the rule set to receive
 *                          the rules
 * @param escapeUnprintable forwarded unchanged to the rule set
 * @return whatever reference the rule set's toRules() returns
 *         (presumably rulesSource itself -- confirm against the rule set)
 */
UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource,
                                                UBool escapeUnprintable) const {
    UnicodeString& generated =
        data->ruleSet.toRules(rulesSource, escapeUnprintable);
    return generated;
}
|
|
|
|
U_NAMESPACE_END
|
|
|