2017-01-20 00:20:31 +00:00
|
|
|
// © 2016 and later: Unicode, Inc. and others.
|
2016-06-15 18:58:17 +00:00
|
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
1999-11-20 00:40:50 +00:00
|
|
|
/*
|
|
|
|
**********************************************************************
|
2016-05-31 21:45:07 +00:00
|
|
|
* Copyright (C) 1999-2015, International Business Machines
|
|
|
|
* Corporation and others. All Rights Reserved.
|
1999-11-20 00:40:50 +00:00
|
|
|
**********************************************************************
|
|
|
|
* Date Name Description
|
|
|
|
* 11/17/99 aliu Creation.
|
|
|
|
**********************************************************************
|
|
|
|
*/
|
2002-08-21 19:12:24 +00:00
|
|
|
|
2002-09-20 01:54:48 +00:00
|
|
|
#include "unicode/utypes.h"
|
|
|
|
|
|
|
|
#if !UCONFIG_NO_TRANSLITERATION
|
|
|
|
|
2002-07-12 21:42:24 +00:00
|
|
|
#include "unicode/rep.h"
|
|
|
|
#include "unicode/uniset.h"
|
1999-11-20 00:40:50 +00:00
|
|
|
#include "rbt_pars.h"
|
|
|
|
#include "rbt_data.h"
|
|
|
|
#include "rbt_rule.h"
|
2003-02-14 01:44:49 +00:00
|
|
|
#include "rbt.h"
|
2015-04-29 23:18:32 +00:00
|
|
|
#include "mutex.h"
|
2004-02-23 04:36:02 +00:00
|
|
|
#include "umutex.h"
|
1999-11-20 00:40:50 +00:00
|
|
|
|
2001-10-08 23:26:58 +00:00
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
2003-08-29 20:06:23 +00:00
|
|
|
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator)
|
2001-06-12 17:35:03 +00:00
|
|
|
|
2004-06-14 18:56:58 +00:00
|
|
|
// Address of the text object that handleTransliterate() is currently working on
// while it holds its transliterator data mutex, or NULL when no such call is in
// progress. handleTransliterate() compares an incoming text against this pointer
// to detect recursive entry on the same text and avoid locking twice. Every read
// and write in this file is made while holding a default-constructed Mutex
// (the global ICU mutex).
static Replaceable *gLockedText = NULL;
|
2004-02-23 04:36:02 +00:00
|
|
|
|
1999-11-20 00:40:50 +00:00
|
|
|
void RuleBasedTransliterator::_construct(const UnicodeString& rules,
|
2000-06-27 19:00:38 +00:00
|
|
|
UTransDirection direction,
|
2001-08-31 03:23:39 +00:00
|
|
|
UParseError& parseError,
|
|
|
|
UErrorCode& status) {
|
2003-12-10 01:52:39 +00:00
|
|
|
fData = 0;
|
1999-11-20 00:40:50 +00:00
|
|
|
isDataOwned = TRUE;
|
2001-10-10 19:29:45 +00:00
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
return;
|
1999-11-20 00:40:50 +00:00
|
|
|
}
|
2001-10-10 19:29:45 +00:00
|
|
|
|
2006-04-14 21:09:42 +00:00
|
|
|
TransliteratorParser parser(status);
|
2001-10-10 19:29:45 +00:00
|
|
|
parser.parse(rules, direction, parseError, status);
|
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2006-04-14 21:09:42 +00:00
|
|
|
if (parser.idBlockVector.size() != 0 ||
|
2006-07-13 21:04:20 +00:00
|
|
|
parser.compoundFilter != NULL ||
|
|
|
|
parser.dataVector.size() == 0) {
|
2001-10-10 19:29:45 +00:00
|
|
|
status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2006-04-14 21:09:42 +00:00
|
|
|
fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0);
|
2003-12-10 01:52:39 +00:00
|
|
|
setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
|
1999-11-20 00:40:50 +00:00
|
|
|
}
|
|
|
|
|
2003-08-29 20:06:23 +00:00
|
|
|
/**
|
|
|
|
* Constructs a new transliterator from the given rules.
|
|
|
|
* @param id the id for the transliterator.
|
|
|
|
* @param rules rules, separated by ';'
|
|
|
|
* @param direction either FORWARD or REVERSE.
|
|
|
|
* @param adoptedFilter the filter for this transliterator.
|
|
|
|
* @param parseError Struct to recieve information on position
|
|
|
|
* of error if an error is encountered
|
|
|
|
* @param status Output param set to success/failure code.
|
|
|
|
* @exception IllegalArgumentException if rules are malformed
|
|
|
|
* or direction is invalid.
|
|
|
|
*/
|
|
|
|
RuleBasedTransliterator::RuleBasedTransliterator(
|
|
|
|
const UnicodeString& id,
|
|
|
|
const UnicodeString& rules,
|
|
|
|
UTransDirection direction,
|
|
|
|
UnicodeFilter* adoptedFilter,
|
|
|
|
UParseError& parseError,
|
|
|
|
UErrorCode& status) :
|
|
|
|
Transliterator(id, adoptedFilter) {
|
|
|
|
_construct(rules, direction,parseError,status);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Constructs a new transliterator from the given rules.
|
|
|
|
* @param id the id for the transliterator.
|
|
|
|
* @param rules rules, separated by ';'
|
|
|
|
* @param direction either FORWARD or REVERSE.
|
|
|
|
* @param adoptedFilter the filter for this transliterator.
|
|
|
|
* @param status Output param set to success/failure code.
|
|
|
|
* @exception IllegalArgumentException if rules are malformed
|
|
|
|
* or direction is invalid.
|
|
|
|
*/
|
2007-06-11 17:37:31 +00:00
|
|
|
/*RuleBasedTransliterator::RuleBasedTransliterator(
|
2003-08-29 20:06:23 +00:00
|
|
|
const UnicodeString& id,
|
|
|
|
const UnicodeString& rules,
|
|
|
|
UTransDirection direction,
|
|
|
|
UnicodeFilter* adoptedFilter,
|
|
|
|
UErrorCode& status) :
|
|
|
|
Transliterator(id, adoptedFilter) {
|
|
|
|
UParseError parseError;
|
|
|
|
_construct(rules, direction,parseError, status);
|
2007-06-11 17:37:31 +00:00
|
|
|
}*/
|
2003-08-29 20:06:23 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Convenience constructor with no filter.
|
|
|
|
*/
|
2007-06-14 01:37:11 +00:00
|
|
|
/*RuleBasedTransliterator::RuleBasedTransliterator(
|
2003-08-29 20:06:23 +00:00
|
|
|
const UnicodeString& id,
|
|
|
|
const UnicodeString& rules,
|
|
|
|
UTransDirection direction,
|
|
|
|
UErrorCode& status) :
|
|
|
|
Transliterator(id, 0) {
|
|
|
|
UParseError parseError;
|
|
|
|
_construct(rules, direction,parseError, status);
|
2007-06-14 01:37:11 +00:00
|
|
|
}*/
|
2003-08-29 20:06:23 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Convenience constructor with no filter and FORWARD direction.
|
|
|
|
*/
|
2007-06-14 01:37:11 +00:00
|
|
|
/*RuleBasedTransliterator::RuleBasedTransliterator(
|
2003-08-29 20:06:23 +00:00
|
|
|
const UnicodeString& id,
|
|
|
|
const UnicodeString& rules,
|
|
|
|
UErrorCode& status) :
|
|
|
|
Transliterator(id, 0) {
|
|
|
|
UParseError parseError;
|
|
|
|
_construct(rules, UTRANS_FORWARD, parseError, status);
|
2007-06-14 01:37:11 +00:00
|
|
|
}*/
|
2003-08-29 20:06:23 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Convenience constructor with FORWARD direction.
|
|
|
|
*/
|
2007-06-11 17:37:31 +00:00
|
|
|
/*RuleBasedTransliterator::RuleBasedTransliterator(
|
2003-08-29 20:06:23 +00:00
|
|
|
const UnicodeString& id,
|
|
|
|
const UnicodeString& rules,
|
|
|
|
UnicodeFilter* adoptedFilter,
|
|
|
|
UErrorCode& status) :
|
|
|
|
Transliterator(id, adoptedFilter) {
|
|
|
|
UParseError parseError;
|
|
|
|
_construct(rules, UTRANS_FORWARD,parseError, status);
|
2007-06-11 17:37:31 +00:00
|
|
|
}*/
|
2003-08-29 20:06:23 +00:00
|
|
|
|
2000-09-30 00:06:54 +00:00
|
|
|
/**
 * Builds a transliterator around existing rule data that it does not own.
 *
 * The data pointer is stored as-is (constness is cast away only so it fits
 * the member's type) and isDataOwned is set to FALSE, so the destructor will
 * not delete it.
 *
 * @param id            the id for the transliterator.
 * @param theData       rule data to use; dereferenced here, so it must not
 *                      be null.
 * @param adoptedFilter the filter handed to the Transliterator base class.
 */
RuleBasedTransliterator::RuleBasedTransliterator(
        const UnicodeString& id,
        const TransliterationRuleData* theData,
        UnicodeFilter* adoptedFilter)
    : Transliterator(id, adoptedFilter),
      fData(const_cast<TransliterationRuleData*>(theData)), // cast away const
      isDataOwned(FALSE)
{
    setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
}
|
1999-11-20 00:40:50 +00:00
|
|
|
|
2001-07-13 21:09:41 +00:00
|
|
|
/**
|
|
|
|
* Internal constructor.
|
|
|
|
*/
|
|
|
|
RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
|
|
|
|
TransliterationRuleData* theData,
|
|
|
|
UBool isDataAdopted) :
|
|
|
|
Transliterator(id, 0),
|
2003-12-10 01:52:39 +00:00
|
|
|
fData(theData),
|
2001-07-13 21:09:41 +00:00
|
|
|
isDataOwned(isDataAdopted) {
|
2003-12-10 01:52:39 +00:00
|
|
|
setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
|
2001-07-13 21:09:41 +00:00
|
|
|
}
|
|
|
|
|
1999-11-20 00:40:50 +00:00
|
|
|
/**
|
2001-11-16 19:08:46 +00:00
|
|
|
* Copy constructor.
|
1999-11-20 00:40:50 +00:00
|
|
|
*/
|
|
|
|
RuleBasedTransliterator::RuleBasedTransliterator(
|
|
|
|
const RuleBasedTransliterator& other) :
|
2003-12-10 01:52:39 +00:00
|
|
|
Transliterator(other), fData(other.fData),
|
2000-06-30 23:26:07 +00:00
|
|
|
isDataOwned(other.isDataOwned) {
|
|
|
|
|
2001-11-16 19:08:46 +00:00
|
|
|
// The data object may or may not be owned. If it is not owned we
|
|
|
|
// share it; it is invariant. If it is owned, it's still
|
|
|
|
// invariant, but we need to copy it to prevent double-deletion.
|
|
|
|
// If this becomes a performance issue (if people do a lot of RBT
|
|
|
|
// copying -- unlikely) we can reference count the data object.
|
|
|
|
|
|
|
|
// Only do a deep copy if this is owned data, that is, data that
|
|
|
|
// will be later deleted. System transliterators contain
|
|
|
|
// non-owned data.
|
2000-06-30 23:26:07 +00:00
|
|
|
if (isDataOwned) {
|
2003-12-10 01:52:39 +00:00
|
|
|
fData = new TransliterationRuleData(*other.fData);
|
2000-01-18 18:27:27 +00:00
|
|
|
}
|
1999-11-20 01:04:34 +00:00
|
|
|
}
|
1999-11-20 00:40:50 +00:00
|
|
|
|
|
|
|
/**
|
2001-11-16 19:08:46 +00:00
|
|
|
* Destructor.
|
1999-11-20 00:40:50 +00:00
|
|
|
*/
|
1999-11-20 01:04:34 +00:00
|
|
|
RuleBasedTransliterator::~RuleBasedTransliterator() {
|
2001-11-16 19:08:46 +00:00
|
|
|
// Delete the data object only if we own it.
|
1999-11-20 01:04:34 +00:00
|
|
|
if (isDataOwned) {
|
2003-12-10 01:52:39 +00:00
|
|
|
delete fData;
|
1999-11-20 01:04:34 +00:00
|
|
|
}
|
|
|
|
}
|
1999-11-20 00:40:50 +00:00
|
|
|
|
|
|
|
/**
 * Returns a newly allocated copy of this object, made with the copy
 * constructor.
 */
Transliterator* // Covariant return NOT ALLOWED (for portability)
RuleBasedTransliterator::clone(void) const {
    Transliterator* duplicate = new RuleBasedTransliterator(*this);
    return duplicate;
}
|
|
|
|
|
|
|
|
/**
|
2000-01-19 19:12:42 +00:00
|
|
|
* Implements {@link Transliterator#handleTransliterate}.
|
1999-11-20 00:40:50 +00:00
|
|
|
*/
|
2000-01-19 19:12:42 +00:00
|
|
|
void
|
2000-06-27 19:00:38 +00:00
|
|
|
RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
|
2000-05-18 22:08:39 +00:00
|
|
|
UBool isIncremental) const {
|
2001-07-25 19:11:02 +00:00
|
|
|
/* We keep contextStart and contextLimit fixed the entire time,
|
|
|
|
* relative to the text -- contextLimit may move numerically if
|
|
|
|
* text is inserted or removed. The start offset moves toward
|
|
|
|
* limit, with replacements happening under it.
|
1999-11-20 00:40:50 +00:00
|
|
|
*
|
|
|
|
* Example: rules 1. ab>x|y
|
|
|
|
* 2. yc>z
|
|
|
|
*
|
2001-07-25 19:11:02 +00:00
|
|
|
* |eabcd begin - no match, advance start
|
|
|
|
* e|abcd match rule 1 - change text & adjust start
|
|
|
|
* ex|ycd match rule 2 - change text & adjust start
|
|
|
|
* exz|d no match, advance start
|
1999-11-20 00:40:50 +00:00
|
|
|
* exzd| done
|
|
|
|
*/
|
|
|
|
|
2000-02-12 02:05:36 +00:00
|
|
|
/* A rule like
|
|
|
|
* a>b|a
|
|
|
|
* creates an infinite loop. To prevent that, we put an arbitrary
|
|
|
|
* limit on the number of iterations that we take, one that is
|
|
|
|
* high enough that any reasonable rules are ok, but low enough to
|
|
|
|
* prevent a server from hanging. The limit is 16 times the
|
|
|
|
* number of characters n, unless n is so large that 16n exceeds a
|
|
|
|
* uint32_t.
|
|
|
|
*/
|
|
|
|
uint32_t loopCount = 0;
|
2000-06-29 00:18:43 +00:00
|
|
|
uint32_t loopLimit = index.limit - index.start;
|
2000-02-12 02:05:36 +00:00
|
|
|
if (loopLimit >= 0x10000000) {
|
|
|
|
loopLimit = 0xFFFFFFFF;
|
|
|
|
} else {
|
|
|
|
loopLimit <<= 4;
|
|
|
|
}
|
|
|
|
|
2004-06-14 18:54:35 +00:00
|
|
|
// Transliterator locking. Rule-based Transliterators are not thread safe; concurrent
|
|
|
|
// operations must be prevented.
|
|
|
|
// A Complication: compound transliterators can result in recursive entries to this
|
|
|
|
// function, sometimes with different "This" objects, always with the same text.
|
|
|
|
// Double-locking must be prevented in these cases.
|
|
|
|
//
|
|
|
|
|
|
|
|
UBool lockedMutexAtThisLevel = FALSE;
|
2015-12-08 01:29:24 +00:00
|
|
|
|
|
|
|
// Test whether this request is operating on the same text string as
|
|
|
|
// some other transliteration that is still in progress and holding the
|
|
|
|
// transliteration mutex. If so, do not lock the transliteration
|
|
|
|
// mutex again.
|
|
|
|
//
|
|
|
|
// gLockedText variable is protected by the global ICU mutex.
|
|
|
|
// Shared RBT data protected by transliteratorDataMutex.
|
|
|
|
//
|
|
|
|
// TODO(andy): Need a better scheme for handling this.
|
2019-03-11 23:36:33 +00:00
|
|
|
|
2019-04-06 00:37:44 +00:00
|
|
|
static UMutex *transliteratorDataMutex = STATIC_NEW(UMutex);
|
2015-12-08 01:29:24 +00:00
|
|
|
UBool needToLock;
|
|
|
|
{
|
|
|
|
Mutex m;
|
|
|
|
needToLock = (&text != gLockedText);
|
|
|
|
}
|
|
|
|
if (needToLock) {
|
2019-04-06 00:37:44 +00:00
|
|
|
umtx_lock(transliteratorDataMutex); // Contention, longish waits possible here.
|
2015-12-08 01:29:24 +00:00
|
|
|
Mutex m;
|
|
|
|
gLockedText = &text;
|
|
|
|
lockedMutexAtThisLevel = TRUE;
|
2003-12-10 01:52:39 +00:00
|
|
|
}
|
2004-06-14 18:54:35 +00:00
|
|
|
|
2008-01-14 20:27:51 +00:00
|
|
|
// Check to make sure we don't dereference a null pointer.
|
|
|
|
if (fData != NULL) {
|
|
|
|
while (index.start < index.limit &&
|
|
|
|
loopCount <= loopLimit &&
|
|
|
|
fData->ruleSet.transliterate(text, index, isIncremental)) {
|
|
|
|
++loopCount;
|
|
|
|
}
|
1999-11-20 00:40:50 +00:00
|
|
|
}
|
2004-06-14 18:54:35 +00:00
|
|
|
if (lockedMutexAtThisLevel) {
|
2015-04-29 23:18:32 +00:00
|
|
|
{
|
|
|
|
Mutex m;
|
|
|
|
gLockedText = NULL;
|
|
|
|
}
|
2019-04-06 00:37:44 +00:00
|
|
|
umtx_unlock(transliteratorDataMutex);
|
2003-12-10 01:52:39 +00:00
|
|
|
}
|
1999-11-20 00:40:50 +00:00
|
|
|
}
|
2001-06-12 17:35:03 +00:00
|
|
|
|
|
|
|
/**
 * Produces the rule source for this transliterator by delegating to the
 * rule set held in fData.
 *
 * @param rulesSource       the string handed to the rule set to receive
 *                          the rules
 * @param escapeUnprintable passed through unchanged to the rule set
 * @return whatever the rule set's toRules() returns
 */
UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource,
                                                UBool escapeUnprintable) const {
    UnicodeString& generated = fData->ruleSet.toRules(rulesSource, escapeUnprintable);
    return generated;
}
|
2001-10-08 23:26:58 +00:00
|
|
|
|
2002-06-28 21:13:54 +00:00
|
|
|
/**
|
|
|
|
* Implement Transliterator framework
|
|
|
|
*/
|
|
|
|
void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const {
|
2003-12-10 01:52:39 +00:00
|
|
|
fData->ruleSet.getSourceTargetSet(result, FALSE);
|
2002-06-28 21:13:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Override Transliterator framework
|
|
|
|
*/
|
|
|
|
UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const {
|
2003-12-10 01:52:39 +00:00
|
|
|
return fData->ruleSet.getSourceTargetSet(result, TRUE);
|
2002-06-28 21:13:54 +00:00
|
|
|
}
|
|
|
|
|
2001-10-08 23:26:58 +00:00
|
|
|
U_NAMESPACE_END
|
|
|
|
|
2002-09-20 01:54:48 +00:00
|
|
|
#endif /* #if !UCONFIG_NO_TRANSLITERATION */
|