/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#include "rbt_set.h"
#include "rbt_rule.h"
#include "unicode/unistr.h"
/* Note: There was an old implementation that indexed by first letter of
* key. Problem with this is that key may not have a meaningful first
* letter; e.g., {Lu}>*. One solution is to keep a separate vector of all
* rules whose initial key letter is a category variable. However, the
* problem is that they must be kept in order with respect to other rules.
* One solution -- add a sequence number to each rule. Do the usual
* first-letter lookup, and also a lookup from the spare bin with rules like
* {Lu}>*. Take the lower sequence number. This seems complex and not
* worth the trouble, but we may revisit this later. For documentation (or
* possible resurrection) the old code is included below, commented out
* with the remark "// OLD INDEXED IMPLEMENTATION". Under the old
* implementation, rules is a Hashtable, not a Vector.
*/
/**
 * Construct a new, empty rule set.
 *
 * The set starts out with a maximum context length of zero, a freshly
 * allocated vector in which added rules are collected, and no rule
 * array (rules is NULL).
 */
TransliterationRuleSet::TransliterationRuleSet() {
    // No rule array exists yet.
    rules = NULL;
    // Rules handed to addRule() are collected here.
    ruleVector = new UVector();
    // No rule has been added, so there is no context to account for.
    maxContextLength = 0;
}
/**
 * Destructor. Releases the rule vector and the rule array. Deleting a
 * NULL pointer is a no-op, so either member may be NULL here.
 */
TransliterationRuleSet::~TransliterationRuleSet() {
    delete this->ruleVector;
    delete[] this->rules;
}
/**
 * Report the maximum context length recorded for this rule set.
 * The value starts at zero and is raised by addRule() whenever an added
 * rule has a longer ante context.
 * @return the length of the longest preceding context.
 */
int32_t TransliterationRuleSet::getMaximumContextLength(void) const {
    return this->maxContextLength;
}
/**
 * Add a rule to this set, taking ownership of it. Rules are added in
 * order, and order is significant.
 *
 * Once freeze() is called, this method must not be called.
 *
 * The rule is deleted without being added when status already indicates
 * a failure on entry, or when the rule vector is no longer available
 * (ruleVector is NULL). In the latter case status is set to
 * U_ILLEGAL_ARGUMENT_ERROR.
 *
 * @param adoptedRule the rule to add; this set adopts it
 * @param status in/out error code; nothing is added if it already
 *               indicates a failure
 */
void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
                                     UErrorCode& status) {
    if (U_FAILURE(status) || ruleVector == NULL) {
        if (!U_FAILURE(status)) {
            // Cannot add rules after freezing. An incoming failure code
            // is left untouched; only a clean status is overwritten.
            status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        // The rule was adopted, so it must not leak on the error paths.
        delete adoptedRule;
        return;
    }

    ruleVector->addElement(adoptedRule);

    // Track the longest ante context seen across all added rules.
    const int32_t anteLength = adoptedRule->getAnteContextLength();
    if (maxContextLength < anteLength) {
        maxContextLength = anteLength;
    }
}
/**
* Close this rule set to further additions, check it for masked rules,
* and index it to optimize performance. Once this method is called,
* addRule() can no longer be called.
* @exception IllegalArgumentException if some rules are masked
*/
void TransliterationRuleSet::freeze(const TransliterationRuleData& data,
UErrorCode& status) {
if (U_FAILURE(status)) {
return;
}
/* Construct the rule array and index table. We reorder the
* rules by sorting them into 256 bins. Each bin contains all
* rules matching the index value for that bin. A rule
* matches an index value if a string whose first key character
* has a low byte equal to the index value can match the rule.
*
* Each bin contains zero or more rules, in the same order
* they were found originally. However, the total rules in
* the bins may exceed the number in the original vector,
* since rules that have a variable as their first key
* character will generally fall into more than one bin.
*
* That is, each bin contains all rules that either have that
* first index value as their first key character, or have
* a set containing the index value as their first character.
*/
int32_t n = ruleVector->size();
int32_t j;
int16_t x;
UVector v(2*n); // heuristic; adjust as needed
/* Precompute the index values. This saves a LOT of time.
*/
int16_t* indexValue = new int16_t[n];
for (j=0; j0 <= start
* <= limit.
* @param limit the ending index, exclusive; start <= limit
* <= text.length().
* @param cursor position at which to translate next, representing offset
* into text. This value must be between start and
* limit.
* @param data a dictionary mapping variables to the sets they
* represent (maps Character to UnicodeSet)
* @param filter the filter. Any character for which
* filter.contains() returns false will not be
* altered by this transliterator. If filter is
* null then no filtering is applied.
* @return the matching rule, or null if none found.
*/
TransliterationRule*
TransliterationRuleSet::findMatch(const Replaceable& text,
int32_t start, int32_t limit,
int32_t cursor,
const TransliterationRuleData& data,
const UnicodeFilter* filter) const {
/* We only need to check our indexed bin of the rule table,
* based on the low byte of the first key character.
*/
int16_t x = text.charAt(cursor) & 0xFF;
for (int32_t i=index[x]; ifindMatch()
, this method does an incremental match.
* An incremental match requires that there be no partial matches that might
* pre-empt the full match that is found. If there are partial matches,
* then null is returned. A non-null result indicates that a full match has
* been found, and that it cannot be pre-empted by a partial match
* regardless of what additional text is added to the translation buffer.
* @param text the text, both translated and untranslated
* @param start the beginning index, inclusive; 0 <= start
* <= limit.
* @param limit the ending index, exclusive; start <= limit
* <= text.length().
* @param cursor position at which to translate next, representing offset
* into text. This value must be between start and
* limit.
* @param data a dictionary mapping variables to the sets they
* represent (maps Character to UnicodeSet)
* @param isPartial output parameter. isPartial is set to
* true if a partial match is returned.
* @param filter the filter. Any character for which
* filter.contains() returns false will not be
* altered by this transliterator. If filter is
* null then no filtering is applied.
* @return the matching rule, or null if none found, or if the text buffer
* does not have enough text yet to unambiguously match a rule.
*/
TransliterationRule*
TransliterationRuleSet::findIncrementalMatch(const Replaceable& text,
int32_t start,
int32_t limit, int32_t cursor,
const TransliterationRuleData& data,
bool_t& isPartial,
const UnicodeFilter* filter) const {
/* We only need to check our indexed bin of the rule table,
* based on the low byte of the first key character.
*/
isPartial = FALSE;
int16_t x = text.charAt(cursor) & 0xFF;
for (int32_t i=index[x]; i