1999-11-20 00:40:50 +00:00
|
|
|
/*
|
|
|
|
**********************************************************************
|
|
|
|
* Copyright (C) 1999, International Business Machines
|
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
**********************************************************************
|
|
|
|
* Date Name Description
|
|
|
|
* 11/17/99 aliu Creation.
|
|
|
|
**********************************************************************
|
|
|
|
*/
|
|
|
|
#include "rbt_set.h"
|
|
|
|
#include "rbt_rule.h"
|
1999-12-28 23:57:50 +00:00
|
|
|
#include "unicode/unistr.h"
|
2000-06-30 23:26:07 +00:00
|
|
|
#include "cmemory.h"
|
1999-11-20 00:40:50 +00:00
|
|
|
|
2001-07-11 00:24:58 +00:00
|
|
|
/**
 * Deleter callback for vectors holding TransliterationRule objects.
 * The untyped element pointer is converted back to a
 * TransliterationRule and destroyed.
 *
 * @param rule the element to destroy
 */
static void U_CALLCONV _deleteRule(void *rule) {
    TransliterationRule* doomed = (TransliterationRule*) rule;
    delete doomed;
}
|
1999-11-20 00:40:50 +00:00
|
|
|
|
2001-08-31 03:23:39 +00:00
|
|
|
static void syntaxError(const UnicodeString& r1,
|
|
|
|
const UnicodeString& r2,
|
|
|
|
UParseError& parseError) {
|
|
|
|
parseError.line =0 ;
|
|
|
|
parseError.offset =0;
|
|
|
|
int32_t len1 = r1.length();
|
|
|
|
int32_t len2 = r2.length();
|
|
|
|
// for pre-context
|
|
|
|
int32_t start = (len1<U_PARSE_CONTEXT_LEN) ? 0: (len1 - (U_PARSE_CONTEXT_LEN-1));
|
|
|
|
int32_t stop = len1;
|
|
|
|
|
|
|
|
r1.extract(start,stop-start,parseError.preContext);
|
|
|
|
//null terminate the buffer
|
|
|
|
parseError.preContext[stop-start] = 0;
|
|
|
|
//for post-context
|
|
|
|
start = 0;
|
|
|
|
stop = (len2<U_PARSE_CONTEXT_LEN)? len2 : (U_PARSE_CONTEXT_LEN-1);
|
|
|
|
|
|
|
|
r2.extract(start,stop-start,parseError.postContext);
|
|
|
|
//null terminate the buffer
|
|
|
|
parseError.postContext[stop-start]= 0;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
1999-11-20 00:40:50 +00:00
|
|
|
/**
|
|
|
|
* Construct a new empty rule set.
|
|
|
|
*/
|
2001-08-23 01:06:08 +00:00
|
|
|
TransliterationRuleSet::TransliterationRuleSet(UErrorCode& status) {
|
1999-11-20 00:40:50 +00:00
|
|
|
maxContextLength = 0;
|
2001-08-23 01:06:08 +00:00
|
|
|
ruleVector = new UVector(status);
|
2000-07-13 00:40:31 +00:00
|
|
|
ruleVector->setDeleter(&_deleteRule);
|
2000-01-13 07:28:08 +00:00
|
|
|
rules = NULL;
|
2001-08-23 01:06:08 +00:00
|
|
|
if (ruleVector == NULL) {
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
}
|
2000-01-13 07:28:08 +00:00
|
|
|
}
|
|
|
|
|
2000-06-30 23:26:07 +00:00
|
|
|
/**
|
|
|
|
* Copy constructor. We assume that the ruleset being copied
|
|
|
|
* has already been frozen.
|
|
|
|
*/
|
2000-07-11 18:45:49 +00:00
|
|
|
TransliterationRuleSet::TransliterationRuleSet(const TransliterationRuleSet& other) :
|
2000-07-13 00:40:31 +00:00
|
|
|
ruleVector(0),
|
2000-06-30 23:26:07 +00:00
|
|
|
maxContextLength(other.maxContextLength) {
|
|
|
|
|
|
|
|
uprv_memcpy(index, other.index, sizeof(index));
|
|
|
|
int32_t len = index[256]; // see freeze()
|
|
|
|
rules = new TransliterationRule*[len];
|
|
|
|
for (int32_t i=0; i<len; ++i) {
|
2000-07-11 18:45:49 +00:00
|
|
|
rules[i] = new TransliterationRule(*other.rules[i]);
|
2000-06-30 23:26:07 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2000-01-13 07:28:08 +00:00
|
|
|
/**
|
|
|
|
* Destructor.
|
|
|
|
*/
|
|
|
|
TransliterationRuleSet::~TransliterationRuleSet() {
|
|
|
|
delete ruleVector;
|
|
|
|
delete[] rules;
|
1999-11-20 00:40:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Return the maximum context length.
|
|
|
|
* @return the length of the longest preceding context.
|
|
|
|
*/
|
1999-12-22 22:57:04 +00:00
|
|
|
int32_t TransliterationRuleSet::getMaximumContextLength(void) const {
|
1999-11-20 00:40:50 +00:00
|
|
|
return maxContextLength;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Add a rule to this set. Rules are added in order, and order is
|
2000-07-13 00:40:31 +00:00
|
|
|
* significant. The last call to this method must be followed by
|
|
|
|
* a call to <code>freeze()</code> before the rule set is used.
|
1999-11-20 00:40:50 +00:00
|
|
|
*
|
2000-01-13 07:28:08 +00:00
|
|
|
* @param adoptedRule the rule to add
|
1999-11-20 00:40:50 +00:00
|
|
|
*/
|
|
|
|
void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
|
|
|
|
UErrorCode& status) {
|
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
delete adoptedRule;
|
|
|
|
return;
|
|
|
|
}
|
2001-08-23 01:06:08 +00:00
|
|
|
ruleVector->addElement(adoptedRule, status);
|
1999-11-20 00:40:50 +00:00
|
|
|
|
|
|
|
int32_t len;
|
2001-07-25 19:11:02 +00:00
|
|
|
if ((len = adoptedRule->getContextLength()) > maxContextLength) {
|
1999-11-20 00:40:50 +00:00
|
|
|
maxContextLength = len;
|
|
|
|
}
|
2000-07-13 00:40:31 +00:00
|
|
|
|
|
|
|
delete[] rules; // Contains alias pointers
|
|
|
|
rules = 0;
|
1999-11-20 00:40:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2000-07-13 00:40:31 +00:00
|
|
|
* Check this for masked rules and index it to optimize performance.
|
|
|
|
* The sequence of operations is: (1) add rules to a set using
|
|
|
|
* <code>addRule()</code>; (2) freeze the set using
|
|
|
|
* <code>freeze()</code>; (3) use the rule set. If
|
|
|
|
* <code>addRule()</code> is called after calling this method, it
|
|
|
|
* invalidates this object, and this method must be called again.
|
|
|
|
* That is, <code>freeze()</code> may be called multiple times,
|
|
|
|
* although for optimal performance it shouldn't be.
|
1999-11-20 00:40:50 +00:00
|
|
|
*/
|
2001-08-31 03:23:39 +00:00
|
|
|
void TransliterationRuleSet::freeze(UParseError& parseError,UErrorCode& status) {
|
2000-01-13 07:28:08 +00:00
|
|
|
/* Construct the rule array and index table. We reorder the
|
|
|
|
* rules by sorting them into 256 bins. Each bin contains all
|
|
|
|
* rules matching the index value for that bin. A rule
|
|
|
|
* matches an index value if string whose first key character
|
|
|
|
* has a low byte equal to the index value can match the rule.
|
|
|
|
*
|
|
|
|
* Each bin contains zero or more rules, in the same order
|
|
|
|
* they were found originally. However, the total rules in
|
|
|
|
* the bins may exceed the number in the original vector,
|
|
|
|
* since rules that have a variable as their first key
|
|
|
|
* character will generally fall into more than one bin.
|
|
|
|
*
|
|
|
|
* That is, each bin contains all rules that either have that
|
|
|
|
* first index value as their first key character, or have
|
|
|
|
* a set containing the index value as their first character.
|
|
|
|
*/
|
|
|
|
int32_t n = ruleVector->size();
|
|
|
|
int32_t j;
|
|
|
|
int16_t x;
|
2001-08-23 01:06:08 +00:00
|
|
|
UVector v(status, 2*n); // heuristic; adjust as needed
|
|
|
|
|
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
return;
|
|
|
|
}
|
2000-01-13 07:28:08 +00:00
|
|
|
|
|
|
|
/* Precompute the index values. This saves a LOT of time.
|
|
|
|
*/
|
|
|
|
int16_t* indexValue = new int16_t[n];
|
|
|
|
for (j=0; j<n; ++j) {
|
|
|
|
TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);
|
2001-07-25 19:11:02 +00:00
|
|
|
indexValue[j] = r->getIndexValue();
|
2000-01-13 07:28:08 +00:00
|
|
|
}
|
|
|
|
for (x=0; x<256; ++x) {
|
|
|
|
index[x] = v.size();
|
|
|
|
for (j=0; j<n; ++j) {
|
|
|
|
if (indexValue[j] >= 0) {
|
|
|
|
if (indexValue[j] == x) {
|
2001-08-23 01:06:08 +00:00
|
|
|
v.addElement(ruleVector->elementAt(j), status);
|
2000-01-13 07:28:08 +00:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// If the indexValue is < 0, then the first key character is
|
|
|
|
// a set, and we must use the more time-consuming
|
|
|
|
// matchesIndexValue check. In practice this happens
|
|
|
|
// rarely, so we seldom tread this code path.
|
|
|
|
TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);
|
2001-07-25 19:11:02 +00:00
|
|
|
if (r->matchesIndexValue((uint8_t)x)) {
|
2001-08-23 01:06:08 +00:00
|
|
|
v.addElement(r, status);
|
2000-01-13 07:28:08 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
delete[] indexValue;
|
|
|
|
index[256] = v.size();
|
|
|
|
|
|
|
|
/* Freeze things into an array.
|
|
|
|
*/
|
2000-07-13 00:40:31 +00:00
|
|
|
delete[] rules; // Contains alias pointers
|
2000-01-13 07:28:08 +00:00
|
|
|
rules = new TransliterationRule*[v.size()];
|
|
|
|
for (j=0; j<v.size(); ++j) {
|
|
|
|
rules[j] = (TransliterationRule*) v.elementAt(j);
|
1999-11-20 00:40:50 +00:00
|
|
|
}
|
2000-01-13 07:28:08 +00:00
|
|
|
|
|
|
|
// TODO Add error reporting that indicates the rules that
|
|
|
|
// are being masked.
|
|
|
|
//UnicodeString errors;
|
|
|
|
|
|
|
|
/* Check for masking. This is MUCH faster than our old check,
|
|
|
|
* which was each rule against each following rule, since we
|
|
|
|
* only have to check for masking within each bin now. It's
|
|
|
|
* 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
|
|
|
|
* count, and n2 is the per-bin rule count. But n2<<n1, so
|
|
|
|
* it's a big win.
|
|
|
|
*/
|
|
|
|
for (x=0; x<256; ++x) {
|
|
|
|
for (j=index[x]; j<index[x+1]-1; ++j) {
|
|
|
|
TransliterationRule* r1 = rules[j];
|
|
|
|
for (int32_t k=j+1; k<index[x+1]; ++k) {
|
|
|
|
TransliterationRule* r2 = rules[k];
|
|
|
|
if (r1->masks(*r2)) {
|
|
|
|
//| if (errors == null) {
|
|
|
|
//| errors = new StringBuffer();
|
|
|
|
//| } else {
|
|
|
|
//| errors.append("\n");
|
|
|
|
//| }
|
|
|
|
//| errors.append("Rule " + r1 + " masks " + r2);
|
2001-08-31 03:23:39 +00:00
|
|
|
status = U_RULE_MASK_ERROR;
|
|
|
|
UnicodeString rp1,rp2;
|
|
|
|
syntaxError(r1->getPattern(rp1),r2->getPattern(rp2),parseError);
|
2000-01-13 07:28:08 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
//if (errors != null) {
|
|
|
|
// throw new IllegalArgumentException(errors.toString());
|
|
|
|
//}
|
1999-11-20 00:40:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2001-07-25 19:11:02 +00:00
|
|
|
* Transliterate the given text with the given UTransPosition
|
|
|
|
* indices. Return TRUE if the transliteration should continue
|
|
|
|
* or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
|
|
|
|
* Note that FALSE is only ever returned if isIncremental is TRUE.
|
|
|
|
* @param text the text to be transliterated
|
|
|
|
* @param pos the position indices, which will be updated
|
|
|
|
* @param incremental if TRUE, assume new text may be inserted
|
|
|
|
* at index.limit, and return FALSE if thre is a partial match.
|
|
|
|
* @return TRUE unless a U_PARTIAL_MATCH has been obtained,
|
|
|
|
* indicating that transliteration should stop until more text
|
|
|
|
* arrives.
|
1999-11-20 00:40:50 +00:00
|
|
|
*/
|
2001-07-25 19:11:02 +00:00
|
|
|
UBool TransliterationRuleSet::transliterate(Replaceable& text,
|
|
|
|
UTransPosition& pos,
|
|
|
|
UBool incremental) {
|
|
|
|
int16_t indexByte = (int16_t) (text.char32At(pos.start) & 0xFF);
|
|
|
|
for (int32_t i=index[indexByte]; i<index[indexByte+1]; ++i) {
|
|
|
|
UMatchDegree m = rules[i]->matchAndReplace(text, pos, incremental);
|
|
|
|
switch (m) {
|
|
|
|
case U_MATCH:
|
|
|
|
return TRUE;
|
|
|
|
case U_PARTIAL_MATCH:
|
|
|
|
return FALSE;
|
2001-08-31 03:23:39 +00:00
|
|
|
default: /* Ram: added default to make GCC happy */
|
|
|
|
break;
|
1999-11-20 00:40:50 +00:00
|
|
|
}
|
|
|
|
}
|
2001-07-25 19:11:02 +00:00
|
|
|
// No match or partial match from any rule
|
2001-07-25 21:15:53 +00:00
|
|
|
pos.start += UTF_CHAR_LENGTH(text.char32At(pos.start));
|
2001-07-25 19:11:02 +00:00
|
|
|
return TRUE;
|
1999-11-20 00:40:50 +00:00
|
|
|
}
|
2001-06-12 18:02:16 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Create rule strings that represents this rule set.
|
|
|
|
*/
|
|
|
|
UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource,
|
|
|
|
UBool escapeUnprintable) const {
|
|
|
|
int32_t i;
|
|
|
|
int32_t count = index[256];
|
|
|
|
ruleSource.truncate(0);
|
|
|
|
for (i=0; i<count; ++i) {
|
|
|
|
if (i != 0) {
|
|
|
|
ruleSource.append((UChar) 0x000A /*\n*/);
|
|
|
|
}
|
2001-07-25 19:11:02 +00:00
|
|
|
rules[i]->toRule(ruleSource, escapeUnprintable);
|
2001-06-12 18:02:16 +00:00
|
|
|
}
|
|
|
|
return ruleSource;
|
|
|
|
}
|