2001-07-27 00:18:53 +00:00
|
|
|
/*
|
2003-12-19 21:29:27 +00:00
|
|
|
* Copyright (C) 2001-2003, International Business Machines Corporation and others. All Rights Reserved.
|
2001-07-27 00:18:53 +00:00
|
|
|
**********************************************************************
|
|
|
|
* Date Name Description
|
|
|
|
* 07/23/01 aliu Creation.
|
|
|
|
**********************************************************************
|
|
|
|
*/
|
|
|
|
|
2002-09-20 01:54:48 +00:00
|
|
|
#include "unicode/utypes.h"
|
|
|
|
|
|
|
|
#if !UCONFIG_NO_TRANSLITERATION
|
|
|
|
|
2001-07-27 00:18:53 +00:00
|
|
|
#include "strmatch.h"
|
|
|
|
#include "rbt_data.h"
|
2002-02-07 01:07:55 +00:00
|
|
|
#include "util.h"
|
2002-06-28 21:13:54 +00:00
|
|
|
#include "unicode/uniset.h"
|
2001-07-27 00:18:53 +00:00
|
|
|
|
2001-10-08 23:26:58 +00:00
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
2002-02-07 01:07:55 +00:00
|
|
|
// Zero-length, NUL-terminated UChar string. StringMatcher::replace() passes
// it to handleReplaceBetween() as the replacement text in order to delete
// the original range.
const UChar EMPTY[] = { 0 }; // empty string: ""
|
|
|
|
|
2003-08-31 20:53:46 +00:00
|
|
|
// RTTI boilerplate for StringMatcher. The macro is defined outside this file;
// NOTE(review): presumably it supplies the class-ID accessors -- confirm
// against the macro's definition.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
|
2002-06-29 00:04:16 +00:00
|
|
|
|
2001-07-27 00:18:53 +00:00
|
|
|
/**
 * Construct a matcher whose pattern is a substring of theString.
 *
 * @param theString   string holding the pattern text
 * @param start       first index of the pattern within theString
 * @param limit       index just past the pattern within theString
 * @param segmentNum  segment number recorded for this matcher; toPattern()
 *                    wraps the pattern in parentheses when it is > 0
 * @param theData     rule data used to look up sub-matchers; only its
 *                    address is stored, no copy is made
 *
 * The recorded match range starts out unset (matchStart == matchLimit == -1).
 */
StringMatcher::StringMatcher(const UnicodeString& theString,
                             int32_t start,
                             int32_t limit,
                             int32_t segmentNum,
                             const TransliterationRuleData& theData)
    : data(&theData),
      segmentNumber(segmentNum),
      matchStart(-1),
      matchLimit(-1)
{
    // Fill the (default-constructed) pattern from theString[start, limit).
    theString.extractBetween(start, limit, pattern);
}
|
|
|
|
|
|
|
|
/**
 * Copy constructor.
 *
 * Copies the pattern, the segment number and the currently recorded match
 * range from the other matcher. The rule-data pointer is copied as a
 * pointer, so both objects refer to the same TransliterationRuleData.
 *
 * @param o  the matcher to copy
 */
StringMatcher::StringMatcher(const StringMatcher& o)
    : UnicodeFunctor(o),
      UnicodeMatcher(o),
      UnicodeReplacer(o),
      pattern(o.pattern),
      data(o.data),
      segmentNumber(o.segmentNumber),
      matchStart(o.matchStart),
      matchLimit(o.matchLimit)
{
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Destructor
|
|
|
|
*/
|
|
|
|
StringMatcher::~StringMatcher() {
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2002-02-07 01:07:55 +00:00
|
|
|
* Implement UnicodeFunctor
|
2001-07-27 00:18:53 +00:00
|
|
|
*/
|
2002-02-07 01:07:55 +00:00
|
|
|
UnicodeFunctor* StringMatcher::clone() const {
|
2001-07-27 00:18:53 +00:00
|
|
|
return new StringMatcher(*this);
|
|
|
|
}
|
|
|
|
|
2002-02-07 01:07:55 +00:00
|
|
|
/**
|
|
|
|
* UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
|
|
|
|
* and return the pointer.
|
|
|
|
*/
|
|
|
|
UnicodeMatcher* StringMatcher::toMatcher() const {
|
|
|
|
return (UnicodeMatcher*) this;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
|
|
|
|
* and return the pointer.
|
|
|
|
*/
|
|
|
|
UnicodeReplacer* StringMatcher::toReplacer() const {
|
|
|
|
return (UnicodeReplacer*) this;
|
|
|
|
}
|
|
|
|
|
2001-07-27 00:18:53 +00:00
|
|
|
/**
|
|
|
|
* Implement UnicodeMatcher
|
|
|
|
*/
|
|
|
|
UMatchDegree StringMatcher::matches(const Replaceable& text,
|
|
|
|
int32_t& offset,
|
|
|
|
int32_t limit,
|
2001-10-30 23:55:09 +00:00
|
|
|
UBool incremental) {
|
2001-07-27 00:18:53 +00:00
|
|
|
int32_t i;
|
|
|
|
int32_t cursor = offset;
|
|
|
|
if (limit < cursor) {
|
2001-10-30 18:08:53 +00:00
|
|
|
// Match in the reverse direction
|
2001-07-27 00:18:53 +00:00
|
|
|
for (i=pattern.length()-1; i>=0; --i) {
|
|
|
|
UChar keyChar = pattern.charAt(i);
|
2002-03-20 00:42:02 +00:00
|
|
|
UnicodeMatcher* subm = data->lookupMatcher(keyChar);
|
2001-07-27 00:18:53 +00:00
|
|
|
if (subm == 0) {
|
2001-12-11 17:45:13 +00:00
|
|
|
if (cursor > limit &&
|
2001-07-27 00:18:53 +00:00
|
|
|
keyChar == text.charAt(cursor)) {
|
|
|
|
--cursor;
|
|
|
|
} else {
|
|
|
|
return U_MISMATCH;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
UMatchDegree m =
|
|
|
|
subm->matches(text, cursor, limit, incremental);
|
|
|
|
if (m != U_MATCH) {
|
|
|
|
return m;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2001-10-30 18:08:53 +00:00
|
|
|
// Record the match position, but adjust for a normal
|
|
|
|
// forward start, limit, and only if a prior match does not
|
|
|
|
// exist -- we want the rightmost match.
|
|
|
|
if (matchStart < 0) {
|
2001-10-30 23:55:09 +00:00
|
|
|
matchStart = cursor+1;
|
|
|
|
matchLimit = offset+1;
|
2001-10-30 18:08:53 +00:00
|
|
|
}
|
2001-07-27 00:18:53 +00:00
|
|
|
} else {
|
|
|
|
for (i=0; i<pattern.length(); ++i) {
|
|
|
|
if (incremental && cursor == limit) {
|
|
|
|
// We've reached the context limit without a mismatch and
|
|
|
|
// without completing our match.
|
|
|
|
return U_PARTIAL_MATCH;
|
|
|
|
}
|
|
|
|
UChar keyChar = pattern.charAt(i);
|
2002-03-20 00:42:02 +00:00
|
|
|
UnicodeMatcher* subm = data->lookupMatcher(keyChar);
|
2001-07-27 00:18:53 +00:00
|
|
|
if (subm == 0) {
|
|
|
|
// Don't need the cursor < limit check if
|
|
|
|
// incremental is TRUE (because it's done above); do need
|
|
|
|
// it otherwise.
|
|
|
|
if (cursor < limit &&
|
|
|
|
keyChar == text.charAt(cursor)) {
|
|
|
|
++cursor;
|
|
|
|
} else {
|
|
|
|
return U_MISMATCH;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
UMatchDegree m =
|
|
|
|
subm->matches(text, cursor, limit, incremental);
|
|
|
|
if (m != U_MATCH) {
|
|
|
|
return m;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2001-10-30 18:08:53 +00:00
|
|
|
// Record the match position
|
2001-10-30 23:55:09 +00:00
|
|
|
matchStart = offset;
|
|
|
|
matchLimit = cursor;
|
2001-07-27 00:18:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
offset = cursor;
|
|
|
|
return U_MATCH;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Implement UnicodeMatcher
|
|
|
|
*/
|
|
|
|
UnicodeString& StringMatcher::toPattern(UnicodeString& result,
|
2002-02-28 01:42:40 +00:00
|
|
|
UBool escapeUnprintable) const
|
|
|
|
{
|
|
|
|
result.truncate(0);
|
2001-07-30 23:23:16 +00:00
|
|
|
UnicodeString str, quoteBuf;
|
2002-02-07 01:07:55 +00:00
|
|
|
if (segmentNumber > 0) {
|
2001-07-30 23:23:16 +00:00
|
|
|
result.append((UChar)40); /*(*/
|
|
|
|
}
|
2001-07-27 00:18:53 +00:00
|
|
|
for (int32_t i=0; i<pattern.length(); ++i) {
|
2001-07-30 23:23:16 +00:00
|
|
|
UChar keyChar = pattern.charAt(i);
|
2002-03-20 00:42:02 +00:00
|
|
|
const UnicodeMatcher* m = data->lookupMatcher(keyChar);
|
2001-07-30 23:23:16 +00:00
|
|
|
if (m == 0) {
|
2002-02-07 01:07:55 +00:00
|
|
|
ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
|
2001-07-30 23:23:16 +00:00
|
|
|
} else {
|
2002-02-07 01:07:55 +00:00
|
|
|
ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
|
2001-07-30 23:23:16 +00:00
|
|
|
TRUE, escapeUnprintable, quoteBuf);
|
|
|
|
}
|
|
|
|
}
|
2002-02-07 01:07:55 +00:00
|
|
|
if (segmentNumber > 0) {
|
2001-07-30 23:23:16 +00:00
|
|
|
result.append((UChar)41); /*)*/
|
2001-07-27 00:18:53 +00:00
|
|
|
}
|
2001-07-30 23:23:16 +00:00
|
|
|
// Flush quoteBuf out to result
|
2002-02-07 01:07:55 +00:00
|
|
|
ICU_Utility::appendToRule(result, -1,
|
|
|
|
TRUE, escapeUnprintable, quoteBuf);
|
2001-07-27 00:18:53 +00:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Implement UnicodeMatcher
|
|
|
|
*/
|
|
|
|
UBool StringMatcher::matchesIndexValue(uint8_t v) const {
|
|
|
|
if (pattern.length() == 0) {
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
UChar32 c = pattern.char32At(0);
|
2002-03-20 00:42:02 +00:00
|
|
|
const UnicodeMatcher *m = data->lookupMatcher(c);
|
2001-07-27 00:18:53 +00:00
|
|
|
return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
|
|
|
|
}
|
|
|
|
|
2002-06-28 21:13:54 +00:00
|
|
|
/**
|
|
|
|
* Implement UnicodeMatcher
|
|
|
|
*/
|
|
|
|
void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
|
|
|
|
UChar32 ch;
|
|
|
|
for (int32_t i=0; i<pattern.length(); i+=UTF_CHAR_LENGTH(ch)) {
|
2003-08-31 20:53:46 +00:00
|
|
|
ch = pattern.char32At(i);
|
|
|
|
const UnicodeMatcher* matcher = data->lookupMatcher(ch);
|
|
|
|
if (matcher == NULL) {
|
|
|
|
toUnionTo.add(ch);
|
|
|
|
} else {
|
|
|
|
matcher->addMatchSetTo(toUnionTo);
|
|
|
|
}
|
2002-06-28 21:13:54 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2001-10-30 18:08:53 +00:00
|
|
|
/**
|
2002-02-07 01:07:55 +00:00
|
|
|
* UnicodeReplacer API
|
2001-10-30 18:08:53 +00:00
|
|
|
*/
|
2002-02-07 01:07:55 +00:00
|
|
|
int32_t StringMatcher::replace(Replaceable& text,
|
|
|
|
int32_t start,
|
|
|
|
int32_t limit,
|
|
|
|
int32_t& cursor) {
|
|
|
|
|
|
|
|
int32_t outLen = 0;
|
|
|
|
|
|
|
|
// Copy segment with out-of-band data
|
|
|
|
int32_t dest = limit;
|
|
|
|
// If there was no match, that means that a quantifier
|
|
|
|
// matched zero-length. E.g., x (a)* y matched "xy".
|
|
|
|
if (matchStart >= 0) {
|
|
|
|
if (matchStart != matchLimit) {
|
|
|
|
text.copy(matchStart, matchLimit, dest);
|
|
|
|
outLen = matchLimit - matchStart;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
text.handleReplaceBetween(start, limit, EMPTY); // delete original text
|
|
|
|
|
|
|
|
return outLen;
|
2001-10-30 18:08:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2002-02-07 01:07:55 +00:00
|
|
|
* UnicodeReplacer API
|
2001-10-30 18:08:53 +00:00
|
|
|
*/
|
2002-02-07 01:07:55 +00:00
|
|
|
UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
|
|
|
|
UBool escapeUnprintable) const {
|
|
|
|
// assert(segmentNumber > 0);
|
|
|
|
rule.truncate(0);
|
|
|
|
rule.append((UChar)0x0024 /*$*/);
|
|
|
|
ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
|
|
|
|
return rule;
|
2001-10-30 18:08:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2002-03-20 00:42:02 +00:00
|
|
|
* Remove any match info. This must be called before performing a
|
2002-02-07 01:07:55 +00:00
|
|
|
* set of matches with this segment.
|
2001-10-30 18:08:53 +00:00
|
|
|
*/
|
2002-02-07 01:07:55 +00:00
|
|
|
void StringMatcher::resetMatch() {
|
|
|
|
matchStart = matchLimit = -1;
|
2001-10-30 18:08:53 +00:00
|
|
|
}
|
|
|
|
|
2002-06-28 21:13:54 +00:00
|
|
|
/**
|
|
|
|
* Union the set of all characters that may output by this object
|
|
|
|
* into the given set.
|
|
|
|
* @param toUnionTo the set into which to union the output characters
|
|
|
|
*/
|
2003-12-01 05:33:41 +00:00
|
|
|
void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
|
2002-06-28 21:13:54 +00:00
|
|
|
// The output of this replacer varies; it is the source text between
|
|
|
|
// matchStart and matchLimit. Since this varies depending on the
|
|
|
|
// input text, we can't compute it here. We can either do nothing
|
|
|
|
// or we can add ALL characters to the set. It's probably more useful
|
|
|
|
// to do nothing.
|
|
|
|
}
|
|
|
|
|
2002-03-20 00:42:02 +00:00
|
|
|
/**
|
|
|
|
* Implement UnicodeFunctor
|
|
|
|
*/
|
|
|
|
void StringMatcher::setData(const TransliterationRuleData* d) {
|
|
|
|
data = d;
|
|
|
|
int32_t i = 0;
|
|
|
|
while (i<pattern.length()) {
|
|
|
|
UChar32 c = pattern.char32At(i);
|
|
|
|
UnicodeFunctor* f = data->lookup(c);
|
|
|
|
if (f != NULL) {
|
|
|
|
f->setData(data);
|
|
|
|
}
|
|
|
|
i += UTF_CHAR_LENGTH(c);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2001-10-08 23:26:58 +00:00
|
|
|
U_NAMESPACE_END
|
|
|
|
|
2002-09-20 01:54:48 +00:00
|
|
|
#endif /* #if !UCONFIG_NO_TRANSLITERATION */
|
|
|
|
|
2001-07-27 00:18:53 +00:00
|
|
|
//eof
|