/*
* Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 07/23/01 aliu Creation.
**********************************************************************
*/
|
|
|
|
#ifndef STRMATCH_H
|
|
|
|
#define STRMATCH_H
|
|
|
|
|
|
|
|
#include "unicode/unistr.h"
|
2002-02-07 01:07:55 +00:00
|
|
|
#include "unicode/unifunct.h"
|
2001-07-27 00:18:53 +00:00
|
|
|
#include "unicode/unimatch.h"
|
2002-02-07 01:07:55 +00:00
|
|
|
#include "unicode/unirepl.h"
|
2001-07-27 00:18:53 +00:00
|
|
|
|
2001-10-08 23:26:58 +00:00
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
2001-07-27 00:18:53 +00:00
|
|
|
class TransliterationRuleData;
|
|
|
|
|
|
|
|
/**
|
2002-02-07 01:07:55 +00:00
|
|
|
* An object that matches a fixed input string, implementing the
|
|
|
|
* UnicodeMatcher API. This object also implements the
|
|
|
|
* UnicodeReplacer API, allowing it to emit the matched text as
|
|
|
|
* output. Since the match text may contain flexible match elements,
|
|
|
|
* such as UnicodeSets, the emitted text is not the match pattern, but
|
|
|
|
* instead a substring of the actual matched text. Following
|
|
|
|
* convention, the output text is the leftmost match seen up to this
|
|
|
|
* point.
|
|
|
|
*
|
|
|
|
* A StringMatcher may represent a segment, in which case it has a
|
|
|
|
* positive segment number. This affects how the matcher converts
|
|
|
|
* itself to a pattern but does not otherwise affect its function.
|
|
|
|
*
|
|
|
|
* A StringMatcher that is not a segment should not be used as a
|
|
|
|
* UnicodeReplacer.
|
2001-07-27 00:18:53 +00:00
|
|
|
*/
|
2002-02-07 01:07:55 +00:00
|
|
|
class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
|
2001-07-27 00:18:53 +00:00
|
|
|
|
|
|
|
public:
|
|
|
|
|
2002-02-07 01:07:55 +00:00
|
|
|
/**
|
|
|
|
* Construct a matcher that matches the given pattern string.
|
|
|
|
* @param theString the pattern to be matched, possibly containing
|
|
|
|
* stand-ins that represent nested UnicodeMatcher objects.
|
|
|
|
* @param segmentNum the segment number from 1..n, or 0 if this is
|
|
|
|
* not a segment.
|
|
|
|
* @param theData context object mapping stand-ins to
|
|
|
|
* UnicodeMatcher objects.
|
|
|
|
*/
|
2001-07-27 00:18:53 +00:00
|
|
|
StringMatcher(const UnicodeString& string,
|
|
|
|
int32_t start,
|
|
|
|
int32_t limit,
|
2002-02-07 01:07:55 +00:00
|
|
|
int32_t segmentNum,
|
2001-07-27 00:18:53 +00:00
|
|
|
const TransliterationRuleData& data);
|
|
|
|
|
|
|
|
StringMatcher(const StringMatcher& o);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Destructor
|
|
|
|
*/
|
|
|
|
virtual ~StringMatcher();
|
|
|
|
|
|
|
|
/**
|
2002-02-07 01:07:55 +00:00
|
|
|
* Implement UnicodeFunctor
|
2001-07-27 00:18:53 +00:00
|
|
|
*/
|
2002-02-07 01:07:55 +00:00
|
|
|
virtual UnicodeFunctor* clone() const;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
|
|
|
|
* and return the pointer.
|
|
|
|
*/
|
|
|
|
virtual UnicodeMatcher* toMatcher() const;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
|
|
|
|
* and return the pointer.
|
|
|
|
*/
|
|
|
|
virtual UnicodeReplacer* toReplacer() const;
|
2001-07-27 00:18:53 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Implement UnicodeMatcher
|
|
|
|
*/
|
|
|
|
virtual UMatchDegree matches(const Replaceable& text,
|
|
|
|
int32_t& offset,
|
|
|
|
int32_t limit,
|
2001-10-30 23:55:09 +00:00
|
|
|
UBool incremental);
|
2001-07-27 00:18:53 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Implement UnicodeMatcher
|
|
|
|
*/
|
|
|
|
virtual UnicodeString& toPattern(UnicodeString& result,
|
|
|
|
UBool escapeUnprintable = FALSE) const;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Implement UnicodeMatcher
|
|
|
|
*/
|
|
|
|
virtual UBool matchesIndexValue(uint8_t v) const;
|
|
|
|
|
2002-03-20 00:42:02 +00:00
|
|
|
/**
|
|
|
|
* Implement UnicodeFunctor
|
|
|
|
*/
|
|
|
|
virtual void setData(const TransliterationRuleData*);
|
|
|
|
|
2001-10-30 18:08:53 +00:00
|
|
|
/**
|
2002-02-07 01:07:55 +00:00
|
|
|
* Replace characters in 'text' from 'start' to 'limit' with the
|
|
|
|
* output text of this object. Update the 'cursor' parameter to
|
|
|
|
* give the cursor position and return the length of the
|
|
|
|
* replacement text.
|
|
|
|
*
|
|
|
|
* @param text the text to be matched
|
|
|
|
* @param start inclusive start index of text to be replaced
|
|
|
|
* @param limit exclusive end index of text to be replaced;
|
|
|
|
* must be greater than or equal to start
|
|
|
|
* @param cursor output parameter for the cursor position.
|
|
|
|
* Not all replacer objects will update this, but in a complete
|
|
|
|
* tree of replacer objects, representing the entire output side
|
|
|
|
* of a transliteration rule, at least one must update it.
|
|
|
|
* @return the number of 16-bit code units in the text replacing
|
|
|
|
* the characters at offsets start..(limit-1) in text
|
2001-10-30 18:08:53 +00:00
|
|
|
*/
|
2002-02-07 01:07:55 +00:00
|
|
|
virtual int32_t replace(Replaceable& text,
|
|
|
|
int32_t start,
|
|
|
|
int32_t limit,
|
|
|
|
int32_t& cursor);
|
2001-10-30 18:08:53 +00:00
|
|
|
|
|
|
|
/**
|
2002-02-07 01:07:55 +00:00
|
|
|
* Returns a string representation of this replacer. If the
|
|
|
|
* result of calling this function is passed to the appropriate
|
|
|
|
* parser, typically TransliteratorParser, it will produce another
|
|
|
|
* replacer that is equal to this one.
|
|
|
|
* @param result the string to receive the pattern. Previous
|
|
|
|
* contents will be deleted.
|
|
|
|
* @param escapeUnprintable if TRUE then convert unprintable
|
|
|
|
* character to their hex escape representations, \\uxxxx or
|
|
|
|
* \\Uxxxxxxxx. Unprintable characters are defined by
|
|
|
|
* Utility.isUnprintable().
|
|
|
|
* @return a reference to 'result'.
|
2001-10-30 18:08:53 +00:00
|
|
|
*/
|
2002-02-07 01:07:55 +00:00
|
|
|
virtual UnicodeString& toReplacerPattern(UnicodeString& result,
|
|
|
|
UBool escapeUnprintable) const;
|
2001-10-30 18:08:53 +00:00
|
|
|
|
|
|
|
/**
|
2002-02-07 01:07:55 +00:00
|
|
|
* Remove any match data. This must be called before performing a
|
|
|
|
* set of matches with this segment.
|
2001-10-30 18:08:53 +00:00
|
|
|
*/
|
2002-02-07 01:07:55 +00:00
|
|
|
void resetMatch();
|
2001-10-30 18:08:53 +00:00
|
|
|
|
2001-07-27 00:18:53 +00:00
|
|
|
private:
|
|
|
|
|
2002-02-07 01:07:55 +00:00
|
|
|
/**
|
|
|
|
* The text to be matched.
|
|
|
|
*/
|
2001-07-27 00:18:53 +00:00
|
|
|
UnicodeString pattern;
|
|
|
|
|
2002-02-07 01:07:55 +00:00
|
|
|
/**
|
|
|
|
* Context object that maps stand-ins to matcher and replacer
|
|
|
|
* objects.
|
|
|
|
*/
|
2002-03-20 00:42:02 +00:00
|
|
|
const TransliterationRuleData* data;
|
2001-07-30 23:23:16 +00:00
|
|
|
|
2002-02-07 01:07:55 +00:00
|
|
|
/**
|
|
|
|
* The segment number, 1-based, or 0 if not a segment.
|
|
|
|
*/
|
|
|
|
int32_t segmentNumber;
|
2001-10-30 18:08:53 +00:00
|
|
|
|
2002-02-07 01:07:55 +00:00
|
|
|
/**
|
|
|
|
* Start offset, in the match text, of the <em>rightmost</em>
|
|
|
|
* match.
|
|
|
|
*/
|
2001-10-30 18:08:53 +00:00
|
|
|
int32_t matchStart;
|
|
|
|
|
2002-02-07 01:07:55 +00:00
|
|
|
/**
|
|
|
|
* Limit offset, in the match text, of the <em>rightmost</em>
|
|
|
|
* match.
|
|
|
|
*/
|
2001-10-30 18:08:53 +00:00
|
|
|
int32_t matchLimit;
|
2001-07-27 00:18:53 +00:00
|
|
|
};
|
|
|
|
|
2001-10-08 23:26:58 +00:00
|
|
|
U_NAMESPACE_END
|
|
|
|
|
2001-07-27 00:18:53 +00:00
|
|
|
#endif
|