3ee0a67db2
X-SVN-Rev: 225
301 lines
12 KiB
C++
301 lines
12 KiB
C++
/*
|
|
* Copyright © {1999}, International Business Machines Corporation and others. All Rights Reserved.
|
|
**********************************************************************
|
|
* Date Name Description
|
|
* 11/17/99 aliu Creation.
|
|
**********************************************************************
|
|
*/
|
|
#ifndef RBT_PARS_H
|
|
#define RBT_PARS_H
|
|
|
|
#include "rbt.h"
|
|
|
|
class TransliterationRuleData;
|
|
|
|
class TransliterationRuleParser {
|
|
|
|
/**
|
|
* This is a reference to external data we don't own. This works because
|
|
* we only hold this for the duration of the call to parse().
|
|
*/
|
|
const UnicodeString& rules;
|
|
|
|
RuleBasedTransliterator::Direction direction;
|
|
|
|
TransliterationRuleData* data;
|
|
|
|
/**
|
|
* We use a single error code during parsing. Rather than pass it
|
|
* through each API, we keep it here.
|
|
*/
|
|
UErrorCode status;
|
|
|
|
/**
|
|
* The next available stand-in for variables. This starts at some point in
|
|
* the private use area (discovered dynamically) and increments up toward
|
|
* <code>variableLimit</code>. At any point during parsing, available
|
|
* variables are <code>variableNext..variableLimit-1</code>.
|
|
*/
|
|
UChar variableNext;
|
|
|
|
/**
|
|
* The last available stand-in for variables. This is discovered
|
|
* dynamically. At any point during parsing, available variables are
|
|
* <code>variableNext..variableLimit-1</code>.
|
|
*/
|
|
UChar variableLimit;
|
|
|
|
// Operators
|
|
static const UChar VARIABLE_DEF_OP;
|
|
static const UChar FORWARD_RULE_OP;
|
|
static const UChar REVERSE_RULE_OP;
|
|
static const char* OPERATORS;
|
|
|
|
|
|
// Other special characters
|
|
static const UChar QUOTE;
|
|
static const UChar VARIABLE_REF_OPEN;
|
|
static const UChar VARIABLE_REF_CLOSE;
|
|
static const UChar CONTEXT_OPEN;
|
|
static const UChar CONTEXT_CLOSE;
|
|
static const UChar CURSOR_POS;
|
|
static const UChar RULE_COMMENT_CHAR;
|
|
|
|
|
|
/**
|
|
* Specials must be quoted in rules to be used as literals.
|
|
* Specials may not occur in variable names.
|
|
*/
|
|
static const char* SPECIALS;
|
|
|
|
/**
|
|
* Specials that must be quoted in variable definitions.
|
|
*/
|
|
static const char* DEF_SPECIALS;
|
|
|
|
public:
|
|
|
|
static TransliterationRuleData*
|
|
parse(const UnicodeString& rules,
|
|
RuleBasedTransliterator::Direction direction);
|
|
|
|
private:
|
|
|
|
/**
|
|
* @param rules list of rules, separated by newline characters
|
|
* @exception IllegalArgumentException if there is a syntax error in the
|
|
* rules
|
|
*/
|
|
TransliterationRuleParser(const UnicodeString& rules,
|
|
RuleBasedTransliterator::Direction direction);
|
|
|
|
/**
|
|
* Parse the given string as a sequence of rules, separated by newline
|
|
* characters ('\n'), and cause this object to implement those rules. Any
|
|
* previous rules are discarded. Typically this method is called exactly
|
|
* once, during construction.
|
|
* @exception IllegalArgumentException if there is a syntax error in the
|
|
* rules
|
|
*/
|
|
void parseRules();
|
|
|
|
/**
|
|
* Parse the given substring as a rule, and append it to the rules currently
|
|
* represented in this object.
|
|
* @param start the beginning index, inclusive; <code>0 <= start
|
|
* <= limit</code>.
|
|
* @param limit the ending index, exclusive; <code>start <= limit
|
|
* <= rules.length()</code>.
|
|
* @exception IllegalArgumentException if there is a syntax error in the
|
|
* rules
|
|
*/
|
|
void applyRule(int32_t start, int32_t limit);
|
|
|
|
/**
|
|
* Add a variable definition.
|
|
* @param name the name of the variable. It must not already be defined.
|
|
* @param pattern the value of the variable. It may be a single character
|
|
* or a pattern describing a character set.
|
|
* @exception IllegalArgumentException if there is a syntax error
|
|
*/
|
|
void applyVariableDef(const UnicodeString& name,
|
|
const UnicodeString& pattern);
|
|
|
|
/**
|
|
* Given a rule, parses it into three pieces: The left side, the right side,
|
|
* and the operator. Returns the operator. Quotes and variable references
|
|
* are resolved; the otuput text in all <code>StringBuffer</code> parameters
|
|
* is literal text. This method delegates to other parsing methods to
|
|
* handle the match pattern, output pattern, and other sub-patterns in the
|
|
* rule.
|
|
* @param start the beginning index, inclusive; <code>0 <= start
|
|
* <= limit</code>.
|
|
* @param limit the ending index, exclusive; <code>start <= limit
|
|
* <= rules.length()</code>.
|
|
* @param left left side of rule is appended to this buffer
|
|
* with the quotes removed and variables resolved
|
|
* @param right right side of rule is appended to this buffer
|
|
* with the quotes removed and variables resolved
|
|
* @param anteContext the preceding context of the match pattern,
|
|
* if there is one, is appended to this buffer
|
|
* @param postContext the following context of the match pattern,
|
|
* if there is one, is appended to this buffer
|
|
* @param cursorPos if there is a cursor in the output pattern, its
|
|
* offset is stored in <code>cursorPos[0]</code>
|
|
* @return The operator character, one of the characters in OPERATORS.
|
|
*/
|
|
UChar parseRule(int32_t start, int32_t limit,
|
|
UnicodeString& left, UnicodeString& right,
|
|
UnicodeString& anteContext,
|
|
UnicodeString& postContext,
|
|
int32_t& cursorPos);
|
|
|
|
/**
|
|
* Parses the match pattern of a forward or reverse rule. Given the raw
|
|
* match pattern, return the match text and the context on both sides, if
|
|
* any. Resolves all quotes and variables.
|
|
* @param start the beginning index, inclusive; <code>0 <= start
|
|
* <= limit</code>.
|
|
* @param limit the ending index, exclusive; <code>start <= limit
|
|
* <= rules.length()</code>.
|
|
* @param text the key to be matched will be appended to this buffer
|
|
* @param anteContext the preceding context, if any, will be appended
|
|
* to this buffer.
|
|
* @param postContext the following context, if any, will be appended
|
|
* to this buffer.
|
|
*/
|
|
void parseMatchPattern(int32_t start, int32_t limit,
|
|
UnicodeString& text,
|
|
UnicodeString& anteContext,
|
|
UnicodeString& postContext);
|
|
|
|
void parseSubPattern(int32_t start, int32_t limit,
|
|
UnicodeString& text);
|
|
|
|
/**
|
|
* Parse a variable definition sub pattern. This kind of sub
|
|
* pattern differs in the set of characters that are considered
|
|
* special. In particular, the '[' and ']' characters are not
|
|
* special, since these are used in UnicodeSet patterns.
|
|
*/
|
|
void parseDefPattern(int32_t start, int32_t limit,
|
|
UnicodeString& text);
|
|
|
|
/**
|
|
* Parses the output pattern of a forward or reverse rule. Given the
|
|
* output pattern, return the output text and the position of the cursor,
|
|
* if any. Resolves all quotes and variables.
|
|
* @param rules the string to be parsed
|
|
* @param start the beginning index, inclusive; <code>0 <= start
|
|
* <= limit</code>.
|
|
* @param limit the ending index, exclusive; <code>start <= limit
|
|
* <= rules.length()</code>.
|
|
* @param text the output text will be appended to this buffer
|
|
* @param cursorPos if this parameter is not null, then cursorPos[0]
|
|
* will be set to the cursor position, or -1 if there is none. If this
|
|
* parameter is null, then cursors will be disallowed.
|
|
*/
|
|
void parseOutputPattern(int32_t start, int32_t limit,
|
|
UnicodeString& text,
|
|
int32_t& cursorPos);
|
|
|
|
/**
|
|
* Parses a sub-pattern of a rule. Return the text and the position of the cursor,
|
|
* if any. Resolves all quotes and variables.
|
|
* @param rules the string to be parsed
|
|
* @param start the beginning index, inclusive; <code>0 <= start
|
|
* <= limit</code>.
|
|
* @param limit the ending index, exclusive; <code>start <= limit
|
|
* <= rules.length()</code>.
|
|
* @param text the output text will be appended to this buffer
|
|
* @param cursorPos if this parameter is not null, then cursorPos[0]
|
|
* will be set to the cursor position, or -1 if there is none. If this
|
|
* parameter is null, then cursors will be disallowed.
|
|
* @param specials characters that must be quoted; typically either
|
|
* SPECIALS or DEF_SPECIALS.
|
|
*/
|
|
void parseSubPattern(int32_t start, int32_t limit,
|
|
UnicodeString& text,
|
|
int32_t* cursorPos,
|
|
const UnicodeString& specials);
|
|
|
|
void validateVariableName(const UnicodeString& name);
|
|
|
|
/**
|
|
* Returns the single character value of the given variable name. Defined
|
|
* names are recognized.
|
|
*
|
|
* NO LONGER SUPPORTED:
|
|
* If a Unicode category name is given, a standard character variable
|
|
* in the range firstCategoryVariable to lastCategoryVariable is returned,
|
|
* with value firstCategoryVariable + n, where n is the category
|
|
* number.
|
|
* @exception IllegalArgumentException if the name is unknown.
|
|
*/
|
|
//$ Character getVariableDef(const UnicodeString& name);
|
|
|
|
/**
|
|
* Determines what part of the private use region of Unicode we can use for
|
|
* variable stand-ins. The correct way to do this is as follows: Parse each
|
|
* rule, and for forward and reverse rules, take the FROM expression, and
|
|
* make a hash of all characters used. The TO expression should be ignored.
|
|
* When done, everything not in the hash is available for use. In practice,
|
|
* this method may employ some other algorithm for improved speed.
|
|
*/
|
|
void determineVariableRange();
|
|
|
|
/**
|
|
* Returns the index of the first character in a set, ignoring quoted text.
|
|
* For example, in the string "abc'hide'h", the 'h' in "hide" will not be
|
|
* found by a search for "h". Unlike String.indexOf(), this method searches
|
|
* not for a single character, but for any character of the string
|
|
* <code>setOfChars</code>.
|
|
* @param text text to be searched
|
|
* @param start the beginning index, inclusive; <code>0 <= start
|
|
* <= limit</code>.
|
|
* @param limit the ending index, exclusive; <code>start <= limit
|
|
* <= text.length()</code>.
|
|
* @param setOfChars string with one or more distinct characters
|
|
* @return Offset of the first character in <code>setOfChars</code>
|
|
* found, or -1 if not found.
|
|
* @see #indexOf
|
|
*/
|
|
static int32_t quotedIndexOf(const UnicodeString& text,
|
|
int32_t start, int32_t limit,
|
|
const UnicodeString& setOfChars);
|
|
|
|
/**
|
|
* Returns the index of the first character in a set. Unlike
|
|
* String.indexOf(), this method searches not for a single character, but
|
|
* for any character of the string <code>setOfChars</code>.
|
|
* @param text text to be searched
|
|
* @param start the beginning index, inclusive; <code>0 <= start
|
|
* <= limit</code>.
|
|
* @param limit the ending index, exclusive; <code>start <= limit
|
|
* <= text.length()</code>.
|
|
* @param setOfChars string with one or more distinct characters
|
|
* @return Offset of the first character in <code>setOfChars</code>
|
|
* found, or -1 if not found.
|
|
* @see #quotedIndexOf
|
|
*/
|
|
static int32_t indexOf(const UnicodeString& text,
|
|
int32_t start, int32_t limit,
|
|
const UnicodeString& setOfChars);
|
|
|
|
/**
|
|
* Returns the index of the first character in a set. Unlike
|
|
* String.indexOf(), this method searches not for a single character, but
|
|
* for any character of the string <code>setOfChars</code>.
|
|
* @param text text to be searched
|
|
* @param setOfChars string with one or more distinct characters
|
|
* @return Offset of the first character in <code>setOfChars</code>
|
|
* found, or -1 if not found.
|
|
* @see #quotedIndexOf
|
|
*/
|
|
static int32_t indexOf(const UnicodeString& text,
|
|
const UnicodeString& setOfChars);
|
|
|
|
};
|
|
|
|
#endif
|