1999-11-20 00:40:50 +00:00
|
|
|
|
/*
|
1999-11-22 21:47:27 +00:00
|
|
|
|
* Copyright (C) {1999}, International Business Machines Corporation and others. All Rights Reserved.
|
1999-11-20 00:40:50 +00:00
|
|
|
|
**********************************************************************
|
|
|
|
|
* Date Name Description
|
|
|
|
|
* 11/17/99 aliu Creation.
|
|
|
|
|
**********************************************************************
|
|
|
|
|
*/
|
|
|
|
|
#ifndef RBT_PARS_H
|
|
|
|
|
#define RBT_PARS_H
|
|
|
|
|
|
1999-12-28 23:57:50 +00:00
|
|
|
|
#include "unicode/rbt.h"
|
2000-02-08 02:49:15 +00:00
|
|
|
|
#include "uvector.h"
|
2000-03-18 01:42:45 +00:00
|
|
|
|
#include "unicode/parseerr.h"
|
1999-11-20 00:40:50 +00:00
|
|
|
|
|
|
|
|
|
class TransliterationRuleData;
|
2000-01-13 07:28:08 +00:00
|
|
|
|
class UnicodeSet;
|
2000-02-08 02:49:15 +00:00
|
|
|
|
class ParseData;
|
1999-11-20 00:40:50 +00:00
|
|
|
|
|
|
|
|
|
/**
* Parser that turns a transliteration rule string into a
* TransliterationRuleData object. The only public member is the static
* parse() method; the constructor and destructor are private, so instances
* can only be created by the class's own members.
* NOTE(review): presumably parse() builds a short-lived parser internally --
* confirm in the implementation file.
*/
class TransliterationRuleParser {
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Reference to the caller's rule string, which we do not own. This works because
|
|
|
|
|
* we only hold this for the duration of the call to parse().
|
|
|
|
|
*/
|
|
|
|
|
const UnicodeString& rules;
|
|
|
|
|
|
|
|
|
|
/**
* Direction associated with this parse.
* NOTE(review): presumably the direction argument given to parse() and the
* constructor, selecting which rules are built -- confirm in the
* implementation.
*/
RuleBasedTransliterator::Direction direction;
|
|
|
|
|
|
|
|
|
|
/**
* Rule data associated with this parse.
* NOTE(review): presumably the object that is filled in while parsing and
* returned by parse(); ownership is not visible from this header -- confirm.
*/
TransliterationRuleData* data;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* We use a single error code during parsing. Rather than pass it
|
|
|
|
|
* through each API, we keep it here.
|
|
|
|
|
*/
|
|
|
|
|
UErrorCode status;
|
|
|
|
|
|
2000-03-18 01:42:45 +00:00
|
|
|
|
/**
|
|
|
|
|
* Pointer to user structure in which to return parse error information.
|
|
|
|
|
* May be NULL.
|
|
|
|
|
*/
|
|
|
|
|
ParseError* parseError;
|
|
|
|
|
|
2000-02-08 02:49:15 +00:00
|
|
|
|
/**
|
|
|
|
|
* Temporary symbol table used during parsing.
|
|
|
|
|
*/
|
|
|
|
|
ParseData* parseData;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Temporary vector of set variables. When parsing is complete, this
|
|
|
|
|
* is copied into the array data.setVariables. As with data.setVariables,
|
|
|
|
|
* element 0 corresponds to character data.setVariablesBase.
|
|
|
|
|
*/
|
|
|
|
|
UVector setVariablesVector;
|
|
|
|
|
|
1999-11-20 00:40:50 +00:00
|
|
|
|
/**
|
|
|
|
|
* The next available stand-in for variables. This starts at some point in
|
|
|
|
|
* the private use area (discovered dynamically) and increments up toward
|
|
|
|
|
* <code>variableLimit</code>. At any point during parsing, available
|
|
|
|
|
* variables are <code>variableNext..variableLimit-1</code>.
|
|
|
|
|
*/
|
|
|
|
|
UChar variableNext;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* The limit of the stand-in range for variables. This is discovered
|
|
|
|
|
* dynamically. At any point during parsing, available variables are
|
|
|
|
|
* <code>variableNext..variableLimit-1</code>, so per that range this value
* itself is exclusive (one past the last usable stand-in).
|
|
|
|
|
*/
|
|
|
|
|
UChar variableLimit;
|
|
|
|
|
|
|
|
|
|
// Operators (only declared here; their values are defined outside this header)
|
|
|
|
|
static const UChar VARIABLE_DEF_OP;
|
|
|
|
|
static const UChar FORWARD_RULE_OP;
|
|
|
|
|
static const UChar REVERSE_RULE_OP;
|
2000-01-13 07:28:08 +00:00
|
|
|
|
static const UChar FWDREV_RULE_OP; // internal rep of <> op
|
|
|
|
|
static const UnicodeString OPERATORS;
|
1999-11-20 00:40:50 +00:00
|
|
|
|
|
|
|
|
|
// Other special characters (only declared here; values are defined elsewhere)
|
|
|
|
|
static const UChar QUOTE;
|
2000-01-13 07:28:08 +00:00
|
|
|
|
static const UChar ESCAPE;
|
|
|
|
|
static const UChar END_OF_RULE;
|
|
|
|
|
static const UChar RULE_COMMENT_CHAR;
|
1999-11-20 00:40:50 +00:00
|
|
|
|
static const UChar VARIABLE_REF_OPEN;
|
|
|
|
|
static const UChar VARIABLE_REF_CLOSE;
|
|
|
|
|
static const UChar CONTEXT_OPEN;
|
|
|
|
|
static const UChar CONTEXT_CLOSE;
|
2000-01-13 07:28:08 +00:00
|
|
|
|
static const UChar SET_OPEN;
|
|
|
|
|
static const UChar SET_CLOSE;
|
1999-11-20 00:40:50 +00:00
|
|
|
|
static const UChar CURSOR_POS;
|
|
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
|
|
/**
* Public entry point: parse a rule string and return the resulting rule data.
* @param rules the rule string to parse
* @param direction the direction for which to parse the rules
* @param parseError optional structure in which to return parse error
* information; defaults to 0 (none)
* @return pointer to a TransliterationRuleData.
* NOTE(review): ownership of the result and the value returned on failure
* are not visible from this header -- confirm in the implementation.
*/
static TransliterationRuleData*
|
|
|
|
|
parse(const UnicodeString& rules,
|
2000-03-18 01:42:45 +00:00
|
|
|
|
RuleBasedTransliterator::Direction direction,
|
|
|
|
|
ParseError* parseError = 0);
|
1999-11-20 00:40:50 +00:00
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param rules list of rules, separated by newline characters
|
|
|
|
|
* @param direction the direction for which to parse the rules
* @param parseError optional structure for parse error information; may be 0
|
|
|
|
|
* NOTE(review): syntax errors are presumably reported through the
* <code>status</code> member (and <code>parseError</code>) rather than by
* throwing; Java-style IllegalArgumentException wording does not fit a
* UErrorCode-based design -- confirm.
|
|
|
|
|
*/
|
|
|
|
|
TransliterationRuleParser(const UnicodeString& rules,
|
2000-03-18 01:42:45 +00:00
|
|
|
|
RuleBasedTransliterator::Direction direction,
|
|
|
|
|
ParseError* parseError = 0);
|
1999-11-20 00:40:50 +00:00
|
|
|
|
|
|
2000-02-08 02:49:15 +00:00
|
|
|
|
/**
|
|
|
|
|
* Destructor.
|
|
|
|
|
*/
|
|
|
|
|
~TransliterationRuleParser();
|
|
|
|
|
|
1999-11-20 00:40:50 +00:00
|
|
|
|
/**
|
|
|
|
|
* Parse the rule string as a sequence of rules, separated by newline
|
|
|
|
|
* characters ('\n'), and cause this object to implement those rules. Any
|
|
|
|
|
* previous rules are discarded. Typically this method is called exactly
|
|
|
|
|
* once, during construction.
|
|
|
|
|
* This method takes no arguments; presumably it reads the
* <code>rules</code> member -- confirm.
|
|
|
|
|
* NOTE(review): syntax errors are presumably recorded in <code>status</code>
* rather than thrown (no exception is declared here) -- confirm.
|
|
|
|
|
*/
|
1999-12-22 22:57:04 +00:00
|
|
|
|
void parseRules(void);
|
1999-11-20 00:40:50 +00:00
|
|
|
|
|
|
|
|
|
/**
|
2000-01-13 07:28:08 +00:00
|
|
|
|
* MAIN PARSER. Parse the next rule in the rule string, starting
|
|
|
|
|
* at pos. Return the index after the last character parsed. Do not
|
|
|
|
|
* parse characters at or after limit.
|
|
|
|
|
*
|
|
|
|
|
* Important: The character at pos must be a non-whitespace character
|
|
|
|
|
* that is not the comment character.
|
|
|
|
|
*
|
|
|
|
|
* This method handles quoting, escaping, and whitespace removal. It
|
|
|
|
|
* parses the end-of-rule character. It recognizes context and cursor
|
|
|
|
|
* indicators. Once it does a lexical breakdown of the rule at pos, it
|
|
|
|
|
* creates a rule object and adds it to our rule list.
* @param pos index of the first character of the rule to parse
* @param limit index at or after which characters must not be parsed
* @return the index after the last character parsed
|
1999-11-20 00:40:50 +00:00
|
|
|
|
*/
|
2000-01-13 07:28:08 +00:00
|
|
|
|
int32_t parseRule(int32_t pos, int32_t limit);
|
1999-11-20 00:40:50 +00:00
|
|
|
|
|
|
|
|
|
/**
|
2000-01-13 07:28:08 +00:00
|
|
|
|
* Called by main parser upon syntax error. Search the rule string
|
|
|
|
|
* for the probable end of the rule. Of course, if the error is that
|
|
|
|
|
* the end of rule marker is missing, then the rule end will not be found.
|
|
|
|
|
* In any case the rule start will be correctly reported.
|
|
|
|
|
* @param parseErrorCode code identifying the syntax error
|
|
|
|
|
* @param (second argument, unnamed) NOTE(review): presumably the rule
* string being parsed -- confirm
|
|
|
|
|
* @param start position of first character of current rule
* @return NOTE(review): the meaning of the returned index is not visible
* from this header; presumably the position at which parsing should
* resume -- confirm
|
1999-11-20 00:40:50 +00:00
|
|
|
|
*/
|
2000-03-18 01:42:45 +00:00
|
|
|
|
int32_t syntaxError(int32_t parseErrorCode, const UnicodeString&, int32_t start);
|
1999-11-20 00:40:50 +00:00
|
|
|
|
|
|
|
|
|
/**
|
2000-01-13 07:28:08 +00:00
|
|
|
|
* Allocate a private-use substitution character for the given set,
|
|
|
|
|
* register the set under that character, and return the substitution
|
|
|
|
|
* character.
* NOTE(review): the set is presumably stored in
* <code>setVariablesVector</code> (a UVector, not a hash), and the
* parameter name suggests this object takes ownership of
* <code>adoptedSet</code> -- confirm both.
* @param adoptedSet the set to register
* @return the substitution character allocated for the set
|
1999-11-20 00:40:50 +00:00
|
|
|
|
*/
|
2000-01-13 07:28:08 +00:00
|
|
|
|
UChar registerSet(UnicodeSet* adoptedSet);
|
|
|
|
|
|
1999-11-20 00:40:50 +00:00
|
|
|
|
/**
|
|
|
|
|
* Determines what part of the private use region of Unicode we can use for
|
|
|
|
|
* variable stand-ins. The correct way to do this is as follows: Parse each
|
|
|
|
|
* rule, and for forward and reverse rules, take the FROM expression, and
|
|
|
|
|
* make a hash of all characters used. The TO expression should be ignored.
|
|
|
|
|
* When done, everything not in the hash is available for use. In practice,
|
|
|
|
|
* this method may employ some other algorithm for improved speed.
* NOTE(review): presumably this sets <code>variableNext</code> and
* <code>variableLimit</code> -- confirm in the implementation.
|
|
|
|
|
*/
|
1999-12-22 22:57:04 +00:00
|
|
|
|
void determineVariableRange(void);
|
1999-11-20 00:40:50 +00:00
|
|
|
|
|
|
|
|
|
/**
|
2000-03-18 01:42:45 +00:00
|
|
|
|
* Returns the index of a character, ignoring quoted text.
|
1999-11-20 00:40:50 +00:00
|
|
|
|
* For example, in the string "abc'hide'h", the 'h' in "hide" will not be
|
2000-03-18 01:42:45 +00:00
|
|
|
|
* found by a search for 'h'.
|
1999-11-20 00:40:50 +00:00
|
|
|
|
* @param text text to be searched
|
|
|
|
|
* @param start the beginning index, inclusive; <code>0 <= start
|
|
|
|
|
* <= limit</code>.
|
|
|
|
|
* @param limit the ending index, exclusive; <code>start <= limit
|
|
|
|
|
* <= text.length()</code>.
|
2000-03-18 01:42:45 +00:00
|
|
|
|
* @param c character to search for
|
|
|
|
|
* @return Offset of the first instance of c, or -1 if not found.
|
1999-11-20 00:40:50 +00:00
|
|
|
|
*/
|
|
|
|
|
static int32_t quotedIndexOf(const UnicodeString& text,
|
|
|
|
|
int32_t start, int32_t limit,
|
2000-03-18 01:42:45 +00:00
|
|
|
|
UChar c);
|
1999-11-20 00:40:50 +00:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
#endif
|