1999-11-20 00:40:50 +00:00
|
|
|
/*
|
2001-03-22 00:09:10 +00:00
|
|
|
* Copyright (C) {1999}, International Business Machines Corporation and others. All Rights Reserved.
|
1999-11-20 00:40:50 +00:00
|
|
|
**********************************************************************
|
|
|
|
* Date Name Description
|
|
|
|
* 11/17/99 aliu Creation.
|
|
|
|
**********************************************************************
|
|
|
|
*/
|
|
|
|
#ifndef RBT_PARS_H
|
|
|
|
#define RBT_PARS_H
|
|
|
|
|
2002-06-27 01:19:20 +00:00
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#include "unicode/uobject.h"
|
1999-12-28 23:57:50 +00:00
|
|
|
#include "unicode/rbt.h"
|
2000-03-18 01:42:45 +00:00
|
|
|
#include "unicode/parseerr.h"
|
2001-10-23 23:26:47 +00:00
|
|
|
#include "unicode/unorm.h"
|
1999-11-20 00:40:50 +00:00
|
|
|
|
2001-10-08 23:26:58 +00:00
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
1999-11-20 00:40:50 +00:00
|
|
|
class TransliterationRuleData;
|
2002-02-07 01:07:55 +00:00
|
|
|
class UnicodeFunctor;
|
2000-02-08 02:49:15 +00:00
|
|
|
class ParseData;
|
2000-05-20 04:40:29 +00:00
|
|
|
class RuleHalf;
|
|
|
|
class ParsePosition;
|
2001-10-10 19:29:45 +00:00
|
|
|
class UVector;
|
2002-02-07 01:07:55 +00:00
|
|
|
class StringMatcher;
|
1999-11-20 00:40:50 +00:00
|
|
|
|
2002-06-27 01:19:20 +00:00
|
|
|
class TransliteratorParser : public UObject {
|
1999-11-20 00:40:50 +00:00
|
|
|
|
2001-10-10 19:29:45 +00:00
|
|
|
public:
|
|
|
|
|
1999-11-20 00:40:50 +00:00
|
|
|
/**
|
2001-10-10 19:29:45 +00:00
|
|
|
* PUBLIC data member containing the parsed data object, or null if
|
|
|
|
* there were no rules.
|
1999-11-20 00:40:50 +00:00
|
|
|
*/
|
2001-10-10 19:29:45 +00:00
|
|
|
TransliterationRuleData* data;
|
1999-11-20 00:40:50 +00:00
|
|
|
|
2001-10-10 19:29:45 +00:00
|
|
|
/**
|
|
|
|
* PUBLIC data member.
|
|
|
|
* The block of ::IDs, both at the top and at the bottom.
|
|
|
|
* Inserted into these may be additional rules at the
|
|
|
|
* idSplitPoint.
|
|
|
|
*/
|
|
|
|
UnicodeString idBlock;
|
1999-11-20 00:40:50 +00:00
|
|
|
|
2001-10-10 19:29:45 +00:00
|
|
|
/**
|
|
|
|
* PUBLIC data member.
|
|
|
|
* In a compound RBT, the index at which the RBT rules are
|
|
|
|
* inserted into the ID block. Index 0 means before any IDs
|
|
|
|
* in the block. Index idBlock.length() means after all IDs
|
|
|
|
* in the block. Index is a string index.
|
|
|
|
*/
|
|
|
|
int32_t idSplitPoint;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* PUBLIC data member containing the parsed compound filter, if any.
|
|
|
|
*/
|
|
|
|
UnicodeSet* compoundFilter;
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
|
|
|
// The number of rules parsed. This tells us if there were
|
|
|
|
// any actual transliterator rules, or if there were just ::ID
|
|
|
|
// block IDs.
|
|
|
|
int32_t ruleCount;
|
|
|
|
|
|
|
|
UTransDirection direction;
|
1999-11-20 00:40:50 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* We use a single error code during parsing. Rather than pass it
|
|
|
|
* through each API, we keep it here.
|
|
|
|
*/
|
|
|
|
UErrorCode status;
|
|
|
|
|
2000-03-18 01:42:45 +00:00
|
|
|
/**
|
2001-10-10 19:29:45 +00:00
|
|
|
* Parse error information.
|
2000-03-18 01:42:45 +00:00
|
|
|
*/
|
2001-10-10 19:29:45 +00:00
|
|
|
UParseError parseError;
|
2000-03-18 01:42:45 +00:00
|
|
|
|
2000-02-08 02:49:15 +00:00
|
|
|
/**
|
|
|
|
* Temporary symbol table used during parsing.
|
|
|
|
*/
|
|
|
|
ParseData* parseData;
|
|
|
|
|
|
|
|
/**
|
2001-07-27 00:18:53 +00:00
|
|
|
* Temporary vector of matcher variables. When parsing is complete, this
|
|
|
|
* is copied into the array data.variables. As with data.variables,
|
|
|
|
* element 0 corresponds to character data.variablesBase.
|
2000-02-08 02:49:15 +00:00
|
|
|
*/
|
2001-10-10 19:29:45 +00:00
|
|
|
UVector* variablesVector;
|
2000-02-08 02:49:15 +00:00
|
|
|
|
2002-02-07 01:07:55 +00:00
|
|
|
/**
|
|
|
|
* String of standins for segments. Used during the parsing of a single
|
|
|
|
* rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds
|
|
|
|
* to StringMatcher object segmentObjects.elementAt(0), etc.
|
|
|
|
*/
|
|
|
|
UnicodeString segmentStandins;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Vector of StringMatcher objects for segments. Used during the
|
|
|
|
* parsing of a single rule.
|
|
|
|
* segmentStandins.charAt(0) is the standin for "$1" and corresponds
|
|
|
|
* to StringMatcher object segmentObjects.elementAt(0), etc.
|
|
|
|
*/
|
|
|
|
UVector* segmentObjects;
|
|
|
|
|
1999-11-20 00:40:50 +00:00
|
|
|
/**
|
|
|
|
* The next available stand-in for variables. This starts at some point in
|
|
|
|
* the private use area (discovered dynamically) and increments up toward
|
|
|
|
* <code>variableLimit</code>. At any point during parsing, available
|
|
|
|
* variables are <code>variableNext..variableLimit-1</code>.
|
|
|
|
*/
|
|
|
|
UChar variableNext;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* The last available stand-in for variables. This is discovered
|
|
|
|
* dynamically. At any point during parsing, available variables are
|
|
|
|
* <code>variableNext..variableLimit-1</code>.
|
|
|
|
*/
|
|
|
|
UChar variableLimit;
|
|
|
|
|
2000-05-20 04:40:29 +00:00
|
|
|
/**
|
|
|
|
* When we encounter an undefined variable, we do not immediately signal
|
|
|
|
* an error, in case we are defining this variable, e.g., "$a = [a-z];".
|
|
|
|
* Instead, we save the name of the undefined variable, and substitute
|
|
|
|
* in the placeholder char variableLimit - 1, and decrement
|
|
|
|
* variableLimit.
|
|
|
|
*/
|
|
|
|
UnicodeString undefinedVariableName;
|
|
|
|
|
2001-10-17 19:21:12 +00:00
|
|
|
/**
|
|
|
|
* The stand-in character for the 'dot' set, represented by '.' in
|
|
|
|
* patterns. This is allocated the first time it is needed, and
|
|
|
|
* reused thereafter.
|
|
|
|
*/
|
|
|
|
UChar dotStandIn;
|
|
|
|
|
1999-11-20 00:40:50 +00:00
|
|
|
public:
|
|
|
|
|
|
|
|
/**
|
2001-10-10 19:29:45 +00:00
|
|
|
* Constructor.
|
1999-11-20 00:40:50 +00:00
|
|
|
*/
|
2001-10-10 19:29:45 +00:00
|
|
|
TransliteratorParser();
|
1999-11-20 00:40:50 +00:00
|
|
|
|
2000-02-08 02:49:15 +00:00
|
|
|
/**
|
|
|
|
* Destructor.
|
|
|
|
*/
|
2001-07-13 21:09:41 +00:00
|
|
|
~TransliteratorParser();
|
2000-02-08 02:49:15 +00:00
|
|
|
|
1999-11-20 00:40:50 +00:00
|
|
|
/**
|
|
|
|
* Parse the given string as a sequence of rules, separated by newline
|
|
|
|
* characters ('\n'), and cause this object to implement those rules. Any
|
|
|
|
* previous rules are discarded. Typically this method is called exactly
|
2001-10-10 19:29:45 +00:00
|
|
|
* once after construction.
|
|
|
|
*
|
|
|
|
* Parse the given rules, in the given direction. After this call
|
|
|
|
* returns, query the public data members for results. The caller
|
|
|
|
* owns the 'data' and 'compoundFilter' data members after this
|
|
|
|
* call returns.
|
2002-07-01 11:04:45 +00:00
|
|
|
* @param rules rules, separated by ';'
|
|
|
|
* @param direction either FORWARD or REVERSE.
|
|
|
|
* @param pe Struct to recieve information on position
|
|
|
|
* of error if an error is encountered
|
|
|
|
* @param ec Output param set to success/failure code.
|
1999-11-20 00:40:50 +00:00
|
|
|
*/
|
2001-10-10 19:29:45 +00:00
|
|
|
void parse(const UnicodeString& rules,
|
|
|
|
UTransDirection direction,
|
|
|
|
UParseError& pe,
|
|
|
|
UErrorCode& ec);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Return the compound filter parsed by parse(). Caller owns result.
|
2002-07-01 11:04:45 +00:00
|
|
|
* @return the compound filter parsed by parse().
|
2001-10-10 19:29:45 +00:00
|
|
|
*/
|
|
|
|
UnicodeSet* orphanCompoundFilter();
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Return the data object parsed by parse(). Caller owns result.
|
2002-07-01 11:04:45 +00:00
|
|
|
* @return the data object parsed by parse().
|
2001-10-10 19:29:45 +00:00
|
|
|
*/
|
|
|
|
TransliterationRuleData* orphanData();
|
|
|
|
|
2002-06-29 00:04:16 +00:00
|
|
|
/**
|
|
|
|
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
|
|
|
*
|
|
|
|
* @draft ICU 2.2
|
|
|
|
*/
|
|
|
|
virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
|
|
|
|
|
|
|
|
/**
|
|
|
|
* ICU "poor man's RTTI", returns a UClassID for this class.
|
|
|
|
*
|
|
|
|
* @draft ICU 2.2
|
|
|
|
*/
|
|
|
|
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
|
|
|
|
|
2001-10-10 19:29:45 +00:00
|
|
|
private:
|
|
|
|
|
2002-07-01 11:04:45 +00:00
|
|
|
/**
|
|
|
|
* Return a representation of this transliterator as source rules.
|
|
|
|
* @param rules Output param to receive the rules.
|
|
|
|
* @param direction either FORWARD or REVERSE.
|
|
|
|
*/
|
2001-10-10 19:29:45 +00:00
|
|
|
void parseRules(const UnicodeString& rules,
|
|
|
|
UTransDirection direction);
|
1999-11-20 00:40:50 +00:00
|
|
|
|
|
|
|
/**
|
2000-01-13 07:28:08 +00:00
|
|
|
* MAIN PARSER. Parse the next rule in the given rule string, starting
|
|
|
|
* at pos. Return the index after the last character parsed. Do not
|
|
|
|
* parse characters at or after limit.
|
|
|
|
*
|
|
|
|
* Important: The character at pos must be a non-whitespace character
|
|
|
|
* that is not the comment character.
|
|
|
|
*
|
|
|
|
* This method handles quoting, escaping, and whitespace removal. It
|
|
|
|
* parses the end-of-rule character. It recognizes context and cursor
|
|
|
|
* indicators. Once it does a lexical breakdown of the rule at pos, it
|
|
|
|
* creates a rule object and adds it to our rule list.
|
2002-07-01 11:04:45 +00:00
|
|
|
* @param rules Output param to receive the rules.
|
|
|
|
* @param pos the starting position.
|
|
|
|
* @param limit pointer past the last character of the rule.
|
|
|
|
* @return the index after the last character parsed.
|
1999-11-20 00:40:50 +00:00
|
|
|
*/
|
2001-10-10 19:29:45 +00:00
|
|
|
int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit);
|
1999-11-20 00:40:50 +00:00
|
|
|
|
2001-10-23 23:26:47 +00:00
|
|
|
/**
|
|
|
|
* Set the variable range to [start, end] (inclusive).
|
2002-07-01 11:04:45 +00:00
|
|
|
* @param start the start value of the range.
|
|
|
|
* @param end the end value of the range.
|
2001-10-23 23:26:47 +00:00
|
|
|
*/
|
|
|
|
void setVariableRange(int32_t start, int32_t end);
|
|
|
|
|
2001-11-12 19:12:08 +00:00
|
|
|
/**
|
|
|
|
* Assert that the given character is NOT within the variable range.
|
|
|
|
* If it is, return FALSE. This is neccesary to ensure that the
|
|
|
|
* variable range does not overlap characters used in a rule.
|
2002-07-01 11:04:45 +00:00
|
|
|
* @param ch the given character.
|
|
|
|
* @return True, if the given character is NOT within the variable range.
|
2001-11-12 19:12:08 +00:00
|
|
|
*/
|
|
|
|
UBool checkVariableRange(UChar32 ch) const;
|
|
|
|
|
2001-10-23 23:26:47 +00:00
|
|
|
/**
|
|
|
|
* Set the maximum backup to 'backup', in response to a pragma
|
|
|
|
* statement.
|
2002-07-01 11:04:45 +00:00
|
|
|
* @param backup the new value to be set.
|
2001-10-23 23:26:47 +00:00
|
|
|
*/
|
|
|
|
void pragmaMaximumBackup(int32_t backup);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Begin normalizing all rules using the given mode, in response
|
|
|
|
* to a pragma statement.
|
2002-07-01 11:04:45 +00:00
|
|
|
* @param mode the given mode.
|
2001-10-23 23:26:47 +00:00
|
|
|
*/
|
|
|
|
void pragmaNormalizeRules(UNormalizationMode mode);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Return true if the given rule looks like a pragma.
|
|
|
|
* @param pos offset to the first non-whitespace character
|
|
|
|
* of the rule.
|
|
|
|
* @param limit pointer past the last character of the rule.
|
2002-07-01 11:04:45 +00:00
|
|
|
* @return true if the given rule looks like a pragma.
|
2001-10-23 23:26:47 +00:00
|
|
|
*/
|
|
|
|
static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Parse a pragma. This method assumes resemblesPragma() has
|
|
|
|
* already returned true.
|
|
|
|
* @param pos offset to the first non-whitespace character
|
|
|
|
* of the rule.
|
|
|
|
* @param limit pointer past the last character of the rule.
|
|
|
|
* @return the position index after the final ';' of the pragma,
|
|
|
|
* or -1 on failure.
|
|
|
|
*/
|
|
|
|
int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit);
|
|
|
|
|
1999-11-20 00:40:50 +00:00
|
|
|
/**
|
2000-01-13 07:28:08 +00:00
|
|
|
* Called by main parser upon syntax error. Search the rule string
|
|
|
|
* for the probable end of the rule. Of course, if the error is that
|
|
|
|
* the end of rule marker is missing, then the rule end will not be found.
|
|
|
|
* In any case the rule start will be correctly reported.
|
2002-07-01 11:04:45 +00:00
|
|
|
* @param parseErrorCode error code.
|
|
|
|
* @param msg error description.
|
|
|
|
* @param start position of first character of current rule.
|
|
|
|
* @return start position of first character of current rule.
|
1999-11-20 00:40:50 +00:00
|
|
|
*/
|
2001-08-17 02:20:35 +00:00
|
|
|
int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start);
|
1999-11-20 00:40:50 +00:00
|
|
|
|
2000-05-20 04:40:29 +00:00
|
|
|
/**
|
|
|
|
* Parse a UnicodeSet out, store it, and return the stand-in character
|
|
|
|
* used to represent it.
|
2002-07-01 11:04:45 +00:00
|
|
|
*
|
|
|
|
* @param rule the rule for UnicodeSet.
|
|
|
|
* @param pos the position in pattern at which to start parsing.
|
|
|
|
* @return the stand-in character used to represent it.
|
2000-05-20 04:40:29 +00:00
|
|
|
*/
|
|
|
|
UChar parseSet(const UnicodeString& rule,
|
|
|
|
ParsePosition& pos);
|
|
|
|
|
2001-07-27 00:18:53 +00:00
|
|
|
/**
|
2002-02-07 01:07:55 +00:00
|
|
|
* Generate and return a stand-in for a new UnicodeFunctor. Store
|
2001-07-27 00:18:53 +00:00
|
|
|
* the matcher (adopt it).
|
2002-07-01 11:04:45 +00:00
|
|
|
* @param adopted the UnicodeFunctor to be adopted.
|
|
|
|
* @return a stand-in for a new UnicodeFunctor.
|
2001-07-27 00:18:53 +00:00
|
|
|
*/
|
2002-02-07 01:07:55 +00:00
|
|
|
UChar generateStandInFor(UnicodeFunctor* adopted);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Return the standin for segment seg (1-based).
|
2002-07-01 11:04:45 +00:00
|
|
|
* @param seg the given segment.
|
|
|
|
* @return the standIn character for the given segment.
|
2002-02-07 01:07:55 +00:00
|
|
|
*/
|
|
|
|
UChar getSegmentStandin(int32_t seg);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Set the object for segment seg (1-based).
|
2002-07-01 11:04:45 +00:00
|
|
|
* @param seg the given segment.
|
|
|
|
* @param adopted the StringMatcher to be adopted.
|
2002-02-07 01:07:55 +00:00
|
|
|
*/
|
|
|
|
void setSegmentObject(int32_t seg, StringMatcher* adopted);
|
2001-07-27 00:18:53 +00:00
|
|
|
|
2001-10-17 19:21:12 +00:00
|
|
|
/**
|
|
|
|
* Return the stand-in for the dot set. It is allocated the first
|
|
|
|
* time and reused thereafter.
|
2002-07-01 11:04:45 +00:00
|
|
|
* @return the stand-in for the dot set.
|
2001-10-17 19:21:12 +00:00
|
|
|
*/
|
|
|
|
UChar getDotStandIn();
|
|
|
|
|
2000-05-20 04:40:29 +00:00
|
|
|
/**
|
|
|
|
* Append the value of the given variable name to the given
|
|
|
|
* UnicodeString.
|
2002-07-01 11:04:45 +00:00
|
|
|
* @param name the variable name to be appended.
|
|
|
|
* @param buf the given UnicodeString to append to.
|
2000-05-20 04:40:29 +00:00
|
|
|
*/
|
|
|
|
void appendVariableDef(const UnicodeString& name,
|
|
|
|
UnicodeString& buf);
|
2001-03-22 00:09:10 +00:00
|
|
|
|
2001-07-30 23:23:51 +00:00
|
|
|
/**
|
2002-02-07 01:07:55 +00:00
|
|
|
* Glue method to get around access restrictions in C++.
|
2001-07-30 23:23:51 +00:00
|
|
|
*/
|
2002-02-07 01:07:55 +00:00
|
|
|
static Transliterator* createBasicInstance(const UnicodeString& id,
|
|
|
|
const UnicodeString* canonID);
|
2001-07-30 23:23:51 +00:00
|
|
|
|
2000-05-20 04:40:29 +00:00
|
|
|
friend class RuleHalf;
|
2000-08-15 18:25:20 +00:00
|
|
|
|
|
|
|
// Disallowed methods; no impl.
|
2002-07-01 11:04:45 +00:00
|
|
|
/**
|
|
|
|
* Copy constructor
|
|
|
|
*/
|
2001-07-13 21:09:41 +00:00
|
|
|
TransliteratorParser(const TransliteratorParser&);
|
2002-07-01 11:04:45 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Assignment operator
|
|
|
|
*/
|
2001-07-13 21:09:41 +00:00
|
|
|
TransliteratorParser& operator=(const TransliteratorParser&);
|
2002-06-29 00:04:16 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* The address of this static class variable serves as this class's ID
|
|
|
|
* for ICU "poor man's RTTI".
|
|
|
|
*/
|
|
|
|
static const char fgClassID;
|
1999-11-20 00:40:50 +00:00
|
|
|
};
|
|
|
|
|
2001-10-08 23:26:58 +00:00
|
|
|
U_NAMESPACE_END
|
|
|
|
|
1999-11-20 00:40:50 +00:00
|
|
|
#endif
|