scuffed-code/icu4c/source/i18n/rbt_pars.h

/*
* Copyright (C) {1999}, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/17/99    aliu        Creation.
**********************************************************************
*/
#ifndef RBT_PARS_H
#define RBT_PARS_H

#include "unicode/rbt.h"
#include "uvector.h"
#include "unicode/parseerr.h"

// Forward declarations.  This header refers to these types only through
// pointers, references, or a friend declaration, so their full definitions
// are not needed here.
class ParseData;
class ParsePosition;
class RuleHalf;
class TransliterationRuleData;
class UnicodeMatcher;

/**
 * Holds the state used while parsing a transliterator rule string.
 *
 * The constructor and destructor are private and copying is declared but
 * not implemented, so outside code uses the class through the static
 * parse() functions.  RuleHalf is a friend and may access the private
 * members.
 */
class TransliteratorParser {

    /**
     * This is a reference to external data we don't own.  This works because
     * we only hold this for the duration of the call to parse().
     */
    const UnicodeString& rules;

    /**
     * Transliteration direction for this parse.
     * NOTE(review): presumably the value passed to the constructor --
     * confirm in the implementation file.
     */
    UTransDirection direction;

    /**
     * Rule data associated with this parse.
     * NOTE(review): presumably the object being built and later handed to
     * the caller of parse(); ownership is not visible in this header --
     * confirm in the implementation file.
     */
    TransliterationRuleData* data;

    /**
     * We use a single error code during parsing.  Rather than pass it
     * through each API, we keep it here.
     */
    UErrorCode status;

    /**
     * Pointer to user structure in which to return parse error information.
     * May be NULL.
     */
    UParseError* parseError;

    /**
     * Temporary symbol table used during parsing.
     */
    ParseData* parseData;

    /**
     * Temporary vector of matcher variables.  When parsing is complete, this
     * is copied into the array data.variables.  As with data.variables,
     * element 0 corresponds to character data.variablesBase.
     */
    UVector variablesVector;

    /**
     * The next available stand-in for variables.  This starts at some point in
     * the private use area (discovered dynamically) and increments up toward
     * <code>variableLimit</code>.  At any point during parsing, available
     * variables are <code>variableNext..variableLimit-1</code>.
     */
    UChar variableNext;

    /**
     * The exclusive upper limit of the stand-in range: the last available
     * stand-in is <code>variableLimit-1</code>.  This is discovered
     * dynamically.  At any point during parsing, available variables are
     * <code>variableNext..variableLimit-1</code>.
     */
    UChar variableLimit;

    /**
     * When we encounter an undefined variable, we do not immediately signal
     * an error, in case we are defining this variable, e.g., "$a = [a-z];".
     * Instead, we save the name of the undefined variable, and substitute
     * in the placeholder char variableLimit - 1, and decrement
     * variableLimit.
     */
    UnicodeString undefinedVariableName;

public:

    /**
     * Parse a set of rules and return the resulting rule data.
     * @param rules the rule string to parse
     * @param direction the transliteration direction to parse for
     * @param parseError optional structure in which to return parse error
     * information; defaults to 0 (none).
     * @return the parsed rule data.
     * NOTE(review): ownership of the result and the value returned on
     * failure are not visible in this header -- confirm in the
     * implementation file.
     */
    static TransliterationRuleData*
        parse(const UnicodeString& rules,
              UTransDirection direction,
              UParseError* parseError = 0);

    /**
     * Parse a given set of rules.  Return up to three pieces of
     * parsed data.  These are the header ::id block, the rule block,
     * and the footer ::id block.  Any or all of these may be empty.
     * If the ::id blocks are empty, their corresponding parameters
     * are returned as the empty string.  If there are no rules, the
     * TransliterationRuleData result is 0.
     *
     * NOTE(review): the description above speaks of separate header and
     * footer strings, but the signature returns a single idBlockResult
     * plus idSplitPointResult.  Presumably idBlockResult holds the ::id
     * blocks together and idSplitPointResult marks where the rule block
     * falls between them -- confirm in the implementation file.
     *
     * @param rules the rule string to parse
     * @param direction the transliteration direction to parse for
     * @param ruleDataResult caller owns the pointer stored here.
     * May be NULL.
     * @param idBlockResult receives the ::id block text, including
     * semicolons.  May be empty.
     * @param idSplitPointResult receives the split point within
     * idBlockResult (see the note above).
     * @param parseError structure in which to return parse error
     * information.  NOTE(review): presumably may be NULL, as the
     * parseError member may -- confirm.
     * @param ec error code for the operation.  NOTE(review): presumably set
     * to a failure value when parsing fails -- confirm.
     */
    static void parse(const UnicodeString& rules,
                      UTransDirection direction,
                      TransliterationRuleData*& ruleDataResult,
                      UnicodeString& idBlockResult,
                      int32_t& idSplitPointResult,
                      UParseError* parseError,
                      UErrorCode& ec);

private:

    /**
     * Private constructor; instances are not created by outside code.
     * @param rules list of rules, separated by newline characters.
     * NOTE(review): the example on undefinedVariableName ends a rule with
     * ';' -- confirm the actual rule separator.
     * @param direction the transliteration direction to parse for
     * @param parseError optional structure in which to return parse error
     * information; defaults to 0 (none).
     *
     * No exception is declared here (an earlier version of this comment
     * named IllegalArgumentException); per the comment on the status
     * member, errors are kept in a single error code on this object.
     */
    TransliteratorParser(const UnicodeString& rules,
                              UTransDirection direction,
                              UParseError* parseError = 0);

    /**
     * Destructor.
     */
    ~TransliteratorParser();

    /**
     * Parse the given string as a sequence of rules, separated by newline
     * characters ('\n'), and cause this object to implement those rules.  Any
     * previous rules are discarded.  Typically this method is called exactly
     * once, during construction.
     *
     * No exception is declared here (an earlier version of this comment
     * named IllegalArgumentException); per the comment on the status
     * member, errors are kept in a single error code on this object.
     *
     * @param idBlockResult output parameter for the ::id block text.
     * @param idSplitPointResult output parameter for the ::id split point.
     * @param ruleCount output parameter.  NOTE(review): presumably the
     * number of rules parsed -- confirm in the implementation file.
     */
    void parseRules(UnicodeString& idBlockResult, int32_t& idSplitPointResult,
                    int32_t& ruleCount);

    /**
     * MAIN PARSER.  Parse the next rule in the given rule string, starting
     * at pos.  Return the index after the last character parsed.  Do not
     * parse characters at or after limit.
     *
     * Important:  The character at pos must be a non-whitespace character
     * that is not the comment character.
     *
     * This method handles quoting, escaping, and whitespace removal.  It
     * parses the end-of-rule character.  It recognizes context and cursor
     * indicators.  Once it does a lexical breakdown of the rule at pos, it
     * creates a rule object and adds it to our rule list.
     *
     * @param pos index of the first character of the rule
     * @param limit index at or after which no characters are parsed
     * @return the index after the last character parsed
     */
    int32_t parseRule(int32_t pos, int32_t limit);

    /**
     * Called by main parser upon syntax error.  Search the rule string
     * for the probable end of the rule.  Of course, if the error is that
     * the end of rule marker is missing, then the rule end will not be found.
     * In any case the rule start will be correctly reported.
     * @param parseErrorCode error code describing the syntax error
     * @param (unnamed) a string argument.  NOTE(review): presumably the
     * rule text being parsed -- confirm in the implementation file.
     * @param start position of first character of current rule
     * @return NOTE(review): not documented in this header; presumably the
     * index of the probable end of the rule -- confirm.
     */
    int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start);

    /**
     * Parse a UnicodeSet out, store it, and return the stand-in character
     * used to represent it.
     * @param rule the text containing the set
     * @param pos the parse position within rule.  NOTE(review): presumably
     * advanced past the set on return -- confirm.
     * @return the stand-in character representing the parsed set
     */
    UChar parseSet(const UnicodeString& rule,
                   ParsePosition& pos);

    /**
     * Generate and return a stand-in for a new UnicodeMatcher.  Store
     * the matcher (adopt it).
     * @param adopted the matcher to store; this object takes ownership
     * @return the stand-in character for the matcher
     */
    UChar generateStandInFor(UnicodeMatcher* adopted);

    /**
     * Append the value of the given variable name to the given
     * UnicodeString.
     * @param name the variable name to look up
     * @param buf the string to append the variable's value to
     */
    void appendVariableDef(const UnicodeString& name,
                           UnicodeString& buf);

    /**
     * Return a stand-in character that refers to the given segments.
     * @param r a reference number >= 1
     * @return a stand-in for the given segment reference
     */
    UChar getSegmentStandin(int32_t r);

    /**
     * Determines what part of the private use region of Unicode we can use for
     * variable stand-ins.  The correct way to do this is as follows: Parse each
     * rule, and for forward and reverse rules, take the FROM expression, and
     * make a hash of all characters used.  The TO expression should be ignored.
     * When done, everything not in the hash is available for use.  In practice,
     * this method may employ some other algorithm for improved speed.
     */
    void determineVariableRange(void);

    /**
     * Returns the index of a character, ignoring quoted text.
     * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
     * found by a search for 'h'.
     * @param text text to be searched
     * @param start the beginning index, inclusive; <code>0 <= start
     * <= limit</code>.
     * @param limit the ending index, exclusive; <code>start <= limit
     * <= text.length()</code>.
     * @param c character to search for
     * @return Offset of the first instance of c, or -1 if not found.
     */
    static int32_t quotedIndexOf(const UnicodeString& text,
                                 int32_t start, int32_t limit,
                                 UChar c);

    // RuleHalf is granted access to this class's private members.
    friend class RuleHalf;

    // Disallowed methods; no impl.  Declared private and left undefined so
    // that parser objects cannot be copied or assigned.
    TransliteratorParser(const TransliteratorParser&);
    TransliteratorParser& operator=(const TransliteratorParser&);
};

#endif
ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00			`/*`
ICU-903 updated copyright notices. X-SVN-Rev: 4249 2001-03-22 00:09:10 +00:00			`* Copyright (C) {1999}, International Business Machines Corporation and others. All Rights Reserved.`
ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00			`**********************************************************************`
			`* Date Name Description`
			`* 11/17/99 aliu Creation.`
			`**********************************************************************`
			`*/`
			`#ifndef RBT_PARS_H`
			`#define RBT_PARS_H`

ICU-12 all public include files are now in unicode dir, all private icu_ functions renamed to uprv_ X-SVN-Rev: 473 1999-12-28 23:57:50 +00:00			`#include "unicode/rbt.h"`
ICU-265 map char to set with array instead of hash for better performance X-SVN-Rev: 728 2000-02-08 02:49:15 +00:00			`#include "uvector.h"`
ICU-329 parse engines need better error reporting X-SVN-Rev: 958 2000-03-18 01:42:45 +00:00			`#include "unicode/parseerr.h"`
ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00
			`class TransliterationRuleData;`
ICU-1076 initial limited support for Kleene star and plus operators X-SVN-Rev: 5359 2001-07-27 00:18:53 +00:00			`class UnicodeMatcher;`
ICU-265 map char to set with array instead of hash for better performance X-SVN-Rev: 728 2000-02-08 02:49:15 +00:00			`class ParseData;`
ICU-352 rbt support for segments, cursor offset, and new syntax X-SVN-Rev: 1422 2000-05-20 04:40:29 +00:00			`class RuleHalf;`
			`class ParsePosition;`
ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00
ICU-1048 allow ::ID blocks in rules X-SVN-Rev: 5233 2001-07-13 21:09:41 +00:00			`class TransliteratorParser {`
ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00
			`/**`
			`* This is a reference to external data we don't own. This works because`
			`* we only hold this for the duration of the call to parse().`
			`*/`
			`const UnicodeString& rules;`

ICU-450 change ParseError, Transliterator::Direction, Transliterator::Position to C structs X-SVN-Rev: 1655 2000-06-27 19:00:38 +00:00			`UTransDirection direction;`
ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00
			`TransliterationRuleData* data;`

			`/**`
			`* We use a single error code during parsing. Rather than pass it`
			`* through each API, we keep it here.`
			`*/`
			`UErrorCode status;`

ICU-329 parse engines need better error reporting X-SVN-Rev: 958 2000-03-18 01:42:45 +00:00			`/**`
			`* Pointer to user structure in which to return parse error information.`
			`* May be NULL.`
			`*/`
ICU-450 change ParseError, Transliterator::Direction, Transliterator::Position to C structs X-SVN-Rev: 1655 2000-06-27 19:00:38 +00:00			`UParseError* parseError;`
ICU-329 parse engines need better error reporting X-SVN-Rev: 958 2000-03-18 01:42:45 +00:00
ICU-265 map char to set with array instead of hash for better performance X-SVN-Rev: 728 2000-02-08 02:49:15 +00:00			`/**`
			`* Temporary symbol table used during parsing.`
			`*/`
			`ParseData* parseData;`

			`/**`
ICU-1076 initial limited support for Kleene star and plus operators X-SVN-Rev: 5359 2001-07-27 00:18:53 +00:00			`* Temporary vector of matcher variables. When parsing is complete, this`
			`* is copied into the array data.variables. As with data.variables,`
			`* element 0 corresponds to character data.variablesBase.`
ICU-265 map char to set with array instead of hash for better performance X-SVN-Rev: 728 2000-02-08 02:49:15 +00:00			`*/`
ICU-1076 initial limited support for Kleene star and plus operators X-SVN-Rev: 5359 2001-07-27 00:18:53 +00:00			`UVector variablesVector;`
ICU-265 map char to set with array instead of hash for better performance X-SVN-Rev: 728 2000-02-08 02:49:15 +00:00
ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00			`/**`
			`* The next available stand-in for variables. This starts at some point in`
			`* the private use area (discovered dynamically) and increments up toward`
			`* <code>variableLimit</code>. At any point during parsing, available`
			`* variables are <code>variableNext..variableLimit-1</code>.`
			`*/`
			`UChar variableNext;`

			`/**`
			`* The last available stand-in for variables. This is discovered`
			`* dynamically. At any point during parsing, available variables are`
			`* <code>variableNext..variableLimit-1</code>.`
			`*/`
			`UChar variableLimit;`

ICU-352 rbt support for segments, cursor offset, and new syntax X-SVN-Rev: 1422 2000-05-20 04:40:29 +00:00			`/**`
			`* When we encounter an undefined variable, we do not immediately signal`
			`* an error, in case we are defining this variable, e.g., "$a = [a-z];".`
			`* Instead, we save the name of the undefined variable, and substitute`
			`* in the placeholder char variableLimit - 1, and decrement`
			`* variableLimit.`
			`*/`
			`UnicodeString undefinedVariableName;`

ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00			`public:`

			`static TransliterationRuleData*`
			`parse(const UnicodeString& rules,`
ICU-450 change ParseError, Transliterator::Direction, Transliterator::Position to C structs X-SVN-Rev: 1655 2000-06-27 19:00:38 +00:00			`UTransDirection direction,`
			`UParseError* parseError = 0);`
ICU-903 updated copyright notices. X-SVN-Rev: 4249 2001-03-22 00:09:10 +00:00
ICU-1048 allow ::ID blocks in rules X-SVN-Rev: 5233 2001-07-13 21:09:41 +00:00			`/**`
			`* Parse a given set of rules. Return up to three pieces of`
			`* parsed data. These are the header ::id block, the rule block,`
			`* and the footer ::id block. Any or all of these may be empty.`
			`* If the ::id blocks are empty, their corresponding parameters`
			`* are returned as the empty string. If there are no rules, the`
			`* TransliterationRuleData result is 0.`
			`* @param ruleDataResult caller owns the pointer stored here.`
			`* May be NULL.`
			`* @param headerRule string including semicolons for the header`
			`* ::id block. May be empty.`
			`* @param footerRule string including semicolons for the footer`
			`* ::id block. May be empty.`
			`*/`
			`static void parse(const UnicodeString& rules,`
			`UTransDirection direction,`
			`TransliterationRuleData*& ruleDataResult,`
			`UnicodeString& idBlockResult,`
			`int32_t& idSplitPointResult,`
			`UParseError* parseError,`
			`UErrorCode& ec);`

ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00			`private:`

			`/**`
			`* @param rules list of rules, separated by newline characters`
			`* @exception IllegalArgumentException if there is a syntax error in the`
			`* rules`
			`*/`
ICU-1048 allow ::ID blocks in rules X-SVN-Rev: 5233 2001-07-13 21:09:41 +00:00			`TransliteratorParser(const UnicodeString& rules,`
ICU-450 change ParseError, Transliterator::Direction, Transliterator::Position to C structs X-SVN-Rev: 1655 2000-06-27 19:00:38 +00:00			`UTransDirection direction,`
			`UParseError* parseError = 0);`
ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00
ICU-265 map char to set with array instead of hash for better performance X-SVN-Rev: 728 2000-02-08 02:49:15 +00:00			`/**`
			`* Destructor.`
			`*/`
ICU-1048 allow ::ID blocks in rules X-SVN-Rev: 5233 2001-07-13 21:09:41 +00:00			`~TransliteratorParser();`
ICU-265 map char to set with array instead of hash for better performance X-SVN-Rev: 728 2000-02-08 02:49:15 +00:00
ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00			`/**`
			`* Parse the given string as a sequence of rules, separated by newline`
			`* characters ('\n'), and cause this object to implement those rules. Any`
			`* previous rules are discarded. Typically this method is called exactly`
			`* once, during construction.`
			`* @exception IllegalArgumentException if there is a syntax error in the`
			`* rules`
			`*/`
ICU-1048 allow ::ID blocks in rules X-SVN-Rev: 5233 2001-07-13 21:09:41 +00:00			`void parseRules(UnicodeString& idBlockResult, int32_t& idSplitPointResult,`
			`int32_t& ruleCount);`
ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00
			`/**`
ICU-199 new rule syntax; performance improvement; update rules X-SVN-Rev: 559 2000-01-13 07:28:08 +00:00			`* MAIN PARSER. Parse the next rule in the given rule string, starting`
			`* at pos. Return the index after the last character parsed. Do not`
			`* parse characters at or after limit.`
			`*`
			`* Important: The character at pos must be a non-whitespace character`
			`* that is not the comment character.`
			`*`
			`* This method handles quoting, escaping, and whitespace removal. It`
			`* parses the end-of-rule character. It recognizes context and cursor`
			`* indicators. Once it does a lexical breakdown of the rule at pos, it`
			`* creates a rule object and adds it to our rule list.`
ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00			`*/`
ICU-199 new rule syntax; performance improvement; update rules X-SVN-Rev: 559 2000-01-13 07:28:08 +00:00			`int32_t parseRule(int32_t pos, int32_t limit);`
ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00
			`/**`
ICU-199 new rule syntax; performance improvement; update rules X-SVN-Rev: 559 2000-01-13 07:28:08 +00:00			`* Called by main parser upon syntax error. Search the rule string`
			`* for the probable end of the rule. Of course, if the error is that`
			`* the end of rule marker is missing, then the rule end will not be found.`
			`* In any case the rule start will be correctly reported.`
			`* @param msg error description`
			`* @param rule pattern string`
			`* @param start position of first character of current rule`
ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00			`*/`
ICU-329 Add more error codes for formatting api X-SVN-Rev: 5503 2001-08-17 02:20:35 +00:00			`int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start);`
ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00
ICU-352 rbt support for segments, cursor offset, and new syntax X-SVN-Rev: 1422 2000-05-20 04:40:29 +00:00			`/**`
			`* Parse a UnicodeSet out, store it, and return the stand-in character`
			`* used to represent it.`
			`*/`
			`UChar parseSet(const UnicodeString& rule,`
			`ParsePosition& pos);`

ICU-1076 initial limited support for Kleene star and plus operators X-SVN-Rev: 5359 2001-07-27 00:18:53 +00:00			`/**`
			`* Generate and return a stand-in for a new UnicodeMatcher. Store`
			`* the matcher (adopt it).`
			`*/`
			`UChar generateStandInFor(UnicodeMatcher* adopted);`

ICU-352 rbt support for segments, cursor offset, and new syntax X-SVN-Rev: 1422 2000-05-20 04:40:29 +00:00			`/**`
			`* Append the value of the given variable name to the given`
			`* UnicodeString.`
			`*/`
			`void appendVariableDef(const UnicodeString& name,`
			`UnicodeString& buf);`
ICU-903 updated copyright notices. X-SVN-Rev: 4249 2001-03-22 00:09:10 +00:00
ICU-1076 implement ? operator, remove 9 segment limit, fix toPattern X-SVN-Rev: 5381 2001-07-30 23:23:51 +00:00			`/**`
			`* Return a stand-in character that refers to the given segments.`
			`* @param r a reference number >= 1`
			`* @return a stand-in for the given segment reference`
			`*/`
			`UChar getSegmentStandin(int32_t r);`

ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00			`/**`
			`* Determines what part of the private use region of Unicode we can use for`
			`* variable stand-ins. The correct way to do this is as follows: Parse each`
			`* rule, and for forward and reverse rules, take the FROM expression, and`
			`* make a hash of all characters used. The TO expression should be ignored.`
			`* When done, everything not in the hash is available for use. In practice,`
			`* this method may employ some other algorithm for improved speed.`
			`*/`
ICU-200 Updated with OS/400 specific port changes. X-SVN-Rev: 459 1999-12-22 22:57:04 +00:00			`void determineVariableRange(void);`
ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00
			`/**`
ICU-329 parse engines need better error reporting X-SVN-Rev: 958 2000-03-18 01:42:45 +00:00			`* Returns the index of a character, ignoring quoted text.`
ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00			`* For example, in the string "abc'hide'h", the 'h' in "hide" will not be`
ICU-329 parse engines need better error reporting X-SVN-Rev: 958 2000-03-18 01:42:45 +00:00			`* found by a search for 'h'.`
ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00			`* @param text text to be searched`
			`* @param start the beginning index, inclusive; <code>0 <= start`
			`* <= limit</code>.`
			`* @param limit the ending index, exclusive; <code>start <= limit`
			`* <= text.length()</code>.`
ICU-329 parse engines need better error reporting X-SVN-Rev: 958 2000-03-18 01:42:45 +00:00			`* @param c character to search for`
			`* @return Offset of the first instance of c, or -1 if not found.`
ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00			`*/`
			`static int32_t quotedIndexOf(const UnicodeString& text,`
			`int32_t start, int32_t limit,`
ICU-329 parse engines need better error reporting X-SVN-Rev: 958 2000-03-18 01:42:45 +00:00			`UChar c);`
ICU-352 rbt support for segments, cursor offset, and new syntax X-SVN-Rev: 1422 2000-05-20 04:40:29 +00:00
			`friend class RuleHalf;`
ICU-535 fix MSVC level 4 warnings X-SVN-Rev: 2259 2000-08-15 18:25:20 +00:00
			`// Disallowed methods; no impl.`
ICU-1048 allow ::ID blocks in rules X-SVN-Rev: 5233 2001-07-13 21:09:41 +00:00			`TransliteratorParser(const TransliteratorParser&);`
			`TransliteratorParser& operator=(const TransliteratorParser&);`
ICU-114 Transliterator framework first working version X-SVN-Rev: 194 1999-11-20 00:40:50 +00:00			`};`

			`#endif`