scuffed-code/icu4c/source/i18n/rbt_pars.h

/*
* Copyright (C) {1999}, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/17/99    aliu        Creation.
**********************************************************************
*/
#ifndef RBT_PARS_H
#define RBT_PARS_H

#include "unicode/rbt.h"
#include "uvector.h"

class TransliterationRuleData;
class UnicodeSet;
class ParseData;

/**
 * Parser for the textual rules of a rule-based transliterator.
 *
 * The only public member is the static parse() function.  The constructor
 * and destructor are private, so code outside this class cannot create or
 * destroy an instance.
 */
class TransliterationRuleParser {

    /**
     * This is a reference to external data we don't own.  This works because
     * we only hold this for the duration of the call to parse().
     */
    const UnicodeString& rules;

    /**
     * Rule direction this parser was created for.
     * NOTE(review): presumably the direction argument given to parse();
     * confirm in the implementation file.
     */
    RuleBasedTransliterator::Direction direction;

    /**
     * Rule data associated with this parse.
     * NOTE(review): presumably the object that parse() hands back to its
     * caller; ownership is not visible from this header.
     */
    TransliterationRuleData* data;

    /**
     * We use a single error code during parsing.  Rather than pass it
     * through each API, we keep it here.
     */
    UErrorCode status;

    /**
     * Temporary symbol table used during parsing.
     */
    ParseData* parseData;

    /**
     * Temporary vector of set variables.  When parsing is complete, this
     * is copied into the array data.setVariables.  As with data.setVariables,
     * element 0 corresponds to character data.setVariablesBase.
     */
    UVector setVariablesVector;

    /**
     * The next available stand-in for variables.  This starts at some point in
     * the private use area (discovered dynamically) and increments up toward
     * <code>variableLimit</code>.  At any point during parsing, available
     * variables are <code>variableNext..variableLimit-1</code>.
     */
    UChar variableNext;

    /**
     * Upper bound of the range of stand-ins for variables.  This is
     * discovered dynamically.  At any point during parsing, available
     * variables are <code>variableNext..variableLimit-1</code>, so this
     * bound is exclusive: the last usable stand-in is
     * <code>variableLimit-1</code>.
     */
    UChar variableLimit;

    // Operators.  These are declarations only; the values are defined
    // out of line.
    static const UChar VARIABLE_DEF_OP;
    static const UChar FORWARD_RULE_OP;
    static const UChar REVERSE_RULE_OP;
    static const UChar FWDREV_RULE_OP; // internal rep of <> op
    static const UnicodeString OPERATORS;

    // Other special characters (also defined out of line)
    static const UChar QUOTE;
    static const UChar ESCAPE;
    static const UChar END_OF_RULE;
    static const UChar RULE_COMMENT_CHAR;
    static const UChar VARIABLE_REF_OPEN;
    static const UChar VARIABLE_REF_CLOSE;
    static const UChar CONTEXT_OPEN;
    static const UChar CONTEXT_CLOSE;
    static const UChar SET_OPEN;
    static const UChar SET_CLOSE;
    static const UChar CURSOR_POS;

public:

    /**
     * Parse a rule string and return the corresponding rule data.  This is
     * the only way for outside code to use the parser.
     * @param rules the rule text to parse
     * @param direction the direction for which the rules are parsed
     * @return the resulting rule data.
     * NOTE(review): ownership of the returned pointer and the value returned
     * on a parse error are not visible from this header; confirm against the
     * implementation.
     */
    static TransliterationRuleData*
        parse(const UnicodeString& rules,
              RuleBasedTransliterator::Direction direction);
    
private:

    /**
     * Construct a parser over the given rules.  The rule string is held by
     * reference, so it must outlive this object.
     * @param rules list of rules, separated by newline characters
     * @param direction the direction for which the rules are parsed
     *
     * NOTE(review): earlier documentation declared
     * "@exception IllegalArgumentException" for a syntax error in the rules.
     * No such exception type is declared here; errors are presumably
     * recorded in <code>status</code> instead.
     */
    TransliterationRuleParser(const UnicodeString& rules,
                              RuleBasedTransliterator::Direction direction);

    /**
     * Destructor.
     */
    ~TransliterationRuleParser();

    /**
     * Parse the given string as a sequence of rules, separated by newline
     * characters ('\n'), and cause this object to implement those rules.  Any
     * previous rules are discarded.  Typically this method is called exactly
     * once, during construction.
     *
     * NOTE(review): earlier documentation declared
     * "@exception IllegalArgumentException" for a syntax error in the rules;
     * errors are presumably recorded in <code>status</code> instead.
     */
    void parseRules(void);

    /**
     * MAIN PARSER.  Parse the next rule in the given rule string, starting
     * at pos.  Return the index after the last character parsed.  Do not
     * parse characters at or after limit.
     *
     * Important:  The character at pos must be a non-whitespace character
     * that is not the comment character.
     *
     * This method handles quoting, escaping, and whitespace removal.  It
     * parses the end-of-rule character.  It recognizes context and cursor
     * indicators.  Once it does a lexical breakdown of the rule at pos, it
     * creates a rule object and adds it to our rule list.
     *
     * @param pos index of the first character of the rule
     * @param limit index at which parsing must stop (exclusive)
     * @return the index after the last character parsed
     */
    int32_t parseRule(int32_t pos, int32_t limit);

    /**
     * Called by main parser upon syntax error.  Search the rule string
     * for the probable end of the rule.  Of course, if the error is that
     * the end of rule marker is missing, then the rule end will not be found.
     * In any case the rule start will be correctly reported.
     * @param msg error description
     * @param rule pattern string
     * @param start position of first character of current rule
     * @return NOTE(review): the meaning of the return value is not
     * documented in this header; confirm against the implementation.
     */
    int32_t syntaxError(const char* msg, const UnicodeString& rule, int32_t start);

    /**
     * Allocate a private-use substitution character for the given set,
     * register the set among the set variables (see
     * <code>setVariablesVector</code>), and return the substitution
     * character.
     * @param adoptedSet the set to register.
     * NOTE(review): the parameter name suggests that the parser takes
     * ownership of the set; confirm against the implementation.
     * @return the stand-in character allocated for the set
     */
    UChar registerSet(UnicodeSet* adoptedSet);
 
    /**
     * Determines what part of the private use region of Unicode we can use for
     * variable stand-ins.  The correct way to do this is as follows: Parse each
     * rule, and for forward and reverse rules, take the FROM expression, and
     * make a hash of all characters used.  The TO expression should be ignored.
     * When done, everything not in the hash is available for use.  In practice,
     * this method may employ some other algorithm for improved speed.
     */
    void determineVariableRange(void);

    /**
     * Returns the index of the first character in a set, ignoring quoted text.
     * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
     * found by a search for "h".  Unlike a plain indexOf(), this method
     * searches not for a single character, but for any character of the string
     * <code>setOfChars</code>.
     * @param text text to be searched
     * @param start the beginning index, inclusive; <code>0 <= start
     * <= limit</code>.
     * @param limit the ending index, exclusive; <code>start <= limit
     * <= text.length()</code>.
     * @param setOfChars string with one or more distinct characters
     * @return Offset of the first character in <code>setOfChars</code>
     * found, or -1 if not found.
     */
    static int32_t quotedIndexOf(const UnicodeString& text,
                                 int32_t start, int32_t limit,
                                 const UnicodeString& setOfChars);

    /**
     * Copying is not supported.  The object holds a reference to a rule
     * string it does not own as well as raw pointers, and it declares a
     * destructor, so a member-wise copy would not be safe to rely on.
     * Both operations are declared private and intentionally left
     * undefined, so that any accidental copy fails at compile or link time.
     */
    TransliterationRuleParser(const TransliterationRuleParser& other);
    TransliterationRuleParser& operator=(const TransliterationRuleParser& other);
};

#endif
-												ICU-114 Transliterator framework first working version

X-SVN-Rev: 194
											
										
										
											1999-11-20 00:40:50 +00:00
+								/*
-												ICU-161 Updated with the correct copyright notice from IP lawyer.

X-SVN-Rev: 225
											
										
										
											1999-11-22 21:47:27 +00:00
+								* Copyright <EFBFBD> {1999}, International Business Machines Corporation and others. All Rights Reserved.
-												ICU-114 Transliterator framework first working version

X-SVN-Rev: 194
											
										
										
											1999-11-20 00:40:50 +00:00
+								**********************************************************************
 								*   Date        Name        Description
 								*   11/17/99    aliu        Creation.
 								**********************************************************************
 								*/
 								#ifndef RBT_PARS_H
 								#define RBT_PARS_H
-												ICU-12 all public include files are now in unicode dir, all private icu_ functions renamed to uprv_

X-SVN-Rev: 473
											
										
										
											1999-12-28 23:57:50 +00:00
+								#include "unicode/rbt.h"
-												ICU-265 map char to set with array instead of hash for better performance

X-SVN-Rev: 728
											
										
										
											2000-02-08 02:49:15 +00:00
+								#include "uvector.h"
-												ICU-114 Transliterator framework first working version

X-SVN-Rev: 194
											
										
										
											1999-11-20 00:40:50 +00:00
 								class TransliterationRuleData;
-												ICU-199 new rule syntax; performance improvement; update rules

X-SVN-Rev: 559
											
										
										
											2000-01-13 07:28:08 +00:00
+								class UnicodeSet;
-												ICU-265 map char to set with array instead of hash for better performance

X-SVN-Rev: 728
											
										
										
											2000-02-08 02:49:15 +00:00
+								class ParseData;
-												ICU-114 Transliterator framework first working version

X-SVN-Rev: 194
											
										
										
											1999-11-20 00:40:50 +00:00
 								class TransliterationRuleParser {
 								    /**
 								     * This is a reference to external data we don't own.  This works because
 								     * we only hold this for the duration of the call to parse().
 								     */
 								    const UnicodeString& rules;
 								    RuleBasedTransliterator::Direction direction;
 								    TransliterationRuleData* data;
 								    /**
 								     * We use a single error code during parsing.  Rather than pass it
 								     * through each API, we keep it here.
 								     */
 								    UErrorCode status;
-												ICU-265 map char to set with array instead of hash for better performance

X-SVN-Rev: 728
											
										
										
											2000-02-08 02:49:15 +00:00
+								    /**
 								     * Temporary symbol table used during parsing.
 								     */
 								    ParseData* parseData;
 								    /**
 								     * Temporary vector of set variables.  When parsing is complete, this
 								     * is copied into the array data.setVariables.  As with data.setVariables,
 								     * element 0 corresponds to character data.setVariablesBase.
 								     */
 								    UVector setVariablesVector;
-												ICU-114 Transliterator framework first working version

X-SVN-Rev: 194
											
										
										
											1999-11-20 00:40:50 +00:00
+								    /**
 								     * The next available stand-in for variables.  This starts at some point in
 								     * the private use area (discovered dynamically) and increments up toward
 								     * <code>variableLimit</code>.  At any point during parsing, available
 								     * variables are <code>variableNext..variableLimit-1</code>.
 								     */
 								    UChar variableNext;
 								    /**
 								     * The last available stand-in for variables.  This is discovered
 								     * dynamically.  At any point during parsing, available variables are
 								     * <code>variableNext..variableLimit-1</code>.
 								     */
 								    UChar variableLimit;
 								    // Operators
 								    static const UChar VARIABLE_DEF_OP;
 								    static const UChar FORWARD_RULE_OP;
 								    static const UChar REVERSE_RULE_OP;
-												ICU-199 new rule syntax; performance improvement; update rules

X-SVN-Rev: 559
											
										
										
											2000-01-13 07:28:08 +00:00
+								    static const UChar FWDREV_RULE_OP; // internal rep of <> op
 								    static const UnicodeString OPERATORS;
-												ICU-114 Transliterator framework first working version

X-SVN-Rev: 194
											
										
										
											1999-11-20 00:40:50 +00:00
 								    // Other special characters
 								    static const UChar QUOTE;
-												ICU-199 new rule syntax; performance improvement; update rules

X-SVN-Rev: 559
											
										
										
											2000-01-13 07:28:08 +00:00
+								    static const UChar ESCAPE;
 								    static const UChar END_OF_RULE;
 								    static const UChar RULE_COMMENT_CHAR;
-												ICU-114 Transliterator framework first working version

X-SVN-Rev: 194
											
										
										
											1999-11-20 00:40:50 +00:00
+								    static const UChar VARIABLE_REF_OPEN;
 								    static const UChar VARIABLE_REF_CLOSE;
 								    static const UChar CONTEXT_OPEN;
 								    static const UChar CONTEXT_CLOSE;
-												ICU-199 new rule syntax; performance improvement; update rules

X-SVN-Rev: 559
											
										
										
											2000-01-13 07:28:08 +00:00
+								    static const UChar SET_OPEN;
 								    static const UChar SET_CLOSE;
-												ICU-114 Transliterator framework first working version

X-SVN-Rev: 194
											
										
										
											1999-11-20 00:40:50 +00:00
+								    static const UChar CURSOR_POS;
 								public:
 								    static TransliterationRuleData*
 								        parse(const UnicodeString& rules,
 								              RuleBasedTransliterator::Direction direction);
 								private:
 								    /**
 								     * @param rules list of rules, separated by newline characters
 								     * @exception IllegalArgumentException if there is a syntax error in the
 								     * rules
 								     */
 								    TransliterationRuleParser(const UnicodeString& rules,
 								                              RuleBasedTransliterator::Direction direction);
-												ICU-265 map char to set with array instead of hash for better performance

X-SVN-Rev: 728
											
										
										
											2000-02-08 02:49:15 +00:00
+								    /**
 								     * Destructor.
 								     */
 								    ~TransliterationRuleParser();
-												ICU-114 Transliterator framework first working version

X-SVN-Rev: 194
											
										
										
											1999-11-20 00:40:50 +00:00
+								    /**
 								     * Parse the given string as a sequence of rules, separated by newline
 								     * characters ('\n'), and cause this object to implement those rules.  Any
 								     * previous rules are discarded.  Typically this method is called exactly
 								     * once, during construction.
 								     * @exception IllegalArgumentException if there is a syntax error in the
 								     * rules
 								     */
-												ICU-200 Updated with OS/400 specific port changes.

X-SVN-Rev: 459
											
										
										
											1999-12-22 22:57:04 +00:00
+								    void parseRules(void);
-												ICU-114 Transliterator framework first working version

X-SVN-Rev: 194
											
										
										
											1999-11-20 00:40:50 +00:00
 								    /**
-												ICU-199 new rule syntax; performance improvement; update rules

X-SVN-Rev: 559
											
										
										
											2000-01-13 07:28:08 +00:00
+								     * MAIN PARSER.  Parse the next rule in the given rule string, starting
 								     * at pos.  Return the index after the last character parsed.  Do not
 								     * parse characters at or after limit.
 								     *
 								     * Important:  The character at pos must be a non-whitespace character
 								     * that is not the comment character.
 								     *
 								     * This method handles quoting, escaping, and whitespace removal.  It
 								     * parses the end-of-rule character.  It recognizes context and cursor
 								     * indicators.  Once it does a lexical breakdown of the rule at pos, it
 								     * creates a rule object and adds it to our rule list.
-												ICU-114 Transliterator framework first working version

X-SVN-Rev: 194
											
										
										
											1999-11-20 00:40:50 +00:00
+								     */
-												ICU-199 new rule syntax; performance improvement; update rules

X-SVN-Rev: 559
											
										
										
											2000-01-13 07:28:08 +00:00
+								    int32_t parseRule(int32_t pos, int32_t limit);
-												ICU-114 Transliterator framework first working version

X-SVN-Rev: 194
											
										
										
											1999-11-20 00:40:50 +00:00
 								    /**
-												ICU-199 new rule syntax; performance improvement; update rules

X-SVN-Rev: 559
											
										
										
											2000-01-13 07:28:08 +00:00
+								     * Called by main parser upon syntax error.  Search the rule string
 								     * for the probable end of the rule.  Of course, if the error is that
 								     * the end of rule marker is missing, then the rule end will not be found.
 								     * In any case the rule start will be correctly reported.
 								     * @param msg error description
 								     * @param rule pattern string
 								     * @param start position of first character of current rule
-												ICU-114 Transliterator framework first working version

X-SVN-Rev: 194
											
										
										
											1999-11-20 00:40:50 +00:00
+								     */
-												ICU-199 new rule syntax; performance improvement; update rules

X-SVN-Rev: 559
											
										
										
											2000-01-13 07:28:08 +00:00
+								    int32_t syntaxError(const char* msg, const UnicodeString&, int32_t start);
-												ICU-114 Transliterator framework first working version

X-SVN-Rev: 194
											
										
										
											1999-11-20 00:40:50 +00:00
 								    /**
-												ICU-199 new rule syntax; performance improvement; update rules

X-SVN-Rev: 559
											
										
										
											2000-01-13 07:28:08 +00:00
+								     * Allocate a private-use substitution character for the given set,
 								     * register it in the setVariables hash, and return the substitution
 								     * character.
-												ICU-114 Transliterator framework first working version

X-SVN-Rev: 194
											
										
										
											1999-11-20 00:40:50 +00:00
+								     */
-												ICU-199 new rule syntax; performance improvement; update rules

X-SVN-Rev: 559
											
										
										
											2000-01-13 07:28:08 +00:00
+								    UChar registerSet(UnicodeSet* adoptedSet);
-												ICU-114 Transliterator framework first working version

X-SVN-Rev: 194
											
										
										
											1999-11-20 00:40:50 +00:00
+								    /**
 								     * Determines what part of the private use region of Unicode we can use for
 								     * variable stand-ins.  The correct way to do this is as follows: Parse each
 								     * rule, and for forward and reverse rules, take the FROM expression, and
 								     * make a hash of all characters used.  The TO expression should be ignored.
 								     * When done, everything not in the hash is available for use.  In practice,
 								     * this method may employ some other algorithm for improved speed.
 								     */
-												ICU-200 Updated with OS/400 specific port changes.

X-SVN-Rev: 459
											
										
										
											1999-12-22 22:57:04 +00:00
+								    void determineVariableRange(void);
-												ICU-114 Transliterator framework first working version

X-SVN-Rev: 194
											
										
										
											1999-11-20 00:40:50 +00:00
 								    /**
 								     * Returns the index of the first character in a set, ignoring quoted text.
 								     * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
 								     * found by a search for "h".  Unlike String.indexOf(), this method searches
 								     * not for a single character, but for any character of the string
 								     * <code>setOfChars</code>.
 								     * @param text text to be searched
 								     * @param start the beginning index, inclusive; <code>0 <= start
 								     * <= limit</code>.
 								     * @param limit the ending index, exclusive; <code>start <= limit
 								     * <= text.length()</code>.
 								     * @param setOfChars string with one or more distinct characters
 								     * @return Offset of the first character in <code>setOfChars</code>
 								     * found, or -1 if not found.
 								     * @see #indexOf
 								     */
 								    static int32_t quotedIndexOf(const UnicodeString& text,
 								                                 int32_t start, int32_t limit,
 								                                 const UnicodeString& setOfChars);
 								};
 								#endif