/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#ifndef RBT_H
#define RBT_H
#include "unicode/translit.h"
#include "unicode/utypes.h"
#include "unicode/parseerr.h"
class TransliterationRuleData;
/**
* A transliterator that reads a set of rules in order to determine how to perform
* translations. Rules are stored in resource bundles indexed by name. Rules are separated by
* semicolons (';'). To include a literal semicolon, prefix it with a backslash ('\;').
* Whitespace, as defined by Character.isWhitespace(), is ignored. If the first
* non-blank character on a line is '#', the entire line is ignored as a comment.
*
* Each set of rules consists of two groups, one forward, and one reverse. This is a
* convention that is not enforced; rules for one direction may be omitted, with the result
* that translations in that direction will not modify the source text.
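*
* Rule syntax
*
* Rule statements take one of the following forms:
*
*   alefmadda=\u0622
*       Variable definition. After this statement, instances of the left hand name
*       surrounded by braces, "{alefmadda}", will be
*       replaced by the Unicode character U+0622. If the right hand side is longer than one
*       character, then it is interpreted as a character category expression; see below for
*       details.
*
*   softvowel=[eiyEIY]
*       Category definition. The name on the left stands for the set of characters
*       described by the pattern on the right. Examples of set patterns:
*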
*   [abc]               The set containing the characters 'a', 'b', and 'c'.
*   [^abc]              The set of all characters except 'a', 'b', and 'c'.
*   [A-Z]               The set of all characters from 'A' to 'Z' in Unicode order.
*   [:Lu:]              The set of Unicode uppercase letters. See www.unicode.org
*                       for a complete list of categories and their two-letter codes.
*   [^a-z[:Lu:][:Ll:]]  The set of all characters except 'a' through 'z' and
*                       uppercase or lowercase letters.
*
* See {@link UnicodeSet} for more documentation and examples.
*
*   ai>{alefmadda}      (forward translation rule)
*   ai<{alefmadda}      (reverse translation rule)
*   ai<>{alefmadda}     (bidirectional translation rule)
*
* Forward and reverse translation rules consist of a match pattern and an output
* string. The match pattern consists of literal characters, optionally preceded by
* context, and optionally followed by context. Context characters, like literal pattern
* characters, must be matched in the text being transliterated. However, unlike literal
* pattern characters, they are not replaced by the output text. For example, the pattern
* "(abc)def" indicates the characters "def" must be preceded by "abc" for a successful
* match. If there is a successful match, "def" will be replaced, but not "abc".
* The initial '(' is optional, so "abc)def" is equivalent to "(abc)def". Another
* example is "123(456)" (or "123(456") in which the literal pattern "123"
* must be followed by "456".
*
* The output string of a forward or reverse rule consists of characters to replace the
* literal pattern characters. If the output string contains the character '|',
* this is taken to indicate the location of the cursor after replacement. The
* cursor is the point in the text at which the next replacement, if any, will be applied.
*
* In addition to being defined in variables, UnicodeSet patterns may be
* embedded directly into rule strings. Thus, the following two rules are equivalent:
*
*   vowel=[aeiou]; {vowel}>*;   # One way to do this
*   [aeiou]>*;                  # Another way
*
* Example
*
* The following example rules illustrate many of the features of the rule language.
*
*   Rule 1.  (abc)def>x|y
*   Rule 2.  xyz>r
*   Rule 3.  yz>q
*
* Applying these rules to the string "adefabcdefz" yields the
* following results:
*
*   |adefabcdefz   Initial state, no rules match. Advance cursor.
*   a|defabcdefz   Still no match. Rule 1 does not match because the preceding
*                  context is not present.
*   ad|efabcdefz   Still no match. Keep advancing until there is a match...
*   ade|fabcdefz   ...
*   adef|abcdefz   ...
*   adefa|bcdefz   ...
*   adefab|cdefz   ...
*   adefabc|defz   Rule 1 matches; replace "def" with "xy"
*                  and back up the cursor to before the 'y'.
*   adefabcx|yz    Although "xyz" is present, rule 2 does not match because the
*                  cursor is before the 'y', not before the 'x'. Rule 3 does match.
*                  Replace "yz" with "q".
*   adefabcxq|     The cursor is at the end; transliteration is complete.
*
* The order of rules is significant. If multiple rules may match at some point, the first
* matching rule is applied.
*
* Forward and reverse rules may have an empty output string. Otherwise, an empty left or
* right hand side of any statement is a syntax error.
*
* Single quotes are used to quote the special characters =><{}[]()|.
* To specify a single quote itself, inside or outside of quotes, use two single quotes in a
* row. For example, the rule "'>'>o''clock" changes the string
* ">" to the string "o'clock".
*
* Notes
*
* While a RuleBasedTransliterator is being built, it checks that the rules are added in
* proper order. For example, if the rule "a>x" is followed by the rule
* "ab>y", then the second rule will throw an exception. The reason is that the
* second rule can never be triggered, since the first rule always matches anything it
* matches. In other words, the first rule masks the second rule.
* * @author Alan Liu * @draft */ class U_I18N_API RuleBasedTransliterator : public Transliterator { /** * The data object is immutable, so we can freely share it with * other instances of RBT, as long as we do NOT own this object. */ TransliterationRuleData* data; /** * If true, we own the data object and must delete it. */ bool_t isDataOwned; public: /** * Constructs a new transliterator from the given rules. * @param rules rules, separated by ';' * @param direction either FORWARD or REVERSE. * @exception IllegalArgumentException if rules are malformed * or direction is invalid. * @draft */ RuleBasedTransliterator(const UnicodeString& ID, const UnicodeString& rules, Direction direction, UnicodeFilter* adoptedFilter, ParseError& parseError, UErrorCode& status); /** * Constructs a new transliterator from the given rules. * @param rules rules, separated by ';' * @param direction either FORWARD or REVERSE. * @exception IllegalArgumentException if rules are malformed * or direction is invalid. */ RuleBasedTransliterator(const UnicodeString& ID, const UnicodeString& rules, Direction direction, UnicodeFilter* adoptedFilter, UErrorCode& status); /** * Covenience constructor with no filter. * @draft */ RuleBasedTransliterator(const UnicodeString& ID, const UnicodeString& rules, Direction direction, UErrorCode& status); /** * Covenience constructor with no filter and FORWARD direction. * @draft */ RuleBasedTransliterator(const UnicodeString& ID, const UnicodeString& rules, UErrorCode& status); /** * Covenience constructor with FORWARD direction. * @draft */ RuleBasedTransliterator(const UnicodeString& ID, const UnicodeString& rules, UnicodeFilter* adoptedFilter, UErrorCode& status); /** * Covenience constructor. * @draft */ RuleBasedTransliterator(const UnicodeString& ID, const TransliterationRuleData* theData, UnicodeFilter* adoptedFilter = 0); /** * Copy constructor. 
* @draft */ RuleBasedTransliterator(const RuleBasedTransliterator&); virtual ~RuleBasedTransliterator(); /** * Implement Transliterator API. * @draft */ Transliterator* clone(void) const; /** * Implements {@link Transliterator#handleTransliterate}. * @draft */ virtual void handleTransliterate(Replaceable& text, Position& offsets, bool_t isIncremental) const; /** * Parse error codes generated by RuleBasedTransliterator. * See parseerr.h. */ enum { PARSE_ERROR_BASE = 0x10000, DUPLICATE_VARIABLE_DEFINITION, MALFORMED_RHS, MALFORMED_RULE, MALFORMED_SET, MALFORMED_UNICODE_ESCAPE, MALFORMED_VARIABLE_REFERENCE, MISSING_OPERATOR, MULTIPLE_ANTE_CONTEXTS, MULTIPLE_CURSORS, MULTIPLE_POST_CONTEXTS, TEXT_AFTER_CLOSE_CONTEXT, TRAILING_BACKSLASH, UNDEFINED_VARIABLE, UNEXPECTED_CLOSE_CONTEXT, UNQUOTED_SPECIAL, UNTERMINATED_QUOTE }; private: void _construct(const UnicodeString& rules, Direction direction, UErrorCode& status, ParseError* parseError = 0); }; /** * Constructs a new transliterator from the given rules. * @param rules rules, separated by ';' * @param direction either FORWARD or REVERSE. * @exception IllegalArgumentException if rules are malformed * or direction is invalid. */ inline RuleBasedTransliterator::RuleBasedTransliterator( const UnicodeString& ID, const UnicodeString& rules, Direction direction, UnicodeFilter* adoptedFilter, ParseError& parseError, UErrorCode& status) : Transliterator(ID, adoptedFilter) { _construct(rules, direction, status, &parseError); } /** * Constructs a new transliterator from the given rules. * @param rules rules, separated by ';' * @param direction either FORWARD or REVERSE. * @exception IllegalArgumentException if rules are malformed * or direction is invalid. 
*/ inline RuleBasedTransliterator::RuleBasedTransliterator( const UnicodeString& ID, const UnicodeString& rules, Direction direction, UnicodeFilter* adoptedFilter, UErrorCode& status) : Transliterator(ID, adoptedFilter) { _construct(rules, direction, status); } /** * Covenience constructor with no filter. */ inline RuleBasedTransliterator::RuleBasedTransliterator( const UnicodeString& ID, const UnicodeString& rules, Direction direction, UErrorCode& status) : Transliterator(ID, 0) { _construct(rules, direction, status); } /** * Covenience constructor with no filter and FORWARD direction. */ inline RuleBasedTransliterator::RuleBasedTransliterator( const UnicodeString& ID, const UnicodeString& rules, UErrorCode& status) : Transliterator(ID, 0) { _construct(rules, FORWARD, status); } /** * Covenience constructor with FORWARD direction. */ inline RuleBasedTransliterator::RuleBasedTransliterator( const UnicodeString& ID, const UnicodeString& rules, UnicodeFilter* adoptedFilter, UErrorCode& status) : Transliterator(ID, adoptedFilter) { _construct(rules, FORWARD, status); } #endif