package com.ibm.text;
import java.util.Hashtable;
import java.util.Vector;
import java.text.ParsePosition;
/**
* A transliterator that reads a set of rules in order to determine how to perform
* translations. Rules are stored in resource bundles indexed by name. Rules are separated by
* semicolons (';'). To include a literal semicolon, prefix it with a backslash ('\;').
* Whitespace, as defined by {@code Character.isWhitespace()}, is ignored. If the first
* non-blank character on a line is '#', the entire line is ignored as a comment.
*
* <p>Each set of rules consists of two groups, one forward, and one reverse. This is a
* convention that is not enforced; rules for one direction may be omitted, with the result
* that translations in that direction will not modify the source text.
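*
* <p><b>Rule syntax</b>
*
* <p>Rule statements take one of the following forms:
* <ul>
* <li>{@code alefmadda=\u0622} -- Variable definition. The name on the left is assigned the
* character on the right. After this statement, instances of the left hand name
* surrounded by braces, "{@code {alefmadda}}", will be
* replaced by the Unicode character U+0622. If the right hand side is longer than one
* character, then it is interpreted as a character category expression; see below for
* details.
* <li>{@code softvowel=[eiyEIY]} -- Category definition. The name on the left is assigned
* to stand for a set of characters. Set patterns include the following:
* </ul>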
* <ul>
* <li>{@code [abc]} -- The set containing the characters 'a', 'b', and 'c'.
* <li>{@code [^abc]} -- The set of all characters except 'a', 'b', and 'c'.
* <li>{@code [A-Z]} -- The set of all characters from 'A' to 'Z' in Unicode order.
* <li>{@code [:Lu:]} -- The set of Unicode uppercase letters. See www.unicode.org
* for a complete list of categories and their two-letter codes.
* <li>{@code [^a-z[:Lu:][:Ll:]]} -- The set of all characters except 'a' through 'z' and
* uppercase or lowercase letters.
* </ul>
*
* <p>See {@link UnicodeSet} for more documentation and examples.
*
* <ul>
* <li>{@code ai>{alefmadda}} -- Forward translation rule.
* <li>{@code ai<{alefmadda}} -- Reverse translation rule.
* <li>{@code ai<>{alefmadda}} -- Bidirectional translation rule.
* </ul>
*
* <p>Forward and reverse translation rules consist of a match pattern and an output
* string. The match pattern consists of literal characters, optionally preceded by
* context, and optionally followed by context. Context characters, like literal pattern
* characters, must be matched in the text being transliterated. However, unlike literal
* pattern characters, they are not replaced by the output text. For example, the pattern
* "{@code (abc)def}" indicates the characters "{@code def}"
* must be preceded by "{@code abc}" for a successful match. If there is a
* successful match, "{@code def}" will be replaced, but not "{@code abc}".
* The initial '{@code (}' is optional, so "{@code abc)def}" is
* equivalent to "{@code (abc)def}". Another example is "{@code 123(456)}"
* (or "{@code 123(456}") in which the literal pattern "{@code 123}"
* must be followed by "{@code 456}".
*
* <p>The output string of a forward or reverse rule consists of characters to replace the
* literal pattern characters. If the output string contains the character '{@code |}',
* this is taken to indicate the location of the <em>cursor</em> after replacement. The
* cursor is the point in the text at which the next replacement, if any, will be applied.
*
* <p>In addition to being defined in variables, {@code UnicodeSet} patterns may be
* embedded directly into rule strings. Thus, the following two rules are equivalent:
*
* <pre>
* vowel=[aeiou]; {vowel}&gt;*;  # One way to do this
* [aeiou]&gt;*;                  # Another way
* </pre>
*
* <p><b>Example</b>
*
* <p>The following example rules illustrate many of the features of the rule language.
* <ul>
* <li>Rule 1. {@code (abc)def>x|y}
* <li>Rule 2. {@code xyz>r}
* <li>Rule 3. {@code yz>q}
* </ul>
*
* <p>Applying these rules to the string "{@code adefabcdefz}" yields the
* following results:
* <ul>
* <li>{@code |adefabcdefz} -- Initial state, no rules match. Advance cursor.
* <li>{@code a|defabcdefz} -- Still no match. Rule 1 does not match because the preceding
* context is not present.
* <li>{@code ad|efabcdefz} -- Still no match. Keep advancing until there is a match...
* <li>{@code ade|fabcdefz} -- ...
* <li>{@code adef|abcdefz} -- ...
* <li>{@code adefa|bcdefz} -- ...
* <li>{@code adefab|cdefz} -- ...
* <li>{@code adefabc|defz} -- Rule 1 matches; replace "{@code def}" with "{@code xy}"
* and back up the cursor to before the '{@code y}'.
* <li>{@code adefabcx|yz} -- Although "{@code xyz}" is present, rule 2 does not match because the
* cursor is before the '{@code y}', not before the '{@code x}'. Rule 3 does match.
* Replace "{@code yz}" with "{@code q}".
* <li>{@code adefabcxq|} -- The cursor is at the end; transliteration is complete.
* </ul>
*
* <p>The order of rules is significant. If multiple rules may match at some point, the first
* matching rule is applied.
*
* <p>Forward and reverse rules may have an empty output string. Otherwise, an empty left or
* right hand side of any statement is a syntax error.
*
* <p>Single quotes are used to quote the special characters {@code =><{}[]()|}.
* To specify a single quote itself, inside or outside of quotes, use two single quotes in a
* row. For example, the rule "{@code '>'>o''clock}" changes the string
* "{@code >}" to the string "{@code o'clock}".
*
* <p><b>Notes</b>
*
* <p>While a RuleBasedTransliterator is being built, it checks that the rules are added in
* proper order. For example, if the rule "{@code a>x}" is followed by the rule
* "{@code ab>y}", then the second rule will throw an exception. The reason is that the
* second rule can never be triggered, since the first rule always matches anything it
* matches. In other words, the first rule <em>masks</em> the second rule.
*
* <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.
*
* @author Alan Liu
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.10 $ $Date: 2000/01/13 23:53:23 $
*
* $Log: RuleBasedTransliterator.java,v $
* Revision 1.10 2000/01/13 23:53:23 Alan
* Fix bugs found during ICU port
*
* Revision 1.9 2000/01/11 04:12:06 Alan
* Cleanup, embellish comments
*
* Revision 1.8 2000/01/11 02:25:03 Alan
* Rewrite UnicodeSet and RBT parsers for better performance and new syntax
*
* Revision 1.7 2000/01/06 01:36:36 Alan
* Allow string arrays in rule resource bundles
*
* Revision 1.6 2000/01/04 21:43:57 Alan
* Add rule indexing, and move masking check to TransliterationRuleSet.
*
* Revision 1.5 1999/12/22 01:40:54 Alan
* Consolidate rule pattern anteContext, key, and postContext into one string.
*
* Revision 1.4 1999/12/22 01:05:54 Alan
* Improve masking checking; turn it off by default, for better performance
*/
public class RuleBasedTransliterator extends Transliterator {

/**
 * Direction constant passed to constructor to create a transliterator
 * using the forward rules.
 */
public static final int FORWARD = 0;

/**
 * Direction constant passed to constructor to create a transliterator
 * using the reverse rules.
 */
public static final int REVERSE = 1;

/**
 * Rule data consulted by the transliterate methods (its ruleSet and
 * setVariables members). It is either produced by parse() or handed
 * directly to the package-private constructor.
 */
private Data data;

/**
 * Compile-time debugging switch. When true, the transliteration methods
 * print a trace of each step.
 */
static final boolean DEBUG = false;

/** Copyright notice embedded in the class. */
private static final String COPYRIGHT =
    "\u00A9 IBM Corporation 1999. All rights reserved.";

/**
 * Constructs a new transliterator from the given rules.
 * @param ID identifier passed to the Transliterator superclass constructor
 * @param rules rules, separated by ';'
 * @param direction either FORWARD or REVERSE.
 * @param filter filter passed to the superclass constructor; the
 * two-argument constructor of this class passes null here
 * @exception IllegalArgumentException if rules are malformed
 * or direction is invalid.
 */
public RuleBasedTransliterator(String ID, String rules,
                               int direction, UnicodeFilter filter) {
    super(ID, filter);
    // Reject anything other than the two declared direction constants
    // before any parsing work is done.
    if (direction != FORWARD && direction != REVERSE) {
        throw new IllegalArgumentException("Invalid direction");
    }
    data = parse(rules, direction);
}

/**
* Constructs a new transliterator from the given rules in the *FORWARD
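* direction.
* @param ID identifier for this transliterator; passed through unchanged to the
* four-argument constructor
* @param rules rules, separated by ';'
* @exception IllegalArgumentException if rules are malformed (the direction is
* always FORWARD here, so it cannot be invalid).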
*/
public RuleBasedTransliterator(String ID, String rules) {
// Delegate to the four-argument constructor, selecting the FORWARD rules
// and supplying null as the filter.
this(ID, rules, FORWARD, null);
}
/**
 * Package-private constructor that adopts an existing rule Data object
 * instead of parsing rule text. The Data reference is stored as-is; no copy
 * is made here.
 * @param ID identifier passed to the superclass constructor
 * @param data rule data to be used by this transliterator
 * @param filter filter passed to the superclass constructor
 */
RuleBasedTransliterator(String ID, Data data, UnicodeFilter filter) {
super(ID, filter);
this.data = data;
}
/**
 * Builds rule data from an array of rule strings by running a Parser over
 * them and returning the Data object it holds.
 * @param rules the rule strings handed to the Parser
 * @param direction the direction value handed to the Parser
 * @return the Data object obtained from the Parser
 */
static Data parse(String[] rules, int direction) {
    Parser parser = new Parser(rules, direction);
    return parser.getData();
}
/**
 * Convenience overload: wraps a single rule string in a one-element array
 * and forwards to {@code parse(String[], int)}.
 * @param rules the rule text
 * @param direction the direction value handed on to the array overload
 * @return the Data object returned by the array overload
 */
static Data parse(String rules, int direction) {
    String[] singleRuleArray = { rules };
    return parse(singleRuleArray, direction);
}
/**
 * Transliterates a segment of a string. <code>Transliterator</code> API.
 * @param text the string to be transliterated
 * @param start the beginning index, inclusive; {@code 0 <= start <= limit}.
 * @param limit the ending index, exclusive;
 * {@code start <= limit <= text.length()}.
 * @param result buffer to receive the transliterated text; previous
 * contents are discarded
 */
public void transliterate(String text, int start, int limit,
                          StringBuffer result) {
    /* The loop works on a "virtual buffer": the text produced so far
     * (result) followed by the source text that has not been consumed yet
     * (text.substring(start, limit)). The cursor is an index into that
     * virtual buffer; it may sit inside result or exactly at its end,
     * i.e. just before the unconsumed source text.
     *
     * Example: rules 1. ab>x|y
     *                2. yc>z
     *
     * []|eabcd   start - no match, copy e to the result buffer
     * [e]|abcd   match rule 1 - copy output & adjust cursor
     * [ex|y]cd   match rule 2 - copy output & adjust cursor
     * [exz]|d    no match, copy d to the result buffer
     * [exzd]|    done
     *
     * Matching is always attempted at the cursor. When nothing matches,
     * the cursor moves forward by one; if it was at the end of result, one
     * character is first moved from the source text into result.
     *
     * start and limit delimit the part of the source text that has not
     * been processed yet (offsets start..limit-1).
     */
    int cursor = 0;
    result.setLength(0);

    while (start < limit || cursor < result.length()) {
        TransliterationRule rule = data.ruleSet.findMatch(
                text, start, limit, result, cursor,
                data.setVariables, getFilter());

        if (DEBUG) {
            StringBuffer trace = new StringBuffer(
                    result.toString() + '#' + text.substring(start, limit));
            // Skip over the '#' separator when the cursor lies beyond result.
            int barIndex = cursor;
            if (cursor > result.length()) {
                barIndex = cursor + 1;
            }
            trace.insert(barIndex, '|');
            System.err.print((rule == null ? "nomatch:" : ("match:" + rule + ", "))
                             + trace);
        }

        if (rule != null) {
            int keyLength = rule.getKeyLength();
            // Number of result characters to the right of the cursor; >= 0.
            int pending = result.length() - cursor;
            String tail = null;
            if (keyLength > pending) {
                // The key extends into the source text: consume that part.
                start += keyLength - pending;
            } else if (keyLength < pending) {
                // Result characters beyond the key must survive the
                // replacement; save them before truncating.
                tail = result.substring(cursor + keyLength);
            }
            result.setLength(cursor);
            result.append(rule.getOutput());
            if (tail != null) {
                result.append(tail);
            }
            cursor += rule.getCursorPos();
        } else {
            if (cursor == result.length()) {
                result.append(text.charAt(start));
                ++start;
            }
            cursor++;
        }

        if (DEBUG) {
            StringBuffer trace = new StringBuffer(
                    result.toString() + '#' + text.substring(start, limit));
            int barIndex = cursor;
            if (cursor > result.length()) {
                barIndex = cursor + 1;
            }
            trace.insert(barIndex, '|');
            System.err.println(" => " + trace);
        }
    }
}
/**
 * Transliterates a segment of a string. <code>Transliterator</code> API.
 * @param text the string to be transliterated
 * @param start the beginning index, inclusive; {@code 0 <= start <= limit}.
 * @param limit the ending index, exclusive;
 * {@code start <= limit <= text.length()}.
 * @return The new limit index
 */
public int transliterate(Replaceable text, int start, int limit) {
    /* With a Replaceable there is only one buffer, so the algorithm is
     * simpler than the String version. start stays fixed; limit is
     * adjusted numerically whenever a replacement changes the text
     * length. The cursor walks from start to limit and replacements
     * happen at the cursor.
     *
     * Example: rules 1. ab>x|y
     *                2. yc>z
     *
     * |eabcd   start - no match, advance cursor
     * e|abcd   match rule 1 - change text & adjust cursor
     * ex|ycd   match rule 2 - change text & adjust cursor
     * exz|d    no match, advance cursor
     * exzd|    done
     */
    int cursor = start;
    while (cursor < limit) {
        TransliterationRule rule = data.ruleSet.findMatch(
                text, start, limit, cursor,
                data.setVariables, getFilter());
        if (rule == null) {
            cursor++;
            continue;
        }
        int keyLength = rule.getKeyLength();
        text.replace(cursor, cursor + keyLength, rule.getOutput());
        // Shift limit by the net change in text length.
        limit += rule.getOutput().length() - keyLength;
        cursor += rule.getCursorPos();
    }
    return limit;
}
/**
 * Implements {@link Transliterator#handleKeyboardTransliterate}.
 * Reads the START, LIMIT and CURSOR entries of <code>index</code>, applies
 * rules at the cursor until the cursor reaches the limit or a partial
 * match is reported, and then writes the updated LIMIT and CURSOR back
 * into <code>index</code>.
 * @param text the text being edited in place
 * @param index array holding the START, LIMIT and CURSOR offsets
 */
protected void handleKeyboardTransliterate(Replaceable text,
                                           int[] index) {
    int start = index[START];
    int limit = index[LIMIT];
    int cursor = index[CURSOR];

    if (DEBUG) {
        System.out.print("\"" +
            escape(rsubstring(text, start, cursor)) + '|' +
            escape(rsubstring(text, cursor, limit)) + "\"");
    }

    // Single-element array used as an out-parameter by findIncrementalMatch.
    boolean[] partial = new boolean[1];

    while (cursor < limit) {
        TransliterationRule rule = data.ruleSet.findIncrementalMatch(
                text, start, limit, cursor,
                data.setVariables, partial, getFilter());

        if (rule != null) {
            // Full match: replace the key with the rule output and
            // reposition the cursor.
            int keyLength = rule.getKeyLength();
            text.replace(cursor, cursor + keyLength, rule.getOutput());
            limit += rule.getOutput().length() - keyLength;
            cursor += rule.getCursorPos();
        } else if (partial[0]) {
            // Partial match: nothing can be done without more text, so
            // stop with the cursor where it is.
            break;
        } else {
            // No match at this position: advance.
            ++cursor;
        }
    }

    if (DEBUG) {
        System.out.println(" -> \"" +
            escape(rsubstring(text, start, cursor)) + '|' +
            escape(rsubstring(text, cursor, cursor)) + '|' +
            escape(rsubstring(text, cursor, limit)) + "\"");
    }

    index[LIMIT] = limit;
    index[CURSOR] = cursor;
}
/**
 * Returns the length of the longest context required by this transliterator.
 * This is <em>preceding</em> context. The value is obtained from the rule
 * set held in this transliterator's data.
 * @return Maximum number of preceding context characters this
 * transliterator needs to examine
 */
protected int getMaximumContextLength() {
    int longestContext = data.ruleSet.getMaximumContextLength();
    return longestContext;
}
/**
 * FOR DEBUGGING: Return a substring of a Replaceable, built by reading the
 * characters at offsets start..limit-1 one at a time. Yields the empty
 * string when start is not less than limit.
 */
private static String rsubstring(Replaceable r, int start, int limit) {
    StringBuffer chars = new StringBuffer();
    for (int pos = start; pos < limit; ++pos) {
        chars.append(r.charAt(pos));
    }
    return chars.toString();
}
/**
* FOR DEBUGGING: Escape non-ASCII characters as Unicode.
*/
private static final String escape(String s) {
StringBuffer buf = new StringBuffer();
for (int i=0; ivariableLimit
. At any point during parsing, available
* variables are variableNext..variableLimit-1
.
*/
// Lower end of the range of stand-in characters still available for
// variables; the available range is variableNext..variableLimit-1.
private char variableNext;
/**
 * The last available stand-in for variables. This is discovered
 * dynamically. At any point during parsing, available variables are
 * <code>variableNext..variableLimit-1</code>.
 * NOTE(review): the range above treats variableLimit as exclusive, which
 * reads as one past the last available stand-in -- confirm against the code
 * that sets this field.
 */
private char variableLimit;
// Operators
// '=' separates a variable or category name from its definition.
private static final char VARIABLE_DEF_OP = '=';
// '>' marks a forward translation rule.
private static final char FORWARD_RULE_OP = '>';
// '<' marks a reverse translation rule.
private static final char REVERSE_RULE_OP = '<';
// Single-character stand-in used internally for the two-character "<>" operator.
private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op
// The operator characters as they appear in rule text.
private static final String OPERATORS = "=><";
// Other special characters
// Single quote; the class documentation describes it as quoting special characters.
private static final char QUOTE = '\'';
// Backslash; the class documentation describes it as prefixing a literal semicolon.
private static final char ESCAPE = '\\';
// Semicolon; rules are separated by it.
private static final char END_OF_RULE = ';';
// '#' as the first non-blank character on a line marks a comment line.
private static final char RULE_COMMENT_CHAR = '#';
// Braces surround a variable reference, as in {name}.
private static final char VARIABLE_REF_OPEN = '{';
private static final char VARIABLE_REF_CLOSE = '}';
// Parentheses delimit context in a match pattern, as in (abc)def.
private static final char CONTEXT_OPEN = '(';
private static final char CONTEXT_CLOSE = ')';
// Square brackets delimit a set pattern, as in [aeiou].
private static final char SET_OPEN = '[';
private static final char SET_CLOSE = ']';
// '|' in an output string marks the cursor location after replacement.
private static final char CURSOR_POS = '|';
/**
 * Creates a parser, records the direction, allocates a fresh Data object
 * and immediately parses the given rules.
 * @param ruleArray array of rule strings; rules are separated by semicolon
 * characters
 * @param direction the direction value stored in this parser
 * @exception IllegalArgumentException if there is a syntax error in the
 * rules
 */
public Parser(String[] ruleArray, int direction) {
// direction and data are assigned before parseRules runs.
// NOTE(review): presumably parseRules reads both; its body is not visible
// here -- confirm before reordering.
this.direction = direction;
data = new Data();
parseRules(ruleArray);
}
/**
 * Returns the Data object that this parser's constructor created.
 * @return this parser's Data object
 */
public Data getData() {
    return this.data;
}
/**
* Parse an array of zero or more rules. The strings in the array are
* treated as if they were concatenated together, with rule terminators
* inserted between array elements if not present already.
*
* Any previous rules are discarded. Typically this method is called exactly
* once, during construction.
* @exception IllegalArgumentException if there is a syntax error in the
* rules
*/
private void parseRules(String[] ruleArray) {
determineVariableRange(ruleArray);
StringBuffer errors = null;
try {
for (int i=0; isetOfChars
.
* @param text text to be searched
* @param start the beginning index, inclusive; {@code 0 <= start <= limit}.
* @param limit the ending index, exclusive; {@code start <= limit <= text.length()}.
* @param setOfChars string with one or more distinct characters
* @return Offset of the first character in setOfChars
* found, or -1 if not found.
* @see #indexOf
*/
private static int quotedIndexOf(String text, int start, int limit,
String setOfChars) {
for (int i=start; i