scuffed-code/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java
Alan Liu fb7313675a Fix bugs found during ICU port
X-SVN-Rev: 575
2000-01-13 23:53:23 +00:00

1117 lines
46 KiB
Java
Executable File

package com.ibm.text;
import java.util.Hashtable;
import java.util.Vector;
import java.text.ParsePosition;
/**
* A transliterator that reads a set of rules in order to determine how to perform
* translations. Rules are stored in resource bundles indexed by name. Rules are separated by
* semicolons (';'). To include a literal semicolon, prefix it with a backslash ('\;').
* Whitespace, as defined by <code>Character.isWhitespace()</code>, is ignored. If the first
* non-blank character on a line is '#', the entire line is ignored as a comment.
*
* <p>Each set of rules consists of two groups, one forward, and one reverse. This is a
* convention that is not enforced; rules for one direction may be omitted, with the result
* that translations in that direction will not modify the source text. </p>
*
* <p><b>Rule syntax</b> </p>
*
* <p>Rule statements take one of the following forms:
*
* <dl>
* <dt><code>alefmadda=\u0622</code></dt>
* <dd><strong>Variable definition.</strong> The name on the left is assigned the character or
* expression on the right. Names may not contain any special characters (see list below).
* Duplicate names (including duplicates of simple variables or category names) cause an
* exception to be thrown. If the right hand side consists of one character, then the
* variable stands for that character. In this example, after this statement, instances of
* the left hand name surrounded by braces, &quot;<code>{alefmadda}</code>&quot;, will be
* replaced by the Unicode character U+0622. If the right hand side is longer than one
* character, then it is interpreted as a character category expression; see below for
* details.</dd>
* <dt>&nbsp;</dt>
* <dt><code>softvowel=[eiyEIY]</code></dt>
* <dd><strong>Category definition.</strong> The name on the left is assigned to stand for a
* set of characters. The same rules for names of simple variables apply. After this
* statement, the left hand variable will be interpreted as indicating a set of characters in
* appropriate contexts. The pattern syntax defining sets of characters is defined by {@link
* UnicodeSet}. Examples of valid patterns are:<table>
* <tr valign="top">
* <td nowrap><code>[abc]</code></td>
* <td>The set containing the characters 'a', 'b', and 'c'.</td>
* </tr>
* <tr valign="top">
* <td nowrap><code>[^abc]</code></td>
* <td>The set of all characters <em>except</em> 'a', 'b', and 'c'.</td>
* </tr>
* <tr valign="top">
* <td nowrap><code>[A-Z]</code></td>
* <td>The set of all characters from 'A' to 'Z' in Unicode order.</td>
* </tr>
* <tr valign="top">
* <td nowrap><code>[:Lu:]</code></td>
* <td>The set of Unicode uppercase letters. See <a href="http://www.unicode.org">www.unicode.org</a>
* for a complete list of categories and their two-letter codes.</td>
* </tr>
* <tr valign="top">
* <td nowrap><code>[^a-z[:Lu:][:Ll:]]</code></td>
* <td>The set of all characters <em>except</em> 'a' through 'z' and uppercase or lowercase
* letters.</td>
* </tr>
* </table>
* <p>See {@link UnicodeSet} for more documentation and examples. </p>
* </dd>
* <dt><code>ai&gt;{alefmadda}</code></dt>
* <dd><strong>Forward translation rule.</strong> This rule states that the string on the left
* will be changed to the string on the right when performing forward transliteration.</dd>
* <dt>&nbsp;</dt>
* <dt><code>ai&lt;{alefmadda}</code></dt>
* <dd><strong>Reverse translation rule.</strong> This rule states that the string on the right
* will be changed to the string on the left when performing reverse transliteration.</dd>
* </dl>
*
* <dl>
* <dt><code>ai&lt;&gt;{alefmadda}</code></dt>
* <dd><strong>Bidirectional translation rule.</strong> This rule states that the string on the
* left will be changed to the string on the right when performing forward transliteration,
* and vice versa when performing reverse transliteration.</dd>
* </dl>
*
* <p>Forward and reverse translation rules consist of a <em>match pattern</em> and an <em>output
* string</em>. The match pattern consists of literal characters, optionally preceded by
* context, and optionally followed by context. Context characters, like literal pattern
* characters, must be matched in the text being transliterated. However, unlike literal
* pattern characters, they are not replaced by the output text. For example, the pattern
* &quot;<code>(abc)def</code>&quot; indicates the characters &quot;<code>def</code>&quot;
* must be preceded by &quot;<code>abc</code>&quot; for a successful match. If there is a
* successful match, &quot;<code>def</code>&quot; will be replaced, but not &quot;<code>abc</code>&quot;.
* The initial '<code>(</code>' is optional, so &quot;<code>abc)def</code>&quot; is
* equivalent to &quot;<code>(abc)def</code>&quot;. Another example is &quot;<code>123(456)</code>&quot;
* (or &quot;<code>123(456</code>&quot;) in which the literal pattern &quot;<code>123</code>&quot;
* must be followed by &quot;<code>456</code>&quot;. </p>
*
* <p>The output string of a forward or reverse rule consists of characters to replace the
* literal pattern characters. If the output string contains the character '<code>|</code>',
* this is taken to indicate the location of the <em>cursor</em> after replacement. The
* cursor is the point in the text at which the next replacement, if any, will be applied. </p>
*
* <p>In addition to being defined in variables, <code>UnicodeSet</code> patterns may be
* embedded directly into rule strings. Thus, the following two rules are equivalent:</p>
*
* <blockquote>
* <p><code>vowel=[aeiou]; {vowel}&gt;*; # One way to do this<br>
* [aeiou]&gt;*;
* &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #
* Another way</code></p>
* </blockquote>
*
* <p><b>Example</b> </p>
*
* <p>The following example rules illustrate many of the features of the rule language. </p>
*
* <table cellpadding="4">
* <tr valign="top">
* <td>Rule 1.</td>
* <td nowrap><code>(abc)def&gt;x|y</code></td>
* </tr>
* <tr valign="top">
* <td>Rule 2.</td>
* <td nowrap><code>xyz&gt;r</code></td>
* </tr>
* <tr valign="top">
* <td>Rule 3.</td>
* <td nowrap><code>yz&gt;q</code></td>
* </tr>
* </table>
*
* <p>Applying these rules to the string &quot;<code>adefabcdefz</code>&quot; yields the
* following results: </p>
*
* <table cellpadding="4">
* <tr valign="top">
* <td nowrap><code>|adefabcdefz</code></td>
* <td>Initial state, no rules match. Advance cursor.</td>
* </tr>
* <tr valign="top">
* <td nowrap><code>a|defabcdefz</code></td>
* <td>Still no match. Rule 1 does not match because the preceding context is not present.</td>
* </tr>
* <tr valign="top">
* <td nowrap><code>ad|efabcdefz</code></td>
* <td>Still no match. Keep advancing until there is a match...</td>
* </tr>
* <tr valign="top">
* <td nowrap><code>ade|fabcdefz</code></td>
* <td>...</td>
* </tr>
* <tr valign="top">
* <td nowrap><code>adef|abcdefz</code></td>
* <td>...</td>
* </tr>
* <tr valign="top">
* <td nowrap><code>adefa|bcdefz</code></td>
* <td>...</td>
* </tr>
* <tr valign="top">
* <td nowrap><code>adefab|cdefz</code></td>
* <td>...</td>
* </tr>
* <tr valign="top">
* <td nowrap><code>adefabc|defz</code></td>
* <td>Rule 1 matches; replace &quot;<code>def</code>&quot; with &quot;<code>xy</code>&quot;
* and back up the cursor to before the '<code>y</code>'.</td>
* </tr>
* <tr valign="top">
* <td nowrap><code>adefabcx|yz</code></td>
* <td>Although &quot;<code>xyz</code>&quot; is present, rule 2 does not match because the
* cursor is before the '<code>y</code>', not before the '<code>x</code>'. Rule 3 does match.
* Replace &quot;<code>yz</code>&quot; with &quot;<code>q</code>&quot;.</td>
* </tr>
* <tr valign="top">
* <td nowrap><code>adefabcxq|</code></td>
* <td>The cursor is at the end; transliteration is complete.</td>
* </tr>
* </table>
*
* <p>The order of rules is significant. If multiple rules may match at some point, the first
* matching rule is applied. </p>
*
* <p>Forward and reverse rules may have an empty output string. Otherwise, an empty left or
* right hand side of any statement is a syntax error. </p>
*
* <p>Single quotes are used to quote the special characters <code>=&gt;&lt;{}[]()|</code>.
* To specify a single quote itself, inside or outside of quotes, use two single quotes in a
* row. For example, the rule &quot;<code>'&gt;'&gt;o''clock</code>&quot; changes the string
* &quot;<code>&gt;</code>&quot; to the string &quot;<code>o'clock</code>&quot;. </p>
*
* <p><b>Notes</b> </p>
*
* <p>While a RuleBasedTransliterator is being built, it checks that the rules are added in
* proper order. For example, if the rule &quot;a&gt;x&quot; is followed by the rule
* &quot;ab&gt;y&quot;, then the second rule will throw an exception. The reason is that the
* second rule can never be triggered, since the first rule always matches anything it
* matches. In other words, the first rule <em>masks</em> the second rule. </p>
*
* <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
*
* @author Alan Liu
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.10 $ $Date: 2000/01/13 23:53:23 $
*
* $Log: RuleBasedTransliterator.java,v $
* Revision 1.10 2000/01/13 23:53:23 Alan
* Fix bugs found during ICU port
*
* Revision 1.9 2000/01/11 04:12:06 Alan
* Cleanup, embellish comments
*
* Revision 1.8 2000/01/11 02:25:03 Alan
* Rewrite UnicodeSet and RBT parsers for better performance and new syntax
*
* Revision 1.7 2000/01/06 01:36:36 Alan
* Allow string arrays in rule resource bundles
*
* Revision 1.6 2000/01/04 21:43:57 Alan
* Add rule indexing, and move masking check to TransliterationRuleSet.
*
* Revision 1.5 1999/12/22 01:40:54 Alan
* Consolidate rule pattern anteContext, key, and postContext into one string.
*
* Revision 1.4 1999/12/22 01:05:54 Alan
* Improve masking checking; turn it off by default, for better performance
*/
public class RuleBasedTransliterator extends Transliterator {
/**
* Direction constant passed to constructor to create a transliterator
* using the forward rules.
*/
public static final int FORWARD = 0;
/**
* Direction constant passed to constructor to create a transliterator
* using the reverse rules.
*/
public static final int REVERSE = 1;
/**
* Parsed rule data (rule set plus variable tables) consulted by the
* transliterate methods. Assigned only by the constructors.
*/
private Data data;
/**
* When true, the transliterate methods print a trace of each step
* (to System.err in the String version, to System.out in the keyboard
* version).
*/
static final boolean DEBUG = false;
/**
* Copyright notice. Not referenced by any code visible in this file.
*/
private static final String COPYRIGHT =
"\u00A9 IBM Corporation 1999. All rights reserved.";
/**
* Constructs a new transliterator from the given rules.
* @param rules rules, separated by ';'
* @param direction either FORWARD or REVERSE.
* @exception IllegalArgumentException if rules are malformed
* or direction is invalid.
*/
public RuleBasedTransliterator(String ID, String rules, int direction,
UnicodeFilter filter) {
super(ID, filter);
if (direction != FORWARD && direction != REVERSE) {
throw new IllegalArgumentException("Invalid direction");
}
data = parse(rules, direction);
}
/**
* Constructs a new transliterator from the given rules in the
* <code>FORWARD</code> direction, with a <code>null</code> filter.
* Delegates to the four-argument constructor.
* @param ID identifier passed through to the four-argument constructor
* @param rules rules, separated by ';'
* @exception IllegalArgumentException if rules are malformed.
*/
public RuleBasedTransliterator(String ID, String rules) {
this(ID, rules, FORWARD, null);
}
/**
* Constructs a transliterator around rule data that has already been
* built; no parsing and no direction check happen here.
* @param ID identifier passed to the superclass constructor
* @param data pre-built rule data, stored as-is (not copied)
* @param filter filter passed to the superclass constructor
*/
RuleBasedTransliterator(String ID, Data data, UnicodeFilter filter) {
super(ID, filter);
this.data = data;
}
/**
 * Parses an array of rule strings for one direction and returns the
 * resulting rule data.
 * @param rules rule strings; handed unchanged to the <code>Parser</code>
 * @param direction either FORWARD or REVERSE
 * @return the data built by the parser
 * @exception IllegalArgumentException if the parser rejects the rules
 */
static Data parse(String[] rules, int direction) {
    Parser parser = new Parser(rules, direction);
    return parser.getData();
}
/**
 * Parses a single rule string for one direction. Convenience form of
 * {@link #parse(String[], int)}: the string is wrapped in a one-element
 * array.
 * @param rules rules, separated by ';'
 * @param direction either FORWARD or REVERSE
 * @return the data built by the parser
 */
static Data parse(String rules, int direction) {
    String[] ruleArray = { rules };
    return parse(ruleArray, direction);
}
/**
 * Transliterates a segment of a string. <code>Transliterator</code> API.
 * @param text the string to be transliterated
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= text.length()</code>.
 * @param result buffer to receive the transliterated text; previous
 * contents are discarded
 */
public void transliterate(String text, int start, int limit,
                          StringBuffer result) {
    /* The loop below works on a virtual buffer: the text transliterated
     * so far (held in result) followed by the text not yet consumed.
     * The cursor may sit inside the already transliterated part or just
     * before the unconsumed text.
     *
     * Example: rules 1. ab>x|y
     *                2. yc>z
     *
     * []|eabcd   start - no match, copy e to translated buffer
     * [e]|abcd   match rule 1 - copy output & adjust cursor
     * [ex|y]cd   match rule 2 - copy output & adjust cursor
     * [exz]|d    no match, copy d to transliterated buffer
     * [exzd]|    done
     *
     * cursor: an index into the virtual buffer. Matches are attempted
     * at the cursor. Without a match the cursor advances, and one
     * character moves from the source text to the result buffer if the
     * cursor was at the end of result.
     *
     * start, limit: delimit the part of the source text not processed
     * yet (offsets start..limit-1). At any moment the virtual buffer is
     * result + text.substring(start, limit).
     */
    int cursor = 0;
    result.setLength(0);
    while (start < limit || cursor < result.length()) {
        TransliterationRule match = data.ruleSet.findMatch(text, start, limit, result,
                                        cursor, data.setVariables, getFilter());
        if (DEBUG) {
            StringBuffer trace = new StringBuffer(
                result.toString() + '#' + text.substring(start, limit));
            trace.insert(cursor <= result.length()
                         ? cursor : (cursor + 1),
                         '|');
            System.err.print((match == null ? "nomatch:" : ("match:" + match + ", "))
                             + trace);
        }
        if (match != null) {
            int keyLength = match.getKeyLength();
            // Number of result characters to the right of the cursor; >= 0
            int pending = result.length() - cursor;
            char[] tail = null;
            if (keyLength > pending) {
                // The key reaches into the unconsumed source text:
                // consume the part that lies beyond result.
                start += keyLength - pending;
            } else if (keyLength < pending) {
                // The key ends inside result: save what follows the key
                // so it can be re-attached after the output.
                tail = new char[pending - keyLength];
                result.getChars(cursor + keyLength, result.length(),
                                tail, 0);
            }
            result.setLength(cursor);
            result.append(match.getOutput());
            if (tail != null) {
                result.append(tail);
            }
            cursor += match.getCursorPos();
        } else {
            if (cursor == result.length()) {
                result.append(text.charAt(start++));
            }
            ++cursor;
        }
        if (DEBUG) {
            StringBuffer trace = new StringBuffer(
                result.toString() + '#' + text.substring(start, limit));
            trace.insert(cursor <= result.length()
                         ? cursor : (cursor + 1),
                         '|');
            System.err.println(" => " + trace);
        }
    }
}
/**
 * Transliterates a segment of a string. <code>Transliterator</code> API.
 * @param text the string to be transliterated
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= text.length()</code>.
 * @return The new limit index
 */
public int transliterate(Replaceable text, int start, int limit) {
    /* With a Replaceable there is a single buffer, so the algorithm is
     * simpler than the String version. start stays fixed; limit moves
     * numerically whenever a replacement changes the text length. The
     * cursor travels from start to limit, with replacements happening
     * under it.
     *
     * Example: rules 1. ab>x|y
     *                2. yc>z
     *
     * |eabcd   start - no match, advance cursor
     * e|abcd   match rule 1 - change text & adjust cursor
     * ex|ycd   match rule 2 - change text & adjust cursor
     * exz|d    no match, advance cursor
     * exzd|    done
     */
    int cursor = start;
    while (cursor < limit) {
        TransliterationRule match = data.ruleSet.findMatch(text, start, limit,
                                        cursor, data.setVariables, getFilter());
        if (match == null) {
            ++cursor;
            continue;
        }
        int keyLength = match.getKeyLength();
        text.replace(cursor, cursor + keyLength, match.getOutput());
        // Keep limit pointing at the same logical end of the segment.
        limit += match.getOutput().length() - keyLength;
        cursor += match.getCursorPos();
    }
    return limit;
}
/**
 * Implements {@link Transliterator#handleKeyboardTransliterate}.
 * Reads <code>index[START]</code>, <code>index[LIMIT]</code> and
 * <code>index[CURSOR]</code>, applies rules from the cursor onward, and
 * writes the updated limit and cursor back into <code>index</code>.
 * <code>index[START]</code> is not modified.
 */
protected void handleKeyboardTransliterate(Replaceable text,
                                           int[] index) {
    int start = index[START];
    int limit = index[LIMIT];
    int cursor = index[CURSOR];
    if (DEBUG) {
        System.out.print("\"" +
            escape(rsubstring(text, start, cursor)) + '|' +
            escape(rsubstring(text, cursor, limit)) + "\"");
    }
    // One-element array used as an out-parameter by findIncrementalMatch.
    boolean[] partial = new boolean[1];
    while (cursor < limit) {
        TransliterationRule match = data.ruleSet.findIncrementalMatch(
            text, start, limit, cursor, data.setVariables, partial, getFilter());
        /* Three outcomes:
         *  - a rule matched: replace its key with its output and move
         *    the cursor as the rule directs;
         *  - no rule matched but a partial match was reported: nothing
         *    can be decided without more text, so stop with the cursor
         *    where it is;
         *  - no match at all: step the cursor forward.
         */
        if (match != null) {
            int keyLength = match.getKeyLength();
            text.replace(cursor, cursor + keyLength, match.getOutput());
            limit += match.getOutput().length() - keyLength;
            cursor += match.getCursorPos();
        } else if (partial[0]) {
            break;
        } else {
            ++cursor;
        }
    }
    if (DEBUG) {
        System.out.println(" -> \"" +
            escape(rsubstring(text, start, cursor)) + '|' +
            escape(rsubstring(text, cursor, cursor)) + '|' +
            escape(rsubstring(text, cursor, limit)) + "\"");
    }
    index[LIMIT] = limit;
    index[CURSOR] = cursor;
}
/**
 * Returns the length of the longest context required by this
 * transliterator, as reported by its rule set. This is
 * <em>preceding</em> context.
 * @return Maximum number of preceding context characters this
 * transliterator needs to examine
 */
protected int getMaximumContextLength() {
    TransliterationRuleSet rules = data.ruleSet;
    return rules.getMaximumContextLength();
}
/**
 * FOR DEBUGGING: Return a substring of a Replaceable, built one
 * character at a time with <code>charAt</code>.
 * @param r the text to read from
 * @param start index of the first character to copy
 * @param limit index after the last character to copy; an empty string
 * results when <code>start >= limit</code>
 */
private static String rsubstring(Replaceable r, int start, int limit) {
    StringBuffer copy = new StringBuffer();
    for (int i = start; i < limit; ++i) {
        copy.append(r.charAt(i));
    }
    return copy.toString();
}
/**
 * FOR DEBUGGING: Escape non-ASCII characters as Unicode.
 * Characters from space through 0x7F are copied unchanged, except that
 * a backslash is doubled. Every other character is written as a
 * backslash, the letter u, and four lowercase hex digits.
 * @param s the string to escape
 * @return the escaped string
 */
private static final String escape(String s) {
    StringBuffer out = new StringBuffer();
    int n = s.length();
    for (int i = 0; i < n; ++i) {
        char c = s.charAt(i);
        boolean plain = (c >= ' ' && c <= 0x007F);
        if (plain && c != '\\') {
            out.append(c);
        } else if (plain) {
            out.append("\\\\"); // That is, a doubled backslash
        } else {
            String hex = Integer.toHexString(c);
            out.append("\\u");
            // Left-pad with zeros to exactly four hex digits.
            for (int pad = hex.length(); pad < 4; ++pad) {
                out.append('0');
            }
            out.append(hex);
        }
    }
    return out.toString();
}
/**
 * Container for everything the parser produces: the rule set and the
 * two variable tables. All three members start out empty.
 */
static class Data {
    /**
     * Rule table. May be empty.
     */
    public TransliterationRuleSet ruleSet;

    /**
     * Maps a variable name (String) to a Character. For a variable that
     * stands for a single literal character, that character is stored
     * directly. For a variable that stands for a UnicodeSet, the stored
     * character is a stand-in: it is the key under which the set is
     * found in {@link #setVariables}, and it is also what represents
     * the set inside the stored rules.
     */
    public Hashtable variableNames;

    /**
     * Maps a stand-in Character to its UnicodeSet. Set-valued variables
     * go from name to stand-in via {@link #variableNames}, and from
     * stand-in to the actual UnicodeSet via this table. The stand-in is
     * also what appears in the rule text to represent the set.
     */
    public Hashtable setVariables;

    /**
     * Creates an empty rule set and two empty variable tables.
     */
    public Data() {
        this.variableNames = new Hashtable();
        this.setVariables = new Hashtable();
        this.ruleSet = new TransliterationRuleSet();
    }
}
private static class Parser {
/**
* Current rule being parsed.
* NOTE(review): no code visible in this class assigns or reads this
* field; the parse methods work on local rule strings instead.
*/
private String rules;
/**
* Direction being parsed for, as passed to the constructor. Rules
* written for the other direction are skipped by parseRule.
*/
private int direction;
/**
* Result of the parse; created in the constructor and filled in as
* rules and variable definitions are read.
*/
private Data data;
/**
* The next available stand-in for variables. This starts at some point in
* the private use area (discovered dynamically) and increments up toward
* <code>variableLimit</code>. At any point during parsing, available
* variables are <code>variableNext..variableLimit-1</code>.
*/
private char variableNext;
/**
* The last available stand-in for variables. This is discovered
* dynamically. At any point during parsing, available variables are
* <code>variableNext..variableLimit-1</code>.
*/
private char variableLimit;
// Operators
private static final char VARIABLE_DEF_OP = '=';
private static final char FORWARD_RULE_OP = '>';
private static final char REVERSE_RULE_OP = '<';
private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op
// Characters that start an operator in rule text. '~' is deliberately
// absent: it is only the internal code for the two-character "<>".
private static final String OPERATORS = "=><";
// Other special characters
private static final char QUOTE = '\'';
private static final char ESCAPE = '\\';
private static final char END_OF_RULE = ';';
private static final char RULE_COMMENT_CHAR = '#';
private static final char VARIABLE_REF_OPEN = '{';
private static final char VARIABLE_REF_CLOSE = '}';
private static final char CONTEXT_OPEN = '(';
private static final char CONTEXT_CLOSE = ')';
private static final char SET_OPEN = '[';
private static final char SET_CLOSE = ']';
private static final char CURSOR_POS = '|';
/**
 * Creates a parser and immediately parses the given rules into a fresh
 * <code>Data</code> object, retrievable with {@link #getData}.
 * @param ruleArray rule strings; each may hold several rules separated
 * by semicolon characters
 * @param direction direction whose rules are to be kept
 * @exception IllegalArgumentException if there is a syntax error in the
 * rules
 */
public Parser(String[] ruleArray, int direction) {
    this.data = new Data();
    this.direction = direction;
    parseRules(ruleArray);
}
/**
 * Returns the data built by this parser. The same object is returned
 * on every call; it is not copied.
 */
public Data getData() {
    return this.data;
}
/**
 * Parse an array of zero or more rule strings. Each array element is
 * scanned independently from its own beginning; an element may contain
 * any number of rules, blank runs, and comment lines.
 *
 * Called once, from the constructor, after <code>data</code> has been
 * created.
 *
 * Error handling: the first syntax error stops the scan. Its message is
 * remembered, the rules read so far are still frozen (indexed), and any
 * error from that step is appended on a new line. If either step
 * failed, a single IllegalArgumentException carrying the collected
 * message(s) is thrown.
 *
 * @param ruleArray the rule strings to parse
 * @exception IllegalArgumentException if there is a syntax error in the
 * rules, or if freezing the rule set fails
 */
private void parseRules(String[] ruleArray) {
    determineVariableRange(ruleArray);
    StringBuffer errors = null;
    try {
        for (int i=0; i<ruleArray.length; ++i) {
            String rule = ruleArray[i];
            int pos = 0;
            int limit = rule.length();
            while (pos < limit) {
                char c = rule.charAt(pos++);
                if (Character.isWhitespace(c)) {
                    // Ignore leading whitespace. Note that this is not
                    // Unicode spaces, but Java spaces -- a subset,
                    // representing whitespace likely to be seen in code.
                    continue;
                }
                // Skip lines starting with the comment character
                if (c == RULE_COMMENT_CHAR) {
                    // indexOf returns -1 when there is no newline, so
                    // pos becomes 0 exactly in that case.
                    pos = rule.indexOf("\n", pos) + 1;
                    if (pos == 0) {
                        break; // No "\n" found; rest of rule is a comment
                    }
                    continue; // Either fall out or restart with next line
                }
                // We've found the start of a rule. c is its first
                // character, and pos points past c. Lexically parse the
                // rule into component pieces.
                pos = parseRule(rule, --pos, limit);
            }
        }
    } catch (IllegalArgumentException e) {
        // Remember the syntax error instead of dropping it; it is
        // reported below together with any error from freeze(). Using
        // append() keeps a message-less exception from causing a
        // NullPointerException here.
        errors = new StringBuffer();
        errors.append(e.getMessage());
    }
    // Index the rules
    try {
        data.ruleSet.freeze(data.setVariables);
    } catch (IllegalArgumentException e) {
        if (errors == null) {
            errors = new StringBuffer();
        } else {
            errors.append("\n");
        }
        errors.append(e.getMessage());
    }
    if (errors != null) {
        throw new IllegalArgumentException(errors.toString());
    }
}
/**
 * MAIN PARSER. Parse the next rule in the given rule string, starting
 * at pos. Return the index after the last character parsed. Do not
 * parse characters at or after limit.
 *
 * Important: The character at pos must be a non-whitespace character
 * that is not the comment character.
 *
 * This method handles quoting, escaping, and whitespace removal. It
 * parses the end-of-rule character. It recognizes context and cursor
 * indicators. Once it does a lexical breakdown of the rule at pos, it
 * either records a variable definition in data.variableNames or
 * creates a rule object and adds it to data.ruleSet. Forward-only and
 * reverse-only rules for the direction not being parsed are skipped.
 *
 * @param rule the string containing the rule
 * @param pos index of the first character of the rule
 * @param limit index after the last character that may be read
 * @return index after the last character consumed (after the ';' if one
 * was found)
 * @exception IllegalArgumentException on any syntax error, or if a
 * referenced variable is undefined
 */
private int parseRule(String rule, int pos, int limit) {
    // Locate the left side, operator, and right side
    int start = pos;
    char operator = 0;
    StringBuffer buf = new StringBuffer();
    int cursor = -1; // position of cursor in buf
    int ante = -1; // position of ante context marker ')' in buf
    int post = -1; // position of post context marker '(' in buf
    int postClose = -1; // position of post context close ')' in buf
    // Assigned to buf and its adjuncts after the LHS has been
    // parsed. Thereafter, buf etc. refer to the RHS.
    String left = null;
    int leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1;
    main:
    while (pos < limit) {
        char c = rule.charAt(pos++);
        if (Character.isWhitespace(c)) {
            // Ignore whitespace. Note that this is not Unicode
            // spaces, but Java spaces -- a subset, representing
            // whitespace likely to be seen in code.
            continue;
        }
        // Handle escapes
        if (c == ESCAPE) {
            if (pos == limit) {
                syntaxError("Trailing backslash", rule, start);
            }
            buf.append(rule.charAt(pos++));
            continue;
        }
        // Handle quoted matter
        if (c == QUOTE) {
            int iq = rule.indexOf(QUOTE, pos);
            if (iq == pos) {
                buf.append(c); // Parse [''] outside quotes as [']
                ++pos;
            } else {
                /* This loop picks up a segment of quoted text of the
                 * form 'aaaa' each time through. If this segment
                 * hasn't really ended ('aaaa''bbbb') then it keeps
                 * looping, each time adding on a new segment. When it
                 * reaches the final quote it breaks.
                 */
                for (;;) {
                    if (iq < 0) {
                        syntaxError("Unterminated quote", rule, start);
                    }
                    buf.append(rule.substring(pos, iq));
                    pos = iq+1;
                    if (pos < limit && rule.charAt(pos) == QUOTE) {
                        // Parse [''] inside quotes as [']
                        iq = rule.indexOf(QUOTE, pos+1);
                        // Continue looping
                    } else {
                        break;
                    }
                }
            }
            continue;
        }
        if (OPERATORS.indexOf(c) >= 0) {
            if (operator != 0) {
                syntaxError("Unquoted " + c, rule, start);
            }
            // Found an operator char. Check for forward-reverse operator.
            if (c == REVERSE_RULE_OP &&
                (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
                ++pos;
                operator = FWDREV_RULE_OP;
            } else {
                operator = c;
            }
            // Everything collected so far is the left side; stash it
            // and start collecting the right side from scratch.
            left = buf.toString(); // lhs
            leftCursor = cursor;
            leftAnte = ante;
            leftPost = post;
            leftPostClose = postClose;
            buf.setLength(0);
            cursor = ante = post = postClose = -1;
            continue;
        }
        switch (c) {
        case END_OF_RULE:
            break main;
        case VARIABLE_REF_OPEN:
            {
                int j = rule.indexOf(VARIABLE_REF_CLOSE, pos);
                if (pos == j || j < 0) { // empty or unterminated
                    syntaxError("Malformed variable reference", rule, start);
                }
                String name = rule.substring(pos, j);
                pos = j+1;
                buf.append(getVariableDef(name));
            }
            break;
        case CONTEXT_OPEN:
            if (post >= 0) {
                syntaxError("Multiple post contexts", rule, start);
            }
            // Ignore CONTEXT_OPEN if buffer length is zero -- that means
            // this is the optional opening delimiter for the ante context.
            if (buf.length() > 0) {
                post = buf.length();
            }
            break;
        case CONTEXT_CLOSE:
            if (postClose >= 0) {
                syntaxError("Unexpected " + c, rule, start);
            }
            if (post >= 0) {
                // This is probably the optional closing delimiter
                // for the post context; save the pos and check later.
                postClose = buf.length();
            } else if (ante >= 0) {
                syntaxError("Multiple ante contexts", rule, start);
            } else {
                ante = buf.length();
            }
            break;
        case SET_OPEN:
            ParsePosition pp = new ParsePosition(pos-1); // Backup to opening '['
            buf.append(registerSet(new UnicodeSet(rule, pp,
                          data.variableNames, data.setVariables)));
            pos = pp.getIndex();
            break;
        case VARIABLE_REF_CLOSE:
        case SET_CLOSE:
            // syntaxError always throws, so control never actually
            // falls through into the CURSOR_POS case below.
            syntaxError("Unquoted " + c, rule, start);
        case CURSOR_POS:
            if (cursor >= 0) {
                syntaxError("Multiple cursors", rule, start);
            }
            cursor = buf.length();
            break;
        default:
            buf.append(c);
            break;
        }
    }
    if (operator == 0) {
        syntaxError("No operator", rule, start);
    }
    // Check context close parameters: an explicit post-context close
    // must be the last thing on its side of the rule. The message names
    // the context-close character, which is what this check is about.
    if ((leftPostClose >= 0 && leftPostClose != left.length()) ||
        (postClose >= 0 && postClose != buf.length())) {
        syntaxError("Extra text after " + CONTEXT_CLOSE, rule, start);
    }
    // Context is only allowed on the input side; that is, the left side
    // for forward rules. Cursors are only allowed on the output side;
    // that is, the right side for forward rules. Bidirectional rules
    // ignore elements that do not apply.
    switch (operator) {
    case VARIABLE_DEF_OP:
        // LHS is the name. RHS is a single character, either a literal
        // or a set (already parsed). If RHS is longer than one
        // character, it is either a multi-character string, or multiple
        // sets, or a mixture of chars and sets -- syntax error.
        if (buf.length() != 1) {
            syntaxError("Malformed RHS", rule, start);
        }
        if (data.variableNames.get(left) != null) {
            syntaxError("Duplicate definition of {" +
                        left + "}", rule, start);
        }
        data.variableNames.put(left, new Character(buf.charAt(0)));
        break;
    case FORWARD_RULE_OP:
        if (direction == FORWARD) {
            if (ante >= 0 || post >= 0 || leftCursor >= 0) {
                syntaxError("Malformed rule", rule, start);
            }
            data.ruleSet.addRule(new TransliterationRule(
                                     left, leftAnte, leftPost,
                                     buf.toString(), cursor));
        } // otherwise ignore the rule; it's not the direction we want
        break;
    case REVERSE_RULE_OP:
        if (direction == REVERSE) {
            if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
                syntaxError("Malformed rule", rule, start);
            }
            data.ruleSet.addRule(new TransliterationRule(
                                     buf.toString(), ante, post,
                                     left, leftCursor));
        } // otherwise ignore the rule; it's not the direction we want
        break;
    case FWDREV_RULE_OP:
        // NOTE(review): in both branches the cursor value is an index
        // into the untrimmed side, while the output passed along has
        // had its leading context removed. If a leading context is
        // present the cursor may need rebasing by that context's
        // length -- confirm against TransliterationRule.
        if (direction == FORWARD) {
            // The output side is the right; trim off any context
            String output = buf.toString().substring(ante < 0 ? 0 : ante,
                                                     post < 0 ? buf.length() : post);
            data.ruleSet.addRule(new TransliterationRule(
                                     left, leftAnte, leftPost,
                                     output, cursor));
        } else {
            // The output side is the left; trim off any context
            String output = left.substring(leftAnte < 0 ? 0 : leftAnte,
                                           leftPost < 0 ? left.length() : leftPost);
            data.ruleSet.addRule(new TransliterationRule(
                                     buf.toString(), ante, post,
                                     output, leftCursor));
        }
        break;
    }
    return pos;
}
/**
 * Always throws an IllegalArgumentException describing a syntax error;
 * never returns normally. The message is <code>msg</code> followed by
 * the text of the offending rule, which runs from <code>start</code> to
 * the next unquoted ';' or, when none is found, to the end of the
 * string. The rule start is therefore reported correctly even when the
 * end-of-rule marker is what is missing.
 * @param msg error description
 * @param rule pattern string
 * @param start position of first character of current rule
 * @exception IllegalArgumentException always
 */
private static final void syntaxError(String msg, String rule, int start) {
    int ruleLength = rule.length();
    int semicolon = quotedIndexOf(rule, start, ruleLength, ";");
    int end = (semicolon < 0) ? ruleLength : semicolon;
    String offending = rule.substring(start, end);
    throw new IllegalArgumentException(msg + " in " + offending);
}
/**
 * Takes the next free stand-in character, registers the given set
 * under it in the setVariables hash, and returns the stand-in.
 * @param set the set to register
 * @return the stand-in character now representing <code>set</code>
 * @exception RuntimeException if no stand-in characters are left
 */
private final char registerSet(UnicodeSet set) {
    if (variableNext >= variableLimit) {
        throw new RuntimeException("Private use variables exhausted");
    }
    char standIn = variableNext++;
    data.setVariables.put(new Character(standIn), set);
    return standIn;
}
/**
 * Looks up a variable name in data.variableNames and returns the
 * single character stored for it.
 * @param name the variable name, without the surrounding braces
 * @return the character stored for the name
 * @exception IllegalArgumentException if the name is unknown.
 */
private char getVariableDef(String name) {
    Character value = (Character) data.variableNames.get(name);
    if (value != null) {
        return value.charValue();
    }
    throw new IllegalArgumentException("Undefined variable: "
                                       + name);
}
/**
 * Determines what part of the private use region of Unicode we can use
 * for variable stand-ins, and sets variableNext and variableLimit
 * accordingly.
 *
 * The thorough way to do this would be to parse each rule, collect the
 * characters used in the FROM expressions only, and treat everything
 * else as available. This implementation takes a shortcut: it scans
 * every character of every rule string, ignoring quoting, and picks the
 * largest run of private use characters that appears nowhere.
 * @param ruleArray the rule strings about to be parsed
 * @exception RuntimeException if no usable range is found
 */
private final void determineVariableRange(String[] ruleArray) {
    // Scanning raw characters is good enough because the quoting
    // characters themselves lie outside the private use area.
    Range privateUse = new Range('\uE000', 0x1900); // Private use area
    Range free = privateUse.largestUnusedSubrange(ruleArray);
    if (free == null) {
        throw new RuntimeException(
            "No private use characters available for variables");
    }
    variableNext = free.start;
    variableLimit = (char) (free.start + free.length);
    boolean empty = (variableNext >= variableLimit);
    if (empty) {
        throw new RuntimeException(
            "Too few private use characters available for variables");
    }
}
/**
 * Returns the index of the first character in a set, ignoring quoted
 * and escaped text. For example, in the string "abc'hide'h", the 'h' in
 * "hide" will not be found by a search for "h". Unlike
 * String.indexOf(), this method searches not for a single character,
 * but for any character of the string <code>setOfChars</code>.
 * The character following a backslash is skipped, as is everything
 * between a single quote and the next single quote.
 * @param text text to be searched
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= text.length()</code>.
 * @param setOfChars string with one or more distinct characters
 * @return Offset of the first character in <code>setOfChars</code>
 * found, or -1 if not found.
 */
private static int quotedIndexOf(String text, int start, int limit,
                                 String setOfChars) {
    int i = start;
    while (i < limit) {
        char c = text.charAt(i);
        if (c == ESCAPE) {
            // Skip the backslash and the character it escapes.
            i += 2;
            continue;
        }
        if (c == QUOTE) {
            // Advance to the closing quote (or to limit), then past it.
            do {
                ++i;
            } while (i < limit && text.charAt(i) != QUOTE);
            ++i;
            continue;
        }
        if (setOfChars.indexOf(c) >= 0) {
            return i;
        }
        ++i;
    }
    return -1;
}
/**
 * A contiguous range of Unicode characters, given by its first
 * character and its length. Supports testing for inclusion (does this
 * range contain this character?) and splitting: breaking a range into
 * two smaller ranges around a character inside it. The split character
 * belongs to neither piece. When the split character sits at either end
 * of the range, the range simply shrinks and no second piece results.
 *
 * This class is used internally to determine the largest available
 * private use character range for variable stand-ins.
 */
private static class Range implements Cloneable {
    char start;
    int length;

    Range(char start, int length) {
        this.start = start;
        this.length = length;
    }

    /**
     * Returns a new Range with the same start and length.
     */
    public Object clone() {
        return new Range(start, length);
    }

    /**
     * Returns true if c lies in start..start+length-1.
     */
    boolean contains(char c) {
        if (c < start) {
            return false;
        }
        return (c - start) < length;
    }

    /**
     * Assume that contains(c) is true. Remove c from this range. If c
     * is the first or last character, this range shrinks by one and
     * null is returned. Otherwise this range is cut down to the part
     * below c, and a new range holding the part above c is returned.
     */
    Range split(char c) {
        int offset = c - start;
        if (offset == 0) {
            ++start;
            --length;
            return null;
        }
        if (offset == length - 1) {
            --length;
            return null;
        }
        char upperStart = (char) (c + 1);
        Range upper = new Range(upperStart, start + length - upperStart);
        length = offset;
        return upper;
    }

    /**
     * Finds the largest subrange of this range that is unused by the
     * given strings, that is, that contains none of their characters.
     * This range itself is not modified; the work is done on a copy.
     * If the strings contain no characters in this range, a copy of
     * the whole range is returned. Among equally long candidates the
     * one created first wins. The result is never null, but its length
     * may be zero.
     */
    Range largestUnusedSubrange(String[] strings) {
        Vector pieces = new Vector(1);
        pieces.addElement(clone());
        for (int k = 0; k < strings.length; ++k) {
            String str = strings[k];
            for (int i = 0, n = str.length(); i < n; ++i) {
                char c = str.charAt(i);
                if (!contains(c)) {
                    continue;
                }
                // Find the piece still holding c, if any, and cut c out.
                int count = pieces.size();
                for (int j = 0; j < count; ++j) {
                    Range piece = (Range) pieces.elementAt(j);
                    if (!piece.contains(c)) {
                        continue;
                    }
                    Range upper = piece.split(c);
                    if (upper != null) {
                        pieces.addElement(upper);
                    }
                    break;
                }
            }
        }
        Range best = (Range) pieces.elementAt(0);
        for (int j = 1; j < pieces.size(); ++j) {
            Range candidate = (Range) pieces.elementAt(j);
            if (candidate.length > best.length) {
                best = candidate;
            }
        }
        return best;
    }
}
}
}