fb7313675a
X-SVN-Rev: 575
1117 lines
46 KiB
Java
Executable File
1117 lines
46 KiB
Java
Executable File
package com.ibm.text;
|
|
|
|
import java.util.Hashtable;
|
|
import java.util.Vector;
|
|
import java.text.ParsePosition;
|
|
|
|
/**
|
|
* A transliterator that reads a set of rules in order to determine how to perform
|
|
* translations. Rules are stored in resource bundles indexed by name. Rules are separated by
|
|
* semicolons (';'). To include a literal semicolon, prefix it with a backslash ('\;').
|
|
* Whitespace, as defined by <code>Character.isWhitespace()</code>, is ignored. If the first
|
|
* non-blank character on a line is '#', the entire line is ignored as a comment. </p>
|
|
*
|
|
* <p>Each set of rules consists of two groups, one forward, and one reverse. This is a
|
|
* convention that is not enforced; rules for one direction may be omitted, with the result
|
|
* that translations in that direction will not modify the source text. </p>
|
|
*
|
|
* <p><b>Rule syntax</b> </p>
|
|
*
|
|
* <p>Rule statements take one of the following forms:
|
|
*
|
|
* <dl>
|
|
* <dt><code>alefmadda=\u0622</code></dt>
|
|
* <dd><strong>Variable definition.</strong> The name on the left is assigned the character or
|
|
* expression on the right. Names may not contain any special characters (see list below).
|
|
* Duplicate names (including duplicates of simple variables or category names) cause an
|
|
* exception to be thrown. If the right hand side consists of one character, then the
|
|
* variable stands for that character. In this example, after this statement, instances of
|
|
* the left hand name surrounded by braces, "<code>{alefmadda}</code>", will be
|
|
* replaced by the Unicode character U+0622. If the right hand side is longer than one
|
|
* character, then it is interpreted as a character category expression; see below for
|
|
* details.</dd>
|
|
* <dt> </dt>
|
|
* <dt><code>softvowel=[eiyEIY]</code></dt>
|
|
* <dd><strong>Category definition.</strong> The name on the left is assigned to stand for a
|
|
* set of characters. The same rules for names of simple variables apply. After this
|
|
* statement, the left hand variable will be interpreted as indicating a set of characters in
|
|
* appropriate contexts. The pattern syntax defining sets of characters is defined by {@link
|
|
* UnicodeSet}. Examples of valid patterns are:<table>
|
|
* <tr valign="top">
|
|
* <td nowrap><code>[abc]</code></td>
|
|
* <td>The set containing the characters 'a', 'b', and 'c'.</td>
|
|
* </tr>
|
|
* <tr valign="top">
|
|
* <td nowrap><code>[^abc]</code></td>
|
|
* <td>The set of all characters <em>except</em> 'a', 'b', and 'c'.</td>
|
|
* </tr>
|
|
* <tr valign="top">
|
|
* <td nowrap><code>[A-Z]</code></td>
|
|
* <td>The set of all characters from 'A' to 'Z' in Unicode order.</td>
|
|
* </tr>
|
|
* <tr valign="top">
|
|
* <td nowrap><code>[:Lu:]</code></td>
|
|
* <td>The set of Unicode uppercase letters. See <a href="http://www.unicode.org">www.unicode.org</a>
|
|
* for a complete list of categories and their two-letter codes.</td>
|
|
* </tr>
|
|
* <tr valign="top">
|
|
* <td nowrap><code>[^a-z[:Lu:][:Ll:]]</code></td>
|
|
* <td>The set of all characters <em>except</em> 'a' through 'z' and uppercase or lowercase
|
|
* letters.</td>
|
|
* </tr>
|
|
* </table>
|
|
* <p>See {@link UnicodeSet} for more documentation and examples. </p>
|
|
* </dd>
|
|
* <dt><code>ai>{alefmadda}</code></dt>
|
|
* <dd><strong>Forward translation rule.</strong> This rule states that the string on the left
|
|
* will be changed to the string on the right when performing forward transliteration.</dd>
|
|
* <dt> </dt>
|
|
* <dt><code>ai<{alefmadda}</code></dt>
|
|
* <dd><strong>Reverse translation rule.</strong> This rule states that the string on the right
|
|
* will be changed to the string on the left when performing reverse transliteration.</dd>
|
|
* </dl>
|
|
*
|
|
* <dl>
|
|
* <dt><code>ai<>{alefmadda}</code></dt>
|
|
* <dd><strong>Bidirectional translation rule.</strong> This rule states that the string on the
|
|
* right will be changed to the string on the left when performing forward transliteration,
|
|
* and vice versa when performing reverse transliteration.</dd>
|
|
* </dl>
|
|
*
|
|
* <p>Forward and reverse translation rules consist of a <em>match pattern</em> and an <em>output
|
|
* string</em>. The match pattern consists of literal characters, optionally preceded by
|
|
* context, and optionally followed by context. Context characters, like literal pattern
|
|
* characters, must be matched in the text being transliterated. However, unlike literal
|
|
* pattern characters, they are not replaced by the output text. For example, the pattern
|
|
* "<code>(abc)def</code>" indicates the characters "<code>def</code>"
|
|
* must be preceded by "<code>abc</code>" for a successful match. If there is a
|
|
* successful match, "<code>def</code>" will be replaced, but not "<code>abc</code>".
|
|
* The initial '<code>(</code>' is optional, so "<code>abc)def</code>" is
|
|
* equivalent to "<code>(abc)def</code>". Another example is "<code>123(456)</code>"
|
|
* (or "<code>123(456</code>") in which the literal pattern "<code>123</code>"
|
|
* must be followed by "<code>456</code>". </p>
|
|
*
|
|
* <p>The output string of a forward or reverse rule consists of characters to replace the
|
|
* literal pattern characters. If the output string contains the character '<code>|</code>',
|
|
* this is taken to indicate the location of the <em>cursor</em> after replacement. The
|
|
* cursor is the point in the text at which the next replacement, if any, will be applied. </p>
|
|
*
|
|
* <p>In addition to being defined in variables, <code>UnicodeSet</code> patterns may be
|
|
* embedded directly into rule strings. Thus, the following two rules are equivalent:</p>
|
|
*
|
|
* <blockquote>
|
|
* <p><code>vowel=[aeiou]; {vowel}>*; # One way to do this<br>
|
|
* [aeiou]>*;
|
|
* #
|
|
* Another way</code></p>
|
|
* </blockquote>
|
|
*
|
|
* <p><b>Example</b> </p>
|
|
*
|
|
* <p>The following example rules illustrate many of the features of the rule language. </p>
|
|
*
|
|
* <table cellpadding="4">
|
|
* <tr valign="top">
|
|
* <td>Rule 1.</td>
|
|
* <td nowrap><code>(abc)def>x|y</code></td>
|
|
* </tr>
|
|
* <tr valign="top">
|
|
* <td>Rule 2.</td>
|
|
* <td nowrap><code>xyz>r</code></td>
|
|
* </tr>
|
|
* <tr valign="top">
|
|
* <td>Rule 3.</td>
|
|
* <td nowrap><code>yz>q</code></td>
|
|
* </tr>
|
|
* </table>
|
|
*
|
|
* <p>Applying these rules to the string "<code>adefabcdefz</code>" yields the
|
|
* following results: </p>
|
|
*
|
|
* <table cellpadding="4">
|
|
* <tr valign="top">
|
|
* <td nowrap><code>|adefabcdefz</code></td>
|
|
* <td>Initial state, no rules match. Advance cursor.</td>
|
|
* </tr>
|
|
* <tr valign="top">
|
|
* <td nowrap><code>a|defabcdefz</code></td>
|
|
* <td>Still no match. Rule 1 does not match because the preceding context is not present.</td>
|
|
* </tr>
|
|
* <tr valign="top">
|
|
* <td nowrap><code>ad|efabcdefz</code></td>
|
|
* <td>Still no match. Keep advancing until there is a match...</td>
|
|
* </tr>
|
|
* <tr valign="top">
|
|
* <td nowrap><code>ade|fabcdefz</code></td>
|
|
* <td>...</td>
|
|
* </tr>
|
|
* <tr valign="top">
|
|
* <td nowrap><code>adef|abcdefz</code></td>
|
|
* <td>...</td>
|
|
* </tr>
|
|
* <tr valign="top">
|
|
* <td nowrap><code>adefa|bcdefz</code></td>
|
|
* <td>...</td>
|
|
* </tr>
|
|
* <tr valign="top">
|
|
* <td nowrap><code>adefab|cdefz</code></td>
|
|
* <td>...</td>
|
|
* </tr>
|
|
* <tr valign="top">
|
|
* <td nowrap><code>adefabc|defz</code></td>
|
|
* <td>Rule 1 matches; replace "<code>def</code>" with "<code>xy</code>"
|
|
* and back up the cursor to before the '<code>y</code>'.</td>
|
|
* </tr>
|
|
* <tr valign="top">
|
|
* <td nowrap><code>adefabcx|yz</code></td>
|
|
* <td>Although "<code>xyz</code>" is present, rule 2 does not match because the
|
|
* cursor is before the '<code>y</code>', not before the '<code>x</code>'. Rule 3 does match.
|
|
* Replace "<code>yz</code>" with "<code>q</code>".</td>
|
|
* </tr>
|
|
* <tr valign="top">
|
|
* <td nowrap><code>adefabcxq|</code></td>
|
|
* <td>The cursor is at the end; transliteration is complete.</td>
|
|
* </tr>
|
|
* </table>
|
|
*
|
|
* <p>The order of rules is significant. If multiple rules may match at some point, the first
|
|
* matching rule is applied. </p>
|
|
*
|
|
* <p>Forward and reverse rules may have an empty output string. Otherwise, an empty left or
|
|
* right hand side of any statement is a syntax error. </p>
|
|
*
|
|
* <p>Single quotes are used to quote the special characters <code>=><{}[]()|</code>.
|
|
* To specify a single quote itself, inside or outside of quotes, use two single quotes in a
|
|
* row. For example, the rule "<code>'>'>o''clock</code>" changes the string
|
|
* "<code>></code>" to the string "<code>o'clock</code>". </p>
|
|
*
|
|
* <p><b>Notes</b> </p>
|
|
*
|
|
* <p>While a RuleBasedTransliterator is being built, it checks that the rules are added in
|
|
* proper order. For example, if the rule "a>x" is followed by the rule
|
|
* "ab>y", then the second rule will throw an exception. The reason is that the
|
|
* second rule can never be triggered, since the first rule always matches anything it
|
|
* matches. In other words, the first rule <em>masks</em> the second rule. </p>
|
|
*
|
|
* <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
|
|
*
|
|
* @author Alan Liu
|
|
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.10 $ $Date: 2000/01/13 23:53:23 $
|
|
*
|
|
* $Log: RuleBasedTransliterator.java,v $
|
|
* Revision 1.10 2000/01/13 23:53:23 Alan
|
|
* Fix bugs found during ICU port
|
|
*
|
|
* Revision 1.9 2000/01/11 04:12:06 Alan
|
|
* Cleanup, embellish comments
|
|
*
|
|
* Revision 1.8 2000/01/11 02:25:03 Alan
|
|
* Rewrite UnicodeSet and RBT parsers for better performance and new syntax
|
|
*
|
|
* Revision 1.7 2000/01/06 01:36:36 Alan
|
|
* Allow string arrays in rule resource bundles
|
|
*
|
|
* Revision 1.6 2000/01/04 21:43:57 Alan
|
|
* Add rule indexing, and move masking check to TransliterationRuleSet.
|
|
*
|
|
* Revision 1.5 1999/12/22 01:40:54 Alan
|
|
* Consolidate rule pattern anteContext, key, and postContext into one string.
|
|
*
|
|
* Revision 1.4 1999/12/22 01:05:54 Alan
|
|
* Improve masking checking; turn it off by default, for better performance
|
|
*/
|
|
public class RuleBasedTransliterator extends Transliterator {
|
|
/**
|
|
* Direction constant passed to constructor to create a transliterator
|
|
* using the forward rules.
|
|
*/
|
|
public static final int FORWARD = 0;
|
|
|
|
/**
|
|
* Direction constant passed to constructor to create a transliterator
|
|
* using the reverse rules.
|
|
*/
|
|
public static final int REVERSE = 1;
|
|
|
|
private Data data;
|
|
|
|
static final boolean DEBUG = false;
|
|
|
|
private static final String COPYRIGHT =
|
|
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
|
|
|
/**
|
|
* Constructs a new transliterator from the given rules.
|
|
* @param rules rules, separated by ';'
|
|
* @param direction either FORWARD or REVERSE.
|
|
* @exception IllegalArgumentException if rules are malformed
|
|
* or direction is invalid.
|
|
*/
|
|
public RuleBasedTransliterator(String ID, String rules, int direction,
|
|
UnicodeFilter filter) {
|
|
super(ID, filter);
|
|
if (direction != FORWARD && direction != REVERSE) {
|
|
throw new IllegalArgumentException("Invalid direction");
|
|
}
|
|
data = parse(rules, direction);
|
|
}
|
|
|
|
/**
|
|
* Constructs a new transliterator from the given rules in the
|
|
* <code>FORWARD</code> direction.
|
|
* @param rules rules, separated by ';'
|
|
* @exception IllegalArgumentException if rules are malformed
|
|
* or direction is invalid.
|
|
*/
|
|
public RuleBasedTransliterator(String ID, String rules) {
|
|
this(ID, rules, FORWARD, null);
|
|
}
|
|
|
|
RuleBasedTransliterator(String ID, Data data, UnicodeFilter filter) {
|
|
super(ID, filter);
|
|
this.data = data;
|
|
}
|
|
|
|
static Data parse(String[] rules, int direction) {
|
|
return new Parser(rules, direction).getData();
|
|
}
|
|
|
|
static Data parse(String rules, int direction) {
|
|
return parse(new String[] { rules }, direction);
|
|
}
|
|
|
|
/**
|
|
* Transliterates a segment of a string. <code>Transliterator</code> API.
|
|
* @param text the string to be transliterated
|
|
* @param start the beginning index, inclusive; <code>0 <= start
|
|
* <= limit</code>.
|
|
* @param limit the ending index, exclusive; <code>start <= limit
|
|
* <= text.length()</code>.
|
|
* @param result buffer to receive the transliterated text; previous
|
|
* contents are discarded
|
|
*/
|
|
public void transliterate(String text, int start, int limit,
|
|
StringBuffer result) {
|
|
/* In the following loop there is a virtual buffer consisting of the
|
|
* text transliterated so far followed by the untransliterated text. There is
|
|
* also a cursor, which may be in the already transliterated buffer or just
|
|
* before the untransliterated text.
|
|
*
|
|
* Example: rules 1. ab>x|y
|
|
* 2. yc>z
|
|
*
|
|
* []|eabcd start - no match, copy e to tranlated buffer
|
|
* [e]|abcd match rule 1 - copy output & adjust cursor
|
|
* [ex|y]cd match rule 2 - copy output & adjust cursor
|
|
* [exz]|d no match, copy d to transliterated buffer
|
|
* [exzd]| done
|
|
*
|
|
* cursor: an index into the virtual buffer, 0..result.length()-1.
|
|
* Matches take place at the cursor. If there is no match, the cursor
|
|
* is advanced, and one character is moved from the source text to the
|
|
* result buffer.
|
|
*
|
|
* start, limit: these designate the substring of the source text which
|
|
* has not been processed yet. The range of offsets is start..limit-1.
|
|
* At any moment the virtual buffer consists of result +
|
|
* text.substring(start, limit).
|
|
*/
|
|
int cursor = 0;
|
|
result.setLength(0);
|
|
while (start < limit || cursor < result.length()) {
|
|
TransliterationRule r = data.ruleSet.findMatch(text, start, limit, result,
|
|
cursor, data.setVariables, getFilter());
|
|
if (DEBUG) {
|
|
StringBuffer buf = new StringBuffer(
|
|
result.toString() + '#' + text.substring(start, limit));
|
|
buf.insert(cursor <= result.length()
|
|
? cursor : (cursor + 1),
|
|
'|');
|
|
System.err.print((r == null ? "nomatch:" : ("match:" + r + ", "))
|
|
+ buf);
|
|
}
|
|
|
|
if (r == null) {
|
|
if (cursor == result.length()) {
|
|
result.append(text.charAt(start++));
|
|
}
|
|
++cursor;
|
|
} else {
|
|
// resultPad is length of result to right of cursor; >= 0
|
|
int resultPad = result.length() - cursor;
|
|
char[] tail = null;
|
|
if (r.getKeyLength() > resultPad) {
|
|
start += r.getKeyLength() - resultPad;
|
|
} else if (r.getKeyLength() < resultPad) {
|
|
tail = new char[resultPad - r.getKeyLength()];
|
|
result.getChars(cursor + r.getKeyLength(), result.length(),
|
|
tail, 0);
|
|
}
|
|
result.setLength(cursor);
|
|
result.append(r.getOutput());
|
|
if (tail != null) {
|
|
result.append(tail);
|
|
}
|
|
cursor += r.getCursorPos();
|
|
}
|
|
|
|
if (DEBUG) {
|
|
StringBuffer buf = new StringBuffer(
|
|
result.toString() + '#' + text.substring(start, limit));
|
|
buf.insert(cursor <= result.length()
|
|
? cursor : (cursor + 1),
|
|
'|');
|
|
System.err.println(" => " + buf);
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Transliterates a segment of a string. <code>Transliterator</code> API.
|
|
* @param text the string to be transliterated
|
|
* @param start the beginning index, inclusive; <code>0 <= start
|
|
* <= limit</code>.
|
|
* @param limit the ending index, exclusive; <code>start <= limit
|
|
* <= text.length()</code>.
|
|
* @return The new limit index
|
|
*/
|
|
public int transliterate(Replaceable text, int start, int limit) {
|
|
/* When using Replaceable, the algorithm is simpler, since we don't have
|
|
* two separate buffers. We keep start and limit fixed the entire time,
|
|
* relative to the text -- limit may move numerically if text is
|
|
* inserted or removed. The cursor moves from start to limit, with
|
|
* replacements happening under it.
|
|
*
|
|
* Example: rules 1. ab>x|y
|
|
* 2. yc>z
|
|
*
|
|
* |eabcd start - no match, advance cursor
|
|
* e|abcd match rule 1 - change text & adjust cursor
|
|
* ex|ycd match rule 2 - change text & adjust cursor
|
|
* exz|d no match, advance cursor
|
|
* exzd| done
|
|
*/
|
|
int cursor = start;
|
|
while (cursor < limit) {
|
|
TransliterationRule r = data.ruleSet.findMatch(text, start, limit,
|
|
cursor, data.setVariables, getFilter());
|
|
if (r == null) {
|
|
++cursor;
|
|
} else {
|
|
text.replace(cursor, cursor + r.getKeyLength(), r.getOutput());
|
|
limit += r.getOutput().length() - r.getKeyLength();
|
|
cursor += r.getCursorPos();
|
|
}
|
|
}
|
|
return limit;
|
|
}
|
|
|
|
/**
|
|
* Implements {@link Transliterator#handleKeyboardTransliterate}.
|
|
*/
|
|
protected void handleKeyboardTransliterate(Replaceable text,
|
|
int[] index) {
|
|
int start = index[START];
|
|
int limit = index[LIMIT];
|
|
int cursor = index[CURSOR];
|
|
|
|
if (DEBUG) {
|
|
System.out.print("\"" +
|
|
escape(rsubstring(text, start, cursor)) + '|' +
|
|
escape(rsubstring(text, cursor, limit)) + "\"");
|
|
}
|
|
|
|
boolean partial[] = new boolean[1];
|
|
|
|
while (cursor < limit) {
|
|
TransliterationRule r = data.ruleSet.findIncrementalMatch(
|
|
text, start, limit, cursor, data.setVariables, partial, getFilter());
|
|
/* If we match a rule then apply it by replacing the key
|
|
* with the rule output and repositioning the cursor
|
|
* appropriately. If we get a partial match, then we
|
|
* can't do anything without more text; return with the
|
|
* cursor at the current position. If we get null, then
|
|
* there is no match at this position, and we can advance
|
|
* the cursor.
|
|
*/
|
|
if (r == null) {
|
|
if (partial[0]) {
|
|
break;
|
|
} else {
|
|
++cursor;
|
|
}
|
|
} else {
|
|
text.replace(cursor, cursor + r.getKeyLength(), r.getOutput());
|
|
limit += r.getOutput().length() - r.getKeyLength();
|
|
cursor += r.getCursorPos();
|
|
}
|
|
}
|
|
|
|
if (DEBUG) {
|
|
System.out.println(" -> \"" +
|
|
escape(rsubstring(text, start, cursor)) + '|' +
|
|
escape(rsubstring(text, cursor, cursor)) + '|' +
|
|
escape(rsubstring(text, cursor, limit)) + "\"");
|
|
}
|
|
|
|
index[LIMIT] = limit;
|
|
index[CURSOR] = cursor;
|
|
}
|
|
|
|
/**
|
|
* Returns the length of the longest context required by this transliterator.
|
|
* This is <em>preceding</em> context.
|
|
* @return Maximum number of preceding context characters this
|
|
* transliterator needs to examine
|
|
*/
|
|
protected int getMaximumContextLength() {
|
|
return data.ruleSet.getMaximumContextLength();
|
|
}
|
|
|
|
|
|
/**
|
|
* FOR DEBUGGING: Return a substring of a Replaceable.
|
|
*/
|
|
private static String rsubstring(Replaceable r, int start, int limit) {
|
|
StringBuffer buf = new StringBuffer();
|
|
while (start < limit) {
|
|
buf.append(r.charAt(start++));
|
|
}
|
|
return buf.toString();
|
|
}
|
|
|
|
/**
|
|
* FOR DEBUGGING: Escape non-ASCII characters as Unicode.
|
|
*/
|
|
private static final String escape(String s) {
|
|
StringBuffer buf = new StringBuffer();
|
|
for (int i=0; i<s.length(); ++i) {
|
|
char c = s.charAt(i);
|
|
if (c >= ' ' && c <= 0x007F) {
|
|
if (c == '\\') {
|
|
buf.append("\\\\"); // That is, "\\"
|
|
} else {
|
|
buf.append(c);
|
|
}
|
|
} else {
|
|
buf.append("\\u");
|
|
if (c < 0x1000) {
|
|
buf.append('0');
|
|
if (c < 0x100) {
|
|
buf.append('0');
|
|
if (c < 0x10) {
|
|
buf.append('0');
|
|
}
|
|
}
|
|
}
|
|
buf.append(Integer.toHexString(c));
|
|
}
|
|
}
|
|
return buf.toString();
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static class Data {
|
|
public Data() {
|
|
variableNames = new Hashtable();
|
|
setVariables = new Hashtable();
|
|
ruleSet = new TransliterationRuleSet();
|
|
}
|
|
|
|
/**
|
|
* Rule table. May be empty.
|
|
*/
|
|
public TransliterationRuleSet ruleSet;
|
|
|
|
/**
|
|
* Map variable name (String) to variable (Character). A variable
|
|
* name may correspond to a single literal character, in which
|
|
* case the character is stored in this hash. It may also
|
|
* correspond to a UnicodeSet, in which case a character is
|
|
* again stored in this hash, but the character is a stand-in: it
|
|
* is a key for a secondary lookup in data.setVariables. The stand-in
|
|
* also represents the UnicodeSet in the stored rules.
|
|
*/
|
|
public Hashtable variableNames;
|
|
|
|
/**
|
|
* Map category variable (Character) to set (UnicodeSet).
|
|
* Variables that correspond to a set of characters are mapped
|
|
* from variable name to a stand-in character in data.variableNames.
|
|
* The stand-in then serves as a key in this hash to lookup the
|
|
* actual UnicodeSet object. In addition, the stand-in is
|
|
* stored in the rule text to represent the set of characters.
|
|
*/
|
|
public Hashtable setVariables;
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
private static class Parser {
|
|
/**
|
|
* Current rule being parsed.
|
|
*/
|
|
private String rules;
|
|
|
|
private int direction;
|
|
|
|
private Data data;
|
|
|
|
/**
|
|
* The next available stand-in for variables. This starts at some point in
|
|
* the private use area (discovered dynamically) and increments up toward
|
|
* <code>variableLimit</code>. At any point during parsing, available
|
|
* variables are <code>variableNext..variableLimit-1</code>.
|
|
*/
|
|
private char variableNext;
|
|
|
|
/**
|
|
* The last available stand-in for variables. This is discovered
|
|
* dynamically. At any point during parsing, available variables are
|
|
* <code>variableNext..variableLimit-1</code>.
|
|
*/
|
|
private char variableLimit;
|
|
|
|
// Operators
|
|
private static final char VARIABLE_DEF_OP = '=';
|
|
private static final char FORWARD_RULE_OP = '>';
|
|
private static final char REVERSE_RULE_OP = '<';
|
|
private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op
|
|
|
|
private static final String OPERATORS = "=><";
|
|
|
|
// Other special characters
|
|
private static final char QUOTE = '\'';
|
|
private static final char ESCAPE = '\\';
|
|
private static final char END_OF_RULE = ';';
|
|
private static final char RULE_COMMENT_CHAR = '#';
|
|
|
|
private static final char VARIABLE_REF_OPEN = '{';
|
|
private static final char VARIABLE_REF_CLOSE = '}';
|
|
private static final char CONTEXT_OPEN = '(';
|
|
private static final char CONTEXT_CLOSE = ')';
|
|
private static final char SET_OPEN = '[';
|
|
private static final char SET_CLOSE = ']';
|
|
private static final char CURSOR_POS = '|';
|
|
|
|
/**
|
|
* @param rules list of rules, separated by semicolon characters
|
|
* @exception IllegalArgumentException if there is a syntax error in the
|
|
* rules
|
|
*/
|
|
public Parser(String[] ruleArray, int direction) {
|
|
this.direction = direction;
|
|
data = new Data();
|
|
parseRules(ruleArray);
|
|
}
|
|
|
|
public Data getData() {
|
|
return data;
|
|
}
|
|
|
|
/**
|
|
* Parse an array of zero or more rules. The strings in the array are
|
|
* treated as if they were concatenated together, with rule terminators
|
|
* inserted between array elements if not present already.
|
|
*
|
|
* Any previous rules are discarded. Typically this method is called exactly
|
|
* once, during construction.
|
|
* @exception IllegalArgumentException if there is a syntax error in the
|
|
* rules
|
|
*/
|
|
private void parseRules(String[] ruleArray) {
|
|
determineVariableRange(ruleArray);
|
|
|
|
StringBuffer errors = null;
|
|
|
|
try {
|
|
for (int i=0; i<ruleArray.length; ++i) {
|
|
String rule = ruleArray[i];
|
|
int pos = 0;
|
|
int limit = rule.length();
|
|
while (pos < limit) {
|
|
char c = rule.charAt(pos++);
|
|
if (Character.isWhitespace(c)) {
|
|
// Ignore leading whitespace. Note that this is not
|
|
// Unicode spaces, but Java spaces -- a subset,
|
|
// representing whitespace likely to be seen in code.
|
|
continue;
|
|
}
|
|
// Skip lines starting with the comment character
|
|
if (c == RULE_COMMENT_CHAR) {
|
|
pos = rule.indexOf("\n", pos) + 1;
|
|
if (pos == 0) {
|
|
break; // No "\n" found; rest of rule is a commnet
|
|
}
|
|
continue; // Either fall out or restart with next line
|
|
}
|
|
// We've found the start of a rule. c is its first
|
|
// character, and pos points past c. Lexically parse the
|
|
// rule into component pieces.
|
|
pos = parseRule(rule, --pos, limit);
|
|
}
|
|
}
|
|
} catch (IllegalArgumentException e) {
|
|
// errors = new StringBuffer(e.getMessage());
|
|
}
|
|
|
|
// Index the rules
|
|
try {
|
|
data.ruleSet.freeze(data.setVariables);
|
|
} catch (IllegalArgumentException e) {
|
|
if (errors == null) {
|
|
errors = new StringBuffer(e.getMessage());
|
|
} else {
|
|
errors.append("\n").append(e.getMessage());
|
|
}
|
|
}
|
|
|
|
if (errors != null) {
|
|
throw new IllegalArgumentException(errors.toString());
|
|
}
|
|
}
|
|
|
|
/**
|
|
* MAIN PARSER. Parse the next rule in the given rule string, starting
|
|
* at pos. Return the index after the last character parsed. Do not
|
|
* parse characters at or after limit.
|
|
*
|
|
* Important: The character at pos must be a non-whitespace character
|
|
* that is not the comment character.
|
|
*
|
|
* This method handles quoting, escaping, and whitespace removal. It
|
|
* parses the end-of-rule character. It recognizes context and cursor
|
|
* indicators. Once it does a lexical breakdown of the rule at pos, it
|
|
* creates a rule object and adds it to our rule list.
|
|
*/
|
|
private int parseRule(String rule, int pos, int limit) {
|
|
// Locate the left side, operator, and right side
|
|
int start = pos;
|
|
char operator = 0;
|
|
|
|
StringBuffer buf = new StringBuffer();
|
|
int cursor = -1; // position of cursor in buf
|
|
int ante = -1; // position of ante context marker ')' in buf
|
|
int post = -1; // position of post context marker '(' in buf
|
|
int postClose = -1; // position of post context close ')' in buf
|
|
|
|
// Assigned to buf and its adjuncts after the LHS has been
|
|
// parsed. Thereafter, buf etc. refer to the RHS.
|
|
String left = null;
|
|
int leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1;
|
|
|
|
main:
|
|
while (pos < limit) {
|
|
char c = rule.charAt(pos++);
|
|
if (Character.isWhitespace(c)) {
|
|
// Ignore whitespace. Note that this is not Unicode
|
|
// spaces, but Java spaces -- a subset, representing
|
|
// whitespace likely to be seen in code.
|
|
continue;
|
|
}
|
|
// Handle escapes
|
|
if (c == ESCAPE) {
|
|
if (pos == limit) {
|
|
syntaxError("Trailing backslash", rule, start);
|
|
}
|
|
buf.append(rule.charAt(pos++));
|
|
continue;
|
|
}
|
|
// Handle quoted matter
|
|
if (c == QUOTE) {
|
|
int iq = rule.indexOf(QUOTE, pos);
|
|
if (iq == pos) {
|
|
buf.append(c); // Parse [''] outside quotes as [']
|
|
++pos;
|
|
} else {
|
|
/* This loop picks up a segment of quoted text of the
|
|
* form 'aaaa' each time through. If this segment
|
|
* hasn't really ended ('aaaa''bbbb') then it keeps
|
|
* looping, each time adding on a new segment. When it
|
|
* reaches the final quote it breaks.
|
|
*/
|
|
for (;;) {
|
|
if (iq < 0) {
|
|
syntaxError("Unterminated quote", rule, start);
|
|
}
|
|
buf.append(rule.substring(pos, iq));
|
|
pos = iq+1;
|
|
if (pos < limit && rule.charAt(pos) == QUOTE) {
|
|
// Parse [''] inside quotes as [']
|
|
iq = rule.indexOf(QUOTE, pos+1);
|
|
// Continue looping
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
continue;
|
|
}
|
|
if (OPERATORS.indexOf(c) >= 0) {
|
|
if (operator != 0) {
|
|
syntaxError("Unquoted " + c, rule, start);
|
|
}
|
|
// Found an operator char. Check for forward-reverse operator.
|
|
if (c == REVERSE_RULE_OP &&
|
|
(pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
|
|
++pos;
|
|
operator = FWDREV_RULE_OP;
|
|
} else {
|
|
operator = c;
|
|
}
|
|
left = buf.toString(); // lhs
|
|
leftCursor = cursor;
|
|
leftAnte = ante;
|
|
leftPost = post;
|
|
leftPostClose = postClose;
|
|
|
|
buf.setLength(0);
|
|
cursor = ante = post = postClose = -1;
|
|
continue;
|
|
}
|
|
switch (c) {
|
|
case END_OF_RULE:
|
|
break main;
|
|
case VARIABLE_REF_OPEN:
|
|
{
|
|
int j = rule.indexOf(VARIABLE_REF_CLOSE, pos);
|
|
if (pos == j || j < 0) { // empty or unterminated
|
|
syntaxError("Malformed variable reference", rule, start);
|
|
}
|
|
String name = rule.substring(pos, j);
|
|
pos = j+1;
|
|
buf.append(getVariableDef(name));
|
|
}
|
|
break;
|
|
case CONTEXT_OPEN:
|
|
if (post >= 0) {
|
|
syntaxError("Multiple post contexts", rule, start);
|
|
}
|
|
// Ignore CONTEXT_OPEN if buffer length is zero -- that means
|
|
// this is the optional opening delimiter for the ante context.
|
|
if (buf.length() > 0) {
|
|
post = buf.length();
|
|
}
|
|
break;
|
|
case CONTEXT_CLOSE:
|
|
if (postClose >= 0) {
|
|
syntaxError("Unexpected " + c, rule, start);
|
|
}
|
|
if (post >= 0) {
|
|
// This is probably the optional closing delimiter
|
|
// for the post context; save the pos and check later.
|
|
postClose = buf.length();
|
|
} else if (ante >= 0) {
|
|
syntaxError("Multiple ante contexts", rule, start);
|
|
} else {
|
|
ante = buf.length();
|
|
}
|
|
break;
|
|
case SET_OPEN:
|
|
ParsePosition pp = new ParsePosition(pos-1); // Backup to opening '['
|
|
buf.append(registerSet(new UnicodeSet(rule, pp,
|
|
data.variableNames, data.setVariables)));
|
|
pos = pp.getIndex();
|
|
break;
|
|
case VARIABLE_REF_CLOSE:
|
|
case SET_CLOSE:
|
|
syntaxError("Unquoted " + c, rule, start);
|
|
case CURSOR_POS:
|
|
if (cursor >= 0) {
|
|
syntaxError("Multiple cursors", rule, start);
|
|
}
|
|
cursor = buf.length();
|
|
break;
|
|
default:
|
|
buf.append(c);
|
|
break;
|
|
}
|
|
}
|
|
if (operator == 0) {
|
|
syntaxError("No operator", rule, start);
|
|
}
|
|
|
|
// Check context close parameters
|
|
if ((leftPostClose >= 0 && leftPostClose != left.length()) ||
|
|
(postClose >= 0 && postClose != buf.length())) {
|
|
syntaxError("Extra text after ]", rule, start);
|
|
}
|
|
|
|
// Context is only allowed on the input side; that is, the left side
|
|
// for forward rules. Cursors are only allowed on the output side;
|
|
// that is, the right side for forward rules. Bidirectional rules
|
|
// ignore elements that do not apply.
|
|
|
|
switch (operator) {
|
|
case VARIABLE_DEF_OP:
|
|
// LHS is the name. RHS is a single character, either a literal
|
|
// or a set (already parsed). If RHS is longer than one
|
|
// character, it is either a multi-character string, or multiple
|
|
// sets, or a mixture of chars and sets -- syntax error.
|
|
if (buf.length() != 1) {
|
|
syntaxError("Malformed RHS", rule, start);
|
|
}
|
|
if (data.variableNames.get(left) != null) {
|
|
syntaxError("Duplicate definition of {" +
|
|
left + "}", rule, start);
|
|
}
|
|
data.variableNames.put(left, new Character(buf.charAt(0)));
|
|
break;
|
|
|
|
case FORWARD_RULE_OP:
|
|
if (direction == FORWARD) {
|
|
if (ante >= 0 || post >= 0 || leftCursor >= 0) {
|
|
syntaxError("Malformed rule", rule, start);
|
|
}
|
|
data.ruleSet.addRule(new TransliterationRule(
|
|
left, leftAnte, leftPost,
|
|
buf.toString(), cursor));
|
|
} // otherwise ignore the rule; it's not the direction we want
|
|
break;
|
|
|
|
case REVERSE_RULE_OP:
|
|
if (direction == REVERSE) {
|
|
if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
|
|
syntaxError("Malformed rule", rule, start);
|
|
}
|
|
data.ruleSet.addRule(new TransliterationRule(
|
|
buf.toString(), ante, post,
|
|
left, leftCursor));
|
|
} // otherwise ignore the rule; it's not the direction we want
|
|
break;
|
|
|
|
case FWDREV_RULE_OP:
|
|
if (direction == FORWARD) {
|
|
// The output side is the right; trim off any context
|
|
String output = buf.toString().substring(ante < 0 ? 0 : ante,
|
|
post < 0 ? buf.length() : post);
|
|
data.ruleSet.addRule(new TransliterationRule(
|
|
left, leftAnte, leftPost,
|
|
output, cursor));
|
|
} else {
|
|
// The output side is the left; trim off any context
|
|
String output = left.substring(leftAnte < 0 ? 0 : leftAnte,
|
|
leftPost < 0 ? left.length() : leftPost);
|
|
data.ruleSet.addRule(new TransliterationRule(
|
|
buf.toString(), ante, post,
|
|
output, leftCursor));
|
|
}
|
|
break;
|
|
}
|
|
|
|
return pos;
|
|
}
|
|
|
|
/**
|
|
* Throw an exception indicating a syntax error. Search the rule string
|
|
* for the probable end of the rule. Of course, if the error is that
|
|
* the end of rule marker is missing, then the rule end will not be found.
|
|
* In any case the rule start will be correctly reported.
|
|
* @param msg error description
|
|
* @param rule pattern string
|
|
* @param start position of first character of current rule
|
|
*/
|
|
private static final void syntaxError(String msg, String rule, int start) {
|
|
int end = quotedIndexOf(rule, start, rule.length(), ";");
|
|
if (end < 0) {
|
|
end = rule.length();
|
|
}
|
|
throw new IllegalArgumentException(msg + " in " +
|
|
rule.substring(start, end));
|
|
}
|
|
|
|
/**
|
|
* Allocate a private-use substitution character for the given set,
|
|
* register it in the setVariables hash, and return the substitution
|
|
* character.
|
|
*/
|
|
private final char registerSet(UnicodeSet set) {
|
|
if (variableNext >= variableLimit) {
|
|
throw new RuntimeException("Private use variables exhausted");
|
|
}
|
|
Character c = new Character(variableNext++);
|
|
data.setVariables.put(c, set);
|
|
return c.charValue();
|
|
}
|
|
|
|
/**
|
|
* Returns the single character value of the given variable name. Defined
|
|
* names are recognized.
|
|
* @exception IllegalArgumentException if the name is unknown.
|
|
*/
|
|
private char getVariableDef(String name) {
|
|
Character ch = (Character) data.variableNames.get(name);
|
|
if (ch == null) {
|
|
throw new IllegalArgumentException("Undefined variable: "
|
|
+ name);
|
|
}
|
|
return ch.charValue();
|
|
}
|
|
|
|
/**
|
|
* Determines what part of the private use region of Unicode we can use for
|
|
* variable stand-ins. The correct way to do this is as follows: Parse each
|
|
* rule, and for forward and reverse rules, take the FROM expression, and
|
|
* make a hash of all characters used. The TO expression should be ignored.
|
|
* When done, everything not in the hash is available for use. In practice,
|
|
* this method may employ some other algorithm for improved speed.
|
|
*/
|
|
private final void determineVariableRange(String[] ruleArray) {
|
|
// As an initial implementation, we just run through all the
|
|
// characters, ignoring any quoting. This works since the quote
|
|
// mechanisms are outside the private use area.
|
|
|
|
Range r = new Range('\uE000', 0x1900); // Private use area
|
|
r = r.largestUnusedSubrange(ruleArray);
|
|
|
|
if (r == null) {
|
|
throw new RuntimeException(
|
|
"No private use characters available for variables");
|
|
}
|
|
|
|
variableNext = r.start;
|
|
variableLimit = (char) (r.start + r.length);
|
|
|
|
if (variableNext >= variableLimit) {
|
|
throw new RuntimeException(
|
|
"Too few private use characters available for variables");
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Returns the index of the first character in a set, ignoring quoted text.
|
|
* For example, in the string "abc'hide'h", the 'h' in "hide" will not be
|
|
* found by a search for "h". Unlike String.indexOf(), this method searches
|
|
* not for a single character, but for any character of the string
|
|
* <code>setOfChars</code>.
|
|
* @param text text to be searched
|
|
* @param start the beginning index, inclusive; <code>0 <= start
|
|
* <= limit</code>.
|
|
* @param limit the ending index, exclusive; <code>start <= limit
|
|
* <= text.length()</code>.
|
|
* @param setOfChars string with one or more distinct characters
|
|
* @return Offset of the first character in <code>setOfChars</code>
|
|
* found, or -1 if not found.
|
|
* @see #indexOf
|
|
*/
|
|
private static int quotedIndexOf(String text, int start, int limit,
|
|
String setOfChars) {
|
|
for (int i=start; i<limit; ++i) {
|
|
char c = text.charAt(i);
|
|
if (c == ESCAPE) {
|
|
++i;
|
|
} else if (c == QUOTE) {
|
|
while (++i < limit
|
|
&& text.charAt(i) != QUOTE) {}
|
|
} else if (setOfChars.indexOf(c) >= 0) {
|
|
return i;
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
* A range of Unicode characters. Support the operations of testing for
|
|
* inclusion (does this range contain this character?) and splitting.
|
|
* Splitting involves breaking a range into two smaller ranges around a
|
|
* character inside the original range. The split character is not included
|
|
* in either range. If the split character is at either extreme end of the
|
|
* range, one of the split products is an empty range.
|
|
*
|
|
* This class is used internally to determine the largest available private
|
|
* use character range for variable stand-ins.
|
|
*/
|
|
private static class Range implements Cloneable {
|
|
char start;
|
|
int length;
|
|
|
|
Range(char start, int length) {
|
|
this.start = start;
|
|
this.length = length;
|
|
}
|
|
|
|
public Object clone() {
|
|
return new Range(start, length);
|
|
}
|
|
|
|
boolean contains(char c) {
|
|
return c >= start && (c - start) < length;
|
|
}
|
|
|
|
/**
|
|
* Assume that contains(c) is true. Split this range into two new
|
|
* ranges around the character c. Make this range one of the new ranges
|
|
* (modify it in place) and return the other new range. The character
|
|
* itself is not included in either range. If the split results in an
|
|
* empty range (that is, if c == start or c == start + length - 1) then
|
|
* return null.
|
|
*/
|
|
Range split(char c) {
|
|
if (c == start) {
|
|
++start;
|
|
--length;
|
|
return null;
|
|
} else if (c - start == length - 1) {
|
|
--length;
|
|
return null;
|
|
} else {
|
|
++c;
|
|
Range r = new Range(c, start + length - c);
|
|
length = --c - start;
|
|
return r;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Finds the largest unused subrange by the given string. A
|
|
* subrange is unused by a string if the string contains no
|
|
* characters in that range. If the given string contains no
|
|
* characters in this range, then this range itself is
|
|
* returned.
|
|
*/
|
|
Range largestUnusedSubrange(String[] strings) {
|
|
Vector v = new Vector(1);
|
|
v.addElement(clone());
|
|
|
|
for (int k=0; k<strings.length; ++k) {
|
|
String str = strings[k];
|
|
int n = str.length();
|
|
for (int i=0; i<n; ++i) {
|
|
char c = str.charAt(i);
|
|
if (contains(c)) {
|
|
for (int j=0; j<v.size(); ++j) {
|
|
Range r = (Range) v.elementAt(j);
|
|
if (r.contains(c)) {
|
|
r = r.split(c);
|
|
if (r != null) {
|
|
v.addElement(r);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
Range bestRange = null;
|
|
for (int j=0; j<v.size(); ++j) {
|
|
Range r = (Range) v.elementAt(j);
|
|
if (bestRange == null || r.length > bestRange.length) {
|
|
bestRange = r;
|
|
}
|
|
}
|
|
|
|
return bestRange;
|
|
}
|
|
}
|
|
}
|
|
}
|