Add segment support.
X-SVN-Rev: 1165
This commit is contained in:
parent
9a19714271
commit
2947282e42
@ -5,8 +5,8 @@
|
|||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
*
|
*
|
||||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
|
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
|
||||||
* $Date: 2000/03/22 02:00:08 $
|
* $Date: 2000/04/19 16:37:38 $
|
||||||
* $Revision: 1.14 $
|
* $Revision: 1.15 $
|
||||||
*
|
*
|
||||||
*****************************************************************************************
|
*****************************************************************************************
|
||||||
*/
|
*/
|
||||||
@ -412,6 +412,25 @@ public class TransliteratorTest extends TestFmwk {
|
|||||||
expect(hex3, "012", "012");
|
expect(hex3, "012", "012");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test segments and segment references.
|
||||||
|
*/
|
||||||
|
public void TestSegments() {
|
||||||
|
// Array of 3n items
|
||||||
|
// Each item is <rules>, <input>, <expected output>
|
||||||
|
String[] DATA = {
|
||||||
|
"$([a-z]$) . $([0-9]$) > $2-$1",
|
||||||
|
"abc.123.xyz.456",
|
||||||
|
"ab1-c23.xy4-z56",
|
||||||
|
};
|
||||||
|
|
||||||
|
for (int i=0; i<DATA.length; i+=3) {
|
||||||
|
logln("Pattern: " + Utility.escape(DATA[i]));
|
||||||
|
Transliterator t = new RuleBasedTransliterator("<ID>", DATA[i]);
|
||||||
|
expect(t, DATA[i+1], DATA[i+2]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//======================================================================
|
//======================================================================
|
||||||
// Support methods
|
// Support methods
|
||||||
//======================================================================
|
//======================================================================
|
||||||
|
@ -5,8 +5,8 @@
|
|||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
*
|
*
|
||||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java,v $
|
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java,v $
|
||||||
* $Date: 2000/04/12 20:17:45 $
|
* $Date: 2000/04/19 16:34:18 $
|
||||||
* $Revision: 1.18 $
|
* $Revision: 1.19 $
|
||||||
*
|
*
|
||||||
*****************************************************************************************
|
*****************************************************************************************
|
||||||
*/
|
*/
|
||||||
@ -209,9 +209,12 @@ import com.ibm.util.Utility;
|
|||||||
* <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
|
* <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
|
||||||
*
|
*
|
||||||
* @author Alan Liu
|
* @author Alan Liu
|
||||||
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.18 $ $Date: 2000/04/12 20:17:45 $
|
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.19 $ $Date: 2000/04/19 16:34:18 $
|
||||||
*
|
*
|
||||||
* $Log: RuleBasedTransliterator.java,v $
|
* $Log: RuleBasedTransliterator.java,v $
|
||||||
|
* Revision 1.19 2000/04/19 16:34:18 alan
|
||||||
|
* Add segment support.
|
||||||
|
*
|
||||||
* Revision 1.18 2000/04/12 20:17:45 alan
|
* Revision 1.18 2000/04/12 20:17:45 alan
|
||||||
* Delegate replace operation to rule object
|
* Delegate replace operation to rule object
|
||||||
*
|
*
|
||||||
@ -379,7 +382,7 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Delegate replacement to TransliterationRule object
|
// Delegate replacement to TransliterationRule object
|
||||||
limit += r.replace(text, cursor);
|
limit += r.replace(text, cursor, data);
|
||||||
// text.replace(cursor, cursor + r.getKeyLength(), r.getOutput());
|
// text.replace(cursor, cursor + r.getKeyLength(), r.getOutput());
|
||||||
// limit += r.getOutput().length() - r.getKeyLength();
|
// limit += r.getOutput().length() - r.getKeyLength();
|
||||||
cursor += r.getCursorPos();
|
cursor += r.getCursorPos();
|
||||||
@ -448,12 +451,14 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
public UnicodeSet[] setVariables;
|
public UnicodeSet[] setVariables;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The character represented by setVariables[0].
|
* The character that represents setVariables[0]. Characters
|
||||||
|
* setVariablesBase through setVariablesBase +
|
||||||
|
* setVariables.length - 1 represent UnicodeSet objects.
|
||||||
*/
|
*/
|
||||||
public char setVariablesBase;
|
public char setVariablesBase;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return the UnicodeSet associated with the given character, or
|
* Return the UnicodeSet represented by the given character, or
|
||||||
* null if none.
|
* null if none.
|
||||||
*/
|
*/
|
||||||
public UnicodeSet lookup(char c) {
|
public UnicodeSet lookup(char c) {
|
||||||
@ -461,6 +466,22 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
return (i >= 0 && i < setVariables.length)
|
return (i >= 0 && i < setVariables.length)
|
||||||
? setVariables[i] : null;
|
? setVariables[i] : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The character that represents segment 1. Characters segmentBase
|
||||||
|
* through segmentBase + 8 represent segments 1 through 9.
|
||||||
|
*/
|
||||||
|
public char segmentBase;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the zero-based index of the segment represented by the given
|
||||||
|
* character, or -1 if none. Repeat: This is a zero-based return value,
|
||||||
|
* 0..8, even though these are notated "$1".."$9".
|
||||||
|
*/
|
||||||
|
public int lookupSegmentReference(char c) {
|
||||||
|
int i = c - segmentBase;
|
||||||
|
return (i >= 0 && i < 9) ? i : -1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -548,6 +569,12 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
private static final char SET_CLOSE = ']';
|
private static final char SET_CLOSE = ']';
|
||||||
private static final char CURSOR_POS = '|';
|
private static final char CURSOR_POS = '|';
|
||||||
|
|
||||||
|
// Segments of the input string are delimited by "$(" and "$)". In the
|
||||||
|
// output string these segments are referenced as "$1" through "$9".
|
||||||
|
private static final char SEGMENT_REF = '$';
|
||||||
|
private static final char SEGMENT_OPEN = '(';
|
||||||
|
private static final char SEGMENT_CLOSE = ')';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param rules list of rules, separated by semicolon characters
|
* @param rules list of rules, separated by semicolon characters
|
||||||
* @exception IllegalArgumentException if there is a syntax error in the
|
* @exception IllegalArgumentException if there is a syntax error in the
|
||||||
@ -633,33 +660,34 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* MAIN PARSER. Parse the next rule in the given rule string, starting
|
* A class representing one side of a rule. This class knows how to
|
||||||
* at pos. Return the index after the last character parsed. Do not
|
* parse half of a rule. It is tightly coupled to the method
|
||||||
* parse characters at or after limit.
|
* RuleBasedTransliterator.Parser.parseRule().
|
||||||
*
|
|
||||||
* Important: The character at pos must be a non-whitespace character
|
|
||||||
* that is not the comment character.
|
|
||||||
*
|
|
||||||
* This method handles quoting, escaping, and whitespace removal. It
|
|
||||||
* parses the end-of-rule character. It recognizes context and cursor
|
|
||||||
* indicators. Once it does a lexical breakdown of the rule at pos, it
|
|
||||||
* creates a rule object and adds it to our rule list.
|
|
||||||
*/
|
*/
|
||||||
private int parseRule(String rule, int pos, int limit) {
|
static class RuleHalf {
|
||||||
// Locate the left side, operator, and right side
|
|
||||||
|
public String text;
|
||||||
|
|
||||||
|
public int cursor = -1; // position of cursor in text
|
||||||
|
public int ante = -1; // position of ante context marker ')' in text
|
||||||
|
public int post = -1; // position of post context marker '(' in text
|
||||||
|
|
||||||
|
// Record the position of the segment substrings and references. A
|
||||||
|
// given side should have segments or segment references, but not
|
||||||
|
// both.
|
||||||
|
public Vector segments = null; // ref substring start,limits
|
||||||
|
public int maxRef = -1; // index of largest ref (1..9)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse one side of a rule, stopping at either the limit,
|
||||||
|
* the END_OF_RULE character, or an operator. Return
|
||||||
|
* the pos of the terminating character (or limit).
|
||||||
|
*/
|
||||||
|
public int parse(String rule, int pos, int limit,
|
||||||
|
RuleBasedTransliterator.Parser parser) {
|
||||||
int start = pos;
|
int start = pos;
|
||||||
char operator = 0;
|
|
||||||
|
|
||||||
StringBuffer buf = new StringBuffer();
|
StringBuffer buf = new StringBuffer();
|
||||||
int cursor = -1; // position of cursor in buf
|
int postClose = -1; // position of post context close ')' in text
|
||||||
int ante = -1; // position of ante context marker ')' in buf
|
|
||||||
int post = -1; // position of post context marker '(' in buf
|
|
||||||
int postClose = -1; // position of post context close ')' in buf
|
|
||||||
|
|
||||||
// Assigned to buf and its adjuncts after the LHS has been
|
|
||||||
// parsed. Thereafter, buf etc. refer to the RHS.
|
|
||||||
String left = null;
|
|
||||||
int leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1;
|
|
||||||
|
|
||||||
main:
|
main:
|
||||||
while (pos < limit) {
|
while (pos < limit) {
|
||||||
@ -709,29 +737,47 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (OPERATORS.indexOf(c) >= 0) {
|
if (OPERATORS.indexOf(c) >= 0) {
|
||||||
if (operator != 0) {
|
--pos; // Backup to point to operator
|
||||||
syntaxError("Unquoted " + c, rule, start);
|
break main;
|
||||||
}
|
}
|
||||||
// Found an operator char. Check for forward-reverse operator.
|
// Handle segment definitions "$(" "$)" and references "$1"
|
||||||
if (c == REVERSE_RULE_OP &&
|
// .. "$9".
|
||||||
(pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
|
if (c == SEGMENT_REF) {
|
||||||
++pos;
|
// After a SEGMENT_REF, must see SEGMENT_OPEN,
|
||||||
operator = FWDREV_RULE_OP;
|
// SEGMENT_CLOSE, or a digit 1 to 9, with no intervening
|
||||||
|
// whitespace
|
||||||
|
if (pos == limit) {
|
||||||
|
syntaxError("Trailing " + c, rule, start);
|
||||||
|
}
|
||||||
|
c = rule.charAt(pos++);
|
||||||
|
if (c == SEGMENT_OPEN || c == SEGMENT_CLOSE) {
|
||||||
|
// Parse "$(", "$)"
|
||||||
|
if (segments == null) {
|
||||||
|
segments = new Vector();
|
||||||
|
}
|
||||||
|
if ((c == SEGMENT_OPEN) !=
|
||||||
|
(segments.size() % 2 == 0)) {
|
||||||
|
syntaxError("Mismatched segment delimiters",
|
||||||
|
rule, start);
|
||||||
|
}
|
||||||
|
segments.addElement(new Integer(buf.length()));
|
||||||
} else {
|
} else {
|
||||||
operator = c;
|
// Parse "$1" "$2" .. "$9"
|
||||||
|
int r = Character.digit(c, 10);
|
||||||
|
if (r < 1 || r > 9) {
|
||||||
|
syntaxError("Illegal char after " + SEGMENT_REF,
|
||||||
|
rule, start);
|
||||||
|
}
|
||||||
|
if (r > maxRef) {
|
||||||
|
maxRef = r;
|
||||||
|
}
|
||||||
|
buf.append((char) (parser.data.segmentBase + r - 1));
|
||||||
}
|
}
|
||||||
left = buf.toString(); // lhs
|
|
||||||
leftCursor = cursor;
|
|
||||||
leftAnte = ante;
|
|
||||||
leftPost = post;
|
|
||||||
leftPostClose = postClose;
|
|
||||||
|
|
||||||
buf.setLength(0);
|
|
||||||
cursor = ante = post = postClose = -1;
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case END_OF_RULE:
|
case END_OF_RULE:
|
||||||
|
--pos; // Backup to point to END_OF_RULE
|
||||||
break main;
|
break main;
|
||||||
case VARIABLE_REF_OPEN:
|
case VARIABLE_REF_OPEN:
|
||||||
{
|
{
|
||||||
@ -741,7 +787,7 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
}
|
}
|
||||||
String name = rule.substring(pos, j);
|
String name = rule.substring(pos, j);
|
||||||
pos = j+1;
|
pos = j+1;
|
||||||
buf.append(getVariableDef(name));
|
buf.append(parser.getVariableDef(name));
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case CONTEXT_OPEN:
|
case CONTEXT_OPEN:
|
||||||
@ -770,7 +816,7 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
break;
|
break;
|
||||||
case SET_OPEN:
|
case SET_OPEN:
|
||||||
ParsePosition pp = new ParsePosition(pos-1); // Backup to opening '['
|
ParsePosition pp = new ParsePosition(pos-1); // Backup to opening '['
|
||||||
buf.append(registerSet(new UnicodeSet(rule, pp, parseData)));
|
buf.append(parser.registerSet(new UnicodeSet(rule, pp, parser.parseData)));
|
||||||
pos = pp.getIndex();
|
pos = pp.getIndex();
|
||||||
break;
|
break;
|
||||||
case VARIABLE_REF_CLOSE:
|
case VARIABLE_REF_CLOSE:
|
||||||
@ -787,77 +833,155 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (operator == 0) {
|
|
||||||
syntaxError("No operator", rule, start);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check context close parameters
|
// Check context close parameters
|
||||||
if ((leftPostClose >= 0 && leftPostClose != left.length()) ||
|
if (postClose >= 0 && postClose != buf.length()) {
|
||||||
(postClose >= 0 && postClose != buf.length())) {
|
|
||||||
syntaxError("Extra text after ]", rule, start);
|
syntaxError("Extra text after ]", rule, start);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Context is only allowed on the input side; that is, the left side
|
text = buf.toString();
|
||||||
// for forward rules. Cursors are only allowed on the output side;
|
return pos;
|
||||||
// that is, the right side for forward rules. Bidirectional rules
|
}
|
||||||
// ignore elements that do not apply.
|
|
||||||
|
|
||||||
switch (operator) {
|
/**
|
||||||
case VARIABLE_DEF_OP:
|
* Remove context.
|
||||||
|
*/
|
||||||
|
void removeContext() {
|
||||||
|
text = text.substring(ante < 0 ? 0 : ante,
|
||||||
|
post < 0 ? text.length() : post);
|
||||||
|
ante = post = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create and return an int[] array of segments.
|
||||||
|
*/
|
||||||
|
int[] getSegments() {
|
||||||
|
if (segments == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
int[] result = new int[segments.size()];
|
||||||
|
for (int i=0; i<segments.size(); ++i) {
|
||||||
|
result[i] = ((Number)segments.elementAt(i)).intValue();
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* MAIN PARSER. Parse the next rule in the given rule string, starting
|
||||||
|
* at pos. Return the index after the last character parsed. Do not
|
||||||
|
* parse characters at or after limit.
|
||||||
|
*
|
||||||
|
* Important: The character at pos must be a non-whitespace character
|
||||||
|
* that is not the comment character.
|
||||||
|
*
|
||||||
|
* This method handles quoting, escaping, and whitespace removal. It
|
||||||
|
* parses the end-of-rule character. It recognizes context and cursor
|
||||||
|
* indicators. Once it does a lexical breakdown of the rule at pos, it
|
||||||
|
* creates a rule object and adds it to our rule list.
|
||||||
|
*
|
||||||
|
* This method is tightly coupled to the inner class RuleHalf.
|
||||||
|
*/
|
||||||
|
private int parseRule(String rule, int pos, int limit) {
|
||||||
|
// Locate the left side, operator, and right side
|
||||||
|
int start = pos;
|
||||||
|
char operator = 0;
|
||||||
|
|
||||||
|
RuleHalf left = new RuleHalf();
|
||||||
|
RuleHalf right = new RuleHalf();
|
||||||
|
|
||||||
|
pos = left.parse(rule, pos, limit, this);
|
||||||
|
|
||||||
|
if (pos == limit ||
|
||||||
|
OPERATORS.indexOf(operator = rule.charAt(pos++)) < 0) {
|
||||||
|
syntaxError("No operator", rule, start);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Found an operator char. Check for forward-reverse operator.
|
||||||
|
if (operator == REVERSE_RULE_OP &&
|
||||||
|
(pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
|
||||||
|
++pos;
|
||||||
|
operator = FWDREV_RULE_OP;
|
||||||
|
}
|
||||||
|
|
||||||
|
pos = right.parse(rule, pos, limit, this);
|
||||||
|
|
||||||
|
if (pos < limit) {
|
||||||
|
if (rule.charAt(pos) == END_OF_RULE) {
|
||||||
|
++pos;
|
||||||
|
} else {
|
||||||
|
// RuleHalf parser must have terminated at an operator
|
||||||
|
syntaxError("Unquoted operator", rule, start);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (operator == VARIABLE_DEF_OP) {
|
||||||
// LHS is the name. RHS is a single character, either a literal
|
// LHS is the name. RHS is a single character, either a literal
|
||||||
// or a set (already parsed). If RHS is longer than one
|
// or a set (already parsed). If RHS is longer than one
|
||||||
// character, it is either a multi-character string, or multiple
|
// character, it is either a multi-character string, or multiple
|
||||||
// sets, or a mixture of chars and sets -- syntax error.
|
// sets, or a mixture of chars and sets -- syntax error.
|
||||||
if (buf.length() != 1) {
|
if (right.text.length() != 1) {
|
||||||
syntaxError("Malformed RHS", rule, start);
|
syntaxError("Malformed RHS", rule, start);
|
||||||
}
|
}
|
||||||
if (data.variableNames.get(left) != null) {
|
if (data.variableNames.get(left.text) != null) {
|
||||||
syntaxError("Duplicate definition of {" +
|
syntaxError("Duplicate definition of {" +
|
||||||
left + "}", rule, start);
|
left.text + "}", rule, start);
|
||||||
}
|
}
|
||||||
data.variableNames.put(left, new Character(buf.charAt(0)));
|
data.variableNames.put(left.text, new Character(right.text.charAt(0)));
|
||||||
break;
|
return pos;
|
||||||
|
|
||||||
case FORWARD_RULE_OP:
|
|
||||||
if (direction == FORWARD) {
|
|
||||||
if (ante >= 0 || post >= 0 || leftCursor >= 0) {
|
|
||||||
syntaxError("Malformed rule", rule, start);
|
|
||||||
}
|
}
|
||||||
data.ruleSet.addRule(new TransliterationRule(
|
|
||||||
left, leftAnte, leftPost,
|
|
||||||
buf.toString(), cursor));
|
|
||||||
} // otherwise ignore the rule; it's not the direction we want
|
|
||||||
break;
|
|
||||||
|
|
||||||
case REVERSE_RULE_OP:
|
// If the direction we want doesn't match the rule
|
||||||
|
// direction, do nothing.
|
||||||
|
if (operator != FWDREV_RULE_OP &&
|
||||||
|
((direction == FORWARD) != (operator == FORWARD_RULE_OP))) {
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Transform the rule into a forward rule by swapping the
|
||||||
|
// sides if necessary.
|
||||||
if (direction == REVERSE) {
|
if (direction == REVERSE) {
|
||||||
if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
|
RuleHalf temp = left;
|
||||||
|
left = right;
|
||||||
|
right = temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove non-applicable elements in forward-reverse
|
||||||
|
// rules. Bidirectional rules ignore elements that do not
|
||||||
|
// apply.
|
||||||
|
if (operator == FWDREV_RULE_OP) {
|
||||||
|
right.removeContext();
|
||||||
|
right.segments = null;
|
||||||
|
left.cursor = left.maxRef = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Context is only allowed on the input side. Cursors are only
|
||||||
|
// allowed on the output side. Segment delimiters can only appear
|
||||||
|
// on the left, and references on the right.
|
||||||
|
if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
|
||||||
|
right.segments != null || left.maxRef >= 0) {
|
||||||
syntaxError("Malformed rule", rule, start);
|
syntaxError("Malformed rule", rule, start);
|
||||||
}
|
}
|
||||||
data.ruleSet.addRule(new TransliterationRule(
|
|
||||||
buf.toString(), ante, post,
|
|
||||||
left, leftCursor));
|
|
||||||
} // otherwise ignore the rule; it's not the direction we want
|
|
||||||
break;
|
|
||||||
|
|
||||||
case FWDREV_RULE_OP:
|
// Check integrity of segments and segment references. Each
|
||||||
if (direction == FORWARD) {
|
// segment's start must have a corresponding limit, and the
|
||||||
// The output side is the right; trim off any context
|
// references must not refer to segments that do not exist.
|
||||||
String output = buf.toString().substring(ante < 0 ? 0 : ante,
|
int[] segmentsArray = null;
|
||||||
post < 0 ? buf.length() : post);
|
if (left.segments != null) {
|
||||||
data.ruleSet.addRule(new TransliterationRule(
|
int n = left.segments.size();
|
||||||
left, leftAnte, leftPost,
|
if (n % 2 != 0) {
|
||||||
output, cursor));
|
syntaxError("Odd length segments", rule, start);
|
||||||
} else {
|
|
||||||
// The output side is the left; trim off any context
|
|
||||||
String output = left.substring(leftAnte < 0 ? 0 : leftAnte,
|
|
||||||
leftPost < 0 ? left.length() : leftPost);
|
|
||||||
data.ruleSet.addRule(new TransliterationRule(
|
|
||||||
buf.toString(), ante, post,
|
|
||||||
output, leftCursor));
|
|
||||||
}
|
}
|
||||||
break;
|
n /= 2;
|
||||||
|
if (right.maxRef > n) {
|
||||||
|
syntaxError("Undefined segment reference " + right.maxRef, rule, start);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
data.ruleSet.addRule(new TransliterationRule(
|
||||||
|
left.text, left.ante, left.post,
|
||||||
|
right.text, right.cursor,
|
||||||
|
left.getSegments(), data));
|
||||||
|
|
||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
@ -871,13 +995,13 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
* @param rule pattern string
|
* @param rule pattern string
|
||||||
* @param start position of first character of current rule
|
* @param start position of first character of current rule
|
||||||
*/
|
*/
|
||||||
private static final void syntaxError(String msg, String rule, int start) {
|
static final void syntaxError(String msg, String rule, int start) {
|
||||||
int end = quotedIndexOf(rule, start, rule.length(), ";");
|
int end = quotedIndexOf(rule, start, rule.length(), ";");
|
||||||
if (end < 0) {
|
if (end < 0) {
|
||||||
end = rule.length();
|
end = rule.length();
|
||||||
}
|
}
|
||||||
throw new IllegalArgumentException(msg + " in " +
|
throw new IllegalArgumentException(msg + " in \"" +
|
||||||
rule.substring(start, end));
|
Utility.escape(rule.substring(start, end)) + '"');
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -928,7 +1052,9 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
"No private use characters available for variables");
|
"No private use characters available for variables");
|
||||||
}
|
}
|
||||||
|
|
||||||
data.setVariablesBase = variableNext = r.start;
|
// Allocate 9 characters for segment references 1 through 9
|
||||||
|
data.segmentBase = r.start;
|
||||||
|
data.setVariablesBase = variableNext = (char) (data.segmentBase + 9);
|
||||||
variableLimit = (char) (r.start + r.length);
|
variableLimit = (char) (r.start + r.length);
|
||||||
|
|
||||||
if (variableNext >= variableLimit) {
|
if (variableNext >= variableLimit) {
|
||||||
|
@ -5,8 +5,8 @@
|
|||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
*
|
*
|
||||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $
|
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $
|
||||||
* $Date: 2000/04/12 20:17:45 $
|
* $Date: 2000/04/19 16:34:18 $
|
||||||
* $Revision: 1.15 $
|
* $Revision: 1.16 $
|
||||||
*
|
*
|
||||||
*****************************************************************************************
|
*****************************************************************************************
|
||||||
*/
|
*/
|
||||||
@ -30,12 +30,26 @@ import com.ibm.util.Utility;
|
|||||||
* Variables are detected by looking up each character in a supplied
|
* Variables are detected by looking up each character in a supplied
|
||||||
* variable list to see if it has been so defined.
|
* variable list to see if it has been so defined.
|
||||||
*
|
*
|
||||||
|
* <p>A rule may contain segments in its input string and segment references in
|
||||||
|
* its output string. A segment is a substring of the input pattern, indicated
|
||||||
|
* by an offset and limit. The segment may span the preceding or following
|
||||||
|
* context. A segment reference is a special character in the output string
|
||||||
|
* that causes a segment of the input string (not the input pattern) to be
|
||||||
|
* copied to the output string. The range of special characters that represent
|
||||||
|
* segment references is defined by RuleBasedTransliterator.Data.
|
||||||
|
*
|
||||||
|
* <p>Example: The rule "$([a-z]$) . $([0-9]$) > $2 . $1" will change the input
|
||||||
|
* string "abc.123" to "ab1.c23".
|
||||||
|
*
|
||||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||||
*
|
*
|
||||||
* @author Alan Liu
|
* @author Alan Liu
|
||||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.15 $ $Date: 2000/04/12 20:17:45 $
|
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.16 $ $Date: 2000/04/19 16:34:18 $
|
||||||
*
|
*
|
||||||
* $Log: TransliterationRule.java,v $
|
* $Log: TransliterationRule.java,v $
|
||||||
|
* Revision 1.16 2000/04/19 16:34:18 alan
|
||||||
|
* Add segment support.
|
||||||
|
*
|
||||||
* Revision 1.15 2000/04/12 20:17:45 alan
|
* Revision 1.15 2000/04/12 20:17:45 alan
|
||||||
* Delegate replace operation to rule object
|
* Delegate replace operation to rule object
|
||||||
*
|
*
|
||||||
@ -121,6 +135,21 @@ class TransliterationRule {
|
|||||||
*/
|
*/
|
||||||
private String output;
|
private String output;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Array of segments. These are segments of the input string that may be
|
||||||
|
* referenced and appear in the output string. Each segment is stored as an
|
||||||
|
* offset, limit pair. Segments are referenced by a 1-based index;
|
||||||
|
* reference i thus includes characters at offset segments[2*i-2] to
|
||||||
|
* segments[2*i-1]-1 in the pattern string.
|
||||||
|
*
|
||||||
|
* In the output string, a segment reference is indicated by a character in
|
||||||
|
* a special range, as defined by RuleBasedTransliterator.Data.
|
||||||
|
*
|
||||||
|
* Most rules have no segments, in which case segments is null, and the
|
||||||
|
* output string need not be checked for segment reference characters.
|
||||||
|
*/
|
||||||
|
private int[] segments;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The length of the string that must match before the key. If
|
* The length of the string that must match before the key. If
|
||||||
* zero, then there is no matching requirement before the key.
|
* zero, then there is no matching requirement before the key.
|
||||||
@ -160,11 +189,17 @@ class TransliterationRule {
|
|||||||
* <code>output</code>; that is, -1 is equivalent to
|
* <code>output</code>; that is, -1 is equivalent to
|
||||||
* <code>output.length()</code>. If greater than
|
* <code>output.length()</code>. If greater than
|
||||||
* <code>output.length()</code> then an exception is thrown.
|
* <code>output.length()</code> then an exception is thrown.
|
||||||
|
* @param segs array of 2n integers. Each of n pairs consists of offset,
|
||||||
|
* limit for a segment of the input string. Characters in the output string
|
||||||
|
* refer to these segments if they are in a special range determined by the
|
||||||
|
* associated RuleBasedTransliterator.Data object. May be null if there are
|
||||||
|
* no segments.
|
||||||
*/
|
*/
|
||||||
public TransliterationRule(String input,
|
public TransliterationRule(String input,
|
||||||
int anteContextPos, int postContextPos,
|
int anteContextPos, int postContextPos,
|
||||||
String output,
|
String output,
|
||||||
int cursorPos) {
|
int cursorPos,
|
||||||
|
int[] segs) {
|
||||||
// Do range checks only when warranted to save time
|
// Do range checks only when warranted to save time
|
||||||
if (anteContextPos < 0) {
|
if (anteContextPos < 0) {
|
||||||
anteContextLength = 0;
|
anteContextLength = 0;
|
||||||
@ -193,6 +228,34 @@ class TransliterationRule {
|
|||||||
}
|
}
|
||||||
pattern = input;
|
pattern = input;
|
||||||
this.output = output;
|
this.output = output;
|
||||||
|
// We don't validate the segments array. The caller must
|
||||||
|
// guarantee that the segments are well-formed.
|
||||||
|
this.segments = segs;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Construct a new rule with the given input, output text, and other
|
||||||
|
* attributes. A cursor position may be specified for the output text.
|
||||||
|
* @param input input string, including key and optional ante and
|
||||||
|
* post context
|
||||||
|
* @param anteContextPos offset into input to end of ante context, or -1 if
|
||||||
|
* none. Must be <= input.length() if not -1.
|
||||||
|
* @param postContextPos offset into input to start of post context, or -1
|
||||||
|
* if none. Must be <= input.length() if not -1, and must be >=
|
||||||
|
* anteContextPos.
|
||||||
|
* @param output output string
|
||||||
|
* @param cursorPos offset into output at which cursor is located, or -1 if
|
||||||
|
* none. If less than zero, then the cursor is placed after the
|
||||||
|
* <code>output</code>; that is, -1 is equivalent to
|
||||||
|
* <code>output.length()</code>. If greater than
|
||||||
|
* <code>output.length()</code> then an exception is thrown.
|
||||||
|
*/
|
||||||
|
public TransliterationRule(String input,
|
||||||
|
int anteContextPos, int postContextPos,
|
||||||
|
String output,
|
||||||
|
int cursorPos) {
|
||||||
|
this(input, anteContextPos, postContextPos,
|
||||||
|
output, cursorPos, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -238,11 +301,34 @@ class TransliterationRule {
|
|||||||
* matches. This is the offset to the point after the ante
|
* matches. This is the offset to the point after the ante
|
||||||
* context, if any, and before the match string and any post
|
* context, if any, and before the match string and any post
|
||||||
* context.
|
* context.
|
||||||
|
* @param data the RuleBasedTransliterator.Data object specifying
|
||||||
|
* context for this transliterator.
|
||||||
* @return the change in the length of the text
|
* @return the change in the length of the text
|
||||||
*/
|
*/
|
||||||
int replace(Replaceable text, int offset) {
|
public int replace(Replaceable text, int offset,
|
||||||
text.replace(offset, offset + keyLength, output);
|
RuleBasedTransliterator.Data data) {
|
||||||
return output.length() - keyLength;
|
String out;
|
||||||
|
if (segments == null) {
|
||||||
|
out = output;
|
||||||
|
} else {
|
||||||
|
int textStart = offset - anteContextLength;
|
||||||
|
StringBuffer buf = new StringBuffer();
|
||||||
|
for (int i=0; i<output.length(); ++i) {
|
||||||
|
char c = output.charAt(i);
|
||||||
|
int b = data.lookupSegmentReference(c);
|
||||||
|
if (b < 0) {
|
||||||
|
buf.append(c);
|
||||||
|
} else {
|
||||||
|
for (int j=textStart + segments[2*b];
|
||||||
|
j<textStart + segments[2*b+1]; ++j) {
|
||||||
|
buf.append(text.charAt(j));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out = buf.toString();
|
||||||
|
}
|
||||||
|
text.replace(offset, offset + keyLength, out);
|
||||||
|
return out.length() - keyLength;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -5,8 +5,8 @@
|
|||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
*
|
*
|
||||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $
|
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $
|
||||||
* $Date: 2000/03/22 02:00:08 $
|
* $Date: 2000/04/19 16:37:38 $
|
||||||
* $Revision: 1.14 $
|
* $Revision: 1.15 $
|
||||||
*
|
*
|
||||||
*****************************************************************************************
|
*****************************************************************************************
|
||||||
*/
|
*/
|
||||||
@ -412,6 +412,25 @@ public class TransliteratorTest extends TestFmwk {
|
|||||||
expect(hex3, "012", "012");
|
expect(hex3, "012", "012");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test segments and segment references.
|
||||||
|
*/
|
||||||
|
public void TestSegments() {
|
||||||
|
// Array of 3n items
|
||||||
|
// Each item is <rules>, <input>, <expected output>
|
||||||
|
String[] DATA = {
|
||||||
|
"$([a-z]$) . $([0-9]$) > $2-$1",
|
||||||
|
"abc.123.xyz.456",
|
||||||
|
"ab1-c23.xy4-z56",
|
||||||
|
};
|
||||||
|
|
||||||
|
for (int i=0; i<DATA.length; i+=3) {
|
||||||
|
logln("Pattern: " + Utility.escape(DATA[i]));
|
||||||
|
Transliterator t = new RuleBasedTransliterator("<ID>", DATA[i]);
|
||||||
|
expect(t, DATA[i+1], DATA[i+2]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//======================================================================
|
//======================================================================
|
||||||
// Support methods
|
// Support methods
|
||||||
//======================================================================
|
//======================================================================
|
||||||
|
@ -5,8 +5,8 @@
|
|||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
*
|
*
|
||||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/RuleBasedTransliterator.java,v $
|
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/RuleBasedTransliterator.java,v $
|
||||||
* $Date: 2000/04/12 20:17:45 $
|
* $Date: 2000/04/19 16:34:18 $
|
||||||
* $Revision: 1.18 $
|
* $Revision: 1.19 $
|
||||||
*
|
*
|
||||||
*****************************************************************************************
|
*****************************************************************************************
|
||||||
*/
|
*/
|
||||||
@ -209,9 +209,12 @@ import com.ibm.util.Utility;
|
|||||||
* <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
|
* <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
|
||||||
*
|
*
|
||||||
* @author Alan Liu
|
* @author Alan Liu
|
||||||
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.18 $ $Date: 2000/04/12 20:17:45 $
|
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.19 $ $Date: 2000/04/19 16:34:18 $
|
||||||
*
|
*
|
||||||
* $Log: RuleBasedTransliterator.java,v $
|
* $Log: RuleBasedTransliterator.java,v $
|
||||||
|
* Revision 1.19 2000/04/19 16:34:18 alan
|
||||||
|
* Add segment support.
|
||||||
|
*
|
||||||
* Revision 1.18 2000/04/12 20:17:45 alan
|
* Revision 1.18 2000/04/12 20:17:45 alan
|
||||||
* Delegate replace operation to rule object
|
* Delegate replace operation to rule object
|
||||||
*
|
*
|
||||||
@ -379,7 +382,7 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Delegate replacement to TransliterationRule object
|
// Delegate replacement to TransliterationRule object
|
||||||
limit += r.replace(text, cursor);
|
limit += r.replace(text, cursor, data);
|
||||||
// text.replace(cursor, cursor + r.getKeyLength(), r.getOutput());
|
// text.replace(cursor, cursor + r.getKeyLength(), r.getOutput());
|
||||||
// limit += r.getOutput().length() - r.getKeyLength();
|
// limit += r.getOutput().length() - r.getKeyLength();
|
||||||
cursor += r.getCursorPos();
|
cursor += r.getCursorPos();
|
||||||
@ -448,12 +451,14 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
public UnicodeSet[] setVariables;
|
public UnicodeSet[] setVariables;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The character represented by setVariables[0].
|
* The character that represents setVariables[0]. Characters
|
||||||
|
* setVariablesBase through setVariablesBase +
|
||||||
|
* setVariables.length - 1 represent UnicodeSet objects.
|
||||||
*/
|
*/
|
||||||
public char setVariablesBase;
|
public char setVariablesBase;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return the UnicodeSet associated with the given character, or
|
* Return the UnicodeSet represented by the given character, or
|
||||||
* null if none.
|
* null if none.
|
||||||
*/
|
*/
|
||||||
public UnicodeSet lookup(char c) {
|
public UnicodeSet lookup(char c) {
|
||||||
@ -461,6 +466,22 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
return (i >= 0 && i < setVariables.length)
|
return (i >= 0 && i < setVariables.length)
|
||||||
? setVariables[i] : null;
|
? setVariables[i] : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The character that represents segment 1. Characters segmentBase
|
||||||
|
* through segmentBase + 8 represent segments 1 through 9.
|
||||||
|
*/
|
||||||
|
public char segmentBase;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the zero-based index of the segment represented by the given
|
||||||
|
* character, or -1 if none. Repeat: This is a zero-based return value,
|
||||||
|
* 0..8, even though these are notated "$1".."$9".
|
||||||
|
*/
|
||||||
|
public int lookupSegmentReference(char c) {
|
||||||
|
int i = c - segmentBase;
|
||||||
|
return (i >= 0 && i < 9) ? i : -1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -548,6 +569,12 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
private static final char SET_CLOSE = ']';
|
private static final char SET_CLOSE = ']';
|
||||||
private static final char CURSOR_POS = '|';
|
private static final char CURSOR_POS = '|';
|
||||||
|
|
||||||
|
// Segments of the input string are delimited by "$(" and "$)". In the
|
||||||
|
// output string these segments are referenced as "$1" through "$9".
|
||||||
|
private static final char SEGMENT_REF = '$';
|
||||||
|
private static final char SEGMENT_OPEN = '(';
|
||||||
|
private static final char SEGMENT_CLOSE = ')';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param rules list of rules, separated by semicolon characters
|
* @param rules list of rules, separated by semicolon characters
|
||||||
* @exception IllegalArgumentException if there is a syntax error in the
|
* @exception IllegalArgumentException if there is a syntax error in the
|
||||||
@ -633,33 +660,34 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* MAIN PARSER. Parse the next rule in the given rule string, starting
|
* A class representing one side of a rule. This class knows how to
|
||||||
* at pos. Return the index after the last character parsed. Do not
|
* parse half of a rule. It is tightly coupled to the method
|
||||||
* parse characters at or after limit.
|
* RuleBasedTransliterator.Parser.parseRule().
|
||||||
*
|
|
||||||
* Important: The character at pos must be a non-whitespace character
|
|
||||||
* that is not the comment character.
|
|
||||||
*
|
|
||||||
* This method handles quoting, escaping, and whitespace removal. It
|
|
||||||
* parses the end-of-rule character. It recognizes context and cursor
|
|
||||||
* indicators. Once it does a lexical breakdown of the rule at pos, it
|
|
||||||
* creates a rule object and adds it to our rule list.
|
|
||||||
*/
|
*/
|
||||||
private int parseRule(String rule, int pos, int limit) {
|
static class RuleHalf {
|
||||||
// Locate the left side, operator, and right side
|
|
||||||
|
public String text;
|
||||||
|
|
||||||
|
public int cursor = -1; // position of cursor in text
|
||||||
|
public int ante = -1; // position of ante context marker ')' in text
|
||||||
|
public int post = -1; // position of post context marker '(' in text
|
||||||
|
|
||||||
|
// Record the position of the segment substrings and references. A
|
||||||
|
// given side should have segments or segment references, but not
|
||||||
|
// both.
|
||||||
|
public Vector segments = null; // ref substring start,limits
|
||||||
|
public int maxRef = -1; // index of largest ref (1..9)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse one side of a rule, stopping at either the limit,
|
||||||
|
* the END_OF_RULE character, or an operator. Return
|
||||||
|
* the pos of the terminating character (or limit).
|
||||||
|
*/
|
||||||
|
public int parse(String rule, int pos, int limit,
|
||||||
|
RuleBasedTransliterator.Parser parser) {
|
||||||
int start = pos;
|
int start = pos;
|
||||||
char operator = 0;
|
|
||||||
|
|
||||||
StringBuffer buf = new StringBuffer();
|
StringBuffer buf = new StringBuffer();
|
||||||
int cursor = -1; // position of cursor in buf
|
int postClose = -1; // position of post context close ')' in text
|
||||||
int ante = -1; // position of ante context marker ')' in buf
|
|
||||||
int post = -1; // position of post context marker '(' in buf
|
|
||||||
int postClose = -1; // position of post context close ')' in buf
|
|
||||||
|
|
||||||
// Assigned to buf and its adjuncts after the LHS has been
|
|
||||||
// parsed. Thereafter, buf etc. refer to the RHS.
|
|
||||||
String left = null;
|
|
||||||
int leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1;
|
|
||||||
|
|
||||||
main:
|
main:
|
||||||
while (pos < limit) {
|
while (pos < limit) {
|
||||||
@ -709,29 +737,47 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (OPERATORS.indexOf(c) >= 0) {
|
if (OPERATORS.indexOf(c) >= 0) {
|
||||||
if (operator != 0) {
|
--pos; // Backup to point to operator
|
||||||
syntaxError("Unquoted " + c, rule, start);
|
break main;
|
||||||
}
|
}
|
||||||
// Found an operator char. Check for forward-reverse operator.
|
// Handle segment definitions "$(" ")$" and references "$1"
|
||||||
if (c == REVERSE_RULE_OP &&
|
// .. "$9".
|
||||||
(pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
|
if (c == SEGMENT_REF) {
|
||||||
++pos;
|
// After a SEGMENT_REF, must see SEGMENT_OPEN,
|
||||||
operator = FWDREV_RULE_OP;
|
// SEGMENT_CLOSE, or a digit 1 to 9, with no intervening
|
||||||
|
// whitespace
|
||||||
|
if (pos == limit) {
|
||||||
|
syntaxError("Trailing " + c, rule, start);
|
||||||
|
}
|
||||||
|
c = rule.charAt(pos++);
|
||||||
|
if (c == SEGMENT_OPEN || c == SEGMENT_CLOSE) {
|
||||||
|
// Parse "$(", "$)"
|
||||||
|
if (segments == null) {
|
||||||
|
segments = new Vector();
|
||||||
|
}
|
||||||
|
if ((c == SEGMENT_OPEN) !=
|
||||||
|
(segments.size() % 2 == 0)) {
|
||||||
|
syntaxError("Mismatched segment delimiters",
|
||||||
|
rule, start);
|
||||||
|
}
|
||||||
|
segments.addElement(new Integer(buf.length()));
|
||||||
} else {
|
} else {
|
||||||
operator = c;
|
// Parse "$1" "$2" .. "$9"
|
||||||
|
int r = Character.digit(c, 10);
|
||||||
|
if (r < 1 || r > 9) {
|
||||||
|
syntaxError("Illegal char after " + SEGMENT_REF,
|
||||||
|
rule, start);
|
||||||
|
}
|
||||||
|
if (r > maxRef) {
|
||||||
|
maxRef = r;
|
||||||
|
}
|
||||||
|
buf.append((char) (parser.data.segmentBase + r - 1));
|
||||||
}
|
}
|
||||||
left = buf.toString(); // lhs
|
|
||||||
leftCursor = cursor;
|
|
||||||
leftAnte = ante;
|
|
||||||
leftPost = post;
|
|
||||||
leftPostClose = postClose;
|
|
||||||
|
|
||||||
buf.setLength(0);
|
|
||||||
cursor = ante = post = postClose = -1;
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case END_OF_RULE:
|
case END_OF_RULE:
|
||||||
|
--pos; // Backup to point to END_OF_RULE
|
||||||
break main;
|
break main;
|
||||||
case VARIABLE_REF_OPEN:
|
case VARIABLE_REF_OPEN:
|
||||||
{
|
{
|
||||||
@ -741,7 +787,7 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
}
|
}
|
||||||
String name = rule.substring(pos, j);
|
String name = rule.substring(pos, j);
|
||||||
pos = j+1;
|
pos = j+1;
|
||||||
buf.append(getVariableDef(name));
|
buf.append(parser.getVariableDef(name));
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case CONTEXT_OPEN:
|
case CONTEXT_OPEN:
|
||||||
@ -770,7 +816,7 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
break;
|
break;
|
||||||
case SET_OPEN:
|
case SET_OPEN:
|
||||||
ParsePosition pp = new ParsePosition(pos-1); // Backup to opening '['
|
ParsePosition pp = new ParsePosition(pos-1); // Backup to opening '['
|
||||||
buf.append(registerSet(new UnicodeSet(rule, pp, parseData)));
|
buf.append(parser.registerSet(new UnicodeSet(rule, pp, parser.parseData)));
|
||||||
pos = pp.getIndex();
|
pos = pp.getIndex();
|
||||||
break;
|
break;
|
||||||
case VARIABLE_REF_CLOSE:
|
case VARIABLE_REF_CLOSE:
|
||||||
@ -787,77 +833,155 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (operator == 0) {
|
|
||||||
syntaxError("No operator", rule, start);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check context close parameters
|
// Check context close parameters
|
||||||
if ((leftPostClose >= 0 && leftPostClose != left.length()) ||
|
if (postClose >= 0 && postClose != buf.length()) {
|
||||||
(postClose >= 0 && postClose != buf.length())) {
|
|
||||||
syntaxError("Extra text after ]", rule, start);
|
syntaxError("Extra text after ]", rule, start);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Context is only allowed on the input side; that is, the left side
|
text = buf.toString();
|
||||||
// for forward rules. Cursors are only allowed on the output side;
|
return pos;
|
||||||
// that is, the right side for forward rules. Bidirectional rules
|
}
|
||||||
// ignore elements that do not apply.
|
|
||||||
|
|
||||||
switch (operator) {
|
/**
|
||||||
case VARIABLE_DEF_OP:
|
* Remove context.
|
||||||
|
*/
|
||||||
|
void removeContext() {
|
||||||
|
text = text.substring(ante < 0 ? 0 : ante,
|
||||||
|
post < 0 ? text.length() : post);
|
||||||
|
ante = post = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create and return an int[] array of segments.
|
||||||
|
*/
|
||||||
|
int[] getSegments() {
|
||||||
|
if (segments == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
int[] result = new int[segments.size()];
|
||||||
|
for (int i=0; i<segments.size(); ++i) {
|
||||||
|
result[i] = ((Number)segments.elementAt(i)).intValue();
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* MAIN PARSER. Parse the next rule in the given rule string, starting
|
||||||
|
* at pos. Return the index after the last character parsed. Do not
|
||||||
|
* parse characters at or after limit.
|
||||||
|
*
|
||||||
|
* Important: The character at pos must be a non-whitespace character
|
||||||
|
* that is not the comment character.
|
||||||
|
*
|
||||||
|
* This method handles quoting, escaping, and whitespace removal. It
|
||||||
|
* parses the end-of-rule character. It recognizes context and cursor
|
||||||
|
* indicators. Once it does a lexical breakdown of the rule at pos, it
|
||||||
|
* creates a rule object and adds it to our rule list.
|
||||||
|
*
|
||||||
|
* This method is tightly coupled to the inner class RuleHalf.
|
||||||
|
*/
|
||||||
|
private int parseRule(String rule, int pos, int limit) {
|
||||||
|
// Locate the left side, operator, and right side
|
||||||
|
int start = pos;
|
||||||
|
char operator = 0;
|
||||||
|
|
||||||
|
RuleHalf left = new RuleHalf();
|
||||||
|
RuleHalf right = new RuleHalf();
|
||||||
|
|
||||||
|
pos = left.parse(rule, pos, limit, this);
|
||||||
|
|
||||||
|
if (pos == limit ||
|
||||||
|
OPERATORS.indexOf(operator = rule.charAt(pos++)) < 0) {
|
||||||
|
syntaxError("No operator", rule, start);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Found an operator char. Check for forward-reverse operator.
|
||||||
|
if (operator == REVERSE_RULE_OP &&
|
||||||
|
(pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
|
||||||
|
++pos;
|
||||||
|
operator = FWDREV_RULE_OP;
|
||||||
|
}
|
||||||
|
|
||||||
|
pos = right.parse(rule, pos, limit, this);
|
||||||
|
|
||||||
|
if (pos < limit) {
|
||||||
|
if (rule.charAt(pos) == END_OF_RULE) {
|
||||||
|
++pos;
|
||||||
|
} else {
|
||||||
|
// RuleHalf parser must have terminated at an operator
|
||||||
|
syntaxError("Unquoted operator", rule, start);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (operator == VARIABLE_DEF_OP) {
|
||||||
// LHS is the name. RHS is a single character, either a literal
|
// LHS is the name. RHS is a single character, either a literal
|
||||||
// or a set (already parsed). If RHS is longer than one
|
// or a set (already parsed). If RHS is longer than one
|
||||||
// character, it is either a multi-character string, or multiple
|
// character, it is either a multi-character string, or multiple
|
||||||
// sets, or a mixture of chars and sets -- syntax error.
|
// sets, or a mixture of chars and sets -- syntax error.
|
||||||
if (buf.length() != 1) {
|
if (right.text.length() != 1) {
|
||||||
syntaxError("Malformed RHS", rule, start);
|
syntaxError("Malformed RHS", rule, start);
|
||||||
}
|
}
|
||||||
if (data.variableNames.get(left) != null) {
|
if (data.variableNames.get(left.text) != null) {
|
||||||
syntaxError("Duplicate definition of {" +
|
syntaxError("Duplicate definition of {" +
|
||||||
left + "}", rule, start);
|
left.text + "}", rule, start);
|
||||||
}
|
}
|
||||||
data.variableNames.put(left, new Character(buf.charAt(0)));
|
data.variableNames.put(left.text, new Character(right.text.charAt(0)));
|
||||||
break;
|
return pos;
|
||||||
|
|
||||||
case FORWARD_RULE_OP:
|
|
||||||
if (direction == FORWARD) {
|
|
||||||
if (ante >= 0 || post >= 0 || leftCursor >= 0) {
|
|
||||||
syntaxError("Malformed rule", rule, start);
|
|
||||||
}
|
}
|
||||||
data.ruleSet.addRule(new TransliterationRule(
|
|
||||||
left, leftAnte, leftPost,
|
|
||||||
buf.toString(), cursor));
|
|
||||||
} // otherwise ignore the rule; it's not the direction we want
|
|
||||||
break;
|
|
||||||
|
|
||||||
case REVERSE_RULE_OP:
|
// If the direction we want doesn't match the rule
|
||||||
|
// direction, do nothing.
|
||||||
|
if (operator != FWDREV_RULE_OP &&
|
||||||
|
((direction == FORWARD) != (operator == FORWARD_RULE_OP))) {
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Transform the rule into a forward rule by swapping the
|
||||||
|
// sides if necessary.
|
||||||
if (direction == REVERSE) {
|
if (direction == REVERSE) {
|
||||||
if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
|
RuleHalf temp = left;
|
||||||
|
left = right;
|
||||||
|
right = temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove non-applicable elements in forward-reverse
|
||||||
|
// rules. Bidirectional rules ignore elements that do not
|
||||||
|
// apply.
|
||||||
|
if (operator == FWDREV_RULE_OP) {
|
||||||
|
right.removeContext();
|
||||||
|
right.segments = null;
|
||||||
|
left.cursor = left.maxRef = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Context is only allowed on the input side. Cursors are only
|
||||||
|
// allowed on the output side. Segment delimiters can only appear
|
||||||
|
// on the left, and references on the right.
|
||||||
|
if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
|
||||||
|
right.segments != null || left.maxRef >= 0) {
|
||||||
syntaxError("Malformed rule", rule, start);
|
syntaxError("Malformed rule", rule, start);
|
||||||
}
|
}
|
||||||
data.ruleSet.addRule(new TransliterationRule(
|
|
||||||
buf.toString(), ante, post,
|
|
||||||
left, leftCursor));
|
|
||||||
} // otherwise ignore the rule; it's not the direction we want
|
|
||||||
break;
|
|
||||||
|
|
||||||
case FWDREV_RULE_OP:
|
// Check integrity of segments and segment references. Each
|
||||||
if (direction == FORWARD) {
|
// segment's start must have a corresponding limit, and the
|
||||||
// The output side is the right; trim off any context
|
// references must not refer to segments that do not exist.
|
||||||
String output = buf.toString().substring(ante < 0 ? 0 : ante,
|
int[] segmentsArray = null;
|
||||||
post < 0 ? buf.length() : post);
|
if (left.segments != null) {
|
||||||
data.ruleSet.addRule(new TransliterationRule(
|
int n = left.segments.size();
|
||||||
left, leftAnte, leftPost,
|
if (n % 2 != 0) {
|
||||||
output, cursor));
|
syntaxError("Odd length segments", rule, start);
|
||||||
} else {
|
|
||||||
// The output side is the left; trim off any context
|
|
||||||
String output = left.substring(leftAnte < 0 ? 0 : leftAnte,
|
|
||||||
leftPost < 0 ? left.length() : leftPost);
|
|
||||||
data.ruleSet.addRule(new TransliterationRule(
|
|
||||||
buf.toString(), ante, post,
|
|
||||||
output, leftCursor));
|
|
||||||
}
|
}
|
||||||
break;
|
n /= 2;
|
||||||
|
if (right.maxRef > n) {
|
||||||
|
syntaxError("Undefined segment reference " + right.maxRef, rule, start);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
data.ruleSet.addRule(new TransliterationRule(
|
||||||
|
left.text, left.ante, left.post,
|
||||||
|
right.text, right.cursor,
|
||||||
|
left.getSegments(), data));
|
||||||
|
|
||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
@ -871,13 +995,13 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
* @param rule pattern string
|
* @param rule pattern string
|
||||||
* @param start position of first character of current rule
|
* @param start position of first character of current rule
|
||||||
*/
|
*/
|
||||||
private static final void syntaxError(String msg, String rule, int start) {
|
static final void syntaxError(String msg, String rule, int start) {
|
||||||
int end = quotedIndexOf(rule, start, rule.length(), ";");
|
int end = quotedIndexOf(rule, start, rule.length(), ";");
|
||||||
if (end < 0) {
|
if (end < 0) {
|
||||||
end = rule.length();
|
end = rule.length();
|
||||||
}
|
}
|
||||||
throw new IllegalArgumentException(msg + " in " +
|
throw new IllegalArgumentException(msg + " in \"" +
|
||||||
rule.substring(start, end));
|
Utility.escape(rule.substring(start, end)) + '"');
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -928,7 +1052,9 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||||||
"No private use characters available for variables");
|
"No private use characters available for variables");
|
||||||
}
|
}
|
||||||
|
|
||||||
data.setVariablesBase = variableNext = r.start;
|
// Allocate 9 characters for segment references 1 through 9
|
||||||
|
data.segmentBase = r.start;
|
||||||
|
data.setVariablesBase = variableNext = (char) (data.segmentBase + 9);
|
||||||
variableLimit = (char) (r.start + r.length);
|
variableLimit = (char) (r.start + r.length);
|
||||||
|
|
||||||
if (variableNext >= variableLimit) {
|
if (variableNext >= variableLimit) {
|
||||||
|
@ -5,8 +5,8 @@
|
|||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
*
|
*
|
||||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $
|
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $
|
||||||
* $Date: 2000/04/12 20:17:45 $
|
* $Date: 2000/04/19 16:34:18 $
|
||||||
* $Revision: 1.15 $
|
* $Revision: 1.16 $
|
||||||
*
|
*
|
||||||
*****************************************************************************************
|
*****************************************************************************************
|
||||||
*/
|
*/
|
||||||
@ -30,12 +30,26 @@ import com.ibm.util.Utility;
|
|||||||
* Variables are detected by looking up each character in a supplied
|
* Variables are detected by looking up each character in a supplied
|
||||||
* variable list to see if it has been so defined.
|
* variable list to see if it has been so defined.
|
||||||
*
|
*
|
||||||
|
* <p>A rule may contain segments in its input string and segment references in
|
||||||
|
* its output string. A segment is a substring of the input pattern, indicated
|
||||||
|
* by an offset and limit. The segment may span the preceding or following
|
||||||
|
* context. A segment reference is a special character in the output string
|
||||||
|
* that causes a segment of the input string (not the input pattern) to be
|
||||||
|
* copied to the output string. The range of special characters that represent
|
||||||
|
* segment references is defined by RuleBasedTransliterator.Data.
|
||||||
|
*
|
||||||
|
* <p>Example: The rule "$([a-z]$) . $([0-9]$) > $2 . $1" will change the input
|
||||||
|
* string "abc.123" to "ab1.c23".
|
||||||
|
*
|
||||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||||
*
|
*
|
||||||
* @author Alan Liu
|
* @author Alan Liu
|
||||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.15 $ $Date: 2000/04/12 20:17:45 $
|
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.16 $ $Date: 2000/04/19 16:34:18 $
|
||||||
*
|
*
|
||||||
* $Log: TransliterationRule.java,v $
|
* $Log: TransliterationRule.java,v $
|
||||||
|
* Revision 1.16 2000/04/19 16:34:18 alan
|
||||||
|
* Add segment support.
|
||||||
|
*
|
||||||
* Revision 1.15 2000/04/12 20:17:45 alan
|
* Revision 1.15 2000/04/12 20:17:45 alan
|
||||||
* Delegate replace operation to rule object
|
* Delegate replace operation to rule object
|
||||||
*
|
*
|
||||||
@ -121,6 +135,21 @@ class TransliterationRule {
|
|||||||
*/
|
*/
|
||||||
private String output;
|
private String output;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Array of segments. These are segments of the input string that may be
|
||||||
|
* referenced and appear in the output string. Each segment is stored as an
|
||||||
|
* offset, limit pair. Segments are referenced by a 1-based index;
|
||||||
|
* reference i thus includes characters at offset segments[2*i-2] to
|
||||||
|
* segments[2*i-1]-1 in the pattern string.
|
||||||
|
*
|
||||||
|
* In the output string, a segment reference is indicated by a character in
|
||||||
|
* a special range, as defined by RuleBasedTransliterator.Data.
|
||||||
|
*
|
||||||
|
* Most rules have no segments, in which case segments is null, and the
|
||||||
|
* output string need not be checked for segment reference characters.
|
||||||
|
*/
|
||||||
|
private int[] segments;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The length of the string that must match before the key. If
|
* The length of the string that must match before the key. If
|
||||||
* zero, then there is no matching requirement before the key.
|
* zero, then there is no matching requirement before the key.
|
||||||
@ -160,11 +189,17 @@ class TransliterationRule {
|
|||||||
* <code>output</code>; that is, -1 is equivalent to
|
* <code>output</code>; that is, -1 is equivalent to
|
||||||
* <code>output.length()</code>. If greater than
|
* <code>output.length()</code>. If greater than
|
||||||
* <code>output.length()</code> then an exception is thrown.
|
* <code>output.length()</code> then an exception is thrown.
|
||||||
|
* @param segs array of 2n integers. Each of n pairs consists of offset,
|
||||||
|
* limit for a segment of the input string. Characters in the output string
|
||||||
|
* refer to these segments if they are in a special range determined by the
|
||||||
|
* associated RuleBasedTransliterator.Data object. May be null if there are
|
||||||
|
* no segments.
|
||||||
*/
|
*/
|
||||||
public TransliterationRule(String input,
|
public TransliterationRule(String input,
|
||||||
int anteContextPos, int postContextPos,
|
int anteContextPos, int postContextPos,
|
||||||
String output,
|
String output,
|
||||||
int cursorPos) {
|
int cursorPos,
|
||||||
|
int[] segs) {
|
||||||
// Do range checks only when warranted to save time
|
// Do range checks only when warranted to save time
|
||||||
if (anteContextPos < 0) {
|
if (anteContextPos < 0) {
|
||||||
anteContextLength = 0;
|
anteContextLength = 0;
|
||||||
@ -193,6 +228,34 @@ class TransliterationRule {
|
|||||||
}
|
}
|
||||||
pattern = input;
|
pattern = input;
|
||||||
this.output = output;
|
this.output = output;
|
||||||
|
// We don't validate the segments array. The caller must
|
||||||
|
// guarantee that the segments are well-formed.
|
||||||
|
this.segments = segs;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Construct a new rule with the given input, output text, and other
|
||||||
|
* attributes. A cursor position may be specified for the output text.
|
||||||
|
* @param input input string, including key and optional ante and
|
||||||
|
* post context
|
||||||
|
* @param anteContextPos offset into input to end of ante context, or -1 if
|
||||||
|
* none. Must be <= input.length() if not -1.
|
||||||
|
* @param postContextPos offset into input to start of post context, or -1
|
||||||
|
* if none. Must be <= input.length() if not -1, and must be >=
|
||||||
|
* anteContextPos.
|
||||||
|
* @param output output string
|
||||||
|
* @param cursorPos offset into output at which cursor is located, or -1 if
|
||||||
|
* none. If less than zero, then the cursor is placed after the
|
||||||
|
* <code>output</code>; that is, -1 is equivalent to
|
||||||
|
* <code>output.length()</code>. If greater than
|
||||||
|
* <code>output.length()</code> then an exception is thrown.
|
||||||
|
*/
|
||||||
|
public TransliterationRule(String input,
|
||||||
|
int anteContextPos, int postContextPos,
|
||||||
|
String output,
|
||||||
|
int cursorPos) {
|
||||||
|
this(input, anteContextPos, postContextPos,
|
||||||
|
output, cursorPos, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -238,11 +301,34 @@ class TransliterationRule {
|
|||||||
* matches. This is the offset to the point after the ante
|
* matches. This is the offset to the point after the ante
|
||||||
* context, if any, and before the match string and any post
|
* context, if any, and before the match string and any post
|
||||||
* context.
|
* context.
|
||||||
|
* @param data the RuleBasedTransliterator.Data object specifying
|
||||||
|
* context for this transliterator.
|
||||||
* @return the change in the length of the text
|
* @return the change in the length of the text
|
||||||
*/
|
*/
|
||||||
int replace(Replaceable text, int offset) {
|
public int replace(Replaceable text, int offset,
|
||||||
text.replace(offset, offset + keyLength, output);
|
RuleBasedTransliterator.Data data) {
|
||||||
return output.length() - keyLength;
|
String out;
|
||||||
|
if (segments == null) {
|
||||||
|
out = output;
|
||||||
|
} else {
|
||||||
|
int textStart = offset - anteContextLength;
|
||||||
|
StringBuffer buf = new StringBuffer();
|
||||||
|
for (int i=0; i<output.length(); ++i) {
|
||||||
|
char c = output.charAt(i);
|
||||||
|
int b = data.lookupSegmentReference(c);
|
||||||
|
if (b < 0) {
|
||||||
|
buf.append(c);
|
||||||
|
} else {
|
||||||
|
for (int j=textStart + segments[2*b];
|
||||||
|
j<textStart + segments[2*b+1]; ++j) {
|
||||||
|
buf.append(text.charAt(j));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out = buf.toString();
|
||||||
|
}
|
||||||
|
text.replace(offset, offset + keyLength, out);
|
||||||
|
return out.length() - keyLength;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
Loading…
Reference in New Issue
Block a user