Add segment support.

X-SVN-Rev: 1165
This commit is contained in:
Alan Liu 2000-04-19 16:37:38 +00:00
parent 9a19714271
commit 2947282e42
6 changed files with 882 additions and 420 deletions

View File

@ -5,8 +5,8 @@
******************************************************************************* *******************************************************************************
* *
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
* $Date: 2000/03/22 02:00:08 $ * $Date: 2000/04/19 16:37:38 $
* $Revision: 1.14 $ * $Revision: 1.15 $
* *
***************************************************************************************** *****************************************************************************************
*/ */
@ -412,6 +412,25 @@ public class TransliteratorTest extends TestFmwk {
expect(hex3, "012", "012"); expect(hex3, "012", "012");
} }
/**
 * Exercise segment definitions ("$(" ... "$)") on the input side of a rule
 * together with segment references ("$1".."$9") on the output side.
 */
public void TestSegments() {
    // Flat table of test cases. Every case occupies three consecutive
    // slots: <rules>, <input>, <expected output>.
    String[] cases = {
        "$([a-z]$) . $([0-9]$) > $2-$1",
        "abc.123.xyz.456",
        "ab1-c23.xy4-z56",
    };
    int base = 0;
    while (base < cases.length) {
        String rules = cases[base];
        logln("Pattern: " + Utility.escape(rules));
        Transliterator t = new RuleBasedTransliterator("<ID>", rules);
        String input = cases[base + 1];
        String expected = cases[base + 2];
        expect(t, input, expected);
        base += 3;
    }
}
//====================================================================== //======================================================================
// Support methods // Support methods
//====================================================================== //======================================================================

View File

@ -5,8 +5,8 @@
******************************************************************************* *******************************************************************************
* *
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java,v $ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java,v $
* $Date: 2000/04/12 20:17:45 $ * $Date: 2000/04/19 16:34:18 $
* $Revision: 1.18 $ * $Revision: 1.19 $
* *
***************************************************************************************** *****************************************************************************************
*/ */
@ -209,9 +209,12 @@ import com.ibm.util.Utility;
* <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p> * <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
* *
* @author Alan Liu * @author Alan Liu
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.18 $ $Date: 2000/04/12 20:17:45 $ * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.19 $ $Date: 2000/04/19 16:34:18 $
* *
* $Log: RuleBasedTransliterator.java,v $ * $Log: RuleBasedTransliterator.java,v $
* Revision 1.19 2000/04/19 16:34:18 alan
* Add segment support.
*
* Revision 1.18 2000/04/12 20:17:45 alan * Revision 1.18 2000/04/12 20:17:45 alan
* Delegate replace operation to rule object * Delegate replace operation to rule object
* *
@ -379,7 +382,7 @@ public class RuleBasedTransliterator extends Transliterator {
} }
} else { } else {
// Delegate replacement to TransliterationRule object // Delegate replacement to TransliterationRule object
limit += r.replace(text, cursor); limit += r.replace(text, cursor, data);
// text.replace(cursor, cursor + r.getKeyLength(), r.getOutput()); // text.replace(cursor, cursor + r.getKeyLength(), r.getOutput());
// limit += r.getOutput().length() - r.getKeyLength(); // limit += r.getOutput().length() - r.getKeyLength();
cursor += r.getCursorPos(); cursor += r.getCursorPos();
@ -448,12 +451,14 @@ public class RuleBasedTransliterator extends Transliterator {
public UnicodeSet[] setVariables; public UnicodeSet[] setVariables;
/** /**
* The character represented by setVariables[0]. * The character that represents setVariables[0]. Characters
* setVariablesBase through setVariablesBase +
* setVariables.length - 1 represent UnicodeSet objects.
*/ */
public char setVariablesBase; public char setVariablesBase;
/** /**
* Return the UnicodeSet associated with the given character, or * Return the UnicodeSet represented by the given character, or
* null if none. * null if none.
*/ */
public UnicodeSet lookup(char c) { public UnicodeSet lookup(char c) {
@ -461,6 +466,22 @@ public class RuleBasedTransliterator extends Transliterator {
return (i >= 0 && i < setVariables.length) return (i >= 0 && i < setVariables.length)
? setVariables[i] : null; ? setVariables[i] : null;
} }
/**
* The character that represents segment 1. Characters segmentBase
* through segmentBase + 8 represent segments 1 through 9.
*/
public char segmentBase;
/**
 * Map a character to the segment it stands for. The nine characters
 * starting at segmentBase stand for the references written "$1".."$9".
 * Note that the result is ZERO-based: "$1" yields 0 and "$9" yields 8.
 * @param c the character to test
 * @return the zero-based segment index, 0..8, or -1 if c is not a
 * segment reference character
 */
public int lookupSegmentReference(char c) {
    int index = c - segmentBase;
    if (index < 0 || index >= 9) {
        return -1;
    }
    return index;
}
} }
@ -548,6 +569,12 @@ public class RuleBasedTransliterator extends Transliterator {
private static final char SET_CLOSE = ']'; private static final char SET_CLOSE = ']';
private static final char CURSOR_POS = '|'; private static final char CURSOR_POS = '|';
// Segments of the input string are delimited by "$(" and "$)". In the
// output string these segments are referenced as "$1" through "$9".
private static final char SEGMENT_REF = '$';
private static final char SEGMENT_OPEN = '(';
private static final char SEGMENT_CLOSE = ')';
/** /**
* @param rules list of rules, separated by semicolon characters * @param rules list of rules, separated by semicolon characters
* @exception IllegalArgumentException if there is a syntax error in the * @exception IllegalArgumentException if there is a syntax error in the
@ -632,6 +659,214 @@ public class RuleBasedTransliterator extends Transliterator {
} }
} }
/**
* A class representing one side of a rule, that is, the text on one side
* of the rule operator. This class knows how to parse half of a rule, and
* records alongside the parsed text the positions of the cursor, the
* context markers, and any segment delimiters or segment references that
* were seen. It is tightly coupled to the method
* RuleBasedTransliterator.Parser.parseRule().
*/
static class RuleHalf {
// The parsed text of this half. Assigned at the end of parse(); null
// until parse() has completed.
public String text;
public int cursor = -1; // position of cursor in text, or -1 if none
public int ante = -1; // position of ante context marker ')' in text, or -1 if none
public int post = -1; // position of post context marker '(' in text, or -1 if none
// Record the position of the segment substrings and references. A
// given side should have segments or segment references, but not
// both.
// segments holds Integer offsets into text: entries at even indices
// are segment starts and entries at odd indices are the matching
// limits. It stays null if no segment delimiter was seen.
public Vector segments = null; // ref substring start,limits
// Largest N seen in a "$N" reference, or -1 if no reference was seen.
public int maxRef = -1; // index of largest ref (1..9)
/**
* Parse one side of a rule, stopping at either the limit,
* the END_OF_RULE character, or an operator. Return
* the pos of the terminating character (or limit).
*
* On return, the fields text, cursor, ante, post, segments and maxRef
* describe what was parsed. Syntax problems are reported through
* syntaxError(), which throws IllegalArgumentException.
*
* @param rule the string containing the rule(s) being parsed
* @param pos index in rule at which to start parsing
* @param limit index in rule at which to stop parsing
* @param parser the parser that owns this half; used to resolve
* variable references, to register sets, and to obtain the base
* character for segment references
* @return index of the terminating operator or END_OF_RULE character,
* or the index at which parsing ran out of input
*/
public int parse(String rule, int pos, int limit,
RuleBasedTransliterator.Parser parser) {
// start is only used to tell syntaxError() where this half begins
int start = pos;
// Accumulates the parsed text; all recorded positions index into it
StringBuffer buf = new StringBuffer();
int postClose = -1; // position of post context close ')' in text
main:
while (pos < limit) {
char c = rule.charAt(pos++);
if (Character.isWhitespace(c)) {
// Ignore whitespace. Note that this is not Unicode
// spaces, but Java spaces -- a subset, representing
// whitespace likely to be seen in code.
continue;
}
// Handle escapes
if (c == ESCAPE) {
if (pos == limit) {
syntaxError("Trailing backslash", rule, start);
}
// The escaped character is taken literally
buf.append(rule.charAt(pos++));
continue;
}
// Handle quoted matter
if (c == QUOTE) {
int iq = rule.indexOf(QUOTE, pos);
if (iq == pos) {
buf.append(c); // Parse [''] outside quotes as [']
++pos;
} else {
/* This loop picks up a segment of quoted text of the
* form 'aaaa' each time through. If this segment
* hasn't really ended ('aaaa''bbbb') then it keeps
* looping, each time adding on a new segment. When it
* reaches the final quote it breaks.
*/
// NOTE(review): iq comes from indexOf() on the whole rule
// string and is not compared against limit -- confirm that
// a closing quote beyond limit cannot occur in practice.
for (;;) {
if (iq < 0) {
syntaxError("Unterminated quote", rule, start);
}
buf.append(rule.substring(pos, iq));
pos = iq+1;
if (pos < limit && rule.charAt(pos) == QUOTE) {
// Parse [''] inside quotes as [']
iq = rule.indexOf(QUOTE, pos+1);
// Continue looping
} else {
break;
}
}
}
continue;
}
if (OPERATORS.indexOf(c) >= 0) {
// Leave pos on the operator so that the caller can read it
--pos; // Backup to point to operator
break main;
}
// Handle segment definitions "$(" "$)" and references "$1"
// .. "$9".
if (c == SEGMENT_REF) {
// After a SEGMENT_REF, must see SEGMENT_OPEN,
// SEGMENT_CLOSE, or a digit 1 to 9, with no intervening
// whitespace
if (pos == limit) {
syntaxError("Trailing " + c, rule, start);
}
c = rule.charAt(pos++);
if (c == SEGMENT_OPEN || c == SEGMENT_CLOSE) {
// Parse "$(", "$)"
if (segments == null) {
segments = new Vector();
}
// An open delimiter is legal only when an even number of
// offsets has been recorded (every earlier segment is
// closed); a close delimiter only when the count is odd.
if ((c == SEGMENT_OPEN) !=
(segments.size() % 2 == 0)) {
syntaxError("Mismatched segment delimiters",
rule, start);
}
// The delimiter itself adds nothing to buf; only the
// current offset into buf is recorded.
segments.addElement(new Integer(buf.length()));
} else {
// Parse "$1" "$2" .. "$9"
int r = Character.digit(c, 10);
if (r < 1 || r > 9) {
syntaxError("Illegal char after " + SEGMENT_REF,
rule, start);
}
if (r > maxRef) {
maxRef = r;
}
// Encode reference N as the single character
// segmentBase + (N - 1)
buf.append((char) (parser.data.segmentBase + r - 1));
}
continue;
}
switch (c) {
case END_OF_RULE:
--pos; // Backup to point to END_OF_RULE
break main;
case VARIABLE_REF_OPEN:
{
int j = rule.indexOf(VARIABLE_REF_CLOSE, pos);
if (pos == j || j < 0) { // empty or unterminated
syntaxError("Malformed variable reference", rule, start);
}
String name = rule.substring(pos, j);
pos = j+1;
// Substitute whatever the parser has on record for this name
buf.append(parser.getVariableDef(name));
}
break;
case CONTEXT_OPEN:
if (post >= 0) {
syntaxError("Multiple post contexts", rule, start);
}
// Ignore CONTEXT_OPEN if buffer length is zero -- that means
// this is the optional opening delimiter for the ante context.
if (buf.length() > 0) {
post = buf.length();
}
break;
case CONTEXT_CLOSE:
if (postClose >= 0) {
syntaxError("Unexpected " + c, rule, start);
}
if (post >= 0) {
// This is probably the optional closing delimiter
// for the post context; save the pos and check later.
postClose = buf.length();
} else if (ante >= 0) {
syntaxError("Multiple ante contexts", rule, start);
} else {
ante = buf.length();
}
break;
case SET_OPEN:
// Let UnicodeSet parse the set pattern in place; pp reports how
// far it read. The value returned by registerSet() is appended
// to buf -- presumably a stand-in character for the set; verify
// against registerSet().
ParsePosition pp = new ParsePosition(pos-1); // Backup to opening '['
buf.append(parser.registerSet(new UnicodeSet(rule, pp, parser.parseData)));
pos = pp.getIndex();
break;
case VARIABLE_REF_CLOSE:
case SET_CLOSE:
// No break is needed here: syntaxError() always throws, so
// control never falls through into the CURSOR_POS case.
syntaxError("Unquoted " + c, rule, start);
case CURSOR_POS:
if (cursor >= 0) {
syntaxError("Multiple cursors", rule, start);
}
cursor = buf.length();
break;
default:
buf.append(c);
break;
}
}
// Check context close parameters
// A closing delimiter for the post context is only legal as the very
// last thing in this half, i.e., nothing was appended after it.
if (postClose >= 0 && postClose != buf.length()) {
syntaxError("Extra text after ]", rule, start);
}
text = buf.toString();
return pos;
}
/**
* Remove context. The text is trimmed down to the part between the
* ante context marker and the post context marker (a missing marker
* means "start of text" or "end of text" respectively), and both
* markers are then reset to -1.
*
* NOTE(review): cursor and the offsets in segments are not re-based
* after the ante context is trimmed off. parseRule() discards the
* segments of the half it calls this on but still uses its cursor --
* confirm that this is intended.
*/
void removeContext() {
text = text.substring(ante < 0 ? 0 : ante,
post < 0 ? text.length() : post);
ante = post = -1;
}
/**
* Create and return an int[] array of segments.
* @return a new array holding the recorded segment offsets in the
* order they were seen (start, limit, start, limit, ...), or null if
* no segment delimiter was seen
*/
int[] getSegments() {
if (segments == null) {
return null;
}
int[] result = new int[segments.size()];
for (int i=0; i<segments.size(); ++i) {
result[i] = ((Number)segments.elementAt(i)).intValue();
}
return result;
}
}
/** /**
* MAIN PARSER. Parse the next rule in the given rule string, starting * MAIN PARSER. Parse the next rule in the given rule string, starting
* at pos. Return the index after the last character parsed. Do not * at pos. Return the index after the last character parsed. Do not
@ -644,221 +879,110 @@ public class RuleBasedTransliterator extends Transliterator {
* parses the end-of-rule character. It recognizes context and cursor * parses the end-of-rule character. It recognizes context and cursor
* indicators. Once it does a lexical breakdown of the rule at pos, it * indicators. Once it does a lexical breakdown of the rule at pos, it
* creates a rule object and adds it to our rule list. * creates a rule object and adds it to our rule list.
*
* This method is tightly coupled to the inner class RuleHalf.
*/ */
private int parseRule(String rule, int pos, int limit) { private int parseRule(String rule, int pos, int limit) {
// Locate the left side, operator, and right side // Locate the left side, operator, and right side
int start = pos; int start = pos;
char operator = 0; char operator = 0;
StringBuffer buf = new StringBuffer(); RuleHalf left = new RuleHalf();
int cursor = -1; // position of cursor in buf RuleHalf right = new RuleHalf();
int ante = -1; // position of ante context marker ')' in buf
int post = -1; // position of post context marker '(' in buf
int postClose = -1; // position of post context close ')' in buf
// Assigned to buf and its adjuncts after the LHS has been pos = left.parse(rule, pos, limit, this);
// parsed. Thereafter, buf etc. refer to the RHS.
String left = null;
int leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1;
main: if (pos == limit ||
while (pos < limit) { OPERATORS.indexOf(operator = rule.charAt(pos++)) < 0) {
char c = rule.charAt(pos++);
if (Character.isWhitespace(c)) {
// Ignore whitespace. Note that this is not Unicode
// spaces, but Java spaces -- a subset, representing
// whitespace likely to be seen in code.
continue;
}
// Handle escapes
if (c == ESCAPE) {
if (pos == limit) {
syntaxError("Trailing backslash", rule, start);
}
buf.append(rule.charAt(pos++));
continue;
}
// Handle quoted matter
if (c == QUOTE) {
int iq = rule.indexOf(QUOTE, pos);
if (iq == pos) {
buf.append(c); // Parse [''] outside quotes as [']
++pos;
} else {
/* This loop picks up a segment of quoted text of the
* form 'aaaa' each time through. If this segment
* hasn't really ended ('aaaa''bbbb') then it keeps
* looping, each time adding on a new segment. When it
* reaches the final quote it breaks.
*/
for (;;) {
if (iq < 0) {
syntaxError("Unterminated quote", rule, start);
}
buf.append(rule.substring(pos, iq));
pos = iq+1;
if (pos < limit && rule.charAt(pos) == QUOTE) {
// Parse [''] inside quotes as [']
iq = rule.indexOf(QUOTE, pos+1);
// Continue looping
} else {
break;
}
}
}
continue;
}
if (OPERATORS.indexOf(c) >= 0) {
if (operator != 0) {
syntaxError("Unquoted " + c, rule, start);
}
// Found an operator char. Check for forward-reverse operator.
if (c == REVERSE_RULE_OP &&
(pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
++pos;
operator = FWDREV_RULE_OP;
} else {
operator = c;
}
left = buf.toString(); // lhs
leftCursor = cursor;
leftAnte = ante;
leftPost = post;
leftPostClose = postClose;
buf.setLength(0);
cursor = ante = post = postClose = -1;
continue;
}
switch (c) {
case END_OF_RULE:
break main;
case VARIABLE_REF_OPEN:
{
int j = rule.indexOf(VARIABLE_REF_CLOSE, pos);
if (pos == j || j < 0) { // empty or unterminated
syntaxError("Malformed variable reference", rule, start);
}
String name = rule.substring(pos, j);
pos = j+1;
buf.append(getVariableDef(name));
}
break;
case CONTEXT_OPEN:
if (post >= 0) {
syntaxError("Multiple post contexts", rule, start);
}
// Ignore CONTEXT_OPEN if buffer length is zero -- that means
// this is the optional opening delimiter for the ante context.
if (buf.length() > 0) {
post = buf.length();
}
break;
case CONTEXT_CLOSE:
if (postClose >= 0) {
syntaxError("Unexpected " + c, rule, start);
}
if (post >= 0) {
// This is probably the optional closing delimiter
// for the post context; save the pos and check later.
postClose = buf.length();
} else if (ante >= 0) {
syntaxError("Multiple ante contexts", rule, start);
} else {
ante = buf.length();
}
break;
case SET_OPEN:
ParsePosition pp = new ParsePosition(pos-1); // Backup to opening '['
buf.append(registerSet(new UnicodeSet(rule, pp, parseData)));
pos = pp.getIndex();
break;
case VARIABLE_REF_CLOSE:
case SET_CLOSE:
syntaxError("Unquoted " + c, rule, start);
case CURSOR_POS:
if (cursor >= 0) {
syntaxError("Multiple cursors", rule, start);
}
cursor = buf.length();
break;
default:
buf.append(c);
break;
}
}
if (operator == 0) {
syntaxError("No operator", rule, start); syntaxError("No operator", rule, start);
} }
// Check context close parameters // Found an operator char. Check for forward-reverse operator.
if ((leftPostClose >= 0 && leftPostClose != left.length()) || if (operator == REVERSE_RULE_OP &&
(postClose >= 0 && postClose != buf.length())) { (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
syntaxError("Extra text after ]", rule, start); ++pos;
operator = FWDREV_RULE_OP;
} }
// Context is only allowed on the input side; that is, the left side pos = right.parse(rule, pos, limit, this);
// for forward rules. Cursors are only allowed on the output side;
// that is, the right side for forward rules. Bidirectional rules
// ignore elements that do not apply.
switch (operator) { if (pos < limit) {
case VARIABLE_DEF_OP: if (rule.charAt(pos) == END_OF_RULE) {
++pos;
} else {
// RuleHalf parser must have terminated at an operator
syntaxError("Unquoted operator", rule, start);
}
}
if (operator == VARIABLE_DEF_OP) {
// LHS is the name. RHS is a single character, either a literal // LHS is the name. RHS is a single character, either a literal
// or a set (already parsed). If RHS is longer than one // or a set (already parsed). If RHS is longer than one
// character, it is either a multi-character string, or multiple // character, it is either a multi-character string, or multiple
// sets, or a mixture of chars and sets -- syntax error. // sets, or a mixture of chars and sets -- syntax error.
if (buf.length() != 1) { if (right.text.length() != 1) {
syntaxError("Malformed RHS", rule, start); syntaxError("Malformed RHS", rule, start);
} }
if (data.variableNames.get(left) != null) { if (data.variableNames.get(left.text) != null) {
syntaxError("Duplicate definition of {" + syntaxError("Duplicate definition of {" +
left + "}", rule, start); left.text + "}", rule, start);
} }
data.variableNames.put(left, new Character(buf.charAt(0))); data.variableNames.put(left.text, new Character(right.text.charAt(0)));
break; return pos;
case FORWARD_RULE_OP:
if (direction == FORWARD) {
if (ante >= 0 || post >= 0 || leftCursor >= 0) {
syntaxError("Malformed rule", rule, start);
}
data.ruleSet.addRule(new TransliterationRule(
left, leftAnte, leftPost,
buf.toString(), cursor));
} // otherwise ignore the rule; it's not the direction we want
break;
case REVERSE_RULE_OP:
if (direction == REVERSE) {
if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
syntaxError("Malformed rule", rule, start);
}
data.ruleSet.addRule(new TransliterationRule(
buf.toString(), ante, post,
left, leftCursor));
} // otherwise ignore the rule; it's not the direction we want
break;
case FWDREV_RULE_OP:
if (direction == FORWARD) {
// The output side is the right; trim off any context
String output = buf.toString().substring(ante < 0 ? 0 : ante,
post < 0 ? buf.length() : post);
data.ruleSet.addRule(new TransliterationRule(
left, leftAnte, leftPost,
output, cursor));
} else {
// The output side is the left; trim off any context
String output = left.substring(leftAnte < 0 ? 0 : leftAnte,
leftPost < 0 ? left.length() : leftPost);
data.ruleSet.addRule(new TransliterationRule(
buf.toString(), ante, post,
output, leftCursor));
}
break;
} }
// If the direction we want doesn't match the rule
// direction, do nothing.
if (operator != FWDREV_RULE_OP &&
((direction == FORWARD) != (operator == FORWARD_RULE_OP))) {
return pos;
}
// Transform the rule into a forward rule by swapping the
// sides if necessary.
if (direction == REVERSE) {
RuleHalf temp = left;
left = right;
right = temp;
}
// Remove non-applicable elements in forward-reverse
// rules. Bidirectional rules ignore elements that do not
// apply.
if (operator == FWDREV_RULE_OP) {
right.removeContext();
right.segments = null;
left.cursor = left.maxRef = -1;
}
// Context is only allowed on the input side. Cursors are only
// allowed on the output side. Segment delimiters can only appear
// on the left, and references on the right.
if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
right.segments != null || left.maxRef >= 0) {
syntaxError("Malformed rule", rule, start);
}
// Check integrity of segments and segment references. Each
// segment's start must have a corresponding limit, and the
// references must not refer to segments that do not exist.
int[] segmentsArray = null;
if (left.segments != null) {
int n = left.segments.size();
if (n % 2 != 0) {
syntaxError("Odd length segments", rule, start);
}
n /= 2;
if (right.maxRef > n) {
syntaxError("Undefined segment reference " + right.maxRef, rule, start);
}
}
data.ruleSet.addRule(new TransliterationRule(
left.text, left.ante, left.post,
right.text, right.cursor,
left.getSegments(), data));
return pos; return pos;
} }
@ -871,13 +995,13 @@ public class RuleBasedTransliterator extends Transliterator {
* @param rule pattern string * @param rule pattern string
* @param start position of first character of current rule * @param start position of first character of current rule
*/ */
private static final void syntaxError(String msg, String rule, int start) { static final void syntaxError(String msg, String rule, int start) {
int end = quotedIndexOf(rule, start, rule.length(), ";"); int end = quotedIndexOf(rule, start, rule.length(), ";");
if (end < 0) { if (end < 0) {
end = rule.length(); end = rule.length();
} }
throw new IllegalArgumentException(msg + " in " + throw new IllegalArgumentException(msg + " in \"" +
rule.substring(start, end)); Utility.escape(rule.substring(start, end)) + '"');
} }
/** /**
@ -928,7 +1052,9 @@ public class RuleBasedTransliterator extends Transliterator {
"No private use characters available for variables"); "No private use characters available for variables");
} }
data.setVariablesBase = variableNext = r.start; // Allocate 9 characters for segment references 1 through 9
data.segmentBase = r.start;
data.setVariablesBase = variableNext = (char) (data.segmentBase + 9);
variableLimit = (char) (r.start + r.length); variableLimit = (char) (r.start + r.length);
if (variableNext >= variableLimit) { if (variableNext >= variableLimit) {

View File

@ -5,8 +5,8 @@
******************************************************************************* *******************************************************************************
* *
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $
* $Date: 2000/04/12 20:17:45 $ * $Date: 2000/04/19 16:34:18 $
* $Revision: 1.15 $ * $Revision: 1.16 $
* *
***************************************************************************************** *****************************************************************************************
*/ */
@ -30,12 +30,26 @@ import com.ibm.util.Utility;
* Variables are detected by looking up each character in a supplied * Variables are detected by looking up each character in a supplied
* variable list to see if it has been so defined. * variable list to see if it has been so defined.
* *
* <p>A rule may contain segments in its input string and segment references in
* its output string. A segment is a substring of the input pattern, indicated
* by an offset and limit. The segment may span the preceding or following
* context. A segment reference is a special character in the output string
* that causes a segment of the input string (not the input pattern) to be
* copied to the output string. The range of special characters that represent
* segment references is defined by RuleBasedTransliterator.Data.
*
* <p>Example: The rule "$([a-z]$) . $([0-9]$) > $2 . $1" will change the input
* string "abc.123" to "ab1.c23".
*
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved. * <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
* *
* @author Alan Liu * @author Alan Liu
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.15 $ $Date: 2000/04/12 20:17:45 $ * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.16 $ $Date: 2000/04/19 16:34:18 $
* *
* $Log: TransliterationRule.java,v $ * $Log: TransliterationRule.java,v $
* Revision 1.16 2000/04/19 16:34:18 alan
* Add segment support.
*
* Revision 1.15 2000/04/12 20:17:45 alan * Revision 1.15 2000/04/12 20:17:45 alan
* Delegate replace operation to rule object * Delegate replace operation to rule object
* *
@ -121,6 +135,21 @@ class TransliterationRule {
*/ */
private String output; private String output;
/**
* Array of segments. These are segments of the input string that may be
* referenced and appear in the output string. Each segment is stored as an
* offset, limit pair. Segments are referenced by a 1-based index;
* reference i thus includes characters at offset segments[2*i-2] to
* segments[2*i-1]-1 in the pattern string.
*
* In the output string, a segment reference is indicated by a character in
* a special range, as defined by RuleBasedTransliterator.Data.
*
* Most rules have no segments, in which case segments is null, and the
* output string need not be checked for segment reference characters.
*/
private int[] segments;
/** /**
* The length of the string that must match before the key. If * The length of the string that must match before the key. If
* zero, then there is no matching requirement before the key. * zero, then there is no matching requirement before the key.
@ -160,11 +189,17 @@ class TransliterationRule {
* <code>output</code>; that is, -1 is equivalent to * <code>output</code>; that is, -1 is equivalent to
* <code>output.length()</code>. If greater than * <code>output.length()</code>. If greater than
* <code>output.length()</code> then an exception is thrown. * <code>output.length()</code> then an exception is thrown.
* @param segs array of 2n integers. Each of n pairs consists of offset,
* limit for a segment of the input string. Characters in the output string
* refer to these segments if they are in a special range determined by the
* associated RuleBasedTransliterator.Data object. May be null if there are
* no segments.
*/ */
public TransliterationRule(String input, public TransliterationRule(String input,
int anteContextPos, int postContextPos, int anteContextPos, int postContextPos,
String output, String output,
int cursorPos) { int cursorPos,
int[] segs) {
// Do range checks only when warranted to save time // Do range checks only when warranted to save time
if (anteContextPos < 0) { if (anteContextPos < 0) {
anteContextLength = 0; anteContextLength = 0;
@ -193,6 +228,34 @@ class TransliterationRule {
} }
pattern = input; pattern = input;
this.output = output; this.output = output;
// We don't validate the segments array. The caller must
// guarantee that the segments are well-formed.
this.segments = segs;
}
/**
 * Convenience constructor for a rule that has no segments. It forwards
 * all of its arguments to the constructor that also takes a segments
 * array, passing <code>null</code> for that array.
 * @param input input string, including key and optional ante and
 * post context
 * @param anteContextPos offset into input to end of ante context, or -1
 * if none. Must be <= input.length() if not -1.
 * @param postContextPos offset into input to start of post context, or
 * -1 if none. Must be <= input.length() if not -1, and must be >=
 * anteContextPos.
 * @param output output string
 * @param cursorPos offset into output at which cursor is located, or -1
 * if none. If less than zero, then the cursor is placed after the
 * <code>output</code>; that is, -1 is equivalent to
 * <code>output.length()</code>. If greater than
 * <code>output.length()</code> then an exception is thrown.
 */
public TransliterationRule(String input, int anteContextPos, int postContextPos, String output, int cursorPos) {
    this(input, anteContextPos, postContextPos, output, cursorPos, null);
}
/** /**
@ -238,11 +301,34 @@ class TransliterationRule {
* matches. This is the offset to the point after the ante * matches. This is the offset to the point after the ante
* context, if any, and before the match string and any post * context, if any, and before the match string and any post
* context. * context.
* @param data the RuleBasedTransliterator.Data object specifying
* context for this transliterator.
* @return the change in the length of the text * @return the change in the length of the text
*/ */
int replace(Replaceable text, int offset) { public int replace(Replaceable text, int offset,
text.replace(offset, offset + keyLength, output); RuleBasedTransliterator.Data data) {
return output.length() - keyLength; String out;
if (segments == null) {
out = output;
} else {
int textStart = offset - anteContextLength;
StringBuffer buf = new StringBuffer();
for (int i=0; i<output.length(); ++i) {
char c = output.charAt(i);
int b = data.lookupSegmentReference(c);
if (b < 0) {
buf.append(c);
} else {
for (int j=textStart + segments[2*b];
j<textStart + segments[2*b+1]; ++j) {
buf.append(text.charAt(j));
}
}
}
out = buf.toString();
}
text.replace(offset, offset + keyLength, out);
return out.length() - keyLength;
} }
/** /**

View File

@ -5,8 +5,8 @@
******************************************************************************* *******************************************************************************
* *
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $
* $Date: 2000/03/22 02:00:08 $ * $Date: 2000/04/19 16:37:38 $
* $Revision: 1.14 $ * $Revision: 1.15 $
* *
***************************************************************************************** *****************************************************************************************
*/ */
@ -412,6 +412,25 @@ public class TransliteratorTest extends TestFmwk {
expect(hex3, "012", "&#x30;&#x31;&#x32;"); expect(hex3, "012", "&#x30;&#x31;&#x32;");
} }
/**
 * Exercise segment definitions ("$(" ... "$)") on the input side of a rule
 * together with segment references ("$1".."$9") on the output side.
 */
public void TestSegments() {
    // Flat table of test cases. Every case occupies three consecutive
    // slots: <rules>, <input>, <expected output>.
    String[] cases = {
        "$([a-z]$) . $([0-9]$) > $2-$1",
        "abc.123.xyz.456",
        "ab1-c23.xy4-z56",
    };
    int base = 0;
    while (base < cases.length) {
        String rules = cases[base];
        logln("Pattern: " + Utility.escape(rules));
        Transliterator t = new RuleBasedTransliterator("<ID>", rules);
        String input = cases[base + 1];
        String expected = cases[base + 2];
        expect(t, input, expected);
        base += 3;
    }
}
//====================================================================== //======================================================================
// Support methods // Support methods
//====================================================================== //======================================================================

View File

@ -5,8 +5,8 @@
******************************************************************************* *******************************************************************************
* *
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/RuleBasedTransliterator.java,v $ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/RuleBasedTransliterator.java,v $
* $Date: 2000/04/12 20:17:45 $ * $Date: 2000/04/19 16:34:18 $
* $Revision: 1.18 $ * $Revision: 1.19 $
* *
***************************************************************************************** *****************************************************************************************
*/ */
@ -209,9 +209,12 @@ import com.ibm.util.Utility;
* <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p> * <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
* *
* @author Alan Liu * @author Alan Liu
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.18 $ $Date: 2000/04/12 20:17:45 $ * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.19 $ $Date: 2000/04/19 16:34:18 $
* *
* $Log: RuleBasedTransliterator.java,v $ * $Log: RuleBasedTransliterator.java,v $
* Revision 1.19 2000/04/19 16:34:18 alan
* Add segment support.
*
* Revision 1.18 2000/04/12 20:17:45 alan * Revision 1.18 2000/04/12 20:17:45 alan
* Delegate replace operation to rule object * Delegate replace operation to rule object
* *
@ -379,7 +382,7 @@ public class RuleBasedTransliterator extends Transliterator {
} }
} else { } else {
// Delegate replacement to TransliterationRule object // Delegate replacement to TransliterationRule object
limit += r.replace(text, cursor); limit += r.replace(text, cursor, data);
// text.replace(cursor, cursor + r.getKeyLength(), r.getOutput()); // text.replace(cursor, cursor + r.getKeyLength(), r.getOutput());
// limit += r.getOutput().length() - r.getKeyLength(); // limit += r.getOutput().length() - r.getKeyLength();
cursor += r.getCursorPos(); cursor += r.getCursorPos();
@ -448,12 +451,14 @@ public class RuleBasedTransliterator extends Transliterator {
public UnicodeSet[] setVariables; public UnicodeSet[] setVariables;
/** /**
* The character represented by setVariables[0]. * The character that represents setVariables[0]. Characters
* setVariablesBase through setVariablesBase +
* setVariables.length - 1 represent UnicodeSet objects.
*/ */
public char setVariablesBase; public char setVariablesBase;
/** /**
* Return the UnicodeSet associated with the given character, or * Return the UnicodeSet represented by the given character, or
* null if none. * null if none.
*/ */
public UnicodeSet lookup(char c) { public UnicodeSet lookup(char c) {
@ -461,6 +466,22 @@ public class RuleBasedTransliterator extends Transliterator {
return (i >= 0 && i < setVariables.length) return (i >= 0 && i < setVariables.length)
? setVariables[i] : null; ? setVariables[i] : null;
} }
/**
* The character that represents segment 1. Characters segmentBase
* through segmentBase + 8 represent segments 1 through 9.
*/
public char segmentBase;
/**
 * Translate a character into the index of the segment it refers to.
 * The characters segmentBase through segmentBase + 8 stand for the
 * references notated "$1" through "$9".
 *
 * @param c the character to test
 * @return the ZERO-based segment index, 0..8 (so "$1" yields 0 and
 * "$9" yields 8), or -1 if c is not a segment reference character
 */
public int lookupSegmentReference(char c) {
    int index = c - segmentBase;
    if (index < 0 || index >= 9) {
        // Outside the nine characters reserved for segment references
        return -1;
    }
    return index;
}
} }
@ -548,6 +569,12 @@ public class RuleBasedTransliterator extends Transliterator {
private static final char SET_CLOSE = ']'; private static final char SET_CLOSE = ']';
private static final char CURSOR_POS = '|'; private static final char CURSOR_POS = '|';
// Segments of the input string are delimited by "$(" and "$)". In the
// output string these segments are referenced as "$1" through "$9".
private static final char SEGMENT_REF = '$';
private static final char SEGMENT_OPEN = '(';
private static final char SEGMENT_CLOSE = ')';
/** /**
* @param rules list of rules, separated by semicolon characters * @param rules list of rules, separated by semicolon characters
* @exception IllegalArgumentException if there is a syntax error in the * @exception IllegalArgumentException if there is a syntax error in the
@ -632,6 +659,214 @@ public class RuleBasedTransliterator extends Transliterator {
} }
} }
/**
 * A class representing one side of a rule.  This class knows how to
 * parse half of a rule.  It is tightly coupled to the method
 * RuleBasedTransliterator.Parser.parseRule(), which creates one
 * RuleHalf for each side of the operator and then inspects the public
 * fields filled in by parse().
 */
static class RuleHalf {

    /**
     * The parsed text of this half: the rule text with whitespace
     * dropped, escapes and quotes resolved, and variable references,
     * sets and segment references replaced by the characters supplied
     * by the parser.  Assigned by parse().
     */
    public String text;

    /** Offset of the cursor (CURSOR_POS) in text, or -1 if none. */
    public int cursor = -1;

    /**
     * Offset in text at which the ante context ends (where the
     * CONTEXT_CLOSE marker was seen), or -1 if none.
     */
    public int ante = -1;

    /**
     * Offset in text at which the post context starts (where the
     * CONTEXT_OPEN marker was seen), or -1 if none.
     */
    public int post = -1;

    // Record the position of the segment substrings and references.  A
    // given side should have segments or segment references, but not
    // both.

    /**
     * Offsets into text of the segment delimiters, stored as Integer
     * objects in the order start, limit, start, limit, ...  Null if no
     * segment delimiter was seen.
     */
    public Vector segments = null;

    /** Largest segment reference seen (1..9), or -1 if none. */
    public int maxRef = -1;

    /**
     * Parse one side of a rule, stopping at either the limit,
     * the END_OF_RULE character, or an operator.  Return
     * the pos of the terminating character (or limit).
     *
     * @param rule the rule string being parsed
     * @param pos offset in rule at which to start parsing
     * @param limit offset in rule at which to stop parsing
     * @param parser the parser that owns this half; used to resolve
     * variable references, to register sets, and to obtain the base
     * character for segment references
     * @return the offset of the terminating operator or END_OF_RULE
     * character, or the offset at which parsing stopped
     * @exception IllegalArgumentException (via syntaxError) if the
     * text is malformed
     */
    public int parse(String rule, int pos, int limit,
                     RuleBasedTransliterator.Parser parser) {
        int start = pos;
        StringBuffer buf = new StringBuffer();
        int postClose = -1; // offset in buf of the post context close marker

    main:
        while (pos < limit) {
            char c = rule.charAt(pos++);
            if (Character.isWhitespace(c)) {
                // Ignore whitespace.  Note that this is not Unicode
                // spaces, but Java spaces -- a subset, representing
                // whitespace likely to be seen in code.
                continue;
            }
            // Handle escapes
            if (c == ESCAPE) {
                if (pos == limit) {
                    syntaxError("Trailing backslash", rule, start);
                }
                buf.append(rule.charAt(pos++));
                continue;
            }
            // Handle quoted matter
            if (c == QUOTE) {
                int iq = rule.indexOf(QUOTE, pos);
                if (iq == pos) {
                    buf.append(c); // Parse [''] outside quotes as [']
                    ++pos;
                } else {
                    /* This loop picks up a segment of quoted text of the
                     * form 'aaaa' each time through.  If this segment
                     * hasn't really ended ('aaaa''bbbb') then it keeps
                     * looping, each time adding on a new segment.  When it
                     * reaches the final quote it breaks.
                     */
                    for (;;) {
                        if (iq < 0) {
                            syntaxError("Unterminated quote", rule, start);
                        }
                        buf.append(rule.substring(pos, iq));
                        pos = iq+1;
                        if (pos < limit && rule.charAt(pos) == QUOTE) {
                            // Parse [''] inside quotes as [']
                            iq = rule.indexOf(QUOTE, pos+1);
                            // Continue looping
                        } else {
                            break;
                        }
                    }
                }
                continue;
            }
            if (OPERATORS.indexOf(c) >= 0) {
                --pos; // Backup to point to operator
                break main;
            }
            // Handle segment definitions "$(" "$)" and references "$1"
            // .. "$9".
            if (c == SEGMENT_REF) {
                // After a SEGMENT_REF, must see SEGMENT_OPEN,
                // SEGMENT_CLOSE, or a digit 1 to 9, with no intervening
                // whitespace
                if (pos == limit) {
                    syntaxError("Trailing " + c, rule, start);
                }
                c = rule.charAt(pos++);
                if (c == SEGMENT_OPEN || c == SEGMENT_CLOSE) {
                    // Parse "$(", "$)"
                    if (segments == null) {
                        segments = new Vector();
                    }
                    // An even number of recorded offsets means every
                    // segment so far is closed, so only an open
                    // delimiter is legal; an odd number means one is
                    // pending, so only a close delimiter is legal.
                    if ((c == SEGMENT_OPEN) !=
                        (segments.size() % 2 == 0)) {
                        syntaxError("Mismatched segment delimiters",
                                    rule, start);
                    }
                    segments.addElement(new Integer(buf.length()));
                } else {
                    // Parse "$1" "$2" .. "$9".  Only the ASCII digits
                    // are accepted; Character.digit() would also accept
                    // decimal digits of other scripts, which are not
                    // part of the reference syntax.
                    int r = (c >= '1' && c <= '9') ? (c - '0') : -1;
                    if (r < 1) {
                        syntaxError("Illegal char after " + SEGMENT_REF,
                                    rule, start);
                    }
                    if (r > maxRef) {
                        maxRef = r;
                    }
                    // Reference n is encoded as the character
                    // segmentBase + (n - 1)
                    buf.append((char) (parser.data.segmentBase + r - 1));
                }
                continue;
            }
            switch (c) {
            case END_OF_RULE:
                --pos; // Backup to point to END_OF_RULE
                break main;
            case VARIABLE_REF_OPEN:
                {
                    int j = rule.indexOf(VARIABLE_REF_CLOSE, pos);
                    if (pos == j || j < 0) { // empty or unterminated
                        syntaxError("Malformed variable reference", rule, start);
                    }
                    String name = rule.substring(pos, j);
                    pos = j+1;
                    buf.append(parser.getVariableDef(name));
                }
                break;
            case CONTEXT_OPEN:
                if (post >= 0) {
                    syntaxError("Multiple post contexts", rule, start);
                }
                // Ignore CONTEXT_OPEN if buffer length is zero -- that means
                // this is the optional opening delimiter for the ante context.
                if (buf.length() > 0) {
                    post = buf.length();
                }
                break;
            case CONTEXT_CLOSE:
                if (postClose >= 0) {
                    syntaxError("Unexpected " + c, rule, start);
                }
                if (post >= 0) {
                    // This is probably the optional closing delimiter
                    // for the post context; save the pos and check later.
                    postClose = buf.length();
                } else if (ante >= 0) {
                    syntaxError("Multiple ante contexts", rule, start);
                } else {
                    ante = buf.length();
                }
                break;
            case SET_OPEN:
                {
                    ParsePosition pp = new ParsePosition(pos-1); // Backup to opening '['
                    buf.append(parser.registerSet(new UnicodeSet(rule, pp, parser.parseData)));
                    pos = pp.getIndex();
                }
                break;
            case VARIABLE_REF_CLOSE:
            case SET_CLOSE:
                syntaxError("Unquoted " + c, rule, start);
                // syntaxError() always throws; the break only ensures
                // this case can never fall through into CURSOR_POS.
                break;
            case CURSOR_POS:
                if (cursor >= 0) {
                    syntaxError("Multiple cursors", rule, start);
                }
                cursor = buf.length();
                break;
            default:
                buf.append(c);
                break;
            }
        }

        // Check context close parameters: nothing may follow the
        // closing delimiter of the post context.
        if (postClose >= 0 && postClose != buf.length()) {
            syntaxError("Extra text after ]", rule, start);
        }

        text = buf.toString();
        return pos;
    }

    /**
     * Remove context.  The text is reduced to the part between the
     * ante and post context markers, and both markers are cleared.
     * The cursor, if any, is rebased so that it remains an offset into
     * the trimmed text; a cursor that was located inside removed
     * context is moved to the nearest end of the remaining text.
     *
     * Segment offsets are not rebased here; parseRule() clears
     * segments immediately after calling this method.
     */
    void removeContext() {
        int keyStart = ante < 0 ? 0 : ante;
        int keyLimit = post < 0 ? text.length() : post;
        text = text.substring(keyStart, keyLimit);
        if (cursor >= 0) {
            // cursor was recorded relative to the untrimmed text
            cursor = Math.max(0, Math.min(cursor, keyLimit) - keyStart);
        }
        ante = post = -1;
    }

    /**
     * Create and return an int[] array of segments.
     *
     * @return a new array holding the recorded segment offsets in
     * order (start, limit, start, limit, ...), or null if this half
     * has no segments
     */
    int[] getSegments() {
        if (segments == null) {
            return null;
        }
        int count = segments.size();
        int[] result = new int[count];
        for (int i=0; i<count; ++i) {
            result[i] = ((Number)segments.elementAt(i)).intValue();
        }
        return result;
    }
}
/** /**
* MAIN PARSER. Parse the next rule in the given rule string, starting * MAIN PARSER. Parse the next rule in the given rule string, starting
* at pos. Return the index after the last character parsed. Do not * at pos. Return the index after the last character parsed. Do not
@ -644,221 +879,110 @@ public class RuleBasedTransliterator extends Transliterator {
* parses the end-of-rule character. It recognizes context and cursor * parses the end-of-rule character. It recognizes context and cursor
* indicators. Once it does a lexical breakdown of the rule at pos, it * indicators. Once it does a lexical breakdown of the rule at pos, it
* creates a rule object and adds it to our rule list. * creates a rule object and adds it to our rule list.
*
* This method is tightly coupled to the inner class RuleHalf.
*/ */
private int parseRule(String rule, int pos, int limit) { private int parseRule(String rule, int pos, int limit) {
// Locate the left side, operator, and right side // Locate the left side, operator, and right side
int start = pos; int start = pos;
char operator = 0; char operator = 0;
StringBuffer buf = new StringBuffer(); RuleHalf left = new RuleHalf();
int cursor = -1; // position of cursor in buf RuleHalf right = new RuleHalf();
int ante = -1; // position of ante context marker ')' in buf
int post = -1; // position of post context marker '(' in buf
int postClose = -1; // position of post context close ')' in buf
// Assigned to buf and its adjuncts after the LHS has been pos = left.parse(rule, pos, limit, this);
// parsed. Thereafter, buf etc. refer to the RHS.
String left = null;
int leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1;
main: if (pos == limit ||
while (pos < limit) { OPERATORS.indexOf(operator = rule.charAt(pos++)) < 0) {
char c = rule.charAt(pos++);
if (Character.isWhitespace(c)) {
// Ignore whitespace. Note that this is not Unicode
// spaces, but Java spaces -- a subset, representing
// whitespace likely to be seen in code.
continue;
}
// Handle escapes
if (c == ESCAPE) {
if (pos == limit) {
syntaxError("Trailing backslash", rule, start);
}
buf.append(rule.charAt(pos++));
continue;
}
// Handle quoted matter
if (c == QUOTE) {
int iq = rule.indexOf(QUOTE, pos);
if (iq == pos) {
buf.append(c); // Parse [''] outside quotes as [']
++pos;
} else {
/* This loop picks up a segment of quoted text of the
* form 'aaaa' each time through. If this segment
* hasn't really ended ('aaaa''bbbb') then it keeps
* looping, each time adding on a new segment. When it
* reaches the final quote it breaks.
*/
for (;;) {
if (iq < 0) {
syntaxError("Unterminated quote", rule, start);
}
buf.append(rule.substring(pos, iq));
pos = iq+1;
if (pos < limit && rule.charAt(pos) == QUOTE) {
// Parse [''] inside quotes as [']
iq = rule.indexOf(QUOTE, pos+1);
// Continue looping
} else {
break;
}
}
}
continue;
}
if (OPERATORS.indexOf(c) >= 0) {
if (operator != 0) {
syntaxError("Unquoted " + c, rule, start);
}
// Found an operator char. Check for forward-reverse operator.
if (c == REVERSE_RULE_OP &&
(pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
++pos;
operator = FWDREV_RULE_OP;
} else {
operator = c;
}
left = buf.toString(); // lhs
leftCursor = cursor;
leftAnte = ante;
leftPost = post;
leftPostClose = postClose;
buf.setLength(0);
cursor = ante = post = postClose = -1;
continue;
}
switch (c) {
case END_OF_RULE:
break main;
case VARIABLE_REF_OPEN:
{
int j = rule.indexOf(VARIABLE_REF_CLOSE, pos);
if (pos == j || j < 0) { // empty or unterminated
syntaxError("Malformed variable reference", rule, start);
}
String name = rule.substring(pos, j);
pos = j+1;
buf.append(getVariableDef(name));
}
break;
case CONTEXT_OPEN:
if (post >= 0) {
syntaxError("Multiple post contexts", rule, start);
}
// Ignore CONTEXT_OPEN if buffer length is zero -- that means
// this is the optional opening delimiter for the ante context.
if (buf.length() > 0) {
post = buf.length();
}
break;
case CONTEXT_CLOSE:
if (postClose >= 0) {
syntaxError("Unexpected " + c, rule, start);
}
if (post >= 0) {
// This is probably the optional closing delimiter
// for the post context; save the pos and check later.
postClose = buf.length();
} else if (ante >= 0) {
syntaxError("Multiple ante contexts", rule, start);
} else {
ante = buf.length();
}
break;
case SET_OPEN:
ParsePosition pp = new ParsePosition(pos-1); // Backup to opening '['
buf.append(registerSet(new UnicodeSet(rule, pp, parseData)));
pos = pp.getIndex();
break;
case VARIABLE_REF_CLOSE:
case SET_CLOSE:
syntaxError("Unquoted " + c, rule, start);
case CURSOR_POS:
if (cursor >= 0) {
syntaxError("Multiple cursors", rule, start);
}
cursor = buf.length();
break;
default:
buf.append(c);
break;
}
}
if (operator == 0) {
syntaxError("No operator", rule, start); syntaxError("No operator", rule, start);
} }
// Check context close parameters // Found an operator char. Check for forward-reverse operator.
if ((leftPostClose >= 0 && leftPostClose != left.length()) || if (operator == REVERSE_RULE_OP &&
(postClose >= 0 && postClose != buf.length())) { (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
syntaxError("Extra text after ]", rule, start); ++pos;
operator = FWDREV_RULE_OP;
} }
// Context is only allowed on the input side; that is, the left side pos = right.parse(rule, pos, limit, this);
// for forward rules. Cursors are only allowed on the output side;
// that is, the right side for forward rules. Bidirectional rules
// ignore elements that do not apply.
switch (operator) { if (pos < limit) {
case VARIABLE_DEF_OP: if (rule.charAt(pos) == END_OF_RULE) {
++pos;
} else {
// RuleHalf parser must have terminated at an operator
syntaxError("Unquoted operator", rule, start);
}
}
if (operator == VARIABLE_DEF_OP) {
// LHS is the name. RHS is a single character, either a literal // LHS is the name. RHS is a single character, either a literal
// or a set (already parsed). If RHS is longer than one // or a set (already parsed). If RHS is longer than one
// character, it is either a multi-character string, or multiple // character, it is either a multi-character string, or multiple
// sets, or a mixture of chars and sets -- syntax error. // sets, or a mixture of chars and sets -- syntax error.
if (buf.length() != 1) { if (right.text.length() != 1) {
syntaxError("Malformed RHS", rule, start); syntaxError("Malformed RHS", rule, start);
} }
if (data.variableNames.get(left) != null) { if (data.variableNames.get(left.text) != null) {
syntaxError("Duplicate definition of {" + syntaxError("Duplicate definition of {" +
left + "}", rule, start); left.text + "}", rule, start);
} }
data.variableNames.put(left, new Character(buf.charAt(0))); data.variableNames.put(left.text, new Character(right.text.charAt(0)));
break; return pos;
case FORWARD_RULE_OP:
if (direction == FORWARD) {
if (ante >= 0 || post >= 0 || leftCursor >= 0) {
syntaxError("Malformed rule", rule, start);
}
data.ruleSet.addRule(new TransliterationRule(
left, leftAnte, leftPost,
buf.toString(), cursor));
} // otherwise ignore the rule; it's not the direction we want
break;
case REVERSE_RULE_OP:
if (direction == REVERSE) {
if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
syntaxError("Malformed rule", rule, start);
}
data.ruleSet.addRule(new TransliterationRule(
buf.toString(), ante, post,
left, leftCursor));
} // otherwise ignore the rule; it's not the direction we want
break;
case FWDREV_RULE_OP:
if (direction == FORWARD) {
// The output side is the right; trim off any context
String output = buf.toString().substring(ante < 0 ? 0 : ante,
post < 0 ? buf.length() : post);
data.ruleSet.addRule(new TransliterationRule(
left, leftAnte, leftPost,
output, cursor));
} else {
// The output side is the left; trim off any context
String output = left.substring(leftAnte < 0 ? 0 : leftAnte,
leftPost < 0 ? left.length() : leftPost);
data.ruleSet.addRule(new TransliterationRule(
buf.toString(), ante, post,
output, leftCursor));
}
break;
} }
// If the direction we want doesn't match the rule
// direction, do nothing.
if (operator != FWDREV_RULE_OP &&
((direction == FORWARD) != (operator == FORWARD_RULE_OP))) {
return pos;
}
// Transform the rule into a forward rule by swapping the
// sides if necessary.
if (direction == REVERSE) {
RuleHalf temp = left;
left = right;
right = temp;
}
// Remove non-applicable elements in forward-reverse
// rules. Bidirectional rules ignore elements that do not
// apply.
if (operator == FWDREV_RULE_OP) {
right.removeContext();
right.segments = null;
left.cursor = left.maxRef = -1;
}
// Context is only allowed on the input side. Cursors are only
// allowed on the output side. Segment delimiters can only appear
// on the left, and references on the right.
if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
right.segments != null || left.maxRef >= 0) {
syntaxError("Malformed rule", rule, start);
}
// Check integrity of segments and segment references. Each
// segment's start must have a corresponding limit, and the
// references must not refer to segments that do not exist.
int[] segmentsArray = null;
if (left.segments != null) {
int n = left.segments.size();
if (n % 2 != 0) {
syntaxError("Odd length segments", rule, start);
}
n /= 2;
if (right.maxRef > n) {
syntaxError("Undefined segment reference " + right.maxRef, rule, start);
}
}
data.ruleSet.addRule(new TransliterationRule(
left.text, left.ante, left.post,
right.text, right.cursor,
left.getSegments(), data));
return pos; return pos;
} }
@ -871,13 +995,13 @@ public class RuleBasedTransliterator extends Transliterator {
* @param rule pattern string * @param rule pattern string
* @param start position of first character of current rule * @param start position of first character of current rule
*/ */
private static final void syntaxError(String msg, String rule, int start) { static final void syntaxError(String msg, String rule, int start) {
int end = quotedIndexOf(rule, start, rule.length(), ";"); int end = quotedIndexOf(rule, start, rule.length(), ";");
if (end < 0) { if (end < 0) {
end = rule.length(); end = rule.length();
} }
throw new IllegalArgumentException(msg + " in " + throw new IllegalArgumentException(msg + " in \"" +
rule.substring(start, end)); Utility.escape(rule.substring(start, end)) + '"');
} }
/** /**
@ -928,7 +1052,9 @@ public class RuleBasedTransliterator extends Transliterator {
"No private use characters available for variables"); "No private use characters available for variables");
} }
data.setVariablesBase = variableNext = r.start; // Allocate 9 characters for segment references 1 through 9
data.segmentBase = r.start;
data.setVariablesBase = variableNext = (char) (data.segmentBase + 9);
variableLimit = (char) (r.start + r.length); variableLimit = (char) (r.start + r.length);
if (variableNext >= variableLimit) { if (variableNext >= variableLimit) {

View File

@ -5,8 +5,8 @@
******************************************************************************* *******************************************************************************
* *
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $
* $Date: 2000/04/12 20:17:45 $ * $Date: 2000/04/19 16:34:18 $
* $Revision: 1.15 $ * $Revision: 1.16 $
* *
***************************************************************************************** *****************************************************************************************
*/ */
@ -30,12 +30,26 @@ import com.ibm.util.Utility;
* Variables are detected by looking up each character in a supplied * Variables are detected by looking up each character in a supplied
* variable list to see if it has been so defined. * variable list to see if it has been so defined.
* *
* <p>A rule may contain segments in its input string and segment references in
* its output string. A segment is a substring of the input pattern, indicated
* by an offset and limit. The segment may span the preceding or following
* context. A segment reference is a special character in the output string
* that causes a segment of the input string (not the input pattern) to be
* copied to the output string. The range of special characters that represent
* segment references is defined by RuleBasedTransliterator.Data.
*
* <p>Example: The rule "$([a-z]$) . $([0-9]$) > $2 . $1" will change the input
* string "abc.123" to "ab1.c23".
*
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved. * <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
* *
* @author Alan Liu * @author Alan Liu
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.15 $ $Date: 2000/04/12 20:17:45 $ * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.16 $ $Date: 2000/04/19 16:34:18 $
* *
* $Log: TransliterationRule.java,v $ * $Log: TransliterationRule.java,v $
* Revision 1.16 2000/04/19 16:34:18 alan
* Add segment support.
*
* Revision 1.15 2000/04/12 20:17:45 alan * Revision 1.15 2000/04/12 20:17:45 alan
* Delegate replace operation to rule object * Delegate replace operation to rule object
* *
@ -121,6 +135,21 @@ class TransliterationRule {
*/ */
private String output; private String output;
/**
* Array of segments. These are segments of the input string that may be
* referenced and appear in the output string. Each segment is stored as an
* offset, limit pair. Segments are referenced by a 1-based index;
* reference i thus includes characters at offset segments[2*i-2] to
* segments[2*i-1]-1 in the pattern string.
*
* In the output string, a segment reference is indicated by a character in
* a special range, as defined by RuleBasedTransliterator.Data.
*
* Most rules have no segments, in which case segments is null, and the
* output string need not be checked for segment reference characters.
*/
private int[] segments;
/** /**
* The length of the string that must match before the key. If * The length of the string that must match before the key. If
* zero, then there is no matching requirement before the key. * zero, then there is no matching requirement before the key.
@ -160,11 +189,17 @@ class TransliterationRule {
* <code>output</code>; that is, -1 is equivalent to * <code>output</code>; that is, -1 is equivalent to
* <code>output.length()</code>. If greater than * <code>output.length()</code>. If greater than
* <code>output.length()</code> then an exception is thrown. * <code>output.length()</code> then an exception is thrown.
* @param segs array of 2n integers. Each of n pairs consists of offset,
* limit for a segment of the input string. Characters in the output string
* refer to these segments if they are in a special range determined by the
* associated RuleBasedTransliterator.Data object. May be null if there are
* no segments.
*/ */
public TransliterationRule(String input, public TransliterationRule(String input,
int anteContextPos, int postContextPos, int anteContextPos, int postContextPos,
String output, String output,
int cursorPos) { int cursorPos,
int[] segs) {
// Do range checks only when warranted to save time // Do range checks only when warranted to save time
if (anteContextPos < 0) { if (anteContextPos < 0) {
anteContextLength = 0; anteContextLength = 0;
@ -193,6 +228,34 @@ class TransliterationRule {
} }
pattern = input; pattern = input;
this.output = output; this.output = output;
// We don't validate the segments array. The caller must
// guarantee that the segments are well-formed.
this.segments = segs;
}
/**
 * Construct a new rule that has no segments.  This is a convenience
 * form that forwards to the constructor taking a segments array,
 * supplying null for that array.  A cursor position may be specified
 * for the output text.
 * @param input input string, including key and optional ante and
 * post context
 * @param anteContextPos offset into input to end of ante context, or -1 if
 * none.  Must be <= input.length() if not -1.
 * @param postContextPos offset into input to start of post context, or -1
 * if none.  Must be <= input.length() if not -1, and must be >=
 * anteContextPos.
 * @param output output string
 * @param cursorPos offset into output at which cursor is located, or -1 if
 * none.  If less than zero, then the cursor is placed after the
 * <code>output</code>; that is, -1 is equivalent to
 * <code>output.length()</code>.  If greater than
 * <code>output.length()</code> then an exception is thrown.
 */
public TransliterationRule(String input,
                           int anteContextPos, int postContextPos,
                           String output,
                           int cursorPos) {
    // No segments: pass null for the segments array
    this(input, anteContextPos, postContextPos, output, cursorPos, null);
}
/** /**
@ -238,11 +301,34 @@ class TransliterationRule {
* matches. This is the offset to the point after the ante * matches. This is the offset to the point after the ante
* context, if any, and before the match string and any post * context, if any, and before the match string and any post
* context. * context.
* @param data the RuleBasedTransliterator.Data object specifying
* context for this transliterator.
* @return the change in the length of the text * @return the change in the length of the text
*/ */
int replace(Replaceable text, int offset) { public int replace(Replaceable text, int offset,
text.replace(offset, offset + keyLength, output); RuleBasedTransliterator.Data data) {
return output.length() - keyLength; String out;
if (segments == null) {
out = output;
} else {
int textStart = offset - anteContextLength;
StringBuffer buf = new StringBuffer();
for (int i=0; i<output.length(); ++i) {
char c = output.charAt(i);
int b = data.lookupSegmentReference(c);
if (b < 0) {
buf.append(c);
} else {
for (int j=textStart + segments[2*b];
j<textStart + segments[2*b+1]; ++j) {
buf.append(text.charAt(j));
}
}
}
out = buf.toString();
}
text.replace(offset, offset + keyLength, out);
return out.length() - keyLength;
} }
/** /**