ICU-73 finish quantifier and supplemental char support

X-SVN-Rev: 6003
This commit is contained in:
Alan Liu 2001-10-03 00:18:23 +00:00
parent 40694d1edc
commit a56c858f03
12 changed files with 1154 additions and 660 deletions

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
* $Date: 2001/09/28 05:47:30 $
* $Revision: 1.47 $
* $Date: 2001/10/03 00:18:23 $
* $Revision: 1.48 $
*
*****************************************************************************************
*/
@ -77,14 +77,6 @@ public class TransliteratorTest extends TestFmwk {
logln("Elapsed time: " + ms + " ms");
}
public void TestDisplayName() {
String ID;
for (Enumeration e = Transliterator.getAvailableIDs(); e.hasMoreElements(); ) {
ID = (String) e.nextElement();
logln(ID + " -> " + Transliterator.getDisplayName(ID));
}
}
public void TestSimpleRules() {
/* Example: rules 1. ab>x|y
* 2. yc>z
@ -131,29 +123,6 @@ public class TransliteratorTest extends TestFmwk {
expect(t, "abcdefgABCDEFGU", "&bcd&fg!^**!^*&");
}
/**
* Test undefined variable.
*/
public void TestUndefinedVariable() {
String rule = "$initial } a <> \u1161;";
try {
Transliterator t = new RuleBasedTransliterator("<ID>", rule);
t = null;
} catch (IllegalArgumentException e) {
logln("OK: Got exception for " + rule + ", as expected: " +
e.getMessage());
return;
}
errln("Fail: bogus rule " + rule + " compiled without error");
}
/**
* Test empty context.
*/
public void TestEmptyContext() {
expect(" { a } > b;", "xay a ", "xby b ");
}
/**
* Test inline set syntax and set variable syntax.
*/
@ -524,6 +493,9 @@ public class TransliteratorTest extends TestFmwk {
expect(hex3, "012", "&#x30;&#x31;&#x32;");
}
// Placeholder: no test logic yet (the _TODO suffix marks it as pending).
public void TestJ329_TODO() {
}
/**
* Test segments and segment references.
*/
@ -681,6 +653,9 @@ public class TransliteratorTest extends TestFmwk {
}
// Placeholder: no test logic yet (the _TODO suffix marks it as pending).
public void TestCopyJ476_TODO() {
}
/**
* Test inter-Indic transliterators. These are composed.
*/
@ -700,47 +675,6 @@ public class TransliteratorTest extends TestFmwk {
expect(dg, dev, guj);
}
/**
* Test IDs of inverses of compound transliterators. (J20)
*/
public void TestCompoundInverseID() {
String ID = "Latin-Jamo;NFC(NFD)";
Transliterator t = Transliterator.getInstance(ID);
Transliterator u = t.getInverse();
String exp = "NFD(NFC);Jamo-Latin";
String got = u.getID();
if (!got.equals(exp)) {
errln("FAIL: Inverse of " + ID + " is " + got +
", expected " + exp);
}
}
/**
* Inverse of "Null" should be "Null". (J21)
*/
public void TestNullInverse() {
Transliterator t = Transliterator.getInstance("Null");
Transliterator u = t.getInverse();
if (!u.getID().equals("Null")) {
errln("FAIL: Inverse of Null should be Null");
}
}
/**
* Check ID of inverse of alias. (J22)
*/
public void TestAliasInverseID() {
String ID = "Latin-Hangul"; // This should be any alias ID with an inverse
Transliterator t = Transliterator.getInstance(ID);
Transliterator u = t.getInverse();
String exp = "Hangul-Latin";
String got = u.getID();
if (!got.equals(exp)) {
errln("FAIL: Inverse of " + ID + " is " + got +
", expected " + exp);
}
}
/**
* Test filter syntax in IDs. (J23)
*/
@ -785,118 +719,6 @@ public class TransliteratorTest extends TestFmwk {
}
}
/**
* Test the "Remove" transliterator.
*/
public void TestRemove() {
Transliterator t = Transliterator.getInstance("Remove[aeiou]");
expect(t, "The quick brown fox.",
"Th qck brwn fx.");
}
public void TestToRules() {
String RBT = "rbt";
String SET = "set";
String[] DATA = {
RBT,
"$a=\\u4E61; [$a] > A;",
"[\\u4E61] > A;",
RBT,
"$white=[[:Zs:][:Zl:]]; $white{a} > A;",
"[[:Zs:][:Zl:]]{a} > A;",
SET,
"[[:Zs:][:Zl:]]",
"[[:Zs:][:Zl:]]",
SET,
"[:Ps:]",
"[:Ps:]",
SET,
"[:L:]",
"[:L:]",
SET,
"[[:L:]-[A]]",
"[[:L:]-[A]]",
SET,
"[~[:Lu:][:Ll:]]",
"[~[:Lu:][:Ll:]]",
SET,
"[~[a-z]]",
"[~[a-z]]",
RBT,
"$white=[:Zs:]; $black=[^$white]; $black{a} > A;",
"[^[:Zs:]]{a} > A;",
RBT,
"$a=[:Zs:]; $b=[[a-z]-$a]; $b{a} > A;",
"[[a-z]-[:Zs:]]{a} > A;",
RBT,
"$a=[:Zs:]; $b=[$a&[a-z]]; $b{a} > A;",
"[[:Zs:]&[a-z]]{a} > A;",
RBT,
"$a=[:Zs:]; $b=[x$a]; $b{a} > A;",
"[x[:Zs:]]{a} > A;",
};
for (int d=0; d < DATA.length; d+=3) {
if (DATA[d] == RBT) {
// Transliterator test
Transliterator t = Transliterator.createFromRules("ID",
DATA[d+1], Transliterator.FORWARD);
if (t == null) {
errln("FAIL: createFromRules failed");
return;
}
String rules, escapedRules;
rules = t.toRules(false);
escapedRules = t.toRules(true);
String expRules = Utility.unescape(DATA[d+2]);
String expEscapedRules = DATA[d+2];
if (rules.equals(expRules)) {
logln("Ok: " + DATA[d+1] +
" => " + Utility.escape(rules));
} else {
errln("FAIL: " + DATA[d+1] +
" => " + Utility.escape(rules + ", exp " + expRules));
}
if (escapedRules.equals(expEscapedRules)) {
logln("Ok: " + DATA[d+1] +
" => " + escapedRules);
} else {
errln("FAIL: " + DATA[d+1] +
" => " + escapedRules + ", exp " + expEscapedRules);
}
} else {
// UnicodeSet test
String pat = DATA[d+1];
String expToPat = DATA[d+2];
UnicodeSet set = new UnicodeSet(pat);
// Adjust spacing etc. as necessary.
String toPat;
toPat = set.toPattern(true);
if (expToPat.equals(toPat)) {
logln("Ok: " + pat +
" => " + toPat);
} else {
errln("FAIL: " + pat +
" => " + Utility.escape(toPat) +
", exp " + Utility.escape(pat));
}
}
}
}
/**
* Test the case mapping transliterators.
*/
@ -966,6 +788,9 @@ public class TransliteratorTest extends TestFmwk {
}
}
// Placeholder: no test logic yet (the _TODO suffix marks it as pending).
public void TestCreateInstance_TODO() {
}
/**
* Test the normalization transliterator.
*/
@ -1139,6 +964,249 @@ public class TransliteratorTest extends TestFmwk {
}
}
// Placeholder: no test logic yet (the _TODO suffix marks it as pending).
public void TestCompoundFilter_TODO() {
}
/**
 * Exercise the "Remove" transliterator with the filter [aeiou]: every
 * lowercase a, e, i, o, u in the sample sentence must be deleted and
 * all other characters left alone.
 */
public void TestRemove() {
    final String source = "The quick brown fox.";
    final String expected = "Th qck brwn fx.";
    expect(Transliterator.getInstance("Remove[aeiou]"), source, expected);
}
/**
 * Test Transliterator.toRules() and UnicodeSet.toPattern().
 *
 * DATA is a flat list of triples: a kind tag (RBT or SET), the input
 * rule string or set pattern, and the expected escaped output. For RBT
 * entries both the escaped and the unescaped forms of toRules() are
 * checked; for SET entries only toPattern(true) is checked.
 */
public void TestToRules() {
    final String RBT = "rbt";
    final String SET = "set";
    String[] DATA = {
        RBT,
        "$a=\\u4E61; [$a] > A;",
        "[\\u4E61] > A;",

        RBT,
        "$white=[[:Zs:][:Zl:]]; $white{a} > A;",
        "[[:Zs:][:Zl:]]{a} > A;",

        SET,
        "[[:Zs:][:Zl:]]",
        "[[:Zs:][:Zl:]]",

        SET,
        "[:Ps:]",
        "[:Ps:]",

        SET,
        "[:L:]",
        "[:L:]",

        SET,
        "[[:L:]-[A]]",
        "[[:L:]-[A]]",

        SET,
        "[~[:Lu:][:Ll:]]",
        "[~[:Lu:][:Ll:]]",

        SET,
        "[~[a-z]]",
        "[~[a-z]]",

        RBT,
        "$white=[:Zs:]; $black=[^$white]; $black{a} > A;",
        "[^[:Zs:]]{a} > A;",

        RBT,
        "$a=[:Zs:]; $b=[[a-z]-$a]; $b{a} > A;",
        "[[a-z]-[:Zs:]]{a} > A;",

        RBT,
        "$a=[:Zs:]; $b=[$a&[a-z]]; $b{a} > A;",
        "[[:Zs:]&[a-z]]{a} > A;",

        RBT,
        "$a=[:Zs:]; $b=[x$a]; $b{a} > A;",
        "[x[:Zs:]]{a} > A;",
    };

    for (int d=0; d < DATA.length; d+=3) {
        // Compare tags by value; reference equality only works by
        // accident of the literals being shared.
        if (RBT.equals(DATA[d])) {
            // Transliterator test
            Transliterator t = Transliterator.createFromRules("ID",
                                   DATA[d+1], Transliterator.FORWARD);
            if (t == null) {
                errln("FAIL: createFromRules failed");
                return;
            }
            String rules, escapedRules;
            rules = t.toRules(false);
            escapedRules = t.toRules(true);
            // The table stores the escaped form; unescape it to get the
            // expectation for the non-escaping toRules(false) call.
            String expRules = Utility.unescape(DATA[d+2]);
            String expEscapedRules = DATA[d+2];
            if (rules.equals(expRules)) {
                logln("Ok: " + DATA[d+1] +
                      " => " + Utility.escape(rules));
            } else {
                errln("FAIL: " + DATA[d+1] +
                      " => " + Utility.escape(rules + ", exp " + expRules));
            }
            if (escapedRules.equals(expEscapedRules)) {
                logln("Ok: " + DATA[d+1] +
                      " => " + escapedRules);
            } else {
                errln("FAIL: " + DATA[d+1] +
                      " => " + escapedRules + ", exp " + expEscapedRules);
            }

        } else {
            // UnicodeSet test
            String pat = DATA[d+1];
            String expToPat = DATA[d+2];
            UnicodeSet set = new UnicodeSet(pat);

            // Adjust spacing etc. as necessary.
            String toPat;
            toPat = set.toPattern(true);
            if (expToPat.equals(toPat)) {
                logln("Ok: " + pat +
                      " => " + toPat);
            } else {
                // Report the value actually compared against (expToPat),
                // not the input pattern.
                errln("FAIL: " + pat +
                      " => " + Utility.escape(toPat) +
                      ", exp " + Utility.escape(expToPat));
            }
        }
    }
}
/**
 * Test matching against context that lies outside the range being
 * transliterated, and a rule whose key is preceded by ante context.
 */
public void TestContext() {
    // Arguments are contextStart, contextLimit, start, limit: the context
    // covers both characters of "de" but only the first one is in range.
    Transliterator.Position firstCharOnly =
        new Transliterator.Position(0, 2, 0, 1);
    expect("de > x; {d}e > y;", "de", "ye", firstCharOnly);

    expect("ab{c} > z;", "xadabdabcy", "xadabdabzy");
}
/**
 * Convenience wrapper: returns the result of Utility.unescape(s).
 * The tests in this file use it to write supplementary characters as
 * backslash escapes inside ordinary string literals.
 * @param s text that may contain backslash escapes
 * @return the value returned by Utility.unescape for s
 */
static final String CharsToUnicodeString(String s) {
return Utility.unescape(s);
}
/**
 * Test rules and text containing supplementary characters (code points
 * above U+FFFF). Rules and text are written with backslash escapes and
 * run through CharsToUnicodeString before use.
 */
public void TestSupplemental() {
// Supplementary characters as a variable value and in a set range
expect(CharsToUnicodeString("$a=\\U00010300; $s=[\\U00010300-\\U00010323];" +
"a > $a; $s > i;"),
CharsToUnicodeString("ab\\U0001030Fx"),
CharsToUnicodeString("\\U00010300bix"));
// Supplementary characters captured in segments and swapped
expect(CharsToUnicodeString("$a=[a-z\\U00010300-\\U00010323];" +
"$b=[A-Z\\U00010400-\\U0001044D];" +
"($a)($b) > $2 $1;"),
CharsToUnicodeString("aB\\U00010300\\U00010400c\\U00010401\\U00010301D"),
CharsToUnicodeString("Ba\\U00010400\\U00010300\\U00010401cD\\U00010301"));
// Step-by-step trace for the case below ('|' presumably marks the
// cursor position after each step):
// k|ax\\U00010300xm
// k|a\\U00010400\\U00010300xm
// ky|\\U00010400\\U00010300xm
// ky\\U00010400|\\U00010300xm
// ky\\U00010400|\\U00010300\\U00010400m
// ky\\U00010400y|\\U00010400m
expect(CharsToUnicodeString("$a=[a\\U00010300-\\U00010323];" +
"$a {x} > | @ \\U00010400;" +
"{$a} [^\\u0000-\\uFFFF] > y;"),
CharsToUnicodeString("kax\\U00010300xm"),
CharsToUnicodeString("ky\\U00010400y\\U00010400m"));
}
/**
 * Test the quantifiers +, * and ? in rule patterns: on their own, applied
 * to segments, sets, quoted strings and variable references, and combined
 * with cursor offsets (@) in ante and post context.
 */
public void TestQuantifier() {
// Make sure @ in a quantified anteContext works
expect("a+ {b} > | @@ c; A > a; (a+ c) > '(' $1 ')';",
"AAAAAb",
"aaa(aac)");
// Make sure @ in a quantified postContext works
expect("{b} a+ > c @@ |; (a+) > '(' $1 ')';",
"baaaaa",
"caa(aaa)");
// Make sure @ in a quantified postContext with seg ref works
expect("{(b)} a+ > $1 @@ |; (a+) > '(' $1 ')';",
"baaaaa",
"baa(aaa)");
// Make sure @ past ante context doesn't enter ante context
// (Position arguments are contextStart, contextLimit, start, limit)
Transliterator.Position pos = new Transliterator.Position(0, 5, 3, 5);
expect("a+ {b} > | @@ c; x > y; (a+ c) > '(' $1 ')';",
"xxxab",
"xxx(ac)",
pos);
// Make sure @ past post context doesn't pass limit
Transliterator.Position pos2 = new Transliterator.Position(0, 4, 0, 2);
expect("{b} a+ > c @@ |; x > y; a > A;",
"baxx",
"caxx",
pos2);
// Make sure @ past post context doesn't enter post context
expect("{b} a+ > c @@ |; x > y; a > A;",
"baxx",
"cayy");
expect("(ab)? c > d;",
"c abc ababc",
"d d abd");
expect("(ab)+ {x} > '(' $1 ')';",
"x abx ababxy",
"x ab(ab) abab(abab)y");
expect("b+ > x;",
"ac abc abbc abbbc",
"ac axc axc axc");
expect("[abc]+ > x;",
"qac abrc abbcs abtbbc",
"qx xrx xs xtx");
expect("q{(ab)+} > x;",
"qa qab qaba qababc qaba",
"qa qx qxa qxc qxa");
expect("q(ab)* > x;",
"qa qab qaba qababc",
"xa x xa xc");
// Oddity -- "(foo)* > $1" causes $1 to match the run of "foo"s
// In perl, it only matches the first occurrence, so the output
// is "()a (ab) (ab)a (ab)c".
expect("q(ab)* > '(' $1 ')';",
"qa qab qaba qababc",
"()a (ab) (ab)a (abab)c");
// 'foo'+ and 'foo'* -- the quantifier should apply to the entire
// quoted string
expect("'ab'+ > x;",
"bb ab ababb",
"bb x xb");
// $foo+ and $foo* -- the quantifier should apply to the entire
// variable reference
expect("$var = ab; $var+ > x;",
"bb ab ababb",
"bb x xb");
}
// Placeholder: no test logic yet (the _TODO suffix marks it as pending).
public void TestSTV_TODO() {
}
/**
* Test inverse of Greek-Latin; Title()
*/
@ -1159,9 +1227,92 @@ public class TransliteratorTest extends TestFmwk {
}
}
//======================================================================
// icu4j only
//======================================================================
/**
 * The transliterator obtained by inverting "Null" must itself report
 * the ID "Null". (J21)
 */
public void TestNullInverse() {
    Transliterator inverse = Transliterator.getInstance("Null").getInverse();
    String inverseID = inverse.getID();
    if (inverseID.equals("Null")) {
        return;
    }
    errln("FAIL: Inverse of Null should be Null");
}
/**
 * Inverting a transliterator obtained through an alias ID must yield a
 * transliterator whose ID is the reversed source-target pair. (J22)
 */
public void TestAliasInverseID() {
    final String ID = "Latin-Hangul"; // This should be any alias ID with an inverse
    final String exp = "Hangul-Latin";

    String got = Transliterator.getInstance(ID).getInverse().getID();
    if (got.equals(exp)) {
        return;
    }
    errln("FAIL: Inverse of " + ID + " is " + got +
          ", expected " + exp);
}
/**
 * Check the ID reported by the inverse of a compound transliterator:
 * the elements are expected in reverse order, each one inverted. (J20)
 */
public void TestCompoundInverseID() {
    final String ID = "Latin-Jamo;NFC(NFD)";
    final String exp = "NFD(NFC);Jamo-Latin";

    Transliterator forward = Transliterator.getInstance(ID);
    String got = forward.getInverse().getID();
    boolean matches = got.equals(exp);
    if (!matches) {
        errln("FAIL: Inverse of " + ID + " is " + got +
              ", expected " + exp);
    }
}
/**
 * A rule that references a variable which was never defined must be
 * rejected with an IllegalArgumentException at construction time.
 */
public void TestUndefinedVariable() {
    final String rule = "$initial } a <> \u1161;";
    String rejection = null;
    boolean rejected = false;
    try {
        // Only the constructor's side effect (throwing) matters here.
        new RuleBasedTransliterator("<ID>", rule);
    } catch (IllegalArgumentException e) {
        rejected = true;
        rejection = e.getMessage();
    }
    if (rejected) {
        logln("OK: Got exception for " + rule + ", as expected: " +
              rejection);
    } else {
        errln("Fail: bogus rule " + rule + " compiled without error");
    }
}
/**
 * Test empty context: the rule wraps its key in braces but supplies
 * nothing before or after them, and must still rewrite each "a" to "b".
 */
public void TestEmptyContext() {
expect(" { a } > b;", "xay a ", "xby b ");
}
/**
 * Log the display name of every available transliterator ID. Nothing is
 * asserted; the test only fails if one of the calls throws.
 */
public void TestDisplayName() {
    Enumeration ids = Transliterator.getAvailableIDs();
    while (ids.hasMoreElements()) {
        String id = (String) ids.nextElement();
        logln(id + " -> " + Transliterator.getDisplayName(id));
    }
}
//======================================================================
// Support methods
//======================================================================
/**
 * Build a RuleBasedTransliterator with the ID "&lt;ID&gt;" from the given
 * rules and delegate to the Transliterator-based overload.
 *
 * @param rules the rule source to compile
 * @param source the text to transliterate
 * @param expectedResult the text expected after transliteration
 * @param pos position indices passed through to the delegate
 */
void expect(String rules,
            String source,
            String expectedResult,
            Transliterator.Position pos) {
    expect(new RuleBasedTransliterator("<ID>", rules),
           source, expectedResult, pos);
}
void expect(String rules, String source, String expectedResult) {
expect(new RuleBasedTransliterator("<ID>", rules), source, expectedResult);
@ -1176,33 +1327,53 @@ public class TransliteratorTest extends TestFmwk {
}
void expect(Transliterator t, String source, String expectedResult) {
String result = t.transliterate(source);
expectAux(t.getID() + ":String", source, result, expectedResult);
expect(t, source, expectedResult, (Transliterator.Position) null);
}
void expect(Transliterator t, String source, String expectedResult,
Transliterator.Position pos) {
if (pos == null) {
String result = t.transliterate(source);
expectAux(t.getID() + ":String", source, result, expectedResult);
}
Transliterator.Position index = null;
if (pos == null) {
index = new Transliterator.Position();
} else {
index = new Transliterator.Position(pos.contextStart, pos.contextLimit,
pos.start, pos.limit);
}
ReplaceableString rsource = new ReplaceableString(source);
t.transliterate(rsource);
result = rsource.toString();
if (pos == null) {
t.transliterate(rsource);
} else {
// Do it all at once -- below we do it incrementally
t.finishTransliteration(rsource, pos);
}
String result = rsource.toString();
expectAux(t.getID() + ":Replaceable", source, result, expectedResult);
// Test keyboard (incremental) transliteration -- this result
// must be the same after we finalize (see below).
rsource.replace(0, rsource.length(), "");
Transliterator.Position index = new Transliterator.Position();
StringBuffer log = new StringBuffer();
for (int i=0; i<source.length(); ++i) {
if (i != 0) {
log.append(" + ");
rsource.replace(0, rsource.length(), "");
if (pos != null) {
rsource.replace(0, 0, source);
formatInput(log, rsource, index);
log.append(" -> ");
t.transliterate(rsource, index);
formatInput(log, rsource, index);
} else {
for (int i=0; i<source.length(); ++i) {
if (i != 0) {
log.append(" + ");
}
log.append(source.charAt(i)).append(" -> ");
t.transliterate(rsource, index, source.charAt(i));
formatInput(log, rsource, index);
}
log.append(source.charAt(i)).append(" -> ");
t.transliterate(rsource, index,
String.valueOf(source.charAt(i)));
// Append the string buffer with a vertical bar '|' where
// the committed index is.
String s = rsource.toString();
log.append(s.substring(0, index.start)).
append('|').
append(s.substring(index.start));
}
// As a final step in keyboard transliteration, we must call
@ -1217,6 +1388,41 @@ public class TransliteratorTest extends TestFmwk {
expectedResult);
}
/**
 * Append a rendering of the text together with its position indices.
 * The output has the form aaa{bbb|ccc|ddd}eee, where the braces mark the
 * context start and limit and the bars mark the start and limit. If the
 * indices are out of order or out of range, a diagnostic listing the
 * four values is appended instead.
 *
 * @param appendTo result is appended to this param.
 * @param input the string being transliterated
 * @param pos the index struct
 * @return appendTo
 */
StringBuffer formatInput(StringBuffer appendTo,
                         final ReplaceableString input,
                         final Transliterator.Position pos) {
    final int cs = pos.contextStart;
    final int s = pos.start;
    final int l = pos.limit;
    final int cl = pos.contextLimit;
    final int len = input.length();

    boolean wellFormed =
        0 <= cs && cs <= s && s <= l && l <= cl && cl <= len;

    if (!wellFormed) {
        return appendTo.append("INVALID Transliterator.Position {cs=" +
                               cs + ", s=" + s + ", l=" +
                               l + ", cl=" + cl + "} on " +
                               input);
    }

    // Slice the text at the four indices and join the pieces with markers.
    appendTo.append(input.substring(0, cs));
    appendTo.append('{');
    appendTo.append(input.substring(cs, s));
    appendTo.append('|');
    appendTo.append(input.substring(s, l));
    appendTo.append('|');
    appendTo.append(input.substring(l, cl));
    appendTo.append('}');
    appendTo.append(input.substring(cl, len));
    return appendTo;
}
void expectAux(String tag, String source,
String result, String expectedResult) {
expectAux(tag, source + " -> " + result,

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/ReplaceableString.java,v $
* $Date: 2000/04/25 17:17:37 $
* $Revision: 1.3 $
* $Date: 2001/10/03 00:14:22 $
* $Revision: 1.4 $
*
*****************************************************************************************
*/
@ -24,7 +24,7 @@ package com.ibm.text;
*
* @see Replaceable
* @author Alan Liu
* @version $RCSfile: ReplaceableString.java,v $ $Revision: 1.3 $ $Date: 2000/04/25 17:17:37 $
* @version $RCSfile: ReplaceableString.java,v $ $Revision: 1.4 $ $Date: 2001/10/03 00:14:22 $
*/
public class ReplaceableString implements Replaceable {
private StringBuffer buf;
@ -67,6 +67,13 @@ public class ReplaceableString implements Replaceable {
return buf.toString();
}
/**
 * Return a substring of this object's text, as produced by
 * <code>StringBuffer.substring(start, limit)</code> on the internal
 * buffer.
 * @param start the beginning index, inclusive
 * @param limit the ending index, exclusive
 * @return the characters in the range [start, limit)
 */
public String substring(int start, int limit) {
return buf.substring(start, limit);
}
/**
* Return the number of characters contained in this object.
* <code>Replaceable</code> API.

View File

@ -4,9 +4,9 @@
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java,v $
* $Date: 2001/09/26 18:00:06 $
* $Revision: 1.46 $
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java,v $
* $Date: 2001/10/03 00:14:22 $
* $Revision: 1.47 $
*
*****************************************************************************************
*/
@ -27,18 +27,18 @@ import com.ibm.text.resources.ResourceReader;
* Whitespace, as defined by <code>Character.isWhitespace()</code>,
* is ignored. If the first non-blank character on a line is '#',
* the entire line is ignored as a comment. </p>
*
*
* <p>Each set of rules consists of two groups, one forward, and one
* reverse. This is a convention that is not enforced; rules for one
* direction may be omitted, with the result that translations in
* that direction will not modify the source text. In addition,
* bidirectional forward-reverse rules may be specified for
* symmetrical transformations.</p>
*
*
* <p><b>Rule syntax</b> </p>
*
*
* <p>Rule statements take one of the following forms: </p>
*
*
* <dl>
* <dt><code>$alefmadda=\u0622;</code></dt>
* <dd><strong>Variable definition.</strong> The name on the
@ -66,7 +66,7 @@ import com.ibm.text.resources.ResourceReader;
* the string on the left when performing reverse
* transliteration.</dd>
* </dl>
*
*
* <dl>
* <dt><code>ai&lt;&gt;$alefmadda;</code></dt>
* <dd><strong>Bidirectional translation rule.</strong> This
@ -75,7 +75,7 @@ import com.ibm.text.resources.ResourceReader;
* transliteration, and vice versa when performing reverse
* transliteration.</dd>
* </dl>
*
*
* <p>Translation rules consist of a <em>match pattern</em> and an <em>output
* string</em>. The match pattern consists of literal characters,
* optionally preceded by context, and optionally followed by
@ -92,7 +92,7 @@ import com.ibm.text.resources.ResourceReader;
* (or &quot;<code>123}456</code>&quot;) in which the literal
* pattern &quot;<code>123</code>&quot; must be followed by &quot;<code>456</code>&quot;.
* </p>
*
*
* <p>The output string of a forward or reverse rule consists of
* characters to replace the literal pattern characters. If the
* output string contains the character '<code>|</code>', this is
@ -102,59 +102,59 @@ import com.ibm.text.resources.ResourceReader;
* placed within the replacement text; however, it can actually be
* placed into the preceding or following context by using the
* special character '<code>@</code>'. Examples:</p>
*
*
* <blockquote>
* <p><code>a {foo} z &gt; | @ bar; # foo -&gt; bar, move cursor
* before a<br>
* {foo} xyz &gt; bar @@|; #&nbsp;foo -&gt; bar, cursor between
* y and z</code></p>
* </blockquote>
*
*
* <p><b>UnicodeSet</b></p>
*
*
* <p><code>UnicodeSet</code> patterns may appear anywhere that
* makes sense. They may appear in variable definitions.
* Contrariwise, <code>UnicodeSet</code> patterns may themselves
* contain variable references, such as &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;,
* or &quot;<code>$range=a-z;$ll=[$range]</code>&quot;.</p>
*
*
* <p><code>UnicodeSet</code> patterns may also be embedded directly
* into rule strings. Thus, the following two rules are equivalent:</p>
*
*
* <blockquote>
* <p><code>$vowel=[aeiou]; $vowel&gt;'*'; # One way to do this<br>
* [aeiou]&gt;'*';
* &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
* Another way</code></p>
* </blockquote>
*
*
* <p>See {@link UnicodeSet} for more documentation and examples.</p>
*
*
* <p><b>Segments</b></p>
*
*
* <p>Segments of the input string can be matched and copied to the
* output string. This makes certain sets of rules simpler and more
* general, and makes reordering possible. For example:</p>
*
*
* <blockquote>
* <p><code>([a-z]) &gt; $1 $1;
* &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
* double lowercase letters<br>
* ([:Lu:]) ([:Ll:]) &gt; $2 $1; # reverse order of Lu-Ll pairs</code></p>
* </blockquote>
*
*
* <p>The segment of the input string to be copied is delimited by
* &quot;<code>(</code>&quot; and &quot;<code>)</code>&quot;. Up to
* nine segments may be defined. Segments may not overlap. In the
* output string, &quot;<code>$1</code>&quot; through &quot;<code>$9</code>&quot;
* represent the input string segments, in left-to-right order of
* definition.</p>
*
*
* <p><b>Anchors</b></p>
*
*
* <p>Patterns can be anchored to the beginning or the end of the text. This is done with the
* special characters '<code>^</code>' and '<code>$</code>'. For example:</p>
*
*
* <blockquote>
* <p><code>^ a&nbsp;&nbsp; &gt; 'BEG_A'; &nbsp;&nbsp;# match 'a' at start of text<br>
* &nbsp; a&nbsp;&nbsp; &gt; 'A';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances
@ -163,24 +163,24 @@ import com.ibm.text.resources.ResourceReader;
* &nbsp; z&nbsp;&nbsp; &gt; 'Z';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances
* of 'z'</code></p>
* </blockquote>
*
*
* <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>.
* This is done by including a virtual anchor character '<code>$</code>' at the end of the
* set pattern. Although this is usually the match character for the end anchor, the set will
* match either the beginning or the end of the text, depending on its placement. For
* example:</p>
*
*
* <blockquote>
* <p><code>$x = [a-z$]; &nbsp;&nbsp;# match 'a' through 'z' OR anchor<br>
* $x 1&nbsp;&nbsp;&nbsp; &gt; 2;&nbsp;&nbsp; # match '1' after a-z or at the start<br>
* &nbsp;&nbsp; 3 $x &gt; 4; &nbsp;&nbsp;# match '3' before a-z or at the end</code></p>
* </blockquote>
*
*
* <p><b>Example</b> </p>
*
*
* <p>The following example rules illustrate many of the features of
* the rule language. </p>
*
*
* <table border="0" cellpadding="4">
* <tr>
* <td valign="top">Rule 1.</td>
@ -195,10 +195,10 @@ import com.ibm.text.resources.ResourceReader;
* <td valign="top" nowrap><code>yz&gt;q</code></td>
* </tr>
* </table>
*
*
* <p>Applying these rules to the string &quot;<code>adefabcdefz</code>&quot;
* yields the following results: </p>
*
*
* <table border="0" cellpadding="4">
* <tr>
* <td valign="top" nowrap><code>|adefabcdefz</code></td>
@ -251,23 +251,23 @@ import com.ibm.text.resources.ResourceReader;
* transliteration is complete.</td>
* </tr>
* </table>
*
*
* <p>The order of rules is significant. If multiple rules may match
* at some point, the first matching rule is applied. </p>
*
*
* <p>Forward and reverse rules may have an empty output string.
* Otherwise, an empty left or right hand side of any statement is a
* syntax error. </p>
*
*
* <p>Single quotes are used to quote any character other than a
* digit or letter. To specify a single quote itself, inside or
* outside of quotes, use two single quotes in a row. For example,
* the rule &quot;<code>'&gt;'&gt;o''clock</code>&quot; changes the
* string &quot;<code>&gt;</code>&quot; to the string &quot;<code>o'clock</code>&quot;.
* </p>
*
*
* <p><b>Notes</b> </p>
*
*
* <p>While a RuleBasedTransliterator is being built, it checks that
* the rules are added in proper order. For example, if the rule
* &quot;a&gt;x&quot; is followed by the rule &quot;ab&gt;y&quot;,
@ -275,11 +275,11 @@ import com.ibm.text.resources.ResourceReader;
* the second rule can never be triggered, since the first rule
* always matches anything it matches. In other words, the first
* rule <em>masks</em> the second rule. </p>
*
*
* <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
*
*
* @author Alan Liu
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.46 $ $Date: 2001/09/26 18:00:06 $
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.47 $ $Date: 2001/10/03 00:14:22 $
*/
public class RuleBasedTransliterator extends Transliterator {
@ -433,7 +433,7 @@ public class RuleBasedTransliterator extends Transliterator {
* stored in the rule text to represent the set of characters.
* variables[i] represents character (variablesBase + i).
*/
UnicodeSet[] variables;
UnicodeMatcher[] variables;
/**
* The character that represents variables[0]. Characters
@ -498,6 +498,9 @@ public class RuleBasedTransliterator extends Transliterator {
/**
* $Log: RuleBasedTransliterator.java,v $
* Revision 1.47 2001/10/03 00:14:22 alan
* jitterbug 73: finish quantifier and supplemental char support
*
* Revision 1.46 2001/09/26 18:00:06 alan
* jitterbug 67: sync parser with icu4c, allow unlimited, nested segments
*

View File

@ -4,9 +4,9 @@
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $
* $Date: 2001/09/26 18:00:06 $
* $Revision: 1.28 $
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $
* $Date: 2001/10/03 00:14:23 $
* $Revision: 1.29 $
*
*****************************************************************************************
*/
@ -28,7 +28,7 @@ import com.ibm.util.Utility;
* may contain variables. Variables represent a set of Unicode
* characters, such as the letters <i>a</i> through <i>z</i>.
* Variables are detected by looking up each character in a supplied
* variable list to see if it has been so defined.
* variable list to see if it has been so defined.
*
* <p>A rule may contain segments in its input string and segment references in
* its output string. A segment is a substring of the input pattern, indicated
@ -44,7 +44,7 @@ import com.ibm.util.Utility;
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
*
* @author Alan Liu
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.28 $ $Date: 2001/09/26 18:00:06 $
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.29 $ $Date: 2001/10/03 00:14:23 $
*/
class TransliterationRule {
@ -310,7 +310,7 @@ class TransliterationRule {
* r1: aakkkpppp
* r2: aaakkkkkpppp
* ^
*
*
* The strings must be aligned at the first character of the
* key. The length of r1 to the left of the alignment point
* must be <= the length of r2 to the left; ditto for the
@ -346,10 +346,10 @@ class TransliterationRule {
int left2 = r2.anteContextLength;
int right = pattern.length() - left;
int right2 = r2.pattern.length() - left2;
// TODO Clean this up -- some logic might be combinable with the
// next statement.
// Test for anchor masking
if (left == left2 && right == right2 &&
keyLength <= r2.keyLength &&
@ -371,7 +371,7 @@ class TransliterationRule {
pos - UTF16.getCharCount(UTF16.charAt(str, pos-1)) :
pos - 1;
}
static final int posAfter(Replaceable str, int pos) {
return (pos >= 0 && pos < str.length()) ?
pos + UTF16.getCharCount(UTF16.charAt(str, pos)) :
@ -387,10 +387,10 @@ class TransliterationRule {
* context and key characters match, but the text is not long
* enough to match all of them. A full match means all context
* and key characters match.
*
*
* If a full match is obtained, perform a replacement, update pos,
* and return U_MATCH. Otherwise both text and pos are unchanged.
*
*
* @param text the text
* @param pos the position indices
* @param incremental if TRUE, test for partial matches that may
@ -559,13 +559,13 @@ class TransliterationRule {
if (segments == null) {
text.replace(pos.start, keyLimit, output);
lenDelta = output.length() - (keyLimit - pos.start);
if (cursorPos >= 0 && cursorPos < keyLength) {
// Within the key, the cursor refers to 16-bit code units
if (cursorPos >= 0 && cursorPos <= output.length()) {
// Within the output string, the cursor refers to 16-bit code units
newStart = pos.start + cursorPos;
} else {
newStart = pos.start;
int n = cursorPos;
// Outside the key, cursorPos counts code points
// Outside the output string, cursorPos counts code points
while (n > 0) {
newStart += UTF16.getCharCount(UTF16.charAt(text, newStart));
--n;
@ -638,7 +638,7 @@ class TransliterationRule {
}
}
}
oText += lenDelta;
pos.limit += lenDelta;
pos.contextLimit += lenDelta;
@ -665,11 +665,11 @@ class TransliterationRule {
* cleared out by, at the end, calling this method with a literal
* character.
*/
protected void appendToRule(StringBuffer rule,
int c,
boolean isLiteral,
boolean escapeUnprintable,
StringBuffer quoteBuf) {
static void appendToRule(StringBuffer rule,
int c,
boolean isLiteral,
boolean escapeUnprintable,
StringBuffer quoteBuf) {
// If we are escaping unprintables, then escape them outside
// quotes. <backslash>u and <backslash>U are not recognized within quotes. The same
// logic applies to literals, but literals are never escaped.
@ -745,11 +745,11 @@ class TransliterationRule {
//System.out.println("rule=" + rule.toString() + " qb=" + quoteBuf.toString());
}
protected final void appendToRule(StringBuffer rule,
String text,
boolean isLiteral,
boolean escapeUnprintable,
StringBuffer quoteBuf) {
static final void appendToRule(StringBuffer rule,
String text,
boolean isLiteral,
boolean escapeUnprintable,
StringBuffer quoteBuf) {
for (int i=0; i<text.length(); ++i) {
appendToRule(rule, text.charAt(i), isLiteral, escapeUnprintable, quoteBuf);
}
@ -764,7 +764,7 @@ class TransliterationRule {
*/
public String toRule(boolean escapeUnprintable) {
int i;
StringBuffer rule = new StringBuffer();
// iseg indexes into segments[] directly (not offset from FSPI)
@ -863,7 +863,7 @@ class TransliterationRule {
if (show) {
rule.append((char)(48+d));
}
}
}
rule.append(' ');
}
}
@ -905,6 +905,9 @@ class TransliterationRule {
/**
* $Log: TransliterationRule.java,v $
* Revision 1.29 2001/10/03 00:14:23 alan
* jitterbug 73: finish quantifier and supplemental char support
*
* Revision 1.28 2001/09/26 18:00:06 alan
* jitterbug 67: sync parser with icu4c, allow unlimited, nested segments
*

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Transliterator.java,v $
* $Date: 2001/09/28 20:37:09 $
* $Revision: 1.43 $
* $Date: 2001/10/03 00:14:23 $
* $Revision: 1.44 $
*
*****************************************************************************************
*/
@ -241,7 +241,7 @@ import com.ibm.util.CaseInsensitiveString;
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
*
* @author Alan Liu
* @version $RCSfile: Transliterator.java,v $ $Revision: 1.43 $ $Date: 2001/09/28 20:37:09 $
* @version $RCSfile: Transliterator.java,v $ $Revision: 1.44 $ $Date: 2001/10/03 00:14:23 $
*/
public abstract class Transliterator {
/**
@ -553,6 +553,15 @@ public abstract class Transliterator {
index.contextLimit += insertion.length();
}
char last = (text.length() > 0) ?
text.charAt(text.length() - 1) : 0;
if (UTF16.isLeadSurrogate(last)) {
// Oops, the caller passed us a single lead surrogate at the
// end of the insertion. Don't transliterate until more text
// comes in.
return;
}
filteredTransliterate(text, index, true);
// This doesn't work once we add quantifier support. Need to rewrite

View File

@ -142,6 +142,10 @@ class TransliteratorParser {
private static final char CURSOR_OFFSET = '@';
private static final char ANCHOR_START = '^';
private static final char KLEENE_STAR = '*';
private static final char ONE_OR_MORE = '+';
private static final char ZERO_OR_ONE = '?';
// By definition, the ANCHOR_END special character is a
// trailing SymbolTable.SYMBOL_REF character.
// private static final char ANCHOR_END = '$';
@ -382,7 +386,7 @@ class TransliteratorParser {
idBlock = idBlockResult.toString();
// Convert the set vector to an array
data.variables = new UnicodeSet[variablesVector.size()];
data.variables = new UnicodeMatcher[variablesVector.size()];
variablesVector.copyInto(data.variables);
variablesVector = null;
@ -658,7 +662,7 @@ class TransliteratorParser {
int varStart = -1; // Most recent $variableReference
int varLimit = -1;
int[] iref = new int[1];
main:
while (pos < limit && !done) {
char c = rule.charAt(pos++);
@ -853,56 +857,71 @@ class TransliteratorParser {
}
}
break;
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
case KLEENE_STAR:
case ONE_OR_MORE:
case ZERO_OR_ONE:
// Quantifiers. We handle single characters, quoted strings,
// variable references, and segments.
// a+ matches aaa
// 'foo'+ matches foofoofoo
// $v+ matches xyxyxy if $v == xy
// (seg)+ matches segsegseg
{
int qstart, qlimit;
boolean[] isOpenParen = new boolean[1];
boolean isSegment = false;
if (segments != null &&
segments.getLastParenOffset(isOpenParen) == buf.length()) {
// The */+ immediately follows a segment
if (isOpenParen[0]) {
syntaxError("Misplaced quantifier", rule, start);
}
int[] startparam = new int[1];
int[] limitparam = new int[1];
if (!segments.extractLastParenSubstring(startparam, limitparam)) {
syntaxError("Mismatched segment delimiters", rule, start);
}
qstart = startparam[0];
qlimit = limitparam[0];
isSegment = true;
} else {
// The */+ follows an isolated character or quote
// or variable reference
if (buf.length() == quoteLimit) {
// The */+ follows a 'quoted string'
qstart = quoteStart;
qlimit = quoteLimit;
} else if (buf.length() == varLimit) {
// The */+ follows a $variableReference
qstart = varStart;
qlimit = varLimit;
} else {
// The */+ follows a single character
qstart = buf.length() - 1;
qlimit = qstart + 1;
}
}
UnicodeMatcher m =
new StringMatcher(buf.toString(), qstart, qlimit,
isSegment, parser.data);
int min = 0;
int max = Quantifier.MAX;
switch (c) {
case ONE_OR_MORE:
min = 1;
break;
case ZERO_OR_ONE:
min = 0;
max = 1;
break;
// case KLEENE_STAR:
// do nothing -- min, max already set
}
m = new Quantifier(m, min, max);
buf.setLength(qstart);
buf.append(parser.generateStandInFor(m));
}
break;
// case SET_CLOSE:
default:
// Disallow unquoted characters other than [0-9A-Za-z]
@ -947,7 +966,7 @@ class TransliteratorParser {
//----------------------------------------------------------------------
// END RuleHalf
//----------------------------------------------------------------------
/**
* MAIN PARSER. Parse the next rule in the given rule string, starting
* at pos. Return the index after the last character parsed. Do not

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $
* $Date: 2001/09/28 05:47:30 $
* $Revision: 1.47 $
* $Date: 2001/10/03 00:18:23 $
* $Revision: 1.48 $
*
*****************************************************************************************
*/
@ -77,14 +77,6 @@ public class TransliteratorTest extends TestFmwk {
logln("Elapsed time: " + ms + " ms");
}
/**
 * Walks every ID reported by Transliterator.getAvailableIDs() and logs
 * it together with its display name.  Nothing is asserted; the test
 * only fails if one of the calls throws.
 */
public void TestDisplayName() {
    Enumeration ids = Transliterator.getAvailableIDs();
    while (ids.hasMoreElements()) {
        String id = (String) ids.nextElement();
        logln(id + " -> " + Transliterator.getDisplayName(id));
    }
}
public void TestSimpleRules() {
/* Example: rules 1. ab>x|y
* 2. yc>z
@ -131,29 +123,6 @@ public class TransliteratorTest extends TestFmwk {
expect(t, "abcdefgABCDEFGU", "&bcd&fg!^**!^*&");
}
/**
 * Test undefined variable.  Building a RuleBasedTransliterator from a
 * rule that references the never-defined variable $initial is expected
 * to throw IllegalArgumentException; compiling without error is a
 * failure.
 */
public void TestUndefinedVariable() {
    final String rule = "$initial } a <> \u1161;";
    boolean rejected = false;
    try {
        // Only the constructor's side effect (the exception) matters.
        new RuleBasedTransliterator("<ID>", rule);
    } catch (IllegalArgumentException e) {
        rejected = true;
        logln("OK: Got exception for " + rule + ", as expected: " +
              e.getMessage());
    }
    if (!rejected) {
        errln("Fail: bogus rule " + rule + " compiled without error");
    }
}
/**
 * Test empty context: a rule whose key is wrapped in braces but has no
 * context text on either side must still rewrite every 'a'.
 */
public void TestEmptyContext() {
    String rules = " { a } > b;";
    String input = "xay a ";
    String expected = "xby b ";
    expect(rules, input, expected);
}
/**
* Test inline set syntax and set variable syntax.
*/
@ -524,6 +493,9 @@ public class TransliteratorTest extends TestFmwk {
expect(hex3, "012", "&#x30;&#x31;&#x32;");
}
/**
 * Placeholder; the body is intentionally empty and asserts nothing.
 * NOTE(review): the _TODO suffix suggests a test that still has to be
 * written -- confirm what J329 should cover before filling it in.
 */
public void TestJ329_TODO() {
}
/**
* Test segments and segment references.
*/
@ -681,6 +653,9 @@ public class TransliteratorTest extends TestFmwk {
}
/**
 * Placeholder; the body is intentionally empty and asserts nothing.
 * NOTE(review): the _TODO suffix suggests a test that still has to be
 * written -- confirm what J476 should cover before filling it in.
 */
public void TestCopyJ476_TODO() {
}
/**
* Test inter-Indic transliterators. These are composed.
*/
@ -700,47 +675,6 @@ public class TransliteratorTest extends TestFmwk {
expect(dg, dev, guj);
}
/**
 * Test IDs of inverses of compound transliterators. (J20)
 * The inverse of "Latin-Jamo;NFC(NFD)" must report the ID
 * "NFD(NFC);Jamo-Latin".
 */
public void TestCompoundInverseID() {
    final String ID = "Latin-Jamo;NFC(NFD)";
    final String exp = "NFD(NFC);Jamo-Latin";
    Transliterator forward = Transliterator.getInstance(ID);
    String got = forward.getInverse().getID();
    if (got.equals(exp)) {
        return;
    }
    errln("FAIL: Inverse of " + ID + " is " + got +
          ", expected " + exp);
}
/**
 * Inverse of "Null" should be "Null". (J21)
 */
public void TestNullInverse() {
    Transliterator inverse = Transliterator.getInstance("Null").getInverse();
    boolean isNull = inverse.getID().equals("Null");
    if (!isNull) {
        errln("FAIL: Inverse of Null should be Null");
    }
}
/**
 * Check ID of inverse of alias. (J22)
 * The inverse of "Latin-Hangul" must report the ID "Hangul-Latin".
 */
public void TestAliasInverseID() {
    // This should be any alias ID with an inverse
    final String ID = "Latin-Hangul";
    final String exp = "Hangul-Latin";
    Transliterator forward = Transliterator.getInstance(ID);
    String got = forward.getInverse().getID();
    if (got.equals(exp)) {
        return;
    }
    errln("FAIL: Inverse of " + ID + " is " + got +
          ", expected " + exp);
}
/**
* Test filter syntax in IDs. (J23)
*/
@ -785,118 +719,6 @@ public class TransliteratorTest extends TestFmwk {
}
}
/**
 * Test the "Remove" transliterator: with the filter [aeiou] every
 * lowercase vowel in the input is deleted.
 */
public void TestRemove() {
    Transliterator remover = Transliterator.getInstance("Remove[aeiou]");
    String input = "The quick brown fox.";
    String expected = "Th qck brwn fx.";
    expect(remover, input, expected);
}
/**
 * Checks that rules and set patterns can be regenerated from compiled
 * objects.
 *
 * DATA is a flat list of triples {kind, input, expected}:
 *   kind RBT -- input is a rule string.  A transliterator is built with
 *               Transliterator.createFromRules(); toRules(false) must
 *               equal the unescaped expected string and toRules(true)
 *               must equal the expected string exactly as written.
 *   kind SET -- input is a UnicodeSet pattern; toPattern(true) must
 *               equal the expected string.
 */
public void TestToRules() {
    String RBT = "rbt";
    String SET = "set";
    String[] DATA = {
        RBT,
        "$a=\\u4E61; [$a] > A;",
        "[\\u4E61] > A;",

        RBT,
        "$white=[[:Zs:][:Zl:]]; $white{a} > A;",
        "[[:Zs:][:Zl:]]{a} > A;",

        SET,
        "[[:Zs:][:Zl:]]",
        "[[:Zs:][:Zl:]]",

        SET,
        "[:Ps:]",
        "[:Ps:]",

        SET,
        "[:L:]",
        "[:L:]",

        SET,
        "[[:L:]-[A]]",
        "[[:L:]-[A]]",

        SET,
        "[~[:Lu:][:Ll:]]",
        "[~[:Lu:][:Ll:]]",

        SET,
        "[~[a-z]]",
        "[~[a-z]]",

        RBT,
        "$white=[:Zs:]; $black=[^$white]; $black{a} > A;",
        "[^[:Zs:]]{a} > A;",

        RBT,
        "$a=[:Zs:]; $b=[[a-z]-$a]; $b{a} > A;",
        "[[a-z]-[:Zs:]]{a} > A;",

        RBT,
        "$a=[:Zs:]; $b=[$a&[a-z]]; $b{a} > A;",
        "[[:Zs:]&[a-z]]{a} > A;",

        RBT,
        "$a=[:Zs:]; $b=[x$a]; $b{a} > A;",
        "[x[:Zs:]]{a} > A;",
    };

    for (int d=0; d < DATA.length; d+=3) {
        // Compare by value; do not rely on the table entries being the
        // very same String object as the RBT local.
        if (RBT.equals(DATA[d])) {
            // Transliterator test
            Transliterator t = Transliterator.createFromRules("ID",
                                 DATA[d+1], Transliterator.FORWARD);
            if (t == null) {
                errln("FAIL: createFromRules failed");
                return;
            }
            String rules, escapedRules;
            rules = t.toRules(false);
            escapedRules = t.toRules(true);
            String expRules = Utility.unescape(DATA[d+2]);
            String expEscapedRules = DATA[d+2];
            if (rules.equals(expRules)) {
                logln("Ok: " + DATA[d+1] +
                      " => " + Utility.escape(rules));
            } else {
                errln("FAIL: " + DATA[d+1] +
                      " => " + Utility.escape(rules + ", exp " + expRules));
            }
            if (escapedRules.equals(expEscapedRules)) {
                logln("Ok: " + DATA[d+1] +
                      " => " + escapedRules);
            } else {
                errln("FAIL: " + DATA[d+1] +
                      " => " + escapedRules + ", exp " + expEscapedRules);
            }

        } else {
            // UnicodeSet test
            String pat = DATA[d+1];
            String expToPat = DATA[d+2];
            UnicodeSet set = new UnicodeSet(pat);

            // Adjust spacing etc. as necessary.
            String toPat;
            toPat = set.toPattern(true);
            if (expToPat.equals(toPat)) {
                logln("Ok: " + pat +
                      " => " + toPat);
            } else {
                // Report the expected pattern, not the input pattern.
                errln("FAIL: " + pat +
                      " => " + Utility.escape(toPat) +
                      ", exp " + Utility.escape(expToPat));
            }
        }
    }
}
/**
* Test the case mapping transliterators.
*/
@ -966,6 +788,9 @@ public class TransliteratorTest extends TestFmwk {
}
}
/**
 * Placeholder; the body is intentionally empty and asserts nothing.
 * NOTE(review): the _TODO suffix suggests a createInstance test that
 * still has to be written -- confirm the intended coverage.
 */
public void TestCreateInstance_TODO() {
}
/**
* Test the normalization transliterator.
*/
@ -1139,6 +964,249 @@ public class TransliteratorTest extends TestFmwk {
}
}
/**
 * Placeholder; the body is intentionally empty and asserts nothing.
 * NOTE(review): the _TODO suffix suggests a compound-filter test that
 * still has to be written -- confirm the intended coverage.
 */
public void TestCompoundFilter_TODO() {
}
/**
 * Test the "Remove" transliterator: with the filter [aeiou] every
 * lowercase vowel in the input is deleted.
 */
public void TestRemove() {
    Transliterator remover = Transliterator.getInstance("Remove[aeiou]");
    String input = "The quick brown fox.";
    String expected = "Th qck brwn fx.";
    expect(remover, input, expected);
}
/**
 * Checks that rules and set patterns can be regenerated from compiled
 * objects.
 *
 * DATA is a flat list of triples {kind, input, expected}:
 *   kind RBT -- input is a rule string.  A transliterator is built with
 *               Transliterator.createFromRules(); toRules(false) must
 *               equal the unescaped expected string and toRules(true)
 *               must equal the expected string exactly as written.
 *   kind SET -- input is a UnicodeSet pattern; toPattern(true) must
 *               equal the expected string.
 */
public void TestToRules() {
    String RBT = "rbt";
    String SET = "set";
    String[] DATA = {
        RBT,
        "$a=\\u4E61; [$a] > A;",
        "[\\u4E61] > A;",

        RBT,
        "$white=[[:Zs:][:Zl:]]; $white{a} > A;",
        "[[:Zs:][:Zl:]]{a} > A;",

        SET,
        "[[:Zs:][:Zl:]]",
        "[[:Zs:][:Zl:]]",

        SET,
        "[:Ps:]",
        "[:Ps:]",

        SET,
        "[:L:]",
        "[:L:]",

        SET,
        "[[:L:]-[A]]",
        "[[:L:]-[A]]",

        SET,
        "[~[:Lu:][:Ll:]]",
        "[~[:Lu:][:Ll:]]",

        SET,
        "[~[a-z]]",
        "[~[a-z]]",

        RBT,
        "$white=[:Zs:]; $black=[^$white]; $black{a} > A;",
        "[^[:Zs:]]{a} > A;",

        RBT,
        "$a=[:Zs:]; $b=[[a-z]-$a]; $b{a} > A;",
        "[[a-z]-[:Zs:]]{a} > A;",

        RBT,
        "$a=[:Zs:]; $b=[$a&[a-z]]; $b{a} > A;",
        "[[:Zs:]&[a-z]]{a} > A;",

        RBT,
        "$a=[:Zs:]; $b=[x$a]; $b{a} > A;",
        "[x[:Zs:]]{a} > A;",
    };

    for (int d=0; d < DATA.length; d+=3) {
        // Compare by value; do not rely on the table entries being the
        // very same String object as the RBT local.
        if (RBT.equals(DATA[d])) {
            // Transliterator test
            Transliterator t = Transliterator.createFromRules("ID",
                                 DATA[d+1], Transliterator.FORWARD);
            if (t == null) {
                errln("FAIL: createFromRules failed");
                return;
            }
            String rules, escapedRules;
            rules = t.toRules(false);
            escapedRules = t.toRules(true);
            String expRules = Utility.unescape(DATA[d+2]);
            String expEscapedRules = DATA[d+2];
            if (rules.equals(expRules)) {
                logln("Ok: " + DATA[d+1] +
                      " => " + Utility.escape(rules));
            } else {
                errln("FAIL: " + DATA[d+1] +
                      " => " + Utility.escape(rules + ", exp " + expRules));
            }
            if (escapedRules.equals(expEscapedRules)) {
                logln("Ok: " + DATA[d+1] +
                      " => " + escapedRules);
            } else {
                errln("FAIL: " + DATA[d+1] +
                      " => " + escapedRules + ", exp " + expEscapedRules);
            }

        } else {
            // UnicodeSet test
            String pat = DATA[d+1];
            String expToPat = DATA[d+2];
            UnicodeSet set = new UnicodeSet(pat);

            // Adjust spacing etc. as necessary.
            String toPat;
            toPat = set.toPattern(true);
            if (expToPat.equals(toPat)) {
                logln("Ok: " + pat +
                      " => " + toPat);
            } else {
                // Report the expected pattern, not the input pattern.
                errln("FAIL: " + pat +
                      " => " + Utility.escape(toPat) +
                      ", exp " + Utility.escape(expToPat));
            }
        }
    }
}
/**
 * Context handling.  The first case runs with an explicit position
 * (contextStart=0, contextLimit=2, start=0, limit=1) and expects "de"
 * to become "ye"; the second runs over the whole string and expects
 * 'c' to be rewritten only where "ab" precedes it.
 */
public void TestContext() {
    // cs cl s l
    Transliterator.Position window =
        new Transliterator.Position(0, 2, 0, 1);
    String rulesWithPos = "de > x; {d}e > y;";
    expect(rulesWithPos, "de", "ye", window);

    String anteContextRules = "ab{c} > z;";
    expect(anteContextRules, "xadabdabcy", "xadabdabzy");
}
/**
 * Returns the result of Utility.unescape() applied to s, so that test
 * data can be written with backslash escapes.
 */
static final String CharsToUnicodeString(String s) {
    String unescaped = Utility.unescape(s);
    return unescaped;
}
/**
 * Rules, input and expected output that contain supplementary
 * characters (U+10300..U+10323 and U+10400..U+1044D), covering
 * variables, sets, segments and cursor movement.  All strings are
 * written with escapes and expanded by CharsToUnicodeString().
 */
public void TestSupplemental() {
    // Case 1: supplementary characters in a variable and in a set
    String rules = CharsToUnicodeString("$a=\\U00010300; $s=[\\U00010300-\\U00010323];" +
                                        "a > $a; $s > i;");
    String source = CharsToUnicodeString("ab\\U0001030Fx");
    String expected = CharsToUnicodeString("\\U00010300bix");
    expect(rules, source, expected);

    // Case 2: swapping segments that may each hold a supplementary char
    rules = CharsToUnicodeString("$a=[a-z\\U00010300-\\U00010323];" +
                                 "$b=[A-Z\\U00010400-\\U0001044D];" +
                                 "($a)($b) > $2 $1;");
    source = CharsToUnicodeString("aB\\U00010300\\U00010400c\\U00010401\\U00010301D");
    expected = CharsToUnicodeString("Ba\\U00010400\\U00010300\\U00010401cD\\U00010301");
    expect(rules, source, expected);

    // Case 3: cursor movement across supplementary characters
    // k|ax\\U00010300xm

    // k|a\\U00010400\\U00010300xm
    // ky|\\U00010400\\U00010300xm
    // ky\\U00010400|\\U00010300xm

    // ky\\U00010400|\\U00010300\\U00010400m
    // ky\\U00010400y|\\U00010400m
    rules = CharsToUnicodeString("$a=[a\\U00010300-\\U00010323];" +
                                 "$a {x} > | @ \\U00010400;" +
                                 "{$a} [^\\u0000-\\uFFFF] > y;");
    source = CharsToUnicodeString("kax\\U00010300xm");
    expected = CharsToUnicodeString("ky\\U00010400y\\U00010400m");
    expect(rules, source, expected);
}
/**
 * Quantifier tests (*, + and ?) applied to single characters, sets,
 * segments, quoted strings and variable references, including cursor
 * offsets (@) next to quantified context.
 *
 * The whole-string cases are kept in tables of {rules, source,
 * expected} triples; the two cases that need an explicit
 * Transliterator.Position run between the tables so that the overall
 * order of checks is unchanged.
 */
public void TestQuantifier() {
    String[] cursorCases = {
        // Make sure @ in a quantified anteContext works
        "a+ {b} > | @@ c; A > a; (a+ c) > '(' $1 ')';",
        "AAAAAb",
        "aaa(aac)",

        // Make sure @ in a quantified postContext works
        "{b} a+ > c @@ |; (a+) > '(' $1 ')';",
        "baaaaa",
        "caa(aaa)",

        // Make sure @ in a quantified postContext with seg ref works
        "{(b)} a+ > $1 @@ |; (a+) > '(' $1 ')';",
        "baaaaa",
        "baa(aaa)",
    };
    for (int i = 0; i < cursorCases.length; i += 3) {
        expect(cursorCases[i], cursorCases[i+1], cursorCases[i+2]);
    }

    // Make sure @ past ante context doesn't enter ante context
    expect("a+ {b} > | @@ c; x > y; (a+ c) > '(' $1 ')';",
           "xxxab",
           "xxx(ac)",
           new Transliterator.Position(0, 5, 3, 5));

    // Make sure @ past post context doesn't pass limit
    expect("{b} a+ > c @@ |; x > y; a > A;",
           "baxx",
           "caxx",
           new Transliterator.Position(0, 4, 0, 2));

    String[] wholeStringCases = {
        // Make sure @ past post context doesn't enter post context
        "{b} a+ > c @@ |; x > y; a > A;",
        "baxx",
        "cayy",

        "(ab)? c > d;",
        "c abc ababc",
        "d d abd",

        "(ab)+ {x} > '(' $1 ')';",
        "x abx ababxy",
        "x ab(ab) abab(abab)y",

        "b+ > x;",
        "ac abc abbc abbbc",
        "ac axc axc axc",

        "[abc]+ > x;",
        "qac abrc abbcs abtbbc",
        "qx xrx xs xtx",

        "q{(ab)+} > x;",
        "qa qab qaba qababc qaba",
        "qa qx qxa qxc qxa",

        "q(ab)* > x;",
        "qa qab qaba qababc",
        "xa x xa xc",

        // Oddity -- "(foo)* > $1" causes $1 to match the run of "foo"s
        // In perl, it only matches the first occurrence, so the output
        // is "()a (ab) (ab)a (ab)c".
        "q(ab)* > '(' $1 ')';",
        "qa qab qaba qababc",
        "()a (ab) (ab)a (abab)c",

        // 'foo'+ and 'foo'* -- the quantifier should apply to the entire
        // quoted string
        "'ab'+ > x;",
        "bb ab ababb",
        "bb x xb",

        // $foo+ and $foo* -- the quantifier should apply to the entire
        // variable reference
        "$var = ab; $var+ > x;",
        "bb ab ababb",
        "bb x xb",
    };
    for (int i = 0; i < wholeStringCases.length; i += 3) {
        expect(wholeStringCases[i], wholeStringCases[i+1], wholeStringCases[i+2]);
    }
}
/**
 * Placeholder; the body is intentionally empty and asserts nothing.
 * NOTE(review): the _TODO suffix suggests a test that still has to be
 * written -- confirm what "STV" should cover before filling it in.
 */
public void TestSTV_TODO() {
}
/**
* Test inverse of Greek-Latin; Title()
*/
@ -1159,9 +1227,92 @@ public class TransliteratorTest extends TestFmwk {
}
}
//======================================================================
// icu4j only
//======================================================================
/**
 * Inverse of "Null" should be "Null". (J21)
 */
public void TestNullInverse() {
    Transliterator inverse = Transliterator.getInstance("Null").getInverse();
    boolean isNull = inverse.getID().equals("Null");
    if (!isNull) {
        errln("FAIL: Inverse of Null should be Null");
    }
}
/**
 * Check ID of inverse of alias. (J22)
 * The inverse of "Latin-Hangul" must report the ID "Hangul-Latin".
 */
public void TestAliasInverseID() {
    // This should be any alias ID with an inverse
    final String ID = "Latin-Hangul";
    final String exp = "Hangul-Latin";
    Transliterator forward = Transliterator.getInstance(ID);
    String got = forward.getInverse().getID();
    if (got.equals(exp)) {
        return;
    }
    errln("FAIL: Inverse of " + ID + " is " + got +
          ", expected " + exp);
}
/**
 * Test IDs of inverses of compound transliterators. (J20)
 * The inverse of "Latin-Jamo;NFC(NFD)" must report the ID
 * "NFD(NFC);Jamo-Latin".
 */
public void TestCompoundInverseID() {
    final String ID = "Latin-Jamo;NFC(NFD)";
    final String exp = "NFD(NFC);Jamo-Latin";
    Transliterator forward = Transliterator.getInstance(ID);
    String got = forward.getInverse().getID();
    if (got.equals(exp)) {
        return;
    }
    errln("FAIL: Inverse of " + ID + " is " + got +
          ", expected " + exp);
}
/**
 * Test undefined variable.  Building a RuleBasedTransliterator from a
 * rule that references the never-defined variable $initial is expected
 * to throw IllegalArgumentException; compiling without error is a
 * failure.
 */
public void TestUndefinedVariable() {
    final String rule = "$initial } a <> \u1161;";
    boolean rejected = false;
    try {
        // Only the constructor's side effect (the exception) matters.
        new RuleBasedTransliterator("<ID>", rule);
    } catch (IllegalArgumentException e) {
        rejected = true;
        logln("OK: Got exception for " + rule + ", as expected: " +
              e.getMessage());
    }
    if (!rejected) {
        errln("Fail: bogus rule " + rule + " compiled without error");
    }
}
/**
 * Test empty context: a rule whose key is wrapped in braces but has no
 * context text on either side must still rewrite every 'a'.
 */
public void TestEmptyContext() {
    String rules = " { a } > b;";
    String input = "xay a ";
    String expected = "xby b ";
    expect(rules, input, expected);
}
/**
 * Walks every ID reported by Transliterator.getAvailableIDs() and logs
 * it together with its display name.  Nothing is asserted; the test
 * only fails if one of the calls throws.
 */
public void TestDisplayName() {
    Enumeration ids = Transliterator.getAvailableIDs();
    while (ids.hasMoreElements()) {
        String id = (String) ids.nextElement();
        logln(id + " -> " + Transliterator.getDisplayName(id));
    }
}
//======================================================================
// Support methods
//======================================================================
/**
 * Convenience overload: compiles rules into a RuleBasedTransliterator
 * with the ID "<ID>" and delegates to the Transliterator-based
 * expect() with the same source, expected result and position.
 *
 * @param rules          rule source to compile
 * @param source         text to transliterate
 * @param expectedResult text expected after transliteration
 * @param pos            position passed through to the delegate
 */
void expect(String rules,
            String source,
            String expectedResult,
            Transliterator.Position pos) {
    expect(new RuleBasedTransliterator("<ID>", rules),
           source, expectedResult, pos);
}
void expect(String rules, String source, String expectedResult) {
expect(new RuleBasedTransliterator("<ID>", rules), source, expectedResult);
@ -1176,33 +1327,53 @@ public class TransliteratorTest extends TestFmwk {
}
void expect(Transliterator t, String source, String expectedResult) {
String result = t.transliterate(source);
expectAux(t.getID() + ":String", source, result, expectedResult);
expect(t, source, expectedResult, (Transliterator.Position) null);
}
void expect(Transliterator t, String source, String expectedResult,
Transliterator.Position pos) {
if (pos == null) {
String result = t.transliterate(source);
expectAux(t.getID() + ":String", source, result, expectedResult);
}
Transliterator.Position index = null;
if (pos == null) {
index = new Transliterator.Position();
} else {
index = new Transliterator.Position(pos.contextStart, pos.contextLimit,
pos.start, pos.limit);
}
ReplaceableString rsource = new ReplaceableString(source);
t.transliterate(rsource);
result = rsource.toString();
if (pos == null) {
t.transliterate(rsource);
} else {
// Do it all at once -- below we do it incrementally
t.finishTransliteration(rsource, pos);
}
String result = rsource.toString();
expectAux(t.getID() + ":Replaceable", source, result, expectedResult);
// Test keyboard (incremental) transliteration -- this result
// must be the same after we finalize (see below).
rsource.replace(0, rsource.length(), "");
Transliterator.Position index = new Transliterator.Position();
StringBuffer log = new StringBuffer();
for (int i=0; i<source.length(); ++i) {
if (i != 0) {
log.append(" + ");
rsource.replace(0, rsource.length(), "");
if (pos != null) {
rsource.replace(0, 0, source);
formatInput(log, rsource, index);
log.append(" -> ");
t.transliterate(rsource, index);
formatInput(log, rsource, index);
} else {
for (int i=0; i<source.length(); ++i) {
if (i != 0) {
log.append(" + ");
}
log.append(source.charAt(i)).append(" -> ");
t.transliterate(rsource, index, source.charAt(i));
formatInput(log, rsource, index);
}
log.append(source.charAt(i)).append(" -> ");
t.transliterate(rsource, index,
String.valueOf(source.charAt(i)));
// Append the string buffer with a vertical bar '|' where
// the committed index is.
String s = rsource.toString();
log.append(s.substring(0, index.start)).
append('|').
append(s.substring(index.start));
}
// As a final step in keyboard transliteration, we must call
@ -1217,6 +1388,41 @@ public class TransliteratorTest extends TestFmwk {
expectedResult);
}
/**
 * Appends a rendering of input annotated with the four indices of pos.
 * The output has the form aaa{bbb|ccc|ddd}eee, where the braces mark
 * contextStart and contextLimit and the bars mark start and limit.
 * If the indices are not ordered
 * 0 <= contextStart <= start <= limit <= contextLimit <= input.length(),
 * a diagnostic listing the four values and the input is appended
 * instead.
 *
 * @param appendTo result is appended to this param.
 * @param input the string being transliterated
 * @param pos the index struct
 * @return appendTo
 */
StringBuffer formatInput(StringBuffer appendTo,
                         final ReplaceableString input,
                         final Transliterator.Position pos) {
    boolean wellFormed =
        0 <= pos.contextStart &&
        pos.contextStart <= pos.start &&
        pos.start <= pos.limit &&
        pos.limit <= pos.contextLimit &&
        pos.contextLimit <= input.length();

    if (!wellFormed) {
        appendTo.append("INVALID Transliterator.Position {cs=" +
                        pos.contextStart + ", s=" + pos.start + ", l=" +
                        pos.limit + ", cl=" + pos.contextLimit + "} on " +
                        input);
        return appendTo;
    }

    // Slice first, then append, so nothing is written unless every
    // slice succeeded.
    String beforeContext = input.substring(0, pos.contextStart);
    String anteContext = input.substring(pos.contextStart, pos.start);
    String active = input.substring(pos.start, pos.limit);
    String postContext = input.substring(pos.limit, pos.contextLimit);
    String afterContext = input.substring(pos.contextLimit, input.length());

    appendTo.append(beforeContext);
    appendTo.append('{');
    appendTo.append(anteContext);
    appendTo.append('|');
    appendTo.append(active);
    appendTo.append('|');
    appendTo.append(postContext);
    appendTo.append('}');
    appendTo.append(afterContext);
    return appendTo;
}
void expectAux(String tag, String source,
String result, String expectedResult) {
expectAux(tag, source + " -> " + result,

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/ReplaceableString.java,v $
* $Date: 2000/04/25 17:17:37 $
* $Revision: 1.3 $
* $Date: 2001/10/03 00:14:22 $
* $Revision: 1.4 $
*
*****************************************************************************************
*/
@ -24,7 +24,7 @@ package com.ibm.text;
*
* @see Replaceable
* @author Alan Liu
* @version $RCSfile: ReplaceableString.java,v $ $Revision: 1.3 $ $Date: 2000/04/25 17:17:37 $
* @version $RCSfile: ReplaceableString.java,v $ $Revision: 1.4 $ $Date: 2001/10/03 00:14:22 $
*/
public class ReplaceableString implements Replaceable {
private StringBuffer buf;
@ -67,6 +67,13 @@ public class ReplaceableString implements Replaceable {
return buf.toString();
}
/**
 * Return the characters of this object from index start up to, but
 * not including, index limit, as a new String.
 *
 * @param start index of the first character to include
 * @param limit index just past the last character to include
 * @return the requested substring of the underlying buffer
 */
public String substring(int start, int limit) {
    final String slice = buf.substring(start, limit);
    return slice;
}
/**
* Return the number of characters contained in this object.
* <code>Replaceable</code> API.

View File

@ -4,9 +4,9 @@
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/RuleBasedTransliterator.java,v $
* $Date: 2001/09/26 18:00:06 $
* $Revision: 1.46 $
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/RuleBasedTransliterator.java,v $
* $Date: 2001/10/03 00:14:22 $
* $Revision: 1.47 $
*
*****************************************************************************************
*/
@ -27,18 +27,18 @@ import com.ibm.text.resources.ResourceReader;
* Whitespace, as defined by <code>Character.isWhitespace()</code>,
* is ignored. If the first non-blank character on a line is '#',
* the entire line is ignored as a comment. </p>
*
*
* <p>Each set of rules consists of two groups, one forward, and one
* reverse. This is a convention that is not enforced; rules for one
* direction may be omitted, with the result that translations in
* that direction will not modify the source text. In addition,
* bidirectional forward-reverse rules may be specified for
* symmetrical transformations.</p>
*
*
* <p><b>Rule syntax</b> </p>
*
*
* <p>Rule statements take one of the following forms: </p>
*
*
* <dl>
* <dt><code>$alefmadda=\u0622;</code></dt>
* <dd><strong>Variable definition.</strong> The name on the
@ -66,7 +66,7 @@ import com.ibm.text.resources.ResourceReader;
* the string on the left when performing reverse
* transliteration.</dd>
* </dl>
*
*
* <dl>
* <dt><code>ai&lt;&gt;$alefmadda;</code></dt>
* <dd><strong>Bidirectional translation rule.</strong> This
@ -75,7 +75,7 @@ import com.ibm.text.resources.ResourceReader;
* transliteration, and vice versa when performing reverse
* transliteration.</dd>
* </dl>
*
*
* <p>Translation rules consist of a <em>match pattern</em> and an <em>output
* string</em>. The match pattern consists of literal characters,
* optionally preceded by context, and optionally followed by
@ -92,7 +92,7 @@ import com.ibm.text.resources.ResourceReader;
* (or &quot;<code>123}456</code>&quot;) in which the literal
* pattern &quot;<code>123</code>&quot; must be followed by &quot;<code>456</code>&quot;.
* </p>
*
*
* <p>The output string of a forward or reverse rule consists of
* characters to replace the literal pattern characters. If the
* output string contains the character '<code>|</code>', this is
@ -102,59 +102,59 @@ import com.ibm.text.resources.ResourceReader;
* placed within the replacement text; however, it can actually be
* placed into the preceding or following context by using the
* special character '<code>@</code>'. Examples:</p>
*
*
* <blockquote>
* <p><code>a {foo} z &gt; | @ bar; # foo -&gt; bar, move cursor
* before a<br>
* {foo} xyz &gt; bar @@|; #&nbsp;foo -&gt; bar, cursor between
* y and z</code></p>
* </blockquote>
*
*
* <p><b>UnicodeSet</b></p>
*
*
* <p><code>UnicodeSet</code> patterns may appear anywhere that
* makes sense. They may appear in variable definitions.
* Contrariwise, <code>UnicodeSet</code> patterns may themselves
* contain variable references, such as &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;,
* or &quot;<code>$range=a-z;$ll=[$range]</code>&quot;.</p>
*
*
* <p><code>UnicodeSet</code> patterns may also be embedded directly
* into rule strings. Thus, the following two rules are equivalent:</p>
*
*
* <blockquote>
* <p><code>$vowel=[aeiou]; $vowel&gt;'*'; # One way to do this<br>
* [aeiou]&gt;'*';
* &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
* Another way</code></p>
* </blockquote>
*
*
* <p>See {@link UnicodeSet} for more documentation and examples.</p>
*
*
* <p><b>Segments</b></p>
*
*
* <p>Segments of the input string can be matched and copied to the
* output string. This makes certain sets of rules simpler and more
* general, and makes reordering possible. For example:</p>
*
*
* <blockquote>
* <p><code>([a-z]) &gt; $1 $1;
* &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
* double lowercase letters<br>
* ([:Lu:]) ([:Ll:]) &gt; $2 $1; # reverse order of Lu-Ll pairs</code></p>
* </blockquote>
*
*
* <p>The segment of the input string to be copied is delimited by
* &quot;<code>(</code>&quot; and &quot;<code>)</code>&quot;. Up to
* nine segments may be defined. Segments may not overlap. In the
* output string, &quot;<code>$1</code>&quot; through &quot;<code>$9</code>&quot;
* represent the input string segments, in left-to-right order of
* definition.</p>
*
*
* <p><b>Anchors</b></p>
*
*
* <p>Patterns can be anchored to the beginning or the end of the text. This is done with the
* special characters '<code>^</code>' and '<code>$</code>'. For example:</p>
*
*
* <blockquote>
* <p><code>^ a&nbsp;&nbsp; &gt; 'BEG_A'; &nbsp;&nbsp;# match 'a' at start of text<br>
* &nbsp; a&nbsp;&nbsp; &gt; 'A';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances
@ -163,24 +163,24 @@ import com.ibm.text.resources.ResourceReader;
* &nbsp; z&nbsp;&nbsp; &gt; 'Z';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances
* of 'z'</code></p>
* </blockquote>
*
*
* <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>.
* This is done by including a virtual anchor character '<code>$</code>' at the end of the
* set pattern. Although this is usually the match character for the end anchor, the set will
* match either the beginning or the end of the text, depending on its placement. For
* example:</p>
*
*
* <blockquote>
* <p><code>$x = [a-z$]; &nbsp;&nbsp;# match 'a' through 'z' OR anchor<br>
* $x 1&nbsp;&nbsp;&nbsp; &gt; 2;&nbsp;&nbsp; # match '1' after a-z or at the start<br>
* &nbsp;&nbsp; 3 $x &gt; 4; &nbsp;&nbsp;# match '3' before a-z or at the end</code></p>
* </blockquote>
*
*
* <p><b>Example</b> </p>
*
*
* <p>The following example rules illustrate many of the features of
* the rule language. </p>
*
*
* <table border="0" cellpadding="4">
* <tr>
* <td valign="top">Rule 1.</td>
@ -195,10 +195,10 @@ import com.ibm.text.resources.ResourceReader;
* <td valign="top" nowrap><code>yz&gt;q</code></td>
* </tr>
* </table>
*
*
* <p>Applying these rules to the string &quot;<code>adefabcdefz</code>&quot;
* yields the following results: </p>
*
*
* <table border="0" cellpadding="4">
* <tr>
* <td valign="top" nowrap><code>|adefabcdefz</code></td>
@ -251,23 +251,23 @@ import com.ibm.text.resources.ResourceReader;
* transliteration is complete.</td>
* </tr>
* </table>
*
*
* <p>The order of rules is significant. If multiple rules may match
* at some point, the first matching rule is applied. </p>
*
*
* <p>Forward and reverse rules may have an empty output string.
* Otherwise, an empty left or right hand side of any statement is a
* syntax error. </p>
*
*
* <p>Single quotes are used to quote any character other than a
* digit or letter. To specify a single quote itself, inside or
* outside of quotes, use two single quotes in a row. For example,
* the rule &quot;<code>'&gt;'&gt;o''clock</code>&quot; changes the
* string &quot;<code>&gt;</code>&quot; to the string &quot;<code>o'clock</code>&quot;.
* </p>
*
*
* <p><b>Notes</b> </p>
*
*
* <p>While a RuleBasedTransliterator is being built, it checks that
* the rules are added in proper order. For example, if the rule
* &quot;a&gt;x&quot; is followed by the rule &quot;ab&gt;y&quot;,
@ -275,11 +275,11 @@ import com.ibm.text.resources.ResourceReader;
* the second rule can never be triggered, since the first rule
* always matches anything it matches. In other words, the first
* rule <em>masks</em> the second rule. </p>
*
*
* <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
*
*
* @author Alan Liu
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.46 $ $Date: 2001/09/26 18:00:06 $
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.47 $ $Date: 2001/10/03 00:14:22 $
*/
public class RuleBasedTransliterator extends Transliterator {
@ -433,7 +433,7 @@ public class RuleBasedTransliterator extends Transliterator {
* stored in the rule text to represent the set of characters.
* variables[i] represents character (variablesBase + i).
*/
UnicodeSet[] variables;
UnicodeMatcher[] variables;
/**
* The character that represents variables[0]. Characters
@ -498,6 +498,9 @@ public class RuleBasedTransliterator extends Transliterator {
/**
* $Log: RuleBasedTransliterator.java,v $
* Revision 1.47 2001/10/03 00:14:22 alan
* jitterbug 73: finish quantifier and supplemental char support
*
* Revision 1.46 2001/09/26 18:00:06 alan
* jitterbug 67: sync parser with icu4c, allow unlimited, nested segments
*

View File

@ -4,9 +4,9 @@
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $
* $Date: 2001/09/26 18:00:06 $
* $Revision: 1.28 $
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $
* $Date: 2001/10/03 00:14:23 $
* $Revision: 1.29 $
*
*****************************************************************************************
*/
@ -28,7 +28,7 @@ import com.ibm.util.Utility;
* may contain variables. Variables represent a set of Unicode
* characters, such as the letters <i>a</i> through <i>z</i>.
* Variables are detected by looking up each character in a supplied
* variable list to see if it has been so defined.
* variable list to see if it has been so defined.
*
* <p>A rule may contain segments in its input string and segment references in
* its output string. A segment is a substring of the input pattern, indicated
@ -44,7 +44,7 @@ import com.ibm.util.Utility;
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
*
* @author Alan Liu
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.28 $ $Date: 2001/09/26 18:00:06 $
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.29 $ $Date: 2001/10/03 00:14:23 $
*/
class TransliterationRule {
@ -310,7 +310,7 @@ class TransliterationRule {
* r1: aakkkpppp
* r2: aaakkkkkpppp
* ^
*
*
* The strings must be aligned at the first character of the
* key. The length of r1 to the left of the alignment point
* must be <= the length of r2 to the left; ditto for the
@ -346,10 +346,10 @@ class TransliterationRule {
int left2 = r2.anteContextLength;
int right = pattern.length() - left;
int right2 = r2.pattern.length() - left2;
// TODO Clean this up -- some logic might be combinable with the
// next statement.
// Test for anchor masking
if (left == left2 && right == right2 &&
keyLength <= r2.keyLength &&
@ -371,7 +371,7 @@ class TransliterationRule {
pos - UTF16.getCharCount(UTF16.charAt(str, pos-1)) :
pos - 1;
}
static final int posAfter(Replaceable str, int pos) {
return (pos >= 0 && pos < str.length()) ?
pos + UTF16.getCharCount(UTF16.charAt(str, pos)) :
@ -387,10 +387,10 @@ class TransliterationRule {
* context and key characters match, but the text is not long
* enough to match all of them. A full match means all context
* and key characters match.
*
*
* If a full match is obtained, perform a replacement, update pos,
* and return U_MATCH. Otherwise both text and pos are unchanged.
*
*
* @param text the text
* @param pos the position indices
* @param incremental if true, test for partial matches that may
@ -559,13 +559,13 @@ class TransliterationRule {
if (segments == null) {
text.replace(pos.start, keyLimit, output);
lenDelta = output.length() - (keyLimit - pos.start);
if (cursorPos >= 0 && cursorPos < keyLength) {
// Within the key, the cursor refers to 16-bit code units
if (cursorPos >= 0 && cursorPos <= output.length()) {
// Within the output string, the cursor refers to 16-bit code units
newStart = pos.start + cursorPos;
} else {
newStart = pos.start;
int n = cursorPos;
// Outside the key, cursorPos counts code points
// Outside the output string, cursorPos counts code points
while (n > 0) {
newStart += UTF16.getCharCount(UTF16.charAt(text, newStart));
--n;
@ -638,7 +638,7 @@ class TransliterationRule {
}
}
}
oText += lenDelta;
pos.limit += lenDelta;
pos.contextLimit += lenDelta;
@ -665,11 +665,11 @@ class TransliterationRule {
* cleared out by, at the end, calling this method with a literal
* character.
*/
protected void appendToRule(StringBuffer rule,
int c,
boolean isLiteral,
boolean escapeUnprintable,
StringBuffer quoteBuf) {
static void appendToRule(StringBuffer rule,
int c,
boolean isLiteral,
boolean escapeUnprintable,
StringBuffer quoteBuf) {
// If we are escaping unprintables, then escape them outside
// quotes. <backslash>u and <backslash>U are not recognized within quotes. The same
// logic applies to literals, but literals are never escaped.
@ -745,11 +745,11 @@ class TransliterationRule {
//System.out.println("rule=" + rule.toString() + " qb=" + quoteBuf.toString());
}
protected final void appendToRule(StringBuffer rule,
String text,
boolean isLiteral,
boolean escapeUnprintable,
StringBuffer quoteBuf) {
static final void appendToRule(StringBuffer rule,
String text,
boolean isLiteral,
boolean escapeUnprintable,
StringBuffer quoteBuf) {
for (int i=0; i<text.length(); ++i) {
appendToRule(rule, text.charAt(i), isLiteral, escapeUnprintable, quoteBuf);
}
@ -764,7 +764,7 @@ class TransliterationRule {
*/
public String toRule(boolean escapeUnprintable) {
int i;
StringBuffer rule = new StringBuffer();
// iseg indexes into segments[] directly (not offset from FSPI)
@ -863,7 +863,7 @@ class TransliterationRule {
if (show) {
rule.append((char)(48+d));
}
}
}
rule.append(' ');
}
}
@ -905,6 +905,9 @@ class TransliterationRule {
/**
* $Log: TransliterationRule.java,v $
* Revision 1.29 2001/10/03 00:14:23 alan
* jitterbug 73: finish quantifier and supplemental char support
*
* Revision 1.28 2001/09/26 18:00:06 alan
* jitterbug 67: sync parser with icu4c, allow unlimited, nested segments
*

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/Transliterator.java,v $
* $Date: 2001/09/28 20:37:09 $
* $Revision: 1.43 $
* $Date: 2001/10/03 00:14:23 $
* $Revision: 1.44 $
*
*****************************************************************************************
*/
@ -241,7 +241,7 @@ import com.ibm.util.CaseInsensitiveString;
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
*
* @author Alan Liu
* @version $RCSfile: Transliterator.java,v $ $Revision: 1.43 $ $Date: 2001/09/28 20:37:09 $
* @version $RCSfile: Transliterator.java,v $ $Revision: 1.44 $ $Date: 2001/10/03 00:14:23 $
*/
public abstract class Transliterator {
/**
@ -553,6 +553,15 @@ public abstract class Transliterator {
index.contextLimit += insertion.length();
}
char last = (text.length() > 0) ?
text.charAt(text.length() - 1) : 0;
if (UTF16.isLeadSurrogate(last)) {
// Oops, text currently ends with a lone lead surrogate (this
// checks the last code unit of text, not just the insertion).
// Don't transliterate until more text comes in.
return;
}
filteredTransliterate(text, index, true);
// This doesn't work once we add quantifier support. Need to rewrite

View File

@ -142,6 +142,10 @@ class TransliteratorParser {
private static final char CURSOR_OFFSET = '@';
private static final char ANCHOR_START = '^';
private static final char KLEENE_STAR = '*';
private static final char ONE_OR_MORE = '+';
private static final char ZERO_OR_ONE = '?';
// By definition, the ANCHOR_END special character is a
// trailing SymbolTable.SYMBOL_REF character.
// private static final char ANCHOR_END = '$';
@ -382,7 +386,7 @@ class TransliteratorParser {
idBlock = idBlockResult.toString();
// Convert the set vector to an array
data.variables = new UnicodeSet[variablesVector.size()];
data.variables = new UnicodeMatcher[variablesVector.size()];
variablesVector.copyInto(data.variables);
variablesVector = null;
@ -658,7 +662,7 @@ class TransliteratorParser {
int varStart = -1; // Most recent $variableReference
int varLimit = -1;
int[] iref = new int[1];
main:
while (pos < limit && !done) {
char c = rule.charAt(pos++);
@ -853,56 +857,71 @@ class TransliteratorParser {
}
}
break;
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
// TODO Add quantifier parsing
case KLEENE_STAR:
case ONE_OR_MORE:
case ZERO_OR_ONE:
// Quantifiers. We handle single characters, quoted strings,
// variable references, and segments.
// a+ matches aaa
// 'foo'+ matches foofoofoo
// $v+ matches xyxyxy if $v == xy
// (seg)+ matches segsegseg
{
int qstart, qlimit;
boolean[] isOpenParen = new boolean[1];
boolean isSegment = false;
if (segments != null &&
segments.getLastParenOffset(isOpenParen) == buf.length()) {
// The quantifier (*, + or ?) immediately follows a segment
if (isOpenParen[0]) {
syntaxError("Misplaced quantifier", rule, start);
}
int[] startparam = new int[1];
int[] limitparam = new int[1];
if (!segments.extractLastParenSubstring(startparam, limitparam)) {
syntaxError("Mismatched segment delimiters", rule, start);
}
qstart = startparam[0];
qlimit = limitparam[0];
isSegment = true;
} else {
// The quantifier (*, + or ?) follows an isolated character,
// a quoted string, or a variable reference
if (buf.length() == quoteLimit) {
// The quantifier follows a 'quoted string'
qstart = quoteStart;
qlimit = quoteLimit;
} else if (buf.length() == varLimit) {
// The quantifier follows a $variableReference
qstart = varStart;
qlimit = varLimit;
} else {
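// The quantifier follows a single character.
// NOTE(review): if buf is empty here, qstart becomes -1 --
// confirm that a leading quantifier is rejected earlier.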
qstart = buf.length() - 1;
qlimit = qstart + 1;
}
}
UnicodeMatcher m =
new StringMatcher(buf.toString(), qstart, qlimit,
isSegment, parser.data);
int min = 0;
int max = Quantifier.MAX;
switch (c) {
case ONE_OR_MORE:
min = 1;
break;
case ZERO_OR_ONE:
min = 0;
max = 1;
break;
// case KLEENE_STAR:
// nothing to do -- min (0) and max (Quantifier.MAX) are already set
}
m = new Quantifier(m, min, max);
buf.setLength(qstart);
buf.append(parser.generateStandInFor(m));
}
break;
// case SET_CLOSE:
default:
// Disallow unquoted characters other than [0-9A-Za-z]
@ -947,7 +966,7 @@ class TransliteratorParser {
//----------------------------------------------------------------------
// END RuleHalf
//----------------------------------------------------------------------
/**
* MAIN PARSER. Parse the next rule in the given rule string, starting
* at pos. Return the index after the last character parsed. Do not