ICU-73 finish quantifier and supplemental char support
X-SVN-Rev: 6003
This commit is contained in:
parent
40694d1edc
commit
a56c858f03
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
|
||||
* $Date: 2001/09/28 05:47:30 $
|
||||
* $Revision: 1.47 $
|
||||
* $Date: 2001/10/03 00:18:23 $
|
||||
* $Revision: 1.48 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -77,14 +77,6 @@ public class TransliteratorTest extends TestFmwk {
|
||||
logln("Elapsed time: " + ms + " ms");
|
||||
}
|
||||
|
||||
public void TestDisplayName() {
|
||||
String ID;
|
||||
for (Enumeration e = Transliterator.getAvailableIDs(); e.hasMoreElements(); ) {
|
||||
ID = (String) e.nextElement();
|
||||
logln(ID + " -> " + Transliterator.getDisplayName(ID));
|
||||
}
|
||||
}
|
||||
|
||||
public void TestSimpleRules() {
|
||||
/* Example: rules 1. ab>x|y
|
||||
* 2. yc>z
|
||||
@ -131,29 +123,6 @@ public class TransliteratorTest extends TestFmwk {
|
||||
expect(t, "abcdefgABCDEFGU", "&bcd&fg!^**!^*&");
|
||||
}
|
||||
|
||||
/**
|
||||
* Test undefined variable.
|
||||
*/
|
||||
public void TestUndefinedVariable() {
|
||||
String rule = "$initial } a <> \u1161;";
|
||||
try {
|
||||
Transliterator t = new RuleBasedTransliterator("<ID>", rule);
|
||||
t = null;
|
||||
} catch (IllegalArgumentException e) {
|
||||
logln("OK: Got exception for " + rule + ", as expected: " +
|
||||
e.getMessage());
|
||||
return;
|
||||
}
|
||||
errln("Fail: bogus rule " + rule + " compiled without error");
|
||||
}
|
||||
|
||||
/**
|
||||
* Test empty context.
|
||||
*/
|
||||
public void TestEmptyContext() {
|
||||
expect(" { a } > b;", "xay a ", "xby b ");
|
||||
}
|
||||
|
||||
/**
|
||||
* Test inline set syntax and set variable syntax.
|
||||
*/
|
||||
@ -524,6 +493,9 @@ public class TransliteratorTest extends TestFmwk {
|
||||
expect(hex3, "012", "012");
|
||||
}
|
||||
|
||||
public void TestJ329_TODO() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Test segments and segment references.
|
||||
*/
|
||||
@ -681,6 +653,9 @@ public class TransliteratorTest extends TestFmwk {
|
||||
|
||||
}
|
||||
|
||||
public void TestCopyJ476_TODO() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Test inter-Indic transliterators. These are composed.
|
||||
*/
|
||||
@ -700,47 +675,6 @@ public class TransliteratorTest extends TestFmwk {
|
||||
expect(dg, dev, guj);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test IDs of inverses of compound transliterators. (J20)
|
||||
*/
|
||||
public void TestCompoundInverseID() {
|
||||
String ID = "Latin-Jamo;NFC(NFD)";
|
||||
Transliterator t = Transliterator.getInstance(ID);
|
||||
Transliterator u = t.getInverse();
|
||||
String exp = "NFD(NFC);Jamo-Latin";
|
||||
String got = u.getID();
|
||||
if (!got.equals(exp)) {
|
||||
errln("FAIL: Inverse of " + ID + " is " + got +
|
||||
", expected " + exp);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Inverse of "Null" should be "Null". (J21)
|
||||
*/
|
||||
public void TestNullInverse() {
|
||||
Transliterator t = Transliterator.getInstance("Null");
|
||||
Transliterator u = t.getInverse();
|
||||
if (!u.getID().equals("Null")) {
|
||||
errln("FAIL: Inverse of Null should be Null");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check ID of inverse of alias. (J22)
|
||||
*/
|
||||
public void TestAliasInverseID() {
|
||||
String ID = "Latin-Hangul"; // This should be any alias ID with an inverse
|
||||
Transliterator t = Transliterator.getInstance(ID);
|
||||
Transliterator u = t.getInverse();
|
||||
String exp = "Hangul-Latin";
|
||||
String got = u.getID();
|
||||
if (!got.equals(exp)) {
|
||||
errln("FAIL: Inverse of " + ID + " is " + got +
|
||||
", expected " + exp);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test filter syntax in IDs. (J23)
|
||||
*/
|
||||
@ -785,118 +719,6 @@ public class TransliteratorTest extends TestFmwk {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the "Remove" transliterator.
|
||||
*/
|
||||
public void TestRemove() {
|
||||
Transliterator t = Transliterator.getInstance("Remove[aeiou]");
|
||||
expect(t, "The quick brown fox.",
|
||||
"Th qck brwn fx.");
|
||||
}
|
||||
|
||||
public void TestToRules() {
|
||||
String RBT = "rbt";
|
||||
String SET = "set";
|
||||
String[] DATA = {
|
||||
RBT,
|
||||
"$a=\\u4E61; [$a] > A;",
|
||||
"[\\u4E61] > A;",
|
||||
|
||||
RBT,
|
||||
"$white=[[:Zs:][:Zl:]]; $white{a} > A;",
|
||||
"[[:Zs:][:Zl:]]{a} > A;",
|
||||
|
||||
SET,
|
||||
"[[:Zs:][:Zl:]]",
|
||||
"[[:Zs:][:Zl:]]",
|
||||
|
||||
SET,
|
||||
"[:Ps:]",
|
||||
"[:Ps:]",
|
||||
|
||||
SET,
|
||||
"[:L:]",
|
||||
"[:L:]",
|
||||
|
||||
SET,
|
||||
"[[:L:]-[A]]",
|
||||
"[[:L:]-[A]]",
|
||||
|
||||
SET,
|
||||
"[~[:Lu:][:Ll:]]",
|
||||
"[~[:Lu:][:Ll:]]",
|
||||
|
||||
SET,
|
||||
"[~[a-z]]",
|
||||
"[~[a-z]]",
|
||||
|
||||
RBT,
|
||||
"$white=[:Zs:]; $black=[^$white]; $black{a} > A;",
|
||||
"[^[:Zs:]]{a} > A;",
|
||||
|
||||
RBT,
|
||||
"$a=[:Zs:]; $b=[[a-z]-$a]; $b{a} > A;",
|
||||
"[[a-z]-[:Zs:]]{a} > A;",
|
||||
|
||||
RBT,
|
||||
"$a=[:Zs:]; $b=[$a&[a-z]]; $b{a} > A;",
|
||||
"[[:Zs:]&[a-z]]{a} > A;",
|
||||
|
||||
RBT,
|
||||
"$a=[:Zs:]; $b=[x$a]; $b{a} > A;",
|
||||
"[x[:Zs:]]{a} > A;",
|
||||
};
|
||||
|
||||
for (int d=0; d < DATA.length; d+=3) {
|
||||
if (DATA[d] == RBT) {
|
||||
// Transliterator test
|
||||
Transliterator t = Transliterator.createFromRules("ID",
|
||||
DATA[d+1], Transliterator.FORWARD);
|
||||
if (t == null) {
|
||||
errln("FAIL: createFromRules failed");
|
||||
return;
|
||||
}
|
||||
String rules, escapedRules;
|
||||
rules = t.toRules(false);
|
||||
escapedRules = t.toRules(true);
|
||||
String expRules = Utility.unescape(DATA[d+2]);
|
||||
String expEscapedRules = DATA[d+2];
|
||||
if (rules.equals(expRules)) {
|
||||
logln("Ok: " + DATA[d+1] +
|
||||
" => " + Utility.escape(rules));
|
||||
} else {
|
||||
errln("FAIL: " + DATA[d+1] +
|
||||
" => " + Utility.escape(rules + ", exp " + expRules));
|
||||
}
|
||||
if (escapedRules.equals(expEscapedRules)) {
|
||||
logln("Ok: " + DATA[d+1] +
|
||||
" => " + escapedRules);
|
||||
} else {
|
||||
errln("FAIL: " + DATA[d+1] +
|
||||
" => " + escapedRules + ", exp " + expEscapedRules);
|
||||
}
|
||||
|
||||
} else {
|
||||
// UnicodeSet test
|
||||
String pat = DATA[d+1];
|
||||
String expToPat = DATA[d+2];
|
||||
UnicodeSet set = new UnicodeSet(pat);
|
||||
|
||||
// Adjust spacing etc. as necessary.
|
||||
String toPat;
|
||||
toPat = set.toPattern(true);
|
||||
if (expToPat.equals(toPat)) {
|
||||
logln("Ok: " + pat +
|
||||
" => " + toPat);
|
||||
} else {
|
||||
errln("FAIL: " + pat +
|
||||
" => " + Utility.escape(toPat) +
|
||||
", exp " + Utility.escape(pat));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the case mapping transliterators.
|
||||
*/
|
||||
@ -966,6 +788,9 @@ public class TransliteratorTest extends TestFmwk {
|
||||
}
|
||||
}
|
||||
|
||||
public void TestCreateInstance_TODO() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the normalization transliterator.
|
||||
*/
|
||||
@ -1139,6 +964,249 @@ public class TransliteratorTest extends TestFmwk {
|
||||
}
|
||||
}
|
||||
|
||||
public void TestCompoundFilter_TODO() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the "Remove" transliterator.
|
||||
*/
|
||||
public void TestRemove() {
|
||||
Transliterator t = Transliterator.getInstance("Remove[aeiou]");
|
||||
expect(t, "The quick brown fox.",
|
||||
"Th qck brwn fx.");
|
||||
}
|
||||
|
||||
public void TestToRules() {
|
||||
String RBT = "rbt";
|
||||
String SET = "set";
|
||||
String[] DATA = {
|
||||
RBT,
|
||||
"$a=\\u4E61; [$a] > A;",
|
||||
"[\\u4E61] > A;",
|
||||
|
||||
RBT,
|
||||
"$white=[[:Zs:][:Zl:]]; $white{a} > A;",
|
||||
"[[:Zs:][:Zl:]]{a} > A;",
|
||||
|
||||
SET,
|
||||
"[[:Zs:][:Zl:]]",
|
||||
"[[:Zs:][:Zl:]]",
|
||||
|
||||
SET,
|
||||
"[:Ps:]",
|
||||
"[:Ps:]",
|
||||
|
||||
SET,
|
||||
"[:L:]",
|
||||
"[:L:]",
|
||||
|
||||
SET,
|
||||
"[[:L:]-[A]]",
|
||||
"[[:L:]-[A]]",
|
||||
|
||||
SET,
|
||||
"[~[:Lu:][:Ll:]]",
|
||||
"[~[:Lu:][:Ll:]]",
|
||||
|
||||
SET,
|
||||
"[~[a-z]]",
|
||||
"[~[a-z]]",
|
||||
|
||||
RBT,
|
||||
"$white=[:Zs:]; $black=[^$white]; $black{a} > A;",
|
||||
"[^[:Zs:]]{a} > A;",
|
||||
|
||||
RBT,
|
||||
"$a=[:Zs:]; $b=[[a-z]-$a]; $b{a} > A;",
|
||||
"[[a-z]-[:Zs:]]{a} > A;",
|
||||
|
||||
RBT,
|
||||
"$a=[:Zs:]; $b=[$a&[a-z]]; $b{a} > A;",
|
||||
"[[:Zs:]&[a-z]]{a} > A;",
|
||||
|
||||
RBT,
|
||||
"$a=[:Zs:]; $b=[x$a]; $b{a} > A;",
|
||||
"[x[:Zs:]]{a} > A;",
|
||||
};
|
||||
|
||||
for (int d=0; d < DATA.length; d+=3) {
|
||||
if (DATA[d] == RBT) {
|
||||
// Transliterator test
|
||||
Transliterator t = Transliterator.createFromRules("ID",
|
||||
DATA[d+1], Transliterator.FORWARD);
|
||||
if (t == null) {
|
||||
errln("FAIL: createFromRules failed");
|
||||
return;
|
||||
}
|
||||
String rules, escapedRules;
|
||||
rules = t.toRules(false);
|
||||
escapedRules = t.toRules(true);
|
||||
String expRules = Utility.unescape(DATA[d+2]);
|
||||
String expEscapedRules = DATA[d+2];
|
||||
if (rules.equals(expRules)) {
|
||||
logln("Ok: " + DATA[d+1] +
|
||||
" => " + Utility.escape(rules));
|
||||
} else {
|
||||
errln("FAIL: " + DATA[d+1] +
|
||||
" => " + Utility.escape(rules + ", exp " + expRules));
|
||||
}
|
||||
if (escapedRules.equals(expEscapedRules)) {
|
||||
logln("Ok: " + DATA[d+1] +
|
||||
" => " + escapedRules);
|
||||
} else {
|
||||
errln("FAIL: " + DATA[d+1] +
|
||||
" => " + escapedRules + ", exp " + expEscapedRules);
|
||||
}
|
||||
|
||||
} else {
|
||||
// UnicodeSet test
|
||||
String pat = DATA[d+1];
|
||||
String expToPat = DATA[d+2];
|
||||
UnicodeSet set = new UnicodeSet(pat);
|
||||
|
||||
// Adjust spacing etc. as necessary.
|
||||
String toPat;
|
||||
toPat = set.toPattern(true);
|
||||
if (expToPat.equals(toPat)) {
|
||||
logln("Ok: " + pat +
|
||||
" => " + toPat);
|
||||
} else {
|
||||
errln("FAIL: " + pat +
|
||||
" => " + Utility.escape(toPat) +
|
||||
", exp " + Utility.escape(pat));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void TestContext() {
|
||||
Transliterator.Position pos = new Transliterator.Position(0, 2, 0, 1); // cs cl s l
|
||||
|
||||
expect("de > x; {d}e > y;",
|
||||
"de",
|
||||
"ye",
|
||||
pos);
|
||||
|
||||
expect("ab{c} > z;",
|
||||
"xadabdabcy",
|
||||
"xadabdabzy");
|
||||
}
|
||||
|
||||
static final String CharsToUnicodeString(String s) {
|
||||
return Utility.unescape(s);
|
||||
}
|
||||
|
||||
public void TestSupplemental() {
|
||||
|
||||
expect(CharsToUnicodeString("$a=\\U00010300; $s=[\\U00010300-\\U00010323];" +
|
||||
"a > $a; $s > i;"),
|
||||
CharsToUnicodeString("ab\\U0001030Fx"),
|
||||
CharsToUnicodeString("\\U00010300bix"));
|
||||
|
||||
expect(CharsToUnicodeString("$a=[a-z\\U00010300-\\U00010323];" +
|
||||
"$b=[A-Z\\U00010400-\\U0001044D];" +
|
||||
"($a)($b) > $2 $1;"),
|
||||
CharsToUnicodeString("aB\\U00010300\\U00010400c\\U00010401\\U00010301D"),
|
||||
CharsToUnicodeString("Ba\\U00010400\\U00010300\\U00010401cD\\U00010301"));
|
||||
|
||||
// k|ax\\U00010300xm
|
||||
|
||||
// k|a\\U00010400\\U00010300xm
|
||||
// ky|\\U00010400\\U00010300xm
|
||||
// ky\\U00010400|\\U00010300xm
|
||||
|
||||
// ky\\U00010400|\\U00010300\\U00010400m
|
||||
// ky\\U00010400y|\\U00010400m
|
||||
expect(CharsToUnicodeString("$a=[a\\U00010300-\\U00010323];" +
|
||||
"$a {x} > | @ \\U00010400;" +
|
||||
"{$a} [^\\u0000-\\uFFFF] > y;"),
|
||||
CharsToUnicodeString("kax\\U00010300xm"),
|
||||
CharsToUnicodeString("ky\\U00010400y\\U00010400m"));
|
||||
}
|
||||
|
||||
public void TestQuantifier() {
|
||||
|
||||
// Make sure @ in a quantified anteContext works
|
||||
expect("a+ {b} > | @@ c; A > a; (a+ c) > '(' $1 ')';",
|
||||
"AAAAAb",
|
||||
"aaa(aac)");
|
||||
|
||||
// Make sure @ in a quantified postContext works
|
||||
expect("{b} a+ > c @@ |; (a+) > '(' $1 ')';",
|
||||
"baaaaa",
|
||||
"caa(aaa)");
|
||||
|
||||
// Make sure @ in a quantified postContext with seg ref works
|
||||
expect("{(b)} a+ > $1 @@ |; (a+) > '(' $1 ')';",
|
||||
"baaaaa",
|
||||
"baa(aaa)");
|
||||
|
||||
// Make sure @ past ante context doesn't enter ante context
|
||||
Transliterator.Position pos = new Transliterator.Position(0, 5, 3, 5);
|
||||
expect("a+ {b} > | @@ c; x > y; (a+ c) > '(' $1 ')';",
|
||||
"xxxab",
|
||||
"xxx(ac)",
|
||||
pos);
|
||||
|
||||
// Make sure @ past post context doesn't pass limit
|
||||
Transliterator.Position pos2 = new Transliterator.Position(0, 4, 0, 2);
|
||||
expect("{b} a+ > c @@ |; x > y; a > A;",
|
||||
"baxx",
|
||||
"caxx",
|
||||
pos2);
|
||||
|
||||
// Make sure @ past post context doesn't enter post context
|
||||
expect("{b} a+ > c @@ |; x > y; a > A;",
|
||||
"baxx",
|
||||
"cayy");
|
||||
|
||||
expect("(ab)? c > d;",
|
||||
"c abc ababc",
|
||||
"d d abd");
|
||||
|
||||
expect("(ab)+ {x} > '(' $1 ')';",
|
||||
"x abx ababxy",
|
||||
"x ab(ab) abab(abab)y");
|
||||
|
||||
expect("b+ > x;",
|
||||
"ac abc abbc abbbc",
|
||||
"ac axc axc axc");
|
||||
|
||||
expect("[abc]+ > x;",
|
||||
"qac abrc abbcs abtbbc",
|
||||
"qx xrx xs xtx");
|
||||
|
||||
expect("q{(ab)+} > x;",
|
||||
"qa qab qaba qababc qaba",
|
||||
"qa qx qxa qxc qxa");
|
||||
|
||||
expect("q(ab)* > x;",
|
||||
"qa qab qaba qababc",
|
||||
"xa x xa xc");
|
||||
|
||||
// Oddity -- "(foo)* > $1" causes $1 to match the run of "foo"s
|
||||
// In perl, it only matches the first occurrence, so the output
|
||||
// is "()a (ab) (ab)a (ab)c".
|
||||
expect("q(ab)* > '(' $1 ')';",
|
||||
"qa qab qaba qababc",
|
||||
"()a (ab) (ab)a (abab)c");
|
||||
|
||||
// 'foo'+ and 'foo'* -- the quantifier should apply to the entire
|
||||
// quoted string
|
||||
expect("'ab'+ > x;",
|
||||
"bb ab ababb",
|
||||
"bb x xb");
|
||||
|
||||
// $foo+ and $foo* -- the quantifier should apply to the entire
|
||||
// variable reference
|
||||
expect("$var = ab; $var+ > x;",
|
||||
"bb ab ababb",
|
||||
"bb x xb");
|
||||
}
|
||||
|
||||
public void TestSTV_TODO() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Test inverse of Greek-Latin; Title()
|
||||
*/
|
||||
@ -1159,9 +1227,92 @@ public class TransliteratorTest extends TestFmwk {
|
||||
}
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// icu4j only
|
||||
//======================================================================
|
||||
|
||||
/**
|
||||
* Inverse of "Null" should be "Null". (J21)
|
||||
*/
|
||||
public void TestNullInverse() {
|
||||
Transliterator t = Transliterator.getInstance("Null");
|
||||
Transliterator u = t.getInverse();
|
||||
if (!u.getID().equals("Null")) {
|
||||
errln("FAIL: Inverse of Null should be Null");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check ID of inverse of alias. (J22)
|
||||
*/
|
||||
public void TestAliasInverseID() {
|
||||
String ID = "Latin-Hangul"; // This should be any alias ID with an inverse
|
||||
Transliterator t = Transliterator.getInstance(ID);
|
||||
Transliterator u = t.getInverse();
|
||||
String exp = "Hangul-Latin";
|
||||
String got = u.getID();
|
||||
if (!got.equals(exp)) {
|
||||
errln("FAIL: Inverse of " + ID + " is " + got +
|
||||
", expected " + exp);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test IDs of inverses of compound transliterators. (J20)
|
||||
*/
|
||||
public void TestCompoundInverseID() {
|
||||
String ID = "Latin-Jamo;NFC(NFD)";
|
||||
Transliterator t = Transliterator.getInstance(ID);
|
||||
Transliterator u = t.getInverse();
|
||||
String exp = "NFD(NFC);Jamo-Latin";
|
||||
String got = u.getID();
|
||||
if (!got.equals(exp)) {
|
||||
errln("FAIL: Inverse of " + ID + " is " + got +
|
||||
", expected " + exp);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test undefined variable.
|
||||
*/
|
||||
public void TestUndefinedVariable() {
|
||||
String rule = "$initial } a <> \u1161;";
|
||||
try {
|
||||
Transliterator t = new RuleBasedTransliterator("<ID>", rule);
|
||||
t = null;
|
||||
} catch (IllegalArgumentException e) {
|
||||
logln("OK: Got exception for " + rule + ", as expected: " +
|
||||
e.getMessage());
|
||||
return;
|
||||
}
|
||||
errln("Fail: bogus rule " + rule + " compiled without error");
|
||||
}
|
||||
|
||||
/**
|
||||
* Test empty context.
|
||||
*/
|
||||
public void TestEmptyContext() {
|
||||
expect(" { a } > b;", "xay a ", "xby b ");
|
||||
}
|
||||
|
||||
public void TestDisplayName() {
|
||||
String ID;
|
||||
for (Enumeration e = Transliterator.getAvailableIDs(); e.hasMoreElements(); ) {
|
||||
ID = (String) e.nextElement();
|
||||
logln(ID + " -> " + Transliterator.getDisplayName(ID));
|
||||
}
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
void expect(String rules,
|
||||
String source,
|
||||
String expectedResult,
|
||||
Transliterator.Position pos) {
|
||||
Transliterator t = new RuleBasedTransliterator("<ID>", rules);
|
||||
expect(t, source, expectedResult, pos);
|
||||
}
|
||||
|
||||
void expect(String rules, String source, String expectedResult) {
|
||||
expect(new RuleBasedTransliterator("<ID>", rules), source, expectedResult);
|
||||
@ -1176,33 +1327,53 @@ public class TransliteratorTest extends TestFmwk {
|
||||
}
|
||||
|
||||
void expect(Transliterator t, String source, String expectedResult) {
|
||||
String result = t.transliterate(source);
|
||||
expectAux(t.getID() + ":String", source, result, expectedResult);
|
||||
expect(t, source, expectedResult, (Transliterator.Position) null);
|
||||
}
|
||||
|
||||
void expect(Transliterator t, String source, String expectedResult,
|
||||
Transliterator.Position pos) {
|
||||
if (pos == null) {
|
||||
String result = t.transliterate(source);
|
||||
expectAux(t.getID() + ":String", source, result, expectedResult);
|
||||
}
|
||||
|
||||
Transliterator.Position index = null;
|
||||
if (pos == null) {
|
||||
index = new Transliterator.Position();
|
||||
} else {
|
||||
index = new Transliterator.Position(pos.contextStart, pos.contextLimit,
|
||||
pos.start, pos.limit);
|
||||
}
|
||||
|
||||
ReplaceableString rsource = new ReplaceableString(source);
|
||||
t.transliterate(rsource);
|
||||
result = rsource.toString();
|
||||
if (pos == null) {
|
||||
t.transliterate(rsource);
|
||||
} else {
|
||||
// Do it all at once -- below we do it incrementally
|
||||
t.finishTransliteration(rsource, pos);
|
||||
}
|
||||
String result = rsource.toString();
|
||||
expectAux(t.getID() + ":Replaceable", source, result, expectedResult);
|
||||
|
||||
// Test keyboard (incremental) transliteration -- this result
|
||||
// must be the same after we finalize (see below).
|
||||
rsource.replace(0, rsource.length(), "");
|
||||
Transliterator.Position index = new Transliterator.Position();
|
||||
StringBuffer log = new StringBuffer();
|
||||
|
||||
for (int i=0; i<source.length(); ++i) {
|
||||
if (i != 0) {
|
||||
log.append(" + ");
|
||||
rsource.replace(0, rsource.length(), "");
|
||||
if (pos != null) {
|
||||
rsource.replace(0, 0, source);
|
||||
formatInput(log, rsource, index);
|
||||
log.append(" -> ");
|
||||
t.transliterate(rsource, index);
|
||||
formatInput(log, rsource, index);
|
||||
} else {
|
||||
for (int i=0; i<source.length(); ++i) {
|
||||
if (i != 0) {
|
||||
log.append(" + ");
|
||||
}
|
||||
log.append(source.charAt(i)).append(" -> ");
|
||||
t.transliterate(rsource, index, source.charAt(i));
|
||||
formatInput(log, rsource, index);
|
||||
}
|
||||
log.append(source.charAt(i)).append(" -> ");
|
||||
t.transliterate(rsource, index,
|
||||
String.valueOf(source.charAt(i)));
|
||||
// Append the string buffer with a vertical bar '|' where
|
||||
// the committed index is.
|
||||
String s = rsource.toString();
|
||||
log.append(s.substring(0, index.start)).
|
||||
append('|').
|
||||
append(s.substring(index.start));
|
||||
}
|
||||
|
||||
// As a final step in keyboard transliteration, we must call
|
||||
@ -1217,6 +1388,41 @@ public class TransliteratorTest extends TestFmwk {
|
||||
expectedResult);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param appendTo result is appended to this param.
|
||||
* @param input the string being transliterated
|
||||
* @param pos the index struct
|
||||
*/
|
||||
StringBuffer formatInput(StringBuffer appendTo,
|
||||
final ReplaceableString input,
|
||||
final Transliterator.Position pos) {
|
||||
// Output a string of the form aaa{bbb|ccc|ddd}eee, where
|
||||
// the {} indicate the context start and limit, and the ||
|
||||
// indicate the start and limit.
|
||||
if (0 <= pos.contextStart &&
|
||||
pos.contextStart <= pos.start &&
|
||||
pos.start <= pos.limit &&
|
||||
pos.limit <= pos.contextLimit &&
|
||||
pos.contextLimit <= input.length()) {
|
||||
|
||||
String a, b, c, d, e;
|
||||
a = input.substring(0, pos.contextStart);
|
||||
b = input.substring(pos.contextStart, pos.start);
|
||||
c = input.substring(pos.start, pos.limit);
|
||||
d = input.substring(pos.limit, pos.contextLimit);
|
||||
e = input.substring(pos.contextLimit, input.length());
|
||||
appendTo.append(a).append('{').append(b).
|
||||
append('|').append(c).append('|').append(d).
|
||||
append('}').append(e);
|
||||
} else {
|
||||
appendTo.append("INVALID Transliterator.Position {cs=" +
|
||||
pos.contextStart + ", s=" + pos.start + ", l=" +
|
||||
pos.limit + ", cl=" + pos.contextLimit + "} on " +
|
||||
input);
|
||||
}
|
||||
return appendTo;
|
||||
}
|
||||
|
||||
void expectAux(String tag, String source,
|
||||
String result, String expectedResult) {
|
||||
expectAux(tag, source + " -> " + result,
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/ReplaceableString.java,v $
|
||||
* $Date: 2000/04/25 17:17:37 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2001/10/03 00:14:22 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -24,7 +24,7 @@ package com.ibm.text;
|
||||
*
|
||||
* @see Replaceable
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: ReplaceableString.java,v $ $Revision: 1.3 $ $Date: 2000/04/25 17:17:37 $
|
||||
* @version $RCSfile: ReplaceableString.java,v $ $Revision: 1.4 $ $Date: 2001/10/03 00:14:22 $
|
||||
*/
|
||||
public class ReplaceableString implements Replaceable {
|
||||
private StringBuffer buf;
|
||||
@ -67,6 +67,13 @@ public class ReplaceableString implements Replaceable {
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a substring of the given string.
|
||||
*/
|
||||
public String substring(int start, int limit) {
|
||||
return buf.substring(start, limit);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of characters contained in this object.
|
||||
* <code>Replaceable</code> API.
|
||||
|
@ -4,9 +4,9 @@
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java,v $
|
||||
* $Date: 2001/09/26 18:00:06 $
|
||||
* $Revision: 1.46 $
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java,v $
|
||||
* $Date: 2001/10/03 00:14:22 $
|
||||
* $Revision: 1.47 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -27,18 +27,18 @@ import com.ibm.text.resources.ResourceReader;
|
||||
* Whitespace, as defined by <code>Character.isWhitespace()</code>,
|
||||
* is ignored. If the first non-blank character on a line is '#',
|
||||
* the entire line is ignored as a comment. </p>
|
||||
*
|
||||
*
|
||||
* <p>Each set of rules consists of two groups, one forward, and one
|
||||
* reverse. This is a convention that is not enforced; rules for one
|
||||
* direction may be omitted, with the result that translations in
|
||||
* that direction will not modify the source text. In addition,
|
||||
* bidirectional forward-reverse rules may be specified for
|
||||
* symmetrical transformations.</p>
|
||||
*
|
||||
*
|
||||
* <p><b>Rule syntax</b> </p>
|
||||
*
|
||||
*
|
||||
* <p>Rule statements take one of the following forms: </p>
|
||||
*
|
||||
*
|
||||
* <dl>
|
||||
* <dt><code>$alefmadda=\u0622;</code></dt>
|
||||
* <dd><strong>Variable definition.</strong> The name on the
|
||||
@ -66,7 +66,7 @@ import com.ibm.text.resources.ResourceReader;
|
||||
* the string on the left when performing reverse
|
||||
* transliteration.</dd>
|
||||
* </dl>
|
||||
*
|
||||
*
|
||||
* <dl>
|
||||
* <dt><code>ai<>$alefmadda;</code></dt>
|
||||
* <dd><strong>Bidirectional translation rule.</strong> This
|
||||
@ -75,7 +75,7 @@ import com.ibm.text.resources.ResourceReader;
|
||||
* transliteration, and vice versa when performing reverse
|
||||
* transliteration.</dd>
|
||||
* </dl>
|
||||
*
|
||||
*
|
||||
* <p>Translation rules consist of a <em>match pattern</em> and an <em>output
|
||||
* string</em>. The match pattern consists of literal characters,
|
||||
* optionally preceded by context, and optionally followed by
|
||||
@ -92,7 +92,7 @@ import com.ibm.text.resources.ResourceReader;
|
||||
* (or "<code>123}456</code>") in which the literal
|
||||
* pattern "<code>123</code>" must be followed by "<code>456</code>".
|
||||
* </p>
|
||||
*
|
||||
*
|
||||
* <p>The output string of a forward or reverse rule consists of
|
||||
* characters to replace the literal pattern characters. If the
|
||||
* output string contains the character '<code>|</code>', this is
|
||||
@ -102,59 +102,59 @@ import com.ibm.text.resources.ResourceReader;
|
||||
* placed within the replacement text; however, it can actually be
|
||||
* placed into the preceding or following context by using the
|
||||
* special character '<code>@</code>'. Examples:</p>
|
||||
*
|
||||
*
|
||||
* <blockquote>
|
||||
* <p><code>a {foo} z > | @ bar; # foo -> bar, move cursor
|
||||
* before a<br>
|
||||
* {foo} xyz > bar @@|; # foo -> bar, cursor between
|
||||
* y and z</code></p>
|
||||
* </blockquote>
|
||||
*
|
||||
*
|
||||
* <p><b>UnicodeSet</b></p>
|
||||
*
|
||||
*
|
||||
* <p><code>UnicodeSet</code> patterns may appear anywhere that
|
||||
* makes sense. They may appear in variable definitions.
|
||||
* Contrariwise, <code>UnicodeSet</code> patterns may themselves
|
||||
* contain variable references, such as "<code>$a=[a-z];$not_a=[^$a]</code>",
|
||||
* or "<code>$range=a-z;$ll=[$range]</code>".</p>
|
||||
*
|
||||
*
|
||||
* <p><code>UnicodeSet</code> patterns may also be embedded directly
|
||||
* into rule strings. Thus, the following two rules are equivalent:</p>
|
||||
*
|
||||
*
|
||||
* <blockquote>
|
||||
* <p><code>$vowel=[aeiou]; $vowel>'*'; # One way to do this<br>
|
||||
* [aeiou]>'*';
|
||||
* #
|
||||
* Another way</code></p>
|
||||
* </blockquote>
|
||||
*
|
||||
*
|
||||
* <p>See {@link UnicodeSet} for more documentation and examples.</p>
|
||||
*
|
||||
*
|
||||
* <p><b>Segments</b></p>
|
||||
*
|
||||
*
|
||||
* <p>Segments of the input string can be matched and copied to the
|
||||
* output string. This makes certain sets of rules simpler and more
|
||||
* general, and makes reordering possible. For example:</p>
|
||||
*
|
||||
*
|
||||
* <blockquote>
|
||||
* <p><code>([a-z]) > $1 $1;
|
||||
* #
|
||||
* double lowercase letters<br>
|
||||
* ([:Lu:]) ([:Ll:]) > $2 $1; # reverse order of Lu-Ll pairs</code></p>
|
||||
* </blockquote>
|
||||
*
|
||||
*
|
||||
* <p>The segment of the input string to be copied is delimited by
|
||||
* "<code>(</code>" and "<code>)</code>". Up to
|
||||
* nine segments may be defined. Segments may not overlap. In the
|
||||
* output string, "<code>$1</code>" through "<code>$9</code>"
|
||||
* represent the input string segments, in left-to-right order of
|
||||
* definition.</p>
|
||||
*
|
||||
*
|
||||
* <p><b>Anchors</b></p>
|
||||
*
|
||||
*
|
||||
* <p>Patterns can be anchored to the beginning or the end of the text. This is done with the
|
||||
* special characters '<code>^</code>' and '<code>$</code>'. For example:</p>
|
||||
*
|
||||
*
|
||||
* <blockquote>
|
||||
* <p><code>^ a > 'BEG_A'; # match 'a' at start of text<br>
|
||||
* a > 'A'; # match other instances
|
||||
@ -163,24 +163,24 @@ import com.ibm.text.resources.ResourceReader;
|
||||
* z > 'Z'; # match other instances
|
||||
* of 'z'</code></p>
|
||||
* </blockquote>
|
||||
*
|
||||
*
|
||||
* <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>.
|
||||
* This is done by including a virtual anchor character '<code>$</code>' at the end of the
|
||||
* set pattern. Although this is usually the match character for the end anchor, the set will
|
||||
* match either the beginning or the end of the text, depending on its placement. For
|
||||
* example:</p>
|
||||
*
|
||||
*
|
||||
* <blockquote>
|
||||
* <p><code>$x = [a-z$]; # match 'a' through 'z' OR anchor<br>
|
||||
* $x 1 > 2; # match '1' after a-z or at the start<br>
|
||||
* 3 $x > 4; # match '3' before a-z or at the end</code></p>
|
||||
* </blockquote>
|
||||
*
|
||||
*
|
||||
* <p><b>Example</b> </p>
|
||||
*
|
||||
*
|
||||
* <p>The following example rules illustrate many of the features of
|
||||
* the rule language. </p>
|
||||
*
|
||||
*
|
||||
* <table border="0" cellpadding="4">
|
||||
* <tr>
|
||||
* <td valign="top">Rule 1.</td>
|
||||
@ -195,10 +195,10 @@ import com.ibm.text.resources.ResourceReader;
|
||||
* <td valign="top" nowrap><code>yz>q</code></td>
|
||||
* </tr>
|
||||
* </table>
|
||||
*
|
||||
*
|
||||
* <p>Applying these rules to the string "<code>adefabcdefz</code>"
|
||||
* yields the following results: </p>
|
||||
*
|
||||
*
|
||||
* <table border="0" cellpadding="4">
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>|adefabcdefz</code></td>
|
||||
@ -251,23 +251,23 @@ import com.ibm.text.resources.ResourceReader;
|
||||
* transliteration is complete.</td>
|
||||
* </tr>
|
||||
* </table>
|
||||
*
|
||||
*
|
||||
* <p>The order of rules is significant. If multiple rules may match
|
||||
* at some point, the first matching rule is applied. </p>
|
||||
*
|
||||
*
|
||||
* <p>Forward and reverse rules may have an empty output string.
|
||||
* Otherwise, an empty left or right hand side of any statement is a
|
||||
* syntax error. </p>
|
||||
*
|
||||
*
|
||||
* <p>Single quotes are used to quote any character other than a
|
||||
* digit or letter. To specify a single quote itself, inside or
|
||||
* outside of quotes, use two single quotes in a row. For example,
|
||||
* the rule "<code>'>'>o''clock</code>" changes the
|
||||
* string "<code>></code>" to the string "<code>o'clock</code>".
|
||||
* </p>
|
||||
*
|
||||
*
|
||||
* <p><b>Notes</b> </p>
|
||||
*
|
||||
*
|
||||
* <p>While a RuleBasedTransliterator is being built, it checks that
|
||||
* the rules are added in proper order. For example, if the rule
|
||||
* "a>x" is followed by the rule "ab>y",
|
||||
@ -275,11 +275,11 @@ import com.ibm.text.resources.ResourceReader;
|
||||
* the second rule can never be triggered, since the first rule
|
||||
* always matches anything it matches. In other words, the first
|
||||
* rule <em>masks</em> the second rule. </p>
|
||||
*
|
||||
*
|
||||
* <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
|
||||
*
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.46 $ $Date: 2001/09/26 18:00:06 $
|
||||
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.47 $ $Date: 2001/10/03 00:14:22 $
|
||||
*/
|
||||
public class RuleBasedTransliterator extends Transliterator {
|
||||
|
||||
@ -433,7 +433,7 @@ public class RuleBasedTransliterator extends Transliterator {
|
||||
* stored in the rule text to represent the set of characters.
|
||||
* variables[i] represents character (variablesBase + i).
|
||||
*/
|
||||
UnicodeSet[] variables;
|
||||
UnicodeMatcher[] variables;
|
||||
|
||||
/**
|
||||
* The character that represents variables[0]. Characters
|
||||
@ -498,6 +498,9 @@ public class RuleBasedTransliterator extends Transliterator {
|
||||
|
||||
/**
|
||||
* $Log: RuleBasedTransliterator.java,v $
|
||||
* Revision 1.47 2001/10/03 00:14:22 alan
|
||||
* jitterbug 73: finish quantifier and supplemental char support
|
||||
*
|
||||
* Revision 1.46 2001/09/26 18:00:06 alan
|
||||
* jitterbug 67: sync parser with icu4c, allow unlimited, nested segments
|
||||
*
|
||||
|
@ -4,9 +4,9 @@
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $
|
||||
* $Date: 2001/09/26 18:00:06 $
|
||||
* $Revision: 1.28 $
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $
|
||||
* $Date: 2001/10/03 00:14:23 $
|
||||
* $Revision: 1.29 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -28,7 +28,7 @@ import com.ibm.util.Utility;
|
||||
* may contain variables. Variables represent a set of Unicode
|
||||
* characters, such as the letters <i>a</i> through <i>z</i>.
|
||||
* Variables are detected by looking up each character in a supplied
|
||||
* variable list to see if it has been so defined.
|
||||
* variable list to see if it has been so defined.
|
||||
*
|
||||
* <p>A rule may contain segments in its input string and segment references in
|
||||
* its output string. A segment is a substring of the input pattern, indicated
|
||||
@ -44,7 +44,7 @@ import com.ibm.util.Utility;
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.28 $ $Date: 2001/09/26 18:00:06 $
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.29 $ $Date: 2001/10/03 00:14:23 $
|
||||
*/
|
||||
class TransliterationRule {
|
||||
|
||||
@ -310,7 +310,7 @@ class TransliterationRule {
|
||||
* r1: aakkkpppp
|
||||
* r2: aaakkkkkpppp
|
||||
* ^
|
||||
*
|
||||
*
|
||||
* The strings must be aligned at the first character of the
|
||||
* key. The length of r1 to the left of the alignment point
|
||||
* must be <= the length of r2 to the left; ditto for the
|
||||
@ -346,10 +346,10 @@ class TransliterationRule {
|
||||
int left2 = r2.anteContextLength;
|
||||
int right = pattern.length() - left;
|
||||
int right2 = r2.pattern.length() - left2;
|
||||
|
||||
|
||||
// TODO Clean this up -- some logic might be combinable with the
|
||||
// next statement.
|
||||
|
||||
|
||||
// Test for anchor masking
|
||||
if (left == left2 && right == right2 &&
|
||||
keyLength <= r2.keyLength &&
|
||||
@ -371,7 +371,7 @@ class TransliterationRule {
|
||||
pos - UTF16.getCharCount(UTF16.charAt(str, pos-1)) :
|
||||
pos - 1;
|
||||
}
|
||||
|
||||
|
||||
static final int posAfter(Replaceable str, int pos) {
|
||||
return (pos >= 0 && pos < str.length()) ?
|
||||
pos + UTF16.getCharCount(UTF16.charAt(str, pos)) :
|
||||
@ -387,10 +387,10 @@ class TransliterationRule {
|
||||
* context and key characters match, but the text is not long
|
||||
* enough to match all of them. A full match means all context
|
||||
* and key characters match.
|
||||
*
|
||||
*
|
||||
* If a full match is obtained, perform a replacement, update pos,
|
||||
* and return U_MATCH. Otherwise both text and pos are unchanged.
|
||||
*
|
||||
*
|
||||
* @param text the text
|
||||
* @param pos the position indices
|
||||
* @param incremental if TRUE, test for partial matches that may
|
||||
@ -559,13 +559,13 @@ class TransliterationRule {
|
||||
if (segments == null) {
|
||||
text.replace(pos.start, keyLimit, output);
|
||||
lenDelta = output.length() - (keyLimit - pos.start);
|
||||
if (cursorPos >= 0 && cursorPos < keyLength) {
|
||||
// Within the key, the cursor refers to 16-bit code units
|
||||
if (cursorPos >= 0 && cursorPos <= output.length()) {
|
||||
// Within the output string, the cursor refers to 16-bit code units
|
||||
newStart = pos.start + cursorPos;
|
||||
} else {
|
||||
newStart = pos.start;
|
||||
int n = cursorPos;
|
||||
// Outside the key, cursorPos counts code points
|
||||
// Outside the output string, cursorPos counts code points
|
||||
while (n > 0) {
|
||||
newStart += UTF16.getCharCount(UTF16.charAt(text, newStart));
|
||||
--n;
|
||||
@ -638,7 +638,7 @@ class TransliterationRule {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
oText += lenDelta;
|
||||
pos.limit += lenDelta;
|
||||
pos.contextLimit += lenDelta;
|
||||
@ -665,11 +665,11 @@ class TransliterationRule {
|
||||
* cleared out by, at the end, calling this method with a literal
|
||||
* character.
|
||||
*/
|
||||
protected void appendToRule(StringBuffer rule,
|
||||
int c,
|
||||
boolean isLiteral,
|
||||
boolean escapeUnprintable,
|
||||
StringBuffer quoteBuf) {
|
||||
static void appendToRule(StringBuffer rule,
|
||||
int c,
|
||||
boolean isLiteral,
|
||||
boolean escapeUnprintable,
|
||||
StringBuffer quoteBuf) {
|
||||
// If we are escaping unprintables, then escape them outside
|
||||
// quotes. <backslash>u and <backslash>U are not recognized within quotes. The same
|
||||
// logic applies to literals, but literals are never escaped.
|
||||
@ -745,11 +745,11 @@ class TransliterationRule {
|
||||
//System.out.println("rule=" + rule.toString() + " qb=" + quoteBuf.toString());
|
||||
}
|
||||
|
||||
protected final void appendToRule(StringBuffer rule,
|
||||
String text,
|
||||
boolean isLiteral,
|
||||
boolean escapeUnprintable,
|
||||
StringBuffer quoteBuf) {
|
||||
static final void appendToRule(StringBuffer rule,
|
||||
String text,
|
||||
boolean isLiteral,
|
||||
boolean escapeUnprintable,
|
||||
StringBuffer quoteBuf) {
|
||||
for (int i=0; i<text.length(); ++i) {
|
||||
appendToRule(rule, text.charAt(i), isLiteral, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
@ -764,7 +764,7 @@ class TransliterationRule {
|
||||
*/
|
||||
public String toRule(boolean escapeUnprintable) {
|
||||
int i;
|
||||
|
||||
|
||||
StringBuffer rule = new StringBuffer();
|
||||
|
||||
// iseg indexes into segments[] directly (not offset from FSPI)
|
||||
@ -863,7 +863,7 @@ class TransliterationRule {
|
||||
if (show) {
|
||||
rule.append((char)(48+d));
|
||||
}
|
||||
}
|
||||
}
|
||||
rule.append(' ');
|
||||
}
|
||||
}
|
||||
@ -905,6 +905,9 @@ class TransliterationRule {
|
||||
|
||||
/**
|
||||
* $Log: TransliterationRule.java,v $
|
||||
* Revision 1.29 2001/10/03 00:14:23 alan
|
||||
* jitterbug 73: finish quantifier and supplemental char support
|
||||
*
|
||||
* Revision 1.28 2001/09/26 18:00:06 alan
|
||||
* jitterbug 67: sync parser with icu4c, allow unlimited, nested segments
|
||||
*
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Transliterator.java,v $
|
||||
* $Date: 2001/09/28 20:37:09 $
|
||||
* $Revision: 1.43 $
|
||||
* $Date: 2001/10/03 00:14:23 $
|
||||
* $Revision: 1.44 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -241,7 +241,7 @@ import com.ibm.util.CaseInsensitiveString;
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: Transliterator.java,v $ $Revision: 1.43 $ $Date: 2001/09/28 20:37:09 $
|
||||
* @version $RCSfile: Transliterator.java,v $ $Revision: 1.44 $ $Date: 2001/10/03 00:14:23 $
|
||||
*/
|
||||
public abstract class Transliterator {
|
||||
/**
|
||||
@ -553,6 +553,15 @@ public abstract class Transliterator {
|
||||
index.contextLimit += insertion.length();
|
||||
}
|
||||
|
||||
char last = (text.length() > 0) ?
|
||||
text.charAt(text.length() - 1) : 0;
|
||||
if (UTF16.isLeadSurrogate(last)) {
|
||||
// Oops, the caller passed us a single lead surrogate at the
|
||||
// end of the insertion. Don't transliterate until more text
|
||||
// comes in.
|
||||
return;
|
||||
}
|
||||
|
||||
filteredTransliterate(text, index, true);
|
||||
|
||||
// This doesn't work once we add quantifier support. Need to rewrite
|
||||
|
@ -142,6 +142,10 @@ class TransliteratorParser {
|
||||
private static final char CURSOR_OFFSET = '@';
|
||||
private static final char ANCHOR_START = '^';
|
||||
|
||||
private static final char KLEENE_STAR = '*';
|
||||
private static final char ONE_OR_MORE = '+';
|
||||
private static final char ZERO_OR_ONE = '?';
|
||||
|
||||
// By definition, the ANCHOR_END special character is a
|
||||
// trailing SymbolTable.SYMBOL_REF character.
|
||||
// private static final char ANCHOR_END = '$';
|
||||
@ -382,7 +386,7 @@ class TransliteratorParser {
|
||||
idBlock = idBlockResult.toString();
|
||||
|
||||
// Convert the set vector to an array
|
||||
data.variables = new UnicodeSet[variablesVector.size()];
|
||||
data.variables = new UnicodeMatcher[variablesVector.size()];
|
||||
variablesVector.copyInto(data.variables);
|
||||
variablesVector = null;
|
||||
|
||||
@ -658,7 +662,7 @@ class TransliteratorParser {
|
||||
int varStart = -1; // Most recent $variableReference
|
||||
int varLimit = -1;
|
||||
int[] iref = new int[1];
|
||||
|
||||
|
||||
main:
|
||||
while (pos < limit && !done) {
|
||||
char c = rule.charAt(pos++);
|
||||
@ -853,56 +857,71 @@ class TransliteratorParser {
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
|
||||
case KLEENE_STAR:
|
||||
case ONE_OR_MORE:
|
||||
case ZERO_OR_ONE:
|
||||
// Quantifiers. We handle single characters, quoted strings,
|
||||
// variable references, and segments.
|
||||
// a+ matches aaa
|
||||
// 'foo'+ matches foofoofoo
|
||||
// $v+ matches xyxyxy if $v == xy
|
||||
// (seg)+ matches segsegseg
|
||||
{
|
||||
int qstart, qlimit;
|
||||
boolean[] isOpenParen = new boolean[1];
|
||||
boolean isSegment = false;
|
||||
if (segments != null &&
|
||||
segments.getLastParenOffset(isOpenParen) == buf.length()) {
|
||||
// The */+ immediately follows a segment
|
||||
if (isOpenParen[0]) {
|
||||
syntaxError("Misplaced quantifier", rule, start);
|
||||
}
|
||||
int[] startparam = new int[1];
|
||||
int[] limitparam = new int[1];
|
||||
if (!segments.extractLastParenSubstring(startparam, limitparam)) {
|
||||
syntaxError("Mismatched segment delimiters", rule, start);
|
||||
}
|
||||
qstart = startparam[0];
|
||||
qlimit = limitparam[0];
|
||||
isSegment = true;
|
||||
} else {
|
||||
// The */+ follows an isolated character or quote
|
||||
// or variable reference
|
||||
if (buf.length() == quoteLimit) {
|
||||
// The */+ follows a 'quoted string'
|
||||
qstart = quoteStart;
|
||||
qlimit = quoteLimit;
|
||||
} else if (buf.length() == varLimit) {
|
||||
// The */+ follows a $variableReference
|
||||
qstart = varStart;
|
||||
qlimit = varLimit;
|
||||
} else {
|
||||
// The */+ follows a single character
|
||||
qstart = buf.length() - 1;
|
||||
qlimit = qstart + 1;
|
||||
}
|
||||
}
|
||||
UnicodeMatcher m =
|
||||
new StringMatcher(buf.toString(), qstart, qlimit,
|
||||
isSegment, parser.data);
|
||||
int min = 0;
|
||||
int max = Quantifier.MAX;
|
||||
switch (c) {
|
||||
case ONE_OR_MORE:
|
||||
min = 1;
|
||||
break;
|
||||
case ZERO_OR_ONE:
|
||||
min = 0;
|
||||
max = 1;
|
||||
break;
|
||||
// case KLEENE_STAR:
|
||||
// do nothing -- min, max already set
|
||||
}
|
||||
m = new Quantifier(m, min, max);
|
||||
buf.setLength(qstart);
|
||||
buf.append(parser.generateStandInFor(m));
|
||||
}
|
||||
break;
|
||||
// case SET_CLOSE:
|
||||
default:
|
||||
// Disallow unquoted characters other than [0-9A-Za-z]
|
||||
@ -947,7 +966,7 @@ class TransliteratorParser {
|
||||
//----------------------------------------------------------------------
|
||||
// END RuleHalf
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
|
||||
/**
|
||||
* MAIN PARSER. Parse the next rule in the given rule string, starting
|
||||
* at pos. Return the index after the last character parsed. Do not
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $
|
||||
* $Date: 2001/09/28 05:47:30 $
|
||||
* $Revision: 1.47 $
|
||||
* $Date: 2001/10/03 00:18:23 $
|
||||
* $Revision: 1.48 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -77,14 +77,6 @@ public class TransliteratorTest extends TestFmwk {
|
||||
logln("Elapsed time: " + ms + " ms");
|
||||
}
|
||||
|
||||
public void TestDisplayName() {
|
||||
String ID;
|
||||
for (Enumeration e = Transliterator.getAvailableIDs(); e.hasMoreElements(); ) {
|
||||
ID = (String) e.nextElement();
|
||||
logln(ID + " -> " + Transliterator.getDisplayName(ID));
|
||||
}
|
||||
}
|
||||
|
||||
public void TestSimpleRules() {
|
||||
/* Example: rules 1. ab>x|y
|
||||
* 2. yc>z
|
||||
@ -131,29 +123,6 @@ public class TransliteratorTest extends TestFmwk {
|
||||
expect(t, "abcdefgABCDEFGU", "&bcd&fg!^**!^*&");
|
||||
}
|
||||
|
||||
/**
|
||||
* Test undefined variable.
|
||||
*/
|
||||
public void TestUndefinedVariable() {
|
||||
String rule = "$initial } a <> \u1161;";
|
||||
try {
|
||||
Transliterator t = new RuleBasedTransliterator("<ID>", rule);
|
||||
t = null;
|
||||
} catch (IllegalArgumentException e) {
|
||||
logln("OK: Got exception for " + rule + ", as expected: " +
|
||||
e.getMessage());
|
||||
return;
|
||||
}
|
||||
errln("Fail: bogus rule " + rule + " compiled without error");
|
||||
}
|
||||
|
||||
/**
|
||||
* Test empty context.
|
||||
*/
|
||||
public void TestEmptyContext() {
|
||||
expect(" { a } > b;", "xay a ", "xby b ");
|
||||
}
|
||||
|
||||
/**
|
||||
* Test inline set syntax and set variable syntax.
|
||||
*/
|
||||
@ -524,6 +493,9 @@ public class TransliteratorTest extends TestFmwk {
|
||||
expect(hex3, "012", "012");
|
||||
}
|
||||
|
||||
public void TestJ329_TODO() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Test segments and segment references.
|
||||
*/
|
||||
@ -681,6 +653,9 @@ public class TransliteratorTest extends TestFmwk {
|
||||
|
||||
}
|
||||
|
||||
public void TestCopyJ476_TODO() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Test inter-Indic transliterators. These are composed.
|
||||
*/
|
||||
@ -700,47 +675,6 @@ public class TransliteratorTest extends TestFmwk {
|
||||
expect(dg, dev, guj);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test IDs of inverses of compound transliterators. (J20)
|
||||
*/
|
||||
public void TestCompoundInverseID() {
|
||||
String ID = "Latin-Jamo;NFC(NFD)";
|
||||
Transliterator t = Transliterator.getInstance(ID);
|
||||
Transliterator u = t.getInverse();
|
||||
String exp = "NFD(NFC);Jamo-Latin";
|
||||
String got = u.getID();
|
||||
if (!got.equals(exp)) {
|
||||
errln("FAIL: Inverse of " + ID + " is " + got +
|
||||
", expected " + exp);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Inverse of "Null" should be "Null". (J21)
|
||||
*/
|
||||
public void TestNullInverse() {
|
||||
Transliterator t = Transliterator.getInstance("Null");
|
||||
Transliterator u = t.getInverse();
|
||||
if (!u.getID().equals("Null")) {
|
||||
errln("FAIL: Inverse of Null should be Null");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check ID of inverse of alias. (J22)
|
||||
*/
|
||||
public void TestAliasInverseID() {
|
||||
String ID = "Latin-Hangul"; // This should be any alias ID with an inverse
|
||||
Transliterator t = Transliterator.getInstance(ID);
|
||||
Transliterator u = t.getInverse();
|
||||
String exp = "Hangul-Latin";
|
||||
String got = u.getID();
|
||||
if (!got.equals(exp)) {
|
||||
errln("FAIL: Inverse of " + ID + " is " + got +
|
||||
", expected " + exp);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test filter syntax in IDs. (J23)
|
||||
*/
|
||||
@ -785,118 +719,6 @@ public class TransliteratorTest extends TestFmwk {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the "Remove" transliterator.
|
||||
*/
|
||||
public void TestRemove() {
|
||||
Transliterator t = Transliterator.getInstance("Remove[aeiou]");
|
||||
expect(t, "The quick brown fox.",
|
||||
"Th qck brwn fx.");
|
||||
}
|
||||
|
||||
public void TestToRules() {
|
||||
String RBT = "rbt";
|
||||
String SET = "set";
|
||||
String[] DATA = {
|
||||
RBT,
|
||||
"$a=\\u4E61; [$a] > A;",
|
||||
"[\\u4E61] > A;",
|
||||
|
||||
RBT,
|
||||
"$white=[[:Zs:][:Zl:]]; $white{a} > A;",
|
||||
"[[:Zs:][:Zl:]]{a} > A;",
|
||||
|
||||
SET,
|
||||
"[[:Zs:][:Zl:]]",
|
||||
"[[:Zs:][:Zl:]]",
|
||||
|
||||
SET,
|
||||
"[:Ps:]",
|
||||
"[:Ps:]",
|
||||
|
||||
SET,
|
||||
"[:L:]",
|
||||
"[:L:]",
|
||||
|
||||
SET,
|
||||
"[[:L:]-[A]]",
|
||||
"[[:L:]-[A]]",
|
||||
|
||||
SET,
|
||||
"[~[:Lu:][:Ll:]]",
|
||||
"[~[:Lu:][:Ll:]]",
|
||||
|
||||
SET,
|
||||
"[~[a-z]]",
|
||||
"[~[a-z]]",
|
||||
|
||||
RBT,
|
||||
"$white=[:Zs:]; $black=[^$white]; $black{a} > A;",
|
||||
"[^[:Zs:]]{a} > A;",
|
||||
|
||||
RBT,
|
||||
"$a=[:Zs:]; $b=[[a-z]-$a]; $b{a} > A;",
|
||||
"[[a-z]-[:Zs:]]{a} > A;",
|
||||
|
||||
RBT,
|
||||
"$a=[:Zs:]; $b=[$a&[a-z]]; $b{a} > A;",
|
||||
"[[:Zs:]&[a-z]]{a} > A;",
|
||||
|
||||
RBT,
|
||||
"$a=[:Zs:]; $b=[x$a]; $b{a} > A;",
|
||||
"[x[:Zs:]]{a} > A;",
|
||||
};
|
||||
|
||||
for (int d=0; d < DATA.length; d+=3) {
|
||||
if (DATA[d] == RBT) {
|
||||
// Transliterator test
|
||||
Transliterator t = Transliterator.createFromRules("ID",
|
||||
DATA[d+1], Transliterator.FORWARD);
|
||||
if (t == null) {
|
||||
errln("FAIL: createFromRules failed");
|
||||
return;
|
||||
}
|
||||
String rules, escapedRules;
|
||||
rules = t.toRules(false);
|
||||
escapedRules = t.toRules(true);
|
||||
String expRules = Utility.unescape(DATA[d+2]);
|
||||
String expEscapedRules = DATA[d+2];
|
||||
if (rules.equals(expRules)) {
|
||||
logln("Ok: " + DATA[d+1] +
|
||||
" => " + Utility.escape(rules));
|
||||
} else {
|
||||
errln("FAIL: " + DATA[d+1] +
|
||||
" => " + Utility.escape(rules + ", exp " + expRules));
|
||||
}
|
||||
if (escapedRules.equals(expEscapedRules)) {
|
||||
logln("Ok: " + DATA[d+1] +
|
||||
" => " + escapedRules);
|
||||
} else {
|
||||
errln("FAIL: " + DATA[d+1] +
|
||||
" => " + escapedRules + ", exp " + expEscapedRules);
|
||||
}
|
||||
|
||||
} else {
|
||||
// UnicodeSet test
|
||||
String pat = DATA[d+1];
|
||||
String expToPat = DATA[d+2];
|
||||
UnicodeSet set = new UnicodeSet(pat);
|
||||
|
||||
// Adjust spacing etc. as necessary.
|
||||
String toPat;
|
||||
toPat = set.toPattern(true);
|
||||
if (expToPat.equals(toPat)) {
|
||||
logln("Ok: " + pat +
|
||||
" => " + toPat);
|
||||
} else {
|
||||
errln("FAIL: " + pat +
|
||||
" => " + Utility.escape(toPat) +
|
||||
", exp " + Utility.escape(pat));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the case mapping transliterators.
|
||||
*/
|
||||
@ -966,6 +788,9 @@ public class TransliteratorTest extends TestFmwk {
|
||||
}
|
||||
}
|
||||
|
||||
public void TestCreateInstance_TODO() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the normalization transliterator.
|
||||
*/
|
||||
@ -1139,6 +964,249 @@ public class TransliteratorTest extends TestFmwk {
|
||||
}
|
||||
}
|
||||
|
||||
public void TestCompoundFilter_TODO() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the "Remove" transliterator.
|
||||
*/
|
||||
public void TestRemove() {
|
||||
Transliterator t = Transliterator.getInstance("Remove[aeiou]");
|
||||
expect(t, "The quick brown fox.",
|
||||
"Th qck brwn fx.");
|
||||
}
|
||||
|
||||
public void TestToRules() {
|
||||
String RBT = "rbt";
|
||||
String SET = "set";
|
||||
String[] DATA = {
|
||||
RBT,
|
||||
"$a=\\u4E61; [$a] > A;",
|
||||
"[\\u4E61] > A;",
|
||||
|
||||
RBT,
|
||||
"$white=[[:Zs:][:Zl:]]; $white{a} > A;",
|
||||
"[[:Zs:][:Zl:]]{a} > A;",
|
||||
|
||||
SET,
|
||||
"[[:Zs:][:Zl:]]",
|
||||
"[[:Zs:][:Zl:]]",
|
||||
|
||||
SET,
|
||||
"[:Ps:]",
|
||||
"[:Ps:]",
|
||||
|
||||
SET,
|
||||
"[:L:]",
|
||||
"[:L:]",
|
||||
|
||||
SET,
|
||||
"[[:L:]-[A]]",
|
||||
"[[:L:]-[A]]",
|
||||
|
||||
SET,
|
||||
"[~[:Lu:][:Ll:]]",
|
||||
"[~[:Lu:][:Ll:]]",
|
||||
|
||||
SET,
|
||||
"[~[a-z]]",
|
||||
"[~[a-z]]",
|
||||
|
||||
RBT,
|
||||
"$white=[:Zs:]; $black=[^$white]; $black{a} > A;",
|
||||
"[^[:Zs:]]{a} > A;",
|
||||
|
||||
RBT,
|
||||
"$a=[:Zs:]; $b=[[a-z]-$a]; $b{a} > A;",
|
||||
"[[a-z]-[:Zs:]]{a} > A;",
|
||||
|
||||
RBT,
|
||||
"$a=[:Zs:]; $b=[$a&[a-z]]; $b{a} > A;",
|
||||
"[[:Zs:]&[a-z]]{a} > A;",
|
||||
|
||||
RBT,
|
||||
"$a=[:Zs:]; $b=[x$a]; $b{a} > A;",
|
||||
"[x[:Zs:]]{a} > A;",
|
||||
};
|
||||
|
||||
for (int d=0; d < DATA.length; d+=3) {
|
||||
if (DATA[d] == RBT) {
|
||||
// Transliterator test
|
||||
Transliterator t = Transliterator.createFromRules("ID",
|
||||
DATA[d+1], Transliterator.FORWARD);
|
||||
if (t == null) {
|
||||
errln("FAIL: createFromRules failed");
|
||||
return;
|
||||
}
|
||||
String rules, escapedRules;
|
||||
rules = t.toRules(false);
|
||||
escapedRules = t.toRules(true);
|
||||
String expRules = Utility.unescape(DATA[d+2]);
|
||||
String expEscapedRules = DATA[d+2];
|
||||
if (rules.equals(expRules)) {
|
||||
logln("Ok: " + DATA[d+1] +
|
||||
" => " + Utility.escape(rules));
|
||||
} else {
|
||||
errln("FAIL: " + DATA[d+1] +
|
||||
" => " + Utility.escape(rules + ", exp " + expRules));
|
||||
}
|
||||
if (escapedRules.equals(expEscapedRules)) {
|
||||
logln("Ok: " + DATA[d+1] +
|
||||
" => " + escapedRules);
|
||||
} else {
|
||||
errln("FAIL: " + DATA[d+1] +
|
||||
" => " + escapedRules + ", exp " + expEscapedRules);
|
||||
}
|
||||
|
||||
} else {
|
||||
// UnicodeSet test
|
||||
String pat = DATA[d+1];
|
||||
String expToPat = DATA[d+2];
|
||||
UnicodeSet set = new UnicodeSet(pat);
|
||||
|
||||
// Adjust spacing etc. as necessary.
|
||||
String toPat;
|
||||
toPat = set.toPattern(true);
|
||||
if (expToPat.equals(toPat)) {
|
||||
logln("Ok: " + pat +
|
||||
" => " + toPat);
|
||||
} else {
|
||||
errln("FAIL: " + pat +
|
||||
" => " + Utility.escape(toPat) +
|
||||
", exp " + Utility.escape(pat));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void TestContext() {
|
||||
Transliterator.Position pos = new Transliterator.Position(0, 2, 0, 1); // cs cl s l
|
||||
|
||||
expect("de > x; {d}e > y;",
|
||||
"de",
|
||||
"ye",
|
||||
pos);
|
||||
|
||||
expect("ab{c} > z;",
|
||||
"xadabdabcy",
|
||||
"xadabdabzy");
|
||||
}
|
||||
|
||||
static final String CharsToUnicodeString(String s) {
|
||||
return Utility.unescape(s);
|
||||
}
|
||||
|
||||
public void TestSupplemental() {
|
||||
|
||||
expect(CharsToUnicodeString("$a=\\U00010300; $s=[\\U00010300-\\U00010323];" +
|
||||
"a > $a; $s > i;"),
|
||||
CharsToUnicodeString("ab\\U0001030Fx"),
|
||||
CharsToUnicodeString("\\U00010300bix"));
|
||||
|
||||
expect(CharsToUnicodeString("$a=[a-z\\U00010300-\\U00010323];" +
|
||||
"$b=[A-Z\\U00010400-\\U0001044D];" +
|
||||
"($a)($b) > $2 $1;"),
|
||||
CharsToUnicodeString("aB\\U00010300\\U00010400c\\U00010401\\U00010301D"),
|
||||
CharsToUnicodeString("Ba\\U00010400\\U00010300\\U00010401cD\\U00010301"));
|
||||
|
||||
// k|ax\\U00010300xm
|
||||
|
||||
// k|a\\U00010400\\U00010300xm
|
||||
// ky|\\U00010400\\U00010300xm
|
||||
// ky\\U00010400|\\U00010300xm
|
||||
|
||||
// ky\\U00010400|\\U00010300\\U00010400m
|
||||
// ky\\U00010400y|\\U00010400m
|
||||
expect(CharsToUnicodeString("$a=[a\\U00010300-\\U00010323];" +
|
||||
"$a {x} > | @ \\U00010400;" +
|
||||
"{$a} [^\\u0000-\\uFFFF] > y;"),
|
||||
CharsToUnicodeString("kax\\U00010300xm"),
|
||||
CharsToUnicodeString("ky\\U00010400y\\U00010400m"));
|
||||
}
|
||||
|
||||
public void TestQuantifier() {
|
||||
|
||||
// Make sure @ in a quantified anteContext works
|
||||
expect("a+ {b} > | @@ c; A > a; (a+ c) > '(' $1 ')';",
|
||||
"AAAAAb",
|
||||
"aaa(aac)");
|
||||
|
||||
// Make sure @ in a quantified postContext works
|
||||
expect("{b} a+ > c @@ |; (a+) > '(' $1 ')';",
|
||||
"baaaaa",
|
||||
"caa(aaa)");
|
||||
|
||||
// Make sure @ in a quantified postContext with seg ref works
|
||||
expect("{(b)} a+ > $1 @@ |; (a+) > '(' $1 ')';",
|
||||
"baaaaa",
|
||||
"baa(aaa)");
|
||||
|
||||
// Make sure @ past ante context doesn't enter ante context
|
||||
Transliterator.Position pos = new Transliterator.Position(0, 5, 3, 5);
|
||||
expect("a+ {b} > | @@ c; x > y; (a+ c) > '(' $1 ')';",
|
||||
"xxxab",
|
||||
"xxx(ac)",
|
||||
pos);
|
||||
|
||||
// Make sure @ past post context doesn't pass limit
|
||||
Transliterator.Position pos2 = new Transliterator.Position(0, 4, 0, 2);
|
||||
expect("{b} a+ > c @@ |; x > y; a > A;",
|
||||
"baxx",
|
||||
"caxx",
|
||||
pos2);
|
||||
|
||||
// Make sure @ past post context doesn't enter post context
|
||||
expect("{b} a+ > c @@ |; x > y; a > A;",
|
||||
"baxx",
|
||||
"cayy");
|
||||
|
||||
expect("(ab)? c > d;",
|
||||
"c abc ababc",
|
||||
"d d abd");
|
||||
|
||||
expect("(ab)+ {x} > '(' $1 ')';",
|
||||
"x abx ababxy",
|
||||
"x ab(ab) abab(abab)y");
|
||||
|
||||
expect("b+ > x;",
|
||||
"ac abc abbc abbbc",
|
||||
"ac axc axc axc");
|
||||
|
||||
expect("[abc]+ > x;",
|
||||
"qac abrc abbcs abtbbc",
|
||||
"qx xrx xs xtx");
|
||||
|
||||
expect("q{(ab)+} > x;",
|
||||
"qa qab qaba qababc qaba",
|
||||
"qa qx qxa qxc qxa");
|
||||
|
||||
expect("q(ab)* > x;",
|
||||
"qa qab qaba qababc",
|
||||
"xa x xa xc");
|
||||
|
||||
// Oddity -- "(foo)* > $1" causes $1 to match the run of "foo"s
|
||||
// In perl, it only matches the first occurrence, so the output
|
||||
// is "()a (ab) (ab)a (ab)c".
|
||||
expect("q(ab)* > '(' $1 ')';",
|
||||
"qa qab qaba qababc",
|
||||
"()a (ab) (ab)a (abab)c");
|
||||
|
||||
// 'foo'+ and 'foo'* -- the quantifier should apply to the entire
|
||||
// quoted string
|
||||
expect("'ab'+ > x;",
|
||||
"bb ab ababb",
|
||||
"bb x xb");
|
||||
|
||||
// $foo+ and $foo* -- the quantifier should apply to the entire
|
||||
// variable reference
|
||||
expect("$var = ab; $var+ > x;",
|
||||
"bb ab ababb",
|
||||
"bb x xb");
|
||||
}
|
||||
|
||||
public void TestSTV_TODO() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Test inverse of Greek-Latin; Title()
|
||||
*/
|
||||
@ -1159,9 +1227,92 @@ public class TransliteratorTest extends TestFmwk {
|
||||
}
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// icu4j only
|
||||
//======================================================================
|
||||
|
||||
/**
|
||||
* Inverse of "Null" should be "Null". (J21)
|
||||
*/
|
||||
public void TestNullInverse() {
|
||||
Transliterator t = Transliterator.getInstance("Null");
|
||||
Transliterator u = t.getInverse();
|
||||
if (!u.getID().equals("Null")) {
|
||||
errln("FAIL: Inverse of Null should be Null");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check ID of inverse of alias. (J22)
|
||||
*/
|
||||
public void TestAliasInverseID() {
|
||||
String ID = "Latin-Hangul"; // This should be any alias ID with an inverse
|
||||
Transliterator t = Transliterator.getInstance(ID);
|
||||
Transliterator u = t.getInverse();
|
||||
String exp = "Hangul-Latin";
|
||||
String got = u.getID();
|
||||
if (!got.equals(exp)) {
|
||||
errln("FAIL: Inverse of " + ID + " is " + got +
|
||||
", expected " + exp);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test IDs of inverses of compound transliterators. (J20)
|
||||
*/
|
||||
public void TestCompoundInverseID() {
|
||||
String ID = "Latin-Jamo;NFC(NFD)";
|
||||
Transliterator t = Transliterator.getInstance(ID);
|
||||
Transliterator u = t.getInverse();
|
||||
String exp = "NFD(NFC);Jamo-Latin";
|
||||
String got = u.getID();
|
||||
if (!got.equals(exp)) {
|
||||
errln("FAIL: Inverse of " + ID + " is " + got +
|
||||
", expected " + exp);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test undefined variable.
|
||||
*/
|
||||
public void TestUndefinedVariable() {
|
||||
String rule = "$initial } a <> \u1161;";
|
||||
try {
|
||||
Transliterator t = new RuleBasedTransliterator("<ID>", rule);
|
||||
t = null;
|
||||
} catch (IllegalArgumentException e) {
|
||||
logln("OK: Got exception for " + rule + ", as expected: " +
|
||||
e.getMessage());
|
||||
return;
|
||||
}
|
||||
errln("Fail: bogus rule " + rule + " compiled without error");
|
||||
}
|
||||
|
||||
/**
|
||||
* Test empty context.
|
||||
*/
|
||||
public void TestEmptyContext() {
|
||||
expect(" { a } > b;", "xay a ", "xby b ");
|
||||
}
|
||||
|
||||
public void TestDisplayName() {
|
||||
String ID;
|
||||
for (Enumeration e = Transliterator.getAvailableIDs(); e.hasMoreElements(); ) {
|
||||
ID = (String) e.nextElement();
|
||||
logln(ID + " -> " + Transliterator.getDisplayName(ID));
|
||||
}
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
void expect(String rules,
|
||||
String source,
|
||||
String expectedResult,
|
||||
Transliterator.Position pos) {
|
||||
Transliterator t = new RuleBasedTransliterator("<ID>", rules);
|
||||
expect(t, source, expectedResult, pos);
|
||||
}
|
||||
|
||||
void expect(String rules, String source, String expectedResult) {
|
||||
expect(new RuleBasedTransliterator("<ID>", rules), source, expectedResult);
|
||||
@ -1176,33 +1327,53 @@ public class TransliteratorTest extends TestFmwk {
|
||||
}
|
||||
|
||||
void expect(Transliterator t, String source, String expectedResult) {
|
||||
String result = t.transliterate(source);
|
||||
expectAux(t.getID() + ":String", source, result, expectedResult);
|
||||
expect(t, source, expectedResult, (Transliterator.Position) null);
|
||||
}
|
||||
|
||||
void expect(Transliterator t, String source, String expectedResult,
|
||||
Transliterator.Position pos) {
|
||||
if (pos == null) {
|
||||
String result = t.transliterate(source);
|
||||
expectAux(t.getID() + ":String", source, result, expectedResult);
|
||||
}
|
||||
|
||||
Transliterator.Position index = null;
|
||||
if (pos == null) {
|
||||
index = new Transliterator.Position();
|
||||
} else {
|
||||
index = new Transliterator.Position(pos.contextStart, pos.contextLimit,
|
||||
pos.start, pos.limit);
|
||||
}
|
||||
|
||||
ReplaceableString rsource = new ReplaceableString(source);
|
||||
t.transliterate(rsource);
|
||||
result = rsource.toString();
|
||||
if (pos == null) {
|
||||
t.transliterate(rsource);
|
||||
} else {
|
||||
// Do it all at once -- below we do it incrementally
|
||||
t.finishTransliteration(rsource, pos);
|
||||
}
|
||||
String result = rsource.toString();
|
||||
expectAux(t.getID() + ":Replaceable", source, result, expectedResult);
|
||||
|
||||
// Test keyboard (incremental) transliteration -- this result
|
||||
// must be the same after we finalize (see below).
|
||||
rsource.replace(0, rsource.length(), "");
|
||||
Transliterator.Position index = new Transliterator.Position();
|
||||
StringBuffer log = new StringBuffer();
|
||||
|
||||
for (int i=0; i<source.length(); ++i) {
|
||||
if (i != 0) {
|
||||
log.append(" + ");
|
||||
rsource.replace(0, rsource.length(), "");
|
||||
if (pos != null) {
|
||||
rsource.replace(0, 0, source);
|
||||
formatInput(log, rsource, index);
|
||||
log.append(" -> ");
|
||||
t.transliterate(rsource, index);
|
||||
formatInput(log, rsource, index);
|
||||
} else {
|
||||
for (int i=0; i<source.length(); ++i) {
|
||||
if (i != 0) {
|
||||
log.append(" + ");
|
||||
}
|
||||
log.append(source.charAt(i)).append(" -> ");
|
||||
t.transliterate(rsource, index, source.charAt(i));
|
||||
formatInput(log, rsource, index);
|
||||
}
|
||||
log.append(source.charAt(i)).append(" -> ");
|
||||
t.transliterate(rsource, index,
|
||||
String.valueOf(source.charAt(i)));
|
||||
// Append the string buffer with a vertical bar '|' where
|
||||
// the committed index is.
|
||||
String s = rsource.toString();
|
||||
log.append(s.substring(0, index.start)).
|
||||
append('|').
|
||||
append(s.substring(index.start));
|
||||
}
|
||||
|
||||
// As a final step in keyboard transliteration, we must call
|
||||
@ -1217,6 +1388,41 @@ public class TransliteratorTest extends TestFmwk {
|
||||
expectedResult);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param appendTo result is appended to this param.
|
||||
* @param input the string being transliterated
|
||||
* @param pos the index struct
|
||||
*/
|
||||
StringBuffer formatInput(StringBuffer appendTo,
|
||||
final ReplaceableString input,
|
||||
final Transliterator.Position pos) {
|
||||
// Output a string of the form aaa{bbb|ccc|ddd}eee, where
|
||||
// the {} indicate the context start and limit, and the ||
|
||||
// indicate the start and limit.
|
||||
// Only slice the text when the four indices are ordered
// 0 <= contextStart <= start <= limit <= contextLimit <= input.length().
if (0 <= pos.contextStart &&
|
||||
pos.contextStart <= pos.start &&
|
||||
pos.start <= pos.limit &&
|
||||
pos.limit <= pos.contextLimit &&
|
||||
pos.contextLimit <= input.length()) {
|
||||
|
||||
// a..e are the five consecutive pieces of input delimited by the
// four indices, in left-to-right order.
String a, b, c, d, e;
|
||||
a = input.substring(0, pos.contextStart);
|
||||
b = input.substring(pos.contextStart, pos.start);
|
||||
c = input.substring(pos.start, pos.limit);
|
||||
d = input.substring(pos.limit, pos.contextLimit);
|
||||
e = input.substring(pos.contextLimit, input.length());
|
||||
appendTo.append(a).append('{').append(b).
|
||||
append('|').append(c).append('|').append(d).
|
||||
append('}').append(e);
|
||||
} else {
|
||||
// The indices are out of order or out of range: append a
// diagnostic listing them instead of slicing the text.
appendTo.append("INVALID Transliterator.Position {cs=" +
|
||||
pos.contextStart + ", s=" + pos.start + ", l=" +
|
||||
pos.limit + ", cl=" + pos.contextLimit + "} on " +
|
||||
input);
|
||||
}
|
||||
// Hand back the same buffer that was passed in.
return appendTo;
|
||||
}
|
||||
|
||||
void expectAux(String tag, String source,
|
||||
String result, String expectedResult) {
|
||||
expectAux(tag, source + " -> " + result,
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/ReplaceableString.java,v $
|
||||
* $Date: 2000/04/25 17:17:37 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2001/10/03 00:14:22 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -24,7 +24,7 @@ package com.ibm.text;
|
||||
*
|
||||
* @see Replaceable
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: ReplaceableString.java,v $ $Revision: 1.3 $ $Date: 2000/04/25 17:17:37 $
|
||||
* @version $RCSfile: ReplaceableString.java,v $ $Revision: 1.4 $ $Date: 2001/10/03 00:14:22 $
|
||||
*/
|
||||
public class ReplaceableString implements Replaceable {
|
||||
private StringBuffer buf;
|
||||
@ -67,6 +67,13 @@ public class ReplaceableString implements Replaceable {
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the substring of this object's text from start (inclusive) to limit (exclusive).
|
||||
*/
|
||||
public String substring(int start, int limit) {
|
||||
// Delegates to the underlying StringBuffer: start is inclusive,
// limit is exclusive.
return buf.substring(start, limit);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of characters contained in this object.
|
||||
* <code>Replaceable</code> API.
|
||||
|
@ -4,9 +4,9 @@
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/RuleBasedTransliterator.java,v $
|
||||
* $Date: 2001/09/26 18:00:06 $
|
||||
* $Revision: 1.46 $
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/RuleBasedTransliterator.java,v $
|
||||
* $Date: 2001/10/03 00:14:22 $
|
||||
* $Revision: 1.47 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -27,18 +27,18 @@ import com.ibm.text.resources.ResourceReader;
|
||||
* Whitespace, as defined by <code>Character.isWhitespace()</code>,
|
||||
* is ignored. If the first non-blank character on a line is '#',
|
||||
* the entire line is ignored as a comment. </p>
|
||||
*
|
||||
*
|
||||
* <p>Each set of rules consists of two groups, one forward, and one
|
||||
* reverse. This is a convention that is not enforced; rules for one
|
||||
* direction may be omitted, with the result that translations in
|
||||
* that direction will not modify the source text. In addition,
|
||||
* bidirectional forward-reverse rules may be specified for
|
||||
* symmetrical transformations.</p>
|
||||
*
|
||||
*
|
||||
* <p><b>Rule syntax</b> </p>
|
||||
*
|
||||
*
|
||||
* <p>Rule statements take one of the following forms: </p>
|
||||
*
|
||||
*
|
||||
* <dl>
|
||||
* <dt><code>$alefmadda=\u0622;</code></dt>
|
||||
* <dd><strong>Variable definition.</strong> The name on the
|
||||
@ -66,7 +66,7 @@ import com.ibm.text.resources.ResourceReader;
|
||||
* the string on the left when performing reverse
|
||||
* transliteration.</dd>
|
||||
* </dl>
|
||||
*
|
||||
*
|
||||
* <dl>
|
||||
* <dt><code>ai<>$alefmadda;</code></dt>
|
||||
* <dd><strong>Bidirectional translation rule.</strong> This
|
||||
@ -75,7 +75,7 @@ import com.ibm.text.resources.ResourceReader;
|
||||
* transliteration, and vice versa when performing reverse
|
||||
* transliteration.</dd>
|
||||
* </dl>
|
||||
*
|
||||
*
|
||||
* <p>Translation rules consist of a <em>match pattern</em> and an <em>output
|
||||
* string</em>. The match pattern consists of literal characters,
|
||||
* optionally preceded by context, and optionally followed by
|
||||
@ -92,7 +92,7 @@ import com.ibm.text.resources.ResourceReader;
|
||||
* (or "<code>123}456</code>") in which the literal
|
||||
* pattern "<code>123</code>" must be followed by "<code>456</code>".
|
||||
* </p>
|
||||
*
|
||||
*
|
||||
* <p>The output string of a forward or reverse rule consists of
|
||||
* characters to replace the literal pattern characters. If the
|
||||
* output string contains the character '<code>|</code>', this is
|
||||
@ -102,59 +102,59 @@ import com.ibm.text.resources.ResourceReader;
|
||||
* placed within the replacement text; however, it can actually be
|
||||
* placed into the preceding or following context by using the
|
||||
* special character '<code>@</code>'. Examples:</p>
|
||||
*
|
||||
*
|
||||
* <blockquote>
|
||||
* <p><code>a {foo} z > | @ bar; # foo -> bar, move cursor
|
||||
* before a<br>
|
||||
* {foo} xyz > bar @@|; # foo -> bar, cursor between
|
||||
* y and z</code></p>
|
||||
* </blockquote>
|
||||
*
|
||||
*
|
||||
* <p><b>UnicodeSet</b></p>
|
||||
*
|
||||
*
|
||||
* <p><code>UnicodeSet</code> patterns may appear anywhere that
|
||||
* makes sense. They may appear in variable definitions.
|
||||
* Contrariwise, <code>UnicodeSet</code> patterns may themselves
|
||||
* contain variable references, such as "<code>$a=[a-z];$not_a=[^$a]</code>",
|
||||
* or "<code>$range=a-z;$ll=[$range]</code>".</p>
|
||||
*
|
||||
*
|
||||
* <p><code>UnicodeSet</code> patterns may also be embedded directly
|
||||
* into rule strings. Thus, the following two rules are equivalent:</p>
|
||||
*
|
||||
*
|
||||
* <blockquote>
|
||||
* <p><code>$vowel=[aeiou]; $vowel>'*'; # One way to do this<br>
|
||||
* [aeiou]>'*';
|
||||
* #
|
||||
* Another way</code></p>
|
||||
* </blockquote>
|
||||
*
|
||||
*
|
||||
* <p>See {@link UnicodeSet} for more documentation and examples.</p>
|
||||
*
|
||||
*
|
||||
* <p><b>Segments</b></p>
|
||||
*
|
||||
*
|
||||
* <p>Segments of the input string can be matched and copied to the
|
||||
* output string. This makes certain sets of rules simpler and more
|
||||
* general, and makes reordering possible. For example:</p>
|
||||
*
|
||||
*
|
||||
* <blockquote>
|
||||
* <p><code>([a-z]) > $1 $1;
|
||||
* #
|
||||
* double lowercase letters<br>
|
||||
* ([:Lu:]) ([:Ll:]) > $2 $1; # reverse order of Lu-Ll pairs</code></p>
|
||||
* </blockquote>
|
||||
*
|
||||
*
|
||||
* <p>The segment of the input string to be copied is delimited by
|
||||
* "<code>(</code>" and "<code>)</code>". Up to
|
||||
* nine segments may be defined. Segments may not overlap. In the
|
||||
* output string, "<code>$1</code>" through "<code>$9</code>"
|
||||
* represent the input string segments, in left-to-right order of
|
||||
* definition.</p>
|
||||
*
|
||||
*
|
||||
* <p><b>Anchors</b></p>
|
||||
*
|
||||
*
|
||||
* <p>Patterns can be anchored to the beginning or the end of the text. This is done with the
|
||||
* special characters '<code>^</code>' and '<code>$</code>'. For example:</p>
|
||||
*
|
||||
*
|
||||
* <blockquote>
|
||||
* <p><code>^ a > 'BEG_A'; # match 'a' at start of text<br>
|
||||
* a > 'A'; # match other instances
|
||||
@ -163,24 +163,24 @@ import com.ibm.text.resources.ResourceReader;
|
||||
* z > 'Z'; # match other instances
|
||||
* of 'z'</code></p>
|
||||
* </blockquote>
|
||||
*
|
||||
*
|
||||
* <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>.
|
||||
* This is done by including a virtual anchor character '<code>$</code>' at the end of the
|
||||
* set pattern. Although this is usually the match character for the end anchor, the set will
|
||||
* match either the beginning or the end of the text, depending on its placement. For
|
||||
* example:</p>
|
||||
*
|
||||
*
|
||||
* <blockquote>
|
||||
* <p><code>$x = [a-z$]; # match 'a' through 'z' OR anchor<br>
|
||||
* $x 1 > 2; # match '1' after a-z or at the start<br>
|
||||
* 3 $x > 4; # match '3' before a-z or at the end</code></p>
|
||||
* </blockquote>
|
||||
*
|
||||
*
|
||||
* <p><b>Example</b> </p>
|
||||
*
|
||||
*
|
||||
* <p>The following example rules illustrate many of the features of
|
||||
* the rule language. </p>
|
||||
*
|
||||
*
|
||||
* <table border="0" cellpadding="4">
|
||||
* <tr>
|
||||
* <td valign="top">Rule 1.</td>
|
||||
@ -195,10 +195,10 @@ import com.ibm.text.resources.ResourceReader;
|
||||
* <td valign="top" nowrap><code>yz>q</code></td>
|
||||
* </tr>
|
||||
* </table>
|
||||
*
|
||||
*
|
||||
* <p>Applying these rules to the string "<code>adefabcdefz</code>"
|
||||
* yields the following results: </p>
|
||||
*
|
||||
*
|
||||
* <table border="0" cellpadding="4">
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>|adefabcdefz</code></td>
|
||||
@ -251,23 +251,23 @@ import com.ibm.text.resources.ResourceReader;
|
||||
* transliteration is complete.</td>
|
||||
* </tr>
|
||||
* </table>
|
||||
*
|
||||
*
|
||||
* <p>The order of rules is significant. If multiple rules may match
|
||||
* at some point, the first matching rule is applied. </p>
|
||||
*
|
||||
*
|
||||
* <p>Forward and reverse rules may have an empty output string.
|
||||
* Otherwise, an empty left or right hand side of any statement is a
|
||||
* syntax error. </p>
|
||||
*
|
||||
*
|
||||
* <p>Single quotes are used to quote any character other than a
|
||||
* digit or letter. To specify a single quote itself, inside or
|
||||
* outside of quotes, use two single quotes in a row. For example,
|
||||
* the rule "<code>'>'>o''clock</code>" changes the
|
||||
* string "<code>></code>" to the string "<code>o'clock</code>".
|
||||
* </p>
|
||||
*
|
||||
*
|
||||
* <p><b>Notes</b> </p>
|
||||
*
|
||||
*
|
||||
* <p>While a RuleBasedTransliterator is being built, it checks that
|
||||
* the rules are added in proper order. For example, if the rule
|
||||
* "a>x" is followed by the rule "ab>y",
|
||||
@ -275,11 +275,11 @@ import com.ibm.text.resources.ResourceReader;
|
||||
* the second rule can never be triggered, since the first rule
|
||||
* always matches anything it matches. In other words, the first
|
||||
* rule <em>masks</em> the second rule. </p>
|
||||
*
|
||||
*
|
||||
* <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
|
||||
*
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.46 $ $Date: 2001/09/26 18:00:06 $
|
||||
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.47 $ $Date: 2001/10/03 00:14:22 $
|
||||
*/
|
||||
public class RuleBasedTransliterator extends Transliterator {
|
||||
|
||||
@ -433,7 +433,7 @@ public class RuleBasedTransliterator extends Transliterator {
|
||||
* stored in the rule text to represent the set of characters.
|
||||
* variables[i] represents character (variablesBase + i).
|
||||
*/
|
||||
UnicodeSet[] variables;
|
||||
UnicodeMatcher[] variables;
|
||||
|
||||
/**
|
||||
* The character that represents variables[0]. Characters
|
||||
@ -498,6 +498,9 @@ public class RuleBasedTransliterator extends Transliterator {
|
||||
|
||||
/**
|
||||
* $Log: RuleBasedTransliterator.java,v $
|
||||
* Revision 1.47 2001/10/03 00:14:22 alan
|
||||
* jitterbug 73: finish quantifier and supplemental char support
|
||||
*
|
||||
* Revision 1.46 2001/09/26 18:00:06 alan
|
||||
* jitterbug 67: sync parser with icu4c, allow unlimited, nested segments
|
||||
*
|
||||
|
@ -4,9 +4,9 @@
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $
|
||||
* $Date: 2001/09/26 18:00:06 $
|
||||
* $Revision: 1.28 $
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $
|
||||
* $Date: 2001/10/03 00:14:23 $
|
||||
* $Revision: 1.29 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -28,7 +28,7 @@ import com.ibm.util.Utility;
|
||||
* may contain variables. Variables represent a set of Unicode
|
||||
* characters, such as the letters <i>a</i> through <i>z</i>.
|
||||
* Variables are detected by looking up each character in a supplied
|
||||
* variable list to see if it has been so defined.
|
||||
* variable list to see if it has been so defined.
|
||||
*
|
||||
* <p>A rule may contain segments in its input string and segment references in
|
||||
* its output string. A segment is a substring of the input pattern, indicated
|
||||
@ -44,7 +44,7 @@ import com.ibm.util.Utility;
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.28 $ $Date: 2001/09/26 18:00:06 $
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.29 $ $Date: 2001/10/03 00:14:23 $
|
||||
*/
|
||||
class TransliterationRule {
|
||||
|
||||
@ -310,7 +310,7 @@ class TransliterationRule {
|
||||
* r1: aakkkpppp
|
||||
* r2: aaakkkkkpppp
|
||||
* ^
|
||||
*
|
||||
*
|
||||
* The strings must be aligned at the first character of the
|
||||
* key. The length of r1 to the left of the alignment point
|
||||
* must be <= the length of r2 to the left; ditto for the
|
||||
@ -346,10 +346,10 @@ class TransliterationRule {
|
||||
int left2 = r2.anteContextLength;
|
||||
int right = pattern.length() - left;
|
||||
int right2 = r2.pattern.length() - left2;
|
||||
|
||||
|
||||
// TODO Clean this up -- some logic might be combinable with the
|
||||
// next statement.
|
||||
|
||||
|
||||
// Test for anchor masking
|
||||
if (left == left2 && right == right2 &&
|
||||
keyLength <= r2.keyLength &&
|
||||
@ -371,7 +371,7 @@ class TransliterationRule {
|
||||
pos - UTF16.getCharCount(UTF16.charAt(str, pos-1)) :
|
||||
pos - 1;
|
||||
}
|
||||
|
||||
|
||||
static final int posAfter(Replaceable str, int pos) {
|
||||
return (pos >= 0 && pos < str.length()) ?
|
||||
pos + UTF16.getCharCount(UTF16.charAt(str, pos)) :
|
||||
@ -387,10 +387,10 @@ class TransliterationRule {
|
||||
* context and key characters match, but the text is not long
|
||||
* enough to match all of them. A full match means all context
|
||||
* and key characters match.
|
||||
*
|
||||
*
|
||||
* If a full match is obtained, perform a replacement, update pos,
|
||||
* and return U_MATCH. Otherwise both text and pos are unchanged.
|
||||
*
|
||||
*
|
||||
* @param text the text
|
||||
* @param pos the position indices
|
||||
* @param incremental if TRUE, test for partial matches that may
|
||||
@ -559,13 +559,13 @@ class TransliterationRule {
|
||||
if (segments == null) {
|
||||
text.replace(pos.start, keyLimit, output);
|
||||
lenDelta = output.length() - (keyLimit - pos.start);
|
||||
if (cursorPos >= 0 && cursorPos < keyLength) {
|
||||
// Within the key, the cursor refers to 16-bit code units
|
||||
if (cursorPos >= 0 && cursorPos <= output.length()) {
|
||||
// Within the output string, the cursor refers to 16-bit code units
|
||||
newStart = pos.start + cursorPos;
|
||||
} else {
|
||||
newStart = pos.start;
|
||||
int n = cursorPos;
|
||||
// Outside the key, cursorPos counts code points
|
||||
// Outside the output string, cursorPos counts code points
|
||||
while (n > 0) {
|
||||
newStart += UTF16.getCharCount(UTF16.charAt(text, newStart));
|
||||
--n;
|
||||
@ -638,7 +638,7 @@ class TransliterationRule {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
oText += lenDelta;
|
||||
pos.limit += lenDelta;
|
||||
pos.contextLimit += lenDelta;
|
||||
@ -665,11 +665,11 @@ class TransliterationRule {
|
||||
* cleared out by, at the end, calling this method with a literal
|
||||
* character.
|
||||
*/
|
||||
protected void appendToRule(StringBuffer rule,
|
||||
int c,
|
||||
boolean isLiteral,
|
||||
boolean escapeUnprintable,
|
||||
StringBuffer quoteBuf) {
|
||||
static void appendToRule(StringBuffer rule,
|
||||
int c,
|
||||
boolean isLiteral,
|
||||
boolean escapeUnprintable,
|
||||
StringBuffer quoteBuf) {
|
||||
// If we are escaping unprintables, then escape them outside
|
||||
// quotes. <backslash>u and <backslash>U are not recognized within quotes. The same
|
||||
// logic applies to literals, but literals are never escaped.
|
||||
@ -745,11 +745,11 @@ class TransliterationRule {
|
||||
//System.out.println("rule=" + rule.toString() + " qb=" + quoteBuf.toString());
|
||||
}
|
||||
|
||||
protected final void appendToRule(StringBuffer rule,
|
||||
String text,
|
||||
boolean isLiteral,
|
||||
boolean escapeUnprintable,
|
||||
StringBuffer quoteBuf) {
|
||||
static final void appendToRule(StringBuffer rule,
|
||||
String text,
|
||||
boolean isLiteral,
|
||||
boolean escapeUnprintable,
|
||||
StringBuffer quoteBuf) {
|
||||
for (int i=0; i<text.length(); ++i) {
|
||||
appendToRule(rule, text.charAt(i), isLiteral, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
@ -764,7 +764,7 @@ class TransliterationRule {
|
||||
*/
|
||||
public String toRule(boolean escapeUnprintable) {
|
||||
int i;
|
||||
|
||||
|
||||
StringBuffer rule = new StringBuffer();
|
||||
|
||||
// iseg indexes into segments[] directly (not offset from FSPI)
|
||||
@ -863,7 +863,7 @@ class TransliterationRule {
|
||||
if (show) {
|
||||
rule.append((char)(48+d));
|
||||
}
|
||||
}
|
||||
}
|
||||
rule.append(' ');
|
||||
}
|
||||
}
|
||||
@ -905,6 +905,9 @@ class TransliterationRule {
|
||||
|
||||
/**
|
||||
* $Log: TransliterationRule.java,v $
|
||||
* Revision 1.29 2001/10/03 00:14:23 alan
|
||||
* jitterbug 73: finish quantifier and supplemental char support
|
||||
*
|
||||
* Revision 1.28 2001/09/26 18:00:06 alan
|
||||
* jitterbug 67: sync parser with icu4c, allow unlimited, nested segments
|
||||
*
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/Transliterator.java,v $
|
||||
* $Date: 2001/09/28 20:37:09 $
|
||||
* $Revision: 1.43 $
|
||||
* $Date: 2001/10/03 00:14:23 $
|
||||
* $Revision: 1.44 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -241,7 +241,7 @@ import com.ibm.util.CaseInsensitiveString;
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: Transliterator.java,v $ $Revision: 1.43 $ $Date: 2001/09/28 20:37:09 $
|
||||
* @version $RCSfile: Transliterator.java,v $ $Revision: 1.44 $ $Date: 2001/10/03 00:14:23 $
|
||||
*/
|
||||
public abstract class Transliterator {
|
||||
/**
|
||||
@ -553,6 +553,15 @@ public abstract class Transliterator {
|
||||
index.contextLimit += insertion.length();
|
||||
}
|
||||
|
||||
char last = (text.length() > 0) ?
|
||||
text.charAt(text.length() - 1) : 0;
|
||||
if (UTF16.isLeadSurrogate(last)) {
|
||||
// Oops, the caller passed us a single lead surrogate at the
|
||||
// end of the insertion. Don't transliterate until more text
|
||||
// comes in.
|
||||
return;
|
||||
}
|
||||
|
||||
filteredTransliterate(text, index, true);
|
||||
|
||||
// This doesn't work once we add quantifier support. Need to rewrite
|
||||
|
@ -142,6 +142,10 @@ class TransliteratorParser {
|
||||
private static final char CURSOR_OFFSET = '@';
|
||||
private static final char ANCHOR_START = '^';
|
||||
|
||||
private static final char KLEENE_STAR = '*';
|
||||
private static final char ONE_OR_MORE = '+';
|
||||
private static final char ZERO_OR_ONE = '?';
|
||||
|
||||
// By definition, the ANCHOR_END special character is a
|
||||
// trailing SymbolTable.SYMBOL_REF character.
|
||||
// private static final char ANCHOR_END = '$';
|
||||
@ -382,7 +386,7 @@ class TransliteratorParser {
|
||||
idBlock = idBlockResult.toString();
|
||||
|
||||
// Convert the set vector to an array
|
||||
data.variables = new UnicodeSet[variablesVector.size()];
|
||||
data.variables = new UnicodeMatcher[variablesVector.size()];
|
||||
variablesVector.copyInto(data.variables);
|
||||
variablesVector = null;
|
||||
|
||||
@ -658,7 +662,7 @@ class TransliteratorParser {
|
||||
int varStart = -1; // Most recent $variableReference
|
||||
int varLimit = -1;
|
||||
int[] iref = new int[1];
|
||||
|
||||
|
||||
main:
|
||||
while (pos < limit && !done) {
|
||||
char c = rule.charAt(pos++);
|
||||
@ -853,56 +857,71 @@ class TransliteratorParser {
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
// TODO Add quantifier parsing
|
||||
|
||||
case KLEENE_STAR:
|
||||
case ONE_OR_MORE:
|
||||
case ZERO_OR_ONE:
|
||||
// Quantifiers. We handle single characters, quoted strings,
|
||||
// variable references, and segments.
|
||||
// a+ matches aaa
|
||||
// 'foo'+ matches foofoofoo
|
||||
// $v+ matches xyxyxy if $v == xy
|
||||
// (seg)+ matches segsegseg
|
||||
{
|
||||
int qstart, qlimit;
|
||||
boolean[] isOpenParen = new boolean[1];
|
||||
boolean isSegment = false;
|
||||
if (segments != null &&
|
||||
segments.getLastParenOffset(isOpenParen) == buf.length()) {
|
||||
// The */+ immediately follows a segment
|
||||
if (isOpenParen[0]) {
|
||||
syntaxError("Misplaced quantifier", rule, start);
|
||||
}
|
||||
int[] startparam = new int[1];
|
||||
int[] limitparam = new int[1];
|
||||
if (!segments.extractLastParenSubstring(startparam, limitparam)) {
|
||||
syntaxError("Mismatched segment delimiters", rule, start);
|
||||
}
|
||||
qstart = startparam[0];
|
||||
qlimit = limitparam[0];
|
||||
isSegment = true;
|
||||
} else {
|
||||
// The */+ follows an isolated character or quote
|
||||
// or variable reference
|
||||
if (buf.length() == quoteLimit) {
|
||||
// The */+ follows a 'quoted string'
|
||||
qstart = quoteStart;
|
||||
qlimit = quoteLimit;
|
||||
} else if (buf.length() == varLimit) {
|
||||
// The */+ follows a $variableReference
|
||||
qstart = varStart;
|
||||
qlimit = varLimit;
|
||||
} else {
|
||||
// The */+ follows a single character
|
||||
qstart = buf.length() - 1;
|
||||
qlimit = qstart + 1;
|
||||
}
|
||||
}
|
||||
UnicodeMatcher m =
|
||||
new StringMatcher(buf.toString(), qstart, qlimit,
|
||||
isSegment, parser.data);
|
||||
int min = 0;
|
||||
int max = Quantifier.MAX;
|
||||
switch (c) {
|
||||
case ONE_OR_MORE:
|
||||
min = 1;
|
||||
break;
|
||||
case ZERO_OR_ONE:
|
||||
min = 0;
|
||||
max = 1;
|
||||
break;
|
||||
// case KLEENE_STAR:
|
||||
// do nothing -- min, max already set
|
||||
}
|
||||
m = new Quantifier(m, min, max);
|
||||
buf.setLength(qstart);
|
||||
buf.append(parser.generateStandInFor(m));
|
||||
}
|
||||
break;
|
||||
// case SET_CLOSE:
|
||||
default:
|
||||
// Disallow unquoted characters other than [0-9A-Za-z]
|
||||
@ -947,7 +966,7 @@ class TransliteratorParser {
|
||||
//----------------------------------------------------------------------
|
||||
// END RuleHalf
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
|
||||
/**
|
||||
* MAIN PARSER. Parse the next rule in the given rule string, starting
|
||||
* at pos. Return the index after the last character parsed. Do not
|
||||
|
Loading…
Reference in New Issue
Block a user