ICU-60 implement toPattern in UnicodeSet; update UnicodeFilter.contains to take an int; update UnicodeSet to support code points to U+10FFFF

X-SVN-Rev: 5904
This commit is contained in:
Alan Liu 2001-09-24 19:57:18 +00:00
parent 301464ad59
commit a01b74ee85
22 changed files with 980 additions and 424 deletions

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/CompoundTransliteratorTest.java,v $
* $Date: 2001/09/08 01:17:50 $
* $Revision: 1.2 $
* $Date: 2001/09/24 19:56:41 $
* $Revision: 1.3 $
*
*****************************************************************************************
*/
@ -111,7 +111,7 @@ public class CompoundTransliteratorTest extends TestFmwk {
public void TestGetTransliterator(){
logln("Testing the getTransliterator() API of CompoundTransliterator");
String ID="Latin-Greek;Greek-Latin;Latin-Devanagari;Devanagari-Latin;Latin-Cyrillic;Cyrillic-Latin;Unicode-Hex;Hex-Unicode";
String ID="Latin-Greek;Greek-Latin;Latin-Devanagari;Devanagari-Latin;Latin-Cyrillic;Cyrillic-Latin;Any-Hex;Hex-Any";
CompoundTransliterator ct1=null;
try{
ct1=new CompoundTransliterator(ID);
@ -141,9 +141,9 @@ public class CompoundTransliteratorTest extends TestFmwk {
logln("Testing the handleTransliterate() API of CompoundTransliterator");
CompoundTransliterator ct1=null;
try{
ct1=new CompoundTransliterator("Unicode-Hex;Hex-Unicode");
ct1=new CompoundTransliterator("Any-Hex;Hex-Any");
}catch(IllegalArgumentException iae){
errln("FAIL: construction using CompoundTransliterator(String ID) failed for " + "Unicode-Hex;Hex-Unicode");
errln("FAIL: construction using CompoundTransliterator(String ID) failed for " + "Any-Hex;Hex-Any");
throw iae;
}
@ -167,8 +167,8 @@ public class CompoundTransliteratorTest extends TestFmwk {
String Data[]={
//ID, input string, transliterated string
"Unicode-Hex;Hex-Unicode;Unicode-Hex", "hello", "\\u0068\\u0065\\u006C\\u006C\\u006F",
"Unicode-Hex;Hex-Unicode", "hello! How are you?", "hello! How are you?",
"Any-Hex;Hex-Any;Any-Hex", "hello", "\\u0068\\u0065\\u006C\\u006C\\u006F",
"Any-Hex;Hex-Any", "hello! How are you?", "hello! How are you?",
"Devanagari-Latin;Latin-Devanagari", "\u092D\u0948'\u0930'\u0935", "\u092D\u0948\u0930\u0935", // quotes lost
"Latin-Cyrillic;Cyrillic-Latin", "a'b'k'd'e'f'g'h'i'j'Shch'shch'zh'h", "abkdefghijShchshchzhh",
"Latin-Greek;Greek-Latin", "ABGabgAKLMN", "ABGabgAKLMN",

View File

@ -6,8 +6,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/Attic/HexToUnicodeTransliteratorTest.java,v $
* $Date: 2000/10/09 16:32:07 $
* $Revision: 1.2 $
* $Date: 2001/09/24 19:56:41 $
* $Revision: 1.3 $
*
*****************************************************************************************
*/
@ -33,7 +33,7 @@ public class HexToUnicodeTransliteratorTest extends TestFmwk {
* Used by TestConstruction() and TestTransliterate.
*/
UnicodeFilter HexFilter=new UnicodeFilter() {
public boolean contains(char c) {
public boolean contains(int c) {
if(c == 0x0061 || c == 0x0063 )
return false;
else

View File

@ -4,9 +4,9 @@
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
* $Date: 2001/09/21 21:23:34 $
* $Revision: 1.45 $
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
* $Date: 2001/09/24 19:56:41 $
* $Revision: 1.46 $
*
*****************************************************************************************
*/
@ -48,14 +48,12 @@ public class TransliteratorTest extends TestFmwk {
throw ex;
}
// TODO remove check for class when we implement full
// toRules().
if (t != null && t instanceof RuleBasedTransliterator) {
if (t != null) {
// Now test toRules
String rules = null;
try {
rules = ((RuleBasedTransliterator)t).toRules(true);
rules = t.toRules(true);
Transliterator u = Transliterator.createFromRules("x",
rules, Transliterator.FORWARD);
} catch (IllegalArgumentException ex2) {
@ -74,7 +72,7 @@ public class TransliteratorTest extends TestFmwk {
} catch (IllegalArgumentException ex) {
logln("OK: Bogus ID handled properly");
}
ms = System.currentTimeMillis() - ms;
logln("Elapsed time: " + ms + " ms");
}
@ -223,7 +221,7 @@ public class TransliteratorTest extends TestFmwk {
* Basic test of keyboard.
*/
public void TestKeyboard() {
Transliterator t = new RuleBasedTransliterator("<ID>",
Transliterator t = new RuleBasedTransliterator("<ID>",
"psch>Y;"
+"ps>y;"
+"ch>x;"
@ -246,7 +244,7 @@ public class TransliteratorTest extends TestFmwk {
* Basic test of keyboard with cursor.
*/
public void TestKeyboard2() {
Transliterator t = new RuleBasedTransliterator("<ID>",
Transliterator t = new RuleBasedTransliterator("<ID>",
"ych>Y;"
+"ps>|y;"
+"ch>x;"
@ -381,7 +379,7 @@ public class TransliteratorTest extends TestFmwk {
public void TestFiltering() {
Transliterator hex = Transliterator.getInstance("Any-Hex");
hex.setFilter(new UnicodeFilter() {
public boolean contains(char c) {
public boolean contains(int c) {
return c != 'c';
}
});
@ -398,7 +396,7 @@ public class TransliteratorTest extends TestFmwk {
/**
* Test anchors
*/
public void TestAnchors() {
public void TestAnchors() {
expect("^ab > 01 ;" +
" ab > |8 ;" +
" b > k ;" +
@ -406,7 +404,7 @@ public class TransliteratorTest extends TestFmwk {
" 8x > 77 ;",
"ababbabxabx",
"018k7745");
"018k7745");
expect("$s = [z$] ;" +
"$s{ab > 01 ;" +
" ab > |8 ;" +
@ -440,7 +438,7 @@ public class TransliteratorTest extends TestFmwk {
*/
public void TestJ277() {
Transliterator gl = Transliterator.getInstance("Greek-Latin");
char sigma = (char)0x3C3;
char upsilon = (char)0x3C5;
char nu = (char)0x3BD;
@ -517,7 +515,7 @@ public class TransliteratorTest extends TestFmwk {
// Try a custom Hex-Any
// \\uXXXX and &#xXXXX;
HexToUnicodeTransliterator hex2 = new HexToUnicodeTransliterator("\\\\u###0;&\\#x###0\\;");
HexToUnicodeTransliterator hex2 = new HexToUnicodeTransliterator("\\\\u###0;&\\#x###0\\;");
expect(hex2, "\\u61\\u062\\u0063\\u00645\\u66x&#x30;&#x031;&#x0032;&#x00033;",
"abcd5fx012&#x00033;");
@ -740,7 +738,7 @@ public class TransliteratorTest extends TestFmwk {
if (!got.equals(exp)) {
errln("FAIL: Inverse of " + ID + " is " + got +
", expected " + exp);
}
}
}
/**
@ -752,18 +750,18 @@ public class TransliteratorTest extends TestFmwk {
"Hex[aeiou]-Any",
"quizzical",
"q\\u0075\\u0069zz\\u0069c\\u0061l",
"Any[aeiou]-Hex;Hex[^5]-Any",
"Any[^5]-Hex;Hex[aeiou]-Any",
"quizzical",
"q\\u0075izzical",
"Null[abc]",
"Null[abc]",
"xyz",
"xyz",
};
for (int i=0; i<DATA.length; i+=4) {
String ID = DATA[i];
Transliterator t = Transliterator.getInstance(ID);
@ -796,6 +794,109 @@ public class TransliteratorTest extends TestFmwk {
"Th qck brwn fx.");
}
/**
 * Test Transliterator.toRules() and UnicodeSet.toPattern().
 *
 * DATA is a flat array of triples: a kind tag (RBT or SET), the
 * source rules or pattern, and the expected escaped output.  For RBT
 * entries a transliterator is built from the rules and both
 * toRules(false) and toRules(true) are checked; the unescaped
 * expectation is derived with Utility.unescape().  For SET entries a
 * UnicodeSet is built from the pattern and toPattern(true) is checked.
 */
public void TestToRules() {
    String RBT = "rbt";
    String SET = "set";
    String[] DATA = {
        RBT,
        "$a=\\u4E61; [$a] > A;",
        "[\\u4E61] > A;",
        RBT,
        "$white=[[:Zs:][:Zl:]]; $white{a} > A;",
        "[[:Zs:][:Zl:]]{a} > A;",
        SET,
        "[[:Zs:][:Zl:]]",
        "[[:Zs:][:Zl:]]",
        SET,
        "[:Ps:]",
        "[:Ps:]",
        SET,
        "[:L:]",
        "[:L:]",
        SET,
        "[[:L:]-[A]]",
        "[[:L:]-[A]]",
        SET,
        "[~[:Lu:][:Ll:]]",
        "[~[:Lu:][:Ll:]]",
        SET,
        "[~[a-z]]",
        "[~[a-z]]",
        RBT,
        "$white=[:Zs:]; $black=[^$white]; $black{a} > A;",
        "[^[:Zs:]]{a} > A;",
        RBT,
        "$a=[:Zs:]; $b=[[a-z]-$a]; $b{a} > A;",
        "[[a-z]-[:Zs:]]{a} > A;",
        RBT,
        "$a=[:Zs:]; $b=[$a&[a-z]]; $b{a} > A;",
        "[[:Zs:]&[a-z]]{a} > A;",
        RBT,
        "$a=[:Zs:]; $b=[x$a]; $b{a} > A;",
        "[x[:Zs:]]{a} > A;",
    };
    for (int d=0; d < DATA.length; d+=3) {
        // Compare by value rather than by reference so the dispatch
        // does not depend on the tag being the very same String object.
        if (RBT.equals(DATA[d])) {
            // Transliterator test
            Transliterator t = Transliterator.createFromRules("ID",
                DATA[d+1], Transliterator.FORWARD);
            if (t == null) {
                errln("FAIL: createFromRules failed");
                return;
            }
            String rules, escapedRules;
            rules = t.toRules(false);
            escapedRules = t.toRules(true);
            String expRules = Utility.unescape(DATA[d+2]);
            String expEscapedRules = DATA[d+2];
            if (rules.equals(expRules)) {
                logln("Ok: " + DATA[d+1] +
                      " => " + Utility.escape(rules));
            } else {
                errln("FAIL: " + DATA[d+1] +
                      " => " + Utility.escape(rules + ", exp " + expRules));
            }
            if (escapedRules.equals(expEscapedRules)) {
                logln("Ok: " + DATA[d+1] +
                      " => " + escapedRules);
            } else {
                errln("FAIL: " + DATA[d+1] +
                      " => " + escapedRules + ", exp " + expEscapedRules);
            }
        } else {
            // UnicodeSet test
            String pat = DATA[d+1];
            String expToPat = DATA[d+2];
            UnicodeSet set = new UnicodeSet(pat);
            // Adjust spacing etc. as necessary.
            String toPat;
            toPat = set.toPattern(true);
            if (expToPat.equals(toPat)) {
                logln("Ok: " + pat +
                      " => " + toPat);
            } else {
                // Report the value actually compared against (expToPat),
                // not the input pattern.
                errln("FAIL: " + pat +
                      " => " + Utility.escape(toPat) +
                      ", exp " + Utility.escape(expToPat));
            }
        }
    }
}
/**
* Test the case mapping transliterators.
*/
@ -806,7 +907,7 @@ public class TransliteratorTest extends TestFmwk {
Transliterator.getInstance("Any-Lower[^xyzXYZ]");
Transliterator toTitle =
Transliterator.getInstance("Any-Title[^xyzXYZ]");
expect(toUpper, "The quick brown fox jumped over the lazy dogs.",
"THE QUICK BROWN FOx JUMPED OVER THE LAzy DOGS.");
expect(toLower, "The quIck brown fOX jUMPED OVER THE LAzY dogs.",
@ -857,7 +958,7 @@ public class TransliteratorTest extends TestFmwk {
errln("FAIL: " + DATA[i+2] +
" create ID \"" + DATA[i] + "\" => \"" +
t.getID() + "\", exp \"" + DATA[i+1] + "\"");
}
}
} catch (IllegalArgumentException e) {
errln("FAIL: " + DATA[i+2] +
" create ID \"" + DATA[i] + "\"");
@ -875,52 +976,52 @@ public class TransliteratorTest extends TestFmwk {
// Input Decomposed Composed
{"cat", "cat", "cat" },
{"\u00e0ardvark", "a\u0300ardvark", "\u00e0ardvark" },
{"\u1e0a", "D\u0307", "\u1e0a" }, // D-dot_above
{"D\u0307", "D\u0307", "\u1e0a" }, // D dot_above
{"\u1e0c\u0307", "D\u0323\u0307", "\u1e0c\u0307" }, // D-dot_below dot_above
{"\u1e0a\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, // D-dot_above dot_below
{"D\u0307\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, // D dot_below dot_above
{"\u1e10\u0307\u0323", "D\u0327\u0323\u0307","\u1e10\u0323\u0307"}, // D dot_below cedilla dot_above
{"D\u0307\u0328\u0323","D\u0328\u0323\u0307","\u1e0c\u0328\u0307"}, // D dot_above ogonek dot_below
{"\u1E14", "E\u0304\u0300", "\u1E14" }, // E-macron-grave
{"\u0112\u0300", "E\u0304\u0300", "\u1E14" }, // E-macron + grave
{"\u00c8\u0304", "E\u0300\u0304", "\u00c8\u0304" }, // E-grave + macron
{"\u212b", "A\u030a", "\u00c5" }, // angstrom_sign
{"\u00c5", "A\u030a", "\u00c5" }, // A-ring
{"\u00fdffin", "y\u0301ffin", "\u00fdffin" }, //updated with 3.0
{"\u00fd\uFB03n", "y\u0301\uFB03n", "\u00fd\uFB03n" }, //updated with 3.0
{"Henry IV", "Henry IV", "Henry IV" },
{"Henry \u2163", "Henry \u2163", "Henry \u2163" },
{"\u30AC", "\u30AB\u3099", "\u30AC" }, // ga (Katakana)
{"\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, // ka + ten
{"\uFF76\uFF9E", "\uFF76\uFF9E", "\uFF76\uFF9E" }, // hw_ka + hw_ten
{"\u30AB\uFF9E", "\u30AB\uFF9E", "\u30AB\uFF9E" }, // ka + hw_ten
{"\uFF76\u3099", "\uFF76\u3099", "\uFF76\u3099" }, // hw_ka + ten
{"A\u0300\u0316", "A\u0316\u0300", "\u00C0\u0316" },
};
String[][] COMPAT = {
};
String[][] COMPAT = {
// Input Decomposed Composed
{"\uFB4f", "\u05D0\u05DC", "\u05D0\u05DC" }, // Alef-Lamed vs. Alef, Lamed
{"\u00fdffin", "y\u0301ffin", "\u00fdffin" }, //updated for 3.0
{"\u00fd\uFB03n", "y\u0301ffin", "\u00fdffin" }, // ffi ligature -> f + f + i
{"Henry IV", "Henry IV", "Henry IV" },
{"Henry \u2163", "Henry IV", "Henry IV" },
{"\u30AC", "\u30AB\u3099", "\u30AC" }, // ga (Katakana)
{"\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, // ka + ten
{"\uFF76\u3099", "\u30AB\u3099", "\u30AC" }, // hw_ka + ten
};
@ -960,7 +1061,7 @@ public class TransliteratorTest extends TestFmwk {
expect(t, "\u010dx", "c\u030C");
}
}
/**
* Test compound RBT rules.
*/
@ -1116,7 +1217,7 @@ public class TransliteratorTest extends TestFmwk {
append('|').
append(s.substring(index.start));
}
// As a final step in keyboard transliteration, we must call
// transliterate to finish off any pending partial matches that
// were waiting for more input.
@ -1135,7 +1236,7 @@ public class TransliteratorTest extends TestFmwk {
result.equals(expectedResult),
expectedResult);
}
void expectAux(String tag, String summary, boolean pass,
String expectedResult) {
if (pass) {

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/Attic/UnicodeFilterLogicTest.java,v $
* $Date: 2000/10/04 23:12:33 $
* $Revision: 1.2 $
* $Date: 2001/09/24 19:56:41 $
* $Revision: 1.3 $
*
*****************************************************************************************
*/
@ -29,7 +29,7 @@ public class UnicodeFilterLogicTest extends TestFmwk {
UnicodeFilter Filter1=new UnicodeFilter() {
public boolean contains(char c) {
public boolean contains(int c) {
if(c == 0x0061 || c == 0x0041 || c == 0x0063 || c == 0x0043)
return false;
else
@ -37,7 +37,7 @@ public class UnicodeFilterLogicTest extends TestFmwk {
}
};
UnicodeFilter Filter2=new UnicodeFilter() {
public boolean contains(char c) {
public boolean contains(int c) {
if(c == 0x0079 || c == 0x0059 || c == 0x007a || c == 0x005a || c == 0x0061 || c == 0x0063)
return false;
else
@ -47,7 +47,7 @@ public class UnicodeFilterLogicTest extends TestFmwk {
public void TestAllFilters() {
Transliterator t1 = Transliterator.getInstance("Unicode-Hex");
Transliterator t1 = Transliterator.getInstance("Any-Hex");
String source="abcdABCDyzYZ";
//sanity testing without any filter

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java,v $
* $Date: 2001/09/08 01:17:50 $
* $Revision: 1.11 $
* $Date: 2001/09/24 19:56:41 $
* $Revision: 1.12 $
*
*****************************************************************************************
*/
@ -181,8 +181,8 @@ public class UnicodeSetTest extends TestFmwk {
set.clear();
set.applyPattern("[A-Y 1-8 b-d l-y]");
for (int i = 0; i<set.getRangeCount(); ++i) {
char a = set.getRangeStart(i);
char b = set.getRangeEnd(i);
int a = set.getRangeStart(i);
int b = set.getRangeEnd(i);
if (!set.contains(a, b)) {
errln("FAIL, should contain " + (char)a + '-' + (char)b +
" but doesn't: " + set);
@ -219,7 +219,7 @@ public class UnicodeSetTest extends TestFmwk {
if (c.equals(exp)) {
logln("c.complement(): " + c);
} else {
errln("FAIL: c.complement() = " + c + ", expect " + exp);
errln(Utility.escape("FAIL: c.complement() = " + c + ", expect " + exp));
}
c.complement();
exp.set((char)3, (char)15);
@ -252,13 +252,13 @@ public class UnicodeSetTest extends TestFmwk {
public void TestIndexOf() {
UnicodeSet set = new UnicodeSet("[a-cx-y3578]");
for (int i=0; i<set.size(); ++i) {
char c = set.charAt(i);
int c = set.charAt(i);
if (set.indexOf(c) != i) {
errln("FAIL: charAt(" + i + ") = " + c +
" => indexOf() => " + set.indexOf(c));
}
}
char c = set.charAt(set.size());
int c = set.charAt(set.size());
if (c != '\uFFFE') {
errln("FAIL: charAt(<out of range>) = " +
Utility.escape(String.valueOf(c)));

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/Attic/UnicodeToHexTransliteratorTest.java,v $
* $Date: 2000/10/16 16:58:29 $
* $Revision: 1.3 $
* $Date: 2001/09/24 19:56:41 $
* $Revision: 1.4 $
*
*****************************************************************************************
*/
@ -32,7 +32,7 @@ public class UnicodeToHexTransliteratorTest extends TestFmwk {
* Used by TestConstruction() and TestTransliterate.
*/
UnicodeFilter UniFilter=new UnicodeFilter() {
public boolean contains(char c) {
public boolean contains(int c) {
if(c==0x0063 || c==0x0061 || c==0x0043 || c==0x0041)
return false;
else

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java,v $
* $Date: 2001/09/21 21:24:04 $
* $Revision: 1.44 $
* $Date: 2001/09/24 19:57:17 $
* $Revision: 1.45 $
*
*****************************************************************************************
*/
@ -279,7 +279,7 @@ import com.ibm.text.resources.ResourceReader;
* <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
*
* @author Alan Liu
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.44 $ $Date: 2001/09/21 21:24:04 $
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.45 $ $Date: 2001/09/24 19:57:17 $
*/
public class RuleBasedTransliterator extends Transliterator {
@ -542,7 +542,7 @@ public class RuleBasedTransliterator extends Transliterator {
/**
* Implement SymbolTable API.
*/
public UnicodeSet lookupSet(char ch) {
public UnicodeSet lookupSet(int ch) {
// Note that we cannot use data.lookupSet() because the
// set array has not been constructed yet.
int i = ch - data.setVariablesBase;
@ -1579,6 +1579,9 @@ public class RuleBasedTransliterator extends Transliterator {
/**
* $Log: RuleBasedTransliterator.java,v $
* Revision 1.45 2001/09/24 19:57:17 alan
* jitterbug 60: implement toPattern in UnicodeSet; update UnicodeFilter.contains to take an int; update UnicodeSet to support code points to U+10FFFF
*
* Revision 1.44 2001/09/21 21:24:04 alan
* jitterbug 64: allow ::ID blocks in rules
*

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/SymbolTable.java,v $
* $Date: 2000/08/30 20:40:30 $
* $Revision: 1.6 $
* $Date: 2001/09/24 19:57:18 $
* $Revision: 1.7 $
*
*****************************************************************************************
*/
@ -41,8 +41,9 @@ public interface SymbolTable {
/**
* Lookup the UnicodeSet associated with the given character, and
* return it. Return <tt>null</tt> if not found.
* @param ch a 32-bit code point from 0 to 0x10FFFF.
*/
UnicodeSet lookupSet(char ch);
UnicodeSet lookupSet(int ch);
/**
* Parse a symbol reference name from the given string, starting

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeFilter.java,v $
* $Date: 2000/03/10 04:07:25 $
* $Revision: 1.4 $
* $Date: 2001/09/24 19:57:18 $
* $Revision: 1.5 $
*
*****************************************************************************************
*/
@ -30,5 +30,5 @@ public interface UnicodeFilter {
* filtered</b>, then <tt>contains()</tt> returns
* <b><tt>false</tt></b>.
*/
boolean contains(char c);
boolean contains(int c);
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Attic/UnicodeFilterLogic.java,v $
* $Date: 2000/03/10 04:07:25 $
* $Revision: 1.3 $
* $Date: 2001/09/24 19:57:18 $
* $Revision: 1.4 $
*
*****************************************************************************************
*/
@ -28,7 +28,7 @@ public final class UnicodeFilterLogic {
*/
public static UnicodeFilter not(final UnicodeFilter f) {
return new UnicodeFilter() {
public boolean contains(char c) {
public boolean contains(int c) {
return !f.contains(c);
}
};
@ -51,7 +51,7 @@ public final class UnicodeFilterLogic {
return f;
}
return new UnicodeFilter() {
public boolean contains(char c) {
public boolean contains(int c) {
return f.contains(c) && g.contains(c);
}
};
@ -66,7 +66,7 @@ public final class UnicodeFilterLogic {
*/
public static UnicodeFilter and(final UnicodeFilter[] f) {
return new UnicodeFilter() {
public boolean contains(char c) {
public boolean contains(int c) {
for (int i=0; i<f.length; ++i) {
if (!f[i].contains(c)) {
return false;
@ -94,7 +94,7 @@ public final class UnicodeFilterLogic {
return f;
}
return new UnicodeFilter() {
public boolean contains(char c) {
public boolean contains(int c) {
return f.contains(c) || g.contains(c);
}
};
@ -109,7 +109,7 @@ public final class UnicodeFilterLogic {
*/
public static UnicodeFilter or(final UnicodeFilter[] f) {
return new UnicodeFilter() {
public boolean contains(char c) {
public boolean contains(int c) {
for (int i=0; i<f.length; ++i) {
if (f[i].contains(c)) {
return true;

View File

@ -5,21 +5,22 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeSet.java,v $
* $Date: 2001/09/20 21:20:00 $
* $Revision: 1.33 $
* $Date: 2001/09/24 19:57:18 $
* $Revision: 1.34 $
*
*****************************************************************************************
*/
package com.ibm.text;
import java.text.*;
import com.ibm.util.Utility;
/**
* A mutable set of Unicode characters. Objects of this class
* represent <em>character classes</em> used in regular expressions.
* Such classes specify a subset of the set of all Unicode characters,
* which in this implementation is the characters from U+0000 to
* U+FFFF, ignoring surrogates.
* U+10FFFF.
*
* <p><code>UnicodeSet</code> supports two APIs. The first is the
* <em>operand</em> API that allows the caller to modify the value of
@ -184,7 +185,7 @@ import java.text.*;
* through 'z' and all letters in between, in Unicode order
* <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
* all characters but 'a' through 'z',
* that is, U+0000 through 'a'-1 and 'z'+1 through U+FFFF
* that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
* <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
* <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
* <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
@ -254,10 +255,11 @@ import java.text.*;
* *Unsupported by Java (and hence unsupported by UnicodeSet).
*
* @author Alan Liu
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.33 $ $Date: 2001/09/20 21:20:00 $ */
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.34 $ $Date: 2001/09/24 19:57:18 $ */
public class UnicodeSet implements UnicodeFilter {
/* Implementation Notes.
* NOTE: This conversion has been completed as of 2.0.
*
* UnicodeSet currently represents only the characters U+0000 to
* U+FFFF. This allows the API to be written in terms of the Java
@ -285,24 +287,35 @@ public class UnicodeSet implements UnicodeFilter {
*/
private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
private static final int HIGH = 0x10000; // HIGH > all valid values. 10000 for code units.
private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
// 110000 for codepoints
/**
* Minimum value that can be stored in a UnicodeSet.
*/
public static final char MIN_VALUE = (char) LOW;
public static final int MIN_VALUE = LOW;
/**
* Maximum value that can be stored in a UnicodeSet.
*/
public static final char MAX_VALUE = (char) (HIGH - 1);
public static final int MAX_VALUE = HIGH - 1;
private int len; // length used; list may be longer to minimize reallocs
private int[] list; // MUST be terminated with HIGH
private int[] rangeList; // internal buffer
private int[] buffer; // internal buffer
/**
* The pattern representation of this set. This may not be the
* most economical pattern. It is the pattern supplied to
* applyPattern(), with variables substituted and whitespace
* removed. For sets constructed without applyPattern(), or
* modified using the non-pattern API, this string will be null,
* indicating that toPattern() must generate a pattern
* representation from the inversion list.
*/
private String pat = null;
private static final int START_EXTRA = 16; // initial storage. Must be >= 0
private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0
@ -349,7 +362,7 @@ public class UnicodeSet implements UnicodeFilter {
* @param start first character, inclusive, of range
* @param end last character, inclusive, of range
*/
public UnicodeSet(char start, char end) {
public UnicodeSet(int start, int end) {
this();
complement(start, end);
}
@ -418,7 +431,7 @@ public class UnicodeSet implements UnicodeFilter {
* @param start first character in the set, inclusive
* @param end last character in the set, inclusive
*/
public void set(char start, char end) {
public void set(int start, int end) {
clear();
complement(start, end);
}
@ -431,6 +444,7 @@ public class UnicodeSet implements UnicodeFilter {
public void set(UnicodeSet other) {
    // Take a private copy of the other set's list array so later
    // changes to either set do not affect the other.
    final int[] copiedList = (int[]) other.list.clone();
    this.list = copiedList;
    this.len = other.len;
    // Carry over the stored pattern string along with the contents.
    this.pat = other.pat;
}
/**
@ -475,24 +489,6 @@ public class UnicodeSet implements UnicodeFilter {
}
}
/**
* Append the <code>toPattern()</code> representation of a
* character to the given <code>StringBuffer</code>.
*/
private static final void _toPat(StringBuffer buf, char c) {
// Okay to let ':' pass through
switch (c) {
case '[':
case ']':
case '-':
case '^':
case '&':
case '\\':
buf.append('\\');
}
buf.append(c);
}
/**
* Append the <code>toPattern()</code> representation of a
* character to the given <code>StringBuffer</code>.
@ -575,6 +571,53 @@ public class UnicodeSet implements UnicodeFilter {
*/
public String toPattern(boolean escapeUnprintable) {
    // Delegate to the appending variant with a fresh buffer.
    return _toPattern(new StringBuffer(), escapeUnprintable).toString();
}
/**
 * Append a string representation of this set to result and return
 * result.  When a pattern string has been stored in this.pat, that
 * string is copied (escaping unprintable characters if requested);
 * otherwise the representation is produced by _generatePattern().
 */
private StringBuffer _toPattern(StringBuffer result,
                                boolean escapeUnprintable) {
    if (pat == null) {
        // No stored pattern; build one from the set contents.
        return _generatePattern(result, escapeUnprintable);
    }
    // Number of consecutive backslashes just appended to result.
    int pendingBackslashes = 0;
    final int patLength = pat.length();
    for (int pos = 0; pos < patLength; ++pos) {
        final char ch = pat.charAt(pos);
        if (!escapeUnprintable || !_isUnprintable(ch)) {
            // Ordinary character: copy it and keep the backslash run
            // count up to date.
            result.append(ch);
            pendingBackslashes = (ch == '\\') ? pendingBackslashes + 1 : 0;
            continue;
        }
        // An odd run of backslashes means this unprintable character
        // was itself escaped; drop the last backslash before writing
        // the escaped form.
        if ((pendingBackslashes & 1) != 0) {
            result.setLength(result.length() - 1);
        }
        _escapeUnprintable(result, ch);
        pendingBackslashes = 0;
    }
    return result;
}
/**
* Generate and append a string representation of this set to result.
* This does not use this.pat, the cleaned up copy of the string
* passed to applyPattern().
*/
public StringBuffer _generatePattern(StringBuffer result,
boolean escapeUnprintable) {
result.append('[');
// Check against the predefined categories. We implicitly build
@ -583,7 +626,7 @@ public class UnicodeSet implements UnicodeFilter {
if (this.equals(getCategorySet(cat))) {
result.append(':');
result.append(CATEGORY_NAMES.substring(cat*2, cat*2+2));
return result.append(":]").toString();
return result.append(":]");
}
}
@ -623,7 +666,7 @@ public class UnicodeSet implements UnicodeFilter {
}
}
return result.append(']').toString();
return result.append(']');
}
/**
@ -659,7 +702,13 @@ public class UnicodeSet implements UnicodeFilter {
* @return <tt>true</tt> if this set contains the specified range
* of chars.
*/
public boolean contains(char start, char end) {
public boolean contains(int start, int end) {
if (start < MIN_VALUE || start > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
}
if (end < MIN_VALUE || end > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
}
int i = -1;
while (true) {
if (start < list[++i]) break;
@ -674,7 +723,10 @@ public class UnicodeSet implements UnicodeFilter {
* <code>charAt()</code>.
* @return an index from 0..size()-1, or -1
*/
public int indexOf(char c) {
public int indexOf(int c) {
if (c < MIN_VALUE || c > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
}
int i = 0;
int n = 0;
for (;;) {
@ -697,7 +749,7 @@ public class UnicodeSet implements UnicodeFilter {
* <code>indexOf()</code>.
* @param index an index from 0..size()-1
*/
public char charAt(int index) {
public int charAt(int index) {
if (index >= 0) {
for (int i=0; i < len;) {
int start = list[i++];
@ -716,12 +768,14 @@ public class UnicodeSet implements UnicodeFilter {
*
* @return <tt>true</tt> if this set contains the specified char.
*/
public boolean contains(char c) {
//| Not needed unless HIGH > 0x10000
//| // catch degenerate cases
//| if (c == HIGH) { // catch final, so we don't do it in loop!
//| return (len & 1) == 0; // even length includes everything
//| }
public boolean contains(int c) {
if (c < MIN_VALUE || c > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
}
// catch degenerate cases (not needed unless HIGH > 0x10000)
if (c == HIGH) { // catch final, so we don't do it in loop!
return (len & 1) == 0; // even length includes everything
}
// Set i to the index of the start item greater than ch
// We know we will terminate without length test!
// LATER: for large sets, add binary search
@ -771,7 +825,13 @@ public class UnicodeSet implements UnicodeFilter {
* @param end last character, inclusive, of range to be added
* to this set.
*/
public void add(char start, char end) {
public void add(int start, int end) {
if (start < MIN_VALUE || start > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
}
if (end < MIN_VALUE || end > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
}
if (start <= end) {
add(range(start, end), 2, 0);
}
@ -782,7 +842,7 @@ public class UnicodeSet implements UnicodeFilter {
* present. If this set already contains the specified character,
* the call leaves this set unchanged.
*/
public final void add(char c) {
public final void add(int c) {
add(c, c);
}
@ -796,7 +856,13 @@ public class UnicodeSet implements UnicodeFilter {
* @param end last character, inclusive, of range to be retained
* to this set.
*/
public void retain(char start, char end) {
public void retain(int start, int end) {
if (start < MIN_VALUE || start > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
}
if (end < MIN_VALUE || end > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
}
if (start <= end) {
retain(range(start, end), 2, 0);
} else {
@ -807,7 +873,7 @@ public class UnicodeSet implements UnicodeFilter {
/**
* Retain the specified character from this set if it is present.
*/
public final void retain(char c) {
public final void retain(int c) {
retain(c, c);
}
@ -822,7 +888,13 @@ public class UnicodeSet implements UnicodeFilter {
* @param end last character, inclusive, of range to be removed
* from this set.
*/
public void remove(char start, char end) {
public void remove(int start, int end) {
if (start < MIN_VALUE || start > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
}
if (end < MIN_VALUE || end > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
}
if (start <= end) {
retain(range(start, end), 2, 2);
}
@ -833,7 +905,7 @@ public class UnicodeSet implements UnicodeFilter {
* The set will not contain the specified character once the call
* returns.
*/
public final void remove(char c) {
public final void remove(int c) {
remove(c, c);
}
@ -848,7 +920,13 @@ public class UnicodeSet implements UnicodeFilter {
* @param end last character, inclusive, of range to be removed
* from this set.
*/
public void complement(char start, char end) {
public void complement(int start, int end) {
if (start < MIN_VALUE || start > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
}
if (end < MIN_VALUE || end > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
}
if (start <= end) {
xor(range(start, end), 2, 0);
}
@ -859,7 +937,7 @@ public class UnicodeSet implements UnicodeFilter {
* will be removed if it is in this set, or will be added if it is
* not in this set.
*/
public final void complement(char c) {
public final void complement(int c) {
complement(c, c);
}
@ -878,6 +956,7 @@ public class UnicodeSet implements UnicodeFilter {
list[0] = LOW;
++len;
}
pat = null;
}
/**
@ -960,6 +1039,7 @@ public class UnicodeSet implements UnicodeFilter {
public void clear() {
    // Collapse the list to a single entry whose only value is HIGH.
    list[0] = HIGH;
    // The stored pattern string no longer describes the contents; drop it.
    pat = null;
    len = 1;
}
/**
@ -980,8 +1060,8 @@ public class UnicodeSet implements UnicodeFilter {
* @see #getRangeCount
* @see #getRangeEnd
*/
public char getRangeStart(int index) {
return (char) list[index*2];
public int getRangeStart(int index) {
return list[index*2];
}
/**
@ -992,8 +1072,8 @@ public class UnicodeSet implements UnicodeFilter {
* @see #getRangeStart
* @see #getRangeEnd
*/
public char getRangeEnd(int index) {
return (char) (list[index*2 + 1] - 1);
public int getRangeEnd(int index) {
return (list[index*2 + 1] - 1);
}
/**
@ -1052,7 +1132,7 @@ public class UnicodeSet implements UnicodeFilter {
* Return a programmer-readable string representation of this object.
*/
public String toString() {
return getClass().getName() + '{' + toPattern(false) + '}';
return getClass().getName() + '(' + toPattern(false) + ')';
}
//----------------------------------------------------------------
@ -1081,13 +1161,37 @@ public class UnicodeSet implements UnicodeFilter {
* of <code>pattern</code>
* @exception java.lang.IllegalArgumentException if the parse fails.
*/
void applyPattern(String pattern, ParsePosition pos,
SymbolTable symbols, boolean ignoreWhitespace) {
void applyPattern(String pattern,
ParsePosition pos,
SymbolTable symbols,
boolean ignoreWhitespace) {
// Need to build the pattern in a temporary string because
// _applyPattern calls add() etc., which set pat to empty.
StringBuffer rebuiltPat = new StringBuffer();
_applyPattern(pattern, pos, symbols, rebuiltPat, ignoreWhitespace);
pat = rebuiltPat.toString();
}
void _applyPattern(String pattern, ParsePosition pos,
SymbolTable symbols, StringBuffer rebuiltPat,
boolean ignoreWhitespace) {
// If the pattern contains any of the following, we save a
// rebuilt (variable-substituted) copy of the source pattern:
// - a category
// - an intersection or subtraction operator
// - an anchor (trailing '$', indicating RBT ether)
boolean rebuildPattern = false;
StringBuffer newPat = new StringBuffer("[");
int nestedPatStart = -1; // see below for usage
boolean nestedPatDone = false; // see below for usage
boolean invert = false;
clear();
int lastChar = -1; // This is either a char (0..FFFF) or -1
final int NONE = -1;
int lastChar = NONE; // This is either a char (0..10FFFF) or -1
char lastOp = 0;
/* This loop iterates over the characters in the pattern. We start at
@ -1109,7 +1213,7 @@ public class UnicodeSet implements UnicodeFilter {
// mode 2: '[' '^'? seen; parse pattern and close with ']'
// mode 3: '[:' seen; parse category and close with ':]'
int mode = 0;
int openPos = 0; // offset to opening '['
int colonPos = 0; // Expected pos of ':' in '[:'
int start = pos.getIndex();
int i = start;
int limit = pattern.length();
@ -1120,33 +1224,37 @@ public class UnicodeSet implements UnicodeFilter {
char[] varValueBuffer = null;
int ivarValueBuffer = 0;
int anchor = 0;
for (; i<limit; i+=((varValueBuffer==null)?1:0)) {
int c;
while (i<limit) {
/* If the next element is a single character, c will be set to it,
* and nestedSet will be null. In this case isLiteral indicates
* whether the character should assume special meaning if it has
* one. If the next element is a nested set, either via a variable
* reference, or via an embedded "[..]" or "[:..:]" pattern, then
* nestedSet will be set to the i-list for the nested set, and
* nestedSet will be set to the pairs list for the nested set, and
* c's value should be ignored.
*/
UnicodeSet nestedSet = null;
boolean isLiteral = false;
char c;
if (varValueBuffer != null) {
if (ivarValueBuffer < varValueBuffer.length) {
c = varValueBuffer[ivarValueBuffer++];
c = UTF16.charAt(varValueBuffer, 0, varValueBuffer.length, ivarValueBuffer);
ivarValueBuffer += UTF16.getCharCount(c);
nestedSet = symbols.lookupSet(c); // may be NULL
nestedPatDone = false;
} else {
varValueBuffer = null;
c = pattern.charAt(i);
c = UTF16.charAt(pattern, i);
i += UTF16.getCharCount(c);
}
} else {
c = pattern.charAt(i);
c = UTF16.charAt(pattern, i);
i += UTF16.getCharCount(c);
}
// Ignore whitespace. This is not Unicode whitespace, but Java
// whitespace, a subset of Unicode whitespace.
if (ignoreWhitespace && Character.isWhitespace(c)) {
if (ignoreWhitespace && UCharacter.isWhitespace(c)) {
continue;
}
@ -1160,7 +1268,7 @@ public class UnicodeSet implements UnicodeFilter {
case 0:
if (c == '[') {
mode = 1; // Next look for '^'
openPos = i;
colonPos = i; // Expect ':' at next offset
continue;
} else {
throw new IllegalArgumentException("Missing opening '['");
@ -1170,14 +1278,17 @@ public class UnicodeSet implements UnicodeFilter {
switch (c) {
case '^':
invert = true;
newPat.append((char) c);
continue; // Back to top to fetch next character
case ':':
if (i == openPos+1) {
if (i-1 == colonPos) {
// '[:' cannot have whitespace in it
--i;
--i; // Backup to the '['
c = '[';
mode = 3;
// Fall through and parse category normally
// Fall through and parse category using the same
// code used to parse a nested category. The mode
// will indicate that this is actually top level.
}
break; // Fall through
case '-':
@ -1202,27 +1313,19 @@ public class UnicodeSet implements UnicodeFilter {
* interpret '\\uxxxx' Unicode escapes here (as literals).
*/
if (c == '\\') {
++i;
if (i < limit) {
c = pattern.charAt(i);
isLiteral = true;
if (c == 'u') {
if ((i+4) >= limit) {
throw new IllegalArgumentException("Invalid \\u escape");
}
c = '\u0000';
for (int j=(++i)+4; i<j; ++i) { // [sic]
int digit = Character.digit(pattern.charAt(i), 16);
if (digit<0) {
throw new IllegalArgumentException("Invalid \\u escape");
}
c = (char) ((c << 4) | digit);
}
--i; // Move i back to last parsed character
}
} else {
throw new IllegalArgumentException("Trailing '\\'");
int[] offset = new int[] { i };
int escaped = Utility.unescapeAt(pattern, offset);
if (escaped == -1) {
int sta = Math.max(i - 8, 0);
int lim = Math.min(i + 16, pattern.length());
throw new IllegalArgumentException("Invalid escape sequence " +
pattern.substring(sta, i-1) +
"|" +
pattern.substring(i-1, lim));
}
i = offset[0];
isLiteral = true;
c = escaped;
}
/* Parse variable references. These are treated as literals. If a
@ -1232,7 +1335,7 @@ public class UnicodeSet implements UnicodeFilter {
* Set variables are only looked up if varCharToSet is not null.
*/
else if (symbols != null && !isLiteral && c == SymbolTable.SYMBOL_REF) {
pos.setIndex(++i);
pos.setIndex(i);
String name = symbols.parseReference(pattern, pos, limit);
if (name != null) {
varValueBuffer = symbols.lookup(name);
@ -1246,7 +1349,6 @@ public class UnicodeSet implements UnicodeFilter {
// Got a null; this means we have an isolated $.
// Tentatively assume this is an anchor.
anchor = 1;
--i; // Back up so loop increment works properly
}
continue; // Back to the top to get varValueBuffer[0]
}
@ -1256,28 +1358,56 @@ public class UnicodeSet implements UnicodeFilter {
* recognize these here and set nestedSet accordingly.
*/
else if (!isLiteral && c == '[') {
// Record position before nested pattern
nestedPatStart = newPat.length();
// Handle "[:...:]", representing a character category
char d = charAfter(pattern, i);
if (d == ':') {
i += 2;
if (i < pattern.length() && pattern.charAt(i) == ':') {
++i;
int j = pattern.indexOf(":]", i);
if (j < 0) {
throw new IllegalArgumentException("Missing \":]\"");
}
String scratch = pattern.substring(i, j);
nestedSet = new UnicodeSet();
nestedSet.applyCategory(pattern.substring(i, j));
i = j+1; // Make i point to ']' in ":]"
nestedSet.applyCategory(scratch);
nestedPatDone = true; // We're going to do it just below
i = j+2; // Advance i past ":]"
// Use a rebuilt pattern. If we are top level,
// then there is already a SET_OPEN in newPat, and
// SET_CLOSE will be appended elsewhere.
if (mode != 3) {
newPat.append('[');
}
newPat.append(':').append(scratch).append(':');
if (mode != 3) {
newPat.append(']');
}
rebuildPattern = true;
if (mode == 3) {
// Entire pattern is a category; leave parse loop
// Entire pattern is a category; leave parse
// loop. This is one of 2 ways we leave this
// loop if the pattern is well-formed.
set(nestedSet);
mode = 4;
break;
}
} else {
// Recurse to get the i-list for this nested set.
pos.setIndex(i); // Add 2 to point AFTER op
// Recurse to get the pairs for this nested set.
// Backup i to '['.
pos.setIndex(--i);
switch (lastOp) {
case '-':
case '&':
newPat.append(lastOp);
break;
}
nestedSet = new UnicodeSet();
nestedSet.applyPattern(pattern, pos, symbols, ignoreWhitespace);
i = pos.getIndex() - 1; // - 1 to point at ']'
nestedSet._applyPattern(pattern, pos, symbols, newPat, ignoreWhitespace);
nestedPatDone = true;
i = pos.getIndex();
}
}
}
@ -1291,12 +1421,23 @@ public class UnicodeSet implements UnicodeFilter {
* ']' have special meanings.
*/
if (nestedSet != null) {
if (lastChar >= 0) {
if (lastChar != NONE) {
if (lastOp != 0) {
throw new IllegalArgumentException("Illegal rhs for " + lastChar + lastOp);
}
add((char) lastChar, (char) lastChar);
lastChar = -1;
if (nestedPatDone) {
// If there was a character before the nested set,
// then we need to insert it in newPat before the
// pattern for the nested set. This position was
// recorded in nestedPatStart.
StringBuffer s = new StringBuffer();
_appendToPat(s, lastChar, false);
newPat.insert(nestedPatStart, s.toString());
} else {
_appendToPat(newPat, lastChar, false);
}
lastChar = NONE;
}
switch (lastOp) {
case '-':
@ -1309,7 +1450,19 @@ public class UnicodeSet implements UnicodeFilter {
addAll(nestedSet);
break;
}
// Get the pattern for the nested set, if we haven't done so
// already.
if (!nestedPatDone) {
if (lastOp != 0) {
newPat.append(lastOp);
}
nestedSet._toPattern(newPat, false);
}
rebuildPattern = true;
lastOp = 0;
} else if (!isLiteral && c == ']') {
// Final closing delimiter. This is the only way we leave this
// loop if the pattern is well-formed.
@ -1318,11 +1471,14 @@ public class UnicodeSet implements UnicodeFilter {
}
if (anchor == 2) {
rebuildPattern = true;
newPat.append(SymbolTable.SYMBOL_REF);
add(TransliterationRule.ETHER);
}
mode = 4;
break;
} else if (lastOp == 0 && !isLiteral && (c == '-' || c == '&')) {
lastOp = c;
lastOp = (char) c;
} else if (lastOp == '-') {
if (lastChar >= c) {
// Don't allow redundant (a-a) or empty (b-a) ranges;
@ -1330,36 +1486,45 @@ public class UnicodeSet implements UnicodeFilter {
throw new IllegalArgumentException("Invalid range " + lastChar +
'-' + c);
}
add((char) lastChar, c);
add(lastChar, c);
_appendToPat(newPat, lastChar, false);
newPat.append('-');
_appendToPat(newPat, c, false);
lastOp = 0;
lastChar = -1;
lastChar = NONE;
} else if (lastOp != 0) {
// We have <set>&<char> or <char>&<char>
throw new IllegalArgumentException("Unquoted " + lastOp);
} else {
if (lastChar >= 0) {
if (lastChar != NONE) {
// We have <char><char>
add((char) lastChar, (char) lastChar);
_appendToPat(newPat, lastChar, false);
}
lastChar = c;
}
}
if (mode == 0) {
throw new IllegalArgumentException("Missing '[' in \"" +
pattern.substring(start) + '"');
if (lastChar != NONE) {
add(lastChar, lastChar);
_appendToPat(newPat, lastChar, false);
}
// if (mode == 0) {
// throw new IllegalArgumentException("Missing '[' in \"" +
// pattern.substring(start) + '"');
// }
// Handle unprocessed stuff preceding the closing ']'
if (lastOp == '-') {
// Trailing '-' is treated as literal
add(lastOp, lastOp);
newPat.append('-');
} else if (lastOp == '&') {
throw new IllegalArgumentException("Unquoted trailing " + lastOp);
}
if (lastChar >= 0) {
add((char) lastChar, (char) lastChar);
}
newPat.append(']');
/**
* If we saw a '^' after the initial '[' of this pattern, then perform
@ -1369,17 +1534,30 @@ public class UnicodeSet implements UnicodeFilter {
complement();
}
/**
* i indexes the last character we parsed or is pattern.length(). In
* the latter case, we have run off the end without finding a closing
* ']'. Otherwise, we know i < pattern.length(), and we set the
* ParsePosition to the next character to be parsed.
*/
if (i == limit) {
throw new IllegalArgumentException("Missing ']' in \"" +
pattern.substring(start) + '"');
if (mode != 4) {
throw new IllegalArgumentException("Missing ']'");
}
// /**
// * i indexes the last character we parsed or is pattern.length(). In
// * the latter case, we have run off the end without finding a closing
// * ']'. Otherwise, we know i < pattern.length(), and we set the
// * ParsePosition to the next character to be parsed.
// */
// if (i == limit) {
// throw new IllegalArgumentException("Missing ']' in \"" +
// pattern.substring(start) + '"');
// }
pos.setIndex(i);
// Use the rebuilt pattern (newPat) only if necessary. Prefer the
// generated pattern.
if (rebuildPattern) {
rebuiltPat.append(newPat.toString());
} else {
_generatePattern(rebuiltPat, false);
}
pos.setIndex(i+1);
if (false) {
// Debug parser
@ -1494,14 +1672,6 @@ public class UnicodeSet implements UnicodeFilter {
// Implementation: Utility methods
//----------------------------------------------------------------
/**
* Returns the character after the given position, or '\uFFFE' if
* there is none.
*/
private static final char charAfter(String str, int i) {
return ((++i) < str.length()) ? str.charAt(i) : '\uFFFE';
}
private void ensureCapacity(int newLen) {
if (newLen <= list.length) return;
int[] temp = new int[newLen + GROW_EXTRA];
@ -1571,6 +1741,7 @@ public class UnicodeSet implements UnicodeFilter {
int[] temp = list;
list = buffer;
buffer = temp;
pat = null;
return this;
}
@ -1668,6 +1839,7 @@ public class UnicodeSet implements UnicodeFilter {
int[] temp = list;
list = buffer;
buffer = temp;
pat = null;
return this;
}
@ -1738,6 +1910,7 @@ public class UnicodeSet implements UnicodeFilter {
int[] temp = list;
list = buffer;
buffer = temp;
pat = null;
return this;
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/CompoundTransliteratorTest.java,v $
* $Date: 2001/09/08 01:17:50 $
* $Revision: 1.2 $
* $Date: 2001/09/24 19:56:41 $
* $Revision: 1.3 $
*
*****************************************************************************************
*/
@ -111,7 +111,7 @@ public class CompoundTransliteratorTest extends TestFmwk {
public void TestGetTransliterator(){
logln("Testing the getTransliterator() API of CompoundTransliterator");
String ID="Latin-Greek;Greek-Latin;Latin-Devanagari;Devanagari-Latin;Latin-Cyrillic;Cyrillic-Latin;Unicode-Hex;Hex-Unicode";
String ID="Latin-Greek;Greek-Latin;Latin-Devanagari;Devanagari-Latin;Latin-Cyrillic;Cyrillic-Latin;Any-Hex;Hex-Any";
CompoundTransliterator ct1=null;
try{
ct1=new CompoundTransliterator(ID);
@ -141,9 +141,9 @@ public class CompoundTransliteratorTest extends TestFmwk {
logln("Testing the handleTransliterate() API of CompoundTransliterator");
CompoundTransliterator ct1=null;
try{
ct1=new CompoundTransliterator("Unicode-Hex;Hex-Unicode");
ct1=new CompoundTransliterator("Any-Hex;Hex-Any");
}catch(IllegalArgumentException iae){
errln("FAIL: construction using CompoundTransliterator(String ID) failed for " + "Unicode-Hex;Hex-Unicode");
errln("FAIL: construction using CompoundTransliterator(String ID) failed for " + "Any-Hex;Hex-Any");
throw iae;
}
@ -167,8 +167,8 @@ public class CompoundTransliteratorTest extends TestFmwk {
String Data[]={
//ID, input string, transliterated string
"Unicode-Hex;Hex-Unicode;Unicode-Hex", "hello", "\\u0068\\u0065\\u006C\\u006C\\u006F",
"Unicode-Hex;Hex-Unicode", "hello! How are you?", "hello! How are you?",
"Any-Hex;Hex-Any;Any-Hex", "hello", "\\u0068\\u0065\\u006C\\u006C\\u006F",
"Any-Hex;Hex-Any", "hello! How are you?", "hello! How are you?",
"Devanagari-Latin;Latin-Devanagari", "\u092D\u0948'\u0930'\u0935", "\u092D\u0948\u0930\u0935", // quotes lost
"Latin-Cyrillic;Cyrillic-Latin", "a'b'k'd'e'f'g'h'i'j'Shch'shch'zh'h", "abkdefghijShchshchzhh",
"Latin-Greek;Greek-Latin", "ABGabgAKLMN", "ABGabgAKLMN",

View File

@ -6,8 +6,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/HexToUnicodeTransliteratorTest.java,v $
* $Date: 2000/10/09 16:32:07 $
* $Revision: 1.2 $
* $Date: 2001/09/24 19:56:41 $
* $Revision: 1.3 $
*
*****************************************************************************************
*/
@ -33,7 +33,7 @@ public class HexToUnicodeTransliteratorTest extends TestFmwk {
* Used by TestConstruction() and TestTransliterate.
*/
UnicodeFilter HexFilter=new UnicodeFilter() {
public boolean contains(char c) {
public boolean contains(int c) {
if(c == 0x0061 || c == 0x0063 )
return false;
else

View File

@ -4,9 +4,9 @@
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $
* $Date: 2001/09/21 21:23:34 $
* $Revision: 1.45 $
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $
* $Date: 2001/09/24 19:56:41 $
* $Revision: 1.46 $
*
*****************************************************************************************
*/
@ -48,14 +48,12 @@ public class TransliteratorTest extends TestFmwk {
throw ex;
}
// TODO remove check for class when we implement full
// toRules().
if (t != null && t instanceof RuleBasedTransliterator) {
if (t != null) {
// Now test toRules
String rules = null;
try {
rules = ((RuleBasedTransliterator)t).toRules(true);
rules = t.toRules(true);
Transliterator u = Transliterator.createFromRules("x",
rules, Transliterator.FORWARD);
} catch (IllegalArgumentException ex2) {
@ -74,7 +72,7 @@ public class TransliteratorTest extends TestFmwk {
} catch (IllegalArgumentException ex) {
logln("OK: Bogus ID handled properly");
}
ms = System.currentTimeMillis() - ms;
logln("Elapsed time: " + ms + " ms");
}
@ -223,7 +221,7 @@ public class TransliteratorTest extends TestFmwk {
* Basic test of keyboard.
*/
public void TestKeyboard() {
Transliterator t = new RuleBasedTransliterator("<ID>",
Transliterator t = new RuleBasedTransliterator("<ID>",
"psch>Y;"
+"ps>y;"
+"ch>x;"
@ -246,7 +244,7 @@ public class TransliteratorTest extends TestFmwk {
* Basic test of keyboard with cursor.
*/
public void TestKeyboard2() {
Transliterator t = new RuleBasedTransliterator("<ID>",
Transliterator t = new RuleBasedTransliterator("<ID>",
"ych>Y;"
+"ps>|y;"
+"ch>x;"
@ -381,7 +379,7 @@ public class TransliteratorTest extends TestFmwk {
public void TestFiltering() {
Transliterator hex = Transliterator.getInstance("Any-Hex");
hex.setFilter(new UnicodeFilter() {
public boolean contains(char c) {
public boolean contains(int c) {
return c != 'c';
}
});
@ -398,7 +396,7 @@ public class TransliteratorTest extends TestFmwk {
/**
* Test anchors
*/
public void TestAnchors() {
public void TestAnchors() {
expect("^ab > 01 ;" +
" ab > |8 ;" +
" b > k ;" +
@ -406,7 +404,7 @@ public class TransliteratorTest extends TestFmwk {
" 8x > 77 ;",
"ababbabxabx",
"018k7745");
"018k7745");
expect("$s = [z$] ;" +
"$s{ab > 01 ;" +
" ab > |8 ;" +
@ -440,7 +438,7 @@ public class TransliteratorTest extends TestFmwk {
*/
public void TestJ277() {
Transliterator gl = Transliterator.getInstance("Greek-Latin");
char sigma = (char)0x3C3;
char upsilon = (char)0x3C5;
char nu = (char)0x3BD;
@ -517,7 +515,7 @@ public class TransliteratorTest extends TestFmwk {
// Try a custom Hex-Any
// \\uXXXX and &#xXXXX;
HexToUnicodeTransliterator hex2 = new HexToUnicodeTransliterator("\\\\u###0;&\\#x###0\\;");
HexToUnicodeTransliterator hex2 = new HexToUnicodeTransliterator("\\\\u###0;&\\#x###0\\;");
expect(hex2, "\\u61\\u062\\u0063\\u00645\\u66x&#x30;&#x031;&#x0032;&#x00033;",
"abcd5fx012&#x00033;");
@ -740,7 +738,7 @@ public class TransliteratorTest extends TestFmwk {
if (!got.equals(exp)) {
errln("FAIL: Inverse of " + ID + " is " + got +
", expected " + exp);
}
}
}
/**
@ -752,18 +750,18 @@ public class TransliteratorTest extends TestFmwk {
"Hex[aeiou]-Any",
"quizzical",
"q\\u0075\\u0069zz\\u0069c\\u0061l",
"Any[aeiou]-Hex;Hex[^5]-Any",
"Any[^5]-Hex;Hex[aeiou]-Any",
"quizzical",
"q\\u0075izzical",
"Null[abc]",
"Null[abc]",
"xyz",
"xyz",
};
for (int i=0; i<DATA.length; i+=4) {
String ID = DATA[i];
Transliterator t = Transliterator.getInstance(ID);
@ -796,6 +794,109 @@ public class TransliteratorTest extends TestFmwk {
"Th qck brwn fx.");
}
/**
 * Test Transliterator.toRules() and UnicodeSet.toPattern().
 *
 * DATA is a flat list of triples: a kind tag (RBT or SET), the
 * input (a rule string or a set pattern), and the expected output
 * in escaped form.  For RBT entries both toRules(false) and
 * toRules(true) are checked; the unescaped expectation is derived
 * from the escaped one with Utility.unescape().  For SET entries
 * toPattern(true) is checked.
 */
public void TestToRules() {
    final String RBT = "rbt";
    final String SET = "set";
    String[] DATA = {
        RBT,
        "$a=\\u4E61; [$a] > A;",
        "[\\u4E61] > A;",

        RBT,
        "$white=[[:Zs:][:Zl:]]; $white{a} > A;",
        "[[:Zs:][:Zl:]]{a} > A;",

        SET,
        "[[:Zs:][:Zl:]]",
        "[[:Zs:][:Zl:]]",

        SET,
        "[:Ps:]",
        "[:Ps:]",

        SET,
        "[:L:]",
        "[:L:]",

        SET,
        "[[:L:]-[A]]",
        "[[:L:]-[A]]",

        SET,
        "[~[:Lu:][:Ll:]]",
        "[~[:Lu:][:Ll:]]",

        SET,
        "[~[a-z]]",
        "[~[a-z]]",

        RBT,
        "$white=[:Zs:]; $black=[^$white]; $black{a} > A;",
        "[^[:Zs:]]{a} > A;",

        RBT,
        "$a=[:Zs:]; $b=[[a-z]-$a]; $b{a} > A;",
        "[[a-z]-[:Zs:]]{a} > A;",

        RBT,
        "$a=[:Zs:]; $b=[$a&[a-z]]; $b{a} > A;",
        "[[:Zs:]&[a-z]]{a} > A;",

        RBT,
        "$a=[:Zs:]; $b=[x$a]; $b{a} > A;",
        "[x[:Zs:]]{a} > A;",
    };

    for (int d=0; d < DATA.length; d+=3) {
        // Compare tags by value; reference equality only works by
        // accident of the array being built from the same locals.
        if (RBT.equals(DATA[d])) {
            // Transliterator test
            Transliterator t = Transliterator.createFromRules("ID",
                                   DATA[d+1], Transliterator.FORWARD);
            if (t == null) {
                errln("FAIL: createFromRules failed");
                return;
            }
            String rules = t.toRules(false);
            String escapedRules = t.toRules(true);
            String expRules = Utility.unescape(DATA[d+2]);
            String expEscapedRules = DATA[d+2];
            if (rules.equals(expRules)) {
                logln("Ok: " + DATA[d+1] +
                      " => " + Utility.escape(rules));
            } else {
                errln("FAIL: " + DATA[d+1] +
                      " => " + Utility.escape(rules + ", exp " + expRules));
            }
            if (escapedRules.equals(expEscapedRules)) {
                logln("Ok: " + DATA[d+1] +
                      " => " + escapedRules);
            } else {
                errln("FAIL: " + DATA[d+1] +
                      " => " + escapedRules + ", exp " + expEscapedRules);
            }

        } else {
            // UnicodeSet test
            String pat = DATA[d+1];
            String expToPat = DATA[d+2];
            UnicodeSet set = new UnicodeSet(pat);

            // Adjust spacing etc. as necessary.
            String toPat = set.toPattern(true);
            if (expToPat.equals(toPat)) {
                logln("Ok: " + pat +
                      " => " + toPat);
            } else {
                // Report the expected pattern, not the input pattern.
                errln("FAIL: " + pat +
                      " => " + Utility.escape(toPat) +
                      ", exp " + Utility.escape(expToPat));
            }
        }
    }
}
/**
* Test the case mapping transliterators.
*/
@ -806,7 +907,7 @@ public class TransliteratorTest extends TestFmwk {
Transliterator.getInstance("Any-Lower[^xyzXYZ]");
Transliterator toTitle =
Transliterator.getInstance("Any-Title[^xyzXYZ]");
expect(toUpper, "The quick brown fox jumped over the lazy dogs.",
"THE QUICK BROWN FOx JUMPED OVER THE LAzy DOGS.");
expect(toLower, "The quIck brown fOX jUMPED OVER THE LAzY dogs.",
@ -857,7 +958,7 @@ public class TransliteratorTest extends TestFmwk {
errln("FAIL: " + DATA[i+2] +
" create ID \"" + DATA[i] + "\" => \"" +
t.getID() + "\", exp \"" + DATA[i+1] + "\"");
}
}
} catch (IllegalArgumentException e) {
errln("FAIL: " + DATA[i+2] +
" create ID \"" + DATA[i] + "\"");
@ -875,52 +976,52 @@ public class TransliteratorTest extends TestFmwk {
// Input Decomposed Composed
{"cat", "cat", "cat" },
{"\u00e0ardvark", "a\u0300ardvark", "\u00e0ardvark" },
{"\u1e0a", "D\u0307", "\u1e0a" }, // D-dot_above
{"D\u0307", "D\u0307", "\u1e0a" }, // D dot_above
{"\u1e0c\u0307", "D\u0323\u0307", "\u1e0c\u0307" }, // D-dot_below dot_above
{"\u1e0a\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, // D-dot_above dot_below
{"D\u0307\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, // D dot_below dot_above
{"\u1e10\u0307\u0323", "D\u0327\u0323\u0307","\u1e10\u0323\u0307"}, // D dot_below cedilla dot_above
{"D\u0307\u0328\u0323","D\u0328\u0323\u0307","\u1e0c\u0328\u0307"}, // D dot_above ogonek dot_below
{"\u1E14", "E\u0304\u0300", "\u1E14" }, // E-macron-grave
{"\u0112\u0300", "E\u0304\u0300", "\u1E14" }, // E-macron + grave
{"\u00c8\u0304", "E\u0300\u0304", "\u00c8\u0304" }, // E-grave + macron
{"\u212b", "A\u030a", "\u00c5" }, // angstrom_sign
{"\u00c5", "A\u030a", "\u00c5" }, // A-ring
{"\u00fdffin", "y\u0301ffin", "\u00fdffin" }, //updated with 3.0
{"\u00fd\uFB03n", "y\u0301\uFB03n", "\u00fd\uFB03n" }, //updated with 3.0
{"Henry IV", "Henry IV", "Henry IV" },
{"Henry \u2163", "Henry \u2163", "Henry \u2163" },
{"\u30AC", "\u30AB\u3099", "\u30AC" }, // ga (Katakana)
{"\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, // ka + ten
{"\uFF76\uFF9E", "\uFF76\uFF9E", "\uFF76\uFF9E" }, // hw_ka + hw_ten
{"\u30AB\uFF9E", "\u30AB\uFF9E", "\u30AB\uFF9E" }, // ka + hw_ten
{"\uFF76\u3099", "\uFF76\u3099", "\uFF76\u3099" }, // hw_ka + ten
{"A\u0300\u0316", "A\u0316\u0300", "\u00C0\u0316" },
};
String[][] COMPAT = {
};
String[][] COMPAT = {
// Input Decomposed Composed
{"\uFB4f", "\u05D0\u05DC", "\u05D0\u05DC" }, // Alef-Lamed vs. Alef, Lamed
{"\u00fdffin", "y\u0301ffin", "\u00fdffin" }, //updated for 3.0
{"\u00fd\uFB03n", "y\u0301ffin", "\u00fdffin" }, // ffi ligature -> f + f + i
{"Henry IV", "Henry IV", "Henry IV" },
{"Henry \u2163", "Henry IV", "Henry IV" },
{"\u30AC", "\u30AB\u3099", "\u30AC" }, // ga (Katakana)
{"\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, // ka + ten
{"\uFF76\u3099", "\u30AB\u3099", "\u30AC" }, // hw_ka + ten
};
@ -960,7 +1061,7 @@ public class TransliteratorTest extends TestFmwk {
expect(t, "\u010dx", "c\u030C");
}
}
/**
* Test compound RBT rules.
*/
@ -1116,7 +1217,7 @@ public class TransliteratorTest extends TestFmwk {
append('|').
append(s.substring(index.start));
}
// As a final step in keyboard transliteration, we must call
// transliterate to finish off any pending partial matches that
// were waiting for more input.
@ -1135,7 +1236,7 @@ public class TransliteratorTest extends TestFmwk {
result.equals(expectedResult),
expectedResult);
}
void expectAux(String tag, String summary, boolean pass,
String expectedResult) {
if (pass) {

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/UnicodeFilterLogicTest.java,v $
* $Date: 2000/10/04 23:12:33 $
* $Revision: 1.2 $
* $Date: 2001/09/24 19:56:41 $
* $Revision: 1.3 $
*
*****************************************************************************************
*/
@ -29,7 +29,7 @@ public class UnicodeFilterLogicTest extends TestFmwk {
UnicodeFilter Filter1=new UnicodeFilter() {
public boolean contains(char c) {
public boolean contains(int c) {
if(c == 0x0061 || c == 0x0041 || c == 0x0063 || c == 0x0043)
return false;
else
@ -37,7 +37,7 @@ public class UnicodeFilterLogicTest extends TestFmwk {
}
};
UnicodeFilter Filter2=new UnicodeFilter() {
public boolean contains(char c) {
public boolean contains(int c) {
if(c == 0x0079 || c == 0x0059 || c == 0x007a || c == 0x005a || c == 0x0061 || c == 0x0063)
return false;
else
@ -47,7 +47,7 @@ public class UnicodeFilterLogicTest extends TestFmwk {
public void TestAllFilters() {
Transliterator t1 = Transliterator.getInstance("Unicode-Hex");
Transliterator t1 = Transliterator.getInstance("Any-Hex");
String source="abcdABCDyzYZ";
//sanity testing without any filter

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/UnicodeSetTest.java,v $
* $Date: 2001/09/08 01:17:50 $
* $Revision: 1.11 $
* $Date: 2001/09/24 19:56:41 $
* $Revision: 1.12 $
*
*****************************************************************************************
*/
@ -181,8 +181,8 @@ public class UnicodeSetTest extends TestFmwk {
set.clear();
set.applyPattern("[A-Y 1-8 b-d l-y]");
for (int i = 0; i<set.getRangeCount(); ++i) {
char a = set.getRangeStart(i);
char b = set.getRangeEnd(i);
int a = set.getRangeStart(i);
int b = set.getRangeEnd(i);
if (!set.contains(a, b)) {
errln("FAIL, should contain " + (char)a + '-' + (char)b +
" but doesn't: " + set);
@ -219,7 +219,7 @@ public class UnicodeSetTest extends TestFmwk {
if (c.equals(exp)) {
logln("c.complement(): " + c);
} else {
errln("FAIL: c.complement() = " + c + ", expect " + exp);
errln(Utility.escape("FAIL: c.complement() = " + c + ", expect " + exp));
}
c.complement();
exp.set((char)3, (char)15);
@ -252,13 +252,13 @@ public class UnicodeSetTest extends TestFmwk {
public void TestIndexOf() {
UnicodeSet set = new UnicodeSet("[a-cx-y3578]");
for (int i=0; i<set.size(); ++i) {
char c = set.charAt(i);
int c = set.charAt(i);
if (set.indexOf(c) != i) {
errln("FAIL: charAt(" + i + ") = " + c +
" => indexOf() => " + set.indexOf(c));
}
}
char c = set.charAt(set.size());
int c = set.charAt(set.size());
if (c != '\uFFFE') {
errln("FAIL: charAt(<out of range>) = " +
Utility.escape(String.valueOf(c)));

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/UnicodeToHexTransliteratorTest.java,v $
* $Date: 2000/10/16 16:58:29 $
* $Revision: 1.3 $
* $Date: 2001/09/24 19:56:41 $
* $Revision: 1.4 $
*
*****************************************************************************************
*/
@ -32,7 +32,7 @@ public class UnicodeToHexTransliteratorTest extends TestFmwk {
* Used by TestConstruction() and TestTransliterate.
*/
UnicodeFilter UniFilter=new UnicodeFilter() {
public boolean contains(char c) {
public boolean contains(int c) {
if(c==0x0063 || c==0x0061 || c==0x0043 || c==0x0041)
return false;
else

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/RuleBasedTransliterator.java,v $
* $Date: 2001/09/21 21:24:04 $
* $Revision: 1.44 $
* $Date: 2001/09/24 19:57:17 $
* $Revision: 1.45 $
*
*****************************************************************************************
*/
@ -279,7 +279,7 @@ import com.ibm.text.resources.ResourceReader;
* <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
*
* @author Alan Liu
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.44 $ $Date: 2001/09/21 21:24:04 $
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.45 $ $Date: 2001/09/24 19:57:17 $
*/
public class RuleBasedTransliterator extends Transliterator {
@ -542,7 +542,7 @@ public class RuleBasedTransliterator extends Transliterator {
/**
* Implement SymbolTable API.
*/
public UnicodeSet lookupSet(char ch) {
public UnicodeSet lookupSet(int ch) {
// Note that we cannot use data.lookupSet() because the
// set array has not been constructed yet.
int i = ch - data.setVariablesBase;
@ -1579,6 +1579,9 @@ public class RuleBasedTransliterator extends Transliterator {
/**
* $Log: RuleBasedTransliterator.java,v $
* Revision 1.45 2001/09/24 19:57:17 alan
* jitterbug 60: implement toPattern in UnicodeSet; update UnicodeFilter.contains to take an int; update UnicodeSet to support code points to U+10FFFF
*
* Revision 1.44 2001/09/21 21:24:04 alan
* jitterbug 64: allow ::ID blocks in rules
*

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/SymbolTable.java,v $
* $Date: 2000/08/30 20:40:30 $
* $Revision: 1.6 $
* $Date: 2001/09/24 19:57:18 $
* $Revision: 1.7 $
*
*****************************************************************************************
*/
@ -41,8 +41,9 @@ public interface SymbolTable {
/**
* Lookup the UnicodeSet associated with the given character, and
* return it. Return <tt>null</tt> if not found.
* @param ch a 32-bit code point from 0 to 0x10FFFF.
*/
UnicodeSet lookupSet(char ch);
UnicodeSet lookupSet(int ch);
/**
* Parse a symbol reference name from the given string, starting

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeFilter.java,v $
* $Date: 2000/03/10 04:07:25 $
* $Revision: 1.4 $
* $Date: 2001/09/24 19:57:18 $
* $Revision: 1.5 $
*
*****************************************************************************************
*/
@ -30,5 +30,5 @@ public interface UnicodeFilter {
* filtered</b>, then <tt>contains()</tt> returns
* <b><tt>false</tt></b>.
*/
boolean contains(char c);
boolean contains(int c);
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeFilterLogic.java,v $
* $Date: 2000/03/10 04:07:25 $
* $Revision: 1.3 $
* $Date: 2001/09/24 19:57:18 $
* $Revision: 1.4 $
*
*****************************************************************************************
*/
@ -28,7 +28,7 @@ public final class UnicodeFilterLogic {
*/
public static UnicodeFilter not(final UnicodeFilter f) {
return new UnicodeFilter() {
public boolean contains(char c) {
public boolean contains(int c) {
return !f.contains(c);
}
};
@ -51,7 +51,7 @@ public final class UnicodeFilterLogic {
return f;
}
return new UnicodeFilter() {
public boolean contains(char c) {
public boolean contains(int c) {
return f.contains(c) && g.contains(c);
}
};
@ -66,7 +66,7 @@ public final class UnicodeFilterLogic {
*/
public static UnicodeFilter and(final UnicodeFilter[] f) {
return new UnicodeFilter() {
public boolean contains(char c) {
public boolean contains(int c) {
for (int i=0; i<f.length; ++i) {
if (!f[i].contains(c)) {
return false;
@ -94,7 +94,7 @@ public final class UnicodeFilterLogic {
return f;
}
return new UnicodeFilter() {
public boolean contains(char c) {
public boolean contains(int c) {
return f.contains(c) || g.contains(c);
}
};
@ -109,7 +109,7 @@ public final class UnicodeFilterLogic {
*/
public static UnicodeFilter or(final UnicodeFilter[] f) {
return new UnicodeFilter() {
public boolean contains(char c) {
public boolean contains(int c) {
for (int i=0; i<f.length; ++i) {
if (f[i].contains(c)) {
return true;

View File

@ -5,21 +5,22 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeSet.java,v $
* $Date: 2001/09/20 21:20:00 $
* $Revision: 1.33 $
* $Date: 2001/09/24 19:57:18 $
* $Revision: 1.34 $
*
*****************************************************************************************
*/
package com.ibm.text;
import java.text.*;
import com.ibm.util.Utility;
/**
* A mutable set of Unicode characters. Objects of this class
* represent <em>character classes</em> used in regular expressions.
* Such classes specify a subset of the set of all Unicode characters,
* which in this implementation is the characters from U+0000 to
* U+FFFF, ignoring surrogates.
* U+10FFFF.
*
* <p><code>UnicodeSet</code> supports two APIs. The first is the
* <em>operand</em> API that allows the caller to modify the value of
@ -184,7 +185,7 @@ import java.text.*;
* through 'z' and all letters in between, in Unicode order
* <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
* all characters but 'a' through 'z',
* that is, U+0000 through 'a'-1 and 'z'+1 through U+FFFF
* that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
* <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
* <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
* <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
@ -254,10 +255,11 @@ import java.text.*;
* *Unsupported by Java (and hence unsupported by UnicodeSet).
*
* @author Alan Liu
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.33 $ $Date: 2001/09/20 21:20:00 $ */
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.34 $ $Date: 2001/09/24 19:57:18 $ */
public class UnicodeSet implements UnicodeFilter {
/* Implementation Notes.
* NOTE: This conversion has been completed as of 2.0.
*
* UnicodeSet currently represents only the characters U+0000 to
* U+FFFF. This allows the API to be written in terms of the Java
@ -285,24 +287,35 @@ public class UnicodeSet implements UnicodeFilter {
*/
private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
private static final int HIGH = 0x10000; // HIGH > all valid values. 10000 for code units.
private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
// 110000 for codepoints
/**
* Minimum value that can be stored in a UnicodeSet.
*/
public static final char MIN_VALUE = (char) LOW;
public static final int MIN_VALUE = LOW;
/**
* Maximum value that can be stored in a UnicodeSet.
*/
public static final char MAX_VALUE = (char) (HIGH - 1);
public static final int MAX_VALUE = HIGH - 1;
private int len; // length used; list may be longer to minimize reallocs
private int[] list; // MUST be terminated with HIGH
private int[] rangeList; // internal buffer
private int[] buffer; // internal buffer
/**
* The pattern representation of this set. This may not be the
* most economical pattern. It is the pattern supplied to
* applyPattern(), with variables substituted and whitespace
* removed. For sets constructed without applyPattern(), or
* modified using the non-pattern API, this string will be null,
* indicating that toPattern() must generate a pattern
* representation from the inversion list.
*/
private String pat = null;
private static final int START_EXTRA = 16; // initial storage. Must be >= 0
private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0
@ -349,7 +362,7 @@ public class UnicodeSet implements UnicodeFilter {
* @param start first character, inclusive, of range
* @param end last character, inclusive, of range
*/
public UnicodeSet(char start, char end) {
public UnicodeSet(int start, int end) {
this();
complement(start, end);
}
@ -418,7 +431,7 @@ public class UnicodeSet implements UnicodeFilter {
* @param start first character in the set, inclusive
* @param end last character in the set, inclusive
*/
public void set(char start, char end) {
public void set(int start, int end) {
clear();
complement(start, end);
}
@ -431,6 +444,7 @@ public class UnicodeSet implements UnicodeFilter {
public void set(UnicodeSet other) {
    // Take a private copy of the other set's list array so the two sets
    // do not share storage afterwards.
    final int[] copiedList = (int[]) other.list.clone();
    this.list = copiedList;
    this.len = other.len;
    // The retained pattern text (possibly null) describes the same
    // contents, so it is carried over as well.
    this.pat = other.pat;
}
/**
@ -475,24 +489,6 @@ public class UnicodeSet implements UnicodeFilter {
}
}
/**
 * Writes a single character to <code>buf</code> in the form used by
 * <code>toPattern()</code>: the characters <code>[ ] - ^ &amp; \</code>
 * are preceded by a backslash, every other character is copied as is.
 *
 * @param buf the buffer to append to
 * @param c the character to append
 */
private static final void _toPat(StringBuffer buf, char c) {
    // ':' is intentionally absent from this list and passes through
    // unescaped.
    final boolean needsBackslash = "[]-^&\\".indexOf(c) >= 0;
    if (needsBackslash) {
        buf.append('\\');
    }
    buf.append(c);
}
/**
* Append the <code>toPattern()</code> representation of a
* character to the given <code>StringBuffer</code>.
@ -575,6 +571,53 @@ public class UnicodeSet implements UnicodeFilter {
*/
public String toPattern(boolean escapeUnprintable) {
    // Delegate to the buffer-appending worker with a fresh buffer, then
    // hand the accumulated text back as a String.
    return _toPattern(new StringBuffer(), escapeUnprintable).toString();
}
/**
 * Appends a pattern string for this set to <code>result</code>.  If a
 * pattern was retained from applyPattern() (<code>pat</code> is
 * non-null), that text is copied, optionally rewriting unprintable
 * characters as escapes.  Otherwise the pattern is produced by
 * _generatePattern().
 *
 * @param result the buffer to append to
 * @param escapeUnprintable if true, each character of the retained
 * pattern for which _isUnprintable() holds is written through
 * _escapeUnprintable() instead of being copied
 * @return the buffer holding the appended pattern
 */
private StringBuffer _toPattern(StringBuffer result,
                                boolean escapeUnprintable) {
    if (pat == null) {
        // No retained pattern text; build one from the set contents.
        return _generatePattern(result, escapeUnprintable);
    }

    // Length of the run of backslashes copied immediately before the
    // current character.
    int pendingBackslashes = 0;
    for (int idx = 0; idx < pat.length(); ++idx) {
        final char ch = pat.charAt(idx);

        if (!(escapeUnprintable && _isUnprintable(ch))) {
            // Ordinary case: copy through and keep the backslash run
            // length up to date.
            result.append(ch);
            pendingBackslashes = (ch == '\\') ? pendingBackslashes + 1 : 0;
            continue;
        }

        // An odd-length backslash run means the last backslash was
        // escaping this character.  Drop that backslash so the escape
        // sequence written below stands on its own.
        if ((pendingBackslashes & 1) != 0) {
            result.setLength(result.length() - 1);
        }
        _escapeUnprintable(result, ch);
        pendingBackslashes = 0;
    }
    return result;
}
/**
* Generate and append a string representation of this set to result.
* This does not use this.pat, the cleaned up copy of the string
* passed to applyPattern().
*/
public StringBuffer _generatePattern(StringBuffer result,
boolean escapeUnprintable) {
result.append('[');
// Check against the predefined categories. We implicitly build
@ -583,7 +626,7 @@ public class UnicodeSet implements UnicodeFilter {
if (this.equals(getCategorySet(cat))) {
result.append(':');
result.append(CATEGORY_NAMES.substring(cat*2, cat*2+2));
return result.append(":]").toString();
return result.append(":]");
}
}
@ -623,7 +666,7 @@ public class UnicodeSet implements UnicodeFilter {
}
}
return result.append(']').toString();
return result.append(']');
}
/**
@ -659,7 +702,13 @@ public class UnicodeSet implements UnicodeFilter {
* @return <tt>true</tt> if this set contains the specified range
* of chars.
*/
public boolean contains(char start, char end) {
public boolean contains(int start, int end) {
if (start < MIN_VALUE || start > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
}
if (end < MIN_VALUE || end > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
}
int i = -1;
while (true) {
if (start < list[++i]) break;
@ -674,7 +723,10 @@ public class UnicodeSet implements UnicodeFilter {
* <code>charAt()</code>.
* @return an index from 0..size()-1, or -1
*/
public int indexOf(char c) {
public int indexOf(int c) {
if (c < MIN_VALUE || c > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
}
int i = 0;
int n = 0;
for (;;) {
@ -697,7 +749,7 @@ public class UnicodeSet implements UnicodeFilter {
* <code>indexOf()</code>.
* @param index an index from 0..size()-1
*/
public char charAt(int index) {
public int charAt(int index) {
if (index >= 0) {
for (int i=0; i < len;) {
int start = list[i++];
@ -716,12 +768,14 @@ public class UnicodeSet implements UnicodeFilter {
*
* @return <tt>true</tt> if this set contains the specified char.
*/
public boolean contains(char c) {
//| Not needed unless HIGH > 0x10000
//| // catch degenerate cases
//| if (c == HIGH) { // catch final, so we don't do it in loop!
//| return (len & 1) == 0; // even length includes everything
//| }
public boolean contains(int c) {
if (c < MIN_VALUE || c > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
}
// catch degenerate cases (not needed unless HIGH > 0x10000)
if (c == HIGH) { // catch final, so we don't do it in loop!
return (len & 1) == 0; // even length includes everything
}
// Set i to the index of the start item greater than ch
// We know we will terminate without length test!
// LATER: for large sets, add binary search
@ -771,7 +825,13 @@ public class UnicodeSet implements UnicodeFilter {
* @param end last character, inclusive, of range to be added
* to this set.
*/
public void add(char start, char end) {
public void add(int start, int end) {
if (start < MIN_VALUE || start > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
}
if (end < MIN_VALUE || end > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
}
if (start <= end) {
add(range(start, end), 2, 0);
}
@ -782,7 +842,7 @@ public class UnicodeSet implements UnicodeFilter {
* present. If this set already contains the specified character,
* the call leaves this set unchanged.
*/
public final void add(char c) {
public final void add(int c) {
add(c, c);
}
@ -796,7 +856,13 @@ public class UnicodeSet implements UnicodeFilter {
* @param end last character, inclusive, of range to be retained
* to this set.
*/
public void retain(char start, char end) {
public void retain(int start, int end) {
if (start < MIN_VALUE || start > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
}
if (end < MIN_VALUE || end > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
}
if (start <= end) {
retain(range(start, end), 2, 0);
} else {
@ -807,7 +873,7 @@ public class UnicodeSet implements UnicodeFilter {
/**
* Retain the specified character from this set if it is present.
*/
public final void retain(char c) {
public final void retain(int c) {
retain(c, c);
}
@ -822,7 +888,13 @@ public class UnicodeSet implements UnicodeFilter {
* @param end last character, inclusive, of range to be removed
* from this set.
*/
public void remove(char start, char end) {
public void remove(int start, int end) {
if (start < MIN_VALUE || start > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
}
if (end < MIN_VALUE || end > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
}
if (start <= end) {
retain(range(start, end), 2, 2);
}
@ -833,7 +905,7 @@ public class UnicodeSet implements UnicodeFilter {
* The set will not contain the specified character once the call
* returns.
*/
public final void remove(char c) {
public final void remove(int c) {
remove(c, c);
}
@ -848,7 +920,13 @@ public class UnicodeSet implements UnicodeFilter {
* @param end last character, inclusive, of range to be removed
* from this set.
*/
public void complement(char start, char end) {
public void complement(int start, int end) {
if (start < MIN_VALUE || start > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
}
if (end < MIN_VALUE || end > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
}
if (start <= end) {
xor(range(start, end), 2, 0);
}
@ -859,7 +937,7 @@ public class UnicodeSet implements UnicodeFilter {
* will be removed if it is in this set, or will be added if it is
* not in this set.
*/
public final void complement(char c) {
public final void complement(int c) {
complement(c, c);
}
@ -878,6 +956,7 @@ public class UnicodeSet implements UnicodeFilter {
list[0] = LOW;
++len;
}
pat = null;
}
/**
@ -960,6 +1039,7 @@ public class UnicodeSet implements UnicodeFilter {
public void clear() {
    // Reduce the list to its terminator alone: one entry, HIGH.
    this.list[0] = HIGH;
    this.len = 1;
    // The contents changed, so any retained pattern text is dropped.
    this.pat = null;
}
/**
@ -980,8 +1060,8 @@ public class UnicodeSet implements UnicodeFilter {
* @see #getRangeCount
* @see #getRangeEnd
*/
public char getRangeStart(int index) {
return (char) list[index*2];
public int getRangeStart(int index) {
return list[index*2];
}
/**
@ -992,8 +1072,8 @@ public class UnicodeSet implements UnicodeFilter {
* @see #getRangeStart
* @see #getRangeEnd
*/
public char getRangeEnd(int index) {
return (char) (list[index*2 + 1] - 1);
public int getRangeEnd(int index) {
return (list[index*2 + 1] - 1);
}
/**
@ -1052,7 +1132,7 @@ public class UnicodeSet implements UnicodeFilter {
* Return a programmer-readable string representation of this object.
*/
public String toString() {
return getClass().getName() + '{' + toPattern(false) + '}';
return getClass().getName() + '(' + toPattern(false) + ')';
}
//----------------------------------------------------------------
@ -1081,13 +1161,37 @@ public class UnicodeSet implements UnicodeFilter {
* of <code>pattern</code>
* @exception java.lang.IllegalArgumentException if the parse fails.
*/
void applyPattern(String pattern, ParsePosition pos,
SymbolTable symbols, boolean ignoreWhitespace) {
void applyPattern(String pattern,
ParsePosition pos,
SymbolTable symbols,
boolean ignoreWhitespace) {
// Need to build the pattern in a temporary string because
// _applyPattern calls add() etc., which reset pat to null.
StringBuffer rebuiltPat = new StringBuffer();
_applyPattern(pattern, pos, symbols, rebuiltPat, ignoreWhitespace);
pat = rebuiltPat.toString();
}
void _applyPattern(String pattern, ParsePosition pos,
SymbolTable symbols, StringBuffer rebuiltPat,
boolean ignoreWhitespace) {
// If the pattern contains any of the following, we save a
// rebuilt (variable-substituted) copy of the source pattern:
// - a category
// - an intersection or subtraction operator
// - an anchor (trailing '$', indicating RBT ether)
boolean rebuildPattern = false;
StringBuffer newPat = new StringBuffer("[");
int nestedPatStart = -1; // see below for usage
boolean nestedPatDone = false; // see below for usage
boolean invert = false;
clear();
int lastChar = -1; // This is either a char (0..FFFF) or -1
final int NONE = -1;
int lastChar = NONE; // This is either a char (0..10FFFF) or -1
char lastOp = 0;
/* This loop iterates over the characters in the pattern. We start at
@ -1109,7 +1213,7 @@ public class UnicodeSet implements UnicodeFilter {
// mode 2: '[' '^'? seen; parse pattern and close with ']'
// mode 3: '[:' seen; parse category and close with ':]'
int mode = 0;
int openPos = 0; // offset to opening '['
int colonPos = 0; // Expected pos of ':' in '[:'
int start = pos.getIndex();
int i = start;
int limit = pattern.length();
@ -1120,33 +1224,37 @@ public class UnicodeSet implements UnicodeFilter {
char[] varValueBuffer = null;
int ivarValueBuffer = 0;
int anchor = 0;
for (; i<limit; i+=((varValueBuffer==null)?1:0)) {
int c;
while (i<limit) {
/* If the next element is a single character, c will be set to it,
* and nestedSet will be null. In this case isLiteral indicates
* whether the character should assume special meaning if it has
* one. If the next element is a nested set, either via a variable
* reference, or via an embedded "[..]" or "[:..:]" pattern, then
* nestedSet will be set to the i-list for the nested set, and
* nestedSet will be set to the pairs list for the nested set, and
* c's value should be ignored.
*/
UnicodeSet nestedSet = null;
boolean isLiteral = false;
char c;
if (varValueBuffer != null) {
if (ivarValueBuffer < varValueBuffer.length) {
c = varValueBuffer[ivarValueBuffer++];
c = UTF16.charAt(varValueBuffer, 0, varValueBuffer.length, ivarValueBuffer);
ivarValueBuffer += UTF16.getCharCount(c);
nestedSet = symbols.lookupSet(c); // may be NULL
nestedPatDone = false;
} else {
varValueBuffer = null;
c = pattern.charAt(i);
c = UTF16.charAt(pattern, i);
i += UTF16.getCharCount(c);
}
} else {
c = pattern.charAt(i);
c = UTF16.charAt(pattern, i);
i += UTF16.getCharCount(c);
}
// Ignore whitespace. This is not Unicode whitespace, but Java
// whitespace, a subset of Unicode whitespace.
if (ignoreWhitespace && Character.isWhitespace(c)) {
if (ignoreWhitespace && UCharacter.isWhitespace(c)) {
continue;
}
@ -1160,7 +1268,7 @@ public class UnicodeSet implements UnicodeFilter {
case 0:
if (c == '[') {
mode = 1; // Next look for '^'
openPos = i;
colonPos = i; // Expect ':' at next offset
continue;
} else {
throw new IllegalArgumentException("Missing opening '['");
@ -1170,14 +1278,17 @@ public class UnicodeSet implements UnicodeFilter {
switch (c) {
case '^':
invert = true;
newPat.append((char) c);
continue; // Back to top to fetch next character
case ':':
if (i == openPos+1) {
if (i-1 == colonPos) {
// '[:' cannot have whitespace in it
--i;
--i; // Backup to the '['
c = '[';
mode = 3;
// Fall through and parse category normally
// Fall through and parse category using the same
// code used to parse a nested category. The mode
// will indicate that this is actually top level.
}
break; // Fall through
case '-':
@ -1202,27 +1313,19 @@ public class UnicodeSet implements UnicodeFilter {
* interpret '\\uxxxx' Unicode escapes here (as literals).
*/
if (c == '\\') {
++i;
if (i < limit) {
c = pattern.charAt(i);
isLiteral = true;
if (c == 'u') {
if ((i+4) >= limit) {
throw new IllegalArgumentException("Invalid \\u escape");
}
c = '\u0000';
for (int j=(++i)+4; i<j; ++i) { // [sic]
int digit = Character.digit(pattern.charAt(i), 16);
if (digit<0) {
throw new IllegalArgumentException("Invalid \\u escape");
}
c = (char) ((c << 4) | digit);
}
--i; // Move i back to last parsed character
}
} else {
throw new IllegalArgumentException("Trailing '\\'");
int[] offset = new int[] { i };
int escaped = Utility.unescapeAt(pattern, offset);
if (escaped == -1) {
int sta = Math.max(i - 8, 0);
int lim = Math.min(i + 16, pattern.length());
throw new IllegalArgumentException("Invalid escape sequence " +
pattern.substring(sta, i-1) +
"|" +
pattern.substring(i-1, lim));
}
i = offset[0];
isLiteral = true;
c = escaped;
}
/* Parse variable references. These are treated as literals. If a
@ -1232,7 +1335,7 @@ public class UnicodeSet implements UnicodeFilter {
* Set variables are only looked up if varCharToSet is not null.
*/
else if (symbols != null && !isLiteral && c == SymbolTable.SYMBOL_REF) {
pos.setIndex(++i);
pos.setIndex(i);
String name = symbols.parseReference(pattern, pos, limit);
if (name != null) {
varValueBuffer = symbols.lookup(name);
@ -1246,7 +1349,6 @@ public class UnicodeSet implements UnicodeFilter {
// Got a null; this means we have an isolated $.
// Tentatively assume this is an anchor.
anchor = 1;
--i; // Back up so loop increment works properly
}
continue; // Back to the top to get varValueBuffer[0]
}
@ -1256,28 +1358,56 @@ public class UnicodeSet implements UnicodeFilter {
* recognize these here and set nestedSet accordingly.
*/
else if (!isLiteral && c == '[') {
// Record position before nested pattern
nestedPatStart = newPat.length();
// Handle "[:...:]", representing a character category
char d = charAfter(pattern, i);
if (d == ':') {
i += 2;
if (i < pattern.length() && pattern.charAt(i) == ':') {
++i;
int j = pattern.indexOf(":]", i);
if (j < 0) {
throw new IllegalArgumentException("Missing \":]\"");
}
String scratch = pattern.substring(i, j);
nestedSet = new UnicodeSet();
nestedSet.applyCategory(pattern.substring(i, j));
i = j+1; // Make i point to ']' in ":]"
nestedSet.applyCategory(scratch);
nestedPatDone = true; // We're going to do it just below
i = j+2; // Advance i past ":]"
// Use a rebuilt pattern. If we are top level,
// then there is already a SET_OPEN in newPat, and
// SET_CLOSE will be appended elsewhere.
if (mode != 3) {
newPat.append('[');
}
newPat.append(':').append(scratch).append(':');
if (mode != 3) {
newPat.append(']');
}
rebuildPattern = true;
if (mode == 3) {
// Entire pattern is a category; leave parse loop
// Entire pattern is a category; leave parse
// loop. This is one of 2 ways we leave this
// loop if the pattern is well-formed.
set(nestedSet);
mode = 4;
break;
}
} else {
// Recurse to get the i-list for this nested set.
pos.setIndex(i); // Add 2 to point AFTER op
// Recurse to get the pairs for this nested set.
// Backup i to '['.
pos.setIndex(--i);
switch (lastOp) {
case '-':
case '&':
newPat.append(lastOp);
break;
}
nestedSet = new UnicodeSet();
nestedSet.applyPattern(pattern, pos, symbols, ignoreWhitespace);
i = pos.getIndex() - 1; // - 1 to point at ']'
nestedSet._applyPattern(pattern, pos, symbols, newPat, ignoreWhitespace);
nestedPatDone = true;
i = pos.getIndex();
}
}
}
@ -1291,12 +1421,23 @@ public class UnicodeSet implements UnicodeFilter {
* ']' have special meanings.
*/
if (nestedSet != null) {
if (lastChar >= 0) {
if (lastChar != NONE) {
if (lastOp != 0) {
throw new IllegalArgumentException("Illegal rhs for " + lastChar + lastOp);
}
add((char) lastChar, (char) lastChar);
lastChar = -1;
if (nestedPatDone) {
// If there was a character before the nested set,
// then we need to insert it in newPat before the
// pattern for the nested set. This position was
// recorded in nestedPatStart.
StringBuffer s = new StringBuffer();
_appendToPat(s, lastChar, false);
newPat.insert(nestedPatStart, s.toString());
} else {
_appendToPat(newPat, lastChar, false);
}
lastChar = NONE;
}
switch (lastOp) {
case '-':
@ -1309,7 +1450,19 @@ public class UnicodeSet implements UnicodeFilter {
addAll(nestedSet);
break;
}
// Get the pattern for the nested set, if we haven't done so
// already.
if (!nestedPatDone) {
if (lastOp != 0) {
newPat.append(lastOp);
}
nestedSet._toPattern(newPat, false);
}
rebuildPattern = true;
lastOp = 0;
} else if (!isLiteral && c == ']') {
// Final closing delimiter. This is the only way we leave this
// loop if the pattern is well-formed.
@ -1318,11 +1471,14 @@ public class UnicodeSet implements UnicodeFilter {
}
if (anchor == 2) {
rebuildPattern = true;
newPat.append(SymbolTable.SYMBOL_REF);
add(TransliterationRule.ETHER);
}
mode = 4;
break;
} else if (lastOp == 0 && !isLiteral && (c == '-' || c == '&')) {
lastOp = c;
lastOp = (char) c;
} else if (lastOp == '-') {
if (lastChar >= c) {
// Don't allow redundant (a-a) or empty (b-a) ranges;
@ -1330,36 +1486,45 @@ public class UnicodeSet implements UnicodeFilter {
throw new IllegalArgumentException("Invalid range " + lastChar +
'-' + c);
}
add((char) lastChar, c);
add(lastChar, c);
_appendToPat(newPat, lastChar, false);
newPat.append('-');
_appendToPat(newPat, c, false);
lastOp = 0;
lastChar = -1;
lastChar = NONE;
} else if (lastOp != 0) {
// We have <set>&<char> or <char>&<char>
throw new IllegalArgumentException("Unquoted " + lastOp);
} else {
if (lastChar >= 0) {
if (lastChar != NONE) {
// We have <char><char>
add((char) lastChar, (char) lastChar);
_appendToPat(newPat, lastChar, false);
}
lastChar = c;
}
}
if (mode == 0) {
throw new IllegalArgumentException("Missing '[' in \"" +
pattern.substring(start) + '"');
if (lastChar != NONE) {
add(lastChar, lastChar);
_appendToPat(newPat, lastChar, false);
}
// if (mode == 0) {
// throw new IllegalArgumentException("Missing '[' in \"" +
// pattern.substring(start) + '"');
// }
// Handle unprocessed stuff preceding the closing ']'
if (lastOp == '-') {
// Trailing '-' is treated as literal
add(lastOp, lastOp);
newPat.append('-');
} else if (lastOp == '&') {
throw new IllegalArgumentException("Unquoted trailing " + lastOp);
}
if (lastChar >= 0) {
add((char) lastChar, (char) lastChar);
}
newPat.append(']');
/**
* If we saw a '^' after the initial '[' of this pattern, then perform
@ -1369,17 +1534,30 @@ public class UnicodeSet implements UnicodeFilter {
complement();
}
/**
* i indexes the last character we parsed or is pattern.length(). In
* the latter case, we have run off the end without finding a closing
* ']'. Otherwise, we know i < pattern.length(), and we set the
* ParsePosition to the next character to be parsed.
*/
if (i == limit) {
throw new IllegalArgumentException("Missing ']' in \"" +
pattern.substring(start) + '"');
if (mode != 4) {
throw new IllegalArgumentException("Missing ']'");
}
// /**
// * i indexes the last character we parsed or is pattern.length(). In
// * the latter case, we have run off the end without finding a closing
// * ']'. Otherwise, we know i < pattern.length(), and we set the
// * ParsePosition to the next character to be parsed.
// */
// if (i == limit) {
// throw new IllegalArgumentException("Missing ']' in \"" +
// pattern.substring(start) + '"');
// }
pos.setIndex(i);
// Use the rebuilt pattern (newPat) only if necessary. Prefer the
// generated pattern.
if (rebuildPattern) {
rebuiltPat.append(newPat.toString());
} else {
_generatePattern(rebuiltPat, false);
}
pos.setIndex(i+1);
if (false) {
// Debug parser
@ -1494,14 +1672,6 @@ public class UnicodeSet implements UnicodeFilter {
// Implementation: Utility methods
//----------------------------------------------------------------
/**
 * Looks one position past <code>i</code> in <code>str</code>.
 *
 * @param str the string to inspect
 * @param i the index whose successor is wanted
 * @return the character at index <code>i + 1</code>, or '\uFFFE' when
 * that index is at or beyond the end of the string
 */
private static final char charAfter(String str, int i) {
    final int next = i + 1;
    if (next < str.length()) {
        return str.charAt(next);
    }
    return '\uFFFE';
}
private void ensureCapacity(int newLen) {
if (newLen <= list.length) return;
int[] temp = new int[newLen + GROW_EXTRA];
@ -1571,6 +1741,7 @@ public class UnicodeSet implements UnicodeFilter {
int[] temp = list;
list = buffer;
buffer = temp;
pat = null;
return this;
}
@ -1668,6 +1839,7 @@ public class UnicodeSet implements UnicodeFilter {
int[] temp = list;
list = buffer;
buffer = temp;
pat = null;
return this;
}
@ -1738,6 +1910,7 @@ public class UnicodeSet implements UnicodeFilter {
int[] temp = list;
list = buffer;
buffer = temp;
pat = null;
return this;
}