ICU-1560 added to test program, fixed Katakana

X-SVN-Rev: 7201
This commit is contained in:
Mark Davis 2001-11-30 01:59:45 +00:00
parent 78a66d2c3b
commit 51b90b9996
6 changed files with 78 additions and 18 deletions

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/tool/translit/UnicodeSetClosure.java,v $
* $Date: 2001/11/30 00:27:06 $
* $Revision: 1.2 $
* $Date: 2001/11/30 01:59:45 $
* $Revision: 1.3 $
*
*****************************************************************************************
*/
@ -65,8 +65,8 @@ public class UnicodeSetClosure {
+ s.toPattern(true)
+ (forward ? "" : " )")
+ " ;");
out.println("Unicode: " + s.toPattern(false));
out.println();
out.println("Unicode: " + s.toPattern(false));
}
public static void test() throws Exception {

View File

@ -3,14 +3,14 @@
# Corporation and others. All Rights Reserved.
#--------------------------------------------------------------------
# $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/Attic/Transliterator_Latin_Katakana.txt,v $
# $Date: 2001/11/30 01:04:41 $
# $Revision: 1.15 $
# $Date: 2001/11/30 01:59:44 $
# $Revision: 1.16 $
#--------------------------------------------------------------------
# note: a global filter is more efficient, but MUST include all source chars
#:: [\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]] ;
# MINIMAL FILTER GENERATED FOR: Latin-Katakana
:: [',.A-Za-z~\u00A8\u00AA\u00AF\u00B4\u00B8\u00BA\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u010F\u0112-\u0125\u0128-\u0130\u0132-\u0137\u0139-\u0140\u0143-\u0149\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01C4-\u01C5\u01C7-\u01C8\u01CA-\u01CB\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01F2\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u02B0\u02B2-\u02B3\u02B7-\u02B8\u02BE\u02D8-\u02DD\u02E1-\u02E3\u0300-\u034E\u0360-\u0362\u0384-\u0386\u0388-\u038A\u038C\u038E-\u0390\u03AA-\u03B0\u03CA-\u03CE\u03D3-\u03D4\u0400-\u0401\u0403\u0407\u040C-\u040E\u0419\u0439\u0450-\u0451\u0453\u0457\u045C-\u045E\u0476-\u0477\u0483-\u0486\u04C1-\u04C2\u04D0-\u04D3\u04D6-\u04D7\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F5\u04F8-\u04F9\u0591-\u05A1\u05A3-\u05B9\u05BB-\u05BD\u05BF\u05C1-\u05C2\u05C4\u0622-\u0626\u064B-\u0655\u0670\u06C0\u06C2\u06D3\u06D6-\u06DC\u06DF-\u06E4\u06E7-\u06E8\u06EA-\u06ED\u0711\u0730-\u074A\u07A6-\u07B0\u0901-\u0902\u0929\u0931\u0934\u093C\u0941-\u0948\u094D\u0951-\u0954\u0958-\u095F\u0962-\u0963\u0981\u09BC\u09C1-\u09C4\u09CD\u09DC-\u09DD\u09DF\u09E2-\u09E3\u0A02\u0A33\u0A36\u0A3C\u0A41-\u0A42\u0A47-\u0A48\u0A4B-\u0A4D\u0A59-\u0A5B\u0A5E\u0A70-\u0A71\u0A81-\u0A82\u0ABC\u0AC1-\u0AC5\u0AC7-\u0AC8\u0ACD\u0B01\u0B3C\u0B3F\u0B41-\u0B43\u0B48\u0B4D\u0B56\u0B5C-\u0B5D\u0B82\u0BC0\u0BCD\u0C3E-\u0C40\u0C46-\u0C48\u0C4A-\u0C4D\u0C55-\u0C56\u0CBF-\u0CC0\u0CC6-\u0CC8\u0CCA-\u0CCD\u0D41-\u0D43\u0D4D\u0DCA\u0DD2-\u0DD4\u0DD6\u0DDA\u0DDD\u0E31\u0E33-\u0E3A\u0E47-\u0E4E\u0EB1\u0EB3-\u0EB9\u0EBB-\u0EBC\u0EC8-\u0ECD\u0F18-\u0F19\u0F35\u0F37\u0F39\u0F43\u0F4D\u0F52\u0F57\u0F5C\u0F69\u0F71-\u0F7E\u0F80-\u0F84\u0F86-\u0F87\u0F90-\u0F97\u0F99-\u0FBC\u0FC6\u1026\u102D-\u1030\u1032\u1036-\u1037\u1039\u1058-\u1059\u17B7-\u17BD\u17C6\u17C9-\u17D3\u18A9\u1E00-\u1E9A\u1EA0-\u1EF9\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB2\u1FB4\u1FB6-\u1FBB\u1FBD\u1FBF-\u1FC2\u1FC4\u1FC6-\u1FCB\u1FCD-\u1FD3\u1FD6-\u1FDB\u1FDD-\u1FEE\u
1FF2\u1FF4\u1FF6-\u1FFB\u1FFD-\u1FFE\u2017\u2024-\u2026\u203E\u207F\u20A8\u20D0-\u20DC\u20E1\u2100-\u2103\u2105-\u2106\u2109-\u210E\u2110-\u2113\u2115-\u2116\u2119-\u211D\u2120-\u2122\u2124\u2128\u212A-\u212D\u212F-\u2131\u2133-\u2134\u2139\u2160-\u216F\u219A-\u219B\u21AE\u21CD-\u21CF\u2204\u2209\u220C\u2224\u2226\u2241\u2244\u2247\u2249\u2260\u2262\u226D-\u2271\u2274-\u2275\u2278-\u2279\u2280-\u2281\u2284-\u2285\u2288-\u2289\u22AC-\u22AF\u22E0-\u22E3\u22EA-\u22ED\u2491-\u24CF\u302A-\u302F\u3371-\u3376\u3380-\u33DD\uFB00-\uFB06\uFB1D-\uFB1F\uFB2A-\uFB36\uFB38-\uFB3C\uFB3E\uFB40-\uFB41\uFB43-\uFB44\uFB46-\uFB4E\uFBA4-\uFBA5\uFBB0-\uFBB1\uFBEA-\uFBFB\uFC00-\uFC04\uFC5B-\uFC69\uFC90\uFC97-\uFC9B\uFCD9\uFCDF-\uFCE0\uFCF2-\uFCF4\uFD3C-\uFD3D\uFE20-\uFE23\uFE30\uFE49-\uFE4C\uFE50\uFE52\uFE70-\uFE72\uFE74\uFE76-\uFE7F\uFE81-\uFE8C\uFEF5-\uFEFA\uFF07\uFF0C\uFF0E\uFF21-\uFF3A\uFF41-\uFF5A\uFF5E\uFFE3\U0001D167-\U0001D169\U0001D17B-\U0001D182\U0001D185-\U0001D18B\U0001D1AA-\U0001D1AD] ;
:: [',.A-Za-z~\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01ED\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0304\u04E2-\u04E3\u04EE-\u04EF\u1E00-\u1E99\u1EA0-\u1EF9\u1FB1\u1FB9\u1FD1\u1FD9\u1FE1\u1FE9\u212A-\u212B] ;
:: NFD (NFC); # use NFKD to get the fullwidth latin characters
:: Lower (); # whenever transliterating from cased to uncased script, include this
@ -491,6 +491,6 @@ x > | ks ;
# note: a global filter is more efficient, but MUST include all source chars!!
#:: ([\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]]);
# MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
:: ( [~\u3001-\u3002\u30A1-\u30FA\u30FC-\u30FE\u32D0-\u32FE\u3300-\u3357\uFE51\uFF5E\uFF61\uFF64\uFF66-\uFF9D] ) ;
:: ( [~\u3001-\u3002\u30A1-\u30FA\u30FC-\u30FE] ) ;
# eof

View File

@ -10,15 +10,21 @@
package com.ibm.text;
import java.util.*;
import com.ibm.tools.translit.UnicodeSetClosure;
import java.io.*;
/**
* This is a small class that resides in the com.ibm.text package in
* order to access some package-private API. It is used for
* development purposes and should be ignored by end clients.
* To run, use:
* java -classpath classes com.ibm.text.TransliteratorUtility Latin-Katakana NFD lower
* Output is produced in the command console, and a file with more detail is also written.
* To see if it works, use:
* java -classpath classes com.ibm.test.translit.TransliteratorTest -v -nothrow TestIncrementalProgress
*/
public class TransliteratorUtility {
public static void main(String[] args) {
public static void main(String[] args) throws IOException {
if (args.length == 0) {
// Compute and display the source sets for all system
// transliterators.
@ -53,14 +59,38 @@ public class TransliteratorUtility {
}
}
static void showSourceSet(String ID, Normalizer.Mode m, boolean lowerFirst) {
/**
 * Computes the source set of the transliterator with the given ID and of
 * its inverse, prints both to the console, and writes ready-to-paste
 * filter rules for both directions to the file UnicodeSetClosure.txt
 * (encoded as UTF-8, resolved from the relative path via getCanonicalFile).
 *
 * @param ID         ID handed to Transliterator.getInstance
 * @param m          normalization mode forwarded to UnicodeSetClosure.close
 * @param lowerFirst flag forwarded to UnicodeSetClosure.close
 * @throws IOException if the output file cannot be resolved or opened
 */
static void showSourceSet(String ID, Normalizer.Mode m, boolean lowerFirst) throws IOException {
    File f = new File("UnicodeSetClosure.txt");
    String filename = f.getCanonicalFile().toString();
    out = new PrintWriter(
        new OutputStreamWriter(
            new FileOutputStream(filename), "UTF-8"));
    try {
        // The BOM must appear exactly once, as the very first character of
        // the file. Emitting it per direction would put a stray U+FEFF in
        // front of the BACKWARD "#" comment line.
        out.print('\uFEFF'); // BOM
        System.out.println();
        System.out.println("Writing " + filename);
        Transliterator t = Transliterator.getInstance(ID);
        showSourceSetAux(t, m, lowerFirst, true);
        showSourceSetAux(t.getInverse(), m, lowerFirst, false);
    } finally {
        // Always release the file handle, even if the transliterator
        // lookup or the closure computation throws.
        out.close();
    }
}

/**
 * Destination for the detailed report; opened and closed by showSourceSet
 * and written to by showSourceSetAux.
 */
static PrintWriter out;

/**
 * Reports the source set of a single transliterator: prints it to the
 * console and appends a filter rule plus a commented, unescaped form of
 * the same set to {@link #out}. Expects {@link #out} to be open already.
 *
 * @param t          transliterator whose source set is reported
 * @param m          normalization mode forwarded to UnicodeSetClosure.close
 * @param lowerFirst flag forwarded to UnicodeSetClosure.close
 * @param forward    true for the forward rule; false wraps the set in
 *                   "( ... )" and labels the section BACKWARD
 * @throws IOException declared for callers; PrintWriter itself does not
 *                     throw on write failures
 */
static void showSourceSetAux(Transliterator t, Normalizer.Mode m, boolean lowerFirst, boolean forward) throws IOException {
    UnicodeSet sourceSet = t.getSourceSet();
    // Closure is only needed when normalization or lowercasing is requested.
    // NOTE(review): close() appears to modify sourceSet in place, since its
    // result is not used here -- confirm against UnicodeSetClosure.
    if (m != Normalizer.NO_OP || lowerFirst) {
        UnicodeSetClosure.close(sourceSet, m, lowerFirst);
    }
    System.out.println(t.getID() + ": " +
                       sourceSet.toPattern(true));
    out.println("# MINIMAL FILTER GENERATED FOR: " + t.getID() + (forward ? "" : " BACKWARD"));
    out.println(":: "
        + (forward ? "" : "( ")
        + sourceSet.toPattern(true)
        + (forward ? "" : " )")
        + " ;");
    out.println("# Unicode: " + sourceSet.toPattern(false));
    out.println();
}
static void usage() {

View File

@ -10,15 +10,21 @@
package com.ibm.text;
import java.util.*;
import com.ibm.tools.translit.UnicodeSetClosure;
import java.io.*;
/**
* This is a small class that resides in the com.ibm.text package in
* order to access some package-private API. It is used for
* development purposes and should be ignored by end clients.
* To run, use:
* java -classpath classes com.ibm.text.TransliteratorUtility Latin-Katakana NFD lower
* Output is produced in the command console, and a file with more detail is also written.
* To see if it works, use:
* java -classpath classes com.ibm.test.translit.TransliteratorTest -v -nothrow TestIncrementalProgress
*/
public class TransliteratorUtility {
public static void main(String[] args) {
public static void main(String[] args) throws IOException {
if (args.length == 0) {
// Compute and display the source sets for all system
// transliterators.
@ -53,14 +59,38 @@ public class TransliteratorUtility {
}
}
static void showSourceSet(String ID, Normalizer.Mode m, boolean lowerFirst) {
/**
 * Computes the source set of the transliterator with the given ID and of
 * its inverse, prints both to the console, and writes ready-to-paste
 * filter rules for both directions to the file UnicodeSetClosure.txt
 * (encoded as UTF-8, resolved from the relative path via getCanonicalFile).
 *
 * @param ID         ID handed to Transliterator.getInstance
 * @param m          normalization mode forwarded to UnicodeSetClosure.close
 * @param lowerFirst flag forwarded to UnicodeSetClosure.close
 * @throws IOException if the output file cannot be resolved or opened
 */
static void showSourceSet(String ID, Normalizer.Mode m, boolean lowerFirst) throws IOException {
    File f = new File("UnicodeSetClosure.txt");
    String filename = f.getCanonicalFile().toString();
    out = new PrintWriter(
        new OutputStreamWriter(
            new FileOutputStream(filename), "UTF-8"));
    try {
        // The BOM must appear exactly once, as the very first character of
        // the file. Emitting it per direction would put a stray U+FEFF in
        // front of the BACKWARD "#" comment line.
        out.print('\uFEFF'); // BOM
        System.out.println();
        System.out.println("Writing " + filename);
        Transliterator t = Transliterator.getInstance(ID);
        showSourceSetAux(t, m, lowerFirst, true);
        showSourceSetAux(t.getInverse(), m, lowerFirst, false);
    } finally {
        // Always release the file handle, even if the transliterator
        // lookup or the closure computation throws.
        out.close();
    }
}

/**
 * Destination for the detailed report; opened and closed by showSourceSet
 * and written to by showSourceSetAux.
 */
static PrintWriter out;

/**
 * Reports the source set of a single transliterator: prints it to the
 * console and appends a filter rule plus a commented, unescaped form of
 * the same set to {@link #out}. Expects {@link #out} to be open already.
 *
 * @param t          transliterator whose source set is reported
 * @param m          normalization mode forwarded to UnicodeSetClosure.close
 * @param lowerFirst flag forwarded to UnicodeSetClosure.close
 * @param forward    true for the forward rule; false wraps the set in
 *                   "( ... )" and labels the section BACKWARD
 * @throws IOException declared for callers; PrintWriter itself does not
 *                     throw on write failures
 */
static void showSourceSetAux(Transliterator t, Normalizer.Mode m, boolean lowerFirst, boolean forward) throws IOException {
    UnicodeSet sourceSet = t.getSourceSet();
    // Closure is only needed when normalization or lowercasing is requested.
    // NOTE(review): close() appears to modify sourceSet in place, since its
    // result is not used here -- confirm against UnicodeSetClosure.
    if (m != Normalizer.NO_OP || lowerFirst) {
        UnicodeSetClosure.close(sourceSet, m, lowerFirst);
    }
    System.out.println(t.getID() + ": " +
                       sourceSet.toPattern(true));
    out.println("# MINIMAL FILTER GENERATED FOR: " + t.getID() + (forward ? "" : " BACKWARD"));
    out.println(":: "
        + (forward ? "" : "( ")
        + sourceSet.toPattern(true)
        + (forward ? "" : " )")
        + " ;");
    out.println("# Unicode: " + sourceSet.toPattern(false));
    out.println();
}
static void usage() {

View File

@ -3,14 +3,14 @@
# Corporation and others. All Rights Reserved.
#--------------------------------------------------------------------
# $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/resources/Attic/Transliterator_Latin_Katakana.txt,v $
# $Date: 2001/11/30 01:04:41 $
# $Revision: 1.15 $
# $Date: 2001/11/30 01:59:44 $
# $Revision: 1.16 $
#--------------------------------------------------------------------
# note: a global filter is more efficient, but MUST include all source chars
#:: [\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]] ;
# MINIMAL FILTER GENERATED FOR: Latin-Katakana
:: [',.A-Za-z~\u00A8\u00AA\u00AF\u00B4\u00B8\u00BA\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u010F\u0112-\u0125\u0128-\u0130\u0132-\u0137\u0139-\u0140\u0143-\u0149\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01C4-\u01C5\u01C7-\u01C8\u01CA-\u01CB\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01F2\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u02B0\u02B2-\u02B3\u02B7-\u02B8\u02BE\u02D8-\u02DD\u02E1-\u02E3\u0300-\u034E\u0360-\u0362\u0384-\u0386\u0388-\u038A\u038C\u038E-\u0390\u03AA-\u03B0\u03CA-\u03CE\u03D3-\u03D4\u0400-\u0401\u0403\u0407\u040C-\u040E\u0419\u0439\u0450-\u0451\u0453\u0457\u045C-\u045E\u0476-\u0477\u0483-\u0486\u04C1-\u04C2\u04D0-\u04D3\u04D6-\u04D7\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F5\u04F8-\u04F9\u0591-\u05A1\u05A3-\u05B9\u05BB-\u05BD\u05BF\u05C1-\u05C2\u05C4\u0622-\u0626\u064B-\u0655\u0670\u06C0\u06C2\u06D3\u06D6-\u06DC\u06DF-\u06E4\u06E7-\u06E8\u06EA-\u06ED\u0711\u0730-\u074A\u07A6-\u07B0\u0901-\u0902\u0929\u0931\u0934\u093C\u0941-\u0948\u094D\u0951-\u0954\u0958-\u095F\u0962-\u0963\u0981\u09BC\u09C1-\u09C4\u09CD\u09DC-\u09DD\u09DF\u09E2-\u09E3\u0A02\u0A33\u0A36\u0A3C\u0A41-\u0A42\u0A47-\u0A48\u0A4B-\u0A4D\u0A59-\u0A5B\u0A5E\u0A70-\u0A71\u0A81-\u0A82\u0ABC\u0AC1-\u0AC5\u0AC7-\u0AC8\u0ACD\u0B01\u0B3C\u0B3F\u0B41-\u0B43\u0B48\u0B4D\u0B56\u0B5C-\u0B5D\u0B82\u0BC0\u0BCD\u0C3E-\u0C40\u0C46-\u0C48\u0C4A-\u0C4D\u0C55-\u0C56\u0CBF-\u0CC0\u0CC6-\u0CC8\u0CCA-\u0CCD\u0D41-\u0D43\u0D4D\u0DCA\u0DD2-\u0DD4\u0DD6\u0DDA\u0DDD\u0E31\u0E33-\u0E3A\u0E47-\u0E4E\u0EB1\u0EB3-\u0EB9\u0EBB-\u0EBC\u0EC8-\u0ECD\u0F18-\u0F19\u0F35\u0F37\u0F39\u0F43\u0F4D\u0F52\u0F57\u0F5C\u0F69\u0F71-\u0F7E\u0F80-\u0F84\u0F86-\u0F87\u0F90-\u0F97\u0F99-\u0FBC\u0FC6\u1026\u102D-\u1030\u1032\u1036-\u1037\u1039\u1058-\u1059\u17B7-\u17BD\u17C6\u17C9-\u17D3\u18A9\u1E00-\u1E9A\u1EA0-\u1EF9\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB2\u1FB4\u1FB6-\u1FBB\u1FBD\u1FBF-\u1FC2\u1FC4\u1FC6-\u1FCB\u1FCD-\u1FD3\u1FD6-\u1FDB\u1FDD-\u1FEE\u
1FF2\u1FF4\u1FF6-\u1FFB\u1FFD-\u1FFE\u2017\u2024-\u2026\u203E\u207F\u20A8\u20D0-\u20DC\u20E1\u2100-\u2103\u2105-\u2106\u2109-\u210E\u2110-\u2113\u2115-\u2116\u2119-\u211D\u2120-\u2122\u2124\u2128\u212A-\u212D\u212F-\u2131\u2133-\u2134\u2139\u2160-\u216F\u219A-\u219B\u21AE\u21CD-\u21CF\u2204\u2209\u220C\u2224\u2226\u2241\u2244\u2247\u2249\u2260\u2262\u226D-\u2271\u2274-\u2275\u2278-\u2279\u2280-\u2281\u2284-\u2285\u2288-\u2289\u22AC-\u22AF\u22E0-\u22E3\u22EA-\u22ED\u2491-\u24CF\u302A-\u302F\u3371-\u3376\u3380-\u33DD\uFB00-\uFB06\uFB1D-\uFB1F\uFB2A-\uFB36\uFB38-\uFB3C\uFB3E\uFB40-\uFB41\uFB43-\uFB44\uFB46-\uFB4E\uFBA4-\uFBA5\uFBB0-\uFBB1\uFBEA-\uFBFB\uFC00-\uFC04\uFC5B-\uFC69\uFC90\uFC97-\uFC9B\uFCD9\uFCDF-\uFCE0\uFCF2-\uFCF4\uFD3C-\uFD3D\uFE20-\uFE23\uFE30\uFE49-\uFE4C\uFE50\uFE52\uFE70-\uFE72\uFE74\uFE76-\uFE7F\uFE81-\uFE8C\uFEF5-\uFEFA\uFF07\uFF0C\uFF0E\uFF21-\uFF3A\uFF41-\uFF5A\uFF5E\uFFE3\U0001D167-\U0001D169\U0001D17B-\U0001D182\U0001D185-\U0001D18B\U0001D1AA-\U0001D1AD] ;
:: [',.A-Za-z~\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01ED\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0304\u04E2-\u04E3\u04EE-\u04EF\u1E00-\u1E99\u1EA0-\u1EF9\u1FB1\u1FB9\u1FD1\u1FD9\u1FE1\u1FE9\u212A-\u212B] ;
:: NFD (NFC); # use NFKD to get the fullwidth latin characters
:: Lower (); # whenever transliterating from cased to uncased script, include this
@ -491,6 +491,6 @@ x > | ks ;
# note: a global filter is more efficient, but MUST include all source chars!!
#:: ([\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]]);
# MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
:: ( [~\u3001-\u3002\u30A1-\u30FA\u30FC-\u30FE\u32D0-\u32FE\u3300-\u3357\uFE51\uFF5E\uFF61\uFF64\uFF66-\uFF9D] ) ;
:: ( [~\u3001-\u3002\u30A1-\u30FA\u30FC-\u30FE] ) ;
# eof

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/tools/translit/Attic/UnicodeSetClosure.java,v $
* $Date: 2001/11/30 00:27:06 $
* $Revision: 1.2 $
* $Date: 2001/11/30 01:59:45 $
* $Revision: 1.3 $
*
*****************************************************************************************
*/
@ -65,8 +65,8 @@ public class UnicodeSetClosure {
+ s.toPattern(true)
+ (forward ? "" : " )")
+ " ;");
out.println("Unicode: " + s.toPattern(false));
out.println();
out.println("Unicode: " + s.toPattern(false));
}
public static void test() throws Exception {