ICU-1472 fixes for halfwidth (also iteration mark)
X-SVN-Rev: 6835
This commit is contained in:
parent
3d4e45e02d
commit
1abc1b2372
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/WriteCharts.java,v $
|
||||
* $Date: 2001/11/13 02:50:11 $
|
||||
* $Revision: 1.6 $
|
||||
* $Date: 2001/11/13 20:03:51 $
|
||||
* $Revision: 1.7 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -21,9 +21,11 @@ import java.io.*;
|
||||
|
||||
public class WriteCharts {
|
||||
public static void main(String[] args) throws IOException {
|
||||
if (false) testSet();
|
||||
if (false) {
|
||||
printSet("[[\u0000-\u007E \u30A1-\u30FC \uFF61-\uFF9F\u3001\u3002][:Katakana:][:Mark:]]");
|
||||
}
|
||||
String testSet = "";
|
||||
if (args.length == 0) args = all;
|
||||
if (args.length == 0) args = getAllScripts();
|
||||
for (int i = 0; i < args.length; ++i) {
|
||||
// Enumeration enum = Transliterator.getAvailableIDs();
|
||||
if (args[i].startsWith("[")) {
|
||||
@ -35,8 +37,9 @@ public class WriteCharts {
|
||||
}
|
||||
}
|
||||
|
||||
public static void testSet() {
|
||||
UnicodeSet s = new UnicodeSet("[[\u0000-\u007E \u30A1-\u30FC \uFF61-\uFF9F\u3001\u3002][:Katakana:][:Mark:]]");
|
||||
public static void printSet(String source) {
|
||||
UnicodeSet s = new UnicodeSet(source);
|
||||
System.out.println("Printout for '" + source + "'");
|
||||
int count = s.getRangeCount();
|
||||
for (int i = 0; i < count; ++i) {
|
||||
int start = s.getRangeStart(i);
|
||||
@ -45,14 +48,63 @@ public class WriteCharts {
|
||||
}
|
||||
}
|
||||
|
||||
static final String[] all = {
|
||||
"Cyrillic-Latin", "Greek-Latin",
|
||||
"el-Latin",
|
||||
"Devanagari-Tamil", "Devanagari-Latin",
|
||||
"Katakana-Latin", "Hiragana-Latin", "Hangul-Latin"
|
||||
public static String[] getAllScripts() {
|
||||
Set set = new TreeSet();
|
||||
int scripts[];
|
||||
Enumeration sources = Transliterator.getAvailableSources();
|
||||
while(sources.hasMoreElements()) {
|
||||
String source = (String) sources.nextElement();
|
||||
scripts = UScript.getCode(source);
|
||||
int sourceScript = scripts[0];
|
||||
if (sourceScript == UScript.INVALID_CODE) {
|
||||
System.out.println("[Skipping " + source + "]");
|
||||
continue;
|
||||
}
|
||||
System.out.println("Source: " + source + ";\tScripts: " + showScripts(scripts));
|
||||
Enumeration targets = Transliterator.getAvailableTargets(source);
|
||||
while(targets.hasMoreElements()) {
|
||||
String target = (String) targets.nextElement();
|
||||
scripts = UScript.getCode(target);
|
||||
int targetScript = scripts[0];
|
||||
if (targetScript == UScript.INVALID_CODE
|
||||
|| targetScript < sourceScript) {
|
||||
// skip doing both directions
|
||||
System.out.println("[Skipping '" + source + "-" + target + "']");
|
||||
continue;
|
||||
}
|
||||
System.out.println("\tTarget: " + target + ";\tScripts: " + showScripts(scripts));
|
||||
Enumeration variants = Transliterator.getAvailableVariants(source, target);
|
||||
while(variants.hasMoreElements()) {
|
||||
String variant = (String) variants.nextElement();
|
||||
String id = source + "-" + target;
|
||||
if (variant.length() != 0) {
|
||||
id += "/" + variant;
|
||||
if (true) {
|
||||
System.out.println("SKIPPING VARIANT, SINCE IT CURRENTLY BREAKS!\t" + id);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
System.out.println("\t\t\t\tAdding: '" + id + "'");
|
||||
set.add(id);
|
||||
}
|
||||
}
|
||||
}
|
||||
String[] results = new String[set.size()];
|
||||
set.toArray(results);
|
||||
return results;
|
||||
};
|
||||
|
||||
public static String showScripts(int[] scripts) {
|
||||
StringBuffer results = new StringBuffer();
|
||||
for (int i = 0; i < scripts.length; ++i) {
|
||||
if (i != 0) results.append(", ");
|
||||
results.append(UScript.getName(scripts[i]));
|
||||
}
|
||||
return results.toString();
|
||||
}
|
||||
|
||||
public static void print(String testSet, String rawId) throws IOException {
|
||||
System.out.println("Processing " + rawId);
|
||||
Transliterator t = Transliterator.getInstance(rawId);
|
||||
String id = t.getID();
|
||||
|
||||
@ -73,11 +125,15 @@ public class WriteCharts {
|
||||
return;
|
||||
} else {
|
||||
testSet = "[:" + source + ":]";
|
||||
if (source.equalsIgnoreCase("katakana")) {
|
||||
testSet = "[" + testSet + "\u30FC]";
|
||||
printSet(testSet);
|
||||
}
|
||||
}
|
||||
}
|
||||
UnicodeSet sourceSet = new UnicodeSet(testSet);
|
||||
|
||||
// check that the source is a script
|
||||
// check that the target is a script
|
||||
int[] scripts = UScript.getCode(target);
|
||||
if (scripts.length != 1) {
|
||||
target = "[:Latin:]";
|
||||
@ -88,7 +144,7 @@ public class WriteCharts {
|
||||
|
||||
Transliterator inverse = t.getInverse();
|
||||
|
||||
Transliterator hex = Transliterator.getInstance("Any-Hex");
|
||||
//Transliterator hex = Transliterator.getInstance("Any-Hex");
|
||||
|
||||
|
||||
// iterate through script
|
||||
@ -133,12 +189,41 @@ public class WriteCharts {
|
||||
|
||||
map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss, Normalizer.DECOMP_COMPAT, 0))
|
||||
+ "\u0000" + ss,
|
||||
"<tr><td>" + ss + "<br><tt>" + hex.transliterate(ss) + "</tt></td><td>"
|
||||
+ ts + "<br><tt>" + hex.transliterate(ts) + "</tt></td><td>"
|
||||
+ rt + "<br><tt>" + hex.transliterate(rt) + "</tt></td></tr>" );
|
||||
"<td class='s'>" + ss + "<br><tt>" + hex(ss)
|
||||
+ "</tt></td><td class='t'>" + ts + "<br><tt>" + hex(ts)
|
||||
+ "</tt></td><td class='r'>" + rt + "<br><tt>" + hex(rt) + "</tt></td>" );
|
||||
|
||||
// Check Duals
|
||||
/*
|
||||
int maxDual = 200;
|
||||
dual:
|
||||
for (int i2 = 0; i2 < count; ++i2) {
|
||||
int end2 = sourceSet.getRangeEnd(i2);
|
||||
for (int j2 = sourceSet.getRangeStart(i2); j2 <= end; ++j2) {
|
||||
String ss2 = UTF16.valueOf(j2);
|
||||
String ts2 = t.transliterate(ss2);
|
||||
String rt2 = inverse.transliterate(ts2);
|
||||
|
||||
String ss12 = ss + ss2;
|
||||
String ts12 = t.transliterate(ss + ss12);
|
||||
String rt12 = inverse.transliterate(ts12);
|
||||
if (ts12.equals(ts + ts2) && rt12.equals(rt + rt2)) continue;
|
||||
if (--maxDual < 0) break dual;
|
||||
|
||||
// transliteration of whole differs from that of parts
|
||||
group = 0x100;
|
||||
map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss12, Normalizer.DECOMP_COMPAT, 0))
|
||||
+ "\u0000" + ss12,
|
||||
"<td class='s'>" + ss12 + "<br><tt>" + hex(ss12)
|
||||
+ "</tt></td><td class='t'>" + ts12 + "<br><tt>" + hex(ts12)
|
||||
+ "</tt></td><td class='r'>" + rt12 + "<br><tt>" + hex(rt12) + "</tt></td>" );
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
leftOverSet.remove(0x0100,0x02FF); // remove extended & IPA
|
||||
|
||||
count = leftOverSet.getRangeCount();
|
||||
@ -161,13 +246,14 @@ public class WriteCharts {
|
||||
}
|
||||
|
||||
map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ts, Normalizer.DECOMP_COMPAT, 0)) + ts,
|
||||
"<tr><td>-</td><td>" + ts + "<br><tt>" + hex.transliterate(ts) + "</tt></td><td>"
|
||||
+ rt + "<br><tt>" + hex.transliterate(rt) + "</tt></td></tr>");
|
||||
"<td class='s'>-</td><td class='t'>" + ts + "<br><tt>" + hex(ts)
|
||||
+ "</tt></td><td class='r'>"
|
||||
+ rt + "<br><tt>" + hex(rt) + "</tt></td>");
|
||||
}
|
||||
}
|
||||
|
||||
// make file name and open
|
||||
File f = new File("chart_" + id.replace('/', '_') + ".html");
|
||||
File f = new File("transliteration/chart_" + id.replace('/', '_') + ".html");
|
||||
String filename = f.getCanonicalFile().toString();
|
||||
PrintWriter out = new PrintWriter(
|
||||
new OutputStreamWriter(
|
||||
@ -183,35 +269,64 @@ public class WriteCharts {
|
||||
out.println("<link rel='stylesheet' href='http://www.unicode.org/charts/uca/charts.css' type='text/css'>");
|
||||
|
||||
out.println("<BODY>");
|
||||
String tableHeader = "<p><table border='1'><tr><th>Source</th><th>Target</th><th>Return</th></tr>";
|
||||
out.println("<h1>Transliteration Samples for '" + Transliterator.getDisplayName(id) + "'</h1>");
|
||||
out.println("<p>This file illustrates the transliterations of " + Transliterator.getDisplayName(id) + ".");
|
||||
out.println("The samples are mechanically generated, and only include single characters");
|
||||
out.println("from the source set. Thus it will <i>not</i> contain examples where the transliteration");
|
||||
out.println("depends on the context around the character. For a more detailed -- and interactive -- example, see the");
|
||||
out.println("<a href='http://oss.software.ibm.com/cgi-bin/icu/tr'>Transliteration Demo</a></p><hr>");
|
||||
|
||||
// set up the headers
|
||||
int columnCount = 3;
|
||||
String headerBase = "<th>Source</th><th>Target</th><th>Return</th>";
|
||||
String headers = headerBase;
|
||||
for (int i = columnCount - 1; i > 0; --i) headers += headerBase;
|
||||
|
||||
String tableHeader = "<p><table border='1'><tr>" + headers + "</tr>";
|
||||
String tableFooter = "</table></p>";
|
||||
out.println("<h1>Round Trip</h1>");
|
||||
out.println("<h2>Round Trip</h2>");
|
||||
out.println(tableHeader);
|
||||
|
||||
Iterator it = map.keySet().iterator();
|
||||
char lastGroup = 0;
|
||||
count = 0;
|
||||
int column = 0;
|
||||
while (it.hasNext()) {
|
||||
String key = (String) it.next();
|
||||
char group = key.charAt(0);
|
||||
if (group != lastGroup || count++ > 50) {
|
||||
lastGroup = group;
|
||||
count = 0;
|
||||
if (column != 0) {
|
||||
out.println("</tr>");
|
||||
column = 0;
|
||||
}
|
||||
out.println(tableFooter);
|
||||
|
||||
String title = "";
|
||||
if ((group & 0x80) != 0) out.println("<hr><h1>Completeness</h1>");
|
||||
else out.println("<hr><h1>Round Trip</h1>");
|
||||
if ((group & 16) != 0) out.println("<h2>Errors: Contains Private Use Characters</h2>");
|
||||
if ((group & 8) != 0) out.println("<h2>Possible Errors: Return not in Source Set</h2>");
|
||||
if ((group & 4) != 0) out.println("<h2>Errors: Return not equal to Source</h2>");
|
||||
if ((group & 2) != 0) out.println("<h2>Errors: Return not in Source Set</h2>");
|
||||
if ((group & 1) != 0) out.println("<h2>Errors: Target not in Target Set</h2>");
|
||||
if ((group & 0x100) != 0) out.println("<hr><h2>Duals</h2>");
|
||||
else if ((group & 0x80) != 0) out.println("<hr><h2>Completeness</h2>");
|
||||
else out.println("<hr><h2>Round Trip</h2>");
|
||||
if ((group & 16) != 0) out.println("<h3>Errors: Contains Private Use Characters</h3>");
|
||||
if ((group & 8) != 0) out.println("<h3>Possible Errors: Return not in Source Set</h3>");
|
||||
if ((group & 4) != 0) out.println("<h3>One-Way Mapping: Return not equal to Source</h3>");
|
||||
if ((group & 2) != 0) out.println("<h3>Errors: Return not in Source Set</h3>");
|
||||
if ((group & 1) != 0) out.println("<h3>Errors: Target not in Target Set</h3>");
|
||||
|
||||
out.println(tableHeader);
|
||||
column = 0;
|
||||
}
|
||||
String value = (String) map.get(key);
|
||||
if (column++ == 0) out.print("<tr>");
|
||||
out.println(value);
|
||||
if (column == 3) {
|
||||
out.println("</tr>");
|
||||
column = 0;
|
||||
}
|
||||
}
|
||||
if (column != 0) {
|
||||
out.println("</tr>");
|
||||
column = 0;
|
||||
}
|
||||
out.println(tableFooter + "</BODY></HTML>");
|
||||
|
||||
@ -220,6 +335,17 @@ public class WriteCharts {
|
||||
}
|
||||
}
|
||||
|
||||
public static String hex(String s) {
|
||||
int cp;
|
||||
StringBuffer results = new StringBuffer();
|
||||
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(s, i);
|
||||
if (i != 0) results.append(' ');
|
||||
results.append(Integer.toHexString(cp));
|
||||
}
|
||||
return results.toString().toUpperCase();
|
||||
}
|
||||
|
||||
static final UnicodeSet okAnyway = new UnicodeSet("[^[:Letter:]]");
|
||||
|
||||
/*
|
||||
|
@ -3,8 +3,8 @@
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#--------------------------------------------------------------------
|
||||
# $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/Attic/Transliterator_Latin_Katakana.txt,v $
|
||||
# $Date: 2001/11/13 02:57:28 $
|
||||
# $Revision: 1.10 $
|
||||
# $Date: 2001/11/13 20:03:22 $
|
||||
# $Revision: 1.11 $
|
||||
#--------------------------------------------------------------------
|
||||
|
||||
#:: [\u0020-\u00FF [:Latin:][:Mark:]] ; # note: a global filter is more efficient, but MUST include all source chars
|
||||
@ -62,6 +62,7 @@
|
||||
# Variables
|
||||
|
||||
$vowel = [aeiou] ;
|
||||
$consonant = [bcdfghjklmnpqrstvwxyz] ;
|
||||
$macron = \u0304 ;
|
||||
|
||||
# Variables used for doubled-consonants with tsu
|
||||
@ -387,6 +388,30 @@ $macron <> ー ;
|
||||
'~ye' > ェ ;
|
||||
'~yo' <> ョ ;
|
||||
|
||||
# iteration marks
|
||||
# TODO: make more accurate
|
||||
|
||||
j $1 < sh (y* $vowel) {ヽ$voice ;
|
||||
dj $1 < ch (y* $vowel) {ヽ$voice ;
|
||||
dz $1 < ts (y* $vowel) {ヽ$voice ;
|
||||
|
||||
g $1 < k (y* $vowel) {ヽ$voice ;
|
||||
z $1 < s (y* $vowel) {ヽ$voice ;
|
||||
d $1 < t (y* $vowel) {ヽ$voice ;
|
||||
h $1 < b (y* $vowel) {ヽ$voice ;
|
||||
v $1 < w (y* $vowel) {ヽ$voice ;
|
||||
|
||||
sh $1 < sh (y* $vowel) {ヽ$voice ;
|
||||
j $1 < j (y* $vowel) {ヽ$voice ;
|
||||
ch $1 < ch (y* $vowel) {ヽ$voice ;
|
||||
dj $1 < dj(y* $vowel) {ヽ$voice ;
|
||||
ts $1 < ts (y* $vowel) {ヽ$voice ;
|
||||
dz $1 < dz (y* $vowel) {ヽ$voice ;
|
||||
|
||||
$1 < ($consonant y* $vowel) {ヽ$voice? ;
|
||||
$1 < (.) {ヽ $voice? ; # otherwise repeat last character
|
||||
< ヽ $voice? ; # delete if no characters found
|
||||
|
||||
# h- rule: lengthens vowel if not followed by a vowel
|
||||
|
||||
[aeiou] } h > ー ;
|
||||
@ -435,6 +460,12 @@ f > フ;
|
||||
j > ジ;
|
||||
w > ウ;
|
||||
|
||||
ß > | ss ;
|
||||
æ > | e ;
|
||||
ð > | d ;
|
||||
ø > | u ;
|
||||
þ > | th ;
|
||||
|
||||
# simple substitutions using backup
|
||||
|
||||
c > | k ;
|
||||
@ -449,7 +480,7 @@ x > | ks ;
|
||||
|
||||
'~' > ; # delete stray tildes between letters
|
||||
[:Katakana:] { '' } [:Latin:] > ; # delete stray quotes between letters
|
||||
[[:Nonspacing Mark:]-[\u3099-\u309C]] > ; # delete any non-spacing marks that we didn't use
|
||||
[\u02BE[:Nonspacing Mark:]-[\u3099-\u309C]] > ; # delete any non-spacing marks that we didn't use
|
||||
|
||||
:: NFC (NFKD) ; # use NFKD to get the halfwidth katakana characters
|
||||
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/WriteCharts.java,v $
|
||||
* $Date: 2001/11/13 02:50:11 $
|
||||
* $Revision: 1.6 $
|
||||
* $Date: 2001/11/13 20:03:51 $
|
||||
* $Revision: 1.7 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -21,9 +21,11 @@ import java.io.*;
|
||||
|
||||
public class WriteCharts {
|
||||
public static void main(String[] args) throws IOException {
|
||||
if (false) testSet();
|
||||
if (false) {
|
||||
printSet("[[\u0000-\u007E \u30A1-\u30FC \uFF61-\uFF9F\u3001\u3002][:Katakana:][:Mark:]]");
|
||||
}
|
||||
String testSet = "";
|
||||
if (args.length == 0) args = all;
|
||||
if (args.length == 0) args = getAllScripts();
|
||||
for (int i = 0; i < args.length; ++i) {
|
||||
// Enumeration enum = Transliterator.getAvailableIDs();
|
||||
if (args[i].startsWith("[")) {
|
||||
@ -35,8 +37,9 @@ public class WriteCharts {
|
||||
}
|
||||
}
|
||||
|
||||
public static void testSet() {
|
||||
UnicodeSet s = new UnicodeSet("[[\u0000-\u007E \u30A1-\u30FC \uFF61-\uFF9F\u3001\u3002][:Katakana:][:Mark:]]");
|
||||
public static void printSet(String source) {
|
||||
UnicodeSet s = new UnicodeSet(source);
|
||||
System.out.println("Printout for '" + source + "'");
|
||||
int count = s.getRangeCount();
|
||||
for (int i = 0; i < count; ++i) {
|
||||
int start = s.getRangeStart(i);
|
||||
@ -45,14 +48,63 @@ public class WriteCharts {
|
||||
}
|
||||
}
|
||||
|
||||
static final String[] all = {
|
||||
"Cyrillic-Latin", "Greek-Latin",
|
||||
"el-Latin",
|
||||
"Devanagari-Tamil", "Devanagari-Latin",
|
||||
"Katakana-Latin", "Hiragana-Latin", "Hangul-Latin"
|
||||
public static String[] getAllScripts() {
|
||||
Set set = new TreeSet();
|
||||
int scripts[];
|
||||
Enumeration sources = Transliterator.getAvailableSources();
|
||||
while(sources.hasMoreElements()) {
|
||||
String source = (String) sources.nextElement();
|
||||
scripts = UScript.getCode(source);
|
||||
int sourceScript = scripts[0];
|
||||
if (sourceScript == UScript.INVALID_CODE) {
|
||||
System.out.println("[Skipping " + source + "]");
|
||||
continue;
|
||||
}
|
||||
System.out.println("Source: " + source + ";\tScripts: " + showScripts(scripts));
|
||||
Enumeration targets = Transliterator.getAvailableTargets(source);
|
||||
while(targets.hasMoreElements()) {
|
||||
String target = (String) targets.nextElement();
|
||||
scripts = UScript.getCode(target);
|
||||
int targetScript = scripts[0];
|
||||
if (targetScript == UScript.INVALID_CODE
|
||||
|| targetScript < sourceScript) {
|
||||
// skip doing both directions
|
||||
System.out.println("[Skipping '" + source + "-" + target + "']");
|
||||
continue;
|
||||
}
|
||||
System.out.println("\tTarget: " + target + ";\tScripts: " + showScripts(scripts));
|
||||
Enumeration variants = Transliterator.getAvailableVariants(source, target);
|
||||
while(variants.hasMoreElements()) {
|
||||
String variant = (String) variants.nextElement();
|
||||
String id = source + "-" + target;
|
||||
if (variant.length() != 0) {
|
||||
id += "/" + variant;
|
||||
if (true) {
|
||||
System.out.println("SKIPPING VARIANT, SINCE IT CURRENTLY BREAKS!\t" + id);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
System.out.println("\t\t\t\tAdding: '" + id + "'");
|
||||
set.add(id);
|
||||
}
|
||||
}
|
||||
}
|
||||
String[] results = new String[set.size()];
|
||||
set.toArray(results);
|
||||
return results;
|
||||
};
|
||||
|
||||
public static String showScripts(int[] scripts) {
|
||||
StringBuffer results = new StringBuffer();
|
||||
for (int i = 0; i < scripts.length; ++i) {
|
||||
if (i != 0) results.append(", ");
|
||||
results.append(UScript.getName(scripts[i]));
|
||||
}
|
||||
return results.toString();
|
||||
}
|
||||
|
||||
public static void print(String testSet, String rawId) throws IOException {
|
||||
System.out.println("Processing " + rawId);
|
||||
Transliterator t = Transliterator.getInstance(rawId);
|
||||
String id = t.getID();
|
||||
|
||||
@ -73,11 +125,15 @@ public class WriteCharts {
|
||||
return;
|
||||
} else {
|
||||
testSet = "[:" + source + ":]";
|
||||
if (source.equalsIgnoreCase("katakana")) {
|
||||
testSet = "[" + testSet + "\u30FC]";
|
||||
printSet(testSet);
|
||||
}
|
||||
}
|
||||
}
|
||||
UnicodeSet sourceSet = new UnicodeSet(testSet);
|
||||
|
||||
// check that the source is a script
|
||||
// check that the target is a script
|
||||
int[] scripts = UScript.getCode(target);
|
||||
if (scripts.length != 1) {
|
||||
target = "[:Latin:]";
|
||||
@ -88,7 +144,7 @@ public class WriteCharts {
|
||||
|
||||
Transliterator inverse = t.getInverse();
|
||||
|
||||
Transliterator hex = Transliterator.getInstance("Any-Hex");
|
||||
//Transliterator hex = Transliterator.getInstance("Any-Hex");
|
||||
|
||||
|
||||
// iterate through script
|
||||
@ -133,12 +189,41 @@ public class WriteCharts {
|
||||
|
||||
map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss, Normalizer.DECOMP_COMPAT, 0))
|
||||
+ "\u0000" + ss,
|
||||
"<tr><td>" + ss + "<br><tt>" + hex.transliterate(ss) + "</tt></td><td>"
|
||||
+ ts + "<br><tt>" + hex.transliterate(ts) + "</tt></td><td>"
|
||||
+ rt + "<br><tt>" + hex.transliterate(rt) + "</tt></td></tr>" );
|
||||
"<td class='s'>" + ss + "<br><tt>" + hex(ss)
|
||||
+ "</tt></td><td class='t'>" + ts + "<br><tt>" + hex(ts)
|
||||
+ "</tt></td><td class='r'>" + rt + "<br><tt>" + hex(rt) + "</tt></td>" );
|
||||
|
||||
// Check Duals
|
||||
/*
|
||||
int maxDual = 200;
|
||||
dual:
|
||||
for (int i2 = 0; i2 < count; ++i2) {
|
||||
int end2 = sourceSet.getRangeEnd(i2);
|
||||
for (int j2 = sourceSet.getRangeStart(i2); j2 <= end; ++j2) {
|
||||
String ss2 = UTF16.valueOf(j2);
|
||||
String ts2 = t.transliterate(ss2);
|
||||
String rt2 = inverse.transliterate(ts2);
|
||||
|
||||
String ss12 = ss + ss2;
|
||||
String ts12 = t.transliterate(ss + ss12);
|
||||
String rt12 = inverse.transliterate(ts12);
|
||||
if (ts12.equals(ts + ts2) && rt12.equals(rt + rt2)) continue;
|
||||
if (--maxDual < 0) break dual;
|
||||
|
||||
// transliteration of whole differs from that of parts
|
||||
group = 0x100;
|
||||
map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss12, Normalizer.DECOMP_COMPAT, 0))
|
||||
+ "\u0000" + ss12,
|
||||
"<td class='s'>" + ss12 + "<br><tt>" + hex(ss12)
|
||||
+ "</tt></td><td class='t'>" + ts12 + "<br><tt>" + hex(ts12)
|
||||
+ "</tt></td><td class='r'>" + rt12 + "<br><tt>" + hex(rt12) + "</tt></td>" );
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
leftOverSet.remove(0x0100,0x02FF); // remove extended & IPA
|
||||
|
||||
count = leftOverSet.getRangeCount();
|
||||
@ -161,13 +246,14 @@ public class WriteCharts {
|
||||
}
|
||||
|
||||
map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ts, Normalizer.DECOMP_COMPAT, 0)) + ts,
|
||||
"<tr><td>-</td><td>" + ts + "<br><tt>" + hex.transliterate(ts) + "</tt></td><td>"
|
||||
+ rt + "<br><tt>" + hex.transliterate(rt) + "</tt></td></tr>");
|
||||
"<td class='s'>-</td><td class='t'>" + ts + "<br><tt>" + hex(ts)
|
||||
+ "</tt></td><td class='r'>"
|
||||
+ rt + "<br><tt>" + hex(rt) + "</tt></td>");
|
||||
}
|
||||
}
|
||||
|
||||
// make file name and open
|
||||
File f = new File("chart_" + id.replace('/', '_') + ".html");
|
||||
File f = new File("transliteration/chart_" + id.replace('/', '_') + ".html");
|
||||
String filename = f.getCanonicalFile().toString();
|
||||
PrintWriter out = new PrintWriter(
|
||||
new OutputStreamWriter(
|
||||
@ -183,35 +269,64 @@ public class WriteCharts {
|
||||
out.println("<link rel='stylesheet' href='http://www.unicode.org/charts/uca/charts.css' type='text/css'>");
|
||||
|
||||
out.println("<BODY>");
|
||||
String tableHeader = "<p><table border='1'><tr><th>Source</th><th>Target</th><th>Return</th></tr>";
|
||||
out.println("<h1>Transliteration Samples for '" + Transliterator.getDisplayName(id) + "'</h1>");
|
||||
out.println("<p>This file illustrates the transliterations of " + Transliterator.getDisplayName(id) + ".");
|
||||
out.println("The samples are mechanically generated, and only include single characters");
|
||||
out.println("from the source set. Thus it will <i>not</i> contain examples where the transliteration");
|
||||
out.println("depends on the context around the character. For a more detailed -- and interactive -- example, see the");
|
||||
out.println("<a href='http://oss.software.ibm.com/cgi-bin/icu/tr'>Transliteration Demo</a></p><hr>");
|
||||
|
||||
// set up the headers
|
||||
int columnCount = 3;
|
||||
String headerBase = "<th>Source</th><th>Target</th><th>Return</th>";
|
||||
String headers = headerBase;
|
||||
for (int i = columnCount - 1; i > 0; --i) headers += headerBase;
|
||||
|
||||
String tableHeader = "<p><table border='1'><tr>" + headers + "</tr>";
|
||||
String tableFooter = "</table></p>";
|
||||
out.println("<h1>Round Trip</h1>");
|
||||
out.println("<h2>Round Trip</h2>");
|
||||
out.println(tableHeader);
|
||||
|
||||
Iterator it = map.keySet().iterator();
|
||||
char lastGroup = 0;
|
||||
count = 0;
|
||||
int column = 0;
|
||||
while (it.hasNext()) {
|
||||
String key = (String) it.next();
|
||||
char group = key.charAt(0);
|
||||
if (group != lastGroup || count++ > 50) {
|
||||
lastGroup = group;
|
||||
count = 0;
|
||||
if (column != 0) {
|
||||
out.println("</tr>");
|
||||
column = 0;
|
||||
}
|
||||
out.println(tableFooter);
|
||||
|
||||
String title = "";
|
||||
if ((group & 0x80) != 0) out.println("<hr><h1>Completeness</h1>");
|
||||
else out.println("<hr><h1>Round Trip</h1>");
|
||||
if ((group & 16) != 0) out.println("<h2>Errors: Contains Private Use Characters</h2>");
|
||||
if ((group & 8) != 0) out.println("<h2>Possible Errors: Return not in Source Set</h2>");
|
||||
if ((group & 4) != 0) out.println("<h2>Errors: Return not equal to Source</h2>");
|
||||
if ((group & 2) != 0) out.println("<h2>Errors: Return not in Source Set</h2>");
|
||||
if ((group & 1) != 0) out.println("<h2>Errors: Target not in Target Set</h2>");
|
||||
if ((group & 0x100) != 0) out.println("<hr><h2>Duals</h2>");
|
||||
else if ((group & 0x80) != 0) out.println("<hr><h2>Completeness</h2>");
|
||||
else out.println("<hr><h2>Round Trip</h2>");
|
||||
if ((group & 16) != 0) out.println("<h3>Errors: Contains Private Use Characters</h3>");
|
||||
if ((group & 8) != 0) out.println("<h3>Possible Errors: Return not in Source Set</h3>");
|
||||
if ((group & 4) != 0) out.println("<h3>One-Way Mapping: Return not equal to Source</h3>");
|
||||
if ((group & 2) != 0) out.println("<h3>Errors: Return not in Source Set</h3>");
|
||||
if ((group & 1) != 0) out.println("<h3>Errors: Target not in Target Set</h3>");
|
||||
|
||||
out.println(tableHeader);
|
||||
column = 0;
|
||||
}
|
||||
String value = (String) map.get(key);
|
||||
if (column++ == 0) out.print("<tr>");
|
||||
out.println(value);
|
||||
if (column == 3) {
|
||||
out.println("</tr>");
|
||||
column = 0;
|
||||
}
|
||||
}
|
||||
if (column != 0) {
|
||||
out.println("</tr>");
|
||||
column = 0;
|
||||
}
|
||||
out.println(tableFooter + "</BODY></HTML>");
|
||||
|
||||
@ -220,6 +335,17 @@ public class WriteCharts {
|
||||
}
|
||||
}
|
||||
|
||||
public static String hex(String s) {
|
||||
int cp;
|
||||
StringBuffer results = new StringBuffer();
|
||||
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(s, i);
|
||||
if (i != 0) results.append(' ');
|
||||
results.append(Integer.toHexString(cp));
|
||||
}
|
||||
return results.toString().toUpperCase();
|
||||
}
|
||||
|
||||
static final UnicodeSet okAnyway = new UnicodeSet("[^[:Letter:]]");
|
||||
|
||||
/*
|
||||
|
@ -3,8 +3,8 @@
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#--------------------------------------------------------------------
|
||||
# $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/resources/Attic/Transliterator_Latin_Katakana.txt,v $
|
||||
# $Date: 2001/11/13 02:57:28 $
|
||||
# $Revision: 1.10 $
|
||||
# $Date: 2001/11/13 20:03:22 $
|
||||
# $Revision: 1.11 $
|
||||
#--------------------------------------------------------------------
|
||||
|
||||
#:: [\u0020-\u00FF [:Latin:][:Mark:]] ; # note: a global filter is more efficient, but MUST include all source chars
|
||||
@ -62,6 +62,7 @@
|
||||
# Variables
|
||||
|
||||
$vowel = [aeiou] ;
|
||||
$consonant = [bcdfghjklmnpqrstvwxyz] ;
|
||||
$macron = \u0304 ;
|
||||
|
||||
# Variables used for doubled-consonants with tsu
|
||||
@ -387,6 +388,30 @@ $macron <> ー ;
|
||||
'~ye' > ェ ;
|
||||
'~yo' <> ョ ;
|
||||
|
||||
# iteration marks
|
||||
# TODO: make more accurate
|
||||
|
||||
j $1 < sh (y* $vowel) {ヽ$voice ;
|
||||
dj $1 < ch (y* $vowel) {ヽ$voice ;
|
||||
dz $1 < ts (y* $vowel) {ヽ$voice ;
|
||||
|
||||
g $1 < k (y* $vowel) {ヽ$voice ;
|
||||
z $1 < s (y* $vowel) {ヽ$voice ;
|
||||
d $1 < t (y* $vowel) {ヽ$voice ;
|
||||
h $1 < b (y* $vowel) {ヽ$voice ;
|
||||
v $1 < w (y* $vowel) {ヽ$voice ;
|
||||
|
||||
sh $1 < sh (y* $vowel) {ヽ$voice ;
|
||||
j $1 < j (y* $vowel) {ヽ$voice ;
|
||||
ch $1 < ch (y* $vowel) {ヽ$voice ;
|
||||
dj $1 < dj(y* $vowel) {ヽ$voice ;
|
||||
ts $1 < ts (y* $vowel) {ヽ$voice ;
|
||||
dz $1 < dz (y* $vowel) {ヽ$voice ;
|
||||
|
||||
$1 < ($consonant y* $vowel) {ヽ$voice? ;
|
||||
$1 < (.) {ヽ $voice? ; # otherwise repeat last character
|
||||
< ヽ $voice? ; # delete if no characters found
|
||||
|
||||
# h- rule: lengthens vowel if not followed by a vowel
|
||||
|
||||
[aeiou] } h > ー ;
|
||||
@ -435,6 +460,12 @@ f > フ;
|
||||
j > ジ;
|
||||
w > ウ;
|
||||
|
||||
ß > | ss ;
|
||||
æ > | e ;
|
||||
ð > | d ;
|
||||
ø > | u ;
|
||||
þ > | th ;
|
||||
|
||||
# simple substitutions using backup
|
||||
|
||||
c > | k ;
|
||||
@ -449,7 +480,7 @@ x > | ks ;
|
||||
|
||||
'~' > ; # delete stray tildes between letters
|
||||
[:Katakana:] { '' } [:Latin:] > ; # delete stray quotes between letters
|
||||
[[:Nonspacing Mark:]-[\u3099-\u309C]] > ; # delete any non-spacing marks that we didn't use
|
||||
[\u02BE[:Nonspacing Mark:]-[\u3099-\u309C]] > ; # delete any non-spacing marks that we didn't use
|
||||
|
||||
:: NFC (NFKD) ; # use NFKD to get the halfwidth katakana characters
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user