ICU-0 U4.1
X-SVN-Rev: 17421
This commit is contained in:
parent
98a1c52e09
commit
31eafca234
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $
|
||||
* $Date: 2004/04/17 18:21:39 $
|
||||
* $Revision: 1.12 $
|
||||
* $Date: 2005/03/30 17:19:32 $
|
||||
* $Revision: 1.13 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -17,6 +17,7 @@ import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.dev.test.util.UnicodeProperty;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
@ -30,6 +31,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
|
||||
OldUnicodeMap sampleMap = null;
|
||||
OldUnicodeMap map = new OldUnicodeMap();
|
||||
UnicodeProperty prop;
|
||||
|
||||
// ====================== Main ===========================
|
||||
|
||||
@ -46,6 +48,34 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
this.ucd = ucd;
|
||||
nfd = new Normalizer(Normalizer.NFD, ucd.getVersion());
|
||||
nfkd = new Normalizer(Normalizer.NFKD, ucd.getVersion());
|
||||
/*
|
||||
public void fillMap(String propName) {
|
||||
List list = y.getAvailableValues();
|
||||
for (Iterator it = list.iterator(); it.hasNext();) {
|
||||
String label = (String) it.next();
|
||||
map.add(label, y.getSet(label));
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
ToolUnicodePropertySource unicodePropertySource = ToolUnicodePropertySource.make("");
|
||||
|
||||
Set labels = new HashSet();
|
||||
|
||||
int addToMap(String label) {
|
||||
labels.add(label);
|
||||
UnicodeSet s = prop.getSet(label);
|
||||
if (s == null || s.size() == 0) throw new IllegalArgumentException("Bad value: " + prop.getName() + ", " + label);
|
||||
return map.add(label, s);
|
||||
}
|
||||
|
||||
int addToMapLast(String label) {
|
||||
int result = addToMap(label);
|
||||
Set values = new HashSet(prop.getAvailableValues());
|
||||
if (!values.equals(labels)) throw new IllegalArgumentException("Missing Property Values: " + prop.getName()
|
||||
+ ": " + values.removeAll(labels));
|
||||
return result;
|
||||
}
|
||||
|
||||
// COMMON STUFF for Hangul
|
||||
@ -280,24 +310,30 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
//printLine(out, samples[LB_ZW], "", samples[LB_CL]);
|
||||
//printLine(out, samples[LB_ZW], " ", samples[LB_CL]);
|
||||
|
||||
PrintWriter out = Utility.openPrintWriter("TR29\\"
|
||||
UnicodeDataFile fc = UnicodeDataFile.openHTMLAndWriteHeader("auxiliary\\", fileName + "BreakTest");
|
||||
PrintWriter out = fc.out;
|
||||
|
||||
/* PrintWriter out = Utility.openPrintWriter("auxiliary\\"
|
||||
+ fileName + "BreakTest-"
|
||||
+ ucd.getVersion()
|
||||
+ ".html", Utility.UTF8_WINDOWS);
|
||||
*/
|
||||
out.println("<!doctype HTML PUBLIC '-//W3C//DTD HTML 4.0 Transitional//EN' 'http://www.w3.org/TR/REC-html40/loose.dtd'>");
|
||||
out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
|
||||
out.println("<title>" + fileName + " Break Chart</title>");
|
||||
out.println("<style>");
|
||||
out.println("<style type='text/css'>");
|
||||
out.println("td, th { vertical-align: top }");
|
||||
out.println("</style></head>");
|
||||
|
||||
|
||||
out.println("<body bgcolor='#FFFFFF'>");
|
||||
out.println("<h2>" + fileName + " Break Chart</h2>");
|
||||
out.println("<p><b>Unicode Version:</b> " + ucd.getVersion() + "; <b>Date:</b> " + ucd.getDate() + "</p>");
|
||||
out.println("<p><b>Unicode Version:</b> " + ucd.getVersion() + "</p>");
|
||||
out.println("<p><b>Date:</b> " + Default.getDate() + "</p>");
|
||||
generateTable(out);
|
||||
|
||||
|
||||
if (sampleMap != null) {
|
||||
if (false) {
|
||||
out.println("<h3>Character Type Breakdown</h3>");
|
||||
out.println("<table border='1' cellspacing='0' width='100%'>");
|
||||
for (int i = 0; i < sampleMap.size(); ++i) {
|
||||
@ -308,7 +344,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
out.println("</table>");
|
||||
}
|
||||
|
||||
out.close();
|
||||
fc.close();
|
||||
|
||||
generateTest(false);
|
||||
|
||||
@ -318,14 +354,18 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
String[] testCase = new String[50];
|
||||
// do main test
|
||||
|
||||
PrintWriter out = Utility.openPrintWriter("TR29\\" + fileName + "BreakTest"
|
||||
UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader("auxiliary\\", fileName + "BreakTest"
|
||||
+ (shortVersion ? "_SHORT" : ""));
|
||||
PrintWriter out = fc.out;
|
||||
/* PrintWriter out = Utility.openPrintWriter("TR29\\" + fileName + "BreakTest"
|
||||
+ (shortVersion ? "_SHORT" : "")
|
||||
+ "-" + ucd.getVersion()
|
||||
+ ".txt", Utility.UTF8_WINDOWS);
|
||||
*/
|
||||
int counter = 0;
|
||||
|
||||
out.println("#");
|
||||
out.println("# Default " + fileName + " Break Test");
|
||||
out.println("# Generated: " + ucd.getDate() + ", MED");
|
||||
out.println("#");
|
||||
out.println("# Format:");
|
||||
out.println("# <string> (# <comment>)? ");
|
||||
@ -361,7 +401,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
printLine(out, extraSingleSamples[ii], true, false);
|
||||
}
|
||||
out.println("# Lines: " + counter);
|
||||
out.close();
|
||||
fc.close();
|
||||
}
|
||||
|
||||
public void sampleDescription(PrintWriter out) {}
|
||||
@ -461,7 +501,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
if (after == null) continue;
|
||||
|
||||
String h = getTypeID(after);
|
||||
types += "<th " + width + " title='" + getInfo(after) + "'><a class='lbclass' href='#" + h + "'>" + h + "</th>";
|
||||
types += "<th " + width + " class='lbclass' title='" + getInfo(after) + "'>" + h + "</th>";
|
||||
|
||||
|
||||
//codes += "<th " + width + " title='" + getInfo(after) + "'>" + Utility.hex(after) + "</th>";
|
||||
@ -480,8 +520,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
if (before == null) continue;
|
||||
|
||||
String h = getTypeID(before);
|
||||
String line = "<tr><th title='" + ucd.getCodeAndName(before) + "'><a class='lbclass' href='#" + h + "'>"
|
||||
+ h + "</th>";
|
||||
String line = "<tr><th class='lbclass' title='" + ucd.getCodeAndName(before) + "'>" + h + "</th>";
|
||||
|
||||
for (int type2 = 0; type2 < tableLimit; ++type2) {
|
||||
|
||||
@ -555,7 +594,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
String status;
|
||||
if (html) {
|
||||
status = hasBreak ? " style='border-right: 1px solid blue'" : "";
|
||||
string.append("<span title='" + getRule() + "'><span" + status + "> </span> <span>");
|
||||
string.append("<span title='" + getRule() + "'><span" + status + "> </span> </span>");
|
||||
} else {
|
||||
status = hasBreak ? BREAK : NOBREAK;
|
||||
string.append(status);
|
||||
@ -574,7 +613,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
+ "'>"
|
||||
+ Utility.quoteXML(Utility.getDisplay(cp), true)
|
||||
+ "</span>");
|
||||
string.append("<span title='" + getRule() + "'><span" + status + "> </span> <span>");
|
||||
string.append("<span title='" + getRule() + "'><span" + status + "> </span> </span>");
|
||||
} else {
|
||||
if (string.length() > 0) {
|
||||
string.append(' ');
|
||||
@ -743,28 +782,23 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
|
||||
GenerateGraphemeBreakTest(UCD ucd) {
|
||||
super(ucd);
|
||||
fileName = "GraphemeCluster";
|
||||
fileName = "Grapheme";
|
||||
sampleMap = map;
|
||||
}
|
||||
|
||||
|
||||
Object foo = prop = unicodePropertySource.getProperty("Grapheme_Cluster_Break");
|
||||
|
||||
final int
|
||||
CR = map.add("CR", new UnicodeSet(0xD, 0xD)),
|
||||
LF = map.add("LF", new UnicodeSet(0xA, 0xA)),
|
||||
Control = map.add("Control",
|
||||
getSet(ucd, CATEGORY, Cc)
|
||||
.addAll(getSet(ucd, CATEGORY, Cf))
|
||||
.addAll(getSet(ucd, CATEGORY, Zp))
|
||||
.addAll(getSet(ucd, CATEGORY, Zl))
|
||||
.removeAll(map.getSetFromIndex(CR))
|
||||
.removeAll(map.getSetFromIndex(LF))),
|
||||
Extend = map.add("Extend", getSet(ucd, DERIVED, GraphemeExtend)),
|
||||
L = map.add("L", getSet(ucd, HANGUL_SYLLABLE_TYPE, UCD_Types.L)),
|
||||
V = map.add("V", getSet(ucd, HANGUL_SYLLABLE_TYPE, UCD_Types.V)),
|
||||
T = map.add("T", getSet(ucd, HANGUL_SYLLABLE_TYPE, UCD_Types.T)),
|
||||
LV = map.add("LV", getSet(ucd, HANGUL_SYLLABLE_TYPE, UCD_Types.LV)),
|
||||
LVT = map.add("LVT", getSet(ucd, HANGUL_SYLLABLE_TYPE, UCD_Types.LVT)),
|
||||
Other = map.add("Other", new UnicodeSet(0,0x10FFFF), false, false);
|
||||
CR = addToMap("CR"),
|
||||
LF = addToMap("LF"),
|
||||
Control = addToMap("Control"),
|
||||
Extend = addToMap("Extend"),
|
||||
L = addToMap("L"),
|
||||
V = addToMap("V"),
|
||||
T = addToMap("T"),
|
||||
LV = addToMap("LV"),
|
||||
LVT = addToMap("LVT"),
|
||||
Other = addToMapLast("Other");
|
||||
|
||||
// stuff that subclasses need to override
|
||||
public String getTypeID(int cp) {
|
||||
@ -860,35 +894,23 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
|
||||
}
|
||||
|
||||
Object foo = prop = unicodePropertySource.getProperty("Word_Break");
|
||||
|
||||
//static String LENGTH = "[\u30FC\uFF70]";
|
||||
//static String HALFWIDTH_KATAKANA = "[\uFF66-\uFF9F]";
|
||||
//static String KATAKANA_ITERATION = "[\u30FD\u30FE]";
|
||||
//static String HIRAGANA_ITERATION = "[\u309D\u309E]";
|
||||
|
||||
final int
|
||||
Format = map.add("Format", getSet(ucd, CATEGORY, Cf).remove(0x00AD)),
|
||||
Katakana = map.add("Katakana", getSet(ucd, SCRIPT, KATAKANA_SCRIPT)
|
||||
.addAll(new UnicodeSet("[\u30FC\uFF70\uFF9E\uFF9F]"))
|
||||
//.addAll(new UnicodeSet(HALFWIDTH_KATAKANA))
|
||||
//.addAll(new UnicodeSet(KATAKANA_ITERATION))
|
||||
),
|
||||
ALetter = map.add("ALetter",
|
||||
getSet(ucd, DERIVED, PropAlphabetic)
|
||||
.add(0x05F3, 0x05F3)
|
||||
.removeAll(map.getSetFromIndex(Katakana))
|
||||
.removeAll(getSet(ucd, BINARY_PROPERTIES, Ideographic))
|
||||
.removeAll(getSet(ucd, SCRIPT, THAI_SCRIPT))
|
||||
.removeAll(getSet(ucd, SCRIPT, LAO_SCRIPT))
|
||||
.removeAll(getSet(ucd, SCRIPT, HIRAGANA_SCRIPT))
|
||||
),
|
||||
MidLetter = map.add("MidLetter",
|
||||
new UnicodeSet("[\\u0027\\u00AD\\u00B7\\u05f4\\u05F4\\u2019\\u2027]")),
|
||||
MidNumLet = map.add("MidNumLet",
|
||||
new UnicodeSet("[\\u002E\\u003A]")),
|
||||
MidNum = map.add("MidNum", getSet(ucd, LINE_BREAK, LB_IN)
|
||||
.removeAll(map.getSetFromIndex(MidNumLet))),
|
||||
Numeric = map.add("Numeric", getSet(ucd, LINE_BREAK, LB_NU)),
|
||||
Other = map.add("Other", new UnicodeSet(0,0x10FFFF), false, false);
|
||||
Format = addToMap("Format"),
|
||||
Katakana = addToMap("Katakana"),
|
||||
ALetter = addToMap("ALetter"),
|
||||
MidLetter = addToMap("MidLetter"),
|
||||
//MidNumLet = addToMap("MidNumLet"),
|
||||
MidNum = addToMap("MidNum"),
|
||||
Numeric = addToMap("Numeric"),
|
||||
ExtendNumLet = addToMap("ExtendNumLet"),
|
||||
Other = addToMapLast("Other");
|
||||
|
||||
// stuff that subclasses need to override
|
||||
public String getTypeID(int cp) {
|
||||
@ -948,11 +970,11 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
|
||||
// Don’t break letters across certain punctuation
|
||||
|
||||
setRule("6: ALetter × (MidLetter | MidNumLet) ALetter");
|
||||
if (before == ALetter && (after == MidLetter || after == MidNumLet) && after2 == ALetter) return false;
|
||||
setRule("6: ALetter × MidLetter ALetter");
|
||||
if (before == ALetter && after == MidLetter && after2 == ALetter) return false;
|
||||
|
||||
setRule("7: ALetter (MidLetter | MidNumLet) × ALetter");
|
||||
if (before2 == ALetter && (before == MidLetter || before == MidNumLet) && after == ALetter) return false;
|
||||
if (before2 == ALetter && before == MidLetter && after == ALetter) return false;
|
||||
|
||||
// Don’t break within sequences of digits, or digits adjacent to letters.
|
||||
|
||||
@ -968,15 +990,22 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
|
||||
// Don’t break within sequences like: '-3.2'
|
||||
setRule("11: Numeric (MidNum | MidNumLet) × Numeric");
|
||||
if (before2 == Numeric && (before == MidNum || before == MidNumLet) && after == Numeric) return false;
|
||||
if (before2 == Numeric && before == MidNum && after == Numeric) return false;
|
||||
|
||||
setRule("12: Numeric × (MidNum | MidNumLet) Numeric");
|
||||
if (before == Numeric && (after == MidNum || after == MidNumLet) && after2 == Numeric) return false;
|
||||
if (before == Numeric && after == MidNum && after2 == Numeric) return false;
|
||||
|
||||
// Don't break between Katakana
|
||||
|
||||
setRule("13: Katakana × Katakana");
|
||||
if (before == Katakana && after == Katakana) return false;
|
||||
|
||||
// Do not break from extenders
|
||||
setRule("13a: (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet");
|
||||
if ((before == ALetter || before == Numeric || before == Katakana || before == ExtendNumLet) && after == ExtendNumLet) return false;
|
||||
|
||||
setRule("13b: ExtendNumLet × (ALetter | Numeric | Katakana)");
|
||||
if (before == ExtendNumLet && (after == ALetter || after == Numeric || after == Katakana)) return false;
|
||||
|
||||
// Otherwise break always.
|
||||
setRule("14: Any ÷ Any");
|
||||
@ -1344,7 +1373,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
setRule("9: OP SP* ×");
|
||||
if (lastNonSpace == LB_OP) return false;
|
||||
|
||||
// LB 10 Don’t break within ‘”[’, , even with intervening spaces.
|
||||
// LB 10 Don’t break within ‘”[’, even with intervening spaces.
|
||||
// QU SP* × OP
|
||||
setRule("10: QU SP* × OP");
|
||||
if (lastNonSpace == LB_QU && after == LB_OP) return false;
|
||||
@ -1377,7 +1406,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
setRule("13: × GL ; GL ×");
|
||||
if (after == LB_GL || before == LB_GL) return false;
|
||||
|
||||
// LB 14 Don’t break before or after ‘”’
|
||||
// LB 14 Don’t break before or after ‘”’
|
||||
setRule("14: × QU ; QU ×");
|
||||
if (before == LB_QU || after == LB_QU) return false;
|
||||
|
||||
@ -1450,7 +1479,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
if (before == LB_HY) return true;
|
||||
if (after == LB_BB) return true;
|
||||
|
||||
// LB 19 Don’t break between alphabetics (“at”)
|
||||
// LB 19 Don’t break between alphabetics (“at”)
|
||||
// AL × AL
|
||||
|
||||
setRule("19: AL × AL");
|
||||
@ -1515,36 +1544,20 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
|
||||
}
|
||||
|
||||
Object foo = prop = unicodePropertySource.getProperty("Sentence_Break");
|
||||
|
||||
final int
|
||||
Sep = map.add("Sep", new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]")),
|
||||
Format = map.add("Format", getSet(ucd, CATEGORY, Cf)),
|
||||
Sp = map.add("Sp", getSet(ucd, BINARY_PROPERTIES, White_space)
|
||||
.removeAll(map.getSetFromIndex(Sep))),
|
||||
Lower = map.add("Lower", getSet(ucd, DERIVED, PropLowercase)),
|
||||
Upper = map.add("Upper", getSet(ucd, CATEGORY, Lt)
|
||||
.addAll(getSet(ucd, DERIVED, PropUppercase))),
|
||||
OLetter = map.add("OLetter",
|
||||
getSet(ucd, DERIVED, PropAlphabetic)
|
||||
.add(0x05F3, 0x05F3)
|
||||
.removeAll(map.getSetFromIndex(Lower))
|
||||
.removeAll(map.getSetFromIndex(Upper))
|
||||
),
|
||||
Numeric = map.add("Numeric", getSet(ucd, LINE_BREAK, LB_NU)),
|
||||
ATerm = map.add("ATerm", new UnicodeSet(0x002E,0x002E)),
|
||||
Term = map.add("Term", new UnicodeSet(
|
||||
"[\\u0021\\u003F\\u0589\\u061F\\u06D4\\u0700\\u0701\\u0702\\u0964\\u1362\\u1367"
|
||||
+ "\\u1368\\u104A\\u104B\\u166E\\u1803\\u1809\\u203C\\u203D\\u2047\\u2048\\u2049"
|
||||
+ "\\u3002\\uFE52\\uFE57\\uFF01\\uFF0E\\uFF1F\\uFF61]")),
|
||||
Close = map.add("Close",
|
||||
getSet(ucd, CATEGORY, Po)
|
||||
.addAll(getSet(ucd, CATEGORY, Pe))
|
||||
.addAll(getSet(ucd, LINE_BREAK, LB_QU))
|
||||
.removeAll(map.getSetFromIndex(ATerm))
|
||||
.removeAll(map.getSetFromIndex(Term))
|
||||
.remove(0x05F3)
|
||||
),
|
||||
Other = map.add("Other", new UnicodeSet(0,0x10FFFF), false, false);
|
||||
Sep = addToMap("Sep"),
|
||||
Format = addToMap("Format"),
|
||||
Sp = addToMap("Sp"),
|
||||
Lower = addToMap("Lower"),
|
||||
Upper = addToMap("Upper"),
|
||||
OLetter = addToMap("OLetter"),
|
||||
Numeric = addToMap("Numeric"),
|
||||
ATerm = addToMap("ATerm"),
|
||||
STerm = addToMap("STerm"),
|
||||
Close = addToMap("Close"),
|
||||
Other = addToMapLast("Other");
|
||||
|
||||
// stuff that subclasses need to override
|
||||
public String getTypeID(int cp) {
|
||||
@ -1726,8 +1739,8 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
}
|
||||
if (t == ATerm) {
|
||||
lookAfter = ATerm;
|
||||
} else if (t == Term) {
|
||||
lookAfter = Term;
|
||||
} else if (t == STerm) {
|
||||
lookAfter = STerm;
|
||||
}
|
||||
break;
|
||||
}
|
||||
@ -1776,7 +1789,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
setRule("9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )");
|
||||
return false;
|
||||
}
|
||||
if (lookAfter == Term) break;
|
||||
if (lookAfter == STerm) break;
|
||||
}
|
||||
|
||||
// at this point, we have an ATerm. All other conditions are ok, but we need to verify 6
|
||||
|
@ -34,6 +34,10 @@ import com.ibm.icu.text.SymbolTable;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeMatcher;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.text.UCD.GenerateBreakTest.GenerateGraphemeBreakTest;
|
||||
import com.ibm.text.UCD.GenerateBreakTest.GenerateLineBreakTest;
|
||||
import com.ibm.text.UCD.GenerateBreakTest.GenerateSentenceBreakTest;
|
||||
import com.ibm.text.UCD.GenerateBreakTest.GenerateWordBreakTest;
|
||||
import com.ibm.text.UCD.MakeUnicodeFiles.Format.PrintStyle;
|
||||
import com.ibm.text.utility.UnicodeDataFile;
|
||||
import com.ibm.text.utility.Utility;
|
||||
@ -511,6 +515,14 @@ public class MakeUnicodeFiles {
|
||||
GenerateCaseFolding.generateSpecialCasing(false);
|
||||
} else if (filename.equals("StandardizedVariants")) {
|
||||
GenerateStandardizedVariants.generate();
|
||||
} else if (filename.equals("GraphemeBreakTest")) {
|
||||
new GenerateGraphemeBreakTest(Default.ucd()).run();
|
||||
} else if (filename.equals("WordBreakTest")) {
|
||||
new GenerateWordBreakTest(Default.ucd()).run();
|
||||
} else if (filename.equals("LineBreakTest")) {
|
||||
new GenerateLineBreakTest(Default.ucd()).run();
|
||||
} else if (filename.equals("SentenceBreakTest")) {
|
||||
new GenerateSentenceBreakTest(Default.ucd()).run();
|
||||
} else {
|
||||
generatePropertyFile(filename);
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
Generate:
|
||||
DeltaVersion: 13
|
||||
Generate: DerivedBidiClass
|
||||
DeltaVersion: 14
|
||||
CopyrightYear: 2005
|
||||
|
||||
File: auxiliary/GraphemeBreakProperty
|
||||
@ -14,6 +14,18 @@ File: auxiliary/SentenceBreakProperty
|
||||
Property: Sentence_Break
|
||||
Format: skipValue=Other
|
||||
|
||||
File: auxiliary/GraphemeBreakTest
|
||||
Property: SPECIAL
|
||||
|
||||
File: auxiliary/WordBreakTest
|
||||
Property: SPECIAL
|
||||
|
||||
File: auxiliary/LineBreakTest
|
||||
Property: SPECIAL
|
||||
|
||||
File: auxiliary/SentenceBreakTest
|
||||
Property: SPECIAL
|
||||
|
||||
File: Blocks
|
||||
Property: Block
|
||||
# Note: When comparing block names, casing, whitespace, hyphens,
|
||||
@ -58,12 +70,14 @@ Value: 4.1
|
||||
File: extracted/DerivedBidiClass
|
||||
Property: Bidi_Class
|
||||
# Bidi Class (listing UnicodeData.txt, field 4: see UCD.html)
|
||||
# Unlike other properties, unassigned code points in blocks reserved for right-to-left scripts are given either types R or AL.
|
||||
# Unlike other properties, unassigned code points in blocks
|
||||
# reserved for right-to-left scripts are given either types R or AL.
|
||||
# The unassigned characters that default to R are:
|
||||
# Hebrew, Cypriot_Syllabary, Kharoshthi, and the ranges \u07C0-\u08FF \uFB1D-\uFB4F \U00010840-\U00010FFF
|
||||
# Hebrew, Cypriot_Syllabary, Kharoshthi, and the ranges \u07C0-\u08FF
|
||||
# \uFB1D-\uFB4F \U00010840-\U000109FF \U00010A60-\U00010FFF
|
||||
# The unassigned characters that default to AL are:
|
||||
# Arabic, Syriac, Thaana, Arabic_Presentation_Forms_A, Arabic_Presentation_Forms_B, Arabic_Supplement,
|
||||
# and the range \u0750-\u077F, minus the Noncharacter_Code_Points
|
||||
# Arabic, Syriac, Arabic_Supplement, Thaana, Arabic_Presentation_Forms_A,
|
||||
# Arabic_Presentation_Forms_B, minus the Noncharacter_Code_Points
|
||||
# For all other cases:
|
||||
Format: valueStyle=short skipUnassigned=Left_To_Right
|
||||
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
|
||||
* $Date: 2005/03/26 05:40:05 $
|
||||
* $Revision: 1.19 $
|
||||
* $Date: 2005/03/30 17:19:32 $
|
||||
* $Revision: 1.20 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -148,54 +148,58 @@ public class TestData implements UCD_Types {
|
||||
log.close();
|
||||
}
|
||||
}
|
||||
Matcher m;
|
||||
|
||||
static class GenStringPrep {
|
||||
UnicodeSet[] coreChars = new UnicodeSet[100];
|
||||
UnicodeSet decomposable = new UnicodeSet();
|
||||
UnicodeSet pattern = new UnicodeSet();
|
||||
|
||||
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
|
||||
//UnicodeSet id_continue = ups.getSet("ID_Continue=true");
|
||||
UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
|
||||
UnicodeSet wordChars = ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher())
|
||||
.retainAll(ups.getSet("gc=Sk"))
|
||||
.addAll(new UnicodeSet("[\u0027 \u002D \u002E \u003A \u00B7 \u058A \u05F3" +
|
||||
" \u05F4 \u200C \u200D \u2010 \u2019 \u2027 \u30A0]"));
|
||||
|
||||
UnicodeSet patternProp = ups.getSet("Pattern_Syntax=true").removeAll(wordChars);
|
||||
|
||||
UnicodeSet not_xid_continue = ups.getSet("XID_Continue=true").complement().removeAll(wordChars);
|
||||
|
||||
//UnicodeSet[] decompChars = new UnicodeSet[100];
|
||||
UCD ucd = Default.ucd();
|
||||
|
||||
Collator uca = Collator.getInstance(ULocale.ENGLISH);
|
||||
Collator uca0 = Collator.getInstance(ULocale.ENGLISH);
|
||||
{
|
||||
uca.setStrength(Collator.IDENTICAL);
|
||||
uca0.setStrength(Collator.IDENTICAL);
|
||||
}
|
||||
GenerateHanTransliterator.MultiComparator uca
|
||||
= new GenerateHanTransliterator.MultiComparator(new Comparator[] {
|
||||
uca0, new UTF16.StringComparator()});
|
||||
|
||||
UnicodeSet bidiR = new UnicodeSet(
|
||||
"[[:Bidi_Class=AL:][:Bidi_Class=R:]]");
|
||||
|
||||
UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]");
|
||||
UnicodeSet hasUpper = new UnicodeSet();
|
||||
|
||||
BagFormatter bf = new BagFormatter();
|
||||
UnicodeSet inIDN = new UnicodeSet();
|
||||
|
||||
void genStringPrep() throws IOException {
|
||||
//BagFormatter bf = new BagFormatter();
|
||||
//System.out.println(bf.showSetDifferences("ID_Continue", id_continue, "XID_Continue", xid_continue));
|
||||
StringBuffer inbuffer = new StringBuffer();
|
||||
StringBuffer intermediate, outbuffer;
|
||||
bf.setShowLiteral(BagFormatter.toHTMLControl);
|
||||
//bf.setValueSource(UnicodeLabel.NULL);
|
||||
if (false) {
|
||||
|
||||
System.out.println("word chars: " + bf.showSetNames(wordChars));
|
||||
System.out.println("pat: " + bf.showSetNames(patternProp));
|
||||
System.out.println("xid: " + bf.showSetNames(not_xid_continue));
|
||||
}
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
Utility.dot(cp);
|
||||
int cat = Default.ucd().getCategory(cp);
|
||||
if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
|
||||
if (!Default.nfd().isNormalized(cp)) decomposable.add(cp);
|
||||
inbuffer.setLength(0);
|
||||
UTF16.append(inbuffer, cp);
|
||||
try {
|
||||
intermediate = IDNA.convertToASCII(inbuffer,
|
||||
IDNA.USE_STD3_RULES);
|
||||
if (intermediate.length() == 0)
|
||||
continue;
|
||||
outbuffer = IDNA.convertToUnicode(intermediate,
|
||||
IDNA.USE_STD3_RULES);
|
||||
} catch (StringPrepParseException e) {
|
||||
continue;
|
||||
} catch (Exception e) {
|
||||
System.out.println("Failure at: " + Utility.hex(cp));
|
||||
continue;
|
||||
}
|
||||
if (!TestData.equals(inbuffer, outbuffer))
|
||||
continue;
|
||||
int idnaType = getIDNAType(cp);
|
||||
idnaTypeSet[idnaType].add(cp);
|
||||
int script = ucd.getScript(cp);
|
||||
if (coreChars[script] == null)
|
||||
coreChars[script] = new UnicodeSet();
|
||||
@ -208,8 +212,12 @@ public class TestData implements UCD_Types {
|
||||
}
|
||||
|
||||
Utility.fixDot();
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(GEN_DIR,
|
||||
"idn-chars.html");
|
||||
PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html");
|
||||
PrintWriter textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.txt");
|
||||
textOut.println('\uFEFF');
|
||||
textOut.println("For documentation, see idn-chars.html");
|
||||
Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut);
|
||||
/*
|
||||
out
|
||||
.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
|
||||
out.println("<title>IDN Characters</title><style>");
|
||||
@ -217,44 +225,87 @@ public class TestData implements UCD_Types {
|
||||
out.println(".script { font-size: 150%; background-color: #CCCCCC }");
|
||||
out.println(".Atomic { background-color: #CCCCFF }");
|
||||
out.println(".Atomic-no-uppercase { background-color: #CCFFCC }");
|
||||
out.println(".Non-ID { background-color: #FFCCCC }");
|
||||
out.println(".Non-XID { background-color: #FFCCCC }");
|
||||
out.println(".Decomposable { background-color: #FFFFCC }");
|
||||
out.println(".Pattern_Syntax { background-color: #FFCCFF }");
|
||||
|
||||
out.println("th { text-align: left }");
|
||||
out.println("-->");
|
||||
out.println("</style></head><body><table>");
|
||||
*/
|
||||
htmlOut.println("<table border='1' cellpadding='2' cellspacing='0' style='border-collapse: collapse'>");
|
||||
|
||||
for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) {
|
||||
if (scriptCode == COMMON_SCRIPT
|
||||
|| scriptCode == INHERITED_SCRIPT)
|
||||
continue;
|
||||
showCodes(out, scriptCode);
|
||||
showCodes(htmlOut, textOut, scriptCode);
|
||||
}
|
||||
showCodes(out, COMMON_SCRIPT);
|
||||
showCodes(out, INHERITED_SCRIPT);
|
||||
out.println("</table></body></html>");
|
||||
out.close();
|
||||
showCodes(htmlOut, textOut, COMMON_SCRIPT);
|
||||
showCodes(htmlOut, textOut, INHERITED_SCRIPT);
|
||||
htmlOut.println("</table></body></html>");
|
||||
htmlOut.close();
|
||||
}
|
||||
|
||||
UnicodeSet idnaTypeSet[] = new UnicodeSet[IDNA_TYPE_LIMIT];
|
||||
{
|
||||
for (int i = 0; i < idnaTypeSet.length; ++i) idnaTypeSet[i] = new UnicodeSet();
|
||||
}
|
||||
static final int OK = 0, DELETED = 1, ILLEGAL = 2, REMAPPED = 3, IDNA_TYPE_LIMIT = 4;
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private int getIDNAType(int cp) {
|
||||
inbuffer.setLength(0);
|
||||
UTF16.append(inbuffer, cp);
|
||||
try {
|
||||
intermediate = IDNA.convertToASCII(inbuffer,
|
||||
IDNA.DEFAULT); // USE_STD3_RULES
|
||||
if (intermediate.length() == 0)
|
||||
return DELETED;
|
||||
outbuffer = IDNA.convertToUnicode(intermediate,
|
||||
IDNA.USE_STD3_RULES);
|
||||
} catch (StringPrepParseException e) {
|
||||
return ILLEGAL;
|
||||
} catch (Exception e) {
|
||||
System.out.println("Failure at: " + Utility.hex(cp));
|
||||
return ILLEGAL;
|
||||
}
|
||||
if (!TestData.equals(inbuffer, outbuffer))
|
||||
return REMAPPED;
|
||||
return OK;
|
||||
}
|
||||
StringBuffer inbuffer = new StringBuffer();
|
||||
StringBuffer intermediate, outbuffer;
|
||||
|
||||
UnicodeSet lowercase = new UnicodeSet("[:Lowercase:]");
|
||||
|
||||
/**
|
||||
* @param out
|
||||
* @param htmlOut
|
||||
* @param textOut TODO
|
||||
* @param scriptCode
|
||||
* @param ucd
|
||||
* @param coreChars
|
||||
* @param decompChars
|
||||
* @param scriptCode
|
||||
*/
|
||||
private void showCodes(PrintWriter out, int scriptCode) {
|
||||
private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode) {
|
||||
if (coreChars[scriptCode] == null) return;
|
||||
System.out.println(ucd.getScriptID_fromIndex((byte) scriptCode));
|
||||
String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode);
|
||||
out.println();
|
||||
out.println("<tr><th class='script'>Script: " + script + "</th></tr>");
|
||||
htmlOut.println();
|
||||
htmlOut.println("<tr><th class='script'>Script: " + script + "</th></tr>");
|
||||
textOut.println();
|
||||
textOut.println("#*** Script: " + script + " ***");
|
||||
UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
|
||||
UnicodeSet decomp = new UnicodeSet(core).retainAll(decomposable);
|
||||
core.removeAll(decomp);
|
||||
UnicodeSet non_id = new UnicodeSet(core).removeAll(xid_continue);
|
||||
core.removeAll(non_id);
|
||||
|
||||
UnicodeSet deleted = extract(idnaTypeSet[DELETED], core);
|
||||
UnicodeSet illegal = extract(idnaTypeSet[ILLEGAL], core);
|
||||
UnicodeSet remapped = extract(idnaTypeSet[REMAPPED], core);
|
||||
|
||||
UnicodeSet decomp = extract(decomposable, core);
|
||||
UnicodeSet pattern = extract(patternProp, core);
|
||||
UnicodeSet non_id = extract(not_xid_continue, core);
|
||||
|
||||
UnicodeSet otherCore = new UnicodeSet(core).removeAll(hasUpper);
|
||||
core.removeAll(otherCore);
|
||||
if (core.size() == 0) {
|
||||
@ -262,58 +313,81 @@ public class TestData implements UCD_Types {
|
||||
core = otherCore;
|
||||
otherCore = temp;
|
||||
}
|
||||
printlnSet(out, "Atomic", core, scriptCode);
|
||||
if (otherCore.size() != 0) printlnSet(out, "Atomic-no-uppercase", otherCore, scriptCode);
|
||||
if (non_id.size() != 0) printlnSet(out, "Non-ID", non_id, scriptCode);
|
||||
if (decomp.size() != 0) printlnSet(out, "Decomposable", decomp, scriptCode);
|
||||
|
||||
if (core.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic", core, scriptCode);
|
||||
if (otherCore.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", otherCore, scriptCode);
|
||||
if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode);
|
||||
if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode);
|
||||
if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "Decomposable", decomp, scriptCode);
|
||||
|
||||
if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped", remapped, scriptCode);
|
||||
if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode);
|
||||
if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Illegal", illegal, scriptCode);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param out
|
||||
* @param unicodeset
|
||||
* @param uca
|
||||
* @param scriptCode
|
||||
*
|
||||
*/
|
||||
private void printlnSet(PrintWriter out, String title,
|
||||
UnicodeSet unicodeset, int scriptCode) {
|
||||
private UnicodeSet extract(UnicodeSet other, UnicodeSet core) {
|
||||
UnicodeSet decomp = new UnicodeSet(core).retainAll(other);
|
||||
core.removeAll(decomp);
|
||||
return decomp;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param htmlOut
|
||||
* @param textOut TODO
|
||||
* @param script TODO
|
||||
* @param unicodeset
|
||||
* @param scriptCode
|
||||
* @param uca
|
||||
*/
|
||||
private void printlnSet(PrintWriter htmlOut, PrintWriter textOut,
|
||||
String script, String title, UnicodeSet unicodeset, int scriptCode) {
|
||||
if (unicodeset == null)
|
||||
return;
|
||||
int size = unicodeset.size();
|
||||
String dir = unicodeset.containsSome(bidiR)
|
||||
&& unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
|
||||
out.println("<tr><th class='" + title + "'>" + title + " ("
|
||||
htmlOut.println("<tr><th class='" + title + "'>" + title + " ("
|
||||
+ nf.format(size) + ")</th></tr>");
|
||||
out.print("<tr><td class='" + title + "'" + dir + ">");
|
||||
htmlOut.print("<tr><td class='" + title + "'" + dir + ">");
|
||||
textOut.println();
|
||||
textOut.println("# " + title);
|
||||
bf.setValueSource(script + " ; " + title);
|
||||
UnicodeSetIterator usi = new UnicodeSetIterator();
|
||||
if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) {
|
||||
usi.reset(unicodeset);
|
||||
while (usi.nextRange()) {
|
||||
if (usi.codepoint == usi.codepointEnd) {
|
||||
out.print(formatCode(UTF16
|
||||
htmlOut.print(formatCode(UTF16
|
||||
.valueOf(usi.codepoint)));
|
||||
} else {
|
||||
out.print(formatCode(UTF16
|
||||
htmlOut.print(formatCode(UTF16
|
||||
.valueOf(usi.codepoint))
|
||||
+ ".. "
|
||||
+ formatCode(UTF16
|
||||
.valueOf(usi.codepointEnd)));
|
||||
}
|
||||
}
|
||||
bf.showSetNames(textOut, unicodeset);
|
||||
} else {
|
||||
Set reordered = new TreeSet(uca);
|
||||
usi.reset(unicodeset);
|
||||
while (usi.next()) {
|
||||
boolean foo = reordered.add(usi.getString());
|
||||
String x = usi.getString();
|
||||
boolean foo = reordered.add(x);
|
||||
if (!foo)
|
||||
throw new IllegalArgumentException("Collision with "
|
||||
+ Default.ucd().getCodeAndName(usi.getString()));
|
||||
+ Default.ucd().getCodeAndName(x));
|
||||
}
|
||||
for (Iterator it = reordered.iterator(); it.hasNext();) {
|
||||
out.print(formatCode((String) it
|
||||
.next()));
|
||||
Object key = it.next();
|
||||
htmlOut.print(formatCode((String)key));
|
||||
}
|
||||
bf.showSetNames(textOut, reordered);
|
||||
}
|
||||
out.println("</td></tr>");
|
||||
htmlOut.println("</td></tr>");
|
||||
}
|
||||
|
||||
/**
|
||||
@ -324,7 +398,7 @@ public class TestData implements UCD_Types {
|
||||
int cat = ucd.getCategory(UTF16.charAt(string,0));
|
||||
return "<span title='" + ucd.getCodeAndName(string) + "'>"
|
||||
+ (cat == Me || cat == Mn ? "\u00A0" : "") //\u25cc
|
||||
+ BagFormatter.toHTML.transliterate(string)
|
||||
+ BagFormatter.toHTMLControl.transliterate(string)
|
||||
+ " </span>";
|
||||
}
|
||||
}
|
||||
|
@ -123,7 +123,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
|
||||
public int getMaxWidth(boolean isShort) {
|
||||
return 15;
|
||||
}
|
||||
}.setValues(LONG_YES_NO, YES_NO)
|
||||
}.setValues(LONG_YES_NO, YES_NO).swapFirst2ValueAliases()
|
||||
.setMain("NFD_Quick_Check", "NFD_QC", UnicodeProperty.ENUMERATED, version));
|
||||
|
||||
add(new UnicodeProperty.SimpleProperty() {
|
||||
@ -135,7 +135,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
|
||||
public int getMaxWidth(boolean isShort) {
|
||||
return 15;
|
||||
}
|
||||
}.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE)
|
||||
}.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE).swapFirst2ValueAliases()
|
||||
.setMain("NFC_Quick_Check", "NFC_QC", UnicodeProperty.ENUMERATED, version));
|
||||
|
||||
add(new UnicodeProperty.SimpleProperty() {
|
||||
@ -147,7 +147,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
|
||||
public int getMaxWidth(boolean isShort) {
|
||||
return 15;
|
||||
}
|
||||
}.setValues(LONG_YES_NO, YES_NO)
|
||||
}.setValues(LONG_YES_NO, YES_NO).swapFirst2ValueAliases()
|
||||
.setMain("NFKD_Quick_Check", "NFKD_QC", UnicodeProperty.ENUMERATED, version));
|
||||
|
||||
add(new UnicodeProperty.SimpleProperty() {
|
||||
@ -159,7 +159,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
|
||||
public int getMaxWidth(boolean isShort) {
|
||||
return 15;
|
||||
}
|
||||
}.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE)
|
||||
}.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE).swapFirst2ValueAliases()
|
||||
.setMain("NFKC_Quick_Check", "NFKC_QC", UnicodeProperty.ENUMERATED, version));
|
||||
|
||||
|
||||
@ -235,7 +235,12 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
|
||||
unicodeMap.putAll(hangul.getSet("LVT"),"LVT");
|
||||
unicodeMap.setMissing("Other");
|
||||
}
|
||||
}.setMain("Grapheme_Cluster_Break", "GCB", UnicodeProperty.ENUMERATED, version));
|
||||
}.setMain("Grapheme_Cluster_Break", "GCB", UnicodeProperty.ENUMERATED, version)
|
||||
.addValueAliases(new String[][] {
|
||||
{"Control", "CN"},
|
||||
{"Extend", "EX"},
|
||||
{"Other", "XX"},
|
||||
}).swapFirst2ValueAliases());
|
||||
|
||||
add(new UnicodeProperty.UnicodeMapProperty() {
|
||||
{
|
||||
@ -268,7 +273,17 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
|
||||
unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it.
|
||||
unicodeMap.setMissing("Other");
|
||||
}
|
||||
}.setMain("Word_Break", "WB", UnicodeProperty.ENUMERATED, version));
|
||||
}.setMain("Word_Break", "WB", UnicodeProperty.ENUMERATED, version)
|
||||
.addValueAliases(new String[][] {
|
||||
{"Format", "FO"},
|
||||
{"Katakana", "KA"},
|
||||
{"ALetter", "LE"},
|
||||
{"MidLetter", "ML"},
|
||||
{"MidNum", "MN"},
|
||||
{"Numeric", "NU"},
|
||||
{"ExtendNumLet", "EX"},
|
||||
{"Other", "XX"},
|
||||
}).swapFirst2ValueAliases());
|
||||
|
||||
add(new UnicodeProperty.UnicodeMapProperty() {
|
||||
{
|
||||
@ -307,7 +322,20 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
|
||||
unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it.
|
||||
unicodeMap.setMissing("Other");
|
||||
}
|
||||
}.setMain("Sentence_Break", "SB", UnicodeProperty.ENUMERATED, version));
|
||||
}.setMain("Sentence_Break", "SB", UnicodeProperty.ENUMERATED, version)
|
||||
.addValueAliases(new String[][] {
|
||||
{"Sep", "SE"},
|
||||
{"Format", "FO"},
|
||||
{"Sp", "SP"},
|
||||
{"Lower", "LO"},
|
||||
{"Upper", "UP"},
|
||||
{"OLetter", "LE"},
|
||||
{"Numeric", "NU"},
|
||||
{"ATerm", "AT"},
|
||||
{"STerm", "ST"},
|
||||
{"Close", "CL"},
|
||||
{"Other", "XX"},
|
||||
}).swapFirst2ValueAliases());
|
||||
}
|
||||
|
||||
static String[] YES_NO_MAYBE = {"N", "M", "Y"};
|
||||
|
@ -14,58 +14,83 @@
|
||||
.Non-XID { background-color: #FFCCCC }
|
||||
.Decomposable { background-color: #FFFFCC }
|
||||
.Pattern_Syntax { background-color: #FFCCFF }
|
||||
.IDN-Remapped { background-color: #FF6666 }
|
||||
.IDN-Deleted { background-color: #66FF66 }
|
||||
.IDN-Illegal { background-color: #6666FF }
|
||||
th { text-align: left }
|
||||
-->
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<body style="margin: 2em">
|
||||
|
||||
<h1>IDN Character Categorization</h1>
|
||||
<p>$Date: 2005/03/29 18:31:15 $, MED</p>
|
||||
<p><i>$Date: 2005/03/30 17:19:32 $, MED</i></p>
|
||||
<p>This page lists all of the valid output IDN characters broken down by category. By "output" IDN
|
||||
characters, we mean ones that can result from nameprep. Characters are grouped first by script, and
|
||||
then by subcategory. Within each subcategory characters are sorted according to the default
|
||||
<a href="http://www.unicode.org/reports/tr10/">UCA</a> order. Tooltips provide the character code
|
||||
<a href="http://www.unicode.org/reports/tr10/">UCA</a> order. Tool-tips provide the character code
|
||||
and name (in enabled browsers).</p>
|
||||
<table border="1" cellpadding="2" cellspacing="0" style="border-collapse: collapse" bordercolor="#111111" id="AutoNumber1">
|
||||
<tr>
|
||||
<th>Subcategory</th>
|
||||
<th>Description</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="Atomic">Atomic</td>
|
||||
<td>Characters that don't fall into any of the following subcategories</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="Atomic-no-uppercase">Atomic-no-uppercase</td>
|
||||
<td>For bicameral scripts, Atomic characters without an uppercase.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="Pattern_Syntax">Pattern_Syntax</td>
|
||||
<td>Characters recommended as a basis for syntax, as in
|
||||
<a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and Pattern Syntax</a>.
|
||||
Excludes the word characters in <i>Section 4 Word Boundaries</i> of
|
||||
<a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX# 29</a>, in the
|
||||
Word_Break property and notes at the end of the section. </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="Non-XID">Non-XID</td>
|
||||
<td>Characters recommended as a basis for identifiers, as in
|
||||
<a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and Pattern Syntax</a>
|
||||
(XID_Continue). Excludes the word characters in <i>Section 4 Word Boundaries</i> of
|
||||
<a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX# 29</a>, in the
|
||||
Word_Break property and notes at the end of the section.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="Decomposable">Decomposable</td>
|
||||
<td>Characters with NFC decompositions.</td>
|
||||
</tr>
|
||||
</table>
|
||||
<table>
|
||||
</table>
|
||||
<h2>Categorization</h2>
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
||||
<blockquote>
|
||||
<table border="1" cellpadding="2" cellspacing="0" style="border-collapse: collapse">
|
||||
<caption><b><font size="4">Key</font></b></caption>
|
||||
<tr>
|
||||
<th>Subcategory</th>
|
||||
<th>Description</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="Atomic">Atomic</td>
|
||||
<td>Characters that don't fall into any of the following subcategories</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="Atomic-no-uppercase">Atomic-no-uppercase</td>
|
||||
<td>For bicameral scripts, Atomic characters without an uppercase.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="Pattern_Syntax">Pattern_Syntax</td>
|
||||
<td>Characters recommended as a basis for use in pattern syntax.<p>Excludes the word
|
||||
characters in <i>Section 4 Word Boundaries</i> of
|
||||
<a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX #29</a>, in the
|
||||
Word_Break property and notes at the end of the section.</p>
|
||||
<p>See <a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and
|
||||
Pattern Syntax</a>. </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="Non-XID">Non-XID</td>
|
||||
<td>Characters not recommended as a basis for identifiers, excluding Pattern_Syntax and the
|
||||
word characters in <i>Section 4 Word Boundaries</i> of
|
||||
<a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX #29</a>, in the
|
||||
Word_Break property and notes at the end of the section.<p>See
|
||||
<a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and Pattern
|
||||
Syntax</a> (XID_Continue).</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="Decomposable">Decomposable</td>
|
||||
<td>Characters with NFC decompositions.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="IDN-Remapped">IDN-Remapped</td>
|
||||
<td>Characters remapped by IDN.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="IDN-Deleted">IDN-Deleted</td>
|
||||
<td>Characters deleted by IDN.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="IDN-Illegal">IDN-Illegal </td>
|
||||
<td>Characters illegal in IDN (note: most of these are due to IDN's using an old version of Unicode).</td>
|
||||
</tr>
|
||||
</table>
|
||||
</blockquote>
|
||||
<p>The information in the categorization is also available in a plain-text file, at
|
||||
<a href="idn-chars.txt">idn-chars.txt</a>. It can be viewed as is, or loaded into a spreadsheet for
|
||||
sorting and filtering to view the data in different ways. The format is:</p>
|
||||
<blockquote>
|
||||
<p>code ; script ; subcategory # general-category (character) character-name</p>
|
||||
</blockquote>
|
||||
<p><i>Examples:</i></p>
|
||||
<pre>0061 ; LATIN ; Atomic # ; L& (a) LATIN SMALL LETTER A
|
||||
2015 ; COMMON ; Pattern_Syntax # Pd (―) HORIZONTAL BAR
|
||||
058A ; ARMENIAN ; Atomic-no-uppercase # ; Pd (֊) ARMENIAN HYPHEN
|
||||
20AC ; COMMON ; Non-XID # ; Sc (€) EURO SIGN</pre>
|
||||
<h2>Categorization</h2>
|
@ -17,43 +17,53 @@ public class UnicodeDataFile {
|
||||
private String mostRecent;
|
||||
private String filename;
|
||||
private UnicodeDataFile(){};
|
||||
private String fileType = ".txt";
|
||||
|
||||
public static UnicodeDataFile openAndWriteHeader(String directory, String filename) throws IOException {
|
||||
UnicodeDataFile result = new UnicodeDataFile();
|
||||
result.newFile = directory + filename + UnicodeDataFile.getFileSuffix(true);
|
||||
result.out = Utility.openPrintWriter(result.newFile, Utility.UTF8_UNIX);
|
||||
String[] batName = {""};
|
||||
result.mostRecent = UnicodeDataFile.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName);
|
||||
result.batName = batName[0];
|
||||
result.filename = filename;
|
||||
return new UnicodeDataFile(directory, filename, false);
|
||||
}
|
||||
|
||||
public static UnicodeDataFile openHTMLAndWriteHeader(String directory, String filename) throws IOException {
|
||||
return new UnicodeDataFile(directory, filename, true);
|
||||
}
|
||||
|
||||
private UnicodeDataFile (String directory, String filename, boolean isHTML) throws IOException {
|
||||
fileType = isHTML ? ".html" : ".txt";
|
||||
String newSuffix = UnicodeDataFile.getFileSuffix(true, fileType);
|
||||
newFile = directory + filename + newSuffix;
|
||||
out = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX);
|
||||
String[] batName2 = {""};
|
||||
mostRecent = UnicodeDataFile.generateBat(directory, filename, newSuffix, fileType, batName2);
|
||||
batName = batName2[0];
|
||||
// The constructor parameter shadows the field, so the field must be qualified with "this".
// Otherwise the field stays null and close() looks for "nullFooter" + fileType.
this.filename = filename;
|
||||
|
||||
result.out.println("# " + filename + UnicodeDataFile.getFileSuffix(false));
|
||||
result.out.println(generateDateLine());
|
||||
result.out.println("#");
|
||||
result.out.println("# Unicode Character Database");
|
||||
result.out.println("# Copyright (c) 1991-" + Default.getYear() + " Unicode, Inc.");
|
||||
result.out.println(
|
||||
"# For terms of use, see http://www.unicode.org/terms_of_use.html");
|
||||
result.out.println("# For documentation, see UCD.html");
|
||||
if (!isHTML) {
|
||||
out.println("# " + filename + UnicodeDataFile.getFileSuffix(false));
|
||||
out.println(generateDateLine());
|
||||
out.println("#");
|
||||
out.println("# Unicode Character Database");
|
||||
out.println("# Copyright (c) 1991-" + Default.getYear() + " Unicode, Inc.");
|
||||
out.println(
|
||||
"# For terms of use, see http://www.unicode.org/terms_of_use.html");
|
||||
out.println("# For documentation, see UCD.html");
|
||||
}
|
||||
try {
|
||||
Utility.appendFile(filename + "Header.txt", Utility.LATIN1, result.out);
|
||||
Utility.appendFile(filename + "Header" + fileType, Utility.UTF8_UNIX, out);
|
||||
} catch (FileNotFoundException e) {
|
||||
/*
|
||||
result.out.println("# Unicode Character Database: Derived Property Data");
|
||||
result.out.println("# Generated algorithmically from the Unicode Character Database");
|
||||
result.out.println("# For documentation, see UCD.html");
|
||||
result.out.println("# Note: Unassigned and Noncharacter codepoints may be omitted");
|
||||
result.out.println("# if they have default property values.");
|
||||
result.out.println("# ================================================");
|
||||
out.println("# Unicode Character Database: Derived Property Data");
|
||||
out.println("# Generated algorithmically from the Unicode Character Database");
|
||||
out.println("# For documentation, see UCD.html");
|
||||
out.println("# Note: Unassigned and Noncharacter codepoints may be omitted");
|
||||
out.println("# if they have default property values.");
|
||||
out.println("# ================================================");
|
||||
*/
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
try {
|
||||
Utility.appendFile(filename + "Footer.txt", Utility.LATIN1, out);
|
||||
Utility.appendFile(filename + "Footer" + fileType, Utility.UTF8_UNIX, out);
|
||||
} catch (FileNotFoundException e) {}
|
||||
out.close();
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName);
|
||||
@ -64,21 +74,20 @@ public class UnicodeDataFile {
|
||||
}
|
||||
|
||||
public static String getHTMLFileSuffix(boolean withDVersion) {
|
||||
return "-"
|
||||
+ Default.ucd().getVersion()
|
||||
+ ((withDVersion && MakeUnicodeFiles.dVersion >= 0)
|
||||
? ("d" + MakeUnicodeFiles.dVersion)
|
||||
: "")
|
||||
+ ".html";
|
||||
return getFileSuffix(withDVersion, ".html");
|
||||
}
|
||||
|
||||
public static String getFileSuffix(boolean withDVersion) {
|
||||
return getFileSuffix(withDVersion, ".txt");
|
||||
}
|
||||
|
||||
public static String getFileSuffix(boolean withDVersion, String suffix) {
|
||||
return "-"
|
||||
+ Default.ucd().getVersion()
|
||||
+ ((withDVersion && MakeUnicodeFiles.dVersion >= 0)
|
||||
? ("d" + MakeUnicodeFiles.dVersion)
|
||||
: "")
|
||||
+ ".txt";
|
||||
+ suffix;
|
||||
}
|
||||
|
||||
//Remove "d1" from DerivedJoiningGroup-3.1.0d1.txt type names
|
||||
@ -126,8 +135,8 @@ public class UnicodeDataFile {
|
||||
*/
|
||||
// static final byte KEEP_SPECIAL = 0, SKIP_SPECIAL = 1;
|
||||
|
||||
public static String generateBat(String directory, String fileRoot, String suffix, String[] outputBatName) throws IOException {
|
||||
String mostRecent = Utility.getMostRecentUnicodeDataFile(UnicodeDataFile.fixFile(fileRoot), Default.ucd().getVersion(), true, true);
|
||||
public static String generateBat(String directory, String fileRoot, String suffix, String fileType, String[] outputBatName) throws IOException {
|
||||
String mostRecent = Utility.getMostRecentUnicodeDataFile(UnicodeDataFile.fixFile(fileRoot), Default.ucd().getVersion(), true, true, fileType);
|
||||
if (mostRecent != null) {
|
||||
outputBatName[0] = UnicodeDataFile.generateBatAux(directory + "DIFF/Diff_" + fileRoot + suffix,
|
||||
mostRecent, directory + fileRoot + suffix);
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
|
||||
* $Date: 2005/03/04 02:50:26 $
|
||||
* $Revision: 1.47 $
|
||||
* $Date: 2005/03/30 17:19:32 $
|
||||
* $Revision: 1.48 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -1021,7 +1021,12 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
|
||||
}
|
||||
|
||||
public static String getMostRecentUnicodeDataFile(String filename, String version,
|
||||
boolean acceptLatest, boolean show) throws IOException {
|
||||
boolean acceptLatest, boolean show) throws IOException {
|
||||
return getMostRecentUnicodeDataFile(filename, version, acceptLatest, show, ".txt");
|
||||
}
|
||||
|
||||
public static String getMostRecentUnicodeDataFile(String filename, String version,
|
||||
boolean acceptLatest, boolean show, String fileType) throws IOException {
|
||||
// get all the files in the directory
|
||||
|
||||
int compValue = acceptLatest ? 0 : 1;
|
||||
@ -1030,7 +1035,7 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
|
||||
|
||||
String directoryName = UCD_Types.UCD_DIR + File.separator + searchPath[i] + "-Update" + File.separator;
|
||||
if (show) System.out.println("Trying: '" + directoryName + "', '" + filename + "'");
|
||||
String result = searchDirectory(new File(directoryName), filename, show);
|
||||
String result = searchDirectory(new File(directoryName), filename, show, fileType);
|
||||
if (result != null) return result;
|
||||
|
||||
}
|
||||
@ -1048,16 +1053,20 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
|
||||
}
|
||||
|
||||
public static String searchDirectory(File directory, String filename, boolean show) throws IOException {
|
||||
return searchDirectory(directory, filename, show, ".txt");
|
||||
}
|
||||
|
||||
public static String searchDirectory(File directory, String filename, boolean show, String fileType) throws IOException {
|
||||
Iterator it = getDirectoryContentsLastFirst(directory).iterator();
|
||||
while (it.hasNext()) {
|
||||
String fn = (String) it.next();
|
||||
File foo = new File(directory + File.separator + fn);
|
||||
// System.out.println("\tChecking: '" + foo.getCanonicalPath() + "'");
|
||||
if (foo.isDirectory()) {
|
||||
String attempt = searchDirectory(foo, filename, show);
|
||||
String attempt = searchDirectory(foo, filename, show, fileType);
|
||||
if (attempt != null) return attempt;
|
||||
}
|
||||
if (fn.endsWith(".txt") && fn.startsWith(filename)) {
|
||||
if (fn.endsWith(fileType) && fn.startsWith(filename)) {
|
||||
if (show) System.out.println("\tFound: '" + fn + "'");
|
||||
return foo.getCanonicalPath();
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user