ICU-0 U4.1
X-SVN-Rev: 17421
This commit is contained in:
@ -5,8 +5,8 @@
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/,v $
* $Date: 2004/04/17 18:21:39 $
* $Revision: 1.12 $
* $Date: 2005/03/30 17:19:32 $
* $Revision: 1.13 $
@ -17,6 +17,7 @@ import java.util.*;
@ -30,6 +31,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
OldUnicodeMap sampleMap = null;
OldUnicodeMap map = new OldUnicodeMap();
UnicodeProperty prop;
// ====================== Main ===========================
@ -46,6 +48,34 @@ abstract public class GenerateBreakTest implements UCD_Types {
this.ucd = ucd;
nfd = new Normalizer(Normalizer.NFD, ucd.getVersion());
nfkd = new Normalizer(Normalizer.NFKD, ucd.getVersion());
public void fillMap(String propName) {
List list = y.getAvailableValues();
for (Iterator it = list.iterator(); it.hasNext();) {
String label = (String);
map.add(label, y.getSet(label));
ToolUnicodePropertySource unicodePropertySource = ToolUnicodePropertySource.make("");
Set labels = new HashSet();
int addToMap(String label) {
UnicodeSet s = prop.getSet(label);
if (s == null || s.size() == 0) throw new IllegalArgumentException("Bad value: " + prop.getName() + ", " + label);
return map.add(label, s);
int addToMapLast(String label) {
int result = addToMap(label);
Set values = new HashSet(prop.getAvailableValues());
if (!values.equals(labels)) throw new IllegalArgumentException("Missing Property Values: " + prop.getName()
+ ": " + values.removeAll(labels));
return result;
// COMMON STUFF for Hangul
@ -280,24 +310,30 @@ abstract public class GenerateBreakTest implements UCD_Types {
//printLine(out, samples[LB_ZW], "", samples[LB_CL]);
//printLine(out, samples[LB_ZW], " ", samples[LB_CL]);
PrintWriter out = Utility.openPrintWriter("TR29\\"
UnicodeDataFile fc = UnicodeDataFile.openHTMLAndWriteHeader("auxiliary\\", fileName + "BreakTest");
PrintWriter out = fc.out;
/* PrintWriter out = Utility.openPrintWriter("auxiliary\\"
+ fileName + "BreakTest-"
+ ucd.getVersion()
+ ".html", Utility.UTF8_WINDOWS);
out.println("<!doctype HTML PUBLIC '-//W3C//DTD HTML 4.0 Transitional//EN' ''>");
out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
out.println("<title>" + fileName + " Break Chart</title>");
out.println("<style type='text/css'>");
out.println("td, th { vertical-align: top }");
out.println("<body bgcolor='#FFFFFF'>");
out.println("<h2>" + fileName + " Break Chart</h2>");
out.println("<p><b>Unicode Version:</b> " + ucd.getVersion() + "; <b>Date:</b> " + ucd.getDate() + "</p>");
out.println("<p><b>Unicode Version:</b> " + ucd.getVersion() + "</p>");
out.println("<p><b>Date:</b> " + Default.getDate() + "</p>");
if (sampleMap != null) {
if (false) {
out.println("<h3>Character Type Breakdown</h3>");
out.println("<table border='1' cellspacing='0' width='100%'>");
for (int i = 0; i < sampleMap.size(); ++i) {
@ -308,7 +344,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
@ -318,14 +354,18 @@ abstract public class GenerateBreakTest implements UCD_Types {
String[] testCase = new String[50];
// do main test
PrintWriter out = Utility.openPrintWriter("TR29\\" + fileName + "BreakTest"
UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader("auxiliary\\", fileName + "BreakTest"
+ (shortVersion ? "_SHORT" : ""));
PrintWriter out = fc.out;
/* PrintWriter out = Utility.openPrintWriter("TR29\\" + fileName + "BreakTest"
+ (shortVersion ? "_SHORT" : "")
+ "-" + ucd.getVersion()
+ ".txt", Utility.UTF8_WINDOWS);
int counter = 0;
out.println("# Default " + fileName + " Break Test");
out.println("# Generated: " + ucd.getDate() + ", MED");
out.println("# Format:");
out.println("# <string> (# <comment>)? ");
@ -361,7 +401,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
printLine(out, extraSingleSamples[ii], true, false);
out.println("# Lines: " + counter);
public void sampleDescription(PrintWriter out) {}
@ -461,7 +501,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
if (after == null) continue;
String h = getTypeID(after);
types += "<th " + width + " title='" + getInfo(after) + "'><a class='lbclass' href='#" + h + "'>" + h + "</th>";
types += "<th " + width + " class='lbclass' title='" + getInfo(after) + "'>" + h + "</th>";
//codes += "<th " + width + " title='" + getInfo(after) + "'>" + Utility.hex(after) + "</th>";
@ -480,8 +520,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
if (before == null) continue;
String h = getTypeID(before);
String line = "<tr><th title='" + ucd.getCodeAndName(before) + "'><a class='lbclass' href='#" + h + "'>"
+ h + "</th>";
String line = "<tr><th class='lbclass' title='" + ucd.getCodeAndName(before) + "'>" + h + "</th>";
for (int type2 = 0; type2 < tableLimit; ++type2) {
@ -555,7 +594,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
String status;
if (html) {
status = hasBreak ? " style='border-right: 1px solid blue'" : "";
string.append("<span title='" + getRule() + "'><span" + status + "> </span> <span>");
string.append("<span title='" + getRule() + "'><span" + status + "> </span> </span>");
} else {
status = hasBreak ? BREAK : NOBREAK;
@ -574,7 +613,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
+ "'>"
+ Utility.quoteXML(Utility.getDisplay(cp), true)
+ "</span>");
string.append("<span title='" + getRule() + "'><span" + status + "> </span> <span>");
string.append("<span title='" + getRule() + "'><span" + status + "> </span> </span>");
} else {
if (string.length() > 0) {
string.append(' ');
@ -743,28 +782,23 @@ abstract public class GenerateBreakTest implements UCD_Types {
GenerateGraphemeBreakTest(UCD ucd) {
fileName = "GraphemeCluster";
fileName = "Grapheme";
sampleMap = map;
Object foo = prop = unicodePropertySource.getProperty("Grapheme_Cluster_Break");
final int
CR = map.add("CR", new UnicodeSet(0xD, 0xD)),
LF = map.add("LF", new UnicodeSet(0xA, 0xA)),
Control = map.add("Control",
getSet(ucd, CATEGORY, Cc)
.addAll(getSet(ucd, CATEGORY, Cf))
.addAll(getSet(ucd, CATEGORY, Zp))
.addAll(getSet(ucd, CATEGORY, Zl))
Extend = map.add("Extend", getSet(ucd, DERIVED, GraphemeExtend)),
L = map.add("L", getSet(ucd, HANGUL_SYLLABLE_TYPE, UCD_Types.L)),
V = map.add("V", getSet(ucd, HANGUL_SYLLABLE_TYPE, UCD_Types.V)),
T = map.add("T", getSet(ucd, HANGUL_SYLLABLE_TYPE, UCD_Types.T)),
LV = map.add("LV", getSet(ucd, HANGUL_SYLLABLE_TYPE, UCD_Types.LV)),
LVT = map.add("LVT", getSet(ucd, HANGUL_SYLLABLE_TYPE, UCD_Types.LVT)),
Other = map.add("Other", new UnicodeSet(0,0x10FFFF), false, false);
CR = addToMap("CR"),
LF = addToMap("LF"),
Control = addToMap("Control"),
Extend = addToMap("Extend"),
L = addToMap("L"),
V = addToMap("V"),
T = addToMap("T"),
LV = addToMap("LV"),
LVT = addToMap("LVT"),
Other = addToMapLast("Other");
// stuff that subclasses need to override
public String getTypeID(int cp) {
@ -860,35 +894,23 @@ abstract public class GenerateBreakTest implements UCD_Types {
Object foo = prop = unicodePropertySource.getProperty("Word_Break");
//static String LENGTH = "[\u30FC\uFF70]";
//static String HALFWIDTH_KATAKANA = "[\uFF66-\uFF9F]";
//static String KATAKANA_ITERATION = "[\u30FD\u30FE]";
//static String HIRAGANA_ITERATION = "[\u309D\u309E]";
final int
Format = map.add("Format", getSet(ucd, CATEGORY, Cf).remove(0x00AD)),
Katakana = map.add("Katakana", getSet(ucd, SCRIPT, KATAKANA_SCRIPT)
.addAll(new UnicodeSet("[\u30FC\uFF70\uFF9E\uFF9F]"))
//.addAll(new UnicodeSet(HALFWIDTH_KATAKANA))
//.addAll(new UnicodeSet(KATAKANA_ITERATION))
ALetter = map.add("ALetter",
getSet(ucd, DERIVED, PropAlphabetic)
.add(0x05F3, 0x05F3)
.removeAll(getSet(ucd, BINARY_PROPERTIES, Ideographic))
.removeAll(getSet(ucd, SCRIPT, THAI_SCRIPT))
.removeAll(getSet(ucd, SCRIPT, LAO_SCRIPT))
.removeAll(getSet(ucd, SCRIPT, HIRAGANA_SCRIPT))
MidLetter = map.add("MidLetter",
new UnicodeSet("[\\u0027\\u00AD\\u00B7\\u05f4\\u05F4\\u2019\\u2027]")),
MidNumLet = map.add("MidNumLet",
new UnicodeSet("[\\u002E\\u003A]")),
MidNum = map.add("MidNum", getSet(ucd, LINE_BREAK, LB_IN)
Numeric = map.add("Numeric", getSet(ucd, LINE_BREAK, LB_NU)),
Other = map.add("Other", new UnicodeSet(0,0x10FFFF), false, false);
Format = addToMap("Format"),
Katakana = addToMap("Katakana"),
ALetter = addToMap("ALetter"),
MidLetter = addToMap("MidLetter"),
//MidNumLet = addToMap("MidNumLet"),
MidNum = addToMap("MidNum"),
Numeric = addToMap("Numeric"),
ExtendNumLet = addToMap("ExtendNumLet"),
Other = addToMapLast("Other");
// stuff that subclasses need to override
public String getTypeID(int cp) {
@ -948,11 +970,11 @@ abstract public class GenerateBreakTest implements UCD_Types {
// Don’t break letters across certain punctuation
setRule("6: ALetter × (MidLetter | MidNumLet) ALetter");
if (before == ALetter && (after == MidLetter || after == MidNumLet) && after2 == ALetter) return false;
setRule("6: ALetter × MidLetter ALetter");
if (before == ALetter && after == MidLetter && after2 == ALetter) return false;
setRule("7: ALetter (MidLetter | MidNumLet) × ALetter");
if (before2 == ALetter && (before == MidLetter || before == MidNumLet) && after == ALetter) return false;
if (before2 == ALetter && before == MidLetter && after == ALetter) return false;
// Don’t break within sequences of digits, or digits adjacent to letters.
@ -968,15 +990,22 @@ abstract public class GenerateBreakTest implements UCD_Types {
// Don’t break within sequences like: '-3.2'
setRule("11: Numeric (MidNum | MidNumLet) × Numeric");
if (before2 == Numeric && (before == MidNum || before == MidNumLet) && after == Numeric) return false;
if (before2 == Numeric && before == MidNum && after == Numeric) return false;
setRule("12: Numeric × (MidNum | MidNumLet) Numeric");
if (before == Numeric && (after == MidNum || after == MidNumLet) && after2 == Numeric) return false;
if (before == Numeric && after == MidNum && after2 == Numeric) return false;
// Don't break between Katakana
setRule("13: Katakana × Katakana");
if (before == Katakana && after == Katakana) return false;
// Do not break from extenders
setRule("13a: (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet");
if ((before == ALetter || before == Numeric || before == Katakana || before == ExtendNumLet) && after == ExtendNumLet) return false;
setRule("13b: ExtendNumLet × (ALetter | Numeric | Katakana)");
if (before == ExtendNumLet && (after == ALetter || after == Numeric || after == Katakana)) return false;
// Otherwise break always.
setRule("14: Any ÷ Any");
@ -1344,7 +1373,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
setRule("9: OP SP* ×");
if (lastNonSpace == LB_OP) return false;
// LB 10 Don’t break within ‘”[’, , even with intervening spaces.
// LB 10 Don’t break within ‘”[’, even with intervening spaces.
// QU SP* × OP
setRule("10: QU SP* × OP");
if (lastNonSpace == LB_QU && after == LB_OP) return false;
@ -1377,7 +1406,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
setRule("13: × GL ; GL ×");
if (after == LB_GL || before == LB_GL) return false;
// LB 14 Don’t break before or after ‘”’
// LB 14 Don’t break before or after ‘”’
setRule("14: × QU ; QU ×");
if (before == LB_QU || after == LB_QU) return false;
@ -1450,7 +1479,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
if (before == LB_HY) return true;
if (after == LB_BB) return true;
// LB 19 Don’t break between alphabetics (“at”)
// LB 19 Don’t break between alphabetics (“at”)
// AL × AL
setRule("19: AL × AL");
@ -1515,36 +1544,20 @@ abstract public class GenerateBreakTest implements UCD_Types {
Object foo = prop = unicodePropertySource.getProperty("Sentence_Break");
final int
Sep = map.add("Sep", new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]")),
Format = map.add("Format", getSet(ucd, CATEGORY, Cf)),
Sp = map.add("Sp", getSet(ucd, BINARY_PROPERTIES, White_space)
Lower = map.add("Lower", getSet(ucd, DERIVED, PropLowercase)),
Upper = map.add("Upper", getSet(ucd, CATEGORY, Lt)
.addAll(getSet(ucd, DERIVED, PropUppercase))),
OLetter = map.add("OLetter",
getSet(ucd, DERIVED, PropAlphabetic)
.add(0x05F3, 0x05F3)
Numeric = map.add("Numeric", getSet(ucd, LINE_BREAK, LB_NU)),
ATerm = map.add("ATerm", new UnicodeSet(0x002E,0x002E)),
Term = map.add("Term", new UnicodeSet(
+ "\\u1368\\u104A\\u104B\\u166E\\u1803\\u1809\\u203C\\u203D\\u2047\\u2048\\u2049"
+ "\\u3002\\uFE52\\uFE57\\uFF01\\uFF0E\\uFF1F\\uFF61]")),
Close = map.add("Close",
getSet(ucd, CATEGORY, Po)
.addAll(getSet(ucd, CATEGORY, Pe))
.addAll(getSet(ucd, LINE_BREAK, LB_QU))
Other = map.add("Other", new UnicodeSet(0,0x10FFFF), false, false);
Sep = addToMap("Sep"),
Format = addToMap("Format"),
Sp = addToMap("Sp"),
Lower = addToMap("Lower"),
Upper = addToMap("Upper"),
OLetter = addToMap("OLetter"),
Numeric = addToMap("Numeric"),
ATerm = addToMap("ATerm"),
STerm = addToMap("STerm"),
Close = addToMap("Close"),
Other = addToMapLast("Other");
// stuff that subclasses need to override
public String getTypeID(int cp) {
@ -1726,8 +1739,8 @@ abstract public class GenerateBreakTest implements UCD_Types {
if (t == ATerm) {
lookAfter = ATerm;
} else if (t == Term) {
lookAfter = Term;
} else if (t == STerm) {
lookAfter = STerm;
@ -1776,7 +1789,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
setRule("9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )");
return false;
if (lookAfter == Term) break;
if (lookAfter == STerm) break;
// at this point, we have an ATerm. All other conditions are ok, but we need to verify 6
@ -34,6 +34,10 @@ import;
@ -511,6 +515,14 @@ public class MakeUnicodeFiles {
} else if (filename.equals("StandardizedVariants")) {
} else if (filename.equals("GraphemeBreakTest")) {
new GenerateGraphemeBreakTest(Default.ucd()).run();
} else if (filename.equals("WordBreakTest")) {
new GenerateWordBreakTest(Default.ucd()).run();
} else if (filename.equals("LineBreakTest")) {
new GenerateLineBreakTest(Default.ucd()).run();
} else if (filename.equals("SentenceBreakTest")) {
new GenerateSentenceBreakTest(Default.ucd()).run();
} else {
@ -1,5 +1,5 @@
DeltaVersion: 13
Generate: DerivedBidiClass
DeltaVersion: 14
CopyrightYear: 2005
File: auxiliary/GraphemeBreakProperty
@ -14,6 +14,18 @@ File: auxiliary/SentenceBreakProperty
Property: Sentence_Break
Format: skipValue=Other
File: auxiliary/GraphemeBreakTest
Property: SPECIAL
File: auxiliary/WordBreakTest
Property: SPECIAL
File: auxiliary/LineBreakTest
Property: SPECIAL
File: auxiliary/SentenceBreakTest
Property: SPECIAL
File: Blocks
Property: Block
# Note: When comparing block names, casing, whitespace, hyphens,
@ -58,12 +70,14 @@ Value: 4.1
File: extracted/DerivedBidiClass
Property: Bidi_Class
# Bidi Class (listing UnicodeData.txt, field 4: see UCD.html)
# Unlike other properties, unassigned code points in blocks reserved for right-to-left scripts are given either types R or AL.
# Unlike other properties, unassigned code points in blocks
# reserved for right-to-left scripts are given either types R or AL.
# The unassigned characters that default to R are:
# Hebrew, Cypriot_Syllabary, Kharoshthi, and the ranges \u07C0-\u08FF \uFB1D-\uFB4F \U00010840-\U00010FFF
# Hebrew, Cypriot_Syllabary, Kharoshthi, and the ranges \u07C0-\u08FF
# \uFB1D-\uFB4F \U00010840-\U000109FF \U00010A60-\U00010FFF
# The unassigned characters that default to AL are:
# Arabic, Syriac, Thaana, Arabic_Presentation_Forms_A, Arabic_Presentation_Forms_B, Arabic_Supplement,
# and the range \u0750-\u077F, minus the Noncharacter_Code_Points
# Arabic, Syriac, Arabic_Supplement, Thaana, Arabic_Presentation_Forms_A,
# Arabic_Presentation_Forms_B, minus the Noncharacter_Code_Points
# For all other cases:
Format: valueStyle=short skipUnassigned=Left_To_Right
@ -5,8 +5,8 @@
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/,v $
* $Date: 2005/03/26 05:40:05 $
* $Revision: 1.19 $
* $Date: 2005/03/30 17:19:32 $
* $Revision: 1.20 $
@ -148,54 +148,58 @@ public class TestData implements UCD_Types {
Matcher m;
static class GenStringPrep {
UnicodeSet[] coreChars = new UnicodeSet[100];
UnicodeSet decomposable = new UnicodeSet();
UnicodeSet pattern = new UnicodeSet();
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
//UnicodeSet id_continue = ups.getSet("ID_Continue=true");
UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
UnicodeSet wordChars = ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher())
.addAll(new UnicodeSet("[\u0027 \u002D \u002E \u003A \u00B7 \u058A \u05F3" +
" \u05F4 \u200C \u200D \u2010 \u2019 \u2027 \u30A0]"));
UnicodeSet patternProp = ups.getSet("Pattern_Syntax=true").removeAll(wordChars);
UnicodeSet not_xid_continue = ups.getSet("XID_Continue=true").complement().removeAll(wordChars);
//UnicodeSet[] decompChars = new UnicodeSet[100];
UCD ucd = Default.ucd();
Collator uca = Collator.getInstance(ULocale.ENGLISH);
Collator uca0 = Collator.getInstance(ULocale.ENGLISH);
GenerateHanTransliterator.MultiComparator uca
= new GenerateHanTransliterator.MultiComparator(new Comparator[] {
uca0, new UTF16.StringComparator()});
UnicodeSet bidiR = new UnicodeSet(
UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]");
UnicodeSet hasUpper = new UnicodeSet();
BagFormatter bf = new BagFormatter();
UnicodeSet inIDN = new UnicodeSet();
void genStringPrep() throws IOException {
//BagFormatter bf = new BagFormatter();
//System.out.println(bf.showSetDifferences("ID_Continue", id_continue, "XID_Continue", xid_continue));
StringBuffer inbuffer = new StringBuffer();
StringBuffer intermediate, outbuffer;
if (false) {
System.out.println("word chars: " + bf.showSetNames(wordChars));
System.out.println("pat: " + bf.showSetNames(patternProp));
System.out.println("xid: " + bf.showSetNames(not_xid_continue));
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
int cat = Default.ucd().getCategory(cp);
if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
if (!Default.nfd().isNormalized(cp)) decomposable.add(cp);
UTF16.append(inbuffer, cp);
try {
intermediate = IDNA.convertToASCII(inbuffer,
if (intermediate.length() == 0)
outbuffer = IDNA.convertToUnicode(intermediate,
} catch (StringPrepParseException e) {
} catch (Exception e) {
System.out.println("Failure at: " + Utility.hex(cp));
if (!TestData.equals(inbuffer, outbuffer))
int idnaType = getIDNAType(cp);
int script = ucd.getScript(cp);
if (coreChars[script] == null)
coreChars[script] = new UnicodeSet();
@ -208,8 +212,12 @@ public class TestData implements UCD_Types {
PrintWriter out = BagFormatter.openUTF8Writer(GEN_DIR,
PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html");
PrintWriter textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.txt");
textOut.println("For documentation, see idn-chars.html");
Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut);
.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
out.println("<title>IDN Characters</title><style>");
@ -217,44 +225,87 @@ public class TestData implements UCD_Types {
out.println(".script { font-size: 150%; background-color: #CCCCCC }");
out.println(".Atomic { background-color: #CCCCFF }");
out.println(".Atomic-no-uppercase { background-color: #CCFFCC }");
out.println(".Non-ID { background-color: #FFCCCC }");
out.println(".Non-XID { background-color: #FFCCCC }");
out.println(".Decomposable { background-color: #FFFFCC }");
out.println(".Pattern_Syntax { background-color: #FFCCFF }");
out.println("th { text-align: left }");
htmlOut.println("<table border='1' cellpadding='2' cellspacing='0' style='border-collapse: collapse'>");
for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) {
if (scriptCode == COMMON_SCRIPT
|| scriptCode == INHERITED_SCRIPT)
showCodes(out, scriptCode);
showCodes(htmlOut, textOut, scriptCode);
showCodes(out, COMMON_SCRIPT);
showCodes(out, INHERITED_SCRIPT);
showCodes(htmlOut, textOut, COMMON_SCRIPT);
showCodes(htmlOut, textOut, INHERITED_SCRIPT);
UnicodeSet idnaTypeSet[] = new UnicodeSet[IDNA_TYPE_LIMIT];
for (int i = 0; i < idnaTypeSet.length; ++i) idnaTypeSet[i] = new UnicodeSet();
static final int OK = 0, DELETED = 1, ILLEGAL = 2, REMAPPED = 3, IDNA_TYPE_LIMIT = 4;
private int getIDNAType(int cp) {
UTF16.append(inbuffer, cp);
try {
intermediate = IDNA.convertToASCII(inbuffer,
if (intermediate.length() == 0)
return DELETED;
outbuffer = IDNA.convertToUnicode(intermediate,
} catch (StringPrepParseException e) {
return ILLEGAL;
} catch (Exception e) {
System.out.println("Failure at: " + Utility.hex(cp));
return ILLEGAL;
if (!TestData.equals(inbuffer, outbuffer))
return REMAPPED;
return OK;
StringBuffer inbuffer = new StringBuffer();
StringBuffer intermediate, outbuffer;
UnicodeSet lowercase = new UnicodeSet("[:Lowercase:]");
* @param out
* @param htmlOut
* @param textOut TODO
* @param scriptCode
* @param ucd
* @param coreChars
* @param decompChars
* @param scriptCode
private void showCodes(PrintWriter out, int scriptCode) {
private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode) {
if (coreChars[scriptCode] == null) return;
System.out.println(ucd.getScriptID_fromIndex((byte) scriptCode));
String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode);
out.println("<tr><th class='script'>Script: " + script + "</th></tr>");
htmlOut.println("<tr><th class='script'>Script: " + script + "</th></tr>");
textOut.println("#*** Script: " + script + " ***");
UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
UnicodeSet decomp = new UnicodeSet(core).retainAll(decomposable);
UnicodeSet non_id = new UnicodeSet(core).removeAll(xid_continue);
UnicodeSet deleted = extract(idnaTypeSet[DELETED], core);
UnicodeSet illegal = extract(idnaTypeSet[ILLEGAL], core);
UnicodeSet remapped = extract(idnaTypeSet[REMAPPED], core);
UnicodeSet decomp = extract(decomposable, core);
UnicodeSet pattern = extract(patternProp, core);
UnicodeSet non_id = extract(not_xid_continue, core);
UnicodeSet otherCore = new UnicodeSet(core).removeAll(hasUpper);
if (core.size() == 0) {
@ -262,58 +313,81 @@ public class TestData implements UCD_Types {
core = otherCore;
otherCore = temp;
printlnSet(out, "Atomic", core, scriptCode);
if (otherCore.size() != 0) printlnSet(out, "Atomic-no-uppercase", otherCore, scriptCode);
if (non_id.size() != 0) printlnSet(out, "Non-ID", non_id, scriptCode);
if (decomp.size() != 0) printlnSet(out, "Decomposable", decomp, scriptCode);
if (core.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic", core, scriptCode);
if (otherCore.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", otherCore, scriptCode);
if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode);
if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode);
if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "Decomposable", decomp, scriptCode);
if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped", remapped, scriptCode);
if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode);
if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Illegal", illegal, scriptCode);
* @param out
* @param unicodeset
* @param uca
* @param scriptCode
private void printlnSet(PrintWriter out, String title,
UnicodeSet unicodeset, int scriptCode) {
private UnicodeSet extract(UnicodeSet other, UnicodeSet core) {
UnicodeSet decomp = new UnicodeSet(core).retainAll(other);
return decomp;
* @param htmlOut
* @param textOut TODO
* @param script TODO
* @param unicodeset
* @param scriptCode
* @param uca
private void printlnSet(PrintWriter htmlOut, PrintWriter textOut,
String script, String title, UnicodeSet unicodeset, int scriptCode) {
if (unicodeset == null)
int size = unicodeset.size();
String dir = unicodeset.containsSome(bidiR)
&& unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
out.println("<tr><th class='" + title + "'>" + title + " ("
htmlOut.println("<tr><th class='" + title + "'>" + title + " ("
+ nf.format(size) + ")</th></tr>");
out.print("<tr><td class='" + title + "'" + dir + ">");
htmlOut.print("<tr><td class='" + title + "'" + dir + ">");
textOut.println("# " + title);
bf.setValueSource(script + " ; " + title);
UnicodeSetIterator usi = new UnicodeSetIterator();
if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) {
while (usi.nextRange()) {
if (usi.codepoint == usi.codepointEnd) {
} else {
+ ".. "
+ formatCode(UTF16
bf.showSetNames(textOut, unicodeset);
} else {
Set reordered = new TreeSet(uca);
while ( {
boolean foo = reordered.add(usi.getString());
String x = usi.getString();
boolean foo = reordered.add(x);
if (!foo)
throw new IllegalArgumentException("Collision with "
+ Default.ucd().getCodeAndName(usi.getString()));
+ Default.ucd().getCodeAndName(x));
for (Iterator it = reordered.iterator(); it.hasNext();) {
out.print(formatCode((String) it
Object key =;
bf.showSetNames(textOut, reordered);
@ -324,7 +398,7 @@ public class TestData implements UCD_Types {
int cat = ucd.getCategory(UTF16.charAt(string,0));
return "<span title='" + ucd.getCodeAndName(string) + "'>"
+ (cat == Me || cat == Mn ? "\u00A0" : "") //\u25cc
+ BagFormatter.toHTML.transliterate(string)
+ BagFormatter.toHTMLControl.transliterate(string)
+ " </span>";
@ -123,7 +123,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
public int getMaxWidth(boolean isShort) {
return 15;
}.setValues(LONG_YES_NO, YES_NO)
}.setValues(LONG_YES_NO, YES_NO).swapFirst2ValueAliases()
.setMain("NFD_Quick_Check", "NFD_QC", UnicodeProperty.ENUMERATED, version));
add(new UnicodeProperty.SimpleProperty() {
@ -135,7 +135,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
public int getMaxWidth(boolean isShort) {
return 15;
}.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE).swapFirst2ValueAliases()
.setMain("NFC_Quick_Check", "NFC_QC", UnicodeProperty.ENUMERATED, version));
add(new UnicodeProperty.SimpleProperty() {
@ -147,7 +147,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
public int getMaxWidth(boolean isShort) {
return 15;
}.setValues(LONG_YES_NO, YES_NO)
}.setValues(LONG_YES_NO, YES_NO).swapFirst2ValueAliases()
.setMain("NFKD_Quick_Check", "NFKD_QC", UnicodeProperty.ENUMERATED, version));
add(new UnicodeProperty.SimpleProperty() {
@ -159,7 +159,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
public int getMaxWidth(boolean isShort) {
return 15;
}.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE).swapFirst2ValueAliases()
.setMain("NFKC_Quick_Check", "NFKC_QC", UnicodeProperty.ENUMERATED, version));
@ -235,7 +235,12 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
}.setMain("Grapheme_Cluster_Break", "GCB", UnicodeProperty.ENUMERATED, version));
}.setMain("Grapheme_Cluster_Break", "GCB", UnicodeProperty.ENUMERATED, version)
.addValueAliases(new String[][] {
{"Control", "CN"},
{"Extend", "EX"},
{"Other", "XX"},
add(new UnicodeProperty.UnicodeMapProperty() {
@ -268,7 +273,17 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it.
}.setMain("Word_Break", "WB", UnicodeProperty.ENUMERATED, version));
}.setMain("Word_Break", "WB", UnicodeProperty.ENUMERATED, version)
.addValueAliases(new String[][] {
{"Format", "FO"},
{"Katakana", "KA"},
{"ALetter", "LE"},
{"MidLetter", "ML"},
{"MidNum", "MN"},
{"Numeric", "NU"},
{"ExtendNumLet", "EX"},
{"Other", "XX"},
add(new UnicodeProperty.UnicodeMapProperty() {
@ -307,7 +322,20 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it.
}.setMain("Sentence_Break", "SB", UnicodeProperty.ENUMERATED, version));
}.setMain("Sentence_Break", "SB", UnicodeProperty.ENUMERATED, version)
.addValueAliases(new String[][] {
{"Sep", "SE"},
{"Format", "FO"},
{"Sp", "SP"},
{"Lower", "LO"},
{"Upper", "UP"},
{"OLetter", "LE"},
{"Numeric", "NU"},
{"ATerm", "AT"},
{"STerm", "ST"},
{"Close", "CL"},
{"Other", "XX"},
static String[] YES_NO_MAYBE = {"N", "M", "Y"};
@ -14,58 +14,83 @@
.Non-XID { background-color: #FFCCCC }
.Decomposable { background-color: #FFFFCC }
.Pattern_Syntax { background-color: #FFCCFF }
.IDN-Remapped { background-color: #FF6666 }
.IDN-Deleted { background-color: #66FF66 }
.IDN-Illegal { background-color: #6666FF }
th { text-align: left }
<body style="margin: 2em">
<h1>IDN Character Categorization</h1>
<p>$Date: 2005/03/29 18:31:15 $, MED</p>
<p><i>$Date: 2005/03/30 17:19:32 $, MED</i></p>
<p>This page lists all of the valid output IDN characters broken down by category. By "output" IDN
characters, we mean ones that can result from nameprep. Characters are grouped first by script, and
then by subcategory. Within each subcategory characters are sorted according to the default
<a href="">UCA</a> order. Tooltips provide the character code
<a href="">UCA</a> order. Tool-tips provide the character code
and name (in enabled browsers).</p>
<table border="1" cellpadding="2" cellspacing="0" style="border-collapse: collapse" bordercolor="#111111" id="AutoNumber1">
<td class="Atomic">Atomic</td>
<td>Characters that don't fall into any of the following subcategories</td>
<td class="Atomic-no-uppercase">Atomic-no-uppercase</td>
<td>For bicameral scripts, Atomic characters without an uppercase.</td>
<td class="Pattern_Syntax">Pattern_Syntax</td>
<td>Characters recommended as a basis for syntax, as in
<a href="">UAX #31: Identifier and Pattern Syntax</a>.
Excludes the word characters in <i>Section 4 Word Boundaries</i> of
<a href="">UAX# 29</a>, in the
Word_Break property and notes at the end of the section. </td>
<td class="Non-XID">Non-XID</td>
<td>Characters recommended as a basis for identifiers, as in
<a href="">UAX #31: Identifier and Pattern Syntax</a>
(XID_Continue). Excludes the word characters in <i>Section 4 Word Boundaries</i> of
<a href="">UAX# 29</a>, in the
Word_Break property and notes at the end of the section.</td>
<td class="Decomposable">Decomposable</td>
<td>Characters with NFC decompositions.</td>
<table border="1" cellpadding="2" cellspacing="0" style="border-collapse: collapse">
<caption><b><font size="4">Key</font></b></caption>
<td class="Atomic">Atomic</td>
<td>Characters that don't fall into any of the following subcategories</td>
<td class="Atomic-no-uppercase">Atomic-no-uppercase</td>
<td>For bicameral scripts, Atomic characters without an uppercase.</td>
<td class="Pattern_Syntax">Pattern_Syntax</td>
<td>Characters recommended as a basis for use in pattern syntax.<p>Excludes the word
characters in <i>Section 4 Word Boundaries</i> of
<a href="">UAX #29</a>, in the
Word_Break property and notes at the end of the section.</p>
<p>See <a href="">UAX #31: Identifier and
Pattern Syntax</a>.</p></td>
<td class="Non-XID">Non-XID</td>
<td>Characters not recommended as a basis for identifiers, excluding Pattern_Syntax and the
word characters in <i>Section 4 Word Boundaries</i> of
<a href="">UAX #29</a>, in the
Word_Break property and notes at the end of the section.<p>See
<a href="">UAX #31: Identifier and Pattern
Syntax</a> (XID_Continue).</p></td>
<td class="Decomposable">Decomposable</td>
<td>Characters with canonical decompositions (that is, characters that are not already in NFD).</td>
<td class="IDN-Remapped">IDN-Remapped</td>
<td>Characters remapped by IDN.</td>
<td class="IDN-Deleted">IDN-Deleted</td>
<td>Characters deleted by IDN.</td>
<td class="IDN-Illegal">IDN-Illegal </td>
<td>Characters illegal in IDN (note: most of these are due to IDN's using an old version of Unicode).</td>
<p>The information in the categorization is also available in a plain-text file, at
<a href="idn-chars.txt">idn-chars.txt</a>. It can be viewed as is, or loaded into a spreadsheet for
sorting and filtering to view the data in different ways. The format is:</p>
<p>code ; script ; subcategory # general-category (character) character-name</p>
<pre>0061 ; LATIN ; Atomic # ; L&amp; (a) LATIN SMALL LETTER A
2015 ; COMMON ; Pattern_Syntax # Pd (―) HORIZONTAL BAR
058A ; ARMENIAN ; Atomic-no-uppercase # ; Pd (֊) ARMENIAN HYPHEN
20AC ; COMMON ; Non-XID # ; Sc (€) EURO SIGN</pre>
@ -17,43 +17,53 @@ public class UnicodeDataFile {
private String mostRecent;
private String filename;
private UnicodeDataFile(){};
private String fileType = ".txt";
public static UnicodeDataFile openAndWriteHeader(String directory, String filename) throws IOException {
UnicodeDataFile result = new UnicodeDataFile();
result.newFile = directory + filename + UnicodeDataFile.getFileSuffix(true);
result.out = Utility.openPrintWriter(result.newFile, Utility.UTF8_UNIX);
String[] batName = {""};
result.mostRecent = UnicodeDataFile.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName);
result.batName = batName[0];
result.filename = filename;
return new UnicodeDataFile(directory, filename, false);
public static UnicodeDataFile openHTMLAndWriteHeader(String directory, String filename) throws IOException {
return new UnicodeDataFile(directory, filename, true);
private UnicodeDataFile (String directory, String filename, boolean isHTML) throws IOException {
fileType = isHTML ? ".html" : ".txt";
String newSuffix = UnicodeDataFile.getFileSuffix(true, fileType);
newFile = directory + filename + newSuffix;
out = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX);
String[] batName2 = {""};
mostRecent = UnicodeDataFile.generateBat(directory, filename, newSuffix, fileType, batName2);
batName = batName2[0];
filename = filename;
result.out.println("# " + filename + UnicodeDataFile.getFileSuffix(false));
result.out.println("# Unicode Character Database");
result.out.println("# Copyright (c) 1991-" + Default.getYear() + " Unicode, Inc.");
"# For terms of use, see");
result.out.println("# For documentation, see UCD.html");
if (!isHTML) {
out.println("# " + filename + UnicodeDataFile.getFileSuffix(false));
out.println("# Unicode Character Database");
out.println("# Copyright (c) 1991-" + Default.getYear() + " Unicode, Inc.");
"# For terms of use, see");
out.println("# For documentation, see UCD.html");
try {
Utility.appendFile(filename + "Header.txt", Utility.LATIN1, result.out);
Utility.appendFile(filename + "Header" + fileType, Utility.UTF8_UNIX, out);
} catch (FileNotFoundException e) {
result.out.println("# Unicode Character Database: Derived Property Data");
result.out.println("# Generated algorithmically from the Unicode Character Database");
result.out.println("# For documentation, see UCD.html");
result.out.println("# Note: Unassigned and Noncharacter codepoints may be omitted");
result.out.println("# if they have default property values.");
result.out.println("# ================================================");
out.println("# Unicode Character Database: Derived Property Data");
out.println("# Generated algorithmically from the Unicode Character Database");
out.println("# For documentation, see UCD.html");
out.println("# Note: Unassigned and Noncharacter codepoints may be omitted");
out.println("# if they have default property values.");
out.println("# ================================================");
return result;
public void close() throws IOException {
try {
Utility.appendFile(filename + "Footer.txt", Utility.LATIN1, out);
Utility.appendFile(filename + "Footer" + fileType, Utility.UTF8_UNIX, out);
} catch (FileNotFoundException e) {}
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName);
@ -64,21 +74,20 @@ public class UnicodeDataFile {
public static String getHTMLFileSuffix(boolean withDVersion) {
return "-"
+ Default.ucd().getVersion()
+ ((withDVersion && MakeUnicodeFiles.dVersion >= 0)
? ("d" + MakeUnicodeFiles.dVersion)
: "")
+ ".html";
return getFileSuffix(withDVersion, ".html");
public static String getFileSuffix(boolean withDVersion) {
return getFileSuffix(withDVersion, ".txt");
public static String getFileSuffix(boolean withDVersion, String suffix) {
return "-"
+ Default.ucd().getVersion()
+ ((withDVersion && MakeUnicodeFiles.dVersion >= 0)
? ("d" + MakeUnicodeFiles.dVersion)
: "")
+ ".txt";
+ suffix;
//Remove "d1" from DerivedJoiningGroup-3.1.0d1.txt type names
@ -126,8 +135,8 @@ public class UnicodeDataFile {
// static final byte KEEP_SPECIAL = 0, SKIP_SPECIAL = 1;
public static String generateBat(String directory, String fileRoot, String suffix, String[] outputBatName) throws IOException {
String mostRecent = Utility.getMostRecentUnicodeDataFile(UnicodeDataFile.fixFile(fileRoot), Default.ucd().getVersion(), true, true);
public static String generateBat(String directory, String fileRoot, String suffix, String fileType, String[] outputBatName) throws IOException {
String mostRecent = Utility.getMostRecentUnicodeDataFile(UnicodeDataFile.fixFile(fileRoot), Default.ucd().getVersion(), true, true, fileType);
if (mostRecent != null) {
outputBatName[0] = UnicodeDataFile.generateBatAux(directory + "DIFF/Diff_" + fileRoot + suffix,
mostRecent, directory + fileRoot + suffix);
@ -5,8 +5,8 @@
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/,v $
* $Date: 2005/03/04 02:50:26 $
* $Revision: 1.47 $
* $Date: 2005/03/30 17:19:32 $
* $Revision: 1.48 $
@ -1021,7 +1021,12 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
public static String getMostRecentUnicodeDataFile(String filename, String version,
boolean acceptLatest, boolean show) throws IOException {
boolean acceptLatest, boolean show) throws IOException {
return getMostRecentUnicodeDataFile(filename, version, acceptLatest, show, ".txt");
public static String getMostRecentUnicodeDataFile(String filename, String version,
boolean acceptLatest, boolean show, String fileType) throws IOException {
// get all the files in the directory
int compValue = acceptLatest ? 0 : 1;
@ -1030,7 +1035,7 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
String directoryName = UCD_Types.UCD_DIR + File.separator + searchPath[i] + "-Update" + File.separator;
if (show) System.out.println("Trying: '" + directoryName + "', '" + filename + "'");
String result = searchDirectory(new File(directoryName), filename, show);
String result = searchDirectory(new File(directoryName), filename, show, fileType);
if (result != null) return result;
@ -1048,16 +1053,20 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
public static String searchDirectory(File directory, String filename, boolean show) throws IOException {
return searchDirectory(directory, filename, show, ".txt");
public static String searchDirectory(File directory, String filename, boolean show, String fileType) throws IOException {
Iterator it = getDirectoryContentsLastFirst(directory).iterator();
while (it.hasNext()) {
String fn = (String);
File foo = new File(directory + File.separator + fn);
// System.out.println("\tChecking: '" + foo.getCanonicalPath() + "'");
if (foo.isDirectory()) {
String attempt = searchDirectory(foo, filename, show);
String attempt = searchDirectory(foo, filename, show, fileType);
if (attempt != null) return attempt;
if (fn.endsWith(".txt") && fn.startsWith(filename)) {
if (fn.endsWith(fileType) && fn.startsWith(filename)) {
if (show) System.out.println("\tFound: '" + fn + "'");
return foo.getCanonicalPath();
Reference in New Issue
Block a user