ICU-0 U4.1

X-SVN-Rev: 17421
This commit is contained in:
Mark Davis 2005-03-30 17:19:32 +00:00
parent 98a1c52e09
commit 31eafca234
8 changed files with 441 additions and 257 deletions

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $
* $Date: 2004/04/17 18:21:39 $
* $Revision: 1.12 $
* $Date: 2005/03/30 17:19:32 $
* $Revision: 1.13 $
*
*******************************************************************************
*/
@ -17,6 +17,7 @@ import java.util.*;
import java.io.*;
import com.ibm.text.utility.*;
import com.ibm.icu.dev.test.util.UnicodeProperty;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
@ -30,6 +31,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
OldUnicodeMap sampleMap = null;
OldUnicodeMap map = new OldUnicodeMap();
UnicodeProperty prop;
// ====================== Main ===========================
@ -46,6 +48,34 @@ abstract public class GenerateBreakTest implements UCD_Types {
this.ucd = ucd;
nfd = new Normalizer(Normalizer.NFD, ucd.getVersion());
nfkd = new Normalizer(Normalizer.NFKD, ucd.getVersion());
/*
public void fillMap(String propName) {
List list = y.getAvailableValues();
for (Iterator it = list.iterator(); it.hasNext();) {
String label = (String) it.next();
map.add(label, y.getSet(label));
}
}
*/
}
ToolUnicodePropertySource unicodePropertySource = ToolUnicodePropertySource.make("");
Set labels = new HashSet();
/**
 * Registers one value of the current break property in the type map.
 *
 * The label is recorded in {@code labels} first, so that a later call to
 * {@code addToMapLast} can compare everything that was registered against
 * the property's available values.
 *
 * @param label property value name to look up in {@code prop}
 * @return the int returned by {@code map.add} for this label
 * @throws IllegalArgumentException if {@code prop} has no set for the label,
 *         or the set is empty
 */
int addToMap(String label) {
    labels.add(label);
    final UnicodeSet members = prop.getSet(label);
    final boolean usable = members != null && members.size() != 0;
    if (!usable) {
        throw new IllegalArgumentException("Bad value: " + prop.getName() + ", " + label);
    }
    return map.add(label, members);
}
/**
 * Registers the final value of the current break property and then checks
 * that the labels registered so far match the property's available values.
 *
 * @param label property value name to register last
 * @return the int returned by {@code addToMap} for this label
 * @throws IllegalArgumentException if the label itself is rejected by
 *         {@code addToMap}, or if the registered labels differ from
 *         {@code prop.getAvailableValues()}; in the latter case the message
 *         lists the available values that were never registered
 */
int addToMapLast(String label) {
    int result = addToMap(label);
    Set values = new HashSet(prop.getAvailableValues());
    if (!values.equals(labels)) {
        // Set.removeAll returns a boolean, not the remaining elements, so
        // compute the difference explicitly before building the message.
        Set missing = new HashSet(values);
        missing.removeAll(labels);
        throw new IllegalArgumentException("Missing Property Values: " + prop.getName()
                + ": " + missing);
    }
    return result;
}
// COMMON STUFF for Hangul
@ -280,24 +310,30 @@ abstract public class GenerateBreakTest implements UCD_Types {
//printLine(out, samples[LB_ZW], "", samples[LB_CL]);
//printLine(out, samples[LB_ZW], " ", samples[LB_CL]);
PrintWriter out = Utility.openPrintWriter("TR29\\"
UnicodeDataFile fc = UnicodeDataFile.openHTMLAndWriteHeader("auxiliary\\", fileName + "BreakTest");
PrintWriter out = fc.out;
/* PrintWriter out = Utility.openPrintWriter("auxiliary\\"
+ fileName + "BreakTest-"
+ ucd.getVersion()
+ ".html", Utility.UTF8_WINDOWS);
*/
out.println("<!doctype HTML PUBLIC '-//W3C//DTD HTML 4.0 Transitional//EN' 'http://www.w3.org/TR/REC-html40/loose.dtd'>");
out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
out.println("<title>" + fileName + " Break Chart</title>");
out.println("<style>");
out.println("<style type='text/css'>");
out.println("td, th { vertical-align: top }");
out.println("</style></head>");
out.println("<body bgcolor='#FFFFFF'>");
out.println("<h2>" + fileName + " Break Chart</h2>");
out.println("<p><b>Unicode Version:</b> " + ucd.getVersion() + "; <b>Date:</b> " + ucd.getDate() + "</p>");
out.println("<p><b>Unicode Version:</b> " + ucd.getVersion() + "</p>");
out.println("<p><b>Date:</b> " + Default.getDate() + "</p>");
generateTable(out);
if (sampleMap != null) {
if (false) {
out.println("<h3>Character Type Breakdown</h3>");
out.println("<table border='1' cellspacing='0' width='100%'>");
for (int i = 0; i < sampleMap.size(); ++i) {
@ -308,7 +344,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
out.println("</table>");
}
out.close();
fc.close();
generateTest(false);
@ -318,14 +354,18 @@ abstract public class GenerateBreakTest implements UCD_Types {
String[] testCase = new String[50];
// do main test
PrintWriter out = Utility.openPrintWriter("TR29\\" + fileName + "BreakTest"
UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader("auxiliary\\", fileName + "BreakTest"
+ (shortVersion ? "_SHORT" : ""));
PrintWriter out = fc.out;
/* PrintWriter out = Utility.openPrintWriter("TR29\\" + fileName + "BreakTest"
+ (shortVersion ? "_SHORT" : "")
+ "-" + ucd.getVersion()
+ ".txt", Utility.UTF8_WINDOWS);
*/
int counter = 0;
out.println("#");
out.println("# Default " + fileName + " Break Test");
out.println("# Generated: " + ucd.getDate() + ", MED");
out.println("#");
out.println("# Format:");
out.println("# <string> (# <comment>)? ");
@ -361,7 +401,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
printLine(out, extraSingleSamples[ii], true, false);
}
out.println("# Lines: " + counter);
out.close();
fc.close();
}
public void sampleDescription(PrintWriter out) {}
@ -461,7 +501,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
if (after == null) continue;
String h = getTypeID(after);
types += "<th " + width + " title='" + getInfo(after) + "'><a class='lbclass' href='#" + h + "'>" + h + "</th>";
types += "<th " + width + " class='lbclass' title='" + getInfo(after) + "'>" + h + "</th>";
//codes += "<th " + width + " title='" + getInfo(after) + "'>" + Utility.hex(after) + "</th>";
@ -480,8 +520,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
if (before == null) continue;
String h = getTypeID(before);
String line = "<tr><th title='" + ucd.getCodeAndName(before) + "'><a class='lbclass' href='#" + h + "'>"
+ h + "</th>";
String line = "<tr><th class='lbclass' title='" + ucd.getCodeAndName(before) + "'>" + h + "</th>";
for (int type2 = 0; type2 < tableLimit; ++type2) {
@ -555,7 +594,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
String status;
if (html) {
status = hasBreak ? " style='border-right: 1px solid blue'" : "";
string.append("<span title='" + getRule() + "'><span" + status + ">&nbsp;</span>&nbsp;<span>");
string.append("<span title='" + getRule() + "'><span" + status + ">&nbsp;</span>&nbsp;</span>");
} else {
status = hasBreak ? BREAK : NOBREAK;
string.append(status);
@ -574,7 +613,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
+ "'>"
+ Utility.quoteXML(Utility.getDisplay(cp), true)
+ "</span>");
string.append("<span title='" + getRule() + "'><span" + status + ">&nbsp;</span>&nbsp;<span>");
string.append("<span title='" + getRule() + "'><span" + status + ">&nbsp;</span>&nbsp;</span>");
} else {
if (string.length() > 0) {
string.append(' ');
@ -743,28 +782,23 @@ abstract public class GenerateBreakTest implements UCD_Types {
GenerateGraphemeBreakTest(UCD ucd) {
super(ucd);
fileName = "GraphemeCluster";
fileName = "Grapheme";
sampleMap = map;
}
Object foo = prop = unicodePropertySource.getProperty("Grapheme_Cluster_Break");
final int
CR = map.add("CR", new UnicodeSet(0xD, 0xD)),
LF = map.add("LF", new UnicodeSet(0xA, 0xA)),
Control = map.add("Control",
getSet(ucd, CATEGORY, Cc)
.addAll(getSet(ucd, CATEGORY, Cf))
.addAll(getSet(ucd, CATEGORY, Zp))
.addAll(getSet(ucd, CATEGORY, Zl))
.removeAll(map.getSetFromIndex(CR))
.removeAll(map.getSetFromIndex(LF))),
Extend = map.add("Extend", getSet(ucd, DERIVED, GraphemeExtend)),
L = map.add("L", getSet(ucd, HANGUL_SYLLABLE_TYPE, UCD_Types.L)),
V = map.add("V", getSet(ucd, HANGUL_SYLLABLE_TYPE, UCD_Types.V)),
T = map.add("T", getSet(ucd, HANGUL_SYLLABLE_TYPE, UCD_Types.T)),
LV = map.add("LV", getSet(ucd, HANGUL_SYLLABLE_TYPE, UCD_Types.LV)),
LVT = map.add("LVT", getSet(ucd, HANGUL_SYLLABLE_TYPE, UCD_Types.LVT)),
Other = map.add("Other", new UnicodeSet(0,0x10FFFF), false, false);
CR = addToMap("CR"),
LF = addToMap("LF"),
Control = addToMap("Control"),
Extend = addToMap("Extend"),
L = addToMap("L"),
V = addToMap("V"),
T = addToMap("T"),
LV = addToMap("LV"),
LVT = addToMap("LVT"),
Other = addToMapLast("Other");
// stuff that subclasses need to override
public String getTypeID(int cp) {
@ -860,35 +894,23 @@ abstract public class GenerateBreakTest implements UCD_Types {
}
Object foo = prop = unicodePropertySource.getProperty("Word_Break");
//static String LENGTH = "[\u30FC\uFF70]";
//static String HALFWIDTH_KATAKANA = "[\uFF66-\uFF9F]";
//static String KATAKANA_ITERATION = "[\u30FD\u30FE]";
//static String HIRAGANA_ITERATION = "[\u309D\u309E]";
final int
Format = map.add("Format", getSet(ucd, CATEGORY, Cf).remove(0x00AD)),
Katakana = map.add("Katakana", getSet(ucd, SCRIPT, KATAKANA_SCRIPT)
.addAll(new UnicodeSet("[\u30FC\uFF70\uFF9E\uFF9F]"))
//.addAll(new UnicodeSet(HALFWIDTH_KATAKANA))
//.addAll(new UnicodeSet(KATAKANA_ITERATION))
),
ALetter = map.add("ALetter",
getSet(ucd, DERIVED, PropAlphabetic)
.add(0x05F3, 0x05F3)
.removeAll(map.getSetFromIndex(Katakana))
.removeAll(getSet(ucd, BINARY_PROPERTIES, Ideographic))
.removeAll(getSet(ucd, SCRIPT, THAI_SCRIPT))
.removeAll(getSet(ucd, SCRIPT, LAO_SCRIPT))
.removeAll(getSet(ucd, SCRIPT, HIRAGANA_SCRIPT))
),
MidLetter = map.add("MidLetter",
new UnicodeSet("[\\u0027\\u00AD\\u00B7\\u05f4\\u05F4\\u2019\\u2027]")),
MidNumLet = map.add("MidNumLet",
new UnicodeSet("[\\u002E\\u003A]")),
MidNum = map.add("MidNum", getSet(ucd, LINE_BREAK, LB_IN)
.removeAll(map.getSetFromIndex(MidNumLet))),
Numeric = map.add("Numeric", getSet(ucd, LINE_BREAK, LB_NU)),
Other = map.add("Other", new UnicodeSet(0,0x10FFFF), false, false);
Format = addToMap("Format"),
Katakana = addToMap("Katakana"),
ALetter = addToMap("ALetter"),
MidLetter = addToMap("MidLetter"),
//MidNumLet = addToMap("MidNumLet"),
MidNum = addToMap("MidNum"),
Numeric = addToMap("Numeric"),
ExtendNumLet = addToMap("ExtendNumLet"),
Other = addToMapLast("Other");
// stuff that subclasses need to override
public String getTypeID(int cp) {
@ -948,11 +970,11 @@ abstract public class GenerateBreakTest implements UCD_Types {
// Dont break letters across certain punctuation
setRule("6: ALetter × (MidLetter | MidNumLet) ALetter");
if (before == ALetter && (after == MidLetter || after == MidNumLet) && after2 == ALetter) return false;
setRule("6: ALetter × MidLetter ALetter");
if (before == ALetter && after == MidLetter && after2 == ALetter) return false;
setRule("7: ALetter (MidLetter | MidNumLet) × ALetter");
if (before2 == ALetter && (before == MidLetter || before == MidNumLet) && after == ALetter) return false;
if (before2 == ALetter && before == MidLetter && after == ALetter) return false;
// Dont break within sequences of digits, or digits adjacent to letters.
@ -968,15 +990,22 @@ abstract public class GenerateBreakTest implements UCD_Types {
// Dont break within sequences like: '-3.2'
setRule("11: Numeric (MidNum | MidNumLet) × Numeric");
if (before2 == Numeric && (before == MidNum || before == MidNumLet) && after == Numeric) return false;
if (before2 == Numeric && before == MidNum && after == Numeric) return false;
setRule("12: Numeric × (MidNum | MidNumLet) Numeric");
if (before == Numeric && (after == MidNum || after == MidNumLet) && after2 == Numeric) return false;
if (before == Numeric && after == MidNum && after2 == Numeric) return false;
// Don't break between Katakana
setRule("13: Katakana × Katakana");
if (before == Katakana && after == Katakana) return false;
// Do not break from extenders
setRule("13a: (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet");
if ((before == ALetter || before == Numeric || before == Katakana || before == ExtendNumLet) && after == ExtendNumLet) return false;
setRule("13b: ExtendNumLet × (ALetter | Numeric | Katakana)");
if (before == ExtendNumLet && (after == ALetter || after == Numeric || after == Katakana)) return false;
// Otherwise break always.
setRule("14: Any ÷ Any");
@ -1344,7 +1373,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
setRule("9: OP SP* ×");
if (lastNonSpace == LB_OP) return false;
// LB 10 Dont break within [, , even with intervening spaces.
// LB 10 Don't break within '"[', even with intervening spaces.
// QU SP* × OP
setRule("10: QU SP* × OP");
if (lastNonSpace == LB_QU && after == LB_OP) return false;
@ -1377,7 +1406,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
setRule("13: × GL ; GL ×");
if (after == LB_GL || before == LB_GL) return false;
// LB 14 Dont break before or after
// LB 14 Don't break before or after '"'
setRule("14: × QU ; QU ×");
if (before == LB_QU || after == LB_QU) return false;
@ -1450,7 +1479,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
if (before == LB_HY) return true;
if (after == LB_BB) return true;
// LB 19 Dont break between alphabetics (at)
// LB 19 Don't break between alphabetics ("at")
// AL × AL
setRule("19: AL × AL");
@ -1515,36 +1544,20 @@ abstract public class GenerateBreakTest implements UCD_Types {
}
Object foo = prop = unicodePropertySource.getProperty("Sentence_Break");
final int
Sep = map.add("Sep", new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]")),
Format = map.add("Format", getSet(ucd, CATEGORY, Cf)),
Sp = map.add("Sp", getSet(ucd, BINARY_PROPERTIES, White_space)
.removeAll(map.getSetFromIndex(Sep))),
Lower = map.add("Lower", getSet(ucd, DERIVED, PropLowercase)),
Upper = map.add("Upper", getSet(ucd, CATEGORY, Lt)
.addAll(getSet(ucd, DERIVED, PropUppercase))),
OLetter = map.add("OLetter",
getSet(ucd, DERIVED, PropAlphabetic)
.add(0x05F3, 0x05F3)
.removeAll(map.getSetFromIndex(Lower))
.removeAll(map.getSetFromIndex(Upper))
),
Numeric = map.add("Numeric", getSet(ucd, LINE_BREAK, LB_NU)),
ATerm = map.add("ATerm", new UnicodeSet(0x002E,0x002E)),
Term = map.add("Term", new UnicodeSet(
"[\\u0021\\u003F\\u0589\\u061F\\u06D4\\u0700\\u0701\\u0702\\u0964\\u1362\\u1367"
+ "\\u1368\\u104A\\u104B\\u166E\\u1803\\u1809\\u203C\\u203D\\u2047\\u2048\\u2049"
+ "\\u3002\\uFE52\\uFE57\\uFF01\\uFF0E\\uFF1F\\uFF61]")),
Close = map.add("Close",
getSet(ucd, CATEGORY, Po)
.addAll(getSet(ucd, CATEGORY, Pe))
.addAll(getSet(ucd, LINE_BREAK, LB_QU))
.removeAll(map.getSetFromIndex(ATerm))
.removeAll(map.getSetFromIndex(Term))
.remove(0x05F3)
),
Other = map.add("Other", new UnicodeSet(0,0x10FFFF), false, false);
Sep = addToMap("Sep"),
Format = addToMap("Format"),
Sp = addToMap("Sp"),
Lower = addToMap("Lower"),
Upper = addToMap("Upper"),
OLetter = addToMap("OLetter"),
Numeric = addToMap("Numeric"),
ATerm = addToMap("ATerm"),
STerm = addToMap("STerm"),
Close = addToMap("Close"),
Other = addToMapLast("Other");
// stuff that subclasses need to override
public String getTypeID(int cp) {
@ -1726,8 +1739,8 @@ abstract public class GenerateBreakTest implements UCD_Types {
}
if (t == ATerm) {
lookAfter = ATerm;
} else if (t == Term) {
lookAfter = Term;
} else if (t == STerm) {
lookAfter = STerm;
}
break;
}
@ -1776,7 +1789,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
setRule("9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )");
return false;
}
if (lookAfter == Term) break;
if (lookAfter == STerm) break;
}
// at this point, we have an ATerm. All other conditions are ok, but we need to verify 6

View File

@ -34,6 +34,10 @@ import com.ibm.icu.text.SymbolTable;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeMatcher;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.text.UCD.GenerateBreakTest.GenerateGraphemeBreakTest;
import com.ibm.text.UCD.GenerateBreakTest.GenerateLineBreakTest;
import com.ibm.text.UCD.GenerateBreakTest.GenerateSentenceBreakTest;
import com.ibm.text.UCD.GenerateBreakTest.GenerateWordBreakTest;
import com.ibm.text.UCD.MakeUnicodeFiles.Format.PrintStyle;
import com.ibm.text.utility.UnicodeDataFile;
import com.ibm.text.utility.Utility;
@ -511,6 +515,14 @@ public class MakeUnicodeFiles {
GenerateCaseFolding.generateSpecialCasing(false);
} else if (filename.equals("StandardizedVariants")) {
GenerateStandardizedVariants.generate();
} else if (filename.equals("GraphemeBreakTest")) {
new GenerateGraphemeBreakTest(Default.ucd()).run();
} else if (filename.equals("WordBreakTest")) {
new GenerateWordBreakTest(Default.ucd()).run();
} else if (filename.equals("LineBreakTest")) {
new GenerateLineBreakTest(Default.ucd()).run();
} else if (filename.equals("SentenceBreakTest")) {
new GenerateSentenceBreakTest(Default.ucd()).run();
} else {
generatePropertyFile(filename);
}

View File

@ -1,5 +1,5 @@
Generate:
DeltaVersion: 13
Generate: DerivedBidiClass
DeltaVersion: 14
CopyrightYear: 2005
File: auxiliary/GraphemeBreakProperty
@ -14,6 +14,18 @@ File: auxiliary/SentenceBreakProperty
Property: Sentence_Break
Format: skipValue=Other
File: auxiliary/GraphemeBreakTest
Property: SPECIAL
File: auxiliary/WordBreakTest
Property: SPECIAL
File: auxiliary/LineBreakTest
Property: SPECIAL
File: auxiliary/SentenceBreakTest
Property: SPECIAL
File: Blocks
Property: Block
# Note: When comparing block names, casing, whitespace, hyphens,
@ -58,12 +70,14 @@ Value: 4.1
File: extracted/DerivedBidiClass
Property: Bidi_Class
# Bidi Class (listing UnicodeData.txt, field 4: see UCD.html)
# Unlike other properties, unassigned code points in blocks reserved for right-to-left scripts are given either types R or AL.
# Unlike other properties, unassigned code points in blocks
# reserved for right-to-left scripts are given either types R or AL.
# The unassigned characters that default to R are:
# Hebrew, Cypriot_Syllabary, Kharoshthi, and the ranges \u07C0-\u08FF \uFB1D-\uFB4F \U00010840-\U00010FFF
# Hebrew, Cypriot_Syllabary, Kharoshthi, and the ranges \u07C0-\u08FF
# \uFB1D-\uFB4F \U00010840-\U000109FF \U00010A60-\U00010FFF
# The unassigned characters that default to AL are:
# Arabic, Syriac, Thaana, Arabic_Presentation_Forms_A, Arabic_Presentation_Forms_B, Arabic_Supplement,
# and the range \u0750-\u077F, minus the Noncharacter_Code_Points
# Arabic, Syriac, Arabic_Supplement, Thaana, Arabic_Presentation_Forms_A,
# Arabic_Presentation_Forms_B, minus the Noncharacter_Code_Points
# For all other cases:
Format: valueStyle=short skipUnassigned=Left_To_Right

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
* $Date: 2005/03/26 05:40:05 $
* $Revision: 1.19 $
* $Date: 2005/03/30 17:19:32 $
* $Revision: 1.20 $
*
*******************************************************************************
*/
@ -148,54 +148,58 @@ public class TestData implements UCD_Types {
log.close();
}
}
Matcher m;
static class GenStringPrep {
UnicodeSet[] coreChars = new UnicodeSet[100];
UnicodeSet decomposable = new UnicodeSet();
UnicodeSet pattern = new UnicodeSet();
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
//UnicodeSet id_continue = ups.getSet("ID_Continue=true");
UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
UnicodeSet wordChars = ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher())
.retainAll(ups.getSet("gc=Sk"))
.addAll(new UnicodeSet("[\u0027 \u002D \u002E \u003A \u00B7 \u058A \u05F3" +
" \u05F4 \u200C \u200D \u2010 \u2019 \u2027 \u30A0]"));
UnicodeSet patternProp = ups.getSet("Pattern_Syntax=true").removeAll(wordChars);
UnicodeSet not_xid_continue = ups.getSet("XID_Continue=true").complement().removeAll(wordChars);
//UnicodeSet[] decompChars = new UnicodeSet[100];
UCD ucd = Default.ucd();
Collator uca = Collator.getInstance(ULocale.ENGLISH);
Collator uca0 = Collator.getInstance(ULocale.ENGLISH);
{
uca.setStrength(Collator.IDENTICAL);
uca0.setStrength(Collator.IDENTICAL);
}
GenerateHanTransliterator.MultiComparator uca
= new GenerateHanTransliterator.MultiComparator(new Comparator[] {
uca0, new UTF16.StringComparator()});
UnicodeSet bidiR = new UnicodeSet(
"[[:Bidi_Class=AL:][:Bidi_Class=R:]]");
UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]");
UnicodeSet hasUpper = new UnicodeSet();
BagFormatter bf = new BagFormatter();
UnicodeSet inIDN = new UnicodeSet();
void genStringPrep() throws IOException {
//BagFormatter bf = new BagFormatter();
//System.out.println(bf.showSetDifferences("ID_Continue", id_continue, "XID_Continue", xid_continue));
StringBuffer inbuffer = new StringBuffer();
StringBuffer intermediate, outbuffer;
bf.setShowLiteral(BagFormatter.toHTMLControl);
//bf.setValueSource(UnicodeLabel.NULL);
if (false) {
System.out.println("word chars: " + bf.showSetNames(wordChars));
System.out.println("pat: " + bf.showSetNames(patternProp));
System.out.println("xid: " + bf.showSetNames(not_xid_continue));
}
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
Utility.dot(cp);
int cat = Default.ucd().getCategory(cp);
if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
if (!Default.nfd().isNormalized(cp)) decomposable.add(cp);
inbuffer.setLength(0);
UTF16.append(inbuffer, cp);
try {
intermediate = IDNA.convertToASCII(inbuffer,
IDNA.USE_STD3_RULES);
if (intermediate.length() == 0)
continue;
outbuffer = IDNA.convertToUnicode(intermediate,
IDNA.USE_STD3_RULES);
} catch (StringPrepParseException e) {
continue;
} catch (Exception e) {
System.out.println("Failure at: " + Utility.hex(cp));
continue;
}
if (!TestData.equals(inbuffer, outbuffer))
continue;
int idnaType = getIDNAType(cp);
idnaTypeSet[idnaType].add(cp);
int script = ucd.getScript(cp);
if (coreChars[script] == null)
coreChars[script] = new UnicodeSet();
@ -208,8 +212,12 @@ public class TestData implements UCD_Types {
}
Utility.fixDot();
PrintWriter out = BagFormatter.openUTF8Writer(GEN_DIR,
"idn-chars.html");
PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html");
PrintWriter textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.txt");
textOut.println('\uFEFF');
textOut.println("For documentation, see idn-chars.html");
Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut);
/*
out
.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
out.println("<title>IDN Characters</title><style>");
@ -217,44 +225,87 @@ public class TestData implements UCD_Types {
out.println(".script { font-size: 150%; background-color: #CCCCCC }");
out.println(".Atomic { background-color: #CCCCFF }");
out.println(".Atomic-no-uppercase { background-color: #CCFFCC }");
out.println(".Non-ID { background-color: #FFCCCC }");
out.println(".Non-XID { background-color: #FFCCCC }");
out.println(".Decomposable { background-color: #FFFFCC }");
out.println(".Pattern_Syntax { background-color: #FFCCFF }");
out.println("th { text-align: left }");
out.println("-->");
out.println("</style></head><body><table>");
*/
htmlOut.println("<table border='1' cellpadding='2' cellspacing='0' style='border-collapse: collapse'>");
for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) {
if (scriptCode == COMMON_SCRIPT
|| scriptCode == INHERITED_SCRIPT)
continue;
showCodes(out, scriptCode);
showCodes(htmlOut, textOut, scriptCode);
}
showCodes(out, COMMON_SCRIPT);
showCodes(out, INHERITED_SCRIPT);
out.println("</table></body></html>");
out.close();
showCodes(htmlOut, textOut, COMMON_SCRIPT);
showCodes(htmlOut, textOut, INHERITED_SCRIPT);
htmlOut.println("</table></body></html>");
htmlOut.close();
}
UnicodeSet idnaTypeSet[] = new UnicodeSet[IDNA_TYPE_LIMIT];
{
for (int i = 0; i < idnaTypeSet.length; ++i) idnaTypeSet[i] = new UnicodeSet();
}
static final int OK = 0, DELETED = 1, ILLEGAL = 2, REMAPPED = 3, IDNA_TYPE_LIMIT = 4;
/**
 * Classifies a single code point by converting it to ASCII with IDNA and
 * back to Unicode, then comparing the result with the input.
 *
 * Uses the shared {@code inbuffer}, {@code intermediate} and
 * {@code outbuffer} fields as scratch space, overwriting their contents.
 *
 * @param cp the code point to classify
 * @return {@code DELETED} if the ASCII form is empty; {@code ILLEGAL} if
 *         either conversion throws; {@code REMAPPED} if the round-tripped
 *         text differs from the input; otherwise {@code OK}
 */
private int getIDNAType(int cp) {
    // Load the scratch buffer with just this code point.
    inbuffer.setLength(0);
    UTF16.append(inbuffer, cp);
    try {
        intermediate = IDNA.convertToASCII(inbuffer, IDNA.DEFAULT); // USE_STD3_RULES
        if (intermediate.length() == 0) {
            return DELETED;
        }
        outbuffer = IDNA.convertToUnicode(intermediate, IDNA.USE_STD3_RULES);
    } catch (StringPrepParseException e) {
        return ILLEGAL;
    } catch (Exception e) {
        // Anything other than a StringPrep rejection is reported, then
        // treated the same way as an illegal code point.
        System.out.println("Failure at: " + Utility.hex(cp));
        return ILLEGAL;
    }
    return TestData.equals(inbuffer, outbuffer) ? OK : REMAPPED;
}
StringBuffer inbuffer = new StringBuffer();
StringBuffer intermediate, outbuffer;
UnicodeSet lowercase = new UnicodeSet("[:Lowercase:]");
/**
* @param out
* @param htmlOut
* @param textOut TODO
* @param scriptCode
* @param ucd
* @param coreChars
* @param decompChars
* @param scriptCode
*/
private void showCodes(PrintWriter out, int scriptCode) {
private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode) {
if (coreChars[scriptCode] == null) return;
System.out.println(ucd.getScriptID_fromIndex((byte) scriptCode));
String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode);
out.println();
out.println("<tr><th class='script'>Script: " + script + "</th></tr>");
htmlOut.println();
htmlOut.println("<tr><th class='script'>Script: " + script + "</th></tr>");
textOut.println();
textOut.println("#*** Script: " + script + " ***");
UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
UnicodeSet decomp = new UnicodeSet(core).retainAll(decomposable);
core.removeAll(decomp);
UnicodeSet non_id = new UnicodeSet(core).removeAll(xid_continue);
core.removeAll(non_id);
UnicodeSet deleted = extract(idnaTypeSet[DELETED], core);
UnicodeSet illegal = extract(idnaTypeSet[ILLEGAL], core);
UnicodeSet remapped = extract(idnaTypeSet[REMAPPED], core);
UnicodeSet decomp = extract(decomposable, core);
UnicodeSet pattern = extract(patternProp, core);
UnicodeSet non_id = extract(not_xid_continue, core);
UnicodeSet otherCore = new UnicodeSet(core).removeAll(hasUpper);
core.removeAll(otherCore);
if (core.size() == 0) {
@ -262,58 +313,81 @@ public class TestData implements UCD_Types {
core = otherCore;
otherCore = temp;
}
printlnSet(out, "Atomic", core, scriptCode);
if (otherCore.size() != 0) printlnSet(out, "Atomic-no-uppercase", otherCore, scriptCode);
if (non_id.size() != 0) printlnSet(out, "Non-ID", non_id, scriptCode);
if (decomp.size() != 0) printlnSet(out, "Decomposable", decomp, scriptCode);
if (core.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic", core, scriptCode);
if (otherCore.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", otherCore, scriptCode);
if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode);
if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode);
if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "Decomposable", decomp, scriptCode);
if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped", remapped, scriptCode);
if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode);
if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Illegal", illegal, scriptCode);
}
/**
* @param out
* @param unicodeset
* @param uca
* @param scriptCode
*
*/
private void printlnSet(PrintWriter out, String title,
UnicodeSet unicodeset, int scriptCode) {
private UnicodeSet extract(UnicodeSet other, UnicodeSet core) {
    // Moves the overlap of core and other out of core: the returned set
    // holds the characters common to both, and core is modified in place
    // so that it no longer contains them. other is only read.
    UnicodeSet shared = new UnicodeSet(core);
    shared.retainAll(other);
    core.removeAll(shared);
    return shared;
}
/**
* @param htmlOut
* @param textOut TODO
* @param script TODO
* @param unicodeset
* @param scriptCode
* @param uca
*/
private void printlnSet(PrintWriter htmlOut, PrintWriter textOut,
String script, String title, UnicodeSet unicodeset, int scriptCode) {
if (unicodeset == null)
return;
int size = unicodeset.size();
String dir = unicodeset.containsSome(bidiR)
&& unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
out.println("<tr><th class='" + title + "'>" + title + " ("
htmlOut.println("<tr><th class='" + title + "'>" + title + " ("
+ nf.format(size) + ")</th></tr>");
out.print("<tr><td class='" + title + "'" + dir + ">");
htmlOut.print("<tr><td class='" + title + "'" + dir + ">");
textOut.println();
textOut.println("# " + title);
bf.setValueSource(script + " ; " + title);
UnicodeSetIterator usi = new UnicodeSetIterator();
if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) {
usi.reset(unicodeset);
while (usi.nextRange()) {
if (usi.codepoint == usi.codepointEnd) {
out.print(formatCode(UTF16
htmlOut.print(formatCode(UTF16
.valueOf(usi.codepoint)));
} else {
out.print(formatCode(UTF16
htmlOut.print(formatCode(UTF16
.valueOf(usi.codepoint))
+ ".. "
+ formatCode(UTF16
.valueOf(usi.codepointEnd)));
}
}
bf.showSetNames(textOut, unicodeset);
} else {
Set reordered = new TreeSet(uca);
usi.reset(unicodeset);
while (usi.next()) {
boolean foo = reordered.add(usi.getString());
String x = usi.getString();
boolean foo = reordered.add(x);
if (!foo)
throw new IllegalArgumentException("Collision with "
+ Default.ucd().getCodeAndName(usi.getString()));
+ Default.ucd().getCodeAndName(x));
}
for (Iterator it = reordered.iterator(); it.hasNext();) {
out.print(formatCode((String) it
.next()));
Object key = it.next();
htmlOut.print(formatCode((String)key));
}
bf.showSetNames(textOut, reordered);
}
out.println("</td></tr>");
htmlOut.println("</td></tr>");
}
/**
@ -324,7 +398,7 @@ public class TestData implements UCD_Types {
int cat = ucd.getCategory(UTF16.charAt(string,0));
return "<span title='" + ucd.getCodeAndName(string) + "'>"
+ (cat == Me || cat == Mn ? "\u00A0" : "") //\u25cc
+ BagFormatter.toHTML.transliterate(string)
+ BagFormatter.toHTMLControl.transliterate(string)
+ " </span>";
}
}

View File

@ -123,7 +123,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
public int getMaxWidth(boolean isShort) {
return 15;
}
}.setValues(LONG_YES_NO, YES_NO)
}.setValues(LONG_YES_NO, YES_NO).swapFirst2ValueAliases()
.setMain("NFD_Quick_Check", "NFD_QC", UnicodeProperty.ENUMERATED, version));
add(new UnicodeProperty.SimpleProperty() {
@ -135,7 +135,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
public int getMaxWidth(boolean isShort) {
return 15;
}
}.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE)
}.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE).swapFirst2ValueAliases()
.setMain("NFC_Quick_Check", "NFC_QC", UnicodeProperty.ENUMERATED, version));
add(new UnicodeProperty.SimpleProperty() {
@ -147,7 +147,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
public int getMaxWidth(boolean isShort) {
return 15;
}
}.setValues(LONG_YES_NO, YES_NO)
}.setValues(LONG_YES_NO, YES_NO).swapFirst2ValueAliases()
.setMain("NFKD_Quick_Check", "NFKD_QC", UnicodeProperty.ENUMERATED, version));
add(new UnicodeProperty.SimpleProperty() {
@ -159,7 +159,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
public int getMaxWidth(boolean isShort) {
return 15;
}
}.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE)
}.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE).swapFirst2ValueAliases()
.setMain("NFKC_Quick_Check", "NFKC_QC", UnicodeProperty.ENUMERATED, version));
@ -235,7 +235,12 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
unicodeMap.putAll(hangul.getSet("LVT"),"LVT");
unicodeMap.setMissing("Other");
}
}.setMain("Grapheme_Cluster_Break", "GCB", UnicodeProperty.ENUMERATED, version));
}.setMain("Grapheme_Cluster_Break", "GCB", UnicodeProperty.ENUMERATED, version)
.addValueAliases(new String[][] {
{"Control", "CN"},
{"Extend", "EX"},
{"Other", "XX"},
}).swapFirst2ValueAliases());
add(new UnicodeProperty.UnicodeMapProperty() {
{
@ -268,7 +273,17 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it.
unicodeMap.setMissing("Other");
}
}.setMain("Word_Break", "WB", UnicodeProperty.ENUMERATED, version));
}.setMain("Word_Break", "WB", UnicodeProperty.ENUMERATED, version)
.addValueAliases(new String[][] {
{"Format", "FO"},
{"Katakana", "KA"},
{"ALetter", "LE"},
{"MidLetter", "ML"},
{"MidNum", "MN"},
{"Numeric", "NU"},
{"ExtendNumLet", "EX"},
{"Other", "XX"},
}).swapFirst2ValueAliases());
add(new UnicodeProperty.UnicodeMapProperty() {
{
@ -307,7 +322,20 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it.
unicodeMap.setMissing("Other");
}
}.setMain("Sentence_Break", "SB", UnicodeProperty.ENUMERATED, version));
}.setMain("Sentence_Break", "SB", UnicodeProperty.ENUMERATED, version)
.addValueAliases(new String[][] {
{"Sep", "SE"},
{"Format", "FO"},
{"Sp", "SP"},
{"Lower", "LO"},
{"Upper", "UP"},
{"OLetter", "LE"},
{"Numeric", "NU"},
{"ATerm", "AT"},
{"STerm", "ST"},
{"Close", "CL"},
{"Other", "XX"},
}).swapFirst2ValueAliases());
}
static String[] YES_NO_MAYBE = {"N", "M", "Y"};

View File

@ -14,58 +14,83 @@
.Non-XID { background-color: #FFCCCC }
.Decomposable { background-color: #FFFFCC }
.Pattern_Syntax { background-color: #FFCCFF }
.IDN-Remapped { background-color: #FF6666 }
.IDN-Deleted { background-color: #66FF66 }
.IDN-Illegal { background-color: #6666FF }
th { text-align: left }
-->
</style>
</head>
<body>
<body style="margin: 2em">
<h1>IDN Character Categorization</h1>
<p>$Date: 2005/03/29 18:31:15 $, MED</p>
<p><i>$Date: 2005/03/30 17:19:32 $, MED</i></p>
<p>This page lists all of the valid output IDN characters broken down by category. By &quot;output&quot; IDN
characters, we mean ones that can result from nameprep. Characters are grouped first by script, and
then by subcategory. Within each subcategory characters are sorted according to the default
<a href="http://www.unicode.org/reports/tr10/">UCA</a> order. Tooltips provide the character code
<a href="http://www.unicode.org/reports/tr10/">UCA</a> order. Tool-tips provide the character code
and name (in enabled browsers).</p>
<table border="1" cellpadding="2" cellspacing="0" style="border-collapse: collapse" bordercolor="#111111" id="AutoNumber1">
<tr>
<th>Subcategory</th>
<th>Description</th>
</tr>
<tr>
<td class="Atomic">Atomic</td>
<td>Characters that don&#39;t fall into any of the following subcategories</td>
</tr>
<tr>
<td class="Atomic-no-uppercase">Atomic-no-uppercase</td>
<td>For bicameral scripts, Atomic characters without an uppercase.</td>
</tr>
<tr>
<td class="Pattern_Syntax">Pattern_Syntax</td>
<td>Characters recommended as a basis for syntax, as in
<a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and Pattern Syntax</a>.
Excludes the word characters in <i>Section 4 Word Boundaries</i> of
<a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX #29</a>, in the
Word_Break property and notes at the end of the section.&nbsp;&nbsp; </td>
</tr>
<tr>
<td class="Non-XID">Non-XID</td>
<td>Characters recommended as a basis for identifiers, as in
<a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and Pattern Syntax</a>
(XID_Continue). Excludes the word characters in <i>Section 4 Word Boundaries</i> of
<a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX #29</a>, in the
Word_Break property and notes at the end of the section.</td>
</tr>
<tr>
<td class="Decomposable">Decomposable</td>
<td>Characters with NFC decompositions.</td>
</tr>
</table>
<table>
</table>
<h2>Categorization</h2>
</body>
</html>
<blockquote>
<table border="1" cellpadding="2" cellspacing="0" style="border-collapse: collapse">
<caption><b><font size="4">Key</font></b></caption>
<tr>
<th>Subcategory</th>
<th>Description</th>
</tr>
<tr>
<td class="Atomic">Atomic</td>
<td>Characters that don&#39;t fall into any of the following subcategories</td>
</tr>
<tr>
<td class="Atomic-no-uppercase">Atomic-no-uppercase</td>
<td>For bicameral scripts, Atomic characters without an uppercase.</td>
</tr>
<tr>
<td class="Pattern_Syntax">Pattern_Syntax</td>
<td>Characters recommended as a basis for use in pattern syntax.<p>Excludes the word
characters in <i>Section 4 Word Boundaries</i> of
<a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX #29</a>, in the
Word_Break property and notes at the end of the section.</p>
<p>See <a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and
Pattern Syntax</a>. </td>
</tr>
<tr>
<td class="Non-XID">Non-XID</td>
<td>Characters not recommended as a basis for identifiers, excluding Pattern_Syntax and the
word characters in <i>Section 4 Word Boundaries</i> of
<a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX #29</a>, in the
Word_Break property and notes at the end of the section.<p>See
<a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and Pattern
Syntax</a> (XID_Continue).</td>
</tr>
<tr>
<td class="Decomposable">Decomposable</td>
<td>Characters with NFC decompositions.</td>
</tr>
<tr>
<td class="IDN-Remapped">IDN-Remapped</td>
<td>Characters remapped by IDN.</td>
</tr>
<tr>
<td class="IDN-Deleted">IDN-Deleted</td>
<td>Characters deleted by IDN.</td>
</tr>
<tr>
<td class="IDN-Illegal">IDN-Illegal </td>
<td>Characters illegal in IDN (note: most of these are due to IDN's using an old version of Unicode).</td>
</tr>
</table>
</blockquote>
<p>The information in the categorization is also available in a plain-text file, at
<a href="idn-chars.txt">idn-chars.txt</a>. It can be viewed as is, or loaded into a spreadsheet for
sorting and filtering to view the data in different ways. The format is:</p>
<blockquote>
<p>code ; script ; subcategory # general-category (character) character-name</p>
</blockquote>
<p><i>Examples:</i></p>
<pre>0061 ; LATIN ; Atomic # ; L&amp; (a) LATIN SMALL LETTER A
2015 ; COMMON ; Pattern_Syntax # Pd (―) HORIZONTAL BAR
058A ; ARMENIAN ; Atomic-no-uppercase # ; Pd (֊) ARMENIAN HYPHEN
20AC ; COMMON ; Non-XID # ; Sc (€) EURO SIGN</pre>
<h2>Categorization</h2>

View File

@ -17,43 +17,53 @@ public class UnicodeDataFile {
private String mostRecent;
private String filename;
private UnicodeDataFile(){};
private String fileType = ".txt";
/**
 * Opens the versioned plain-text (".txt") output file for {@code filename}
 * under {@code directory} and writes the standard header to it.
 *
 * @param directory output directory prefix; concatenated directly with the file name
 * @param filename  base file name without version suffix or extension
 * @return the opened data file
 * @throws IOException if the output file cannot be opened
 */
public static UnicodeDataFile openAndWriteHeader(String directory, String filename) throws IOException {
UnicodeDataFile result = new UnicodeDataFile();
result.newFile = directory + filename + UnicodeDataFile.getFileSuffix(true);
result.out = Utility.openPrintWriter(result.newFile, Utility.UTF8_UNIX);
String[] batName = {""};
result.mostRecent = UnicodeDataFile.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName);
result.batName = batName[0];
result.filename = filename;
return new UnicodeDataFile(directory, filename, false);
}
/**
 * HTML counterpart of {@code openAndWriteHeader}: constructs the data file
 * with {@code isHTML} set to true, so the ".html" file type is used.
 *
 * @param directory output directory prefix; concatenated directly with the file name
 * @param filename  base file name without version suffix or extension
 * @return the opened data file
 * @throws IOException if the output file cannot be opened
 */
public static UnicodeDataFile openHTMLAndWriteHeader(String directory, String filename) throws IOException {
return new UnicodeDataFile(directory, filename, true);
}
/**
 * Opens the versioned output file {@code directory + filename + suffix},
 * asks {@code generateBat} for the most recent earlier file and the batch
 * file name, and writes the header.
 *
 * For non-HTML files the standard "#"-prefixed header lines are written first.
 * In both cases the contents of {@code filename + "Header" + fileType} are then
 * appended; if that file does not exist, nothing is appended.
 *
 * @param directory output directory prefix; concatenated directly with the file name
 * @param filename  base file name without version suffix or extension
 * @param isHTML    true to use the ".html" file type, false for ".txt"
 * @throws IOException if the output file cannot be opened or written
 */
private UnicodeDataFile (String directory, String filename, boolean isHTML) throws IOException {
fileType = isHTML ? ".html" : ".txt";
String newSuffix = UnicodeDataFile.getFileSuffix(true, fileType);
newFile = directory + filename + newSuffix;
out = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX);
String[] batName2 = {""};
mostRecent = UnicodeDataFile.generateBat(directory, filename, newSuffix, fileType, batName2);
batName = batName2[0];
// The parameter shadows the field, so the assignment must be qualified with
// "this"; otherwise the field stays null and close() looks for "nullFooter...".
this.filename = filename;
result.out.println("# " + filename + UnicodeDataFile.getFileSuffix(false));
result.out.println(generateDateLine());
result.out.println("#");
result.out.println("# Unicode Character Database");
result.out.println("# Copyright (c) 1991-" + Default.getYear() + " Unicode, Inc.");
result.out.println(
"# For terms of use, see http://www.unicode.org/terms_of_use.html");
result.out.println("# For documentation, see UCD.html");
if (!isHTML) {
out.println("# " + filename + UnicodeDataFile.getFileSuffix(false));
out.println(generateDateLine());
out.println("#");
out.println("# Unicode Character Database");
out.println("# Copyright (c) 1991-" + Default.getYear() + " Unicode, Inc.");
out.println(
"# For terms of use, see http://www.unicode.org/terms_of_use.html");
out.println("# For documentation, see UCD.html");
}
try {
Utility.appendFile(filename + "Header.txt", Utility.LATIN1, result.out);
Utility.appendFile(filename + "Header" + fileType, Utility.UTF8_UNIX, out);
} catch (FileNotFoundException e) {
/*
result.out.println("# Unicode Character Database: Derived Property Data");
result.out.println("# Generated algorithmically from the Unicode Character Database");
result.out.println("# For documentation, see UCD.html");
result.out.println("# Note: Unassigned and Noncharacter codepoints may be omitted");
result.out.println("# if they have default property values.");
result.out.println("# ================================================");
out.println("# Unicode Character Database: Derived Property Data");
out.println("# Generated algorithmically from the Unicode Character Database");
out.println("# For documentation, see UCD.html");
out.println("# Note: Unassigned and Noncharacter codepoints may be omitted");
out.println("# if they have default property values.");
out.println("# ================================================");
*/
}
return result;
}
public void close() throws IOException {
try {
Utility.appendFile(filename + "Footer.txt", Utility.LATIN1, out);
Utility.appendFile(filename + "Footer" + fileType, Utility.UTF8_UNIX, out);
} catch (FileNotFoundException e) {}
out.close();
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName);
@ -64,21 +74,20 @@ public class UnicodeDataFile {
}
/**
 * Returns the version-tagged file-name suffix ending in ".html":
 * "-" + UCD version, then "d" + dVersion when {@code withDVersion} is true
 * and {@code MakeUnicodeFiles.dVersion >= 0}, then ".html".
 *
 * @param withDVersion whether to include the "d<n>" draft-version part
 * @return the suffix to append to a base file name
 */
public static String getHTMLFileSuffix(boolean withDVersion) {
return "-"
+ Default.ucd().getVersion()
+ ((withDVersion && MakeUnicodeFiles.dVersion >= 0)
? ("d" + MakeUnicodeFiles.dVersion)
: "")
+ ".html";
return getFileSuffix(withDVersion, ".html");
}
/**
 * Convenience overload: the version-tagged suffix for ".txt" files.
 *
 * @param withDVersion whether to include the "d<n>" draft-version part
 * @return the suffix to append to a base file name
 */
public static String getFileSuffix(boolean withDVersion) {
return getFileSuffix(withDVersion, ".txt");
}
/**
 * Builds the version-tagged file-name suffix:
 * "-" + UCD version, then "d" + dVersion when {@code withDVersion} is true
 * and {@code MakeUnicodeFiles.dVersion >= 0}, then the given extension.
 *
 * @param withDVersion whether to include the "d<n>" draft-version part
 * @param suffix       file extension including the dot, e.g. ".txt" or ".html"
 * @return the suffix to append to a base file name
 */
public static String getFileSuffix(boolean withDVersion, String suffix) {
return "-"
+ Default.ucd().getVersion()
+ ((withDVersion && MakeUnicodeFiles.dVersion >= 0)
? ("d" + MakeUnicodeFiles.dVersion)
: "")
+ ".txt";
+ suffix;
}
//Remove "d1" from DerivedJoiningGroup-3.1.0d1.txt type names
@ -126,8 +135,8 @@ public class UnicodeDataFile {
*/
// static final byte KEEP_SPECIAL = 0, SKIP_SPECIAL = 1;
public static String generateBat(String directory, String fileRoot, String suffix, String[] outputBatName) throws IOException {
String mostRecent = Utility.getMostRecentUnicodeDataFile(UnicodeDataFile.fixFile(fileRoot), Default.ucd().getVersion(), true, true);
public static String generateBat(String directory, String fileRoot, String suffix, String fileType, String[] outputBatName) throws IOException {
String mostRecent = Utility.getMostRecentUnicodeDataFile(UnicodeDataFile.fixFile(fileRoot), Default.ucd().getVersion(), true, true, fileType);
if (mostRecent != null) {
outputBatName[0] = UnicodeDataFile.generateBatAux(directory + "DIFF/Diff_" + fileRoot + suffix,
mostRecent, directory + fileRoot + suffix);

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
* $Date: 2005/03/04 02:50:26 $
* $Revision: 1.47 $
* $Date: 2005/03/30 17:19:32 $
* $Revision: 1.48 $
*
*******************************************************************************
*/
@ -1021,7 +1021,12 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
}
/**
 * Convenience overload that looks for ".txt" files; delegates to the
 * five-argument form with {@code fileType} set to ".txt".
 */
public static String getMostRecentUnicodeDataFile(String filename, String version,
boolean acceptLatest, boolean show) throws IOException {
boolean acceptLatest, boolean show) throws IOException {
return getMostRecentUnicodeDataFile(filename, version, acceptLatest, show, ".txt");
}
public static String getMostRecentUnicodeDataFile(String filename, String version,
boolean acceptLatest, boolean show, String fileType) throws IOException {
// get all the files in the directory
int compValue = acceptLatest ? 0 : 1;
@ -1030,7 +1035,7 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
String directoryName = UCD_Types.UCD_DIR + File.separator + searchPath[i] + "-Update" + File.separator;
if (show) System.out.println("Trying: '" + directoryName + "', '" + filename + "'");
String result = searchDirectory(new File(directoryName), filename, show);
String result = searchDirectory(new File(directoryName), filename, show, fileType);
if (result != null) return result;
}
@ -1048,16 +1053,20 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
}
/**
 * Convenience overload that searches for ".txt" files; delegates to the
 * four-argument form with {@code fileType} set to ".txt".
 */
public static String searchDirectory(File directory, String filename, boolean show) throws IOException {
return searchDirectory(directory, filename, show, ".txt");
}
public static String searchDirectory(File directory, String filename, boolean show, String fileType) throws IOException {
Iterator it = getDirectoryContentsLastFirst(directory).iterator();
while (it.hasNext()) {
String fn = (String) it.next();
File foo = new File(directory + File.separator + fn);
// System.out.println("\tChecking: '" + foo.getCanonicalPath() + "'");
if (foo.isDirectory()) {
String attempt = searchDirectory(foo, filename, show);
String attempt = searchDirectory(foo, filename, show, fileType);
if (attempt != null) return attempt;
}
if (fn.endsWith(".txt") && fn.startsWith(filename)) {
if (fn.endsWith(fileType) && fn.startsWith(filename)) {
if (show) System.out.println("\tFound: '" + fn + "'");
return foo.getCanonicalPath();
}