ICU-0 uca fixes
X-SVN-Rev: 17533
This commit is contained in:
parent
b83dda29e5
commit
c6350d9d97
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $
|
||||
* $Date: 2005/04/06 08:48:16 $
|
||||
* $Revision: 1.24 $
|
||||
* $Date: 2005/05/02 15:39:54 $
|
||||
* $Revision: 1.25 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -930,7 +930,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
||||
*/
|
||||
static final char EMPTY = '\uFFFF';
|
||||
char rearrangeBuffer = EMPTY;
|
||||
UnicodeSet rearrangeList = null;
|
||||
UnicodeSet rearrangeList = new UnicodeSet();
|
||||
int hangulBufferPosition = 0;
|
||||
StringBuffer hangulBuffer = new StringBuffer();
|
||||
|
||||
@ -1102,7 +1102,23 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
||||
UnicodeSet variantSecondaries = new UnicodeSet(0x0153,0x0154);
|
||||
UnicodeSet digitSecondaries = new UnicodeSet(0x155,0x017F);
|
||||
UnicodeSet homelessSecondaries;
|
||||
|
||||
static final UnicodeSet moreSamples = new UnicodeSet();
|
||||
static {
|
||||
moreSamples.add("\u09C7\u09BE");
|
||||
moreSamples.add("\u09C7\u09D7");
|
||||
moreSamples.add("\u1025\u102E");
|
||||
moreSamples.add("\u0DD9\u0DCF");
|
||||
moreSamples.add("\u0DD9\u0DDF");
|
||||
moreSamples.add("\u1100\u1161");
|
||||
moreSamples.add("\u1100\u1175");
|
||||
moreSamples.add("\u1112\u1161");
|
||||
moreSamples.add("\u1112\u1175");
|
||||
moreSamples.add("\uAC00\u1161");
|
||||
moreSamples.add("\uAC00\u1175");
|
||||
moreSamples.add("\uD788\u1161");
|
||||
moreSamples.add("\uD788\u1175");
|
||||
}
|
||||
|
||||
// static UnicodeSet homelessSecondaries = new UnicodeSet(0x0176, 0x0198);
|
||||
// 0x0153..0x017F
|
||||
|
||||
@ -1121,6 +1137,8 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
||||
int skip = 1;
|
||||
boolean doSamples = false;
|
||||
AbbreviatedUnicodeSetIterator usi = new AbbreviatedUnicodeSetIterator();
|
||||
UnicodeSetIterator moreSampleIterator = new UnicodeSetIterator(moreSamples);
|
||||
|
||||
|
||||
/**
|
||||
* use FIXED_CE as the limit
|
||||
@ -1231,6 +1249,12 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
||||
return result;
|
||||
}
|
||||
|
||||
if (moreSampleIterator.next()) {
|
||||
result = moreSampleIterator.getString();
|
||||
if (DEBUG) System.out.println("More Samples: " + ucd.getCodeAndName(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
// extra samples
|
||||
if (currentRange < SAMPLE_RANGES.length) {
|
||||
try {
|
||||
@ -1329,9 +1353,10 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
||||
|
||||
// In UAX 3.1, the rearrange list is moved to UCD.
|
||||
|
||||
rearrangeList = UnifiedBinaryProperty.make(UCD.BINARY_PROPERTIES + UCD.Logical_Order_Exception, ucd)
|
||||
if (ucaData.lessThan410) {
|
||||
rearrangeList = UnifiedBinaryProperty.make(UCD.BINARY_PROPERTIES + UCD.Logical_Order_Exception, ucd)
|
||||
.getSet();
|
||||
|
||||
}
|
||||
|
||||
while (true) try {
|
||||
inputLine = in.readLine();
|
||||
@ -1465,7 +1490,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
||||
UCD.BINARY_PROPERTIES + UCD.Logical_Order_Exception, ucd);
|
||||
UnicodeSet desiredSet = ubp.getSet();
|
||||
|
||||
if (!rearrangeList.equals(desiredSet)) {
|
||||
if (ucaData.lessThan410 && !rearrangeList.equals(desiredSet)) {
|
||||
throw new IllegalArgumentException("Rearrangement should be " + desiredSet.toPattern(true)
|
||||
+ ", but is " + rearrangeList.toPattern(true));
|
||||
}
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA_Data.java,v $
|
||||
* $Date: 2004/03/11 19:03:19 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2005/05/02 15:39:54 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -29,6 +29,7 @@ import com.ibm.icu.text.UnicodeSet;
|
||||
public class UCA_Data implements UCA_Types {
|
||||
static final boolean DEBUG = false;
|
||||
static final boolean DEBUG_SHOW_ADD = false;
|
||||
static final boolean lessThan410 = false;
|
||||
|
||||
private Normalizer toD;
|
||||
private UCD ucd;
|
||||
@ -197,7 +198,7 @@ public class UCA_Data implements UCA_Types {
|
||||
int increment = UTF16.getCharCount(cp2);
|
||||
|
||||
// CHECK if last char was completely ignorable
|
||||
if (isCompletelyIgnoreable(cp2)) {
|
||||
if (lessThan410 && isCompletelyIgnoreable(cp2)) {
|
||||
index += increment; // just skip char don't set probe, value
|
||||
continue;
|
||||
}
|
||||
@ -231,7 +232,7 @@ public class UCA_Data implements UCA_Types {
|
||||
lastCan = can; // remember for next time
|
||||
|
||||
// CHECK if last char was completely ignorable. If so, skip it.
|
||||
if (isCompletelyIgnoreable(cp2)) {
|
||||
if (lessThan410 && isCompletelyIgnoreable(cp2)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
|
||||
* $Date: 2005/04/06 08:48:17 $
|
||||
* $Revision: 1.40 $
|
||||
* $Date: 2005/05/02 15:39:54 $
|
||||
* $Revision: 1.41 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -440,9 +440,9 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
|
||||
if (!shortPrint) {
|
||||
log.print(Utility.hex(source));
|
||||
log.print(
|
||||
";\t# " + (extra != LOW_ACCENT ? extra : '.') + " " + ucd.getName(clipped, SHORT) + "\t" + UCA.toString(key));
|
||||
";\t# (" + quoteOperand(clipped) + ") " + ucd.getName(clipped) + "\t" + UCA.toString(key));
|
||||
} else {
|
||||
log.print(Utility.hex(source) + ";\t" + Utility.hex(clipped));
|
||||
log.print(Utility.hex(source));
|
||||
}
|
||||
log.println();
|
||||
}
|
||||
@ -537,16 +537,16 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
|
||||
//Normalizer nfkd = new Normalizer(Normalizer.NFKD, UNICODE_VERSION);
|
||||
//Normalizer nfc = new Normalizer(Normalizer.NFC, UNICODE_VERSION);
|
||||
switch (strength) {
|
||||
case 1: log.println("<h2>3. Primaries Incompatible with Decompositions</h2>"); break;
|
||||
case 2: log.println("<h2>4. Secondaries Incompatible with Decompositions</h2>"); break;
|
||||
case 3: log.println("<h2>5. Tertiaries Incompatible with Decompositions</h2>");
|
||||
log.println("<p>Note: Tertiary differences are not really errors; these are just warnings</p>");
|
||||
break;
|
||||
case 1: log.println("<h2>3. Primaries Incompatible with NFKD</h2>"); break;
|
||||
case 2: log.println("<h2>4. Secondaries Incompatible with NFKD</h2>"); break;
|
||||
case 3: log.println("<h2>5. Tertiaries Incompatible with NFKD</h2>");
|
||||
break;
|
||||
default: throw new IllegalArgumentException("bad strength: " + strength);
|
||||
}
|
||||
log.println("<p>Note: Differences are not really errors; but they should be checked over for inadvertant problems</p>");
|
||||
log.println("<p>Warning: only checking characters defined in base: " + ucd_uca_base.getVersion() + "</p>");
|
||||
log.println("<table border='1' cellspacing='0' cellpadding='2'>");
|
||||
log.println("<tr><th>Code</td><th>Sort Key</th><th>Decomposed Sort Key</th><th>Name</th></tr>");
|
||||
log.println("<tr><th>Code</td><th>Sort Key</th><th>NFKD Sort Key</th><th>Name</th></tr>");
|
||||
|
||||
int errorCount = 0;
|
||||
|
||||
@ -1991,7 +1991,7 @@ F900..FAFF; CJK Compatibility Ideographs
|
||||
relation = getStrengthDifference(ces, len, ces2, len2);
|
||||
|
||||
reset = quoteOperand(UTF16.valueOf(resetCp));
|
||||
resetComment = ucd.getCodeAndName(resetCp);
|
||||
if (!shortPrint) resetComment = ucd.getCodeAndName(resetCp);
|
||||
// lastCE = UCA.makeKey(primary, UCA.NEUTRAL_SECONDARY, UCA.NEUTRAL_TERTIARY);
|
||||
xmlReset = 2;
|
||||
}
|
||||
@ -2523,7 +2523,8 @@ F900..FAFF; CJK Compatibility Ideographs
|
||||
static StringBuffer quoteOperandBuffer = new StringBuffer(); // faster
|
||||
|
||||
static UnicodeSet needsQuoting = null;
|
||||
|
||||
static UnicodeSet needsUnicodeForm = null;
|
||||
|
||||
static final String quoteOperand(String s) {
|
||||
if (needsQuoting == null) {
|
||||
/*
|
||||
@ -2533,8 +2534,13 @@ F900..FAFF; CJK Compatibility Ideographs
|
||||
|| (c >= 0xA0 && !UCharacterProperty.isRuleWhiteSpace(c))
|
||||
*/
|
||||
needsQuoting = new UnicodeSet(
|
||||
"[[:whitespace:][:c:][:z:][[:ascii:]-[a-zA-Z0-9]]]");
|
||||
"[[:whitespace:][:c:][:z:][:ascii:]-[a-zA-Z0-9]]"); //
|
||||
//"[[:ascii:]-[a-zA-Z0-9]-[:c:]-[:z:]]"); // [:whitespace:][:c:][:z:]
|
||||
//for (int i = 0; i <= 0x10FFFF; ++i) {
|
||||
// if (UCharacterProperty.isRuleWhiteSpace(i)) needsQuoting.add(i);
|
||||
//}
|
||||
// needsQuoting.remove();
|
||||
needsUnicodeForm = new UnicodeSet("[\\u000d\\u000a[:zl:][:zp:]]");
|
||||
}
|
||||
s = Default.nfc().normalize(s);
|
||||
quoteOperandBuffer.setLength(0);
|
||||
@ -2558,7 +2564,8 @@ F900..FAFF; CJK Compatibility Ideographs
|
||||
quoteOperandBuffer.append('\'');
|
||||
inQuote = true;
|
||||
}
|
||||
if (cp > 0xFFFF) {
|
||||
if (!needsUnicodeForm.contains(cp)) quoteOperandBuffer.append(UTF16.valueOf(cp)); // cp != 0x2028
|
||||
else if (cp > 0xFFFF) {
|
||||
quoteOperandBuffer.append("\\U").append(Utility.hex(cp,8));
|
||||
} else if (cp <= 0x20 || cp > 0x7E) {
|
||||
quoteOperandBuffer.append("\\u").append(Utility.hex(cp));
|
||||
|
@ -515,6 +515,8 @@ public class MakeUnicodeFiles {
|
||||
GenerateCaseFolding.generateSpecialCasing(false);
|
||||
} else if (filename.equals("StandardizedVariants")) {
|
||||
GenerateStandardizedVariants.generate();
|
||||
} else if (filename.equals("NamedSequences")) {
|
||||
GenerateNamedSequences.generate();
|
||||
} else if (filename.equals("GraphemeBreakTest")) {
|
||||
new GenerateGraphemeBreakTest(Default.ucd()).run();
|
||||
} else if (filename.equals("WordBreakTest")) {
|
||||
|
@ -1,4 +1,4 @@
|
||||
Generate: DerivedBidiClass
|
||||
Generate: NamedSequences
|
||||
DeltaVersion: 14
|
||||
CopyrightYear: 2005
|
||||
|
||||
@ -361,6 +361,9 @@ Property: SPECIAL
|
||||
File: StandardizedVariants
|
||||
Property: SPECIAL
|
||||
|
||||
File: NamedSequences
|
||||
Property: SPECIAL
|
||||
|
||||
HackName: noBreak
|
||||
HackName: Arabic_Presentation_Forms-A
|
||||
HackName: Arabic_Presentation_Forms-B
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
|
||||
* $Date: 2005/04/06 08:48:17 $
|
||||
* $Revision: 1.21 $
|
||||
* $Date: 2005/05/02 15:39:53 $
|
||||
* $Revision: 1.22 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -19,6 +19,7 @@ import java.text.DateFormat;
|
||||
import java.text.SimpleDateFormat;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.CollectionUtilities;
|
||||
import com.ibm.icu.dev.test.util.ICUPropertyFactory;
|
||||
import com.ibm.icu.dev.test.util.UnicodeLabel;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
@ -152,6 +153,7 @@ public class TestData implements UCD_Types {
|
||||
Matcher m;
|
||||
|
||||
static class GenStringPrep {
|
||||
|
||||
UnicodeSet[] coreChars = new UnicodeSet[100];
|
||||
UnicodeSet decomposable = new UnicodeSet();
|
||||
UnicodeMap suspect = new UnicodeMap();
|
||||
@ -159,11 +161,15 @@ public class TestData implements UCD_Types {
|
||||
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
|
||||
//UnicodeSet id_continue = ups.getSet("ID_Continue=true");
|
||||
UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
|
||||
UnicodeSet wordChars = ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher());
|
||||
UnicodeSet wordChars = new UnicodeSet();
|
||||
{
|
||||
wordChars.retainAll(ups.getSet("gc=Sk"));
|
||||
if (false) {
|
||||
wordChars.addAll(ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher()));
|
||||
wordChars.retainAll(ups.getSet("gc=Sk"));
|
||||
}
|
||||
wordChars.addAll(new UnicodeSet("[\\u0027 \\u002D \\u002E \\u003A \\u00B7 \\u058A \\u05F3" +
|
||||
" \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0]"));
|
||||
" \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0" +
|
||||
" \\u055A \\u02B9 \\u02BA]"));
|
||||
//wordChars.removeAll(xid_continue);
|
||||
}
|
||||
|
||||
@ -193,6 +199,7 @@ public class TestData implements UCD_Types {
|
||||
UnicodeSet inIDN = new UnicodeSet();
|
||||
|
||||
void genStringPrep() throws IOException {
|
||||
//showScriptToBlock();
|
||||
bf.setShowLiteral(BagFormatter.toHTMLControl);
|
||||
//bf.setValueSource(UnicodeLabel.NULL);
|
||||
if (false) {
|
||||
@ -221,10 +228,13 @@ public class TestData implements UCD_Types {
|
||||
|
||||
Utility.fixDot();
|
||||
PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html");
|
||||
PrintWriter htmlOut2 = BagFormatter.openUTF8Writer(GEN_DIR, "script-chars.html");
|
||||
PrintWriter textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.txt");
|
||||
textOut.println('\uFEFF');
|
||||
textOut.println("For documentation, see idn-chars.html");
|
||||
Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut);
|
||||
|
||||
Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut,
|
||||
new String[] {"%date%", Default.getDate()});
|
||||
/*
|
||||
out
|
||||
.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
|
||||
@ -241,27 +251,31 @@ public class TestData implements UCD_Types {
|
||||
out.println("-->");
|
||||
out.println("</style></head><body><table>");
|
||||
*/
|
||||
htmlOut.println("<table border='1' cellpadding='2' cellspacing='0' style='border-collapse: collapse'>");
|
||||
htmlOut.println("<table border='1' cellpadding='2' cellspacing='0'>");
|
||||
htmlOut2.println("<html><body><table border='1' cellpadding='2' cellspacing='0'>");
|
||||
|
||||
for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) {
|
||||
if (scriptCode == COMMON_SCRIPT
|
||||
|| scriptCode == INHERITED_SCRIPT)
|
||||
continue;
|
||||
showCodes(htmlOut, textOut, scriptCode);
|
||||
showCodes(htmlOut, textOut, scriptCode, htmlOut2);
|
||||
}
|
||||
showCodes(htmlOut, textOut, COMMON_SCRIPT);
|
||||
showCodes(htmlOut, textOut, INHERITED_SCRIPT);
|
||||
showCodes(htmlOut, textOut, COMMON_SCRIPT, htmlOut2);
|
||||
showCodes(htmlOut, textOut, INHERITED_SCRIPT, htmlOut2);
|
||||
htmlOut.println("</table></body></html>");
|
||||
htmlOut.close();
|
||||
htmlOut2.println("</table></body></html>");
|
||||
htmlOut2.close();
|
||||
bf.setMergeRanges(false);
|
||||
|
||||
textOut.println();
|
||||
textOut.println("# *** WORD CHARACTERS ADDED ***");
|
||||
textOut.println("# *** ADDITIONAL WORD CHARACTERS ***");
|
||||
textOut.println();
|
||||
bf.setValueSource("word-chars");
|
||||
bf.showSetNames(textOut, wordChars);
|
||||
|
||||
textOut.println();
|
||||
textOut.println("# *** FOR REVIEW (collected from above) ***");
|
||||
textOut.println("# *** FOR REVIEW ***");
|
||||
bf.setLabelSource(UnicodeLabel.NULL);
|
||||
for (Iterator it = new TreeSet(suspect.getAvailableValues()).iterator(); it.hasNext();) {
|
||||
textOut.println();
|
||||
@ -272,6 +286,93 @@ public class TestData implements UCD_Types {
|
||||
textOut.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a map by composing the "script" property map with the "block" property map
* (each composed value is the two input values joined by a tab), prints every value
* returned by getAvailableValues(new TreeSet()) to System.out, one per line, and then throws.
* NOTE(review): looks like a debugging aid (the only visible call site is commented out) - confirm.
* @throws IllegalArgumentException always, once the values have been printed
|
||||
*/
|
||||
private void showScriptToBlock() {
|
||||
UnicodeMap scripts = ToolUnicodePropertySource.make("").getProperty("script").getUnicodeMap();
|
||||
UnicodeMap blocks = ToolUnicodePropertySource.make("").getProperty("block").getUnicodeMap();
|
||||
UnicodeMap.Composer myCompose = new UnicodeMap.Composer() {
|
||||
public Object compose(Object a, Object b) {
|
||||
return a + "\t" + b;
|
||||
|
||||
}
|
||||
};
|
||||
// compose on a clone so the "script" map obtained above is not the receiver of composeWith
UnicodeMap sb = ((UnicodeMap)scripts.clone()).composeWith(blocks, myCompose);
|
||||
for (Iterator it = sb.getAvailableValues(new TreeSet()).iterator(); it.hasNext();) {
|
||||
System.out.println(it.next());
|
||||
}
|
||||
// unconditional: this method never returns normally
throw new IllegalArgumentException();
|
||||
}
|
||||
|
||||
Map scriptToGif = CollectionUtilities.asMap(script_to_gif);
|
||||
|
||||
static String[][] script_to_gif = {
|
||||
|
||||
{"Common","common.gif"}, //Miscellaneous_Symbols
|
||||
{"Inherited","combiningdiacritics.gif"}, //Combining_Diacritical_Marks
|
||||
{"Arabic","arabic.gif"}, //Arabic
|
||||
{"Armenian","armenian.gif"}, //Armenian
|
||||
{"Bengali","bengali.gif"}, //Bengali
|
||||
{"Bopomofo","bopomofo.gif"}, //Bopomofo
|
||||
{"Braille","braillesymbols.gif"}, //Braille_Patterns
|
||||
{"Buginese","buginese.gif"}, //Buginese
|
||||
{"Buhid","buhid.gif"}, //Buhid
|
||||
{"Canadian_Aboriginal","canadiansyllabics.gif"}, //Unified_Canadian_Aboriginal_Syllabics
|
||||
{"Cherokee","cherokee.gif"}, //Cherokee
|
||||
{"Coptic","coptic.gif"}, //Coptic
|
||||
{"Cypriot","cypriot.gif"}, //Cypriot_Syllabary
|
||||
{"Cyrillic","cyrillic.gif"}, //Cyrillic
|
||||
{"Deseret","deseret.gif"}, //Deseret
|
||||
{"Devanagari","devanagari.gif"}, //Devanagari
|
||||
{"Ethiopic","ethiopic.gif"}, //Ethiopic
|
||||
{"Georgian","georgian.gif"}, //Georgian
|
||||
{"Glagolitic","glagolitic.gif"}, //Glagolitic
|
||||
{"Gothic","gothic.gif"}, //Gothic
|
||||
{"Greek","greek.gif"}, //Greek_and_Coptic
|
||||
{"Gujarati","gujarati.gif"}, //Gujarati
|
||||
{"Gurmukhi","gurmukhi.gif"}, //Gurmukhi
|
||||
{"Han","cjkideographcompat.gif"}, //CJK_Compatibility_Ideographs
|
||||
{"Han","kangxiradicals.gif"}, //Kangxi_Radicals
|
||||
{"Hangul","hangulsyllables.gif"}, //Hangul_Syllables
|
||||
{"Hanunoo","hanunoo.gif"}, //Hanunoo
|
||||
{"Hebrew","hebrew.gif"}, //Hebrew
|
||||
{"Hiragana","hiragana.gif"}, //Hiragana
|
||||
{"Kannada","kannada.gif"}, //Kannada
|
||||
{"Katakana","katakana.gif"}, //Katakana
|
||||
{"Kharoshthi","kharoshthi.gif"}, //Kharoshthi
|
||||
{"Khmer","khmer.gif"}, //Khmer
|
||||
{"Lao","lao.gif"}, //Lao
|
||||
{"Latin","latin.gif"}, //Basic_Latin
|
||||
{"Limbu","limbu.gif"}, //Limbu
|
||||
{"Linear_B","linearbsyllabary.gif"}, //Linear_B_Syllabary
|
||||
{"Malayalam","malayalam.gif"}, //Malayalam
|
||||
{"Mongolian","mongolian.gif"}, //Mongolian
|
||||
{"Myanmar","myanmar.gif"}, //Myanmar
|
||||
{"New_Tai_Lue","newtailu.gif"}, //New_Tai_Lue
|
||||
{"Ogham","ogham.gif"}, //Ogham
|
||||
{"Old_Italic","olditalic.gif"}, //Old_Italic
|
||||
{"Old_Persian","oldpersiancuneiform.gif"}, //Old_Persian
|
||||
{"Oriya","oriya.gif"}, //Oriya
|
||||
{"Osmanya","osmanya.gif"}, //Osmanya
|
||||
{"Runic","runic.gif"}, //Runic
|
||||
{"Shavian","shavian.gif"}, //Shavian
|
||||
{"Sinhala","sinhala.gif"}, //Sinhala
|
||||
{"Syloti_Nagri","silotinagri.gif"}, //Syloti_Nagri
|
||||
{"Syriac","syriac.gif"}, //Syriac
|
||||
{"Tagalog","tagalog.gif"}, //Tagalog
|
||||
{"Tagbanwa","tagbanwa.gif"}, //Tagbanwa
|
||||
{"Tai_Le","taile.gif"}, //Tai_Le
|
||||
{"Tamil","tamil.gif"}, //Tamil
|
||||
{"Telugu","telugu.gif"}, //Telugu
|
||||
{"Thaana","thaana.gif"}, //Thaana
|
||||
{"Thai","thai.gif"}, //Thai
|
||||
{"Tibetan","tibetan.gif"}, //Tibetan
|
||||
{"Tifinagh","tifinagh.gif"}, //Tifinagh
|
||||
{"Ugaritic","ugaritic.gif"}, //Ugaritic
|
||||
{"Yi","yi.gif"}, //Yi_Syllables
|
||||
|
||||
};
|
||||
|
||||
UnicodeSet idnaTypeSet[] = new UnicodeSet[IDNA_TYPE_LIMIT];
|
||||
{
|
||||
for (int i = 0; i < idnaTypeSet.length; ++i) idnaTypeSet[i] = new UnicodeSet();
|
||||
@ -309,16 +410,22 @@ public class TestData implements UCD_Types {
|
||||
* @param htmlOut
|
||||
* @param textOut TODO
|
||||
* @param scriptCode
|
||||
* @param htmlOut2 TODO
|
||||
* @param ucd
|
||||
* @param coreChars
|
||||
* @param decompChars
|
||||
*/
|
||||
private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode) {
|
||||
private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode, PrintWriter htmlOut2) {
|
||||
if (coreChars[scriptCode] == null) return;
|
||||
System.out.println(ucd.getScriptID_fromIndex((byte) scriptCode));
|
||||
String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode);
|
||||
script = Utility.getUnskeleton(script.toLowerCase(),true);
|
||||
System.out.println(script);
|
||||
|
||||
htmlOut.println();
|
||||
htmlOut.println("<tr><th class='script'>Script: " + script + "</th></tr>");
|
||||
String scriptLine = "<tr><th class='script'><img src='images/" + ((String)scriptToGif.get(script)).toLowerCase()
|
||||
+ "'> Script: " + script + "</th></tr>";
|
||||
htmlOut.println(scriptLine);
|
||||
htmlOut2.println(scriptLine);
|
||||
textOut.println();
|
||||
textOut.println("#*** Script: " + script + " ***");
|
||||
UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
|
||||
@ -354,13 +461,13 @@ public class TestData implements UCD_Types {
|
||||
if (bicameralNoupper.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", bicameralNoupper, scriptCode);
|
||||
if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode);
|
||||
if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode);
|
||||
if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "Decomposable", decomp, scriptCode);
|
||||
if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "NFD-Decomposable", decomp, scriptCode);
|
||||
|
||||
if (remappedIsNFKC.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Atomic", remappedIsNFKC, scriptCode);
|
||||
if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Decomposable", remappedIsNFKCDecomp, scriptCode);
|
||||
if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-NFD-Decomposable", remappedIsNFKCDecomp, scriptCode);
|
||||
if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Compat", remapped, scriptCode);
|
||||
if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode);
|
||||
if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Illegal", illegal, scriptCode);
|
||||
if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Prohibited", illegal, scriptCode);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -387,9 +494,11 @@ public class TestData implements UCD_Types {
|
||||
int size = unicodeset.size();
|
||||
String dir = unicodeset.containsSome(bidiR)
|
||||
&& unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
|
||||
htmlOut.println("<tr><th class='" + title + "'>" + title + " ("
|
||||
htmlOut.println("<tr><th class='" + title + "'><a href='#" +
|
||||
title + "'>" + title + "</a> ("
|
||||
+ nf.format(size) + ")</th></tr>");
|
||||
htmlOut.print("<tr><td class='" + title + "'" + dir + ">");
|
||||
// <a href="#Atomic">categorization</a>
|
||||
textOut.println();
|
||||
textOut.println("# " + title);
|
||||
bf.setValueSource(script + " ; " + title);
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
|
||||
* $Date: 2005/03/10 02:37:20 $
|
||||
* $Revision: 1.38 $
|
||||
* $Date: 2005/05/02 15:39:53 $
|
||||
* $Revision: 1.39 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -31,6 +31,7 @@ import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
import com.ibm.icu.dev.test.util.UnicodeProperty;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
public final class UCD implements UCD_Types {
|
||||
@ -157,11 +158,11 @@ public final class UCD implements UCD_Types {
|
||||
* Get the character names for the code points in a string, separated by ", "
|
||||
*/
|
||||
public String getName(String s, byte style) {
|
||||
if (s.length() == 1) return getName(s.charAt(0), style);
|
||||
if (s.length() == 1) return getName(s.charAt(0), style); // optimize BMP
|
||||
StringBuffer result = new StringBuffer();
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
cp = UTF32.char32At(s, i);
|
||||
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(s, i);
|
||||
if (i > 0) result.append(", ");
|
||||
result.append(getName(cp, style));
|
||||
}
|
||||
@ -1383,7 +1384,7 @@ to guarantee identifier closure.
|
||||
result.codePoint = codePoint;
|
||||
if (fixStrings) {
|
||||
if (result.name == null || isRemapped) result.name = constructedName;
|
||||
if (result.shortName == null) result.shortName = Utility.replace(constructedName, UCD_Names.NAME_ABBREVIATIONS);
|
||||
if (result.shortName == null) result.shortName = Utility.replace(result.name, UCD_Names.NAME_ABBREVIATIONS);
|
||||
if (isRemapped) {
|
||||
result.decompositionMapping = result.bidiMirror
|
||||
= result.simpleLowercase = result.simpleUppercase = result.simpleTitlecase = result.simpleCaseFolding
|
||||
|
@ -6,93 +6,96 @@
|
||||
<meta name="ProgId" content="FrontPage.Editor.Document">
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
<title>IDN Characters</title>
|
||||
<style>
|
||||
<!--
|
||||
.script { font-size: 150%; background-color: #CCCCCC }
|
||||
.Atomic { background-color: #CCCCFF }
|
||||
.Atomic-no-uppercase { background-color: #CCFFCC }
|
||||
.Non-XID { background-color: #FFCCCC }
|
||||
.Decomposable { background-color: #FFFFCC }
|
||||
.Pattern_Syntax { background-color: #FFCCFF }
|
||||
.IDN-Remapped-Case-Atomic { background-color: #CCFFFF }
|
||||
.IDN-Remapped-Case-Decomposable { background-color: #66FFFF }
|
||||
.IDN-Remapped-Compat { background-color: #FF6666 }
|
||||
.IDN-Deleted { background-color: #66FF66 }
|
||||
.IDN-Illegal { background-color: #6666FF }
|
||||
th { text-align: left }
|
||||
-->
|
||||
</style>
|
||||
<link rel="stylesheet" type="text/css" href="idn-chars.css">
|
||||
</head>
|
||||
|
||||
<body style="margin: 2em">
|
||||
<body>
|
||||
|
||||
<h1>IDN Character Categorization</h1>
|
||||
<p><i>$Date: 2005/04/06 08:48:17 $, MED</i></p>
|
||||
<p>This page lists all of the valid output IDN characters broken down by category. By "output" IDN
|
||||
characters, we mean ones that can result from nameprep. Characters are grouped first by script, and
|
||||
then by subcategory. Within each subcategory characters are sorted according to the default
|
||||
<a href="http://www.unicode.org/reports/tr10/">UCA</a> order. Tool-tips provide the character code
|
||||
and name (in enabled browsers).</p>
|
||||
<p><i>%date%, MED</i></p>
|
||||
<p>This page lists all Unicode characters relevant to IDN in a <a href="#Categorization">chart</a>,
|
||||
broken down by category. Characters are grouped first by script, and then by subcategory.</p>
|
||||
<p>The "output" IDN characters are ones that can result from nameprep, while the "input" characters
|
||||
are those that are allowed in input, but transformed (remapped or deleted). Tool-tips provide the
|
||||
character code and name (in enabled browsers). The following table describes the subcategories.
|
||||
Within each subcategory characters are sorted according to the default
|
||||
<a href="http://www.unicode.org/reports/tr10/">UCA</a> order.</p>
|
||||
<blockquote>
|
||||
<table border="1" cellpadding="2" cellspacing="0" style="border-collapse: collapse">
|
||||
<table border="1" cellpadding="2" cellspacing="0">
|
||||
<caption><b><font size="4">Key</font></b></caption>
|
||||
<tr>
|
||||
<th>Type</th>
|
||||
<th>Subcategory</th>
|
||||
<th>Description</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="Atomic">Atomic</td>
|
||||
<th rowspan="5">Output</th>
|
||||
<td class="Atomic"><a name="Atomic">Atomic</a></td>
|
||||
<td>Characters that don't fall into any of the following subcategories</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="Atomic-no-uppercase">Atomic-no-uppercase</td>
|
||||
<td>For bicameral scripts, Atomic characters without an uppercase.</td>
|
||||
<td class="Atomic-no-uppercase"><a name="Atomic-no-uppercase">Atomic-no-uppercase</a></td>
|
||||
<td>For bicameral scripts, Atomic characters without an uppercase. These need to be examined
|
||||
to see which are used in modern languages.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="Pattern_Syntax">Pattern_Syntax</td>
|
||||
<td>Characters recommended as a basis for use in pattern syntax.<p>Excludes the word
|
||||
characters in <i>Section 4 Word Boundaries</i> of
|
||||
<a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX# 29</a>, in the
|
||||
Word_Break property and notes at the end of the section.</p>
|
||||
<p>See <a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and
|
||||
Pattern Syntax</a>. </td>
|
||||
<td class="Pattern_Syntax"><a name="Pattern_Syntax">Pattern_Syntax</a></td>
|
||||
<td>Characters recommended as a basis for use in pattern syntax. Excludes the
|
||||
<a href="#Word_Characters">additional word characters</a>.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="Non-XID">Non-XID</td>
|
||||
<td>Characters not recommended as a basis for identifiers, excluding Pattern_Syntax and the
|
||||
word characters in <i>Section 4 Word Boundaries</i> of
|
||||
<a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX# 29</a>, in the
|
||||
Word_Break property and notes at the end of the section.<p>See
|
||||
<a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and Pattern
|
||||
Syntax</a> (XID_Continue).</td>
|
||||
<td class="Non-XID"><a name="Non-XID">Non-XID</a></td>
|
||||
<td>Characters not recommended as a basis for identifiers, excluding Pattern_Syntax and
|
||||
<a href="#Word_Characters">additional word characters</a>.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="Decomposable">Decomposable</td>
|
||||
<td>Characters with NFC decompositions.</td>
|
||||
<td class="NFD-Decomposable"><a name="NFD-Decomposable">NFD-Decomposable</a></td>
|
||||
<td>Characters with NFD (canonical) decompositions. These are broken out separately because
|
||||
certain spoofing techniques are applied to them <i>via their decompositions.</i></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="IDN-Remapped-Case-Atomic">IDN-Remapped</td>
|
||||
<td>Characters remapped by IDN due to case folding</td>
|
||||
<th rowspan="4">Input</th>
|
||||
<td class="IDN-Remapped-Case-Atomic"><a name="IDN-Remapped-Case-Atomic">
|
||||
IDN-Remapped-Case-Atomic</a></td>
|
||||
<td>Atomic characters remapped by IDN due to case folding [<a href="http://ietf.org/rfc/rfc3454.txt">StringPrep</a>
|
||||
Section 3.2].</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="IDN-Remapped-Case-Decomposable">IDN-Remapped</td>
|
||||
<td>Characters remapped by IDN due to case folding, that are decomposable.</td>
|
||||
</tr>
|
||||
IDN-Remapped-Case-Decomposable
|
||||
<tr>
|
||||
<td class="IDN-Remapped-Compat">IDN-Remapped</td>
|
||||
<td>Characters remapped by IDN due to compatibility mapping.</td>
|
||||
<td class="IDN-Remapped-Case-NFD-Decomposable"><a name="IDN-Remapped-Case-NFD-Decomposable">
|
||||
IDN-Remapped-Case-NFD-Decomposable</a></td>
|
||||
<td>Characters that are NFD (canonical) decomposable and that are remapped by IDN due to case
|
||||
folding [<a href="http://ietf.org/rfc/rfc3454.txt">StringPrep</a> Section 3.2].</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="IDN-Deleted">IDN-Deleted</td>
|
||||
<td>Characters deleted by IDN.</td>
|
||||
<td class="IDN-Remapped-Compat"><a name="IDN-Remapped-Compat">IDN-Remapped-Compat</a></td>
|
||||
<td>Characters remapped by IDN due to compatibility (NFKD) mapping. [<a href="http://ietf.org/rfc/rfc3454.txt">StringPrep</a>
|
||||
Section 4]</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="IDN-Illegal">IDN-Illegal </td>
|
||||
<td>Characters illegal in IDN (note: most of these are due to IDN's using an old version of Unicode).</td>
|
||||
<td class="IDN-Deleted"><a name="IDN-Deleted">IDN-Deleted</a></td>
|
||||
<td>Characters deleted by IDN, that is, mapped to nothing [<a href="http://ietf.org/rfc/rfc3454.txt">StringPrep</a>
|
||||
Section 3.1]</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Prohibited</th>
|
||||
<td class="IDN-Prohibited"><a name="IDN-Prohibited">IDN-Prohibited </a></td>
|
||||
<td>Characters prohibited in IDN [<a href="http://ietf.org/rfc/rfc3454.txt">StringPrep</a>
|
||||
Section 5] (Note: most of these are due to IDN's using an old version of Unicode. IDN does
|
||||
treat unassigned characters differently than explicitly prohibited characters, but for our
|
||||
purposes this distinction doesn't matter.)</td>
|
||||
</tr>
|
||||
</table>
|
||||
</blockquote>
|
||||
<h3>Additional <a name="Word_Characters">Word Characters</a></h3>
|
||||
<p>This is a draft list of characters based on <i>Section 4 Word Boundaries</i> of
|
||||
<a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX #29</a>, in the
|
||||
Word_Break property and notes at the end of the section. While not currently a part of the
|
||||
recommended characters for programming identifiers (XID_Continue), these characters have been
|
||||
identified as being necessary for more "natural language" identifiers, since some words in some
|
||||
modern languages could not be constructed without them. See also
|
||||
<a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and Pattern Syntax</a>.
|
||||
These characters are listed in the plain text file, as described below.</p>
|
||||
<h2>Plain-Text Version</h2>
|
||||
<p>The information in the categorization is also available in a plain-text file, at
|
||||
<a href="idn-chars.txt">idn-chars.txt</a>. It can be viewed as is, or loaded into a spreadsheet for
|
||||
sorting and filtering to view the data in different ways. The format is:</p>
|
||||
@ -101,7 +104,22 @@ sorting and filtering to view the data in different ways. The format is:</p>
|
||||
</blockquote>
|
||||
<p><i>Examples:</i></p>
|
||||
<pre>0061 ; LATIN ; Atomic # ; L& (a) LATIN SMALL LETTER A
|
||||
<code>026B ; LATIN ; Atomic-no-uppercase # L& (ɫ) LATIN SMALL LETTER L WITH MIDDLE TILDE</code>
|
||||
2015 ; COMMON ; Pattern_Syntax # Pd (―) HORIZONTAL BAR
|
||||
058A ; ARMENIAN ; Atomic-no-uppercase # ; Pd (֊) ARMENIAN HYPHEN
|
||||
20AC ; COMMON ; Non-XID # ; Sc (€) EURO SIGN</pre>
|
||||
<h2>Categorization</h2>
|
||||
<p>At the end of <a href="idn-chars.txt">idn-chars.txt</a> is a section called ADDITIONAL WORD
|
||||
CHARACTERS, defined as described above. Below that is a section of FOR REVIEW characters,
|
||||
sorted by Unicode general category (an additional category of XX is added for the odd characters
|
||||
whose names include: <span style="font-variant: small-caps">MUSICAL SYMBOL, DINGBAT, or RADICAL</span>.)
|
||||
We need review of that list to check for characters that are needed for words in modern languages,
|
||||
that is, that should be moved up into the ADDITIONAL WORD CHARACTERS list. Each character in the FOR
|
||||
REVIEW list is collected because it either: </p>
|
||||
<ol>
|
||||
<li>would not otherwise count as part of an XID, or</li>
|
||||
<li>is part of a bicameral script and doesn't have an uppercase (eg, the situation for U+026B
|
||||
above)</li>
|
||||
</ol>
|
||||
<p>In either case there is prima facie reason for some level of scrutiny, if the goal is to be
|
||||
initially conservative in repertoire.</p>
|
||||
<h2><a name="Categorization">Categorization</a></h2>
|
Loading…
Reference in New Issue
Block a user