ICU-0 uca fixes

X-SVN-Rev: 17533
This commit is contained in:
Mark Davis 2005-05-02 15:39:54 +00:00
parent b83dda29e5
commit c6350d9d97
8 changed files with 274 additions and 108 deletions

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $
* $Date: 2005/04/06 08:48:16 $
* $Revision: 1.24 $
* $Date: 2005/05/02 15:39:54 $
* $Revision: 1.25 $
*
*******************************************************************************
*/
@ -930,7 +930,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
*/
static final char EMPTY = '\uFFFF';
char rearrangeBuffer = EMPTY;
UnicodeSet rearrangeList = null;
UnicodeSet rearrangeList = new UnicodeSet();
int hangulBufferPosition = 0;
StringBuffer hangulBuffer = new StringBuffer();
@ -1102,6 +1102,22 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
UnicodeSet variantSecondaries = new UnicodeSet(0x0153,0x0154);
UnicodeSet digitSecondaries = new UnicodeSet(0x155,0x017F);
UnicodeSet homelessSecondaries;
// Extra two-code-point sample strings (Bengali, Myanmar and Sinhala vowel
// sequences, plus Hangul jamo / syllable + vowel-jamo pairs). The set is
// walked by a UnicodeSetIterator elsewhere in this class to hand out
// additional sample strings.
static final UnicodeSet moreSamples = new UnicodeSet();
static {
    final String[] extraSampleStrings = {
        "\u09C7\u09BE",
        "\u09C7\u09D7",
        "\u1025\u102E",
        "\u0DD9\u0DCF",
        "\u0DD9\u0DDF",
        "\u1100\u1161",
        "\u1100\u1175",
        "\u1112\u1161",
        "\u1112\u1175",
        "\uAC00\u1161",
        "\uAC00\u1175",
        "\uD788\u1161",
        "\uD788\u1175",
    };
    // Each entry is added as a string element of the set, in table order.
    for (int i = 0; i < extraSampleStrings.length; ++i) {
        moreSamples.add(extraSampleStrings[i]);
    }
}
// static UnicodeSet homelessSecondaries = new UnicodeSet(0x0176, 0x0198);
// 0x0153..0x017F
@ -1121,6 +1137,8 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
int skip = 1;
boolean doSamples = false;
AbbreviatedUnicodeSetIterator usi = new AbbreviatedUnicodeSetIterator();
UnicodeSetIterator moreSampleIterator = new UnicodeSetIterator(moreSamples);
/**
* use FIXED_CE as the limit
@ -1231,6 +1249,12 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
return result;
}
if (moreSampleIterator.next()) {
result = moreSampleIterator.getString();
if (DEBUG) System.out.println("More Samples: " + ucd.getCodeAndName(result));
return result;
}
// extra samples
if (currentRange < SAMPLE_RANGES.length) {
try {
@ -1329,9 +1353,10 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
// In UAX 3.1, the rearrange list is moved to UCD.
if (ucaData.lessThan410) {
rearrangeList = UnifiedBinaryProperty.make(UCD.BINARY_PROPERTIES + UCD.Logical_Order_Exception, ucd)
.getSet();
}
while (true) try {
inputLine = in.readLine();
@ -1465,7 +1490,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
UCD.BINARY_PROPERTIES + UCD.Logical_Order_Exception, ucd);
UnicodeSet desiredSet = ubp.getSet();
if (!rearrangeList.equals(desiredSet)) {
if (ucaData.lessThan410 && !rearrangeList.equals(desiredSet)) {
throw new IllegalArgumentException("Rearrangement should be " + desiredSet.toPattern(true)
+ ", but is " + rearrangeList.toPattern(true));
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA_Data.java,v $
* $Date: 2004/03/11 19:03:19 $
* $Revision: 1.2 $
* $Date: 2005/05/02 15:39:54 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -29,6 +29,7 @@ import com.ibm.icu.text.UnicodeSet;
public class UCA_Data implements UCA_Types {
static final boolean DEBUG = false;
static final boolean DEBUG_SHOW_ADD = false;
static final boolean lessThan410 = false;
private Normalizer toD;
private UCD ucd;
@ -197,7 +198,7 @@ public class UCA_Data implements UCA_Types {
int increment = UTF16.getCharCount(cp2);
// CHECK if last char was completely ignorable
if (isCompletelyIgnoreable(cp2)) {
if (lessThan410 && isCompletelyIgnoreable(cp2)) {
index += increment; // just skip char don't set probe, value
continue;
}
@ -231,7 +232,7 @@ public class UCA_Data implements UCA_Types {
lastCan = can; // remember for next time
// CHECK if last char was completely ignorable. If so, skip it.
if (isCompletelyIgnoreable(cp2)) {
if (lessThan410 && isCompletelyIgnoreable(cp2)) {
continue;
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
* $Date: 2005/04/06 08:48:17 $
* $Revision: 1.40 $
* $Date: 2005/05/02 15:39:54 $
* $Revision: 1.41 $
*
*******************************************************************************
*/
@ -440,9 +440,9 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
if (!shortPrint) {
log.print(Utility.hex(source));
log.print(
";\t# " + (extra != LOW_ACCENT ? extra : '.') + " " + ucd.getName(clipped, SHORT) + "\t" + UCA.toString(key));
";\t# (" + quoteOperand(clipped) + ") " + ucd.getName(clipped) + "\t" + UCA.toString(key));
} else {
log.print(Utility.hex(source) + ";\t" + Utility.hex(clipped));
log.print(Utility.hex(source));
}
log.println();
}
@ -537,16 +537,16 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
//Normalizer nfkd = new Normalizer(Normalizer.NFKD, UNICODE_VERSION);
//Normalizer nfc = new Normalizer(Normalizer.NFC, UNICODE_VERSION);
switch (strength) {
case 1: log.println("<h2>3. Primaries Incompatible with Decompositions</h2>"); break;
case 2: log.println("<h2>4. Secondaries Incompatible with Decompositions</h2>"); break;
case 3: log.println("<h2>5. Tertiaries Incompatible with Decompositions</h2>");
log.println("<p>Note: Tertiary differences are not really errors; these are just warnings</p>");
case 1: log.println("<h2>3. Primaries Incompatible with NFKD</h2>"); break;
case 2: log.println("<h2>4. Secondaries Incompatible with NFKD</h2>"); break;
case 3: log.println("<h2>5. Tertiaries Incompatible with NFKD</h2>");
break;
default: throw new IllegalArgumentException("bad strength: " + strength);
}
log.println("<p>Note: Differences are not really errors; but they should be checked over for inadvertant problems</p>");
log.println("<p>Warning: only checking characters defined in base: " + ucd_uca_base.getVersion() + "</p>");
log.println("<table border='1' cellspacing='0' cellpadding='2'>");
log.println("<tr><th>Code</td><th>Sort Key</th><th>Decomposed Sort Key</th><th>Name</th></tr>");
log.println("<tr><th>Code</td><th>Sort Key</th><th>NFKD Sort Key</th><th>Name</th></tr>");
int errorCount = 0;
@ -1991,7 +1991,7 @@ F900..FAFF; CJK Compatibility Ideographs
relation = getStrengthDifference(ces, len, ces2, len2);
reset = quoteOperand(UTF16.valueOf(resetCp));
resetComment = ucd.getCodeAndName(resetCp);
if (!shortPrint) resetComment = ucd.getCodeAndName(resetCp);
// lastCE = UCA.makeKey(primary, UCA.NEUTRAL_SECONDARY, UCA.NEUTRAL_TERTIARY);
xmlReset = 2;
}
@ -2523,6 +2523,7 @@ F900..FAFF; CJK Compatibility Ideographs
static StringBuffer quoteOperandBuffer = new StringBuffer(); // faster
static UnicodeSet needsQuoting = null;
static UnicodeSet needsUnicodeForm = null;
static final String quoteOperand(String s) {
if (needsQuoting == null) {
@ -2533,8 +2534,13 @@ F900..FAFF; CJK Compatibility Ideographs
|| (c >= 0xA0 && !UCharacterProperty.isRuleWhiteSpace(c))
*/
needsQuoting = new UnicodeSet(
"[[:whitespace:][:c:][:z:][[:ascii:]-[a-zA-Z0-9]]]");
"[[:whitespace:][:c:][:z:][:ascii:]-[a-zA-Z0-9]]"); //
//"[[:ascii:]-[a-zA-Z0-9]-[:c:]-[:z:]]"); // [:whitespace:][:c:][:z:]
//for (int i = 0; i <= 0x10FFFF; ++i) {
// if (UCharacterProperty.isRuleWhiteSpace(i)) needsQuoting.add(i);
//}
// needsQuoting.remove();
needsUnicodeForm = new UnicodeSet("[\\u000d\\u000a[:zl:][:zp:]]");
}
s = Default.nfc().normalize(s);
quoteOperandBuffer.setLength(0);
@ -2558,7 +2564,8 @@ F900..FAFF; CJK Compatibility Ideographs
quoteOperandBuffer.append('\'');
inQuote = true;
}
if (cp > 0xFFFF) {
if (!needsUnicodeForm.contains(cp)) quoteOperandBuffer.append(UTF16.valueOf(cp)); // cp != 0x2028
else if (cp > 0xFFFF) {
quoteOperandBuffer.append("\\U").append(Utility.hex(cp,8));
} else if (cp <= 0x20 || cp > 0x7E) {
quoteOperandBuffer.append("\\u").append(Utility.hex(cp));

View File

@ -515,6 +515,8 @@ public class MakeUnicodeFiles {
GenerateCaseFolding.generateSpecialCasing(false);
} else if (filename.equals("StandardizedVariants")) {
GenerateStandardizedVariants.generate();
} else if (filename.equals("NamedSequences")) {
GenerateNamedSequences.generate();
} else if (filename.equals("GraphemeBreakTest")) {
new GenerateGraphemeBreakTest(Default.ucd()).run();
} else if (filename.equals("WordBreakTest")) {

View File

@ -1,4 +1,4 @@
Generate: DerivedBidiClass
Generate: NamedSequences
DeltaVersion: 14
CopyrightYear: 2005
@ -361,6 +361,9 @@ Property: SPECIAL
File: StandardizedVariants
Property: SPECIAL
File: NamedSequences
Property: SPECIAL
HackName: noBreak
HackName: Arabic_Presentation_Forms-A
HackName: Arabic_Presentation_Forms-B

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
* $Date: 2005/04/06 08:48:17 $
* $Revision: 1.21 $
* $Date: 2005/05/02 15:39:53 $
* $Revision: 1.22 $
*
*******************************************************************************
*/
@ -19,6 +19,7 @@ import java.text.DateFormat;
import java.text.SimpleDateFormat;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.CollectionUtilities;
import com.ibm.icu.dev.test.util.ICUPropertyFactory;
import com.ibm.icu.dev.test.util.UnicodeLabel;
import com.ibm.icu.dev.test.util.UnicodeMap;
@ -152,6 +153,7 @@ public class TestData implements UCD_Types {
Matcher m;
static class GenStringPrep {
UnicodeSet[] coreChars = new UnicodeSet[100];
UnicodeSet decomposable = new UnicodeSet();
UnicodeMap suspect = new UnicodeMap();
@ -159,11 +161,15 @@ public class TestData implements UCD_Types {
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
//UnicodeSet id_continue = ups.getSet("ID_Continue=true");
UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
UnicodeSet wordChars = ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher());
UnicodeSet wordChars = new UnicodeSet();
{
if (false) {
wordChars.addAll(ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher()));
wordChars.retainAll(ups.getSet("gc=Sk"));
}
wordChars.addAll(new UnicodeSet("[\\u0027 \\u002D \\u002E \\u003A \\u00B7 \\u058A \\u05F3" +
" \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0]"));
" \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0" +
" \\u055A \\u02B9 \\u02BA]"));
//wordChars.removeAll(xid_continue);
}
@ -193,6 +199,7 @@ public class TestData implements UCD_Types {
UnicodeSet inIDN = new UnicodeSet();
void genStringPrep() throws IOException {
//showScriptToBlock();
bf.setShowLiteral(BagFormatter.toHTMLControl);
//bf.setValueSource(UnicodeLabel.NULL);
if (false) {
@ -221,10 +228,13 @@ public class TestData implements UCD_Types {
Utility.fixDot();
PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html");
PrintWriter htmlOut2 = BagFormatter.openUTF8Writer(GEN_DIR, "script-chars.html");
PrintWriter textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.txt");
textOut.println('\uFEFF');
textOut.println("For documentation, see idn-chars.html");
Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut);
Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut,
new String[] {"%date%", Default.getDate()});
/*
out
.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
@ -241,27 +251,31 @@ public class TestData implements UCD_Types {
out.println("-->");
out.println("</style></head><body><table>");
*/
htmlOut.println("<table border='1' cellpadding='2' cellspacing='0' style='border-collapse: collapse'>");
htmlOut.println("<table border='1' cellpadding='2' cellspacing='0'>");
htmlOut2.println("<html><body><table border='1' cellpadding='2' cellspacing='0'>");
for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) {
if (scriptCode == COMMON_SCRIPT
|| scriptCode == INHERITED_SCRIPT)
continue;
showCodes(htmlOut, textOut, scriptCode);
showCodes(htmlOut, textOut, scriptCode, htmlOut2);
}
showCodes(htmlOut, textOut, COMMON_SCRIPT);
showCodes(htmlOut, textOut, INHERITED_SCRIPT);
showCodes(htmlOut, textOut, COMMON_SCRIPT, htmlOut2);
showCodes(htmlOut, textOut, INHERITED_SCRIPT, htmlOut2);
htmlOut.println("</table></body></html>");
htmlOut.close();
htmlOut2.println("</table></body></html>");
htmlOut2.close();
bf.setMergeRanges(false);
textOut.println();
textOut.println("# *** WORD CHARACTERS ADDED ***");
textOut.println("# *** ADDITIONAL WORD CHARACTERS ***");
textOut.println();
bf.setValueSource("word-chars");
bf.showSetNames(textOut, wordChars);
textOut.println();
textOut.println("# *** FOR REVIEW (collected from above) ***");
textOut.println("# *** FOR REVIEW ***");
bf.setLabelSource(UnicodeLabel.NULL);
for (Iterator it = new TreeSet(suspect.getAvailableValues()).iterator(); it.hasNext();) {
textOut.println();
@ -272,6 +286,93 @@ public class TestData implements UCD_Types {
textOut.close();
}
/**
 * Debugging aid: composes the "script" and "block" property maps into a
 * single map whose values are "script&lt;TAB&gt;block" strings, prints every
 * available value of that combined map to System.out, and then always
 * throws IllegalArgumentException (so the caller never continues).
 * The only visible call site is commented out.
 *
 * @throws IllegalArgumentException unconditionally, after printing
 */
private void showScriptToBlock() {
    UnicodeMap scriptMap = ToolUnicodePropertySource.make("").getProperty("script").getUnicodeMap();
    UnicodeMap blockMap = ToolUnicodePropertySource.make("").getProperty("block").getUnicodeMap();

    // Joins the two per-character values with a tab.
    UnicodeMap.Composer tabJoiner = new UnicodeMap.Composer() {
        public Object compose(Object a, Object b) {
            final String joined = a + "\t" + b;
            return joined;
        }
    };

    // Work on a clone so the script map obtained above is not modified.
    UnicodeMap scriptCopy = (UnicodeMap) scriptMap.clone();
    UnicodeMap combined = scriptCopy.composeWith(blockMap, tabJoiner);

    // Values are collected into a TreeSet (presumably so they print sorted).
    Iterator values = combined.getAvailableValues(new TreeSet()).iterator();
    while (values.hasNext()) {
        System.out.println(values.next());
    }
    throw new IllegalArgumentException();
}
// Lookup from script name to the image file shown next to that script's
// heading; built from the script_to_gif table below.
// NOTE(review): showCodes calls toLowerCase() on the lookup result without a
// null check, so a script name with no entry here would presumably fail
// with a NullPointerException - confirm every script reaching showCodes is listed.
Map scriptToGif = CollectionUtilities.asMap(script_to_gif);
// Each row is {script name, image file name}. The trailing comment on each
// row appears to name the Unicode block the image was taken from.
static String[][] script_to_gif = {
{"Common","common.gif"}, //Miscellaneous_Symbols
{"Inherited","combiningdiacritics.gif"}, //Combining_Diacritical_Marks
{"Arabic","arabic.gif"}, //Arabic
{"Armenian","armenian.gif"}, //Armenian
{"Bengali","bengali.gif"}, //Bengali
{"Bopomofo","bopomofo.gif"}, //Bopomofo
{"Braille","braillesymbols.gif"}, //Braille_Patterns
{"Buginese","buginese.gif"}, //Buginese
{"Buhid","buhid.gif"}, //Buhid
{"Canadian_Aboriginal","canadiansyllabics.gif"}, //Unified_Canadian_Aboriginal_Syllabics
{"Cherokee","cherokee.gif"}, //Cherokee
{"Coptic","coptic.gif"}, //Coptic
{"Cypriot","cypriot.gif"}, //Cypriot_Syllabary
{"Cyrillic","cyrillic.gif"}, //Cyrillic
{"Deseret","deseret.gif"}, //Deseret
{"Devanagari","devanagari.gif"}, //Devanagari
{"Ethiopic","ethiopic.gif"}, //Ethiopic
{"Georgian","georgian.gif"}, //Georgian
{"Glagolitic","glagolitic.gif"}, //Glagolitic
{"Gothic","gothic.gif"}, //Gothic
{"Greek","greek.gif"}, //Greek_and_Coptic
{"Gujarati","gujarati.gif"}, //Gujarati
{"Gurmukhi","gurmukhi.gif"}, //Gurmukhi
// NOTE(review): "Han" is listed twice (next two rows). A map can hold only
// one value per key, so at most one of these images can be used - which one
// depends on how CollectionUtilities.asMap treats duplicates. Confirm the
// intended image and drop the other row.
{"Han","cjkideographcompat.gif"}, //CJK_Compatibility_Ideographs
{"Han","kangxiradicals.gif"}, //Kangxi_Radicals
{"Hangul","hangulsyllables.gif"}, //Hangul_Syllables
{"Hanunoo","hanunoo.gif"}, //Hanunoo
{"Hebrew","hebrew.gif"}, //Hebrew
{"Hiragana","hiragana.gif"}, //Hiragana
{"Kannada","kannada.gif"}, //Kannada
{"Katakana","katakana.gif"}, //Katakana
{"Kharoshthi","kharoshthi.gif"}, //Kharoshthi
{"Khmer","khmer.gif"}, //Khmer
{"Lao","lao.gif"}, //Lao
{"Latin","latin.gif"}, //Basic_Latin
{"Limbu","limbu.gif"}, //Limbu
{"Linear_B","linearbsyllabary.gif"}, //Linear_B_Syllabary
{"Malayalam","malayalam.gif"}, //Malayalam
{"Mongolian","mongolian.gif"}, //Mongolian
{"Myanmar","myanmar.gif"}, //Myanmar
{"New_Tai_Lue","newtailu.gif"}, //New_Tai_Lue
{"Ogham","ogham.gif"}, //Ogham
{"Old_Italic","olditalic.gif"}, //Old_Italic
{"Old_Persian","oldpersiancuneiform.gif"}, //Old_Persian
{"Oriya","oriya.gif"}, //Oriya
{"Osmanya","osmanya.gif"}, //Osmanya
{"Runic","runic.gif"}, //Runic
{"Shavian","shavian.gif"}, //Shavian
{"Sinhala","sinhala.gif"}, //Sinhala
{"Syloti_Nagri","silotinagri.gif"}, //Syloti_Nagri
{"Syriac","syriac.gif"}, //Syriac
{"Tagalog","tagalog.gif"}, //Tagalog
{"Tagbanwa","tagbanwa.gif"}, //Tagbanwa
{"Tai_Le","taile.gif"}, //Tai_Le
{"Tamil","tamil.gif"}, //Tamil
{"Telugu","telugu.gif"}, //Telugu
{"Thaana","thaana.gif"}, //Thaana
{"Thai","thai.gif"}, //Thai
{"Tibetan","tibetan.gif"}, //Tibetan
{"Tifinagh","tifinagh.gif"}, //Tifinagh
{"Ugaritic","ugaritic.gif"}, //Ugaritic
{"Yi","yi.gif"}, //Yi_Syllables
};
UnicodeSet idnaTypeSet[] = new UnicodeSet[IDNA_TYPE_LIMIT];
{
for (int i = 0; i < idnaTypeSet.length; ++i) idnaTypeSet[i] = new UnicodeSet();
@ -309,16 +410,22 @@ public class TestData implements UCD_Types {
* @param htmlOut
* @param textOut TODO
* @param scriptCode
* @param htmlOut2 TODO
* @param ucd
* @param coreChars
* @param decompChars
*/
private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode) {
private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode, PrintWriter htmlOut2) {
if (coreChars[scriptCode] == null) return;
System.out.println(ucd.getScriptID_fromIndex((byte) scriptCode));
String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode);
script = Utility.getUnskeleton(script.toLowerCase(),true);
System.out.println(script);
htmlOut.println();
htmlOut.println("<tr><th class='script'>Script: " + script + "</th></tr>");
String scriptLine = "<tr><th class='script'><img src='images/" + ((String)scriptToGif.get(script)).toLowerCase()
+ "'> Script: " + script + "</th></tr>";
htmlOut.println(scriptLine);
htmlOut2.println(scriptLine);
textOut.println();
textOut.println("#*** Script: " + script + " ***");
UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
@ -354,13 +461,13 @@ public class TestData implements UCD_Types {
if (bicameralNoupper.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", bicameralNoupper, scriptCode);
if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode);
if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode);
if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "Decomposable", decomp, scriptCode);
if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "NFD-Decomposable", decomp, scriptCode);
if (remappedIsNFKC.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Atomic", remappedIsNFKC, scriptCode);
if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Decomposable", remappedIsNFKCDecomp, scriptCode);
if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-NFD-Decomposable", remappedIsNFKCDecomp, scriptCode);
if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Compat", remapped, scriptCode);
if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode);
if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Illegal", illegal, scriptCode);
if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Prohibited", illegal, scriptCode);
}
/**
@ -387,9 +494,11 @@ public class TestData implements UCD_Types {
int size = unicodeset.size();
String dir = unicodeset.containsSome(bidiR)
&& unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
htmlOut.println("<tr><th class='" + title + "'>" + title + " ("
htmlOut.println("<tr><th class='" + title + "'><a href='#" +
title + "'>" + title + "</a> ("
+ nf.format(size) + ")</th></tr>");
htmlOut.print("<tr><td class='" + title + "'" + dir + ">");
// <a href="#Atomic">categorization</a>
textOut.println();
textOut.println("# " + title);
bf.setValueSource(script + " ; " + title);

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
* $Date: 2005/03/10 02:37:20 $
* $Revision: 1.38 $
* $Date: 2005/05/02 15:39:53 $
* $Revision: 1.39 $
*
*******************************************************************************
*/
@ -31,6 +31,7 @@ import com.ibm.text.utility.*;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.dev.test.util.UnicodeProperty;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
public final class UCD implements UCD_Types {
@ -157,11 +158,11 @@ public final class UCD implements UCD_Types {
* Get the character names for the code points in a string, separated by ", "
*/
public String getName(String s, byte style) {
if (s.length() == 1) return getName(s.charAt(0), style);
if (s.length() == 1) return getName(s.charAt(0), style); // optimize BMP
StringBuffer result = new StringBuffer();
int cp;
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
cp = UTF32.char32At(s, i);
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(s, i);
if (i > 0) result.append(", ");
result.append(getName(cp, style));
}
@ -1383,7 +1384,7 @@ to guarantee identifier closure.
result.codePoint = codePoint;
if (fixStrings) {
if (result.name == null || isRemapped) result.name = constructedName;
if (result.shortName == null) result.shortName = Utility.replace(constructedName, UCD_Names.NAME_ABBREVIATIONS);
if (result.shortName == null) result.shortName = Utility.replace(result.name, UCD_Names.NAME_ABBREVIATIONS);
if (isRemapped) {
result.decompositionMapping = result.bidiMirror
= result.simpleLowercase = result.simpleUppercase = result.simpleTitlecase = result.simpleCaseFolding

View File

@ -6,93 +6,96 @@
<meta name="ProgId" content="FrontPage.Editor.Document">
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>IDN Characters</title>
<style>
<!--
.script { font-size: 150%; background-color: #CCCCCC }
.Atomic { background-color: #CCCCFF }
.Atomic-no-uppercase { background-color: #CCFFCC }
.Non-XID { background-color: #FFCCCC }
.Decomposable { background-color: #FFFFCC }
.Pattern_Syntax { background-color: #FFCCFF }
.IDN-Remapped-Case-Atomic { background-color: #CCFFFF }
.IDN-Remapped-Case-Decomposable { background-color: #66FFFF }
.IDN-Remapped-Compat { background-color: #FF6666 }
.IDN-Deleted { background-color: #66FF66 }
.IDN-Illegal { background-color: #6666FF }
th { text-align: left }
-->
</style>
<link rel="stylesheet" type="text/css" href="idn-chars.css">
</head>
<body style="margin: 2em">
<body>
<h1>IDN Character Categorization</h1>
<p><i>$Date: 2005/04/06 08:48:17 $, MED</i></p>
<p>This page lists all of the valid output IDN characters broken down by category. By &quot;output&quot; IDN
characters, we mean ones that can result from nameprep. Characters are grouped first by script, and
then by subcategory. Within each subcategory characters are sorted according to the default
<a href="http://www.unicode.org/reports/tr10/">UCA</a> order. Tool-tips provide the character code
and name (in enabled browsers).</p>
<p><i>%date%, MED</i></p>
<p>This page lists all Unicode characters relevant to IDN in a <a href="#Categorization">chart</a>,
broken down by category. Characters are grouped first by script, and then by subcategory.</p>
<p>The &quot;output&quot; IDN characters are ones that can result from nameprep, while the &quot;input&quot; characters
are those that are allowed in input, but transformed (remapped or deleted). Tool-tips provide the
character code and name (in enabled browsers). The following table describes the subcategories.
Within each subcategory characters are sorted according to the default
<a href="http://www.unicode.org/reports/tr10/">UCA</a> order.</p>
<blockquote>
<table border="1" cellpadding="2" cellspacing="0" style="border-collapse: collapse">
<table border="1" cellpadding="2" cellspacing="0">
<caption><b><font size="4">Key</font></b></caption>
<tr>
<th>Type</th>
<th>Subcategory</th>
<th>Description</th>
</tr>
<tr>
<td class="Atomic">Atomic</td>
<th rowspan="5">Output</th>
<td class="Atomic"><a name="Atomic">Atomic</a></td>
<td>Characters that don&#39;t fall into any of the following subcategories</td>
</tr>
<tr>
<td class="Atomic-no-uppercase">Atomic-no-uppercase</td>
<td>For bicameral scripts, Atomic characters without an uppercase.</td>
<td class="Atomic-no-uppercase"><a name="Atomic-no-uppercase">Atomic-no-uppercase</a></td>
<td>For bicameral scripts, Atomic characters without an uppercase. These need to be examined
to see which are used in modern languages.</td>
</tr>
<tr>
<td class="Pattern_Syntax">Pattern_Syntax</td>
<td>Characters recommended as a basis for use in pattern syntax.<p>Excludes the word
characters in <i>Section 4 Word Boundaries</i> of
<a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX# 29</a>, in the
Word_Break property and notes at the end of the section.</p>
<p>See <a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and
Pattern Syntax</a>. </td>
<td class="Pattern_Syntax"><a name="Pattern_Syntax">Pattern_Syntax</a></td>
<td>Characters recommended as a basis for use in pattern syntax. Excludes the
<a href="#Word_Characters">additional word characters</a>.</td>
</tr>
<tr>
<td class="Non-XID">Non-XID</td>
<td>Characters not recommended as a basis for identifiers, excluding Pattern_Syntax and the
word characters in <i>Section 4 Word Boundaries</i> of
<a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX# 29</a>, in the
Word_Break property and notes at the end of the section.<p>See
<a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and Pattern
Syntax</a> (XID_Continue).</td>
<td class="Non-XID"><a name="Non-XID">Non-XID</a></td>
<td>Characters not recommended as a basis for identifiers, excluding Pattern_Syntax and
<a href="#Word_Characters">additional word characters</a>.</td>
</tr>
<tr>
<td class="Decomposable">Decomposable</td>
<td>Characters with NFC decompositions.</td>
<td class="NFD-Decomposable"><a name="NFD-Decomposable">NFD-Decomposable</a></td>
<td>Characters with NFD (canonical) decompositions. These are broken out separately because
certain spoofing techniques are applied to them <i>via their decompositions.</i></td>
</tr>
<tr>
<td class="IDN-Remapped-Case-Atomic">IDN-Remapped</td>
<td>Characters remapped by IDN due to case folding</td>
<th rowspan="4">Input</th>
<td class="IDN-Remapped-Case-Atomic"><a name="IDN-Remapped-Case-Atomic">
IDN-Remapped-Case-Atomic</a></td>
<td>Atomic characters remapped by IDN due to case folding [<a href="http://ietf.org/rfc/rfc3454.txt">StringPrep</a>
Section 3.2].</td>
</tr>
<tr>
<td class="IDN-Remapped-Case-Decomposable">IDN-Remapped</td>
<td>Characters remapped by IDN due to case folding, that are decomposable.</td>
</tr>
IDN-Remapped-Case-Decomposable
<tr>
<td class="IDN-Remapped-Compat">IDN-Remapped</td>
<td>Characters remapped by IDN due to compatibility mapping.</td>
<td class="IDN-Remapped-Case-NFD-Decomposable"><a name="IDN-Remapped-Case-NFD-Decomposable">
IDN-Remapped-Case-NFD-Decomposable</a></td>
<td>Characters that are NFD (canonical) decomposable and that are remapped by IDN due to case
folding [<a href="http://ietf.org/rfc/rfc3454.txt">StringPrep</a> Section 3.2].</td>
</tr>
<tr>
<td class="IDN-Deleted">IDN-Deleted</td>
<td>Characters deleted by IDN.</td>
<td class="IDN-Remapped-Compat"><a name="IDN-Remapped-Compat">IDN-Remapped-Compat</a></td>
<td>Characters remapped by IDN due to compatibility (NFKD) mapping. [<a href="http://ietf.org/rfc/rfc3454.txt">StringPrep</a>
Section 4]</td>
</tr>
<tr>
<td class="IDN-Illegal">IDN-Illegal </td>
<td>Characters illegal in IDN (note: most of these are due to IDN's using an old version of Unicode).</td>
<td class="IDN-Deleted"><a name="IDN-Deleted">IDN-Deleted</a></td>
<td>Characters deleted by IDN, that is, mapped to nothing [<a href="http://ietf.org/rfc/rfc3454.txt">StringPrep</a>
Section 3.1]</td>
</tr>
<tr>
<th>Prohibited</th>
<td class="IDN-Prohibited"><a name="IDN-Prohibited">IDN-Prohibited </a></td>
<td>Characters prohibited in IDN [<a href="http://ietf.org/rfc/rfc3454.txt">StringPrep</a>
Section 5] (Note: most of these are due to IDN&#39;s using an old version of Unicode. IDN does
treat unassigned characters differently than explicitly prohibited characters, but for our
purposes this distinction doesn&#39;t matter.)</td>
</tr>
</table>
</blockquote>
<h3>Additional <a name="Word_Characters">Word Characters</a></h3>
<p>This is a draft list of characters based on <i>Section 4 Word Boundaries</i> of
<a href="http://www.unicode.org/reports/tr29/tr29-9.html#Word_Boundaries">UAX #29</a>, in the
Word_Break property and notes at the end of the section. While not currently a part of the
recommended characters for programming identifiers (XID_Continue), these characters have been
identified as being necessary for more &quot;natural language&quot; identifiers, since some words in some
modern languages could not be constructed without them. See also
<a href="http://www.unicode.org/reports/tr31/tr31-5.html">UAX #31: Identifier and Pattern Syntax</a>.
These characters are listed in the plain text file, as described below.</p>
<h2>Plain-Text Version</h2>
<p>The information in the categorization is also available in a plain-text file, at
<a href="idn-chars.txt">idn-chars.txt</a>. It can be viewed as is, or loaded into a spreadsheet for
sorting and filtering to view the data in different ways. The format is:</p>
@ -101,7 +104,22 @@ sorting and filtering to view the data in different ways. The format is:</p>
</blockquote>
<p><i>Examples:</i></p>
<pre>0061 ; LATIN ; Atomic # ; L&amp; (a) LATIN SMALL LETTER A
<code>026B ; LATIN ; Atomic-no-uppercase # L&amp; (ɫ) LATIN SMALL LETTER L WITH MIDDLE TILDE</code>
2015 ; COMMON ; Pattern_Syntax # Pd (―) HORIZONTAL BAR
058A ; ARMENIAN ; Atomic-no-uppercase # ; Pd (֊) ARMENIAN HYPHEN
20AC ; COMMON ; Non-XID # ; Sc (€) EURO SIGN</pre>
<h2>Categorization</h2>
<p>At the end of <a href="idn-chars.txt">idn-chars.txt</a> is a section called ADDITIONAL WORD
CHARACTERS, defined as described above. Below that is a section of FOR REVIEW characters,
sorted by Unicode general category (an additional category of XX is added for the odd characters
whose names include: <span style="font-variant: small-caps">MUSICAL SYMBOL, DINGBAT, or RADICAL</span>.)
We need review of that list to check for characters that are needed for words in modern languages,
that is, that should be moved up into the ADDITIONAL WORD CHARACTERS list. Each character in the FOR
REVIEW list is collected because it either: </p>
<ol>
<li>would not otherwise count as part of an XID, or</li>
<li>is part of a bicameral script and doesn&#39;t have an uppercase (eg, the situation for U+026B
above)</li>
</ol>
<p>In either case there is prima facie reason for some level of scrutiny, if the goal is to be
initially conservative in repertoire.</p>
<h2><a name="Categorization">Categorization</a></h2>