ICU-0 updated unicode files
X-SVN-Rev: 17248
This commit is contained in:
parent
9c227a58f5
commit
ad417f752e
@ -1,16 +1,16 @@
|
||||
Generate: Derived.*
|
||||
Generate: .*
|
||||
DeltaVersion: 9
|
||||
CopyrightYear: 2005
|
||||
|
||||
File: uax29/GraphemeBreakProperty
|
||||
File: auxiliary/GraphemeBreakProperty
|
||||
Property: Grapheme_Cluster_Break
|
||||
Format: skipValue=Other
|
||||
|
||||
File: uax29/WordBreakProperty
|
||||
File: auxiliary/WordBreakProperty
|
||||
Property: Word_Break
|
||||
Format: skipValue=Other
|
||||
|
||||
File: uax29/SentenceBreakProperty
|
||||
File: auxiliary/SentenceBreakProperty
|
||||
Property: Sentence_Break
|
||||
Format: skipValue=Other
|
||||
|
||||
|
@ -208,7 +208,7 @@ public final class NFSkippable extends UCDProperty {
|
||||
NFSkippable skipper = new NFSkippable(mode, Default.ucd());
|
||||
generateSet(out, "SKIPPABLE[" + Normalizer.getName(mode) + "]", skipper);
|
||||
}
|
||||
|
||||
System.out.println("Done");
|
||||
out.close();
|
||||
}
|
||||
|
||||
|
@ -15,72 +15,23 @@
|
||||
#
|
||||
# <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)? # <comment>
|
||||
#
|
||||
# <code>, <lower>, <title>, and <upper> provide character values in hex. If there is more than
|
||||
# one character, they are separated by spaces. Other than as used to separate elements,
|
||||
# spaces are to be ignored.
|
||||
# <code>, <lower>, <title>, and <upper> provide character values in hex. If there is more
|
||||
# than one character, they are separated by spaces. Other than as used to separate
|
||||
# elements, spaces are to be ignored.
|
||||
#
|
||||
# The <condition_list> is optional. Where present, it consists of one or more locales or contexts,
|
||||
# separated by spaces. In these conditions:
|
||||
# The <condition_list> is optional. Where present, it consists of one or more locale IDs
|
||||
# or contexts, separated by spaces. In these conditions:
|
||||
# - A condition list overrides the normal behavior if all of the listed conditions are true.
|
||||
# - The context is always the context of the characters in the original string,
|
||||
# NOT in the resulting string.
|
||||
# - Case distinctions in the condition list are not significant.
|
||||
# - Conditions preceded by "Not_" represent the negation of the condition.
|
||||
#
|
||||
# A locale is defined as:
|
||||
# <locale> := <ISO_639_code> ( "_" <ISO_3166_code> ( "_" <variant> )? )?
|
||||
# <ISO_3166_code> := 2-letter ISO country code,
|
||||
# <ISO_639_code> := 2-letter ISO language code
|
||||
# A locale ID is defined by taking any language tag as defined by
|
||||
# RFC 3066 (or its successor), and replacing '-' by '_'.
|
||||
#
|
||||
# A context for a character C is one of the following. This overrides Table
|
||||
# 3-13. Context Specification for Casing on p. 89 of The Unicode Standard,
|
||||
# Version 4.0.
|
||||
#
|
||||
# Definitions
|
||||
# - The property "cased" is defined in D47 on that same page (p. 89)
|
||||
# - A character C is defined to be "case-ignorable" if it meets either of the
|
||||
# following criteria:
|
||||
# A. The general category of C is Nonspacing Mark (Mn), or Enclosing Mark
|
||||
# (Me), or Format Control (Cf), or Letter Modifier (Lm), or
|
||||
# Symbol Modifier (Sk)
|
||||
# B. C is a MidLetter as defined in UAX #29
|
||||
# - A "case-ignorable sequence" is a sequence of zero or more case-ignorable
|
||||
# characters.
|
||||
#
|
||||
# A description of each context is followed by the equivalent regular
|
||||
# expression(s) describing the context before C and/or the context after C.
|
||||
# The regular expression uses the syntax of UTS #18, with one addition:
|
||||
# "!" means that the expression does not match. All regular expressions
|
||||
# below are case-sensitive.
|
||||
#
|
||||
# Context: Final_Sigma
|
||||
# Description: C is preceded by a sequence consisting of a cased letter and
|
||||
# a case-ignorable sequence, and C is not followed by a sequence consisting
|
||||
# of an ignorable sequence
|
||||
# and then a cased letter.
|
||||
# Before C: \p{cased} (\p{case-ignorable})*
|
||||
# After C: !( (\p{case-ignorable})* \p{cased} )
|
||||
#
|
||||
# Context: After_Soft_Dotted
|
||||
# Description: The last preceding character with combining class of zero before C was
|
||||
# Soft_Dotted, and there is no intervening combining character class 230 (ABOVE).
|
||||
# Before C: [\p{Soft_Dotted}] ([^{cc=230} {cc=0}])*
|
||||
#
|
||||
# Context: More_Above
|
||||
# Description: C is followed by one or more characters of combining class
|
||||
# 230 (ABOVE) in the combining character sequence.
|
||||
# After C: [^\p{cc=0}]* [\p{cc=230}]
|
||||
#
|
||||
# Context: Before_Dot
|
||||
# Description: C is followed by combining dot above (U+0307). Any sequence
|
||||
# of characters with a combining class that is neither 0 nor 230 may intervene
|
||||
# between the current character and the combining dot above.
|
||||
# After C: ([^\p{cc=230} \p{cc=0}])* [\u0307]
|
||||
#
|
||||
# Context: After_I
|
||||
# Description: The last preceding base character was an uppercase I, and
|
||||
# there is no intervening combining character class 230 (ABOVE).
|
||||
# Before C: [I] ([^\p{cc=230} \p{cc=0}])
|
||||
# A context for a character C is defined by Section 3.13 Default Case Operations,
|
||||
# on p. 89-90 of The Unicode Standard, Version 4.0, as amended by Unicode 4.0.1.
|
||||
#
|
||||
# Parsers of this file must be prepared to deal with future additions to this format:
|
||||
# * Additional contexts
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
|
||||
* $Date: 2004/12/11 06:03:08 $
|
||||
* $Revision: 1.16 $
|
||||
* $Date: 2005/02/24 02:59:34 $
|
||||
* $Revision: 1.17 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -20,9 +20,17 @@ import java.text.SimpleDateFormat;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.ICUPropertyFactory;
|
||||
import com.ibm.icu.dev.test.util.UnicodeLabel;
|
||||
import com.ibm.icu.dev.test.util.UnicodeProperty;
|
||||
import com.ibm.icu.impl.ICUData;
|
||||
import com.ibm.icu.impl.ICUResourceBundle;
|
||||
import com.ibm.icu.impl.UCharArrayIterator;
|
||||
import com.ibm.icu.text.NumberFormat;
|
||||
import com.ibm.icu.text.StringPrep;
|
||||
import com.ibm.icu.text.StringPrepParseException;
|
||||
import com.ibm.icu.util.Currency;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
import java.math.BigDecimal;
|
||||
|
||||
import java.util.regex.*;
|
||||
@ -35,6 +43,9 @@ public class TestData implements UCD_Types {
|
||||
static UnicodeProperty.Factory upf;
|
||||
|
||||
public static void main (String[] args) throws IOException {
|
||||
//checkChars(false);
|
||||
new GenStringPrep().genStringPrep();
|
||||
if (true) return;
|
||||
|
||||
System.out.println("main: " + Default.getDate());
|
||||
upf = ICUPropertyFactory.make();
|
||||
@ -138,6 +149,269 @@ public class TestData implements UCD_Types {
|
||||
}
|
||||
}
|
||||
|
||||
static class GenStringPrep {
|
||||
UnicodeSet[] coreChars = new UnicodeSet[100];
|
||||
UnicodeSet[] decompChars = new UnicodeSet[100];
|
||||
UCD ucd = Default.ucd();
|
||||
|
||||
Collator uca = Collator.getInstance(ULocale.ENGLISH);
|
||||
{
|
||||
uca.setStrength(Collator.IDENTICAL);
|
||||
}
|
||||
|
||||
UnicodeSet bidiR = new UnicodeSet(
|
||||
"[[:Bidi_Class=AL:][:Bidi_Class=R:]]");
|
||||
|
||||
UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]");
|
||||
UnicodeSet hasUpper = new UnicodeSet();
|
||||
|
||||
|
||||
void genStringPrep() throws IOException {
|
||||
StringBuffer inbuffer = new StringBuffer();
|
||||
StringBuffer intermediate, outbuffer;
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
Utility.dot(cp);
|
||||
inbuffer.setLength(0);
|
||||
UTF16.append(inbuffer, cp);
|
||||
try {
|
||||
intermediate = IDNA.convertToASCII(inbuffer,
|
||||
IDNA.USE_STD3_RULES);
|
||||
if (intermediate.length() == 0)
|
||||
continue;
|
||||
outbuffer = IDNA.convertToUnicode(intermediate,
|
||||
IDNA.USE_STD3_RULES);
|
||||
} catch (StringPrepParseException e) {
|
||||
continue;
|
||||
} catch (Exception e) {
|
||||
System.out.println("Failure at: " + Utility.hex(cp));
|
||||
continue;
|
||||
}
|
||||
if (!TestData.equals(inbuffer, outbuffer))
|
||||
continue;
|
||||
int script = ucd.getScript(cp);
|
||||
if (!Default.nfd().isNormalized(cp)) {
|
||||
if (decompChars[script] == null)
|
||||
decompChars[script] = new UnicodeSet();
|
||||
decompChars[script].add(cp);
|
||||
} else {
|
||||
if (coreChars[script] == null)
|
||||
coreChars[script] = new UnicodeSet();
|
||||
coreChars[script].add(cp);
|
||||
}
|
||||
}
|
||||
// find characters with no uppercase
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(lowercase); it.next();) {
|
||||
String str = UTF16.valueOf(it.codepoint);
|
||||
if (!str.equals(ucd.getCase(str, FULL, UPPER))) hasUpper.add(it.codepoint);
|
||||
}
|
||||
|
||||
Utility.fixDot();
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(GEN_DIR,
|
||||
"idn-chars.html");
|
||||
out
|
||||
.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
|
||||
out.println("<title>IDN Characters</title><style>");
|
||||
out.println("<!--");
|
||||
out
|
||||
.println(".script { font-size: 150%; background-color: #C0C0C0 }");
|
||||
out.println("th { text-align: left }");
|
||||
out.println("-->");
|
||||
out.println("</style></head><body><table>");
|
||||
|
||||
for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) {
|
||||
if (scriptCode == COMMON_SCRIPT
|
||||
|| scriptCode == INHERITED_SCRIPT)
|
||||
continue;
|
||||
showCodes(out, scriptCode);
|
||||
}
|
||||
showCodes(out, COMMON_SCRIPT);
|
||||
showCodes(out, INHERITED_SCRIPT);
|
||||
out.println("</table></body></html>");
|
||||
out.close();
|
||||
}
|
||||
|
||||
UnicodeSet lowercase = new UnicodeSet("[:Lowercase:]");
|
||||
|
||||
/**
|
||||
* @param out
|
||||
* @param ucd
|
||||
* @param coreChars
|
||||
* @param decompChars
|
||||
* @param scriptCode
|
||||
*/
|
||||
private void showCodes(PrintWriter out, int scriptCode) {
|
||||
if (coreChars[scriptCode] == null
|
||||
&& decompChars[scriptCode] == null)
|
||||
return;
|
||||
System.out.println(ucd.getScriptID_fromIndex((byte) scriptCode));
|
||||
String script = Default.ucd().getScriptID_fromIndex(
|
||||
(byte) scriptCode);
|
||||
out.println();
|
||||
out.println("<tr><th class='script'>Script: " + script + "</th></tr>");
|
||||
UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
|
||||
UnicodeSet otherCore = new UnicodeSet(core).removeAll(hasUpper);
|
||||
core.removeAll(otherCore);
|
||||
if (core.size() == 0) {
|
||||
UnicodeSet temp = core;
|
||||
core = otherCore;
|
||||
otherCore = temp;
|
||||
}
|
||||
printlnSet(out, "Atomic", core, scriptCode);
|
||||
if (otherCore.size() != 0) printlnSet(out, "Atomic [noUpper]", otherCore, scriptCode);
|
||||
UnicodeSet decomp = decompChars[scriptCode];
|
||||
if (decomp != null && decomp.size() != 0) printlnSet(out, "Decomposable", decomp, scriptCode);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param out
|
||||
* @param unicodeset
|
||||
* @param uca
|
||||
* @param scriptCode
|
||||
*/
|
||||
private void printlnSet(PrintWriter out, String title,
|
||||
UnicodeSet unicodeset, int scriptCode) {
|
||||
if (unicodeset == null)
|
||||
return;
|
||||
int size = unicodeset.size();
|
||||
String dir = unicodeset.containsSome(bidiR)
|
||||
&& unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
|
||||
out.println("<tr><th class='" + title + "'>" + title + " ("
|
||||
+ nf.format(size) + ")</th></tr>");
|
||||
out.print("<tr><td" + dir + ">");
|
||||
UnicodeSetIterator usi = new UnicodeSetIterator();
|
||||
if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) {
|
||||
usi.reset(unicodeset);
|
||||
while (usi.nextRange()) {
|
||||
if (usi.codepoint == usi.codepointEnd) {
|
||||
out.print(formatCode(UTF16
|
||||
.valueOf(usi.codepoint)));
|
||||
} else {
|
||||
out.print(formatCode(UTF16
|
||||
.valueOf(usi.codepoint))
|
||||
+ ".. "
|
||||
+ formatCode(UTF16
|
||||
.valueOf(usi.codepointEnd)));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Set reordered = new TreeSet(uca);
|
||||
usi.reset(unicodeset);
|
||||
while (usi.next()) {
|
||||
boolean foo = reordered.add(usi.getString());
|
||||
if (!foo)
|
||||
throw new IllegalArgumentException("Collision with "
|
||||
+ Default.ucd().getCodeAndName(usi.getString()));
|
||||
}
|
||||
for (Iterator it = reordered.iterator(); it.hasNext();) {
|
||||
out.print(formatCode((String) it
|
||||
.next()));
|
||||
}
|
||||
}
|
||||
out.println("</td></tr>");
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string
|
||||
* @return
|
||||
*/
|
||||
private String formatCode(String string) {
|
||||
int cat = ucd.getCategory(UTF16.charAt(string,0));
|
||||
return "<span title='" + ucd.getCodeAndName(string) + "'>"
|
||||
+ (cat == Me || cat == Mn ? "\u00A0" : "") //\u25cc
|
||||
+ BagFormatter.toHTML.transliterate(string)
|
||||
+ " </span>";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param inbuffer
|
||||
* @param outbuffer
|
||||
* @return
|
||||
*/
|
||||
public static boolean equals(StringBuffer inbuffer, StringBuffer outbuffer) {
|
||||
if (inbuffer.length() != outbuffer.length()) return false;
|
||||
for (int i = inbuffer.length() - 1; i >= 0; --i) {
|
||||
if (inbuffer.charAt(i) != outbuffer.charAt(i)) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private static void checkChars(boolean mergeRanges) {
|
||||
UCD ucd = Default.ucd();
|
||||
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
|
||||
UnicodeSet isUpper = ups.getSet("Uppercase=true");
|
||||
UnicodeSet isLower = ups.getSet("Lowercase=true");
|
||||
UnicodeSet isTitle = ups.getSet("gc=Lt");
|
||||
UnicodeSet otherAlphabetic = ups.getSet("Alphabetic=true").addAll(ups.getSet("gc=Sk"));
|
||||
// create the following
|
||||
UnicodeSet hasFold = new UnicodeSet();
|
||||
UnicodeSet hasUpper = new UnicodeSet();
|
||||
UnicodeSet hasLower = new UnicodeSet();
|
||||
UnicodeSet hasTitle = new UnicodeSet();
|
||||
UnicodeSet compat = new UnicodeSet();
|
||||
UnicodeSet bicameralsScripts = new UnicodeSet();
|
||||
|
||||
UCD u40 = UCD.make("4.0.0");
|
||||
BitSet scripts = new BitSet();
|
||||
for (int i = 0; i <= 0x10FFFF; ++i) {
|
||||
int gc = ucd.getCategory(i);
|
||||
if (gc == Cn || gc == PRIVATE_USE) continue;
|
||||
String str = UTF16.valueOf(i);
|
||||
if (!str.equals(ucd.getCase(str, FULL, FOLD))) hasFold.add(i);
|
||||
if (!str.equals(ucd.getCase(str, FULL, UPPER))) hasUpper.add(i);
|
||||
if (!str.equals(ucd.getCase(str, FULL, LOWER))) {
|
||||
hasLower.add(i);
|
||||
scripts.set(ucd.getScript(i));
|
||||
}
|
||||
if (!str.equals(ucd.getCase(str, FULL, TITLE))) hasTitle.add(i);
|
||||
if (!str.equals(Default.nfkd().normalize(str))) compat.add(i);
|
||||
//System.out.println(ucd.getCodeAndName(i) + "\t" + (u40.isAllocated(i) ? "already in 4.0" : "new in 4.1"));
|
||||
}
|
||||
BagFormatter bf = new BagFormatter();
|
||||
bf.setMergeRanges(mergeRanges);
|
||||
bf.setUnicodePropertyFactory(ups);
|
||||
printItems(bf, compat, "isUpper or isTitle without hasLower",
|
||||
new UnicodeSet(isUpper).addAll(isTitle).removeAll(hasLower));
|
||||
printItems(bf, compat, "hasLower, but not isUpper or isTitle",
|
||||
new UnicodeSet(hasLower).removeAll(isTitle).removeAll(isUpper));
|
||||
printItems(bf, compat, "isLower without hasUpper",
|
||||
new UnicodeSet(isLower).addAll(isTitle).removeAll(hasUpper));
|
||||
printItems(bf, compat, "hasUpper, but not isLower or isTitle",
|
||||
new UnicodeSet(hasUpper).removeAll(isTitle).removeAll(isLower));
|
||||
|
||||
UnicodeSet scriptSet = new UnicodeSet();
|
||||
UnicodeProperty scriptProp = ups.getProperty("Script");
|
||||
for (int i = 0; i < scripts.size(); ++i) {
|
||||
if (!scripts.get(i)) continue;
|
||||
if (i == COMMON_SCRIPT) continue;
|
||||
String scriptName = ucd.getScriptID_fromIndex((byte)i);
|
||||
System.out.println(scriptName);
|
||||
scriptSet.addAll(scriptProp.getSet(scriptName));
|
||||
}
|
||||
UnicodeSet allCased = new UnicodeSet().addAll(isUpper).addAll(isLower).addAll(isTitle);
|
||||
printItems(bf, compat, "(Bicameral) isAlpha or Symbol Modifier, but not isCased",
|
||||
new UnicodeSet(scriptSet).retainAll(otherAlphabetic).removeAll(allCased));
|
||||
printItems(bf, compat, "(Bicameral) isCased, but not isAlpha or Symbol Modifier",
|
||||
new UnicodeSet(scriptSet).retainAll(allCased).removeAll(otherAlphabetic));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param bf
|
||||
* @param compat
|
||||
* @param temp
|
||||
*/
|
||||
private static void printItems(BagFormatter bf, UnicodeSet compat, String title, UnicodeSet temp) {
|
||||
System.out.println();
|
||||
System.out.println(title + " -- (non compat)");
|
||||
UnicodeSet temp2 = new UnicodeSet(temp).removeAll(compat);
|
||||
System.out.println(bf.showSetNames(temp2));
|
||||
System.out.println();
|
||||
temp2 = new UnicodeSet(temp).retainAll(compat);
|
||||
System.out.println(title + " -- (compat)");
|
||||
System.out.println(bf.showSetNames(temp2));
|
||||
}
|
||||
|
||||
static PrintWriter log;
|
||||
|
||||
public static void checkShaping() throws IOException {
|
||||
|
@ -75,6 +75,10 @@ public class TestUnicodeInvariants {
|
||||
out.write('\uFEFF'); // BOM
|
||||
BufferedReader in = BagFormatter.openUTF8Reader("", "UnicodeInvariants.txt");
|
||||
BagFormatter bf = new BagFormatter();
|
||||
bf.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
|
||||
BagFormatter bf2 = new BagFormatter();
|
||||
bf2.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
|
||||
bf2.setMergeRanges(false);
|
||||
ChainedSymbolTable st = new ChainedSymbolTable(new SymbolTable[] {
|
||||
ToolUnicodePropertySource.make("4.0.0").getSymbolTable("\u00D7"),
|
||||
ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")});
|
||||
@ -106,7 +110,16 @@ public class TestUnicodeInvariants {
|
||||
continue;
|
||||
}
|
||||
|
||||
char relation = 0;
|
||||
// detect variables
|
||||
if (line.startsWith("Show")) {
|
||||
String part = line.substring(4).trim();
|
||||
pp.setIndex(0);
|
||||
UnicodeSet leftSet = new UnicodeSet(part, pp, st);
|
||||
bf2.showSetNames(out, leftSet);
|
||||
continue;
|
||||
}
|
||||
|
||||
char relation = 0;
|
||||
String rightSide = null;
|
||||
String leftSide = null;
|
||||
UnicodeSet leftSet = null;
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
|
||||
* $Date: 2004/02/12 08:23:16 $
|
||||
* $Revision: 1.25 $
|
||||
* $Date: 2005/02/24 02:59:34 $
|
||||
* $Revision: 1.26 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -272,7 +272,7 @@ public class VerifyUCD implements UCD_Types {
|
||||
}
|
||||
System.out.println(" </tr>");
|
||||
if (prop.doTotal(i, true)) printTotals("Subtotal", subtotalCount, true);
|
||||
if (prop.doTotal(i, false)) printTotals("Cumulative Total", totalCount, false);
|
||||
if (prop.doTotal(i, false)) printTotals("Cummulative Total", totalCount, false);
|
||||
}
|
||||
printTotals("Total", totalCount, false);
|
||||
System.out.println("</table>");
|
||||
|
Loading…
Reference in New Issue
Block a user