ICU-0 updated unicode files

X-SVN-Rev: 17248
This commit is contained in:
Mark Davis 2005-02-24 02:59:34 +00:00
parent 9c227a58f5
commit ad417f752e
6 changed files with 307 additions and 69 deletions

View File

@ -1,16 +1,16 @@
Generate: Derived.*
Generate: .*
DeltaVersion: 9
CopyrightYear: 2005
File: uax29/GraphemeBreakProperty
File: auxiliary/GraphemeBreakProperty
Property: Grapheme_Cluster_Break
Format: skipValue=Other
File: uax29/WordBreakProperty
File: auxiliary/WordBreakProperty
Property: Word_Break
Format: skipValue=Other
File: uax29/SentenceBreakProperty
File: auxiliary/SentenceBreakProperty
Property: Sentence_Break
Format: skipValue=Other

View File

@ -208,7 +208,7 @@ public final class NFSkippable extends UCDProperty {
NFSkippable skipper = new NFSkippable(mode, Default.ucd());
generateSet(out, "SKIPPABLE[" + Normalizer.getName(mode) + "]", skipper);
}
System.out.println("Done");
out.close();
}

View File

@ -15,72 +15,23 @@
#
# <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)? # <comment>
#
# <code>, <lower>, <title>, and <upper> provide character values in hex. If there is more than
# one character, they are separated by spaces. Other than as used to separate elements,
# spaces are to be ignored.
# <code>, <lower>, <title>, and <upper> provide character values in hex. If there is more
# than one character, they are separated by spaces. Other than as used to separate
# elements, spaces are to be ignored.
#
# The <condition_list> is optional. Where present, it consists of one or more locales or contexts,
# separated by spaces. In these conditions:
# The <condition_list> is optional. Where present, it consists of one or more locale IDs
# or contexts, separated by spaces. In these conditions:
# - A condition list overrides the normal behavior if all of the listed conditions are true.
# - The context is always the context of the characters in the original string,
# NOT in the resulting string.
# - Case distinctions in the condition list are not significant.
# - Conditions preceded by "Not_" represent the negation of the condition.
#
# A locale is defined as:
# <locale> := <ISO_639_code> ( "_" <ISO_3166_code> ( "_" <variant> )? )?
# <ISO_3166_code> := 2-letter ISO country code,
# <ISO_639_code> := 2-letter ISO language code
# A locale ID is defined by taking any language tag as defined by
# RFC 3066 (or its successor), and replacing '-' by '_'.
#
# A context for a character C is one of the following. This overrides Table
# 3-13. Context Specification for Casing on p. 89 of The Unicode Standard,
# Version 4.0.
#
# Definitions
# - The property "cased" is defined in D47 on that same page (p. 89)
# - A character C is defined to be "case-ignorable" if it meets either of the
# following criteria:
# A. The general category of C is Nonspacing Mark (Mn), or Enclosing Mark
# (Me), or Format Control (Cf), or Letter Modifier (Lm), or
# Symbol Modifier (Sk)
# B. C is a MidLetter as defined in UAX #29
# - A "case-ignorable sequence" is a sequence of zero or more case-ignorable
# characters.
#
# A description of each context is followed by the equivalent regular
# expression(s) describing the context before C and/or the context after C.
# The regular expression uses the syntax of UTS #18, with one addition:
# "!" means that the expression does not match. All regular expressions
# below are case-sensitive.
#
# Context: Final_Sigma
# Description: C is preceded by a sequence consisting of a cased letter and
# a case-ignorable sequence, and C is not followed by a sequence consisting
# of an ignorable sequence
# and then a cased letter.
# Before C: \p{cased} (\p{case-ignorable})*
# After C: !( (\p{case-ignorable})* \p{cased} )
#
# Context: After_Soft_Dotted
# Description: The last preceding character with combining class of zero before C was
# Soft_Dotted, and there is no intervening combining character class 230 (ABOVE).
# Before C: [\p{Soft_Dotted}] ([^\p{cc=230} \p{cc=0}])*
#
# Context: More_Above
# Description: C is followed by one or more characters of combining class
# 230 (ABOVE) in the combining character sequence.
# After C: [^\p{cc=0}]* [\p{cc=230}]
#
# Context: Before_Dot
# Description: C is followed by combining dot above (U+0307). Any sequence
# of characters with a combining class that is neither 0 nor 230 may intervene
# between the current character and the combining dot above.
# After C: ([^\p{cc=230} \p{cc=0}])* [\u0307]
#
# Context: After_I
# Description: The last preceding base character was an uppercase I, and
# there is no intervening combining character class 230 (ABOVE).
# Before C: [I] ([^\p{cc=230} \p{cc=0}])*
# A context for a character C is defined by Section 3.13 Default Case Operations,
# on p. 89-90 of The Unicode Standard, Version 4.0, as amended by Unicode 4.0.1.
#
# Parsers of this file must be prepared to deal with future additions to this format:
# * Additional contexts

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
* $Date: 2004/12/11 06:03:08 $
* $Revision: 1.16 $
* $Date: 2005/02/24 02:59:34 $
* $Revision: 1.17 $
*
*******************************************************************************
*/
@ -20,9 +20,17 @@ import java.text.SimpleDateFormat;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.ICUPropertyFactory;
import com.ibm.icu.dev.test.util.UnicodeLabel;
import com.ibm.icu.dev.test.util.UnicodeProperty;
import com.ibm.icu.impl.ICUData;
import com.ibm.icu.impl.ICUResourceBundle;
import com.ibm.icu.impl.UCharArrayIterator;
import com.ibm.icu.text.NumberFormat;
import com.ibm.icu.text.StringPrep;
import com.ibm.icu.text.StringPrepParseException;
import com.ibm.icu.util.Currency;
import com.ibm.icu.util.ULocale;
import java.math.BigDecimal;
import java.util.regex.*;
@ -35,6 +43,9 @@ public class TestData implements UCD_Types {
static UnicodeProperty.Factory upf;
public static void main (String[] args) throws IOException {
//checkChars(false);
new GenStringPrep().genStringPrep();
if (true) return;
System.out.println("main: " + Default.getDate());
upf = ICUPropertyFactory.make();
@ -138,6 +149,269 @@ public class TestData implements UCD_Types {
}
}
static class GenStringPrep {
UnicodeSet[] coreChars = new UnicodeSet[100];
UnicodeSet[] decompChars = new UnicodeSet[100];
UCD ucd = Default.ucd();
Collator uca = Collator.getInstance(ULocale.ENGLISH);
{
uca.setStrength(Collator.IDENTICAL);
}
UnicodeSet bidiR = new UnicodeSet(
"[[:Bidi_Class=AL:][:Bidi_Class=R:]]");
UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]");
UnicodeSet hasUpper = new UnicodeSet();
void genStringPrep() throws IOException {
StringBuffer inbuffer = new StringBuffer();
StringBuffer intermediate, outbuffer;
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
Utility.dot(cp);
inbuffer.setLength(0);
UTF16.append(inbuffer, cp);
try {
intermediate = IDNA.convertToASCII(inbuffer,
IDNA.USE_STD3_RULES);
if (intermediate.length() == 0)
continue;
outbuffer = IDNA.convertToUnicode(intermediate,
IDNA.USE_STD3_RULES);
} catch (StringPrepParseException e) {
continue;
} catch (Exception e) {
System.out.println("Failure at: " + Utility.hex(cp));
continue;
}
if (!TestData.equals(inbuffer, outbuffer))
continue;
int script = ucd.getScript(cp);
if (!Default.nfd().isNormalized(cp)) {
if (decompChars[script] == null)
decompChars[script] = new UnicodeSet();
decompChars[script].add(cp);
} else {
if (coreChars[script] == null)
coreChars[script] = new UnicodeSet();
coreChars[script].add(cp);
}
}
// find characters with no uppercase
for (UnicodeSetIterator it = new UnicodeSetIterator(lowercase); it.next();) {
String str = UTF16.valueOf(it.codepoint);
if (!str.equals(ucd.getCase(str, FULL, UPPER))) hasUpper.add(it.codepoint);
}
Utility.fixDot();
PrintWriter out = BagFormatter.openUTF8Writer(GEN_DIR,
"idn-chars.html");
out
.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
out.println("<title>IDN Characters</title><style>");
out.println("<!--");
out
.println(".script { font-size: 150%; background-color: #C0C0C0 }");
out.println("th { text-align: left }");
out.println("-->");
out.println("</style></head><body><table>");
for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) {
if (scriptCode == COMMON_SCRIPT
|| scriptCode == INHERITED_SCRIPT)
continue;
showCodes(out, scriptCode);
}
showCodes(out, COMMON_SCRIPT);
showCodes(out, INHERITED_SCRIPT);
out.println("</table></body></html>");
out.close();
}
UnicodeSet lowercase = new UnicodeSet("[:Lowercase:]");
/**
* @param out
* @param ucd
* @param coreChars
* @param decompChars
* @param scriptCode
*/
private void showCodes(PrintWriter out, int scriptCode) {
if (coreChars[scriptCode] == null
&& decompChars[scriptCode] == null)
return;
System.out.println(ucd.getScriptID_fromIndex((byte) scriptCode));
String script = Default.ucd().getScriptID_fromIndex(
(byte) scriptCode);
out.println();
out.println("<tr><th class='script'>Script: " + script + "</th></tr>");
UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
UnicodeSet otherCore = new UnicodeSet(core).removeAll(hasUpper);
core.removeAll(otherCore);
if (core.size() == 0) {
UnicodeSet temp = core;
core = otherCore;
otherCore = temp;
}
printlnSet(out, "Atomic", core, scriptCode);
if (otherCore.size() != 0) printlnSet(out, "Atomic [noUpper]", otherCore, scriptCode);
UnicodeSet decomp = decompChars[scriptCode];
if (decomp != null && decomp.size() != 0) printlnSet(out, "Decomposable", decomp, scriptCode);
}
/**
* @param out
* @param unicodeset
* @param uca
* @param scriptCode
*/
private void printlnSet(PrintWriter out, String title,
UnicodeSet unicodeset, int scriptCode) {
if (unicodeset == null)
return;
int size = unicodeset.size();
String dir = unicodeset.containsSome(bidiR)
&& unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
out.println("<tr><th class='" + title + "'>" + title + " ("
+ nf.format(size) + ")</th></tr>");
out.print("<tr><td" + dir + ">");
UnicodeSetIterator usi = new UnicodeSetIterator();
if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) {
usi.reset(unicodeset);
while (usi.nextRange()) {
if (usi.codepoint == usi.codepointEnd) {
out.print(formatCode(UTF16
.valueOf(usi.codepoint)));
} else {
out.print(formatCode(UTF16
.valueOf(usi.codepoint))
+ ".. "
+ formatCode(UTF16
.valueOf(usi.codepointEnd)));
}
}
} else {
Set reordered = new TreeSet(uca);
usi.reset(unicodeset);
while (usi.next()) {
boolean foo = reordered.add(usi.getString());
if (!foo)
throw new IllegalArgumentException("Collision with "
+ Default.ucd().getCodeAndName(usi.getString()));
}
for (Iterator it = reordered.iterator(); it.hasNext();) {
out.print(formatCode((String) it
.next()));
}
}
out.println("</td></tr>");
}
/**
* @param string
* @return
*/
private String formatCode(String string) {
int cat = ucd.getCategory(UTF16.charAt(string,0));
return "<span title='" + ucd.getCodeAndName(string) + "'>"
+ (cat == Me || cat == Mn ? "\u00A0" : "") //\u25cc
+ BagFormatter.toHTML.transliterate(string)
+ " </span>";
}
}
/**
 * Compares the character content of two buffers.
 *
 * @param inbuffer first buffer
 * @param outbuffer second buffer
 * @return true if both buffers have the same length and the same char at
 *         every index, false otherwise
 */
public static boolean equals(StringBuffer inbuffer, StringBuffer outbuffer) {
    final int length = inbuffer.length();
    if (length != outbuffer.length()) {
        return false;
    }
    int index = 0;
    while (index < length) {
        if (inbuffer.charAt(index) != outbuffer.charAt(index)) {
            return false;
        }
        ++index;
    }
    return true;
}
/**
 * Prints consistency reports comparing the binary case properties
 * (Uppercase, Lowercase, gc=Lt) with the actual full case mappings, followed
 * by reports on alphabetic-versus-cased mismatches restricted to the scripts
 * that contain at least one character with a lowercase mapping.
 *
 * @param mergeRanges passed to BagFormatter.setMergeRanges for the listings
 */
private static void checkChars(boolean mergeRanges) {
    UCD ucd = Default.ucd();
    ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");

    // Sets taken directly from the property source.
    UnicodeSet isUpper = ups.getSet("Uppercase=true");
    UnicodeSet isLower = ups.getSet("Lowercase=true");
    UnicodeSet isTitle = ups.getSet("gc=Lt");
    UnicodeSet alphabeticSet = ups.getSet("Alphabetic=true");
    UnicodeSet otherAlphabetic = alphabeticSet.addAll(ups.getSet("gc=Sk"));

    // Sets computed below from the case mappings and NFKD.
    UnicodeSet hasFold = new UnicodeSet();
    UnicodeSet hasUpper = new UnicodeSet();
    UnicodeSet hasLower = new UnicodeSet();
    UnicodeSet hasTitle = new UnicodeSet();
    UnicodeSet compat = new UnicodeSet();
    UnicodeSet bicameralsScripts = new UnicodeSet();

    // Only referenced by a disabled debug print that compared against 4.0.
    UCD u40 = UCD.make("4.0.0");

    BitSet scripts = new BitSet();
    for (int cp = 0; cp <= 0x10FFFF; ++cp) {
        int category = ucd.getCategory(cp);
        if (category != Cn && category != PRIVATE_USE) {
            String text = UTF16.valueOf(cp);
            if (!text.equals(ucd.getCase(text, FULL, FOLD))) {
                hasFold.add(cp);
            }
            if (!text.equals(ucd.getCase(text, FULL, UPPER))) {
                hasUpper.add(cp);
            }
            if (!text.equals(ucd.getCase(text, FULL, LOWER))) {
                hasLower.add(cp);
                // Remember every script that has a lowercase mapping.
                scripts.set(ucd.getScript(cp));
            }
            if (!text.equals(ucd.getCase(text, FULL, TITLE))) {
                hasTitle.add(cp);
            }
            if (!text.equals(Default.nfkd().normalize(text))) {
                compat.add(cp);
            }
        }
    }

    BagFormatter bf = new BagFormatter();
    bf.setMergeRanges(mergeRanges);
    bf.setUnicodePropertyFactory(ups);

    UnicodeSet upperOrTitleNoLower = new UnicodeSet(isUpper).addAll(isTitle).removeAll(hasLower);
    printItems(bf, compat, "isUpper or isTitle without hasLower", upperOrTitleNoLower);

    UnicodeSet lowerMappedOnly = new UnicodeSet(hasLower).removeAll(isTitle).removeAll(isUpper);
    printItems(bf, compat, "hasLower, but not isUpper or isTitle", lowerMappedOnly);

    UnicodeSet lowerOrTitleNoUpper = new UnicodeSet(isLower).addAll(isTitle).removeAll(hasUpper);
    printItems(bf, compat, "isLower without hasUpper", lowerOrTitleNoUpper);

    UnicodeSet upperMappedOnly = new UnicodeSet(hasUpper).removeAll(isTitle).removeAll(isLower);
    printItems(bf, compat, "hasUpper, but not isLower or isTitle", upperMappedOnly);

    // Union of all characters of the recorded scripts, Common excluded.
    UnicodeSet scriptSet = new UnicodeSet();
    UnicodeProperty scriptProp = ups.getProperty("Script");
    for (int code = scripts.nextSetBit(0); code >= 0; code = scripts.nextSetBit(code + 1)) {
        if (code != COMMON_SCRIPT) {
            String scriptName = ucd.getScriptID_fromIndex((byte) code);
            System.out.println(scriptName);
            scriptSet.addAll(scriptProp.getSet(scriptName));
        }
    }

    UnicodeSet allCased = new UnicodeSet().addAll(isUpper).addAll(isLower).addAll(isTitle);

    UnicodeSet alphaNotCased = new UnicodeSet(scriptSet).retainAll(otherAlphabetic).removeAll(allCased);
    printItems(bf, compat, "(Bicameral) isAlpha or Symbol Modifier, but not isCased", alphaNotCased);

    UnicodeSet casedNotAlpha = new UnicodeSet(scriptSet).retainAll(allCased).removeAll(otherAlphabetic);
    printItems(bf, compat, "(Bicameral) isCased, but not isAlpha or Symbol Modifier", casedNotAlpha);
}
/**
 * Prints a set to System.out in two parts: first the members that are not
 * in the compat set, then the members that are.
 *
 * @param bf formatter whose showSetNames output is printed for each part
 * @param compat set used to split temp into the two parts
 * @param title heading printed before each part, with a suffix naming the part
 * @param temp the set to report; it is copied, not modified
 */
private static void printItems(BagFormatter bf, UnicodeSet compat, String title, UnicodeSet temp) {
    final String nonCompatHeading = title + " -- (non compat)";
    final String compatHeading = title + " -- (compat)";

    System.out.println();
    System.out.println(nonCompatHeading);
    UnicodeSet outsideCompat = new UnicodeSet(temp);
    outsideCompat.removeAll(compat);
    System.out.println(bf.showSetNames(outsideCompat));

    System.out.println();
    UnicodeSet insideCompat = new UnicodeSet(temp);
    insideCompat.retainAll(compat);
    System.out.println(compatHeading);
    System.out.println(bf.showSetNames(insideCompat));
}
static PrintWriter log;
public static void checkShaping() throws IOException {

View File

@ -75,6 +75,10 @@ public class TestUnicodeInvariants {
out.write('\uFEFF'); // BOM
BufferedReader in = BagFormatter.openUTF8Reader("", "UnicodeInvariants.txt");
BagFormatter bf = new BagFormatter();
bf.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
BagFormatter bf2 = new BagFormatter();
bf2.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
bf2.setMergeRanges(false);
ChainedSymbolTable st = new ChainedSymbolTable(new SymbolTable[] {
ToolUnicodePropertySource.make("4.0.0").getSymbolTable("\u00D7"),
ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")});
@ -106,7 +110,16 @@ public class TestUnicodeInvariants {
continue;
}
char relation = 0;
// detect variables
if (line.startsWith("Show")) {
String part = line.substring(4).trim();
pp.setIndex(0);
UnicodeSet leftSet = new UnicodeSet(part, pp, st);
bf2.showSetNames(out, leftSet);
continue;
}
char relation = 0;
String rightSide = null;
String leftSide = null;
UnicodeSet leftSet = null;

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
* $Date: 2004/02/12 08:23:16 $
* $Revision: 1.25 $
* $Date: 2005/02/24 02:59:34 $
* $Revision: 1.26 $
*
*******************************************************************************
*/
@ -272,7 +272,7 @@ public class VerifyUCD implements UCD_Types {
}
System.out.println(" </tr>");
if (prop.doTotal(i, true)) printTotals("Subtotal", subtotalCount, true);
if (prop.doTotal(i, false)) printTotals("Cumulative Total", totalCount, false);
if (prop.doTotal(i, false)) printTotals("Cummulative Total", totalCount, false);
}
printTotals("Total", totalCount, false);
System.out.println("</table>");