ICU-0 updated unicode files

X-SVN-Rev: 17248
2005-02-24 02:59:34 +00:00 · 2005-02-24 02:59:34 +00:00 · ad417f752e
commit ad417f752e
parent 9c227a58f5
6 changed files with 307 additions and 69 deletions
--- a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt
@ -1,16 +1,16 @@
-Generate: Derived.*
+Generate: .*
 DeltaVersion: 9
 CopyrightYear: 2005

-File: uax29/GraphemeBreakProperty
+File: auxiliary/GraphemeBreakProperty
 Property: Grapheme_Cluster_Break
 Format:	skipValue=Other

-File: uax29/WordBreakProperty
+File: auxiliary/WordBreakProperty
 Property: Word_Break
 Format:	skipValue=Other

-File: uax29/SentenceBreakProperty
+File: auxiliary/SentenceBreakProperty
 Property: Sentence_Break
 Format:	skipValue=Other

--- a/tools/unicodetools/com/ibm/text/UCD/NFSkippable.java
+++ b/tools/unicodetools/com/ibm/text/UCD/NFSkippable.java
@ -208,7 +208,7 @@ public final class NFSkippable extends UCDProperty {
            NFSkippable skipper = new NFSkippable(mode, Default.ucd());
            generateSet(out, "SKIPPABLE[" + Normalizer.getName(mode) + "]", skipper);
        }
-        
+        System.out.println("Done");
        out.close();
    }
    
--- a/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt
@ -15,72 +15,23 @@
 #
 # <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)? # <comment>
 #
-# <code>, <lower>, <title>, and <upper> provide character values in hex. If there is more than
-# one character, they are separated by spaces. Other than as used to separate elements,
-# spaces are to be ignored.
+# <code>, <lower>, <title>, and <upper> provide character values in hex. If there is more
+# than one character, they are separated by spaces. Other than as used to separate 
+# elements, spaces are to be ignored.
 #
-# The <condition_list> is optional. Where present, it consists of one or more locales or contexts,
-# separated by spaces. In these conditions:
+# The <condition_list> is optional. Where present, it consists of one or more locale IDs
+# or contexts, separated by spaces. In these conditions:
 # - A condition list overrides the normal behavior if all of the listed conditions are true.
 # - The context is always the context of the characters in the original string,
 #   NOT in the resulting string.
 # - Case distinctions in the condition list are not significant.
 # - Conditions preceded by "Not_" represent the negation of the condition.
 #
-# A locale is defined as:
-# <locale> := <ISO_639_code> ( "_" <ISO_3166_code> ( "_" <variant> )? )?
-# <ISO_3166_code> := 2-letter ISO country code,
-# <ISO_639_code> :=  2-letter ISO language code
+# A locale ID is defined by taking any language tag as defined by
+# RFC 3066 (or its successor), and replacing '-' by '_'.
 #
-# A context for a character C is one of the following. This overrides Table
-# 3-13. Context Specification for Casing on p. 89 of The Unicode Standard,
-# Version 4.0.
-#
-# Definitions
-# - The property "cased" is defined in D47 on that same page (p. 89)
-# - A character C is defined to be "case-ignorable" if it meets either of the
-# following criteria:
-#  A. The general category of C is Nonspacing Mark (Mn), or Enclosing Mark
-#     (Me), or Format Control (Cf), or Letter Modifier (Lm), or 
-#     Symbol Modifier (Sk)
-#  B. C is a MidLetter as defined in UAX #29
-# - A "case-ignorable sequence" is a sequence of zero or more case-ignorable
-# characters.
-#
-# A description of each context is followed by the equivalent regular
-# expression(s) describing the context before C and/or the context after C.
-# The regular expression uses the syntax of UTS #18,  with one addition: 
-# "!" means that the expression does not match. All regular expressions
-# below are case-sensitive.
-#
-# Context: Final_Sigma
-# Description: C is preceded by a sequence consisting of a cased letter and
-# a case-ignorable sequence, and C is not followed by a sequence consisting 
-# of an ignorable sequence
-# and then a cased letter.
-# Before C: \p{cased} (\p{case-ignorable})*
-# After C: !( (\p{case-ignorable})* \p{cased} )
-#
-# Context: After_Soft_Dotted
-# Description: The last preceding character with combining class of zero before C was
-# Soft_Dotted,  and there is no intervening combining character class 230 (ABOVE).
-# Before C: [\p{Soft_Dotted}] ([^{cc=230} {cc=0}])*
-#
-# Context: More_Above
-# Description: C is followed by one or more characters of combining class
-# 230 (ABOVE) in the combining character sequence.
-# After C: [^\p{cc=0}]* [\p{cc=230}]
-#
-# Context: Before_Dot
-# Description: C is followed by combining dot above (U+0307). Any sequence
-# of characters with a combining class that is neither 0 nor 230 may intervene
-# between the current character and the combining dot above.
-# After C: ([^\p{cc=230} \p{cc=0}])* [\u0307]
-#
-# Context: After_I
-# Description: The last preceding base character was an uppercase I, and
-# there is no intervening combining character class 230 (ABOVE).
-# Before C: [I] ([^\p{cc=230} \p{cc=0}])
+# A context for a character C is defined by Section 3.13 Default Case Operations,
+# on p. 89-90 of The Unicode Standard, Version 4.0, as amended by Unicode 4.0.1.
 #
 # Parsers of this file must be prepared to deal with future additions to this format:
 #  * Additional contexts
--- a/tools/unicodetools/com/ibm/text/UCD/TestData.java
+++ b/tools/unicodetools/com/ibm/text/UCD/TestData.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
-* $Date: 2004/12/11 06:03:08 $
-* $Revision: 1.16 $
+* $Date: 2005/02/24 02:59:34 $
+* $Revision: 1.17 $
 *
 *******************************************************************************
 */
@ -20,9 +20,17 @@ import java.text.SimpleDateFormat;

 import com.ibm.icu.dev.test.util.BagFormatter;
 import com.ibm.icu.dev.test.util.ICUPropertyFactory;
+import com.ibm.icu.dev.test.util.UnicodeLabel;
 import com.ibm.icu.dev.test.util.UnicodeProperty;
+import com.ibm.icu.impl.ICUData;
+import com.ibm.icu.impl.ICUResourceBundle;
+import com.ibm.icu.impl.UCharArrayIterator;
 import com.ibm.icu.text.NumberFormat;
+import com.ibm.icu.text.StringPrep;
+import com.ibm.icu.text.StringPrepParseException;
 import com.ibm.icu.util.Currency;
+import com.ibm.icu.util.ULocale;
+
 import java.math.BigDecimal;

 import java.util.regex.*;
@ -35,6 +43,9 @@ public class TestData implements UCD_Types {
    static UnicodeProperty.Factory upf;
    
 	public static void main (String[] args) throws IOException {
+		//checkChars(false);
+		new GenStringPrep().genStringPrep();
+		if (true) return;
        
        System.out.println("main: " + Default.getDate());
        upf = ICUPropertyFactory.make();
@ -138,6 +149,269 @@ public class TestData implements UCD_Types {
 		}
 	}
 	
+	static class GenStringPrep { // writes idn-chars.html: code points that survive an IDNA round trip, grouped by script
+		UnicodeSet[] coreChars = new UnicodeSet[100]; // indexed by script: surviving code points that Default.nfd() reports as normalized
+		UnicodeSet[] decompChars = new UnicodeSet[100]; // indexed by script: surviving code points that Default.nfd() reports as not normalized
+		UCD ucd = Default.ucd();
+
+		Collator uca = Collator.getInstance(ULocale.ENGLISH); // comparator for the TreeSet in printlnSet
+		{
+			uca.setStrength(Collator.IDENTICAL);
+		}
+
+		UnicodeSet bidiR = new UnicodeSet(
+				"[[:Bidi_Class=AL:][:Bidi_Class=R:]]");
+
+		UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]");
+		UnicodeSet hasUpper = new UnicodeSet(); // filled by genStringPrep; read by showCodes
+
+
+		void genStringPrep() throws IOException { // scans all code points, fills the per-script sets, then writes the HTML report
+			StringBuffer inbuffer = new StringBuffer();
+			StringBuffer intermediate, outbuffer;
+			for (int cp = 0; cp <= 0x10FFFF; ++cp) {
+				Utility.dot(cp);
+				inbuffer.setLength(0);
+				UTF16.append(inbuffer, cp);
+				try {
+					intermediate = IDNA.convertToASCII(inbuffer,
+							IDNA.USE_STD3_RULES);
+					if (intermediate.length() == 0)
+						continue;
+					outbuffer = IDNA.convertToUnicode(intermediate,
+							IDNA.USE_STD3_RULES);
+				} catch (StringPrepParseException e) { // conversion rejected this code point: leave it out of the report
+					continue;
+				} catch (Exception e) {
+					System.out.println("Failure at: " + Utility.hex(cp));
+					continue;
+				}
+				if (!TestData.equals(inbuffer, outbuffer)) // keep only code points that come back unchanged
+					continue;
+				int script = ucd.getScript(cp); // NOTE(review): used as an array index; assumes script values stay below 100 -- confirm
+				if (!Default.nfd().isNormalized(cp)) {
+					if (decompChars[script] == null)
+						decompChars[script] = new UnicodeSet();
+					decompChars[script].add(cp);
+				} else {
+					if (coreChars[script] == null)
+						coreChars[script] = new UnicodeSet();
+					coreChars[script].add(cp);
+				}
+			}
+			// collect the Lowercase characters whose full uppercase mapping differs from the character itself
+			for (UnicodeSetIterator it = new UnicodeSetIterator(lowercase); it.next();) {
+				String str = UTF16.valueOf(it.codepoint);
+				if (!str.equals(ucd.getCase(str, FULL, UPPER))) hasUpper.add(it.codepoint);
+			}
+			
+			Utility.fixDot();
+			PrintWriter out = BagFormatter.openUTF8Writer(GEN_DIR,
+					"idn-chars.html");
+			out
+					.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
+			out.println("<title>IDN Characters</title><style>");
+			out.println("<!--");
+			out
+					.println(".script       { font-size: 150%; background-color: #C0C0C0 }");
+			out.println("th           { text-align: left }");
+			out.println("-->");
+			out.println("</style></head><body><table>");
+
+			for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) {
+				if (scriptCode == COMMON_SCRIPT
+						|| scriptCode == INHERITED_SCRIPT) // these two are emitted last, after the loop
+					continue;
+				showCodes(out, scriptCode);
+			}
+			showCodes(out, COMMON_SCRIPT);
+			showCodes(out, INHERITED_SCRIPT);
+			out.println("</table></body></html>");
+			out.close();
+		}
+		
+		UnicodeSet lowercase = new UnicodeSet("[:Lowercase:]");
+
+		/**
+		 * Writes the table rows for one script: a script header, the "Atomic" set,
+		 * an optional "Atomic [noUpper]" set, and an optional "Decomposable" set.
+		 * Returns without output when both coreChars and decompChars are null for the script.
+		 * @param out writer receiving the HTML rows
+		 * @param scriptCode index into coreChars and decompChars
+		 */
+		private void showCodes(PrintWriter out, int scriptCode) {
+			if (coreChars[scriptCode] == null
+					&& decompChars[scriptCode] == null)
+				return;
+			System.out.println(ucd.getScriptID_fromIndex((byte) scriptCode));
+			String script = Default.ucd().getScriptID_fromIndex(
+					(byte) scriptCode);
+			out.println();
+			out.println("<tr><th class='script'>Script: " + script + "</th></tr>");
+			UnicodeSet core = new UnicodeSet(coreChars[scriptCode]); // NOTE(review): may be null here when only decompChars is set -- confirm the copy constructor accepts null
+			UnicodeSet otherCore = new UnicodeSet(core).removeAll(hasUpper); // members of core that are not in hasUpper
+			core.removeAll(otherCore); // core now holds only members of hasUpper
+			if (core.size() == 0) { // no member is in hasUpper: swap so everything is listed under the plain "Atomic" title
+				UnicodeSet temp = core;
+				core = otherCore;
+				otherCore = temp;
+			}
+			printlnSet(out, "Atomic", core, scriptCode);
+			if (otherCore.size() != 0) printlnSet(out, "Atomic [noUpper]", otherCore, scriptCode);							
+			UnicodeSet decomp = decompChars[scriptCode];
+			if (decomp != null && decomp.size() != 0) printlnSet(out, "Decomposable", decomp, scriptCode);
+		}
+
+		/** Writes a header row with the title and set size, then a row listing the set's characters.
+		 * @param out writer receiving the HTML rows
+		 * @param title section label; also emitted as the class attribute of the header cell
+		 * @param unicodeset characters to list; nothing is written when null
+		 * @param scriptCode script index; HAN_SCRIPT and HANGUL_SCRIPT are listed as ranges, all others in uca order
+		 */
+		private  void printlnSet(PrintWriter out, String title,
+				UnicodeSet unicodeset, int scriptCode) {
+			if (unicodeset == null)
+				return;
+			int size = unicodeset.size();
+			String dir = unicodeset.containsSome(bidiR) // rtl only when the set has AL/R characters and no L characters
+					&& unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
+			out.println("<tr><th class='" + title + "'>" + title + " ("
+					+ nf.format(size) + ")</th></tr>");
+			out.print("<tr><td" + dir + ">");
+			UnicodeSetIterator usi = new UnicodeSetIterator();
+			if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) {
+				usi.reset(unicodeset);
+				while (usi.nextRange()) {
+					if (usi.codepoint == usi.codepointEnd) {
+						out.print(formatCode(UTF16
+								.valueOf(usi.codepoint)));
+					} else {
+						out.print(formatCode(UTF16
+								.valueOf(usi.codepoint))
+								+ ".. "
+								+ formatCode(UTF16
+										.valueOf(usi.codepointEnd)));
+					}
+				}
+			} else {
+				Set reordered = new TreeSet(uca);
+				usi.reset(unicodeset);
+				while (usi.next()) {
+					boolean foo = reordered.add(usi.getString()); // false when uca compares this string equal to one already added
+					if (!foo)
+						throw new IllegalArgumentException("Collision with "
+								+ Default.ucd().getCodeAndName(usi.getString()));
+				}
+				for (Iterator it = reordered.iterator(); it.hasNext();) {
+					out.print(formatCode((String) it
+							.next()));
+				}
+			}
+			out.println("</td></tr>");
+		}
+
+		/** Builds the HTML span for one entry; its title attribute is ucd.getCodeAndName(string).
+		 * @param string text to show; a no-break space is put in front when its first code point has category Me or Mn
+		 * @return the span markup, with one space before the closing tag
+		 */
+		private String formatCode(String string) {
+			int cat = ucd.getCategory(UTF16.charAt(string,0));
+			return "<span title='" + ucd.getCodeAndName(string) + "'>"
+			+ (cat == Me || cat == Mn ? "\u00A0" : "") //\u25cc
+			+ BagFormatter.toHTML.transliterate(string)
+			+ " </span>";
+		}
+	}
+	
+	/** Tests whether two buffers hold the same sequence of chars.
+	 * @param inbuffer first buffer to compare
+	 * @param outbuffer second buffer to compare
+	 * @return true when the lengths match and the chars at every index are equal
+	 */
+	public static boolean equals(StringBuffer inbuffer, StringBuffer outbuffer) {
+		if (inbuffer.length() != outbuffer.length()) return false;
+		for (int i = inbuffer.length() - 1; i >= 0; --i) { // walks from the last index down to 0
+			if (inbuffer.charAt(i) != outbuffer.charAt(i)) return false;
+		}
+		return true;
+	}
+
+	private static void checkChars(boolean mergeRanges) { // prints case-property consistency reports to System.out; mergeRanges is handed to BagFormatter.setMergeRanges
+		UCD ucd = Default.ucd();
+		ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
+		UnicodeSet isUpper = ups.getSet("Uppercase=true");
+		UnicodeSet isLower = ups.getSet("Lowercase=true");
+		UnicodeSet isTitle = ups.getSet("gc=Lt");
+		UnicodeSet otherAlphabetic = ups.getSet("Alphabetic=true").addAll(ups.getSet("gc=Sk"));
+		// sets filled by the scan over all code points below
+		UnicodeSet hasFold = new UnicodeSet();
+		UnicodeSet hasUpper = new UnicodeSet();
+		UnicodeSet hasLower = new UnicodeSet();
+		UnicodeSet hasTitle = new UnicodeSet();
+		UnicodeSet compat = new UnicodeSet();
+		UnicodeSet bicameralsScripts = new UnicodeSet(); // NOTE(review): never read or written again in this method
+
+		UCD u40 = UCD.make("4.0.0"); // only referenced by the commented-out println inside the loop
+		BitSet scripts = new BitSet();
+		for (int i = 0; i <= 0x10FFFF; ++i) {
+			int gc = ucd.getCategory(i);
+			if (gc == Cn || gc == PRIVATE_USE) continue;
+			String str = UTF16.valueOf(i);
+			if (!str.equals(ucd.getCase(str, FULL, FOLD))) hasFold.add(i);
+			if (!str.equals(ucd.getCase(str, FULL, UPPER))) hasUpper.add(i);
+			if (!str.equals(ucd.getCase(str, FULL, LOWER))) {
+				hasLower.add(i);
+				scripts.set(ucd.getScript(i)); // remember every script that has a character with a lowercase mapping
+			}
+			if (!str.equals(ucd.getCase(str, FULL, TITLE))) hasTitle.add(i);
+			if (!str.equals(Default.nfkd().normalize(str))) compat.add(i);
+			//System.out.println(ucd.getCodeAndName(i) + "\t" + (u40.isAllocated(i) ? "already in 4.0" : "new in 4.1"));
+		}
+		BagFormatter bf = new BagFormatter();
+		bf.setMergeRanges(mergeRanges);
+		bf.setUnicodePropertyFactory(ups);
+		printItems(bf, compat, "isUpper or isTitle without hasLower", 
+				new UnicodeSet(isUpper).addAll(isTitle).removeAll(hasLower));
+		printItems(bf, compat, "hasLower, but not isUpper or isTitle", 
+				new UnicodeSet(hasLower).removeAll(isTitle).removeAll(isUpper));
+		printItems(bf, compat, "isLower without hasUpper", 
+				new UnicodeSet(isLower).addAll(isTitle).removeAll(hasUpper)); // NOTE(review): isTitle is added although the title names only isLower -- confirm intent
+		printItems(bf, compat, "hasUpper, but not isLower or isTitle", 
+				new UnicodeSet(hasUpper).removeAll(isTitle).removeAll(isLower));
+
+		UnicodeSet scriptSet = new UnicodeSet(); // union of the Script sets for every script recorded in the BitSet, except COMMON_SCRIPT
+		UnicodeProperty scriptProp = ups.getProperty("Script");
+		for (int i = 0; i < scripts.size(); ++i) {
+			if (!scripts.get(i)) continue;
+			if (i == COMMON_SCRIPT) continue;
+			String scriptName = ucd.getScriptID_fromIndex((byte)i);
+			System.out.println(scriptName);
+			scriptSet.addAll(scriptProp.getSet(scriptName));
+		}
+		UnicodeSet allCased = new UnicodeSet().addAll(isUpper).addAll(isLower).addAll(isTitle);
+		printItems(bf, compat, "(Bicameral) isAlpha or Symbol Modifier, but not isCased", 
+				new UnicodeSet(scriptSet).retainAll(otherAlphabetic).removeAll(allCased));
+		printItems(bf, compat, "(Bicameral) isCased, but not isAlpha or Symbol Modifier", 
+				new UnicodeSet(scriptSet).retainAll(allCased).removeAll(otherAlphabetic));
+	}
+
+	
+	/** Prints a set to System.out in two parts: members outside compat, then members inside compat.
+	 * @param bf formatter whose showSetNames output is printed for each part
+	 * @param compat set used to split temp into the two parts
+	 * @param title heading printed before each part, suffixed with " -- (non compat)" or " -- (compat)"
+	 * @param temp the set to report; it is copied, not modified */
+	private static void printItems(BagFormatter bf, UnicodeSet compat, String title, UnicodeSet temp) {
+		System.out.println();
+		System.out.println(title + " -- (non compat)");		
+		UnicodeSet temp2 = new UnicodeSet(temp).removeAll(compat);
+		System.out.println(bf.showSetNames(temp2));
+		System.out.println();
+		temp2 = new UnicodeSet(temp).retainAll(compat);
+		System.out.println(title + " -- (compat)");
+		System.out.println(bf.showSetNames(temp2));
+	}
+
 	static PrintWriter log;
 	
    public static void checkShaping() throws IOException {
--- a/tools/unicodetools/com/ibm/text/UCD/TestUnicodeInvariants.java
+++ b/tools/unicodetools/com/ibm/text/UCD/TestUnicodeInvariants.java
@ -75,6 +75,10 @@ public class TestUnicodeInvariants {
       out.write('\uFEFF'); // BOM
       BufferedReader in = BagFormatter.openUTF8Reader("", "UnicodeInvariants.txt");
       BagFormatter bf = new BagFormatter();
+       bf.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
+       BagFormatter bf2 = new BagFormatter();
+       bf2.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
+       bf2.setMergeRanges(false);
       ChainedSymbolTable st = new ChainedSymbolTable(new SymbolTable[] {
           ToolUnicodePropertySource.make("4.0.0").getSymbolTable("\u00D7"),
           ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")});
@ -106,7 +110,16 @@ public class TestUnicodeInvariants {
               continue;
           }

-           char relation = 0;
+           // detect variables
+           if (line.startsWith("Show")) {
+           		String part = line.substring(4).trim();
+           		pp.setIndex(0);
+           		UnicodeSet leftSet = new UnicodeSet(part, pp, st);
+           		bf2.showSetNames(out, leftSet);
+				continue;
+           }
+
+          char relation = 0;
           String rightSide = null;
           String leftSide = null;
           UnicodeSet leftSet = null;
--- a/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java
+++ b/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
-* $Date: 2004/02/12 08:23:16 $
-* $Revision: 1.25 $
+* $Date: 2005/02/24 02:59:34 $
+* $Revision: 1.26 $
 *
 *******************************************************************************
 */
@ -272,7 +272,7 @@ public class VerifyUCD implements UCD_Types {
 			}
 			System.out.println(" </tr>");
 			if (prop.doTotal(i, true)) printTotals("Subtotal", subtotalCount, true);
-			if (prop.doTotal(i, false)) printTotals("Cumulative Total", totalCount, false);
+			if (prop.doTotal(i, false)) printTotals("Cummulative Total", totalCount, false);
 		}
 		printTotals("Total", totalCount, false);
 		System.out.println("</table>");