ICU-5520 updated tools

X-SVN-Rev: 20684
2006-11-27 23:15:21 +00:00 · 2006-11-27 23:15:21 +00:00 · f559c01e8b
commit f559c01e8b
parent 89d5004e8b
7 changed files with 1275 additions and 695 deletions
--- a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java
+++ b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java
@ -540,7 +540,7 @@ public class MakeUnicodeFiles {
                  = ToolUnicodePropertySource.make(Default.ucdVersion());
        TreeSet sortedSet = new TreeSet(CASELESS_COMPARATOR);
        BagFormatter bf = new BagFormatter();
-        Tabber.MonoTabber mt = new Tabber.MonoTabber()
+        Tabber.MonoTabber mt = (Tabber.MonoTabber) new Tabber.MonoTabber()
        .add(10,Tabber.LEFT)
        .add(30,Tabber.LEFT);
        int count = 0;
@ -639,7 +639,7 @@ public class MakeUnicodeFiles {
                        // 123456789012345678901234567890123

        // sc ; Arab      ; Arabic
-        Tabber.MonoTabber mt2 = new Tabber.MonoTabber()
+        Tabber.MonoTabber mt2 = (Tabber.MonoTabber) new Tabber.MonoTabber()
        .add(3,Tabber.LEFT)
        .add(2,Tabber.LEFT) // ;
        .add(10,Tabber.LEFT)
@ -649,7 +649,7 @@ public class MakeUnicodeFiles {
        .add(33,Tabber.LEFT);
        
        // ccc; 216; ATAR ; Attached_Above_Right
-        Tabber.MonoTabber mt3 = new Tabber.MonoTabber()
+        Tabber.MonoTabber mt3 = (Tabber.MonoTabber) new Tabber.MonoTabber()
        .add(3,Tabber.LEFT)
        .add(2,Tabber.LEFT) // ;
        .add(3,Tabber.RIGHT)
--- a/tools/unicodetools/com/ibm/text/UCD/QuickTest.java
+++ b/tools/unicodetools/com/ibm/text/UCD/QuickTest.java
@ -5,14 +5,35 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/QuickTest.java,v $
-* $Date: 2006/09/24 23:32:45 $
-* $Revision: 1.13 $
+* $Date: 2006/11/27 23:15:21 $
+* $Revision: 1.14 $
 *
 *******************************************************************************
 */

 package com.ibm.text.UCD;

+import org.unicode.cldr.util.Counter;
+
+import com.ibm.icu.dev.demo.translit.CaseIterator;
+import com.ibm.icu.dev.test.util.BagFormatter;
+import com.ibm.icu.dev.test.util.Tabber;
+import com.ibm.icu.dev.test.util.UnicodeMap;
+import com.ibm.icu.dev.test.util.UnicodeProperty.UnicodeMapProperty;
+import com.ibm.icu.impl.PrettyPrinter;
+import com.ibm.icu.impl.Utility;
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
+import com.ibm.icu.text.CanonicalIterator;
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.NumberFormat;
+import com.ibm.icu.text.RuleBasedCollator;
+import com.ibm.icu.text.Transliterator;
+import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UnicodeSetIterator;
+import com.ibm.icu.util.ULocale;
+
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
@ -33,36 +54,15 @@ import java.util.StringTokenizer;
 import java.util.TreeMap;
 import java.util.TreeSet;

-import org.unicode.cldr.util.Counter;
-
-import com.ibm.icu.dev.demo.translit.CaseIterator;
-import com.ibm.icu.dev.test.util.BagFormatter;
-import com.ibm.icu.dev.test.util.Tabber;
-import com.ibm.icu.dev.test.util.UnicodeMap;
-import com.ibm.icu.dev.test.util.UnicodeProperty.UnicodeMapProperty;
-import com.ibm.icu.impl.PrettyPrinter;
-import com.ibm.icu.impl.Utility;
-import com.ibm.icu.lang.UCharacter;
-import com.ibm.icu.lang.UProperty;
-import com.ibm.icu.text.CanonicalIterator;
-import com.ibm.icu.text.Collator;
-//import com.ibm.icu.text.Normalizer;
-
-import com.ibm.icu.text.NumberFormat;
-import com.ibm.icu.text.RuleBasedCollator;
-import com.ibm.icu.text.Transliterator;
-import com.ibm.icu.text.UTF16;
-import com.ibm.icu.text.UnicodeSet;
-import com.ibm.icu.text.UnicodeSetIterator;
-import com.ibm.icu.util.ULocale;
-
 public class QuickTest implements UCD_Types {
 	public static void main(String[] args) throws IOException {
 		try {
-            
-            getHangulDecomps();
+      String methodName = System.getProperty("method");
+      org.unicode.cldr.util.Utility.callMethod(methodName, QuickTest.class);
+

            if (true) return;
+            getHangulDecomps();

 			
      showLeadingTrailingNonStarters();
@ -203,8 +203,8 @@ public class QuickTest implements UCD_Types {
 		
 //		System.out.println(bf.showSetDifferences("NFC CWP", leadingC, "NFC Trailing", trailingC));
 	}
-	
-	private static void checkCaseChanges() {
+    
+  private static void checkCaseChanges() {
 		String first = "3.0.0";
 		String last = "4.1.0";
 		UCD ucd30 = UCD.make(first);
--- a/tools/unicodetools/com/ibm/text/UCD/ScriptTimeline.java
+++ b/tools/unicodetools/com/ibm/text/UCD/ScriptTimeline.java
@ -0,0 +1,25 @@
+package com.ibm.text.UCD;
+
+import com.ibm.icu.dev.test.util.UnicodeProperty;
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.UnicodeSet;
+
+import java.util.List;
+/** Prints one tab-separated row per script: its name, then the size of (script set & "age" set) for each listed version. */
+public class ScriptTimeline {
+  public static void main(String[] args) {
+    String[] versions = { "2.0.0", "2.1.2", "3.0.0", "3.1.0", "3.2.0", "4.0.0", "4.1.0", "5.0.0" }; // one output column per entry, in this order
+    for (int s = 0; s < UScript.CODE_LIMIT; ++s) { // walk every script code below UScript.CODE_LIMIT
+      String scriptName = UScript.getName(s);
+      UnicodeSet chars = new UnicodeSet().applyPropertyAlias("script", scriptName);
+      if (chars.size() == 0) continue; // script has no characters: emit no row for it
+      System.out.print(scriptName); // row label; counts follow on the same line
+      for (int v = 0; v < versions.length; ++v) {
+        UnicodeSet age = new UnicodeSet();
+        age.applyPropertyAlias("age", versions[v]); // NOTE(review): presumably cumulative (characters present as of this version) -- confirm against ICU's "age" handling
+        System.out.print("\t" + new UnicodeSet(chars).retainAll(age).size()); // retainAll runs on a fresh copy, so chars is reused unchanged for the next version
+      }
+      System.out.println(); // terminate this script's row
+    }
+  }
+}
--- a/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java
+++ b/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java
--- a/tools/unicodetools/com/ibm/text/UCD/UCD.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
-* $Date: 2006/04/05 22:12:44 $
-* $Revision: 1.41 $
+* $Date: 2006/11/27 23:15:21 $
+* $Revision: 1.42 $
 *
 *******************************************************************************
 */
@ -20,6 +20,8 @@ import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.BitSet;
 import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;

 import java.io.IOException;
 import java.io.DataInputStream;
@ -31,6 +33,7 @@ import com.ibm.text.utility.*;
 import com.ibm.icu.dev.test.util.BagFormatter;
 import com.ibm.icu.dev.test.util.UnicodeMap;
 import com.ibm.icu.dev.test.util.UnicodeProperty;
+import com.ibm.icu.text.Transliterator;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;

@ -200,7 +203,13 @@ public final class UCD implements UCD_Types {
     * Get the name and number (U+xxxx NAME) for a code point
     */
    public String getCodeAndName(int codePoint, byte type) {
-        return getCode(codePoint) + " " + getName(codePoint, type);
+      return getCodeAndName(codePoint, type, null);
+    }
+    
+    public String getCodeAndName(int codePoint, byte type, Transliterator charTrans) {
+        return getCode(codePoint)
+        + (charTrans == null ? " " : " ( " + charTrans.transliterate(UTF16.valueOf(codePoint)) + " ) ") 
+        + getName(codePoint, type);
    }

    /**
@ -208,14 +217,18 @@ public final class UCD implements UCD_Types {
     * separated by ", "
     */
    public String getCodeAndName(String s, byte type) {
+      return getCodeAndName(s,type,null);
+    }
+    
+    public String getCodeAndName(String s, byte type, Transliterator charTrans) {
        if (s == null || s.length() == 0) return "NULL";
-        if (s.length() == 1) return getCodeAndName(s.charAt(0)); // fast path
+        if (s.length() == 1) return getCodeAndName(s.charAt(0), type, charTrans); // fast path
        StringBuffer result = new StringBuffer();
        int cp;
        for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
            cp = UTF32.char32At(s, i);
            if (i > 0) result.append(", ");
-            result.append(getCodeAndName(cp));
+            result.append(getCodeAndName(cp, type, charTrans));
        }
        return result.toString();
    }
@ -1666,24 +1679,34 @@ to guarantee identifier closure.
        return blockData.getSet(value, result);
    }
    
+    static final Matcher blockPattern = Pattern.compile("([0-9A-F]+)\\s*(?:[.][.]|[;])\\s*([0-9A-F]+)\\s*[;](.*)").matcher("");
    private void loadBlocks() {
        blockData = new UnicodeMap();
+        
        try {
            BufferedReader in = Utility.openUnicodeFile("Blocks", version, true, Utility.LATIN1);
            try {
-                while (true) {
+              for (int i = 1; ; ++i) {
                    // 0000..007F; Basic Latin
                    String line = Utility.readDataLine(in);
                    if (line == null) break;
                    if (line.length() == 0) continue;
-                    int pos1 = line.indexOf('.');
-                    int pos2 = line.indexOf(';', pos1);
+                    if (!blockPattern.reset(line).matches()) {
+                      throw new IllegalArgumentException("Bad line: " + line);
+                    }
+//                    int pos1 = line.indexOf(';');
+//                    int pos2 = line.indexOf(';', pos1+1);
                        
                    //lastBlock = new BlockData();
-                    int start = Integer.parseInt(line.substring(0, pos1), 16);
-                    int end = Integer.parseInt(line.substring(pos1+2, pos2), 16);
-                    String name = line.substring(pos2+1).trim().replace(' ', '_');
-                    blockData.putAll(start,end, name);
+                    try {
+                      int start = Integer.parseInt(blockPattern.group(1), 16);
+                      int end = Integer.parseInt(blockPattern.group(2), 16);
+                      String name = blockPattern.group(3).trim().replace(' ', '_');
+                      blockData.putAll(start,end, name);
+                    } catch (RuntimeException e) {
+                      System.err.println("Failed on line " + i + "\t" + line);
+                      throw e;
+                    }
                }
                blockData.setMissing("No_Block");
            } finally {
--- a/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants-old.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants-old.txt
@ -0,0 +1,273 @@
+# Invariance tests
+# Each line indicates an invariant set relationship to be tested,
+# and is of the form:
+#
+# 	line := set relation set
+#
+#   relation := '='             // has identical contents to
+#            := ('>' | '⊃')    // is proper superset of
+#            := ('≥' | '⊇')    // is superset of 
+#            := ('<' | '⊂')    // is proper subset of
+#            := ('≤' | '⊆')    // is subset of
+#            := '!'             // has no intersection
+#            := '?'             // none of the above (they overlap, and neither contains the other)
+#
+# A set is a standard UnicodeSet, but where $pv can be used to express properties
+#
+# 	pv := '$' '×'? prop (('=' | ':') value)?
+#
+# The × indicates that the property is the previous released version.
+#  That is, if the version is 4.0.1, then the × version is 4.0.0
+# If the value is missing, it is defaulted to true
+# If the value is of the form «...», then the ... is interpreted as a regular expression
+# The property can be the short or long form as in the PropertyAliases.txt
+# The value (if enumerated) can be the short or long form as in PropertyValueAliases.txt
+#
+# A UnicodeSet is a boolean combination of properties and character ranges, as you would see in
+#  Perl or other regular-expression languages. Examples:
+#	[$General_Category:Unassigned-[a-zA-Z]]
+# For details, see http://oss.software.ibm.com/icu/userguide/unicodeSet.html
+#
+# WARNING: do not use \p{...} or [:...:] syntax, since those will be
+# ICU's current version of properties, not the current snapshot's.
+# Use the $ notation for properties (listed above) instead.
+#
+# When this file is parsed, an error message may contain <@>
+#  to indicate the location of an error in the input line.
+# The Show command can be used to list any set on the console, for comparison.
+
+# General Constants
+Let $gcAllPunctuation = [$gc:Open_Punctuation $gc:Close_Punctuation $gc:Dash_Punctuation $gc:Connector_Punctuation $gc:Other_Punctuation $gc:Initial_Punctuation $gc:Final_Punctuation]
+Let $gcAllSymbols = [$gc:Currency_Symbol $gc:Modifier_Symbol $gc:Math_Symbol $gc:Other_Symbol]
+Let $gcAllMarks = [$gc:Nonspacing_Mark $gc:Enclosing_Mark $gc:Spacing_Mark]
+
+##### EXAMPLES OF USAGE #####
+
+#Show [[^$gc:unassigned]-[^$×gc:unassigned]-[^$dt:none]]
+#$GC:Zs ! $GC:Zp
+#$East_Asian_Width:Neutral ? $GC:Uppercase_Letter
+$GC:Zs ? $Name:«.*SPACE.*»
+#$Script:Common ! [$Alphabetic - $Math]
+
+# $Pattern_Whitespace = [$Whitespace \u200E \u200F]
+# $Pattern_Syntax = [$gcAllSymbols $gcAllPunctuation [\u2190-\u2BFF\u2e00-\u2e7F]]
+# $Pattern_Syntax ! $Alphabetic
+# $Pattern_Syntax ! $ID_Continue
+
+# [$script:greek&$gc:«.*letter.*»] = [;\u00B5\u00B7\u00C4\u00CB\u00CF\u00D6\u00DC\u00E4\u00EB\u00EF\u00F6\u00FC\u00FF-\u0101\u0112-\u0113\u012A-\u012B\u014C-\u014D\u016A-\u016B\u0178\u01D5-\u01DC\u01DE-\u01E3\u01EC-\u01ED\u022A-\u022D\u0230-\u0233\u0304\u0308\u0313-\u0314\u0342-\u0345\u037A\u037E\u0386-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D7\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F5\u0401\u0407\u0451\u0457\u04D2-\u04D3\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F1\u04F4-\u04F5\u04F8-\u04F9\u1E14-\u1E17\u1E20-\u1E21\u1E26-\u1E27\u1E2E-\u1E2F\u1E38-\u1E39\u1E4E-\u1E53\u1E5C-\u1E5D\u1E7A-\u1E7B\u1E84-\u1E85\u1E8C-\u1E8D\u1E97\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC1-\u1FC4\u1FC6-\u1FCC\u1FCF-\u1FD3\u1FD6-\u1FDB\u1FDF-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126]
+
+# $script:greek = $×script:greek
+# $gc:lm = $script:inherited
+
+# Examples of parsing errors
+
+# $LBA:Neutral =  $GC:Zp # example of nonexistent property
+# $LB:foo =  $GC:Zp # example of nonexistent value
+# $GC:Zs @ $GC:Zp # example of unknown relation
+
+#### REAL INVARIANTS FOLLOW ####
+
+# For illustration, different alias styles are used
+
+$Line_Break:Unknown = [$General_Category:Unassigned $GeneralCategory:PrivateUse]
+$LB:OP = $GC:Ps
+$Whitespace ⊃ [$GC:Zs $GC:Zp $GC:Zl]
+$Dash ⊃ [$GC:Pd]
+$Script:Common ! [$GC:Mn $GC:Me \u200c \u200d]
+$Script:Inherited ⊆ [$GC:Mn $GC:Me \u200c \u200d]
+# [$Alphabetic] ! $Script:Common
+#  & [$Decomposition_Type:None $Decomposition_Type:Canonical]
+
+$Alphabetic ⊃ [$Uppercase $Lowercase]
+
+# Numbers: the following must be equal
+
+$General_Category:Decimal_Number = $Numeric_Type:Decimal
+
+# Decimals are 0..9
+
+Let $decimalValue = $Numeric_Value:«[0-9].0»
+$decimalValue ⊇ $General_Category:Decimal_Number
+
+# All and only those items with numeric types have numeric values
+
+Let $anyNumericValue = $Numeric_Value:«-?[0-9]+.[0-9]+»
+[$Numeric_Type:Decimal $Numeric_Type:Digit $Numeric_Type:Numeric] = $anyNumericValue
+
+# Canonical decompositions (minus exclusions) must be identical across releases
+[$Decomposition_Type:Canonical - $Full_Composition_Exclusion] = [$×Decomposition_Type:Canonical - $×Full_Composition_Exclusion]
+
+# Identifiers must be backwards compatible
+$ID_Start ⊇ $×ID_Start
+$ID_Continue ⊇ $×ID_Continue
+$XID_Start ⊇ $×XID_Start
+$XID_Continue ⊇ $×XID_Continue
+
+# Continue must contain start
+$ID_Continue ⊇ $ID_Start
+$XID_Continue ⊇ $XID_Start
+
+# Identifiers can't intersect pattern stuff
+$ID_Continue ! [$Pattern_Whitespace $Pattern_Syntax]
+$Pattern_Whitespace ! [$ID_Continue $Pattern_Syntax]
+$Pattern_Syntax ! [$ID_Continue $Pattern_Whitespace]
+
+$XID_Continue ! [$Pattern_Whitespace $Pattern_Syntax]
+$Pattern_Whitespace ! [$XID_Continue $Pattern_Syntax]
+$Pattern_Syntax ! [$XID_Continue $Pattern_Whitespace]
+
+# Test SA characters
+
+# They are limited to certain scripts:
+Let $SAScripts = [$script:thai $script:lao $script:myanmar $script:khmer $script:Tai_Le $script:New_Tai_Lue]
+$SAScripts ⊇ $LineBreak:SA
+
+# And in those scripts, they are all the alphabetic spacing characters, plus some odd Cf & Mn
+[$SAScripts & [$Alphabetic $gc:cf $gc:Mn \u19DE \u19DF]] = [$SAScripts & [$LineBreak:SA $LineBreak:CM]]
+
+#MY TEST
+#Show [$gc:Mn - $Alphabetic]
+#Show [$Alphabetic & $gc:Mn]
+
+# Try removing M* from alphabetic, and matching to SA
+#Show [$SAScripts & [$Alphabetic $gc:cf - $gcAllMarks]] = $LineBreak:SA
+
+# Try adding M* to alphabetic, and matching to SA
+#Show [$SAScripts & [$Alphabetic $gc:cf $gcAllMarks]] = $LineBreak:SA
+
+# testing
+# [$Pattern_Whitespace $Pattern_Syntax] ! [[^$WB:Format $WB:Other] \u2019 \u0027 \u02BC \u002d \u00ad \u2027 \u058A]
+Let $otherword = [\u2019 \u0027 \u02BC \u002d \u00ad \u2027 \u058A]
+Let $currentword = [[^$WB:Format $WB:Other $WB:MidNum] $Grapheme_Extend $alphabetic]
+Show [$currentword $otherword - $ID_Continue]
+Show [$currentword $otherword - [$alphabetic $anyNumericValue $gcAllMarks]]
+Show [$otherword - $currentword]
+Show [$name:«.*LETTER.*» - $alphabetic]
+
+# Pattern characters are invariant!
+# Add after 4.1.0
+$Pattern_Whitespace = $×Pattern_Whitespace
+$Pattern_Syntax = $×Pattern_Syntax
+
+#BIDI invariant constants
+Let $R_blocks = [$block:Kharoshthi $block:Hebrew $block:Cypriot_Syllabary  \u07C0-\u08FF \uFB1D-\uFB4F \U00010840-\U00010FFF]
+Let $AL_blocks = [[$block:Arabic_Supplement $block:Arabic $block:Syriac $block:Arabic $block:Thaana $block:Arabic_Presentation_Forms_A $block:Arabic_Presentation_Forms_B [\u0750-\u077F]] -$Noncharacter_Code_Point]
+
+#Unassigned characters in these blocks have R or AL respectively
+$Bidi_Class:R ⊇ [$R_blocks & $gc:Cn]
+$Bidi_Class:AL ⊇ [$AL_blocks & $gc:Cn]
+
+# There are no strong characters of the other directionalities (out of L, AL, R) in these blocks, 
+# and anything with Bidi_Class R or AL is in one of these blocks (or is RLM, U+200F)
+$R_blocks ! [$Bidi_Class:L $Bidi_Class:AL]
+$AL_blocks ! [$Bidi_Class:L $Bidi_Class:R]
+[$R_blocks $AL_blocks \u200F] ⊇ [$Bidi_Class:AL $Bidi_Class:R]
+
+# Derivations must match
+
+$Math = [$GC:Sm $Other_Math]
+$Alphabetic = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_Alphabetic]
+$Lowercase = [$GC:Ll $Other_Lowercase]
+$Uppercase = [$GC:Lu $Other_Uppercase]
+$ID_Start = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_ID_Start]
+$ID_Continue = [$ID_Start $GC:Mn $GC:Mc $GC:Nd $GC:Pc $Other_ID_Continue] 
+$Default_Ignorable_Code_Point = [[$Other_Default_Ignorable_Code_Point $GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]]
+$Grapheme_Extend = [$GC:Me $GC:Mn $Other_Grapheme_Extend]
+$Grapheme_Base = [^$GC:Cc $GC:Cf $GC:Cs $GC:Co $GC:Cn $GC:Zl $GC:Zp $Grapheme_Extend]
+
+# "Minimal" Other_: NOT hard requirements; just if we want to be minimal
+# (Should add way to make these warnings, not errors)
+
+$Other_Math = [$Math - $GC:Sm]
+$Other_Alphabetic = [$Alphabetic - [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl]]
+$Other_Lowercase = [$Lowercase - $GC:Ll]
+$Other_Uppercase = [$Uppercase - $GC:Lu]
+$Other_ID_Start = [$ID_Start - [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl]]
+$Other_Default_Ignorable_Code_Point = [$Default_Ignorable_Code_Point - [[$GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]]]
+$Other_Grapheme_Extend = [$Grapheme_Extend - [$GC:Me $GC:Mn]]
+
+# ===========================
+
+# POSIX Compatibility Properties (UTS#18)
+# http://www.opengroup.org/onlinepubs/007904975/basedefs/xbd_chap07.html
+
+# constants
+
+Let $SP = [\u0020]	# [\N{space}]
+Let $TAB = [\u0009] # [\N{CHARACTER TABULATION}]
+Let $LF = [\u000A]	# \N{linefeed}
+Let $VTAB = [\u000B]	# [\N{LINE TABULATION}]
+Let $FF = [\u000C]	# [\N{formfeed}]
+Let $CR = [\u000D]	# \N{carriage return}
+Let $NEL = [\u0085]	# \N{next line}
+Let $ZWNJ = [\u200C]	# [\N{ZERO WIDTH NON-JOINER}]
+Let $ZWJ = [\u200D]	# [\N{ZERO WIDTH JOINER}]
+
+Let $strange = [\u24B6-\u24E9]
+
+# Unassigned, Control, Format, Private_Use, Surrogate, 
+# Uppercase_Letter, Lowercase_Letter, Titlecase_Letter, Modifier_Letter, Other_Letter,
+# Nonspacing_Mark, Enclosing_Mark, Spacing_Mark,
+# Decimal_Number, Letter_Number, Other_Number, 
+# Space_Separator, Line_Separator, Paragraph_Separator, 
+# Dash_Punctuation, Open_Punctuation, Close_Punctuation, Connector_Punctuation, Other_Punctuation, Initial_Punctuation, Final_Punctuation
+# Math_Symbol, Currency_Symbol, Modifier_Symbol, Other_Symbol
+
+# UTS Rules
+
+Let $alpha = [$Alphabetic $strange] # $Uppercase $ZWNJ $ZWJ]
+Let $lower = $Lowercase 
+Let $upper = [$Uppercase]
+Let $punct = [$gcAllPunctuation $gcAllSymbols - $alpha]
+Let $digit = $gc:Decimal_Number 
+Let $xdigit = [$gc:Decimal_Number $Hex_Digit] # in both!
+Let $alnum = [$alpha $digit]
+Let $space = $Whitespace 
+Let $blank = [$Whitespace - [$LF $VTAB $FF $CR $NEL $gc:Line_Separator $gc:Paragraph_Separator]]
+Let $cntrl = $gc:Control 
+Let $graph = [^$space $gc:Control $gc:Surrogate $gc:Unassigned] # $ZWNJ $ZWJ]
+Let $print = [$graph $blank - $cntrl]
+Let $word = [$alpha $gcAllMarks $digit $gc:Connector_Punctuation] 
+
+# ===========================
+
+# POSIX locale definition file constraints
+
+$upper ! [$cntrl $digit $punct $space]
+$upper ≥ [A-Z]
+
+$lower ! [$cntrl $digit $punct $space]
+$lower ≥ [a-z]
+
+$alpha ! [$cntrl $digit $punct $space]
+$alpha ≥ [$lower $upper]
+
+$digit ≥ [0-9]
+
+$alnum = [$alpha $digit]
+
+$space ! [$upper $lower $alpha $digit $graph $xdigit]
+$space ≥ [$SP $FF $LF $CR] # $TAB $VTAB $NEL]
+$space ≥ $blank
+
+$cntrl ! [$upper $lower $alpha $digit $punct $graph $print $xdigit]
+
+$punct ! [$upper $lower $alpha $digit $cntrl $xdigit $SP]
+
+$graph ≥ [$upper $lower $alpha $digit $xdigit $punct]
+$graph ! [$SP $cntrl]
+
+$print ≥ [$upper $lower $alpha $digit $xdigit $punct $graph $SP]
+$print ! $cntrl
+
+$xdigit ≥ [$digit [a-f A-F]]
+
+$blank ≥ [$SP $TAB]
+
+# Extra POSIX 'POSIX locale' constraints
+
+$cntrl ≥  [\u0000-\u001F]
+
+$punct  ≥ [[\u0021-\u007E] - [0-9 A-Z a-z]]
+
+[$alpha $lower $upper $punct $digit $xdigit $alnum $space $blank $cntrl $graph $print $word] = [^$gc:unassigned $gc:surrogate]
--- a/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants-reallyold.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants-reallyold.txt
@ -0,0 +1,188 @@
+# Invariance tests
+# dummy commit.
+# Each line indicates an invariant set relationship to be tested,
+# and is of the form:
+#
+# 	line := set relation set
+#
+#   relation := '='             // has identical contents to
+#            := ('>' | '⊃')    // is proper superset of
+#            := ('≥' | '⊇')    // is superset of 
+#            := ('<' | '⊂')    // is proper subset of
+#            := ('≤' | '⊆')    // is subset of
+#            := '!'             // has no intersection
+#            := '?'             // none of the above (they overlap, and neither contains the other)
+#
+# A set is a standard UnicodeSet, but where $pv can be used to express properties
+#
+# 	pv := '$' '×'? prop (('=' | ':') value)?
+#
+# The × indicates that the property is the previous released version.
+#  That is, if the version is 4.0.1, then the × version is 4.0.0
+# If the value is missing, it is defaulted to true
+# If the value is of the form «...», then the ... is interpreted as a regular expression
+# The property can be the short or long form as in the PropertyAliases.txt
+# The value (if enumerated) can be the short or long form as in PropertyValueAliases.txt
+#
+# A UnicodeSet is a boolean combination of properties and character ranges, as you would see in
+#  Perl or other regular-expression languages. Examples:
+#	[$General_Category:Unassigned-[a-zA-Z]]
+# For details, see http://oss.software.ibm.com/icu/userguide/unicodeSet.html
+#
+# WARNING: do not use \p{...} or [:...:] syntax, since those will be
+# ICU's current version of properties, not the current snapshot's.
+# Use the $ notation for properties (listed above) instead.
+#
+# When this file is parsed, an error message may contain <@>
+#  to indicate the location of an error in the input line.
+
+# The following are not very interesting, but show examples of use
+
+#$GC:Zs ! $GC:Zp
+#$East_Asian_Width:Neutral ? $GC:Uppercase_Letter
+$GC:Zs ? $Name:«.*SPACE.*»
+
+# [$script:greek&$gc:«.*letter.*»] = [;\u00B5\u00B7\u00C4\u00CB\u00CF\u00D6\u00DC\u00E4\u00EB\u00EF\u00F6\u00FC\u00FF-\u0101\u0112-\u0113\u012A-\u012B\u014C-\u014D\u016A-\u016B\u0178\u01D5-\u01DC\u01DE-\u01E3\u01EC-\u01ED\u022A-\u022D\u0230-\u0233\u0304\u0308\u0313-\u0314\u0342-\u0345\u037A\u037E\u0386-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D7\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F5\u0401\u0407\u0451\u0457\u04D2-\u04D3\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F1\u04F4-\u04F5\u04F8-\u04F9\u1E14-\u1E17\u1E20-\u1E21\u1E26-\u1E27\u1E2E-\u1E2F\u1E38-\u1E39\u1E4E-\u1E53\u1E5C-\u1E5D\u1E7A-\u1E7B\u1E84-\u1E85\u1E8C-\u1E8D\u1E97\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC1-\u1FC4\u1FC6-\u1FCC\u1FCF-\u1FD3\u1FD6-\u1FDB\u1FDF-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126]
+
+# Examples of parsing errors
+
+# $LBA:Neutral =  $GC:Zp # example of nonexistent property
+# $LB:foo =  $GC:Zp # example of nonexistent value
+# $GC:Zs @ $GC:Zp # example of unknown relation
+
+# The following should be real invariants
+# For illustration, different alias styles are used
+
+$Line_Break:Unknown = [$General_Category:Unassigned $GeneralCategory:PrivateUse]
+$LB:OP = $GC:Ps
+$General_Category:Decimal_Number = $Numeric_Type:Decimal
+$Whitespace ⊃ [$GC:Zs $GC:Zp $GC:Zl]
+$Dash ⊃ [$GC:Pd]
+$Script:Common ! [$GC:Mn $GC:Me]
+$Script:Common ! [$Alphabetic - $Math]
+$Alphabetic ⊃ [$Uppercase $Lowercase]
+
+# Comparisons across versions
+
+$ID_Start ⊇ $×ID_Start
+$ID_Continue ⊇ $×ID_Continue
+[$Decomposition_Type:Canonical - $Full_Composition_Exclusion] = [$×Decomposition_Type:Canonical - $×Full_Composition_Exclusion]
+
+#$age:4.0.1 = $age4.0.0
+
+# Derivations
+
+$Math = [$GC:Sm $Other_Math]
+$Alphabetic = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_Alphabetic]
+$Lowercase = [$GC:Ll $Other_Lowercase]
+$Uppercase = [$GC:Lu $Other_Uppercase]
+$ID_Start = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_ID_Start]
+$ID_Continue = [$ID_Start $GC:Mn $GC:Mc $GC:Nd $GC:Pc $Other_ID_Continue] 
+$Default_Ignorable_Code_Point = [[$Other_Default_Ignorable_Code_Point $GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]]
+$Grapheme_Extend = [$GC:Me $GC:Mn $Other_Grapheme_Extend]
+$Grapheme_Base = [^$GC:Cc $GC:Cf $GC:Cs $GC:Co $GC:Cn $GC:Zl $GC:Zp $Grapheme_Extend]
+
+# "Minimal" Other_: NOT hard requirements; just if we want to be minimal
+
+$Other_Math = [$Math - $GC:Sm]
+$Other_Alphabetic = [$Alphabetic - [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl]]
+$Other_Lowercase = [$Lowercase - $GC:Ll]
+$Other_Uppercase = [$Uppercase - $GC:Lu]
+$Other_ID_Start = [$ID_Start - [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl]]
+$Other_Default_Ignorable_Code_Point = [$Default_Ignorable_Code_Point - [[$GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]]]
+$Other_Grapheme_Extend = [$Grapheme_Extend - [$GC:Me $GC:Mn]]
+
+# Testing
+# $script:greek = $×script:greek
+# $gc:lm = $script:inherited
+
+# ===========================
+
+# Compatibility Properties (UTS#18)
+# http://www.opengroup.org/onlinepubs/007904975/basedefs/xbd_chap07.html
+
+# constants
+
+Let $SP = [\u0020]	# [\N{space}]
+Let $TAB = [\u0009] # [\N{CHARACTER TABULATION}]
+Let $LF = [\u000A]	# \N{linefeed}
+Let $VTAB = [\u000B]	# [\N{LINE TABULATION}]
+Let $FF = [\u000C]	# [\N{formfeed}]
+Let $CR = [\u000D]	# \N{carriage return}
+Let $NEL = [\u0085]	# \N{next line}
+Let $ZWNJ = [\u200C]	# [\N{ZERO WIDTH NON-JOINER}]
+Let $ZWJ = [\u200D]	# [\N{ZERO WIDTH JOINER}]
+
+Let $gcAllPunctuation = [$gc:Open_Punctuation $gc:Close_Punctuation $gc:Dash_Punctuation $gc:Connector_Punctuation $gc:Other_Punctuation $gc:Initial_Punctuation $gc:Final_Punctuation]
+Let $gcAllSymbols = [$gc:Currency_Symbol $gc:Modifier_Symbol $gc:Math_Symbol $gc:Other_Symbol]
+Let $gcAllMarks = [$gc:Nonspacing_Mark $gc:Enclosing_Mark $gc:Spacing_Mark]
+Let $strange = [\u24B6-\u24E9]
+
+# Unassigned, Control, Format, Private_Use, Surrogate, 
+# Uppercase_Letter, Lowercase_Letter, Titlecase_Letter, Modifier_Letter, Other_Letter,
+# Nonspacing_Mark, Enclosing_Mark, Spacing_Mark,
+# Decimal_Number, Letter_Number, Other_Number, 
+# Space_Separator, Line_Separator, Paragraph_Separator, 
+# Dash_Punctuation, Open_Punctuation, Close_Punctuation, Connector_Punctuation, Other_Punctuation, Initial_Punctuation, Final_Punctuation
+# Math_Symbol, Currency_Symbol, Modifier_Symbol, Other_Symbol
+
+# UTS Rules
+
+Let $alpha = [$Alphabetic $strange] # $Uppercase $ZWNJ $ZWJ]
+Let $lower = $Lowercase 
+Let $upper = [$Uppercase]
+Let $punct = [$gcAllPunctuation $gcAllSymbols - $alpha]
+Let $digit = $gc:Decimal_Number 
+Let $xdigit = [$gc:Decimal_Number $Hex_Digit] # in both!
+Let $alnum = [$alpha $digit]
+Let $space = $Whitespace 
+Let $blank = [$Whitespace - [$LF $VTAB $FF $CR $NEL $gc:Line_Separator $gc:Paragraph_Separator]]
+Let $cntrl = $gc:Control 
+Let $graph = [^$space $gc:Control $gc:Surrogate $gc:Unassigned] # $ZWNJ $ZWJ]
+Let $print = [$graph $blank - $cntrl]
+Let $word = [$alpha $gcAllMarks $digit $gc:Connector_Punctuation] 
+
+# ===========================
+
+# POSIX locale definition file constraints
+
+$upper ! [$cntrl $digit $punct $space]
+$upper ≥ [A-Z]
+
+$lower ! [$cntrl $digit $punct $space]
+$lower ≥ [a-z]
+
+$alpha ! [$cntrl $digit $punct $space]
+$alpha ≥ [$lower $upper]
+
+$digit ≥ [0-9]
+
+$alnum = [$alpha $digit]
+
+$space ! [$upper $lower $alpha $digit $graph $xdigit]
+$space ≥ [$SP $FF $LF $CR] # $TAB $VTAB $NEL]
+$space ≥ $blank
+
+$cntrl ! [$upper $lower $alpha $digit $punct $graph $print $xdigit]
+
+$punct ! [$upper $lower $alpha $digit $cntrl $xdigit $SP]
+
+$graph ≥ [$upper $lower $alpha $digit $xdigit $punct]
+$graph ! [$SP $cntrl]
+
+$print ≥ [$upper $lower $alpha $digit $xdigit $punct $graph $SP]
+$print ! $cntrl
+
+$xdigit ≥ [$digit [a-f A-F]]
+
+$blank ≥ [$SP $TAB]
+
+# Extra POSIX 'POSIX locale' constraints
+
+$cntrl ≥  [\u0000-\u001F]
+
+$punct  ≥ [[\u0021-\u007E] - [0-9 A-Z a-z]]
+
+[$alpha $lower $upper $punct $digit $xdigit $alnum $space $blank $cntrl $graph $print $word] = [^$gc:unassigned $gc:surrogate]
+
+