ICU-3984 initial commit of the collation reordering

X-SVN-Rev: 29015
2010-11-08 18:57:42 +00:00 · 2010-11-08 18:57:42 +00:00 · 1fac4c690b
commit 1fac4c690b
parent 5af2364f93
5 changed files with 2606 additions and 2476 deletions
--- a/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationParsedRuleBuilder.java
+++ b/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationParsedRuleBuilder.java
@ -3631,6 +3631,7 @@ final class CollationParsedRuleBuilder {
        collator.m_isHiragana4_ = option.m_isHiragana4_;
        collator.setStrength(option.m_strength_);
        collator.m_variableTopValue_ = option.m_variableTopValue_;
+        collator.m_scriptOrder_ = option.m_scriptOrder_;
        collator.latinOneFailed_ = false;
    }

--- a/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationRuleParser.java
+++ b/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationRuleParser.java
@ -7,6 +7,7 @@
 package com.ibm.icu.text;

 import java.text.ParseException;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;
@ -16,6 +17,7 @@ import com.ibm.icu.util.UResourceBundle;
 import com.ibm.icu.util.ULocale;
 import com.ibm.icu.impl.UCharacterProperty;
 import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UScript;

 /**
 * Class for parsing collation rules, produces a list of tokens that will be
@ -89,6 +91,14 @@ final class CollationRuleParser
            m_decomposition_ = collator.getDecomposition();
            m_strength_ = collator.getStrength();
            m_isHiragana4_ = collator.m_isHiragana4_;
+
+            if(collator.m_scriptOrder_ != null){ 
+                m_scriptOrder_ = new int[collator.m_scriptOrder_.length]; 
+                for(int i = 0; i < m_scriptOrder_.length; i++){ 
+                    m_scriptOrder_[i] = collator.m_scriptOrder_[i]; 
+                } 
+            } 
+
        }

        // package private data members --------------------------------------
@ -119,6 +129,11 @@ final class CollationRuleParser
         * attribute for special Hiragana
         */
        boolean m_isHiragana4_;
+        
+        /** 
+         * the ordering of the scripts 
+         */ 
+        int[] m_scriptOrder_;   
    }

    /**
@ -291,6 +306,14 @@ final class CollationRuleParser
        collator.m_defaultCaseFirst_ = m_options_.m_caseFirst_;
        collator.m_defaultIsHiragana4_ = m_options_.m_isHiragana4_;
        collator.m_defaultVariableTopValue_ = m_options_.m_variableTopValue_;
+        if(m_options_.m_scriptOrder_ != null) { 
+            collator.m_defaultScriptOrder_ = new int[m_options_.m_scriptOrder_.length]; 
+            for (int i = 0; i < m_options_.m_scriptOrder_.length; i++) { 
+                collator.m_defaultScriptOrder_[i] = m_options_.m_scriptOrder_[i]; 
+            } 
+        } else { 
+            collator.m_defaultScriptOrder_ = null; 
+        }  
    }

    // private inner classes -------------------------------------------------
@ -662,7 +685,7 @@ final class CollationRuleParser
        RULES_OPTIONS_[15] = new TokenOption("undefined",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  null, null);
-        RULES_OPTIONS_[16] = new TokenOption("scriptOrder",
+        RULES_OPTIONS_[16] = new TokenOption("reorder",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  null, null);
        RULES_OPTIONS_[17] = new TokenOption("charsetname",
@ -2028,7 +2051,6 @@ final class CollationRuleParser
      return new UnicodeSet(source.substring(start, start+current)); //uset_openPattern(start, current);
    }

-
    /** in C, optionarg is passed by reference to function.
     *  We use a private int to simulate this.
     */
@ -2061,6 +2083,7 @@ final class CollationRuleParser
        }
        return i;
    }
+    
    /**
     * Reads and set collation options
     * @return TOKEN_SUCCESS if option is set correct, 0 otherwise
@ -2152,6 +2175,11 @@ final class CollationRuleParser
            m_optionEnd_ = m_current_-1;
            return TOKEN_SUCCESS_MASK_;
        }
+        else if(i == 16) { 
+            m_current_ = m_optionarg_; // skip opening brace and name 
+            parseScriptReorder(); 
+            return TOKEN_SUCCESS_MASK_; 
+        } 
        else {
            throwParseException(m_rules_, optionarg);
        }
@ -2282,4 +2310,31 @@ final class CollationRuleParser
      }
      return rules;
    }
+    
+    /**
+     * Parses the argument list of a <code>[reorder ...]</code> option and stores
+     * the result in <code>m_options_.m_scriptOrder_</code>.
+     * <p>Starting at <code>m_current_</code>, the text up to the next
+     * <code>']'</code> in <code>m_rules_</code> is read as a sequence of
+     * 4-character tokens separated by whitespace. Each token is resolved with
+     * <code>UScript.getCode</code> and the first code returned is appended to
+     * the order. On success <code>m_current_</code> is left at the closing
+     * bracket.</p>
+     * @exception ParseException thrown if the option is not terminated by
+     *            <code>']'</code>, if a token is not exactly 4 characters long,
+     *            or if a token is not a known script name
+     */
+    private void parseScriptReorder() throws ParseException {
+        ArrayList<Integer> tempOrder = new ArrayList<Integer>();
+        int end = m_rules_.indexOf(']', m_current_);
+        if (end < 0) {
+            // unterminated option: without a closing bracket there is no
+            // well defined token list, so reject the rule instead of silently
+            // installing an empty order
+            throw new ParseException(m_rules_, m_current_);
+        }
+        while (m_current_ < end) {
+            int tokenEnd = m_current_ + 4;
+            // A token must be exactly 4 characters long and be followed either
+            // by the closing bracket or by whitespace. The bound is tested
+            // first so that charAt() and substring() never reach past the
+            // closing bracket.
+            if (tokenEnd > end
+                    || (tokenEnd != end
+                            && !UCharacter.isWhitespace(m_rules_.charAt(tokenEnd)))) {
+                throw new ParseException(m_rules_, m_current_);
+            }
+            int[] script = UScript.getCode(m_rules_.substring(m_current_, tokenEnd));
+            // an unknown name may be reported as null as well as an empty array
+            if (script == null || script.length == 0) {
+                throw new ParseException(m_rules_, m_current_);
+            }
+            tempOrder.add(script[0]);
+            m_current_ = tokenEnd;
+            while (m_current_ < end && UCharacter.isWhitespace(m_rules_.charAt(m_current_))) {
+                // eat whitespace between tokens
+                m_current_++;
+            }
+        }
+        m_options_.m_scriptOrder_ = new int[tempOrder.size()];
+        for (int i = 0; i < tempOrder.size(); i++) {
+            m_options_.m_scriptOrder_[i] = tempOrder.get(i);
+        }
+    }
 }
--- a/icu4j/main/classes/collate/src/com/ibm/icu/text/Collator.java
+++ b/icu4j/main/classes/collate/src/com/ibm/icu/text/Collator.java
@ -225,6 +225,18 @@ public abstract class Collator implements Comparator<Object>, Cloneable
     */
    public final static int CANONICAL_DECOMPOSITION = 17;

+    public final static class CollationReorderCodes { // holder for the integer reorder code constants below; NOTE(review): presumably passed to setScriptOrder alongside script codes - confirm
+        private CollationReorderCodes() {} // private constructor: the class cannot be instantiated from outside
+        
+        public final static int SPACE          = 0x1000; // lowest code of the range
+        public final static int FIRST          = SPACE; // alias of the lowest code (same value as SPACE)
+        public final static int PUNCTUATION    = 0x1001;
+        public final static int SYMBOL         = 0x1002;
+        public final static int CURRENCY       = 0x1003;
+        public final static int DIGIT          = 0x1004; // highest code that names a group
+        public final static int LIMIT          = 0x1005; // one past DIGIT, i.e. exclusive upper bound of the range
+        
+    }
    // public methods --------------------------------------------------------

    // public setters --------------------------------------------------------
@ -314,6 +326,17 @@ public abstract class Collator implements Comparator<Object>, Cloneable
        }
    }

+    /** 
+     * Sets the order in which scripts are sorted relative to each other.
+     * <p>This implementation does not support reordering: it always throws
+     * <code>UnsupportedOperationException</code>. NOTE(review): subclasses that
+     * support reordering are presumably expected to override it - confirm.</p>
+     * @param order the codes of the scripts, listed in the desired sort order
+     *              (NOTE(review): presumably <code>UScript</code> codes - confirm)
+     * @exception UnsupportedOperationException always, in this implementation
+     * @see #getScriptOrder 
+     * @stable  
+     */ 
+    public void setScriptOrder(int... order) 
+    { 
+        throw new UnsupportedOperationException(); 
+    } 
+
    // public getters --------------------------------------------------------

    /**
@ -989,6 +1012,17 @@ public abstract class Collator implements Comparator<Object>, Cloneable
     */
    public abstract VersionInfo getUCAVersion();
    
+    /**  
+     * Method to retrieve the script reordering.
+     * <p>This implementation does not support reordering: it always throws
+     * <code>UnsupportedOperationException</code> and never returns a value.
+     * The return contract below is the one documented for implementations
+     * that do support reordering.</p>
+     * @see #setScriptOrder 
+     * @return the ordering of the scripts if one has been set, null otherwise. 
+     * @exception UnsupportedOperationException always, in this implementation
+     * @stable  
+     */ 
+    public int[] getScriptOrder() 
+    { 
+        throw new UnsupportedOperationException(); 
+    }   
+
    // protected constructor -------------------------------------------------

    /**
--- a/icu4j/main/classes/collate/src/com/ibm/icu/text/CollatorReader.java
+++ b/icu4j/main/classes/collate/src/com/ibm/icu/text/CollatorReader.java
@ -1,9 +1,9 @@
 /**
-*******************************************************************************
-* Copyright (C) 1996-2010, International Business Machines Corporation and    *
-* others. All Rights Reserved.                                                *
-*******************************************************************************
-*/
+ *******************************************************************************
+ * Copyright (C) 1996-2010, International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ */
 package com.ibm.icu.text;

 import java.io.BufferedInputStream;
@ -18,29 +18,30 @@ import com.ibm.icu.impl.ICUResourceBundle;
 import com.ibm.icu.impl.IntTrie;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.text.CollationParsedRuleBuilder.InverseUCA;
+import com.ibm.icu.text.RuleBasedCollator.LeadByteConstants;
 import com.ibm.icu.text.RuleBasedCollator.UCAConstants;
 import com.ibm.icu.util.VersionInfo;

 /**
-* <p>Internal reader class for ICU data file uca.icu containing 
-* Unicode Collation Algorithm data.</p> 
-* <p>This class simply reads uca.icu, authenticates that it is a valid
-* ICU data file and split its contents up into blocks of data for use in
-* <a href=Collator.html>com.ibm.icu.text.Collator</a>.
-* </p> 
-* <p>uca.icu which is in big-endian format is jared together with this 
-* package.</p>
-* @author Syn Wee Quek
-* @since release 2.2, April 18 2002
-*/
+ * <p>Internal reader class for the ICU data file ucadata.icu, which contains
+ * Unicode Collation Algorithm data.</p>
+ * <p>This class simply reads ucadata.icu, authenticates that it is a valid
+ * ICU data file and splits its contents up into blocks of data for use in
+ * <a href="Collator.html">com.ibm.icu.text.Collator</a>.
+ * </p>
+ * <p>ucadata.icu, which is in big-endian format, is packaged in the same jar
+ * as this package.</p>
+ * @author Syn Wee Quek
+ * @since release 2.2, April 18 2002
+ */

 final class CollatorReader
 {
-    static char[] read(RuleBasedCollator rbc, UCAConstants ucac) throws IOException {
+    static char[] read(RuleBasedCollator rbc, UCAConstants ucac, LeadByteConstants leadByteConstants) throws IOException {
        InputStream i = ICUData.getRequiredStream(ICUResourceBundle.ICU_BUNDLE+"/coll/ucadata.icu");
        BufferedInputStream b = new BufferedInputStream(i, 90000);
        CollatorReader reader = new CollatorReader(b);
-        char[] result = reader.readImp(rbc, ucac);
+        char[] result = reader.readImp(rbc, ucac, leadByteConstants);
        b.close();
        return result;
    }
@ -62,14 +63,23 @@ final class CollatorReader
    }

    static void initRBC(RuleBasedCollator rbc, ByteBuffer data) throws IOException {
-        final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;
+        // TODO - why? 4 extra bytes? padding in the swapper?
+        //final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;
+        final int MIN_BINARY_DATA_SIZE_ = 272;
        int dataLength = data.remaining();
        // TODO: Change the rest of this class to use the ByteBuffer directly, rather than
        // a DataInputStream, except for passing an InputStream to ICUBinary.readHeader().
        // Consider changing ICUBinary to also work with a ByteBuffer.
        CollatorReader reader = new CollatorReader(makeByteBufferInputStream(data), false);
        if (dataLength > MIN_BINARY_DATA_SIZE_) {
-            reader.readImp(rbc, null);
+//            for (int i = 0; i < dataLength; i++) {
+//                byte b = data.get(i);
+//                System.out.print("0x" + (((int) 0xff & b) < 0x0f ? "0" : "") + Integer.toHexString(0xff & b) + " ");
+//                if (i % 16 == 0) {
+//                    System.out.println();
+//                }
+//            }
+            reader.readImp(rbc, null, null);
        } else {
            reader.readHeader(rbc);
            reader.readOptions(rbc);
@ -81,26 +91,26 @@ final class CollatorReader
    static InverseUCA getInverseUCA() throws IOException {
        InverseUCA result = null;
        InputStream i = ICUData.getRequiredStream(ICUResourceBundle.ICU_BUNDLE+"/coll/invuca.icu");
-//        try    {
-//            String invdat = "/com/ibm/icu/impl/data/invuca.icu";
-//            InputStream i = CollationParsedRuleBuilder.class.getResourceAsStream(invdat);
-            BufferedInputStream b = new BufferedInputStream(i, 110000);
-            result = CollatorReader.readInverseUCA(b);
-            b.close();
-            i.close();
-            return result;
-//        } catch (Exception e) {
-//            throw new RuntimeException(e.getMessage());
-//        }
+        //        try    {
+        //            String invdat = "/com/ibm/icu/impl/data/invuca.icu";
+        //            InputStream i = CollationParsedRuleBuilder.class.getResourceAsStream(invdat);
+        BufferedInputStream b = new BufferedInputStream(i, 110000);
+        result = CollatorReader.readInverseUCA(b);
+        b.close();
+        i.close();
+        return result;
+        //        } catch (Exception e) {
+        //            throw new RuntimeException(e.getMessage());
+        //        }
    }

    // protected constructor ---------------------------------------------

    /**
-    * <p>Protected constructor.</p>
-    * @param inputStream ICU collator file input stream
-    * @exception IOException throw if data file fails authentication 
-    */
+     * <p>Private constructor.</p>
+     * @param inputStream ICU collator file input stream
+     * @exception IOException throw if data file fails authentication 
+     */
    private CollatorReader(InputStream inputStream) throws IOException
    {
        this(inputStream, true);
@ -114,26 +124,26 @@ final class CollatorReader
            throw new IOException(WRONG_UNICODE_VERSION_ERROR_);
        }
        m_dataInputStream_ = new DataInputStream(inputStream);
-        */
+         */
    }

    /**
-    * <p>Protected constructor.</p>
-    * @param inputStream ICU uprops.icu file input stream
-    * @param readICUHeader flag to indicate if the ICU header has to be read
-    * @exception IOException throw if data file fails authentication 
-    */
+     * <p>Private constructor.</p>
+     * @param inputStream ICU collator file input stream
+     * @param readICUHeader flag to indicate if the ICU header has to be read
+     * @exception IOException throw if data file fails authentication 
+     */
    private CollatorReader(InputStream inputStream, boolean readICUHeader) 
-                                                            throws IOException
+    throws IOException
    {
        if (readICUHeader) {
            byte[] UnicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID_, 
-                                 UCA_AUTHENTICATE_);
+                    UCA_AUTHENTICATE_);
            // weiv: check that we have the correct Unicode version in 
            // binary files
            VersionInfo UCDVersion = UCharacter.getUnicodeVersion();
            if(UnicodeVersion[0] != UCDVersion.getMajor() 
-            || UnicodeVersion[1] != UCDVersion.getMinor()) {
+                    || UnicodeVersion[1] != UCDVersion.getMinor()) {
                throw new IOException(WRONG_UNICODE_VERSION_ERROR_);
            }
        }
@ -143,11 +153,11 @@ final class CollatorReader
    // protected methods -------------------------------------------------

    /**
-    * Read and break up the header stream of data passed in as arguments into 
-    * meaningful Collator data.
-    * @param rbc RuleBasedCollator to populate with header information
-    * @exception IOException thrown when there's a data error.
-    */
+     * Read and break up the header stream of data passed in as arguments into 
+     * meaningful Collator data.
+     * @param rbc RuleBasedCollator to populate with header information
+     * @exception IOException thrown when there's a data error.
+     */
    private void readHeader(RuleBasedCollator rbc) throws IOException
    {
        m_size_ = m_dataInputStream_.readInt();
@ -158,11 +168,11 @@ final class CollatorReader
        int readcount = 8; // for size and headersize
        // structure which holds values for indirect positioning and implicit
        // ranges
-        int UCAConst = m_dataInputStream_.readInt();
+        m_UCAConstOffset_ = m_dataInputStream_.readInt();
        readcount += 4;
        // this one is needed only for UCA, to copy the appropriate
        // contractions
-        m_dataInputStream_.skip(4);
+        int contractionUCACombos = m_dataInputStream_.readInt();
        readcount += 4;
        // reserved for future use
        m_dataInputStream_.skipBytes(4);
@ -180,7 +190,7 @@ final class CollatorReader
        int contractionCE = m_dataInputStream_.readInt();
        readcount += 4;
        // needed for various closures int contractionSize
-        /*int contractionSize = */m_dataInputStream_.readInt();
+        int contractionSize = m_dataInputStream_.readInt();
        readcount += 4;
        // array of last collation element in expansion
        int expansionEndCE = m_dataInputStream_.readInt();
@ -190,7 +200,7 @@ final class CollatorReader
        int expansionEndCEMaxSize = m_dataInputStream_.readInt();
        readcount += 4;
        // size of endExpansionCE int expansionEndCESize
-        m_dataInputStream_.skipBytes(4);
+        /*int endExpansionCECount =*/ m_dataInputStream_.readInt();
        readcount += 4;
        // hash table of unsafe code points
        int unsafe = m_dataInputStream_.readInt();
@ -199,25 +209,35 @@ final class CollatorReader
        int contractionEnd = m_dataInputStream_.readInt();
        readcount += 4;
        // int CEcount = m_dataInputStream_.readInt();
-        m_dataInputStream_.skipBytes(4);
+        int contractionUCACombosSize = m_dataInputStream_.readInt();
        readcount += 4;
        // is jamoSpecial
        rbc.m_isJamoSpecial_ = m_dataInputStream_.readBoolean();
        readcount++;
-        // padding
-        m_dataInputStream_.skipBytes(3);
-        readcount += 3;
+        // isBigEndian and charSetFamily
+        m_dataInputStream_.skipBytes(2);
+        readcount += 2;
+        int contractionUCACombosWidth = m_dataInputStream_.readByte();
+        readcount += 1;
        rbc.m_version_ = readVersion(m_dataInputStream_);
        readcount += 4;
        rbc.m_UCA_version_ = readVersion(m_dataInputStream_);
        readcount += 4;
        rbc.m_UCD_version_ = readVersion(m_dataInputStream_);
        readcount += 4;
+        VersionInfo formatVersion = readVersion(m_dataInputStream_);
+        readcount += 4;
+        rbc.m_scriptToLeadBytes = m_dataInputStream_.readInt();
+        readcount += 4;
+        rbc.m_leadByteToScripts = m_dataInputStream_.readInt();
+        readcount += 4;
+
        // byte charsetName[] = new byte[32]; // for charset CEs
        m_dataInputStream_.skipBytes(32);
        readcount += 32;
-        m_dataInputStream_.skipBytes(56); // for future use
-        readcount += 56;
+
+        m_dataInputStream_.skipBytes(44); // for future use
+        readcount += 44;
        if (m_headerSize_ < readcount) {
            ///CLOVER:OFF
            throw new IOException("Internal Error: Header size error");
@ -237,11 +257,15 @@ final class CollatorReader
        m_expansionEndCESize_ = expansionEndCEMaxSize - expansionEndCE;
        m_expansionEndCEMaxSizeSize_ = unsafe - expansionEndCEMaxSize;
        m_unsafeSize_ = contractionEnd - unsafe;
-        m_UCAValuesSize_ = m_size_ - UCAConst; // UCA value, will be handled
-                                                // later
+        //m_UCAValuesSize_ = m_size_ - UCAConst; // UCA value, will be handled later
+        m_UCAcontractionSize_ = contractionUCACombosSize * contractionUCACombosWidth * 2;
+
        // treat it as normal collator first
        // for normal collator there is no UCA contraction
-        m_contractionEndSize_ = m_size_ - contractionEnd;
+        // contractions (UChar[contractionSize] + CE[contractionSize])
+        int old_contractionSize_ = m_size_ - contractionEnd;
+        //        m_contractionSize_ = contractionSize * 2 + contractionSize * 4;
+        m_contractionSize_ = contractionSize * 2 + contractionSize * 4;

        rbc.m_contractionOffset_ >>= 1; // casting to ints
        rbc.m_expansionOffset_ >>= 2; // casting to chars
@ -262,16 +286,19 @@ final class CollatorReader
        rbc.m_defaultVariableTopValue_ = m_dataInputStream_.readInt();
        readcount += 4;
        rbc.m_defaultIsFrenchCollation_ = (m_dataInputStream_.readInt()
-                                      == RuleBasedCollator.AttributeValue.ON_);
+                == RuleBasedCollator.AttributeValue.ON_);
        readcount += 4;
        rbc.m_defaultIsAlternateHandlingShifted_ 
-                                   = (m_dataInputStream_.readInt() == 
-                                    RuleBasedCollator.AttributeValue.SHIFTED_);
+        = (m_dataInputStream_.readInt() == 
+            RuleBasedCollator.AttributeValue.SHIFTED_);
        readcount += 4;
        rbc.m_defaultCaseFirst_ = m_dataInputStream_.readInt();
        readcount += 4;
-        rbc.m_defaultIsCaseLevel_ = (m_dataInputStream_.readInt() 
-                                     == RuleBasedCollator.AttributeValue.ON_);
+        //        rbc.m_defaultIsCaseLevel_ = (m_dataInputStream_.readInt() 
+        //                == RuleBasedCollator.AttributeValue.ON_);
+        int defaultIsCaseLevel = m_dataInputStream_.readInt();
+        rbc.m_defaultIsCaseLevel_ = (defaultIsCaseLevel
+                == RuleBasedCollator.AttributeValue.ON_);
        readcount += 4;
        int value = m_dataInputStream_.readInt();
        readcount += 4;
@ -285,10 +312,10 @@ final class CollatorReader
        rbc.m_defaultStrength_ = m_dataInputStream_.readInt();
        readcount += 4;
        rbc.m_defaultIsHiragana4_ = (m_dataInputStream_.readInt() 
-                                     == RuleBasedCollator.AttributeValue.ON_);
+                == RuleBasedCollator.AttributeValue.ON_);
        readcount += 4;
        rbc.m_defaultIsNumericCollation_ = (m_dataInputStream_.readInt() 
-                                      == RuleBasedCollator.AttributeValue.ON_);
+                == RuleBasedCollator.AttributeValue.ON_);
        readcount += 4;
        m_dataInputStream_.skip(60); // reserved for future use
        readcount += 60;
@ -301,19 +328,23 @@ final class CollatorReader
    }

    /**
-    * Read and break up the stream of data passed in as arguments into 
-    * meaningful Collator data.
-    * @param rbc RuleBasedCollator to populate
-    * @param UCAConst object to fill up with UCA constants if we are reading 
-    *                 the UCA collator, if not use a null
-    * @return UCAContractions array filled up with the UCA contractions if we
-    *                        are reading the UCA collator
-    * @exception IOException thrown when there's a data error.
-    */
+     * Read and break up the stream of data passed in as arguments into 
+     * meaningful Collator data.
+     * @param rbc RuleBasedCollator to populate
+     * @param UCAConst object to fill up with UCA constants if we are reading 
+     *                 the UCA collator, if not use a null
+     * @param leadByteConstants object to fill up with the lead byte data read
+     *                 from the stream, if it is not needed use a null
+     * @return UCAContractions array filled up with the UCA contractions if we
+     *                        are reading the UCA collator
+     * @exception IOException thrown when there's a data error.
+     */
    private char[] readImp(RuleBasedCollator rbc, 
-                          RuleBasedCollator.UCAConstants UCAConst) 
-                                                            throws IOException
+            RuleBasedCollator.UCAConstants UCAConst,
+            RuleBasedCollator.LeadByteConstants leadByteConstants) 
+    throws IOException
    {
+        char ucaContractions[] = null;	// return result
+
        readHeader(rbc);
        // header size has been checked by readHeader
        int readcount = m_headerSize_; 
@ -328,24 +359,24 @@ final class CollatorReader
        readcount += (m_expansionSize_ << 2);
        if (m_contractionIndexSize_ > 0) { 
            m_contractionIndexSize_ >>= 1;
-            rbc.m_contractionIndex_ = new char[m_contractionIndexSize_];
-            for (int i = 0; i < m_contractionIndexSize_; i ++) {
-                rbc.m_contractionIndex_[i] = m_dataInputStream_.readChar();
-            }
-            readcount += (m_contractionIndexSize_ << 1);
-            m_contractionCESize_ >>= 2;
-            rbc.m_contractionCE_ = new int[m_contractionCESize_];
-            for (int i = 0; i < m_contractionCESize_; i ++) {
-                rbc.m_contractionCE_[i] = m_dataInputStream_.readInt();
-            }
-            readcount += (m_contractionCESize_ << 2);
+        rbc.m_contractionIndex_ = new char[m_contractionIndexSize_];
+        for (int i = 0; i < m_contractionIndexSize_; i ++) {
+            rbc.m_contractionIndex_[i] = m_dataInputStream_.readChar();
+        }
+        readcount += (m_contractionIndexSize_ << 1);
+        m_contractionCESize_ >>= 2;
+        rbc.m_contractionCE_ = new int[m_contractionCESize_];
+        for (int i = 0; i < m_contractionCESize_; i ++) {
+            rbc.m_contractionCE_[i] = m_dataInputStream_.readInt();
+        }
+        readcount += (m_contractionCESize_ << 2);
        }
        rbc.m_trie_ = new IntTrie(m_dataInputStream_, 
-                                 RuleBasedCollator.DataManipulate.getInstance());
+                RuleBasedCollator.DataManipulate.getInstance());
        if (!rbc.m_trie_.isLatin1Linear()) {
            throw new IOException("Data corrupted, " 
-                                  + "Collator Tries expected to have linear "
-                                  + "latin one data arrays");
+                    + "Collator Tries expected to have linear "
+                    + "latin one data arrays");
        }
        readcount += rbc.m_trie_.getSerializedDataSize();
        m_expansionEndCESize_ >>= 2;
@ -368,13 +399,16 @@ final class CollatorReader
            // we are reading the UCA
            // unfortunately the UCA offset in any collator data is not 0 and
            // only refers to the UCA data
-            m_contractionEndSize_ -= m_UCAValuesSize_;       
+            //m_contractionSize_ -= m_UCAValuesSize_;       
+            m_contractionSize_ = m_UCAConstOffset_ - readcount;       
+        } else {
+            m_contractionSize_ = m_size_ - readcount;
        }
-        rbc.m_contractionEnd_ = new byte[m_contractionEndSize_];
-        for (int i = 0; i < m_contractionEndSize_; i ++) {
+        rbc.m_contractionEnd_ = new byte[m_contractionSize_];
+        for (int i = 0; i < m_contractionSize_; i ++) {
            rbc.m_contractionEnd_[i] = m_dataInputStream_.readByte();
        }
-        readcount += m_contractionEndSize_;
+        readcount += m_contractionSize_;
        if (UCAConst != null) {
            UCAConst.FIRST_TERTIARY_IGNORABLE_[0] 
                                               = m_dataInputStream_.readInt();
@ -383,22 +417,22 @@ final class CollatorReader
                                               = m_dataInputStream_.readInt();
            readUCAConstcount += 4;
            UCAConst.LAST_TERTIARY_IGNORABLE_[0] 
-                                               = m_dataInputStream_.readInt();
+                                              = m_dataInputStream_.readInt();
            readUCAConstcount += 4;
            UCAConst.LAST_TERTIARY_IGNORABLE_[1] 
-                                               = m_dataInputStream_.readInt();
+                                              = m_dataInputStream_.readInt();
            readUCAConstcount += 4;
            UCAConst.FIRST_PRIMARY_IGNORABLE_[0] 
-                                               = m_dataInputStream_.readInt();
+                                              = m_dataInputStream_.readInt();
            readUCAConstcount += 4;
            UCAConst.FIRST_PRIMARY_IGNORABLE_[1] 
-                                               = m_dataInputStream_.readInt();
+                                              = m_dataInputStream_.readInt();
            readUCAConstcount += 4;
            UCAConst.FIRST_SECONDARY_IGNORABLE_[0] 
-                                               = m_dataInputStream_.readInt();
+                                                = m_dataInputStream_.readInt();
            readUCAConstcount += 4;
            UCAConst.FIRST_SECONDARY_IGNORABLE_[1] 
-                                               = m_dataInputStream_.readInt();
+                                                = m_dataInputStream_.readInt();
            readUCAConstcount += 4;
            UCAConst.LAST_SECONDARY_IGNORABLE_[0] 
                                               = m_dataInputStream_.readInt();
@ -407,10 +441,10 @@ final class CollatorReader
                                               = m_dataInputStream_.readInt();
            readUCAConstcount += 4;
            UCAConst.LAST_PRIMARY_IGNORABLE_[0] 
-                                               = m_dataInputStream_.readInt();
+                                             = m_dataInputStream_.readInt();
            readUCAConstcount += 4;
            UCAConst.LAST_PRIMARY_IGNORABLE_[1] 
-                                               = m_dataInputStream_.readInt();
+                                             = m_dataInputStream_.readInt();
            readUCAConstcount += 4;
            UCAConst.FIRST_VARIABLE_[0] = m_dataInputStream_.readInt();     
            readUCAConstcount += 4;
@ -462,25 +496,37 @@ final class CollatorReader
            readUCAConstcount += 4;
            UCAConst.PRIMARY_SPECIAL_MAX_ = m_dataInputStream_.readInt();   
            readUCAConstcount += 4;
-            int resultsize = (m_UCAValuesSize_ - readUCAConstcount) >> 1;
-            char result[] = new char[resultsize];
+
+            readcount += readUCAConstcount;
+
+            //int resultsize = m_UCAcontractionSize_ / 2;
+            int resultsize = (rbc.m_scriptToLeadBytes - readcount) / 2;
+            ucaContractions = new char[resultsize];
            for (int i = 0; i < resultsize; i ++) {
-                result[i] = m_dataInputStream_.readChar();
+                ucaContractions[i] = m_dataInputStream_.readChar();
            }
-            readcount += m_UCAValuesSize_;
-            if (readcount != m_size_) {
-                ///CLOVER:OFF
-                throw new IOException("Internal Error: Data file size error");
-                ///CLOVER:ON
-            }
-            return result;
+            readcount += m_UCAcontractionSize_;
+
+            //            if (readcount != m_size_) {
+            //                ///CLOVER:OFF
+            //                throw new IOException("Internal Error: Data file size error");
+            //                ///CLOVER:ON
+            //            }
        }
+
+        if (leadByteConstants != null)
+        {
+            readcount += m_dataInputStream_.skip(rbc.m_scriptToLeadBytes - readcount);
+            leadByteConstants.read(m_dataInputStream_);
+            readcount += leadByteConstants.getSerializedDataSize();
+        }
+
        if (readcount != m_size_) {
            ///CLOVER:OFF
            throw new IOException("Internal Error: Data file size error");
            ///CLOVER:ON
        }
-        return null;
+        return ucaContractions;
    }

    /**
@ -491,22 +537,22 @@ final class CollatorReader
     *            inverse uca
     */
    private static CollationParsedRuleBuilder.InverseUCA readInverseUCA(
-                                                      InputStream inputStream)
-                                                      throws IOException
+            InputStream inputStream)
+    throws IOException
    {
-         byte[] UnicodeVersion = ICUBinary.readHeader(inputStream, INVERSE_UCA_DATA_FORMAT_ID_, 
-                              INVERSE_UCA_AUTHENTICATE_);
+        byte[] UnicodeVersion = ICUBinary.readHeader(inputStream, INVERSE_UCA_DATA_FORMAT_ID_, 
+                INVERSE_UCA_AUTHENTICATE_);

        // weiv: check that we have the correct Unicode version in 
        // binary files
        VersionInfo UCDVersion = UCharacter.getUnicodeVersion();
        if(UnicodeVersion[0] != UCDVersion.getMajor() 
-        || UnicodeVersion[1] != UCDVersion.getMinor()) {
+                || UnicodeVersion[1] != UCDVersion.getMinor()) {
            throw new IOException(WRONG_UNICODE_VERSION_ERROR_);
        }

        CollationParsedRuleBuilder.InverseUCA result = 
-                                  new CollationParsedRuleBuilder.InverseUCA();
+            new CollationParsedRuleBuilder.InverseUCA();
        DataInputStream input = new DataInputStream(inputStream);        
        input.readInt(); // bytesize
        int tablesize = input.readInt(); // in int size
@ -541,7 +587,7 @@ final class CollatorReader
     */

    protected static VersionInfo readVersion(DataInputStream input) 
-        throws IOException {
+    throws IOException {
        byte[] version = new byte[4];
        version[0] = input.readByte();
        version[1] = input.readByte();
@ -549,9 +595,9 @@ final class CollatorReader
        version[3] = input.readByte();

        VersionInfo result = 
-        VersionInfo.getInstance(
-            (int)version[0], (int)version[1], 
-            (int)version[2], (int)version[3]);
+            VersionInfo.getInstance(
+                    (int)version[0], (int)version[1], 
+                    (int)version[2], (int)version[3]);

        return result;
    }
@ -564,118 +610,122 @@ final class CollatorReader
     * Authenticate uca data format version
     */
    private static final ICUBinary.Authenticate UCA_AUTHENTICATE_ 
-                = new ICUBinary.Authenticate() {
-                        public boolean isDataVersionAcceptable(byte version[])
-                        {
-                            return version[0] == DATA_FORMAT_VERSION_[0] 
-                                   && version[1] >= DATA_FORMAT_VERSION_[1];
-                                   // Too harsh 
-                                   //&& version[1] == DATA_FORMAT_VERSION_[1]
-                                   //&& version[2] == DATA_FORMAT_VERSION_[2] 
-                                   //&& version[3] == DATA_FORMAT_VERSION_[3];
-                        }
-                };
+    = new ICUBinary.Authenticate() {
+        public boolean isDataVersionAcceptable(byte version[])
+        {
+            return version[0] == DATA_FORMAT_VERSION_[0] 
+                                                      && version[1] >= DATA_FORMAT_VERSION_[1];
+                                                      // Too harsh 
+                                                      //&& version[1] == DATA_FORMAT_VERSION_[1]
+                                                      //&& version[2] == DATA_FORMAT_VERSION_[2] 
+                                                      //&& version[3] == DATA_FORMAT_VERSION_[3];
+        }
+    };

    /**
     * Authenticate uca data format version
     */
    private static final ICUBinary.Authenticate INVERSE_UCA_AUTHENTICATE_ 
-                = new ICUBinary.Authenticate() {
-                        public boolean isDataVersionAcceptable(byte version[])
-                        {
-                            return version[0] 
-                                    == INVERSE_UCA_DATA_FORMAT_VERSION_[0] 
-                                && version[1] 
-                                    >= INVERSE_UCA_DATA_FORMAT_VERSION_[1];
-                        }
-                };
+    = new ICUBinary.Authenticate() {
+        public boolean isDataVersionAcceptable(byte version[])
+        {
+            return version[0] 
+                           == INVERSE_UCA_DATA_FORMAT_VERSION_[0] 
+                                                               && version[1] 
+                                                                          >= INVERSE_UCA_DATA_FORMAT_VERSION_[1];
+        }
+    };

    /**
-    * Data input stream for uca.icu 
-    */
+     * Data input stream for uca.icu 
+     */
    private DataInputStream m_dataInputStream_;

    /**
-    * File format version and id that this class understands.
-    * No guarantees are made if a older version is used
-    */
+     * File format version and id that this class understands.
+     * No guarantees are made if an older version is used
+     */
    private static final byte DATA_FORMAT_VERSION_[] = 
-                                   {(byte)0x2, (byte)0x2, (byte)0x0, (byte)0x0};
+    {(byte)0x3, (byte)0x0, (byte)0x0, (byte)0x0};
    private static final byte DATA_FORMAT_ID_[] = {(byte)0x55, (byte)0x43,  
-                                                    (byte)0x6f, (byte)0x6c};
+        (byte)0x6f, (byte)0x6c};
    /**
-    * Inverse UCA file format version and id that this class understands.
-    * No guarantees are made if a older version is used
-    */
+     * Inverse UCA file format version and id that this class understands.
+     * No guarantees are made if an older version is used
+     */
    private static final byte INVERSE_UCA_DATA_FORMAT_VERSION_[] = 
-                                   {(byte)0x2, (byte)0x1, (byte)0x0, (byte)0x0};
+    {(byte)0x2, (byte)0x1, (byte)0x0, (byte)0x0};
    private static final byte INVERSE_UCA_DATA_FORMAT_ID_[] = {(byte)0x49, 
-                                                               (byte)0x6e,  
-                                                               (byte)0x76, 
-                                                               (byte)0x43};
+        (byte)0x6e,  
+        (byte)0x76, 
+        (byte)0x43};

    /**
-    * Wrong unicode version error string
-    */
-    private static final String WRONG_UNICODE_VERSION_ERROR_ =
-                                "Unicode version in binary image is not compatible with the current Unicode version";
+     * Wrong unicode version error string
+     */
+     private static final String WRONG_UNICODE_VERSION_ERROR_ =
+         "Unicode version in binary image is not compatible with the current Unicode version";

-    /**
-     * Size of expansion table in bytes
-     */
-    private int m_expansionSize_;
-    /**
-     * Size of contraction index table in bytes
-     */
-    private int m_contractionIndexSize_;
-    /**
-     * Size of contraction table in bytes
-     */
-    private int m_contractionCESize_;
-    /*
-     * Size of the Trie in bytes
-     */
-    //private int m_trieSize_;
-    /**
-     * Size of the table that contains information about collation elements
-     * that end with an expansion 
-     */
-    private int m_expansionEndCESize_;
-    /**
-     * Size of the table that contains information about the maximum size of 
-     * collation elements that end with a particular expansion CE corresponding
-     * to the ones in expansionEndCE
-     */
-    private int m_expansionEndCEMaxSizeSize_;
-    /**
-     * Size of the option table that contains information about the collation
-     * options
-     */
-    private int m_optionSize_;
-    /**
-     * Size of the whole data file minusing the ICU header
-     */
-    private int m_size_;
-    /**
-     * Size of the collation data header
-     */
-    private int m_headerSize_;
-    /**
-     * Size of the table that contains information about the "Unsafe" 
-     * codepoints
-     */
-    private int m_unsafeSize_;
-    /**
-     * Size of the table that contains information about codepoints that ends
-     * with a contraction
-     */
-    private int m_contractionEndSize_;
-    /**
-     * Size of the table that contains UCA contraction information
-     */
-    private int m_UCAValuesSize_;
+     /**
+      * Size of expansion table in bytes
+      */
+     private int m_expansionSize_;
+     /**
+      * Size of contraction index table in bytes
+      */
+     private int m_contractionIndexSize_;
+     /**
+      * Size of contraction table in bytes
+      */
+     private int m_contractionCESize_;
+     /*
+      * Size of the Trie in bytes
+      */
+     //private int m_trieSize_;
+     /**
+      * Size of the table that contains information about collation elements
+      * that end with an expansion 
+      */
+     private int m_expansionEndCESize_;
+     /**
+      * Size of the table that contains information about the maximum size of 
+      * collation elements that end with a particular expansion CE corresponding
+      * to the ones in expansionEndCE
+      */
+     private int m_expansionEndCEMaxSizeSize_;
+     /**
+      * Size of the option table that contains information about the collation
+      * options
+      */
+     private int m_optionSize_;
+     /**
+      * Size of the whole data file minus the ICU header
+      */
+     private int m_size_;
+     /**
+      * Size of the collation data header
+      */
+     private int m_headerSize_;
+     /**
+      * Size of the table that contains information about the "Unsafe" 
+      * codepoints
+      */
+     private int m_unsafeSize_;
+     /**
+      * Size in bytes of the table that contains information about codepoints that end
+      * with a contraction
+      */
+     private int m_contractionSize_;
+     /**
+      * Size of the table that contains UCA contraction information in bytes
+      */
+     private int m_UCAcontractionSize_;
+     /**
+      * Offset of the UCA Const
+      */
+     private int m_UCAConstOffset_;

-    // private methods ---------------------------------------------------
+     // private methods ---------------------------------------------------

 }

--- a/icu4j/main/classes/collate/src/com/ibm/icu/text/RuleBasedCollator.java
+++ b/icu4j/main/classes/collate/src/com/ibm/icu/text/RuleBasedCollator.java