ICU-3984 initial commit of the collation reordering

X-SVN-Rev: 29015
This commit is contained in:
Stuart Gill 2010-11-08 18:57:42 +00:00
parent 5af2364f93
commit 1fac4c690b
5 changed files with 2606 additions and 2476 deletions

View File

@ -3631,6 +3631,7 @@ final class CollationParsedRuleBuilder {
collator.m_isHiragana4_ = option.m_isHiragana4_; collator.m_isHiragana4_ = option.m_isHiragana4_;
collator.setStrength(option.m_strength_); collator.setStrength(option.m_strength_);
collator.m_variableTopValue_ = option.m_variableTopValue_; collator.m_variableTopValue_ = option.m_variableTopValue_;
collator.m_scriptOrder_ = option.m_scriptOrder_;
collator.latinOneFailed_ = false; collator.latinOneFailed_ = false;
} }

View File

@ -7,6 +7,7 @@
package com.ibm.icu.text; package com.ibm.icu.text;
import java.text.ParseException; import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
@ -16,6 +17,7 @@ import com.ibm.icu.util.UResourceBundle;
import com.ibm.icu.util.ULocale; import com.ibm.icu.util.ULocale;
import com.ibm.icu.impl.UCharacterProperty; import com.ibm.icu.impl.UCharacterProperty;
import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UScript;
/** /**
* Class for parsing collation rules, produces a list of tokens that will be * Class for parsing collation rules, produces a list of tokens that will be
@ -89,6 +91,14 @@ final class CollationRuleParser
m_decomposition_ = collator.getDecomposition(); m_decomposition_ = collator.getDecomposition();
m_strength_ = collator.getStrength(); m_strength_ = collator.getStrength();
m_isHiragana4_ = collator.m_isHiragana4_; m_isHiragana4_ = collator.m_isHiragana4_;
if(collator.m_scriptOrder_ != null){
m_scriptOrder_ = new int[collator.m_scriptOrder_.length];
for(int i = 0; i < m_scriptOrder_.length; i++){
m_scriptOrder_[i] = collator.m_scriptOrder_[i];
}
}
} }
// package private data members -------------------------------------- // package private data members --------------------------------------
@ -119,6 +129,11 @@ final class CollationRuleParser
* attribute for special Hiragana * attribute for special Hiragana
*/ */
boolean m_isHiragana4_; boolean m_isHiragana4_;
/**
* the ordering of the scripts
*/
int[] m_scriptOrder_;
} }
/** /**
@ -291,6 +306,14 @@ final class CollationRuleParser
collator.m_defaultCaseFirst_ = m_options_.m_caseFirst_; collator.m_defaultCaseFirst_ = m_options_.m_caseFirst_;
collator.m_defaultIsHiragana4_ = m_options_.m_isHiragana4_; collator.m_defaultIsHiragana4_ = m_options_.m_isHiragana4_;
collator.m_defaultVariableTopValue_ = m_options_.m_variableTopValue_; collator.m_defaultVariableTopValue_ = m_options_.m_variableTopValue_;
if(m_options_.m_scriptOrder_ != null) {
collator.m_defaultScriptOrder_ = new int[m_options_.m_scriptOrder_.length];
for (int i = 0; i < m_options_.m_scriptOrder_.length; i++) {
collator.m_defaultScriptOrder_[i] = m_options_.m_scriptOrder_[i];
}
} else {
collator.m_defaultScriptOrder_ = null;
}
} }
// private inner classes ------------------------------------------------- // private inner classes -------------------------------------------------
@ -662,7 +685,7 @@ final class CollationRuleParser
RULES_OPTIONS_[15] = new TokenOption("undefined", RULES_OPTIONS_[15] = new TokenOption("undefined",
RuleBasedCollator.Attribute.LIMIT_, RuleBasedCollator.Attribute.LIMIT_,
null, null); null, null);
RULES_OPTIONS_[16] = new TokenOption("scriptOrder", RULES_OPTIONS_[16] = new TokenOption("reorder",
RuleBasedCollator.Attribute.LIMIT_, RuleBasedCollator.Attribute.LIMIT_,
null, null); null, null);
RULES_OPTIONS_[17] = new TokenOption("charsetname", RULES_OPTIONS_[17] = new TokenOption("charsetname",
@ -2028,7 +2051,6 @@ final class CollationRuleParser
return new UnicodeSet(source.substring(start, start+current)); //uset_openPattern(start, current); return new UnicodeSet(source.substring(start, start+current)); //uset_openPattern(start, current);
} }
/** in C, optionarg is passed by reference to function. /** in C, optionarg is passed by reference to function.
* We use a private int to simulate this. * We use a private int to simulate this.
*/ */
@ -2061,6 +2083,7 @@ final class CollationRuleParser
} }
return i; return i;
} }
/** /**
* Reads and set collation options * Reads and set collation options
* @return TOKEN_SUCCESS if option is set correct, 0 otherwise * @return TOKEN_SUCCESS if option is set correct, 0 otherwise
@ -2152,6 +2175,11 @@ final class CollationRuleParser
m_optionEnd_ = m_current_-1; m_optionEnd_ = m_current_-1;
return TOKEN_SUCCESS_MASK_; return TOKEN_SUCCESS_MASK_;
} }
else if(i == 16) {
m_current_ = m_optionarg_; // skip opening brace and name
parseScriptReorder();
return TOKEN_SUCCESS_MASK_;
}
else { else {
throwParseException(m_rules_, optionarg); throwParseException(m_rules_, optionarg);
} }
@ -2282,4 +2310,31 @@ final class CollationRuleParser
} }
return rules; return rules;
} }
/**
 * Parses the argument list of a reorder option and stores the result in
 * <code>m_options_.m_scriptOrder_</code>.
 * <p>Starting at <code>m_current_</code>, the text up to the next
 * <code>']'</code> is read as a whitespace separated list of script codes,
 * each exactly 4 characters long. Every code is resolved with
 * {@link UScript#getCode(String)} and the first value returned for it is
 * appended to the order. On return <code>m_current_</code> is the index of
 * the closing <code>']'</code>.</p>
 * @exception ParseException thrown when the closing bracket is missing,
 *            when a code is not exactly 4 characters long, or when a code
 *            can not be resolved to a script
 */
private void parseScriptReorder() throws ParseException {
    int end = m_rules_.indexOf(']', m_current_);
    if (end < 0) {
        // unterminated option; without this check the loop below is
        // skipped and an empty order is stored without any error
        throw new ParseException(m_rules_, m_current_);
    }
    ArrayList<Integer> tempOrder = new ArrayList<Integer>();
    while (true) {
        // eat whitespace in front of the next code and after the last one
        while (m_current_ < end
               && UCharacter.isWhitespace(m_rules_.charAt(m_current_))) {
            m_current_ ++;
        }
        if (m_current_ >= end) {
            break;
        }
        // the code has to be 4 characters long, has to end in front of the
        // closing bracket and has to be followed by whitespace or the
        // bracket. checking against end first keeps charAt and substring
        // inside the option text.
        int tokenEnd = m_current_ + 4;
        if (tokenEnd > end
            || (tokenEnd < end
                && !UCharacter.isWhitespace(m_rules_.charAt(tokenEnd)))) {
            throw new ParseException(m_rules_, m_current_);
        }
        int[] script = UScript.getCode(
                                m_rules_.substring(m_current_, tokenEnd));
        // NOTE(review): getCode is documented to return null for unknown
        // names, guard against both null and empty - confirm
        if (script == null || script.length == 0) {
            throw new ParseException(m_rules_, m_current_);
        }
        tempOrder.add(script[0]);
        m_current_ = tokenEnd;
    }
    m_options_.m_scriptOrder_ = new int[tempOrder.size()];
    for (int i = 0; i < tempOrder.size(); i ++) {
        m_options_.m_scriptOrder_[i] = tempOrder.get(i);
    }
}
} }

View File

@ -225,6 +225,18 @@ public abstract class Collator implements Comparator<Object>, Cloneable
*/ */
public final static int CANONICAL_DECOMPOSITION = 17; public final static int CANONICAL_DECOMPOSITION = 17;
/**
 * Holder for the integer reorder codes SPACE, PUNCTUATION, SYMBOL,
 * CURRENCY and DIGIT. The codes are consecutive values starting at
 * {@link #FIRST}; {@link #LIMIT} is one past the last code.
 * NOTE(review): presumably these are passed to setScriptOrder together
 * with script codes - confirm against the implementation.
 */
public final static class CollationReorderCodes {
    /**
     * Constants only, never instantiated.
     */
    private CollationReorderCodes() {
    }

    /** Code for SPACE, the lowest reorder code. */
    public final static int SPACE = 0x1000;
    /** Same value as {@link #SPACE}, the start of the code range. */
    public final static int FIRST = SPACE;
    /** Code for PUNCTUATION. */
    public final static int PUNCTUATION = FIRST + 1;
    /** Code for SYMBOL. */
    public final static int SYMBOL = FIRST + 2;
    /** Code for CURRENCY. */
    public final static int CURRENCY = FIRST + 3;
    /** Code for DIGIT, the highest reorder code. */
    public final static int DIGIT = FIRST + 4;
    /** One more than the highest reorder code. */
    public final static int LIMIT = DIGIT + 1;
}
// public methods -------------------------------------------------------- // public methods --------------------------------------------------------
// public setters -------------------------------------------------------- // public setters --------------------------------------------------------
@ -314,6 +326,17 @@ public abstract class Collator implements Comparator<Object>, Cloneable
} }
} }
/**
 * Sets the order in which scripts are sorted.
 * <p>This implementation does not support reordering and always throws
 * an UnsupportedOperationException.</p>
 * @param order the script codes, in the order they are to be sorted
 * @exception UnsupportedOperationException always thrown by this
 *            implementation
 * @see #getScriptOrder
 * @stable
 */
public void setScriptOrder(int... order) {
    throw new UnsupportedOperationException();
}
// public getters -------------------------------------------------------- // public getters --------------------------------------------------------
/** /**
@ -988,6 +1011,17 @@ public abstract class Collator implements Comparator<Object>, Cloneable
* @stable ICU 2.8 * @stable ICU 2.8
*/ */
public abstract VersionInfo getUCAVersion(); public abstract VersionInfo getUCAVersion();
/**
 * Retrieves the script reordering.
 * <p>This implementation does not support reordering and always throws
 * an UnsupportedOperationException.</p>
 * @return the ordering of the scripts if one has been set, null otherwise.
 * @exception UnsupportedOperationException always thrown by this
 *            implementation
 * @see #setScriptOrder
 * @stable
 */
public int[] getScriptOrder() {
    throw new UnsupportedOperationException();
}
// protected constructor ------------------------------------------------- // protected constructor -------------------------------------------------

View File

@ -1,9 +1,9 @@
/** /**
******************************************************************************* *******************************************************************************
* Copyright (C) 1996-2010, International Business Machines Corporation and * * Copyright (C) 1996-2010, International Business Machines Corporation and *
* others. All Rights Reserved. * * others. All Rights Reserved. *
******************************************************************************* *******************************************************************************
*/ */
package com.ibm.icu.text; package com.ibm.icu.text;
import java.io.BufferedInputStream; import java.io.BufferedInputStream;
@ -18,29 +18,30 @@ import com.ibm.icu.impl.ICUResourceBundle;
import com.ibm.icu.impl.IntTrie; import com.ibm.icu.impl.IntTrie;
import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.CollationParsedRuleBuilder.InverseUCA; import com.ibm.icu.text.CollationParsedRuleBuilder.InverseUCA;
import com.ibm.icu.text.RuleBasedCollator.LeadByteConstants;
import com.ibm.icu.text.RuleBasedCollator.UCAConstants; import com.ibm.icu.text.RuleBasedCollator.UCAConstants;
import com.ibm.icu.util.VersionInfo; import com.ibm.icu.util.VersionInfo;
/** /**
* <p>Internal reader class for ICU data file uca.icu containing * <p>Internal reader class for ICU data file uca.icu containing
* Unicode Collation Algorithm data.</p> * Unicode Collation Algorithm data.</p>
* <p>This class simply reads uca.icu, authenticates that it is a valid * <p>This class simply reads uca.icu, authenticates that it is a valid
* ICU data file and split its contents up into blocks of data for use in * ICU data file and split its contents up into blocks of data for use in
* <a href=Collator.html>com.ibm.icu.text.Collator</a>. * <a href=Collator.html>com.ibm.icu.text.Collator</a>.
* </p> * </p>
* <p>uca.icu which is in big-endian format is jared together with this * <p>uca.icu which is in big-endian format is jared together with this
* package.</p> * package.</p>
* @author Syn Wee Quek * @author Syn Wee Quek
* @since release 2.2, April 18 2002 * @since release 2.2, April 18 2002
*/ */
final class CollatorReader final class CollatorReader
{ {
static char[] read(RuleBasedCollator rbc, UCAConstants ucac) throws IOException { static char[] read(RuleBasedCollator rbc, UCAConstants ucac, LeadByteConstants leadByteConstants) throws IOException {
InputStream i = ICUData.getRequiredStream(ICUResourceBundle.ICU_BUNDLE+"/coll/ucadata.icu"); InputStream i = ICUData.getRequiredStream(ICUResourceBundle.ICU_BUNDLE+"/coll/ucadata.icu");
BufferedInputStream b = new BufferedInputStream(i, 90000); BufferedInputStream b = new BufferedInputStream(i, 90000);
CollatorReader reader = new CollatorReader(b); CollatorReader reader = new CollatorReader(b);
char[] result = reader.readImp(rbc, ucac); char[] result = reader.readImp(rbc, ucac, leadByteConstants);
b.close(); b.close();
return result; return result;
} }
@ -62,14 +63,23 @@ final class CollatorReader
} }
static void initRBC(RuleBasedCollator rbc, ByteBuffer data) throws IOException { static void initRBC(RuleBasedCollator rbc, ByteBuffer data) throws IOException {
final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2; // TODO - why? 4 extra bytes? padding in the swapper?
//final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;
final int MIN_BINARY_DATA_SIZE_ = 272;
int dataLength = data.remaining(); int dataLength = data.remaining();
// TODO: Change the rest of this class to use the ByteBuffer directly, rather than // TODO: Change the rest of this class to use the ByteBuffer directly, rather than
// a DataInputStream, except for passing an InputStream to ICUBinary.readHeader(). // a DataInputStream, except for passing an InputStream to ICUBinary.readHeader().
// Consider changing ICUBinary to also work with a ByteBuffer. // Consider changing ICUBinary to also work with a ByteBuffer.
CollatorReader reader = new CollatorReader(makeByteBufferInputStream(data), false); CollatorReader reader = new CollatorReader(makeByteBufferInputStream(data), false);
if (dataLength > MIN_BINARY_DATA_SIZE_) { if (dataLength > MIN_BINARY_DATA_SIZE_) {
reader.readImp(rbc, null); // for (int i = 0; i < dataLength; i++) {
// byte b = data.get(i);
// System.out.print("0x" + (((int) 0xff & b) < 0x0f ? "0" : "") + Integer.toHexString(0xff & b) + " ");
// if (i % 16 == 0) {
// System.out.println();
// }
// }
reader.readImp(rbc, null, null);
} else { } else {
reader.readHeader(rbc); reader.readHeader(rbc);
reader.readOptions(rbc); reader.readOptions(rbc);
@ -77,30 +87,30 @@ final class CollatorReader
rbc.setWithUCATables(); rbc.setWithUCATables();
} }
} }
static InverseUCA getInverseUCA() throws IOException { static InverseUCA getInverseUCA() throws IOException {
InverseUCA result = null; InverseUCA result = null;
InputStream i = ICUData.getRequiredStream(ICUResourceBundle.ICU_BUNDLE+"/coll/invuca.icu"); InputStream i = ICUData.getRequiredStream(ICUResourceBundle.ICU_BUNDLE+"/coll/invuca.icu");
// try { // try {
// String invdat = "/com/ibm/icu/impl/data/invuca.icu"; // String invdat = "/com/ibm/icu/impl/data/invuca.icu";
// InputStream i = CollationParsedRuleBuilder.class.getResourceAsStream(invdat); // InputStream i = CollationParsedRuleBuilder.class.getResourceAsStream(invdat);
BufferedInputStream b = new BufferedInputStream(i, 110000); BufferedInputStream b = new BufferedInputStream(i, 110000);
result = CollatorReader.readInverseUCA(b); result = CollatorReader.readInverseUCA(b);
b.close(); b.close();
i.close(); i.close();
return result; return result;
// } catch (Exception e) { // } catch (Exception e) {
// throw new RuntimeException(e.getMessage()); // throw new RuntimeException(e.getMessage());
// } // }
} }
// protected constructor --------------------------------------------- // protected constructor ---------------------------------------------
/** /**
* <p>Protected constructor.</p> * <p>Protected constructor.</p>
* @param inputStream ICU collator file input stream * @param inputStream ICU collator file input stream
* @exception IOException throw if data file fails authentication * @exception IOException throw if data file fails authentication
*/ */
private CollatorReader(InputStream inputStream) throws IOException private CollatorReader(InputStream inputStream) throws IOException
{ {
this(inputStream, true); this(inputStream, true);
@ -114,40 +124,40 @@ final class CollatorReader
throw new IOException(WRONG_UNICODE_VERSION_ERROR_); throw new IOException(WRONG_UNICODE_VERSION_ERROR_);
} }
m_dataInputStream_ = new DataInputStream(inputStream); m_dataInputStream_ = new DataInputStream(inputStream);
*/ */
} }
/** /**
* <p>Protected constructor.</p> * <p>Protected constructor.</p>
* @param inputStream ICU uprops.icu file input stream * @param inputStream ICU uprops.icu file input stream
* @param readICUHeader flag to indicate if the ICU header has to be read * @param readICUHeader flag to indicate if the ICU header has to be read
* @exception IOException throw if data file fails authentication * @exception IOException throw if data file fails authentication
*/ */
private CollatorReader(InputStream inputStream, boolean readICUHeader) private CollatorReader(InputStream inputStream, boolean readICUHeader)
throws IOException throws IOException
{ {
if (readICUHeader) { if (readICUHeader) {
byte[] UnicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID_, byte[] UnicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID_,
UCA_AUTHENTICATE_); UCA_AUTHENTICATE_);
// weiv: check that we have the correct Unicode version in // weiv: check that we have the correct Unicode version in
// binary files // binary files
VersionInfo UCDVersion = UCharacter.getUnicodeVersion(); VersionInfo UCDVersion = UCharacter.getUnicodeVersion();
if(UnicodeVersion[0] != UCDVersion.getMajor() if(UnicodeVersion[0] != UCDVersion.getMajor()
|| UnicodeVersion[1] != UCDVersion.getMinor()) { || UnicodeVersion[1] != UCDVersion.getMinor()) {
throw new IOException(WRONG_UNICODE_VERSION_ERROR_); throw new IOException(WRONG_UNICODE_VERSION_ERROR_);
} }
} }
m_dataInputStream_ = new DataInputStream(inputStream); m_dataInputStream_ = new DataInputStream(inputStream);
} }
// protected methods ------------------------------------------------- // protected methods -------------------------------------------------
/** /**
* Read and break up the header stream of data passed in as arguments into * Read and break up the header stream of data passed in as arguments into
* meaningful Collator data. * meaningful Collator data.
* @param rbc RuleBasedCollator to populate with header information * @param rbc RuleBasedCollator to populate with header information
* @exception IOException thrown when there's a data error. * @exception IOException thrown when there's a data error.
*/ */
private void readHeader(RuleBasedCollator rbc) throws IOException private void readHeader(RuleBasedCollator rbc) throws IOException
{ {
m_size_ = m_dataInputStream_.readInt(); m_size_ = m_dataInputStream_.readInt();
@ -158,11 +168,11 @@ final class CollatorReader
int readcount = 8; // for size and headersize int readcount = 8; // for size and headersize
// structure which holds values for indirect positioning and implicit // structure which holds values for indirect positioning and implicit
// ranges // ranges
int UCAConst = m_dataInputStream_.readInt(); m_UCAConstOffset_ = m_dataInputStream_.readInt();
readcount += 4; readcount += 4;
// this one is needed only for UCA, to copy the appropriate // this one is needed only for UCA, to copy the appropriate
// contractions // contractions
m_dataInputStream_.skip(4); int contractionUCACombos = m_dataInputStream_.readInt();
readcount += 4; readcount += 4;
// reserved for future use // reserved for future use
m_dataInputStream_.skipBytes(4); m_dataInputStream_.skipBytes(4);
@ -180,7 +190,7 @@ final class CollatorReader
int contractionCE = m_dataInputStream_.readInt(); int contractionCE = m_dataInputStream_.readInt();
readcount += 4; readcount += 4;
// needed for various closures int contractionSize // needed for various closures int contractionSize
/*int contractionSize = */m_dataInputStream_.readInt(); int contractionSize = m_dataInputStream_.readInt();
readcount += 4; readcount += 4;
// array of last collation element in expansion // array of last collation element in expansion
int expansionEndCE = m_dataInputStream_.readInt(); int expansionEndCE = m_dataInputStream_.readInt();
@ -190,7 +200,7 @@ final class CollatorReader
int expansionEndCEMaxSize = m_dataInputStream_.readInt(); int expansionEndCEMaxSize = m_dataInputStream_.readInt();
readcount += 4; readcount += 4;
// size of endExpansionCE int expansionEndCESize // size of endExpansionCE int expansionEndCESize
m_dataInputStream_.skipBytes(4); /*int endExpansionCECount =*/ m_dataInputStream_.readInt();
readcount += 4; readcount += 4;
// hash table of unsafe code points // hash table of unsafe code points
int unsafe = m_dataInputStream_.readInt(); int unsafe = m_dataInputStream_.readInt();
@ -199,25 +209,35 @@ final class CollatorReader
int contractionEnd = m_dataInputStream_.readInt(); int contractionEnd = m_dataInputStream_.readInt();
readcount += 4; readcount += 4;
// int CEcount = m_dataInputStream_.readInt(); // int CEcount = m_dataInputStream_.readInt();
m_dataInputStream_.skipBytes(4); int contractionUCACombosSize = m_dataInputStream_.readInt();
readcount += 4; readcount += 4;
// is jamoSpecial // is jamoSpecial
rbc.m_isJamoSpecial_ = m_dataInputStream_.readBoolean(); rbc.m_isJamoSpecial_ = m_dataInputStream_.readBoolean();
readcount++; readcount++;
// padding // isBigEndian and charSetFamily
m_dataInputStream_.skipBytes(3); m_dataInputStream_.skipBytes(2);
readcount += 3; readcount += 2;
int contractionUCACombosWidth = m_dataInputStream_.readByte();
readcount += 1;
rbc.m_version_ = readVersion(m_dataInputStream_); rbc.m_version_ = readVersion(m_dataInputStream_);
readcount += 4; readcount += 4;
rbc.m_UCA_version_ = readVersion(m_dataInputStream_); rbc.m_UCA_version_ = readVersion(m_dataInputStream_);
readcount += 4; readcount += 4;
rbc.m_UCD_version_ = readVersion(m_dataInputStream_); rbc.m_UCD_version_ = readVersion(m_dataInputStream_);
readcount += 4; readcount += 4;
VersionInfo formatVersion = readVersion(m_dataInputStream_);
readcount += 4;
rbc.m_scriptToLeadBytes = m_dataInputStream_.readInt();
readcount += 4;
rbc.m_leadByteToScripts = m_dataInputStream_.readInt();
readcount += 4;
// byte charsetName[] = new byte[32]; // for charset CEs // byte charsetName[] = new byte[32]; // for charset CEs
m_dataInputStream_.skipBytes(32); m_dataInputStream_.skipBytes(32);
readcount += 32; readcount += 32;
m_dataInputStream_.skipBytes(56); // for future use
readcount += 56; m_dataInputStream_.skipBytes(44); // for future use
readcount += 44;
if (m_headerSize_ < readcount) { if (m_headerSize_ < readcount) {
///CLOVER:OFF ///CLOVER:OFF
throw new IOException("Internal Error: Header size error"); throw new IOException("Internal Error: Header size error");
@ -237,16 +257,20 @@ final class CollatorReader
m_expansionEndCESize_ = expansionEndCEMaxSize - expansionEndCE; m_expansionEndCESize_ = expansionEndCEMaxSize - expansionEndCE;
m_expansionEndCEMaxSizeSize_ = unsafe - expansionEndCEMaxSize; m_expansionEndCEMaxSizeSize_ = unsafe - expansionEndCEMaxSize;
m_unsafeSize_ = contractionEnd - unsafe; m_unsafeSize_ = contractionEnd - unsafe;
m_UCAValuesSize_ = m_size_ - UCAConst; // UCA value, will be handled //m_UCAValuesSize_ = m_size_ - UCAConst; // UCA value, will be handled later
// later m_UCAcontractionSize_ = contractionUCACombosSize * contractionUCACombosWidth * 2;
// treat it as normal collator first // treat it as normal collator first
// for normal collator there is no UCA contraction // for normal collator there is no UCA contraction
m_contractionEndSize_ = m_size_ - contractionEnd; // contractions (UChar[contractionSize] + CE[contractionSize])
int old_contractionSize_ = m_size_ - contractionEnd;
// m_contractionSize_ = contractionSize * 2 + contractionSize * 4;
m_contractionSize_ = contractionSize * 2 + contractionSize * 4;
rbc.m_contractionOffset_ >>= 1; // casting to ints rbc.m_contractionOffset_ >>= 1; // casting to ints
rbc.m_expansionOffset_ >>= 2; // casting to chars rbc.m_expansionOffset_ >>= 2; // casting to chars
} }
/** /**
* Read and break up the collation options passed in the stream of data and * Read and break up the collation options passed in the stream of data and
* update the argument Collator with the results * update the argument Collator with the results
@ -262,16 +286,19 @@ final class CollatorReader
rbc.m_defaultVariableTopValue_ = m_dataInputStream_.readInt(); rbc.m_defaultVariableTopValue_ = m_dataInputStream_.readInt();
readcount += 4; readcount += 4;
rbc.m_defaultIsFrenchCollation_ = (m_dataInputStream_.readInt() rbc.m_defaultIsFrenchCollation_ = (m_dataInputStream_.readInt()
== RuleBasedCollator.AttributeValue.ON_); == RuleBasedCollator.AttributeValue.ON_);
readcount += 4; readcount += 4;
rbc.m_defaultIsAlternateHandlingShifted_ rbc.m_defaultIsAlternateHandlingShifted_
= (m_dataInputStream_.readInt() == = (m_dataInputStream_.readInt() ==
RuleBasedCollator.AttributeValue.SHIFTED_); RuleBasedCollator.AttributeValue.SHIFTED_);
readcount += 4; readcount += 4;
rbc.m_defaultCaseFirst_ = m_dataInputStream_.readInt(); rbc.m_defaultCaseFirst_ = m_dataInputStream_.readInt();
readcount += 4; readcount += 4;
rbc.m_defaultIsCaseLevel_ = (m_dataInputStream_.readInt() // rbc.m_defaultIsCaseLevel_ = (m_dataInputStream_.readInt()
== RuleBasedCollator.AttributeValue.ON_); // == RuleBasedCollator.AttributeValue.ON_);
int defaultIsCaseLevel = m_dataInputStream_.readInt();
rbc.m_defaultIsCaseLevel_ = (defaultIsCaseLevel
== RuleBasedCollator.AttributeValue.ON_);
readcount += 4; readcount += 4;
int value = m_dataInputStream_.readInt(); int value = m_dataInputStream_.readInt();
readcount += 4; readcount += 4;
@ -285,10 +312,10 @@ final class CollatorReader
rbc.m_defaultStrength_ = m_dataInputStream_.readInt(); rbc.m_defaultStrength_ = m_dataInputStream_.readInt();
readcount += 4; readcount += 4;
rbc.m_defaultIsHiragana4_ = (m_dataInputStream_.readInt() rbc.m_defaultIsHiragana4_ = (m_dataInputStream_.readInt()
== RuleBasedCollator.AttributeValue.ON_); == RuleBasedCollator.AttributeValue.ON_);
readcount += 4; readcount += 4;
rbc.m_defaultIsNumericCollation_ = (m_dataInputStream_.readInt() rbc.m_defaultIsNumericCollation_ = (m_dataInputStream_.readInt()
== RuleBasedCollator.AttributeValue.ON_); == RuleBasedCollator.AttributeValue.ON_);
readcount += 4; readcount += 4;
m_dataInputStream_.skip(60); // reserved for future use m_dataInputStream_.skip(60); // reserved for future use
readcount += 60; readcount += 60;
@ -299,21 +326,25 @@ final class CollatorReader
///CLOVER:ON ///CLOVER:ON
} }
} }
/** /**
* Read and break up the stream of data passed in as arguments into * Read and break up the stream of data passed in as arguments into
* meaningful Collator data. * meaningful Collator data.
* @param rbc RuleBasedCollator to populate * @param rbc RuleBasedCollator to populate
* @param UCAConst object to fill up with UCA constants if we are reading * @param UCAConst object to fill up with UCA constants if we are reading
* the UCA collator, if not use a null * the UCA collator, if not use a null
* @return UCAContractions array filled up with the UCA contractions if we * @param leadByteConstants
* are reading the UCA collator * @return UCAContractions array filled up with the UCA contractions if we
* @exception IOException thrown when there's a data error. * are reading the UCA collator
*/ * @exception IOException thrown when there's a data error.
*/
private char[] readImp(RuleBasedCollator rbc, private char[] readImp(RuleBasedCollator rbc,
RuleBasedCollator.UCAConstants UCAConst) RuleBasedCollator.UCAConstants UCAConst,
throws IOException RuleBasedCollator.LeadByteConstants leadByteConstants)
throws IOException
{ {
char ucaContractions[] = null; // return result
readHeader(rbc); readHeader(rbc);
// header size has been checked by readHeader // header size has been checked by readHeader
int readcount = m_headerSize_; int readcount = m_headerSize_;
@ -328,24 +359,24 @@ final class CollatorReader
readcount += (m_expansionSize_ << 2); readcount += (m_expansionSize_ << 2);
if (m_contractionIndexSize_ > 0) { if (m_contractionIndexSize_ > 0) {
m_contractionIndexSize_ >>= 1; m_contractionIndexSize_ >>= 1;
rbc.m_contractionIndex_ = new char[m_contractionIndexSize_]; rbc.m_contractionIndex_ = new char[m_contractionIndexSize_];
for (int i = 0; i < m_contractionIndexSize_; i ++) { for (int i = 0; i < m_contractionIndexSize_; i ++) {
rbc.m_contractionIndex_[i] = m_dataInputStream_.readChar(); rbc.m_contractionIndex_[i] = m_dataInputStream_.readChar();
} }
readcount += (m_contractionIndexSize_ << 1); readcount += (m_contractionIndexSize_ << 1);
m_contractionCESize_ >>= 2; m_contractionCESize_ >>= 2;
rbc.m_contractionCE_ = new int[m_contractionCESize_]; rbc.m_contractionCE_ = new int[m_contractionCESize_];
for (int i = 0; i < m_contractionCESize_; i ++) { for (int i = 0; i < m_contractionCESize_; i ++) {
rbc.m_contractionCE_[i] = m_dataInputStream_.readInt(); rbc.m_contractionCE_[i] = m_dataInputStream_.readInt();
} }
readcount += (m_contractionCESize_ << 2); readcount += (m_contractionCESize_ << 2);
} }
rbc.m_trie_ = new IntTrie(m_dataInputStream_, rbc.m_trie_ = new IntTrie(m_dataInputStream_,
RuleBasedCollator.DataManipulate.getInstance()); RuleBasedCollator.DataManipulate.getInstance());
if (!rbc.m_trie_.isLatin1Linear()) { if (!rbc.m_trie_.isLatin1Linear()) {
throw new IOException("Data corrupted, " throw new IOException("Data corrupted, "
+ "Collator Tries expected to have linear " + "Collator Tries expected to have linear "
+ "latin one data arrays"); + "latin one data arrays");
} }
readcount += rbc.m_trie_.getSerializedDataSize(); readcount += rbc.m_trie_.getSerializedDataSize();
m_expansionEndCESize_ >>= 2; m_expansionEndCESize_ >>= 2;
@ -368,13 +399,16 @@ final class CollatorReader
// we are reading the UCA // we are reading the UCA
// unfortunately the UCA offset in any collator data is not 0 and // unfortunately the UCA offset in any collator data is not 0 and
// only refers to the UCA data // only refers to the UCA data
m_contractionEndSize_ -= m_UCAValuesSize_; //m_contractionSize_ -= m_UCAValuesSize_;
m_contractionSize_ = m_UCAConstOffset_ - readcount;
} else {
m_contractionSize_ = m_size_ - readcount;
} }
rbc.m_contractionEnd_ = new byte[m_contractionEndSize_]; rbc.m_contractionEnd_ = new byte[m_contractionSize_];
for (int i = 0; i < m_contractionEndSize_; i ++) { for (int i = 0; i < m_contractionSize_; i ++) {
rbc.m_contractionEnd_[i] = m_dataInputStream_.readByte(); rbc.m_contractionEnd_[i] = m_dataInputStream_.readByte();
} }
readcount += m_contractionEndSize_; readcount += m_contractionSize_;
if (UCAConst != null) { if (UCAConst != null) {
UCAConst.FIRST_TERTIARY_IGNORABLE_[0] UCAConst.FIRST_TERTIARY_IGNORABLE_[0]
= m_dataInputStream_.readInt(); = m_dataInputStream_.readInt();
@ -383,22 +417,22 @@ final class CollatorReader
= m_dataInputStream_.readInt(); = m_dataInputStream_.readInt();
readUCAConstcount += 4; readUCAConstcount += 4;
UCAConst.LAST_TERTIARY_IGNORABLE_[0] UCAConst.LAST_TERTIARY_IGNORABLE_[0]
= m_dataInputStream_.readInt(); = m_dataInputStream_.readInt();
readUCAConstcount += 4; readUCAConstcount += 4;
UCAConst.LAST_TERTIARY_IGNORABLE_[1] UCAConst.LAST_TERTIARY_IGNORABLE_[1]
= m_dataInputStream_.readInt(); = m_dataInputStream_.readInt();
readUCAConstcount += 4; readUCAConstcount += 4;
UCAConst.FIRST_PRIMARY_IGNORABLE_[0] UCAConst.FIRST_PRIMARY_IGNORABLE_[0]
= m_dataInputStream_.readInt(); = m_dataInputStream_.readInt();
readUCAConstcount += 4; readUCAConstcount += 4;
UCAConst.FIRST_PRIMARY_IGNORABLE_[1] UCAConst.FIRST_PRIMARY_IGNORABLE_[1]
= m_dataInputStream_.readInt(); = m_dataInputStream_.readInt();
readUCAConstcount += 4; readUCAConstcount += 4;
UCAConst.FIRST_SECONDARY_IGNORABLE_[0] UCAConst.FIRST_SECONDARY_IGNORABLE_[0]
= m_dataInputStream_.readInt(); = m_dataInputStream_.readInt();
readUCAConstcount += 4; readUCAConstcount += 4;
UCAConst.FIRST_SECONDARY_IGNORABLE_[1] UCAConst.FIRST_SECONDARY_IGNORABLE_[1]
= m_dataInputStream_.readInt(); = m_dataInputStream_.readInt();
readUCAConstcount += 4; readUCAConstcount += 4;
UCAConst.LAST_SECONDARY_IGNORABLE_[0] UCAConst.LAST_SECONDARY_IGNORABLE_[0]
= m_dataInputStream_.readInt(); = m_dataInputStream_.readInt();
@ -407,10 +441,10 @@ final class CollatorReader
= m_dataInputStream_.readInt(); = m_dataInputStream_.readInt();
readUCAConstcount += 4; readUCAConstcount += 4;
UCAConst.LAST_PRIMARY_IGNORABLE_[0] UCAConst.LAST_PRIMARY_IGNORABLE_[0]
= m_dataInputStream_.readInt(); = m_dataInputStream_.readInt();
readUCAConstcount += 4; readUCAConstcount += 4;
UCAConst.LAST_PRIMARY_IGNORABLE_[1] UCAConst.LAST_PRIMARY_IGNORABLE_[1]
= m_dataInputStream_.readInt(); = m_dataInputStream_.readInt();
readUCAConstcount += 4; readUCAConstcount += 4;
UCAConst.FIRST_VARIABLE_[0] = m_dataInputStream_.readInt(); UCAConst.FIRST_VARIABLE_[0] = m_dataInputStream_.readInt();
readUCAConstcount += 4; readUCAConstcount += 4;
@ -462,27 +496,39 @@ final class CollatorReader
readUCAConstcount += 4; readUCAConstcount += 4;
UCAConst.PRIMARY_SPECIAL_MAX_ = m_dataInputStream_.readInt(); UCAConst.PRIMARY_SPECIAL_MAX_ = m_dataInputStream_.readInt();
readUCAConstcount += 4; readUCAConstcount += 4;
int resultsize = (m_UCAValuesSize_ - readUCAConstcount) >> 1;
char result[] = new char[resultsize]; readcount += readUCAConstcount;
//int resultsize = m_UCAcontractionSize_ / 2;
int resultsize = (rbc.m_scriptToLeadBytes - readcount) / 2;
ucaContractions = new char[resultsize];
for (int i = 0; i < resultsize; i ++) { for (int i = 0; i < resultsize; i ++) {
result[i] = m_dataInputStream_.readChar(); ucaContractions[i] = m_dataInputStream_.readChar();
} }
readcount += m_UCAValuesSize_; readcount += m_UCAcontractionSize_;
if (readcount != m_size_) {
///CLOVER:OFF // if (readcount != m_size_) {
throw new IOException("Internal Error: Data file size error"); // ///CLOVER:OFF
///CLOVER:ON // throw new IOException("Internal Error: Data file size error");
} // ///CLOVER:ON
return result; // }
} }
if (leadByteConstants != null)
{
readcount += m_dataInputStream_.skip(rbc.m_scriptToLeadBytes - readcount);
leadByteConstants.read(m_dataInputStream_);
readcount += leadByteConstants.getSerializedDataSize();
}
if (readcount != m_size_) { if (readcount != m_size_) {
///CLOVER:OFF ///CLOVER:OFF
throw new IOException("Internal Error: Data file size error"); throw new IOException("Internal Error: Data file size error");
///CLOVER:ON ///CLOVER:ON
} }
return null; return ucaContractions;
} }
/** /**
* Reads in the inverse uca data * Reads in the inverse uca data
* @param input input stream with the inverse uca data * @param input input stream with the inverse uca data
@ -491,22 +537,22 @@ final class CollatorReader
* inverse uca * inverse uca
*/ */
private static CollationParsedRuleBuilder.InverseUCA readInverseUCA( private static CollationParsedRuleBuilder.InverseUCA readInverseUCA(
InputStream inputStream) InputStream inputStream)
throws IOException throws IOException
{ {
byte[] UnicodeVersion = ICUBinary.readHeader(inputStream, INVERSE_UCA_DATA_FORMAT_ID_, byte[] UnicodeVersion = ICUBinary.readHeader(inputStream, INVERSE_UCA_DATA_FORMAT_ID_,
INVERSE_UCA_AUTHENTICATE_); INVERSE_UCA_AUTHENTICATE_);
// weiv: check that we have the correct Unicode version in // weiv: check that we have the correct Unicode version in
// binary files // binary files
VersionInfo UCDVersion = UCharacter.getUnicodeVersion(); VersionInfo UCDVersion = UCharacter.getUnicodeVersion();
if(UnicodeVersion[0] != UCDVersion.getMajor() if(UnicodeVersion[0] != UCDVersion.getMajor()
|| UnicodeVersion[1] != UCDVersion.getMinor()) { || UnicodeVersion[1] != UCDVersion.getMinor()) {
throw new IOException(WRONG_UNICODE_VERSION_ERROR_); throw new IOException(WRONG_UNICODE_VERSION_ERROR_);
} }
CollationParsedRuleBuilder.InverseUCA result = CollationParsedRuleBuilder.InverseUCA result =
new CollationParsedRuleBuilder.InverseUCA(); new CollationParsedRuleBuilder.InverseUCA();
DataInputStream input = new DataInputStream(inputStream); DataInputStream input = new DataInputStream(inputStream);
input.readInt(); // bytesize input.readInt(); // bytesize
int tablesize = input.readInt(); // in int size int tablesize = input.readInt(); // in int size
@ -515,11 +561,11 @@ final class CollatorReader
input.readInt(); // conts in bytes input.readInt(); // conts in bytes
result.m_UCA_version_ = readVersion(input); result.m_UCA_version_ = readVersion(input);
input.skipBytes(8); // skip padding input.skipBytes(8); // skip padding
int size = tablesize * 3; // one column for each strength int size = tablesize * 3; // one column for each strength
result.m_table_ = new int[size]; result.m_table_ = new int[size];
result.m_continuations_ = new char[contsize]; result.m_continuations_ = new char[contsize];
for (int i = 0; i < size; i ++) { for (int i = 0; i < size; i ++) {
result.m_table_[i] = input.readInt(); result.m_table_[i] = input.readInt();
} }
@ -529,7 +575,7 @@ final class CollatorReader
input.close(); input.close();
return result; return result;
} }
/** /**
* Reads four bytes from the input and returns a VersionInfo * Reads four bytes from the input and returns a VersionInfo
* object. Use it to read different collator versions. * object. Use it to read different collator versions.
@ -539,143 +585,147 @@ final class CollatorReader
* @throws IOException thrown when error occurs while reading * @throws IOException thrown when error occurs while reading
* version bytes * version bytes
*/ */
protected static VersionInfo readVersion(DataInputStream input) protected static VersionInfo readVersion(DataInputStream input)
throws IOException { throws IOException {
byte[] version = new byte[4]; byte[] version = new byte[4];
version[0] = input.readByte(); version[0] = input.readByte();
version[1] = input.readByte(); version[1] = input.readByte();
version[2] = input.readByte(); version[2] = input.readByte();
version[3] = input.readByte(); version[3] = input.readByte();
VersionInfo result = VersionInfo result =
VersionInfo.getInstance( VersionInfo.getInstance(
(int)version[0], (int)version[1], (int)version[0], (int)version[1],
(int)version[2], (int)version[3]); (int)version[2], (int)version[3]);
return result; return result;
} }
// private inner class ----------------------------------------------- // private inner class -----------------------------------------------
// private variables ------------------------------------------------- // private variables -------------------------------------------------
/** /**
* Authenticate uca data format version * Authenticate uca data format version
*/ */
private static final ICUBinary.Authenticate UCA_AUTHENTICATE_ private static final ICUBinary.Authenticate UCA_AUTHENTICATE_
= new ICUBinary.Authenticate() { = new ICUBinary.Authenticate() {
public boolean isDataVersionAcceptable(byte version[]) public boolean isDataVersionAcceptable(byte version[])
{ {
return version[0] == DATA_FORMAT_VERSION_[0] return version[0] == DATA_FORMAT_VERSION_[0]
&& version[1] >= DATA_FORMAT_VERSION_[1]; && version[1] >= DATA_FORMAT_VERSION_[1];
// Too harsh // Too harsh
//&& version[1] == DATA_FORMAT_VERSION_[1] //&& version[1] == DATA_FORMAT_VERSION_[1]
//&& version[2] == DATA_FORMAT_VERSION_[2] //&& version[2] == DATA_FORMAT_VERSION_[2]
//&& version[3] == DATA_FORMAT_VERSION_[3]; //&& version[3] == DATA_FORMAT_VERSION_[3];
} }
}; };
/** /**
* Authenticate uca data format version * Authenticate uca data format version
*/ */
private static final ICUBinary.Authenticate INVERSE_UCA_AUTHENTICATE_ private static final ICUBinary.Authenticate INVERSE_UCA_AUTHENTICATE_
= new ICUBinary.Authenticate() { = new ICUBinary.Authenticate() {
public boolean isDataVersionAcceptable(byte version[]) public boolean isDataVersionAcceptable(byte version[])
{ {
return version[0] return version[0]
== INVERSE_UCA_DATA_FORMAT_VERSION_[0] == INVERSE_UCA_DATA_FORMAT_VERSION_[0]
&& version[1] && version[1]
>= INVERSE_UCA_DATA_FORMAT_VERSION_[1]; >= INVERSE_UCA_DATA_FORMAT_VERSION_[1];
} }
}; };
/**
* Data input stream for uca.icu
*/
private DataInputStream m_dataInputStream_;
/**
* File format version and id that this class understands.
* No guarantees are made if a older version is used
*/
private static final byte DATA_FORMAT_VERSION_[] =
{(byte)0x2, (byte)0x2, (byte)0x0, (byte)0x0};
private static final byte DATA_FORMAT_ID_[] = {(byte)0x55, (byte)0x43,
(byte)0x6f, (byte)0x6c};
/**
* Inverse UCA file format version and id that this class understands.
* No guarantees are made if a older version is used
*/
private static final byte INVERSE_UCA_DATA_FORMAT_VERSION_[] =
{(byte)0x2, (byte)0x1, (byte)0x0, (byte)0x0};
private static final byte INVERSE_UCA_DATA_FORMAT_ID_[] = {(byte)0x49,
(byte)0x6e,
(byte)0x76,
(byte)0x43};
/**
* Wrong unicode version error string
*/
private static final String WRONG_UNICODE_VERSION_ERROR_ =
"Unicode version in binary image is not compatible with the current Unicode version";
/** /**
* Size of expansion table in bytes * Data input stream for uca.icu
*/ */
private int m_expansionSize_; private DataInputStream m_dataInputStream_;
/** /**
* Size of contraction index table in bytes * File format version and id that this class understands.
* No guarantees are made if a older version is used
*/ */
private int m_contractionIndexSize_; private static final byte DATA_FORMAT_VERSION_[] =
{(byte)0x3, (byte)0x0, (byte)0x0, (byte)0x0};
private static final byte DATA_FORMAT_ID_[] = {(byte)0x55, (byte)0x43,
(byte)0x6f, (byte)0x6c};
/** /**
* Size of contraction table in bytes * Inverse UCA file format version and id that this class understands.
* No guarantees are made if a older version is used
*/ */
private int m_contractionCESize_; private static final byte INVERSE_UCA_DATA_FORMAT_VERSION_[] =
/* {(byte)0x2, (byte)0x1, (byte)0x0, (byte)0x0};
* Size of the Trie in bytes private static final byte INVERSE_UCA_DATA_FORMAT_ID_[] = {(byte)0x49,
*/ (byte)0x6e,
//private int m_trieSize_; (byte)0x76,
(byte)0x43};
/** /**
* Size of the table that contains information about collation elements * Wrong unicode version error string
* that end with an expansion
*/ */
private int m_expansionEndCESize_; private static final String WRONG_UNICODE_VERSION_ERROR_ =
/** "Unicode version in binary image is not compatible with the current Unicode version";
* Size of the table that contains information about the maximum size of
* collation elements that end with a particular expansion CE corresponding /**
* to the ones in expansionEndCE * Size of expansion table in bytes
*/ */
private int m_expansionEndCEMaxSizeSize_; private int m_expansionSize_;
/** /**
* Size of the option table that contains information about the collation * Size of contraction index table in bytes
* options */
*/ private int m_contractionIndexSize_;
private int m_optionSize_; /**
/** * Size of contraction table in bytes
* Size of the whole data file minusing the ICU header */
*/ private int m_contractionCESize_;
private int m_size_; /*
/** * Size of the Trie in bytes
* Size of the collation data header */
*/ //private int m_trieSize_;
private int m_headerSize_; /**
/** * Size of the table that contains information about collation elements
* Size of the table that contains information about the "Unsafe" * that end with an expansion
* codepoints */
*/ private int m_expansionEndCESize_;
private int m_unsafeSize_; /**
/** * Size of the table that contains information about the maximum size of
* Size of the table that contains information about codepoints that ends * collation elements that end with a particular expansion CE corresponding
* with a contraction * to the ones in expansionEndCE
*/ */
private int m_contractionEndSize_; private int m_expansionEndCEMaxSizeSize_;
/** /**
* Size of the table that contains UCA contraction information * Size of the option table that contains information about the collation
*/ * options
private int m_UCAValuesSize_; */
private int m_optionSize_;
// private methods --------------------------------------------------- /**
* Size of the whole data file minusing the ICU header
*/
private int m_size_;
/**
* Size of the collation data header
*/
private int m_headerSize_;
/**
* Size of the table that contains information about the "Unsafe"
* codepoints
*/
private int m_unsafeSize_;
/**
* Size in bytes of the table that contains information about codepoints that ends
* with a contraction
*/
private int m_contractionSize_;
/**
* Size of the table that contains UCA contraction information in bytes
*/
private int m_UCAcontractionSize_;
/**
* Offset of the UCA Const
*/
private int m_UCAConstOffset_;
// private methods ---------------------------------------------------
} }