From 1fac4c690bf47e3ca739b0a10c431a1ec92f2679 Mon Sep 17 00:00:00 2001 From: Stuart Gill Date: Mon, 8 Nov 2010 18:57:42 +0000 Subject: [PATCH] ICU-3984 initial commit of the collation reordering X-SVN-Rev: 29015 --- .../icu/text/CollationParsedRuleBuilder.java | 1 + .../com/ibm/icu/text/CollationRuleParser.java | 59 +- .../src/com/ibm/icu/text/Collator.java | 34 + .../src/com/ibm/icu/text/CollatorReader.java | 546 +- .../com/ibm/icu/text/RuleBasedCollator.java | 4442 ++++++++--------- 5 files changed, 2606 insertions(+), 2476 deletions(-) diff --git a/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationParsedRuleBuilder.java b/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationParsedRuleBuilder.java index 14d8c23a97..2ef16ba685 100644 --- a/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationParsedRuleBuilder.java +++ b/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationParsedRuleBuilder.java @@ -3631,6 +3631,7 @@ final class CollationParsedRuleBuilder { collator.m_isHiragana4_ = option.m_isHiragana4_; collator.setStrength(option.m_strength_); collator.m_variableTopValue_ = option.m_variableTopValue_; + collator.m_scriptOrder_ = option.m_scriptOrder_; collator.latinOneFailed_ = false; } diff --git a/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationRuleParser.java b/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationRuleParser.java index 69e715404a..e8d8da220d 100644 --- a/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationRuleParser.java +++ b/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationRuleParser.java @@ -7,6 +7,7 @@ package com.ibm.icu.text; import java.text.ParseException; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Map; @@ -16,6 +17,7 @@ import com.ibm.icu.util.UResourceBundle; import com.ibm.icu.util.ULocale; import com.ibm.icu.impl.UCharacterProperty; import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UScript; /** * Class for parsing collation 
rules, produces a list of tokens that will be @@ -89,6 +91,14 @@ final class CollationRuleParser m_decomposition_ = collator.getDecomposition(); m_strength_ = collator.getStrength(); m_isHiragana4_ = collator.m_isHiragana4_; + + if(collator.m_scriptOrder_ != null){ + m_scriptOrder_ = new int[collator.m_scriptOrder_.length]; + for(int i = 0; i < m_scriptOrder_.length; i++){ + m_scriptOrder_[i] = collator.m_scriptOrder_[i]; + } + } + } // package private data members -------------------------------------- @@ -119,6 +129,11 @@ final class CollationRuleParser * attribute for special Hiragana */ boolean m_isHiragana4_; + + /** + * the ordering of the scripts + */ + int[] m_scriptOrder_; } /** @@ -291,6 +306,14 @@ final class CollationRuleParser collator.m_defaultCaseFirst_ = m_options_.m_caseFirst_; collator.m_defaultIsHiragana4_ = m_options_.m_isHiragana4_; collator.m_defaultVariableTopValue_ = m_options_.m_variableTopValue_; + if(m_options_.m_scriptOrder_ != null) { + collator.m_defaultScriptOrder_ = new int[m_options_.m_scriptOrder_.length]; + for (int i = 0; i < m_options_.m_scriptOrder_.length; i++) { + collator.m_defaultScriptOrder_[i] = m_options_.m_scriptOrder_[i]; + } + } else { + collator.m_defaultScriptOrder_ = null; + } } // private inner classes ------------------------------------------------- @@ -662,7 +685,7 @@ final class CollationRuleParser RULES_OPTIONS_[15] = new TokenOption("undefined", RuleBasedCollator.Attribute.LIMIT_, null, null); - RULES_OPTIONS_[16] = new TokenOption("scriptOrder", + RULES_OPTIONS_[16] = new TokenOption("reorder", RuleBasedCollator.Attribute.LIMIT_, null, null); RULES_OPTIONS_[17] = new TokenOption("charsetname", @@ -2028,7 +2051,6 @@ final class CollationRuleParser return new UnicodeSet(source.substring(start, start+current)); //uset_openPattern(start, current); } - /** in C, optionarg is passed by reference to function. * We use a private int to simulate this. 
*/ @@ -2061,6 +2083,7 @@ final class CollationRuleParser } return i; } + /** * Reads and set collation options * @return TOKEN_SUCCESS if option is set correct, 0 otherwise @@ -2152,6 +2175,11 @@ final class CollationRuleParser m_optionEnd_ = m_current_-1; return TOKEN_SUCCESS_MASK_; } + else if(i == 16) { + m_current_ = m_optionarg_; // skip opening brace and name + parseScriptReorder(); + return TOKEN_SUCCESS_MASK_; + } else { throwParseException(m_rules_, optionarg); } @@ -2282,4 +2310,31 @@ final class CollationRuleParser } return rules; } + + private void parseScriptReorder() throws ParseException{ + ArrayList tempOrder = new ArrayList(); + int end = m_rules_.indexOf(']', m_current_); + while(m_current_ < end){ + // Ensure that the following token is 4 characters long + if ((end != m_current_+4) && + (m_rules_.charAt(m_current_+4) != ' ')) { + throw new ParseException(m_rules_, m_current_); + } + int[] script = UScript.getCode(m_rules_.substring(m_current_, m_current_+4)); + if (script.length > 0) { + tempOrder.add(script[0]); + } else { + throw new ParseException(m_rules_, m_current_); + } + m_current_+= 4; + while (m_current_ < end && UCharacter.isWhitespace(m_rules_.charAt(m_current_))) + { // eat whitespace + m_current_++; + } + } + m_options_.m_scriptOrder_ = new int[tempOrder.size()]; + for(int i = 0; i < tempOrder.size(); i++){ + m_options_.m_scriptOrder_[i] = tempOrder.get(i); + } + } } diff --git a/icu4j/main/classes/collate/src/com/ibm/icu/text/Collator.java b/icu4j/main/classes/collate/src/com/ibm/icu/text/Collator.java index b0c1fcf8c5..19b7dcc243 100644 --- a/icu4j/main/classes/collate/src/com/ibm/icu/text/Collator.java +++ b/icu4j/main/classes/collate/src/com/ibm/icu/text/Collator.java @@ -225,6 +225,18 @@ public abstract class Collator implements Comparator, Cloneable */ public final static int CANONICAL_DECOMPOSITION = 17; + public final static class CollationReorderCodes { + private CollationReorderCodes() {} + + public final static int SPACE 
= 0x1000; + public final static int FIRST = SPACE; + public final static int PUNCTUATION = 0x1001; + public final static int SYMBOL = 0x1002; + public final static int CURRENCY = 0x1003; + public final static int DIGIT = 0x1004; + public final static int LIMIT = 0x1005; + + } // public methods -------------------------------------------------------- // public setters -------------------------------------------------------- @@ -314,6 +326,17 @@ public abstract class Collator implements Comparator, Cloneable } } + /** + * Set the order for scripts to be ordered in. + * @param order the reordering of scripts + * @see #getScriptOrder + * @stable + */ + public void setScriptOrder(int... order) + { + throw new UnsupportedOperationException(); + } + // public getters -------------------------------------------------------- /** @@ -988,6 +1011,17 @@ public abstract class Collator implements Comparator, Cloneable * @stable ICU 2.8 */ public abstract VersionInfo getUCAVersion(); + + /** + * Method to retrieve the script reordering + * @see #setScriptOrder + * @return the ordering of the scripts if one has been set, null otherwise. + * @stable + */ + public int[] getScriptOrder() + { + throw new UnsupportedOperationException(); + } // protected constructor ------------------------------------------------- diff --git a/icu4j/main/classes/collate/src/com/ibm/icu/text/CollatorReader.java b/icu4j/main/classes/collate/src/com/ibm/icu/text/CollatorReader.java index d3d02ed69c..8ddfaf6c8a 100644 --- a/icu4j/main/classes/collate/src/com/ibm/icu/text/CollatorReader.java +++ b/icu4j/main/classes/collate/src/com/ibm/icu/text/CollatorReader.java @@ -1,9 +1,9 @@ /** -******************************************************************************* -* Copyright (C) 1996-2010, International Business Machines Corporation and * -* others. All Rights Reserved. 
* -******************************************************************************* -*/ + ******************************************************************************* + * Copyright (C) 1996-2010, International Business Machines Corporation and * + * others. All Rights Reserved. * + ******************************************************************************* + */ package com.ibm.icu.text; import java.io.BufferedInputStream; @@ -18,29 +18,30 @@ import com.ibm.icu.impl.ICUResourceBundle; import com.ibm.icu.impl.IntTrie; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.CollationParsedRuleBuilder.InverseUCA; +import com.ibm.icu.text.RuleBasedCollator.LeadByteConstants; import com.ibm.icu.text.RuleBasedCollator.UCAConstants; import com.ibm.icu.util.VersionInfo; /** -*

Internal reader class for ICU data file uca.icu containing -* Unicode Collation Algorithm data.

-*

This class simply reads uca.icu, authenticates that it is a valid -* ICU data file and split its contents up into blocks of data for use in -* com.ibm.icu.text.Collator. -*

-*

uca.icu which is in big-endian format is jared together with this -* package.

-* @author Syn Wee Quek -* @since release 2.2, April 18 2002 -*/ + *

Internal reader class for ICU data file uca.icu containing + * Unicode Collation Algorithm data.

+ *

This class simply reads uca.icu, authenticates that it is a valid + * ICU data file and split its contents up into blocks of data for use in + * com.ibm.icu.text.Collator. + *

+ *

uca.icu which is in big-endian format is jared together with this + * package.

+ * @author Syn Wee Quek + * @since release 2.2, April 18 2002 + */ final class CollatorReader -{ - static char[] read(RuleBasedCollator rbc, UCAConstants ucac) throws IOException { +{ + static char[] read(RuleBasedCollator rbc, UCAConstants ucac, LeadByteConstants leadByteConstants) throws IOException { InputStream i = ICUData.getRequiredStream(ICUResourceBundle.ICU_BUNDLE+"/coll/ucadata.icu"); BufferedInputStream b = new BufferedInputStream(i, 90000); CollatorReader reader = new CollatorReader(b); - char[] result = reader.readImp(rbc, ucac); + char[] result = reader.readImp(rbc, ucac, leadByteConstants); b.close(); return result; } @@ -62,14 +63,23 @@ final class CollatorReader } static void initRBC(RuleBasedCollator rbc, ByteBuffer data) throws IOException { - final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2; + // TODO - why? 4 extra bytes? padding in the swapper? + //final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2; + final int MIN_BINARY_DATA_SIZE_ = 272; int dataLength = data.remaining(); // TODO: Change the rest of this class to use the ByteBuffer directly, rather than // a DataInputStream, except for passing an InputStream to ICUBinary.readHeader(). // Consider changing ICUBinary to also work with a ByteBuffer. CollatorReader reader = new CollatorReader(makeByteBufferInputStream(data), false); if (dataLength > MIN_BINARY_DATA_SIZE_) { - reader.readImp(rbc, null); +// for (int i = 0; i < dataLength; i++) { +// byte b = data.get(i); +// System.out.print("0x" + (((int) 0xff & b) < 0x0f ? 
"0" : "") + Integer.toHexString(0xff & b) + " "); +// if (i % 16 == 0) { +// System.out.println(); +// } +// } + reader.readImp(rbc, null, null); } else { reader.readHeader(rbc); reader.readOptions(rbc); @@ -77,30 +87,30 @@ final class CollatorReader rbc.setWithUCATables(); } } - + static InverseUCA getInverseUCA() throws IOException { InverseUCA result = null; InputStream i = ICUData.getRequiredStream(ICUResourceBundle.ICU_BUNDLE+"/coll/invuca.icu"); -// try { -// String invdat = "/com/ibm/icu/impl/data/invuca.icu"; -// InputStream i = CollationParsedRuleBuilder.class.getResourceAsStream(invdat); - BufferedInputStream b = new BufferedInputStream(i, 110000); - result = CollatorReader.readInverseUCA(b); - b.close(); - i.close(); - return result; -// } catch (Exception e) { -// throw new RuntimeException(e.getMessage()); -// } + // try { + // String invdat = "/com/ibm/icu/impl/data/invuca.icu"; + // InputStream i = CollationParsedRuleBuilder.class.getResourceAsStream(invdat); + BufferedInputStream b = new BufferedInputStream(i, 110000); + result = CollatorReader.readInverseUCA(b); + b.close(); + i.close(); + return result; + // } catch (Exception e) { + // throw new RuntimeException(e.getMessage()); + // } } - + // protected constructor --------------------------------------------- - + /** - *

Protected constructor.

- * @param inputStream ICU collator file input stream - * @exception IOException throw if data file fails authentication - */ + *

Protected constructor.

+ * @param inputStream ICU collator file input stream + * @exception IOException throw if data file fails authentication + */ private CollatorReader(InputStream inputStream) throws IOException { this(inputStream, true); @@ -114,40 +124,40 @@ final class CollatorReader throw new IOException(WRONG_UNICODE_VERSION_ERROR_); } m_dataInputStream_ = new DataInputStream(inputStream); - */ + */ } - + /** - *

Protected constructor.

- * @param inputStream ICU uprops.icu file input stream - * @param readICUHeader flag to indicate if the ICU header has to be read - * @exception IOException throw if data file fails authentication - */ + *

Protected constructor.

+ * @param inputStream ICU uprops.icu file input stream + * @param readICUHeader flag to indicate if the ICU header has to be read + * @exception IOException throw if data file fails authentication + */ private CollatorReader(InputStream inputStream, boolean readICUHeader) - throws IOException + throws IOException { if (readICUHeader) { byte[] UnicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID_, - UCA_AUTHENTICATE_); + UCA_AUTHENTICATE_); // weiv: check that we have the correct Unicode version in // binary files VersionInfo UCDVersion = UCharacter.getUnicodeVersion(); if(UnicodeVersion[0] != UCDVersion.getMajor() - || UnicodeVersion[1] != UCDVersion.getMinor()) { + || UnicodeVersion[1] != UCDVersion.getMinor()) { throw new IOException(WRONG_UNICODE_VERSION_ERROR_); } } m_dataInputStream_ = new DataInputStream(inputStream); } - + // protected methods ------------------------------------------------- - + /** - * Read and break up the header stream of data passed in as arguments into - * meaningful Collator data. - * @param rbc RuleBasedCollator to populate with header information - * @exception IOException thrown when there's a data error. - */ + * Read and break up the header stream of data passed in as arguments into + * meaningful Collator data. + * @param rbc RuleBasedCollator to populate with header information + * @exception IOException thrown when there's a data error. 
+ */ private void readHeader(RuleBasedCollator rbc) throws IOException { m_size_ = m_dataInputStream_.readInt(); @@ -158,11 +168,11 @@ final class CollatorReader int readcount = 8; // for size and headersize // structure which holds values for indirect positioning and implicit // ranges - int UCAConst = m_dataInputStream_.readInt(); + m_UCAConstOffset_ = m_dataInputStream_.readInt(); readcount += 4; // this one is needed only for UCA, to copy the appropriate // contractions - m_dataInputStream_.skip(4); + int contractionUCACombos = m_dataInputStream_.readInt(); readcount += 4; // reserved for future use m_dataInputStream_.skipBytes(4); @@ -180,7 +190,7 @@ final class CollatorReader int contractionCE = m_dataInputStream_.readInt(); readcount += 4; // needed for various closures int contractionSize - /*int contractionSize = */m_dataInputStream_.readInt(); + int contractionSize = m_dataInputStream_.readInt(); readcount += 4; // array of last collation element in expansion int expansionEndCE = m_dataInputStream_.readInt(); @@ -190,7 +200,7 @@ final class CollatorReader int expansionEndCEMaxSize = m_dataInputStream_.readInt(); readcount += 4; // size of endExpansionCE int expansionEndCESize - m_dataInputStream_.skipBytes(4); + /*int endExpansionCECount =*/ m_dataInputStream_.readInt(); readcount += 4; // hash table of unsafe code points int unsafe = m_dataInputStream_.readInt(); @@ -199,25 +209,35 @@ final class CollatorReader int contractionEnd = m_dataInputStream_.readInt(); readcount += 4; // int CEcount = m_dataInputStream_.readInt(); - m_dataInputStream_.skipBytes(4); + int contractionUCACombosSize = m_dataInputStream_.readInt(); readcount += 4; // is jamoSpecial rbc.m_isJamoSpecial_ = m_dataInputStream_.readBoolean(); readcount++; - // padding - m_dataInputStream_.skipBytes(3); - readcount += 3; + // isBigEndian and charSetFamily + m_dataInputStream_.skipBytes(2); + readcount += 2; + int contractionUCACombosWidth = m_dataInputStream_.readByte(); + readcount += 1; 
rbc.m_version_ = readVersion(m_dataInputStream_); readcount += 4; rbc.m_UCA_version_ = readVersion(m_dataInputStream_); readcount += 4; rbc.m_UCD_version_ = readVersion(m_dataInputStream_); readcount += 4; + VersionInfo formatVersion = readVersion(m_dataInputStream_); + readcount += 4; + rbc.m_scriptToLeadBytes = m_dataInputStream_.readInt(); + readcount += 4; + rbc.m_leadByteToScripts = m_dataInputStream_.readInt(); + readcount += 4; + // byte charsetName[] = new byte[32]; // for charset CEs m_dataInputStream_.skipBytes(32); readcount += 32; - m_dataInputStream_.skipBytes(56); // for future use - readcount += 56; + + m_dataInputStream_.skipBytes(44); // for future use + readcount += 44; if (m_headerSize_ < readcount) { ///CLOVER:OFF throw new IOException("Internal Error: Header size error"); @@ -237,16 +257,20 @@ final class CollatorReader m_expansionEndCESize_ = expansionEndCEMaxSize - expansionEndCE; m_expansionEndCEMaxSizeSize_ = unsafe - expansionEndCEMaxSize; m_unsafeSize_ = contractionEnd - unsafe; - m_UCAValuesSize_ = m_size_ - UCAConst; // UCA value, will be handled - // later + //m_UCAValuesSize_ = m_size_ - UCAConst; // UCA value, will be handled later + m_UCAcontractionSize_ = contractionUCACombosSize * contractionUCACombosWidth * 2; + // treat it as normal collator first // for normal collator there is no UCA contraction - m_contractionEndSize_ = m_size_ - contractionEnd; + // contractions (UChar[contractionSize] + CE[contractionSize]) + int old_contractionSize_ = m_size_ - contractionEnd; + // m_contractionSize_ = contractionSize * 2 + contractionSize * 4; + m_contractionSize_ = contractionSize * 2 + contractionSize * 4; rbc.m_contractionOffset_ >>= 1; // casting to ints rbc.m_expansionOffset_ >>= 2; // casting to chars } - + /** * Read and break up the collation options passed in the stream of data and * update the argument Collator with the results @@ -262,16 +286,19 @@ final class CollatorReader rbc.m_defaultVariableTopValue_ = 
m_dataInputStream_.readInt(); readcount += 4; rbc.m_defaultIsFrenchCollation_ = (m_dataInputStream_.readInt() - == RuleBasedCollator.AttributeValue.ON_); + == RuleBasedCollator.AttributeValue.ON_); readcount += 4; rbc.m_defaultIsAlternateHandlingShifted_ - = (m_dataInputStream_.readInt() == - RuleBasedCollator.AttributeValue.SHIFTED_); + = (m_dataInputStream_.readInt() == + RuleBasedCollator.AttributeValue.SHIFTED_); readcount += 4; rbc.m_defaultCaseFirst_ = m_dataInputStream_.readInt(); readcount += 4; - rbc.m_defaultIsCaseLevel_ = (m_dataInputStream_.readInt() - == RuleBasedCollator.AttributeValue.ON_); + // rbc.m_defaultIsCaseLevel_ = (m_dataInputStream_.readInt() + // == RuleBasedCollator.AttributeValue.ON_); + int defaultIsCaseLevel = m_dataInputStream_.readInt(); + rbc.m_defaultIsCaseLevel_ = (defaultIsCaseLevel + == RuleBasedCollator.AttributeValue.ON_); readcount += 4; int value = m_dataInputStream_.readInt(); readcount += 4; @@ -285,10 +312,10 @@ final class CollatorReader rbc.m_defaultStrength_ = m_dataInputStream_.readInt(); readcount += 4; rbc.m_defaultIsHiragana4_ = (m_dataInputStream_.readInt() - == RuleBasedCollator.AttributeValue.ON_); + == RuleBasedCollator.AttributeValue.ON_); readcount += 4; rbc.m_defaultIsNumericCollation_ = (m_dataInputStream_.readInt() - == RuleBasedCollator.AttributeValue.ON_); + == RuleBasedCollator.AttributeValue.ON_); readcount += 4; m_dataInputStream_.skip(60); // reserved for future use readcount += 60; @@ -299,21 +326,25 @@ final class CollatorReader ///CLOVER:ON } } - + /** - * Read and break up the stream of data passed in as arguments into - * meaningful Collator data. - * @param rbc RuleBasedCollator to populate - * @param UCAConst object to fill up with UCA constants if we are reading - * the UCA collator, if not use a null - * @return UCAContractions array filled up with the UCA contractions if we - * are reading the UCA collator - * @exception IOException thrown when there's a data error. 
- */ + * Read and break up the stream of data passed in as arguments into + * meaningful Collator data. + * @param rbc RuleBasedCollator to populate + * @param UCAConst object to fill up with UCA constants if we are reading + * the UCA collator, if not use a null + * @param leadByteConstants + * @return UCAContractions array filled up with the UCA contractions if we + * are reading the UCA collator + * @exception IOException thrown when there's a data error. + */ private char[] readImp(RuleBasedCollator rbc, - RuleBasedCollator.UCAConstants UCAConst) - throws IOException + RuleBasedCollator.UCAConstants UCAConst, + RuleBasedCollator.LeadByteConstants leadByteConstants) + throws IOException { + char ucaContractions[] = null; // return result + readHeader(rbc); // header size has been checked by readHeader int readcount = m_headerSize_; @@ -328,24 +359,24 @@ final class CollatorReader readcount += (m_expansionSize_ << 2); if (m_contractionIndexSize_ > 0) { m_contractionIndexSize_ >>= 1; - rbc.m_contractionIndex_ = new char[m_contractionIndexSize_]; - for (int i = 0; i < m_contractionIndexSize_; i ++) { - rbc.m_contractionIndex_[i] = m_dataInputStream_.readChar(); - } - readcount += (m_contractionIndexSize_ << 1); - m_contractionCESize_ >>= 2; - rbc.m_contractionCE_ = new int[m_contractionCESize_]; - for (int i = 0; i < m_contractionCESize_; i ++) { - rbc.m_contractionCE_[i] = m_dataInputStream_.readInt(); - } - readcount += (m_contractionCESize_ << 2); + rbc.m_contractionIndex_ = new char[m_contractionIndexSize_]; + for (int i = 0; i < m_contractionIndexSize_; i ++) { + rbc.m_contractionIndex_[i] = m_dataInputStream_.readChar(); + } + readcount += (m_contractionIndexSize_ << 1); + m_contractionCESize_ >>= 2; + rbc.m_contractionCE_ = new int[m_contractionCESize_]; + for (int i = 0; i < m_contractionCESize_; i ++) { + rbc.m_contractionCE_[i] = m_dataInputStream_.readInt(); + } + readcount += (m_contractionCESize_ << 2); } rbc.m_trie_ = new IntTrie(m_dataInputStream_, - 
RuleBasedCollator.DataManipulate.getInstance()); + RuleBasedCollator.DataManipulate.getInstance()); if (!rbc.m_trie_.isLatin1Linear()) { throw new IOException("Data corrupted, " - + "Collator Tries expected to have linear " - + "latin one data arrays"); + + "Collator Tries expected to have linear " + + "latin one data arrays"); } readcount += rbc.m_trie_.getSerializedDataSize(); m_expansionEndCESize_ >>= 2; @@ -368,13 +399,16 @@ final class CollatorReader // we are reading the UCA // unfortunately the UCA offset in any collator data is not 0 and // only refers to the UCA data - m_contractionEndSize_ -= m_UCAValuesSize_; + //m_contractionSize_ -= m_UCAValuesSize_; + m_contractionSize_ = m_UCAConstOffset_ - readcount; + } else { + m_contractionSize_ = m_size_ - readcount; } - rbc.m_contractionEnd_ = new byte[m_contractionEndSize_]; - for (int i = 0; i < m_contractionEndSize_; i ++) { + rbc.m_contractionEnd_ = new byte[m_contractionSize_]; + for (int i = 0; i < m_contractionSize_; i ++) { rbc.m_contractionEnd_[i] = m_dataInputStream_.readByte(); } - readcount += m_contractionEndSize_; + readcount += m_contractionSize_; if (UCAConst != null) { UCAConst.FIRST_TERTIARY_IGNORABLE_[0] = m_dataInputStream_.readInt(); @@ -383,22 +417,22 @@ final class CollatorReader = m_dataInputStream_.readInt(); readUCAConstcount += 4; UCAConst.LAST_TERTIARY_IGNORABLE_[0] - = m_dataInputStream_.readInt(); + = m_dataInputStream_.readInt(); readUCAConstcount += 4; UCAConst.LAST_TERTIARY_IGNORABLE_[1] - = m_dataInputStream_.readInt(); + = m_dataInputStream_.readInt(); readUCAConstcount += 4; UCAConst.FIRST_PRIMARY_IGNORABLE_[0] - = m_dataInputStream_.readInt(); + = m_dataInputStream_.readInt(); readUCAConstcount += 4; UCAConst.FIRST_PRIMARY_IGNORABLE_[1] - = m_dataInputStream_.readInt(); + = m_dataInputStream_.readInt(); readUCAConstcount += 4; UCAConst.FIRST_SECONDARY_IGNORABLE_[0] - = m_dataInputStream_.readInt(); + = m_dataInputStream_.readInt(); readUCAConstcount += 4; 
UCAConst.FIRST_SECONDARY_IGNORABLE_[1] - = m_dataInputStream_.readInt(); + = m_dataInputStream_.readInt(); readUCAConstcount += 4; UCAConst.LAST_SECONDARY_IGNORABLE_[0] = m_dataInputStream_.readInt(); @@ -407,10 +441,10 @@ final class CollatorReader = m_dataInputStream_.readInt(); readUCAConstcount += 4; UCAConst.LAST_PRIMARY_IGNORABLE_[0] - = m_dataInputStream_.readInt(); + = m_dataInputStream_.readInt(); readUCAConstcount += 4; UCAConst.LAST_PRIMARY_IGNORABLE_[1] - = m_dataInputStream_.readInt(); + = m_dataInputStream_.readInt(); readUCAConstcount += 4; UCAConst.FIRST_VARIABLE_[0] = m_dataInputStream_.readInt(); readUCAConstcount += 4; @@ -462,27 +496,39 @@ final class CollatorReader readUCAConstcount += 4; UCAConst.PRIMARY_SPECIAL_MAX_ = m_dataInputStream_.readInt(); readUCAConstcount += 4; - int resultsize = (m_UCAValuesSize_ - readUCAConstcount) >> 1; - char result[] = new char[resultsize]; + + readcount += readUCAConstcount; + + //int resultsize = m_UCAcontractionSize_ / 2; + int resultsize = (rbc.m_scriptToLeadBytes - readcount) / 2; + ucaContractions = new char[resultsize]; for (int i = 0; i < resultsize; i ++) { - result[i] = m_dataInputStream_.readChar(); + ucaContractions[i] = m_dataInputStream_.readChar(); } - readcount += m_UCAValuesSize_; - if (readcount != m_size_) { - ///CLOVER:OFF - throw new IOException("Internal Error: Data file size error"); - ///CLOVER:ON - } - return result; + readcount += m_UCAcontractionSize_; + + // if (readcount != m_size_) { + // ///CLOVER:OFF + // throw new IOException("Internal Error: Data file size error"); + // ///CLOVER:ON + // } } + + if (leadByteConstants != null) + { + readcount += m_dataInputStream_.skip(rbc.m_scriptToLeadBytes - readcount); + leadByteConstants.read(m_dataInputStream_); + readcount += leadByteConstants.getSerializedDataSize(); + } + if (readcount != m_size_) { ///CLOVER:OFF throw new IOException("Internal Error: Data file size error"); ///CLOVER:ON } - return null; + return ucaContractions; } - + 
/** * Reads in the inverse uca data * @param input input stream with the inverse uca data @@ -491,22 +537,22 @@ final class CollatorReader * inverse uca */ private static CollationParsedRuleBuilder.InverseUCA readInverseUCA( - InputStream inputStream) - throws IOException + InputStream inputStream) + throws IOException { - byte[] UnicodeVersion = ICUBinary.readHeader(inputStream, INVERSE_UCA_DATA_FORMAT_ID_, - INVERSE_UCA_AUTHENTICATE_); - + byte[] UnicodeVersion = ICUBinary.readHeader(inputStream, INVERSE_UCA_DATA_FORMAT_ID_, + INVERSE_UCA_AUTHENTICATE_); + // weiv: check that we have the correct Unicode version in // binary files VersionInfo UCDVersion = UCharacter.getUnicodeVersion(); if(UnicodeVersion[0] != UCDVersion.getMajor() - || UnicodeVersion[1] != UCDVersion.getMinor()) { + || UnicodeVersion[1] != UCDVersion.getMinor()) { throw new IOException(WRONG_UNICODE_VERSION_ERROR_); } - + CollationParsedRuleBuilder.InverseUCA result = - new CollationParsedRuleBuilder.InverseUCA(); + new CollationParsedRuleBuilder.InverseUCA(); DataInputStream input = new DataInputStream(inputStream); input.readInt(); // bytesize int tablesize = input.readInt(); // in int size @@ -515,11 +561,11 @@ final class CollatorReader input.readInt(); // conts in bytes result.m_UCA_version_ = readVersion(input); input.skipBytes(8); // skip padding - + int size = tablesize * 3; // one column for each strength result.m_table_ = new int[size]; result.m_continuations_ = new char[contsize]; - + for (int i = 0; i < size; i ++) { result.m_table_[i] = input.readInt(); } @@ -529,7 +575,7 @@ final class CollatorReader input.close(); return result; } - + /** * Reads four bytes from the input and returns a VersionInfo * object. Use it to read different collator versions. 
@@ -539,143 +585,147 @@ final class CollatorReader * @throws IOException thrown when error occurs while reading * version bytes */ - + protected static VersionInfo readVersion(DataInputStream input) - throws IOException { + throws IOException { byte[] version = new byte[4]; version[0] = input.readByte(); version[1] = input.readByte(); version[2] = input.readByte(); version[3] = input.readByte(); - + VersionInfo result = - VersionInfo.getInstance( - (int)version[0], (int)version[1], - (int)version[2], (int)version[3]); - + VersionInfo.getInstance( + (int)version[0], (int)version[1], + (int)version[2], (int)version[3]); + return result; } - + // private inner class ----------------------------------------------- - + // private variables ------------------------------------------------- - + /** * Authenticate uca data format version */ private static final ICUBinary.Authenticate UCA_AUTHENTICATE_ - = new ICUBinary.Authenticate() { - public boolean isDataVersionAcceptable(byte version[]) - { - return version[0] == DATA_FORMAT_VERSION_[0] - && version[1] >= DATA_FORMAT_VERSION_[1]; - // Too harsh - //&& version[1] == DATA_FORMAT_VERSION_[1] - //&& version[2] == DATA_FORMAT_VERSION_[2] - //&& version[3] == DATA_FORMAT_VERSION_[3]; - } - }; - + = new ICUBinary.Authenticate() { + public boolean isDataVersionAcceptable(byte version[]) + { + return version[0] == DATA_FORMAT_VERSION_[0] + && version[1] >= DATA_FORMAT_VERSION_[1]; + // Too harsh + //&& version[1] == DATA_FORMAT_VERSION_[1] + //&& version[2] == DATA_FORMAT_VERSION_[2] + //&& version[3] == DATA_FORMAT_VERSION_[3]; + } + }; + /** * Authenticate uca data format version */ private static final ICUBinary.Authenticate INVERSE_UCA_AUTHENTICATE_ - = new ICUBinary.Authenticate() { - public boolean isDataVersionAcceptable(byte version[]) - { - return version[0] - == INVERSE_UCA_DATA_FORMAT_VERSION_[0] - && version[1] - >= INVERSE_UCA_DATA_FORMAT_VERSION_[1]; - } - }; - - /** - * Data input stream for uca.icu - */ - 
private DataInputStream m_dataInputStream_; - - /** - * File format version and id that this class understands. - * No guarantees are made if a older version is used - */ - private static final byte DATA_FORMAT_VERSION_[] = - {(byte)0x2, (byte)0x2, (byte)0x0, (byte)0x0}; - private static final byte DATA_FORMAT_ID_[] = {(byte)0x55, (byte)0x43, - (byte)0x6f, (byte)0x6c}; - /** - * Inverse UCA file format version and id that this class understands. - * No guarantees are made if a older version is used - */ - private static final byte INVERSE_UCA_DATA_FORMAT_VERSION_[] = - {(byte)0x2, (byte)0x1, (byte)0x0, (byte)0x0}; - private static final byte INVERSE_UCA_DATA_FORMAT_ID_[] = {(byte)0x49, - (byte)0x6e, - (byte)0x76, - (byte)0x43}; - - /** - * Wrong unicode version error string - */ - private static final String WRONG_UNICODE_VERSION_ERROR_ = - "Unicode version in binary image is not compatible with the current Unicode version"; + = new ICUBinary.Authenticate() { + public boolean isDataVersionAcceptable(byte version[]) + { + return version[0] + == INVERSE_UCA_DATA_FORMAT_VERSION_[0] + && version[1] + >= INVERSE_UCA_DATA_FORMAT_VERSION_[1]; + } + }; /** - * Size of expansion table in bytes + * Data input stream for uca.icu */ - private int m_expansionSize_; + private DataInputStream m_dataInputStream_; + /** - * Size of contraction index table in bytes + * File format version and id that this class understands. + * No guarantees are made if a older version is used */ - private int m_contractionIndexSize_; + private static final byte DATA_FORMAT_VERSION_[] = + {(byte)0x3, (byte)0x0, (byte)0x0, (byte)0x0}; + private static final byte DATA_FORMAT_ID_[] = {(byte)0x55, (byte)0x43, + (byte)0x6f, (byte)0x6c}; /** - * Size of contraction table in bytes + * Inverse UCA file format version and id that this class understands. 
+ * No guarantees are made if a older version is used */ - private int m_contractionCESize_; - /* - * Size of the Trie in bytes - */ - //private int m_trieSize_; + private static final byte INVERSE_UCA_DATA_FORMAT_VERSION_[] = + {(byte)0x2, (byte)0x1, (byte)0x0, (byte)0x0}; + private static final byte INVERSE_UCA_DATA_FORMAT_ID_[] = {(byte)0x49, + (byte)0x6e, + (byte)0x76, + (byte)0x43}; + /** - * Size of the table that contains information about collation elements - * that end with an expansion + * Wrong unicode version error string */ - private int m_expansionEndCESize_; - /** - * Size of the table that contains information about the maximum size of - * collation elements that end with a particular expansion CE corresponding - * to the ones in expansionEndCE - */ - private int m_expansionEndCEMaxSizeSize_; - /** - * Size of the option table that contains information about the collation - * options - */ - private int m_optionSize_; - /** - * Size of the whole data file minusing the ICU header - */ - private int m_size_; - /** - * Size of the collation data header - */ - private int m_headerSize_; - /** - * Size of the table that contains information about the "Unsafe" - * codepoints - */ - private int m_unsafeSize_; - /** - * Size of the table that contains information about codepoints that ends - * with a contraction - */ - private int m_contractionEndSize_; - /** - * Size of the table that contains UCA contraction information - */ - private int m_UCAValuesSize_; - - // private methods --------------------------------------------------- - + private static final String WRONG_UNICODE_VERSION_ERROR_ = + "Unicode version in binary image is not compatible with the current Unicode version"; + + /** + * Size of expansion table in bytes + */ + private int m_expansionSize_; + /** + * Size of contraction index table in bytes + */ + private int m_contractionIndexSize_; + /** + * Size of contraction table in bytes + */ + private int m_contractionCESize_; + /* + * Size of the 
Trie in bytes + */ + //private int m_trieSize_; + /** + * Size of the table that contains information about collation elements + * that end with an expansion + */ + private int m_expansionEndCESize_; + /** + * Size of the table that contains information about the maximum size of + * collation elements that end with a particular expansion CE corresponding + * to the ones in expansionEndCE + */ + private int m_expansionEndCEMaxSizeSize_; + /** + * Size of the option table that contains information about the collation + * options + */ + private int m_optionSize_; + /** + * Size of the whole data file minusing the ICU header + */ + private int m_size_; + /** + * Size of the collation data header + */ + private int m_headerSize_; + /** + * Size of the table that contains information about the "Unsafe" + * codepoints + */ + private int m_unsafeSize_; + /** + * Size in bytes of the table that contains information about codepoints that ends + * with a contraction + */ + private int m_contractionSize_; + /** + * Size of the table that contains UCA contraction information in bytes + */ + private int m_UCAcontractionSize_; + /** + * Offset of the UCA Const + */ + private int m_UCAConstOffset_; + + // private methods --------------------------------------------------- + } diff --git a/icu4j/main/classes/collate/src/com/ibm/icu/text/RuleBasedCollator.java b/icu4j/main/classes/collate/src/com/ibm/icu/text/RuleBasedCollator.java index 593e130a53..78b5a80b42 100644 --- a/icu4j/main/classes/collate/src/com/ibm/icu/text/RuleBasedCollator.java +++ b/icu4j/main/classes/collate/src/com/ibm/icu/text/RuleBasedCollator.java @@ -1,17 +1,22 @@ /** -******************************************************************************* -* Copyright (C) 1996-2010, International Business Machines Corporation and * -* others. All Rights Reserved. 
* -******************************************************************************* -*/ + ******************************************************************************* + * Copyright (C) 1996-2010, International Business Machines Corporation and * + * others. All Rights Reserved. * + ******************************************************************************* + */ package com.ibm.icu.text; +import java.io.DataInputStream; import java.io.IOException; import java.nio.ByteBuffer; import java.text.CharacterIterator; import java.text.ParseException; import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; import java.util.MissingResourceException; +import java.util.Set; import com.ibm.icu.impl.BOCU; import com.ibm.icu.impl.ICUDebug; @@ -23,194 +28,175 @@ import com.ibm.icu.impl.Trie; import com.ibm.icu.impl.TrieIterator; import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UScript; import com.ibm.icu.util.RangeValueIterator; import com.ibm.icu.util.ULocale; import com.ibm.icu.util.UResourceBundle; import com.ibm.icu.util.VersionInfo; /** - *

RuleBasedCollator is a concrete subclass of Collator. It allows - * customization of the Collator via user-specified rule sets. - * RuleBasedCollator is designed to be fully compliant to the Unicode - * Collation Algorithm (UCA) and conforms to ISO 14651.

- * - *

Users are strongly encouraged to read - * the users guide for more information about the collation - * service before using this class.

- * - *

Create a RuleBasedCollator from a locale by calling the - * getInstance(Locale) factory method in the base class Collator. - * Collator.getInstance(Locale) creates a RuleBasedCollator object - * based on the collation rules defined by the argument locale. If a - * customized collation ordering ar attributes is required, use the - * RuleBasedCollator(String) constructor with the appropriate - * rules. The customized RuleBasedCollator will base its ordering on - * UCA, while re-adjusting the attributes and orders of the characters - * in the specified rule accordingly.

- * - *

RuleBasedCollator provides correct collation orders for most - * locales supported in ICU. If specific data for a locale is not - * available, the orders eventually falls back to the UCA collation - * order .

- * - *

For information about the collation rule syntax and details - * about customization, please refer to the - * - * Collation customization section of the user's guide.

- * - *

Note that there are some differences between - * the Collation rule syntax used in Java and ICU4J: - * - *

    - *
  • According to the JDK documentation: - * *

    - * Modifier '!' : Turns on Thai/Lao vowel-consonant swapping. If this rule - * is in force when a Thai vowel of the range \U0E40-\U0E44 precedes a - * Thai consonant of the range \U0E01-\U0E2E OR a Lao vowel of the - * range \U0EC0-\U0EC4 precedes a Lao consonant of the range - * \U0E81-\U0EAE then the - * vowel is placed after the consonant for collation purposes. + * RuleBasedCollator is a concrete subclass of Collator. It allows customization of the Collator via user-specified rule + * sets. RuleBasedCollator is designed to be fully compliant to the Unicode Collation Algorithm (UCA) and conforms to ISO 14651. + *

    + * + *

    + * Users are strongly encouraged to read the users + * guide for more information about the collation service before using this class. + *

    + * + *

    + * Create a RuleBasedCollator from a locale by calling the getInstance(Locale) factory method in the base class + * Collator. Collator.getInstance(Locale) creates a RuleBasedCollator object based on the collation rules defined by the + * argument locale. If a customized collation ordering or attributes is required, use the RuleBasedCollator(String) + * constructor with the appropriate rules. The customized RuleBasedCollator will base its ordering on UCA, while + * re-adjusting the attributes and orders of the characters in the specified rule accordingly. + *

    + * + *

    + * RuleBasedCollator provides correct collation orders for most locales supported in ICU. If specific data for a locale + * is not available, the order eventually falls back to the UCA + * collation order. + *

    + * + *

    + * For information about the collation rule syntax and details about customization, please refer to the Collation customization section of the + * user's guide. + *

    + * + *

    + * Note that there are some differences between the Collation rule syntax used in Java and ICU4J: + * + *

      + *
    • According to the JDK documentation: + *

      + * Modifier '!' : Turns on Thai/Lao vowel-consonant swapping. If this rule is in force when a Thai vowel of the range + * \U0E40-\U0E44 precedes a Thai consonant of the range \U0E01-\U0E2E OR a Lao vowel of the range + * \U0EC0-\U0EC4 precedes a Lao consonant of the range \U0E81-\U0EAE then the vowel is placed after the + * consonant for collation purposes. *

      *

      - * If a rule is without the modifier '!', the Thai/Lao vowel-consonant - * swapping is not turned on. + * If a rule is without the modifier '!', the Thai/Lao vowel-consonant swapping is not turned on. *

      *
      *

      - * ICU4J's RuleBasedCollator does not support turning off the Thai/Lao - * vowel-consonant swapping, since the UCA clearly states that it has to be - * supported to ensure a correct sorting order. If a '!' is encountered, it is - * ignored. + * ICU4J's RuleBasedCollator does not support turning off the Thai/Lao vowel-consonant swapping, since the UCA clearly + * states that it has to be supported to ensure a correct sorting order. If a '!' is encountered, it is ignored. *

      - *
    • As mentioned in the documentation of the base class Collator, - * compatibility decomposition mode is not supported. + *
    • As mentioned in the documentation of the base class Collator, compatibility decomposition mode is not supported. *
    *

    * Examples *

    *

    - * Creating Customized RuleBasedCollators: - *

    + * Creating Customized RuleBasedCollators:
    + * *
    - * String simple = "& a < b < c < d";
    + * String simple = "& a < b < c < d";
      * RuleBasedCollator simpleCollator = new RuleBasedCollator(simple);
    - *
    - * String norwegian = "& a , A < b , B < c , C < d , D < e , E "
    - *                    + "< f , F < g , G < h , H < i , I < j , "
    - *                    + "J < k , K < l , L < m , M < n , N < "
    - *                    + "o , O < p , P < q , Q < r , R < s , S < "
    - *                    + "t , T < u , U < v , V < w , W < x , X "
    - *                    + "< y , Y < z , Z < \u00E5 = a\u030A "
    - *                    + ", \u00C5 = A\u030A ; aa , AA < \u00E6 "
    - *                    + ", \u00C6 < \u00F8 , \u00D8";
    + * 
    + * String norwegian = "& a , A < b , B < c , C < d , D < e , E " + "< f , F < g , G < h , H < i , I < j , "
    + *         + "J < k , K < l , L < m , M < n , N < " + "o , O < p , P < q , Q < r , R < s , S < "
    + *         + "t , T < u , U < v , V < w , W < x , X " + "< y , Y < z , Z < \u00E5 = a\u030A "
    + *         + ", \u00C5 = A\u030A ; aa , AA < \u00E6 " + ", \u00C6 < \u00F8 , \u00D8";
      * RuleBasedCollator norwegianCollator = new RuleBasedCollator(norwegian);
      * 
    + * *
    - * - * Concatenating rules to combine Collators: - *
    + * + * Concatenating rules to combine Collators:
    + * *
      * // Create an en_US Collator object
    - * RuleBasedCollator en_USCollator = (RuleBasedCollator)
    - *     Collator.getInstance(new Locale("en", "US", ""));
    + * RuleBasedCollator en_USCollator = (RuleBasedCollator) Collator.getInstance(new Locale("en", "US", ""));
      * // Create a da_DK Collator object
    - * RuleBasedCollator da_DKCollator = (RuleBasedCollator)
    - *     Collator.getInstance(new Locale("da", "DK", ""));
    + * RuleBasedCollator da_DKCollator = (RuleBasedCollator) Collator.getInstance(new Locale("da", "DK", ""));
      * // Combine the two
      * // First, get the collation rules from en_USCollator
      * String en_USRules = en_USCollator.getRules();
      * // Second, get the collation rules from da_DKCollator
      * String da_DKRules = da_DKCollator.getRules();
    - * RuleBasedCollator newCollator =
    - *                             new RuleBasedCollator(en_USRules + da_DKRules);
    + * RuleBasedCollator newCollator = new RuleBasedCollator(en_USRules + da_DKRules);
      * // newCollator has the combined rules
      * 
    + * *
    - * - * Making changes to an existing RuleBasedCollator to create a new - * Collator object, by appending changes to the existing rule: - *
    + * + * Making changes to an existing RuleBasedCollator to create a new Collator object, by appending changes to + * the existing rule:
    + * *
      * // Create a new Collator object with additional rules
    - * String addRules = "& C < ch, cH, Ch, CH";
    - * RuleBasedCollator myCollator =
    - *     new RuleBasedCollator(en_USCollator.getRules() + addRules);
    + * String addRules = "& C < ch, cH, Ch, CH";
    + * RuleBasedCollator myCollator = new RuleBasedCollator(en_USCollator.getRules() + addRules);
      * // myCollator contains the new rules
      * 
    + * *
    - * - * How to change the order of non-spacing accents: - *
    + * + * How to change the order of non-spacing accents:
    + * *
      * // old rule with main accents
    - * String oldRules = "= \u0301 ; \u0300 ; \u0302 ; \u0308 "
    - *                 + "; \u0327 ; \u0303 ; \u0304 ; \u0305 "
    - *                 + "; \u0306 ; \u0307 ; \u0309 ; \u030A "
    - *                 + "; \u030B ; \u030C ; \u030D ; \u030E "
    - *                 + "; \u030F ; \u0310 ; \u0311 ; \u0312 "
    - *                 + "< a , A ; ae, AE ; \u00e6 , \u00c6 "
    - *                 + "< b , B < c, C < e, E & C < d , D";
    + * String oldRules = "= \u0301 ; \u0300 ; \u0302 ; \u0308 " + "; \u0327 ; \u0303 ; \u0304 ; \u0305 "
    + *         + "; \u0306 ; \u0307 ; \u0309 ; \u030A " + "; \u030B ; \u030C ; \u030D ; \u030E "
    + *         + "; \u030F ; \u0310 ; \u0311 ; \u0312 " + "< a , A ; ae, AE ; \u00e6 , \u00c6 "
    + *         + "< b , B < c, C < e, E & C < d , D";
      * // change the order of accent characters
    - * String addOn = "& \u0300 ; \u0308 ; \u0302";
    + * String addOn = "& \u0300 ; \u0308 ; \u0302";
      * RuleBasedCollator myCollator = new RuleBasedCollator(oldRules + addOn);
      * 
    + * *
    - * - * Putting in a new primary ordering before the default setting, - * e.g. sort English characters before or after Japanese characters in the Japanese - * Collator: - *
    + * + * Putting in a new primary ordering before the default setting, e.g. sort English characters before or after Japanese + * characters in the Japanese Collator:
    + * *
      * // get en_US Collator rules
    - * RuleBasedCollator en_USCollator
    - *                        = (RuleBasedCollator)Collator.getInstance(Locale.US);
    + * RuleBasedCollator en_USCollator = (RuleBasedCollator) Collator.getInstance(Locale.US);
      * // add a few Japanese characters to sort before English characters
      * // suppose the last character before the first base letter 'a' in
    - * // the English collation rule is \u2212
    - * String jaString = "& \u2212 < \u3041, \u3042 < \u3043, "
    - *                   + "\u3044";
    - * RuleBasedCollator myJapaneseCollator
    - *              = new RuleBasedCollator(en_USCollator.getRules() + jaString);
    + * // the English collation rule is \u2212
    + * String jaString = "& \u2212 < \u3041, \u3042 < \u3043, " + "\u3044";
    + * RuleBasedCollator myJapaneseCollator = new RuleBasedCollator(en_USCollator.getRules() + jaString);
      * 
    + * *
    *

    *

    * This class is not subclassable *

    + * * @author Syn Wee Quek * @stable ICU 2.8 */ -public final class RuleBasedCollator extends Collator -{ +public final class RuleBasedCollator extends Collator { // public constructors --------------------------------------------------- /** *

    - * Constructor that takes the argument rules for - * customization. The collator will be based on UCA, - * with the attributes and re-ordering of the characters specified in the - * argument rules. + * Constructor that takes the argument rules for customization. The collator will be based on UCA, with the + * attributes and re-ordering of the characters specified in the argument rules. *

    - *

    See the user guide's section on - * + *

    + * See the user guide's section on * Collation Customization for details on the rule syntax. *

    - * @param rules the collation rules to build the collation table from. - * @exception ParseException and IOException thrown. ParseException thrown - * when argument rules have an invalid syntax. IOException - * thrown when an error occured while reading internal data. + * + * @param rules + * the collation rules to build the collation table from. + * @exception ParseException + * and IOException thrown. ParseException thrown when argument rules have an invalid syntax. + * IOException thrown when an error occured while reading internal data. * @stable ICU 2.8 */ - public RuleBasedCollator(String rules) throws Exception - { + public RuleBasedCollator(String rules) throws Exception { checkUCA(); if (rules == null) { - throw new IllegalArgumentException( - "Collation rules can not be null"); + throw new IllegalArgumentException("Collation rules can not be null"); } init(rules); } @@ -219,12 +205,12 @@ public final class RuleBasedCollator extends Collator /** * Clones the RuleBasedCollator + * * @return a new instance of this RuleBasedCollator object * @stable ICU 2.8 */ - public Object clone() throws CloneNotSupportedException - { - RuleBasedCollator result = (RuleBasedCollator)super.clone(); + public Object clone() throws CloneNotSupportedException { + RuleBasedCollator result = (RuleBasedCollator) super.clone(); if (latinOneCEs_ != null) { result.m_reallocLatinOneCEs_ = true; result.m_ContInfo_ = new ContractionInfo(); @@ -232,107 +218,97 @@ public final class RuleBasedCollator extends Collator // since all collation data in the RuleBasedCollator do not change // we can safely assign the result.fields to this collator - result.initUtility(false); // let the new clone have their own util - // iterators + result.initUtility(false); // let the new clone have their own util + // iterators return result; } /** * Return a CollationElementIterator for the given String. 
+ * * @see CollationElementIterator * @stable ICU 2.8 */ - public CollationElementIterator getCollationElementIterator(String source) - { + public CollationElementIterator getCollationElementIterator(String source) { return new CollationElementIterator(source, this); } /** - * Return a CollationElementIterator for the given CharacterIterator. - * The source iterator's integrity will be preserved since a new copy - * will be created for use. + * Return a CollationElementIterator for the given CharacterIterator. The source iterator's integrity will be + * preserved since a new copy will be created for use. + * * @see CollationElementIterator * @stable ICU 2.8 */ - public CollationElementIterator getCollationElementIterator( - CharacterIterator source) - { - CharacterIterator newsource = (CharacterIterator)source.clone(); + public CollationElementIterator getCollationElementIterator(CharacterIterator source) { + CharacterIterator newsource = (CharacterIterator) source.clone(); return new CollationElementIterator(newsource, this); } - + /** - * Return a CollationElementIterator for the given UCharacterIterator. - * The source iterator's integrity will be preserved since a new copy - * will be created for use. + * Return a CollationElementIterator for the given UCharacterIterator. The source iterator's integrity will be + * preserved since a new copy will be created for use. + * * @see CollationElementIterator * @stable ICU 2.8 */ - public CollationElementIterator getCollationElementIterator( - UCharacterIterator source) - { + public CollationElementIterator getCollationElementIterator(UCharacterIterator source) { return new CollationElementIterator(source, this); } // public setters -------------------------------------------------------- /** - * Sets the Hiragana Quaternary mode to be on or off. - * When the Hiragana Quaternary mode is turned on, the collator - * positions Hiragana characters before all non-ignorable characters in - * QUATERNARY strength. 
This is to produce a correct JIS collation order, - * distinguishing between Katakana and Hiragana characters. - * @param flag true if Hiragana Quaternary mode is to be on, false - * otherwise + * Sets the Hiragana Quaternary mode to be on or off. When the Hiragana Quaternary mode is turned on, the collator + * positions Hiragana characters before all non-ignorable characters in QUATERNARY strength. This is to produce a + * correct JIS collation order, distinguishing between Katakana and Hiragana characters. + * + * @param flag + * true if Hiragana Quaternary mode is to be on, false otherwise * @see #setHiraganaQuaternaryDefault * @see #isHiraganaQuaternary * @stable ICU 2.8 */ - public void setHiraganaQuaternary(boolean flag) - { + public void setHiraganaQuaternary(boolean flag) { m_isHiragana4_ = flag; - updateInternalState(); + updateInternalState(); } /** - * Sets the Hiragana Quaternary mode to the initial mode set during - * construction of the RuleBasedCollator. - * See setHiraganaQuaternary(boolean) for more details. + * Sets the Hiragana Quaternary mode to the initial mode set during construction of the RuleBasedCollator. See + * setHiraganaQuaternary(boolean) for more details. + * * @see #setHiraganaQuaternary(boolean) * @see #isHiraganaQuaternary * @stable ICU 2.8 */ - public void setHiraganaQuaternaryDefault() - { + public void setHiraganaQuaternaryDefault() { m_isHiragana4_ = m_defaultIsHiragana4_; updateInternalState(); } /** - * Sets whether uppercase characters sort before lowercase - * characters or vice versa, in strength TERTIARY. The default - * mode is false, and so lowercase characters sort before uppercase - * characters. - * If true, sort upper case characters first. - * @param upperfirst true to sort uppercase characters before - * lowercase characters, false to sort lowercase - * characters before uppercase characters + * Sets whether uppercase characters sort before lowercase characters or vice versa, in strength TERTIARY. 
The + * default mode is false, and so lowercase characters sort before uppercase characters. If true, sort upper case + * characters first. + * + * @param upperfirst + * true to sort uppercase characters before lowercase characters, false to sort lowercase characters + * before uppercase characters * @see #isLowerCaseFirst * @see #isUpperCaseFirst * @see #setLowerCaseFirst * @see #setCaseFirstDefault * @stable ICU 2.8 */ - public void setUpperCaseFirst(boolean upperfirst) - { + public void setUpperCaseFirst(boolean upperfirst) { if (upperfirst) { - if(m_caseFirst_ != AttributeValue.UPPER_FIRST_) { + if (m_caseFirst_ != AttributeValue.UPPER_FIRST_) { latinOneRegenTable_ = true; } m_caseFirst_ = AttributeValue.UPPER_FIRST_; - } - else { - if(m_caseFirst_ != AttributeValue.OFF_) { + } else { + if (m_caseFirst_ != AttributeValue.OFF_) { latinOneRegenTable_ = true; } m_caseFirst_ = AttributeValue.OFF_; @@ -341,53 +317,46 @@ public final class RuleBasedCollator extends Collator } /** - * Sets the orders of lower cased characters to sort before upper cased - * characters, in strength TERTIARY. The default - * mode is false. - * If true is set, the RuleBasedCollator will sort lower cased characters - * before the upper cased ones. - * Otherwise, if false is set, the RuleBasedCollator will ignore case - * preferences. - * @param lowerfirst true for sorting lower cased characters before - * upper cased characters, false to ignore case - * preferences. + * Sets the orders of lower cased characters to sort before upper cased characters, in strength TERTIARY. The + * default mode is false. If true is set, the RuleBasedCollator will sort lower cased characters before the upper + * cased ones. Otherwise, if false is set, the RuleBasedCollator will ignore case preferences. + * + * @param lowerfirst + * true for sorting lower cased characters before upper cased characters, false to ignore case + * preferences. 
* @see #isLowerCaseFirst * @see #isUpperCaseFirst * @see #setUpperCaseFirst * @see #setCaseFirstDefault * @stable ICU 2.8 */ - public void setLowerCaseFirst(boolean lowerfirst) - { + public void setLowerCaseFirst(boolean lowerfirst) { if (lowerfirst) { - if(m_caseFirst_ != AttributeValue.LOWER_FIRST_) { - latinOneRegenTable_ = true; - } - m_caseFirst_ = AttributeValue.LOWER_FIRST_; - } - else { - if(m_caseFirst_ != AttributeValue.OFF_) { - latinOneRegenTable_ = true; - } - m_caseFirst_ = AttributeValue.OFF_; + if (m_caseFirst_ != AttributeValue.LOWER_FIRST_) { + latinOneRegenTable_ = true; } + m_caseFirst_ = AttributeValue.LOWER_FIRST_; + } else { + if (m_caseFirst_ != AttributeValue.OFF_) { + latinOneRegenTable_ = true; + } + m_caseFirst_ = AttributeValue.OFF_; + } updateInternalState(); } /** - * Sets the case first mode to the initial mode set during - * construction of the RuleBasedCollator. - * See setUpperCaseFirst(boolean) and setLowerCaseFirst(boolean) for more - * details. + * Sets the case first mode to the initial mode set during construction of the RuleBasedCollator. See + * setUpperCaseFirst(boolean) and setLowerCaseFirst(boolean) for more details. + * * @see #isLowerCaseFirst * @see #isUpperCaseFirst * @see #setLowerCaseFirst(boolean) * @see #setUpperCaseFirst(boolean) * @stable ICU 2.8 */ - public final void setCaseFirstDefault() - { - if(m_caseFirst_ != m_defaultCaseFirst_) { + public final void setCaseFirstDefault() { + if (m_caseFirst_ != m_defaultCaseFirst_) { latinOneRegenTable_ = true; } m_caseFirst_ = m_defaultCaseFirst_; @@ -395,58 +364,54 @@ public final class RuleBasedCollator extends Collator } /** - * Sets the alternate handling mode to the initial mode set during - * construction of the RuleBasedCollator. - * See setAlternateHandling(boolean) for more details. + * Sets the alternate handling mode to the initial mode set during construction of the RuleBasedCollator. See + * setAlternateHandling(boolean) for more details. 
+ * * @see #setAlternateHandlingShifted(boolean) * @see #isAlternateHandlingShifted() * @stable ICU 2.8 */ - public void setAlternateHandlingDefault() - { + public void setAlternateHandlingDefault() { m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_; updateInternalState(); } /** - * Sets the case level mode to the initial mode set during - * construction of the RuleBasedCollator. - * See setCaseLevel(boolean) for more details. + * Sets the case level mode to the initial mode set during construction of the RuleBasedCollator. See + * setCaseLevel(boolean) for more details. + * * @see #setCaseLevel(boolean) * @see #isCaseLevel * @stable ICU 2.8 */ - public void setCaseLevelDefault() - { + public void setCaseLevelDefault() { m_isCaseLevel_ = m_defaultIsCaseLevel_; updateInternalState(); } /** - * Sets the decomposition mode to the initial mode set during construction - * of the RuleBasedCollator. - * See setDecomposition(int) for more details. + * Sets the decomposition mode to the initial mode set during construction of the RuleBasedCollator. See + * setDecomposition(int) for more details. + * * @see #getDecomposition * @see #setDecomposition(int) * @stable ICU 2.8 */ - public void setDecompositionDefault() - { + public void setDecompositionDefault() { setDecomposition(m_defaultDecomposition_); - updateInternalState(); + updateInternalState(); } /** - * Sets the French collation mode to the initial mode set during - * construction of the RuleBasedCollator. - * See setFrenchCollation(boolean) for more details. + * Sets the French collation mode to the initial mode set during construction of the RuleBasedCollator. See + * setFrenchCollation(boolean) for more details. 
+ * * @see #isFrenchCollation * @see #setFrenchCollation(boolean) * @stable ICU 2.8 */ - public void setFrenchCollationDefault() - { - if(m_isFrenchCollation_ != m_defaultIsFrenchCollation_) { + public void setFrenchCollationDefault() { + if (m_isFrenchCollation_ != m_defaultIsFrenchCollation_) { latinOneRegenTable_ = true; } m_isFrenchCollation_ = m_defaultIsFrenchCollation_; @@ -454,51 +419,57 @@ public final class RuleBasedCollator extends Collator } /** - * Sets the collation strength to the initial mode set during the - * construction of the RuleBasedCollator. - * See setStrength(int) for more details. + * Sets the collation strength to the initial mode set during the construction of the RuleBasedCollator. See + * setStrength(int) for more details. + * * @see #setStrength(int) * @see #getStrength * @stable ICU 2.8 */ - public void setStrengthDefault() - { + public void setStrengthDefault() { setStrength(m_defaultStrength_); - updateInternalState(); + updateInternalState(); } - + /** - * Method to set numeric collation to its default value. - * When numeric collation is turned on, this Collator generates a collation - * key for the numeric value of substrings of digits. This is a way to get - * '100' to sort AFTER '2' + * Method to set numeric collation to its default value. When numeric collation is turned on, this Collator + * generates a collation key for the numeric value of substrings of digits. This is a way to get '100' to sort AFTER + * '2' + * * @see #getNumericCollation * @see #setNumericCollation * @stable ICU 2.8 */ - public void setNumericCollationDefault() - { + public void setNumericCollationDefault() { setNumericCollation(m_defaultIsNumericCollation_); - updateInternalState(); + updateInternalState(); } /** - * Sets the mode for the direction of SECONDARY weights to be used in - * French collation. - * The default value is false, which treats SECONDARY weights in the order - * they appear. 
- * If set to true, the SECONDARY weights will be sorted backwards. - * See the section on - * + * Method to set the script order to its default value. + * + * @see #getScriptOrder + * @see #setScriptOrder + * @stable + */ + public void setScriptOrderDefault() { + setScriptOrder(m_defaultScriptOrder_); + } + + /** + * Sets the mode for the direction of SECONDARY weights to be used in French collation. The default value is false, + * which treats SECONDARY weights in the order they appear. If set to true, the SECONDARY weights will be sorted + * backwards. See the section on * French collation for more information. - * @param flag true to set the French collation on, false to set it off + * + * @param flag + * true to set the French collation on, false to set it off * @stable ICU 2.8 * @see #isFrenchCollation * @see #setFrenchCollationDefault */ - public void setFrenchCollation(boolean flag) - { - if(m_isFrenchCollation_ != flag) { + public void setFrenchCollation(boolean flag) { + if (m_isFrenchCollation_ != flag) { latinOneRegenTable_ = true; } m_isFrenchCollation_ = flag; @@ -506,68 +477,61 @@ public final class RuleBasedCollator extends Collator } /** - * Sets the alternate handling for QUATERNARY strength to be either - * shifted or non-ignorable. - * See the UCA definition on - * - * Alternate Weighting. - * This attribute will only be effective when QUATERNARY strength is set. - * The default value for this mode is false, corresponding to the - * NON_IGNORABLE mode in UCA. In the NON-IGNORABLE mode, the - * RuleBasedCollator will treats all the codepoints with non-ignorable - * primary weights in the same way. - * If the mode is set to true, the behaviour corresponds to SHIFTED defined - * in UCA, this causes codepoints with PRIMARY orders that are equal or - * below the variable top value to be ignored in PRIMARY order and - * moved to the QUATERNARY order. 
- * @param shifted true if SHIFTED behaviour for alternate handling is - * desired, false for the NON_IGNORABLE behaviour. + * Sets the alternate handling for QUATERNARY strength to be either shifted or non-ignorable. See the UCA definition + * on Alternate Weighting. This + * attribute will only be effective when QUATERNARY strength is set. The default value for this mode is false, + * corresponding to the NON_IGNORABLE mode in UCA. In the NON-IGNORABLE mode, the RuleBasedCollator will treats all + * the codepoints with non-ignorable primary weights in the same way. If the mode is set to true, the behaviour + * corresponds to SHIFTED defined in UCA, this causes codepoints with PRIMARY orders that are equal or below the + * variable top value to be ignored in PRIMARY order and moved to the QUATERNARY order. + * + * @param shifted + * true if SHIFTED behaviour for alternate handling is desired, false for the NON_IGNORABLE behaviour. * @see #isAlternateHandlingShifted * @see #setAlternateHandlingDefault * @stable ICU 2.8 */ - public void setAlternateHandlingShifted(boolean shifted) - { + public void setAlternateHandlingShifted(boolean shifted) { m_isAlternateHandlingShifted_ = shifted; updateInternalState(); } /** *

    - * When case level is set to true, an additional weight is formed - * between the SECONDARY and TERTIARY weight, known as the case level. - * The case level is used to distinguish large and small Japanese Kana - * characters. Case level could also be used in other situations. - * For example to distinguish certain Pinyin characters. - * The default value is false, which means the case level is not generated. - * The contents of the case level are affected by the case first - * mode. A simple way to ignore accent differences in a string is to set - * the strength to PRIMARY and enable case level. + * When case level is set to true, an additional weight is formed between the SECONDARY and TERTIARY weight, known + * as the case level. The case level is used to distinguish large and small Japanese Kana characters. Case level + * could also be used in other situations. For example to distinguish certain Pinyin characters. The default value + * is false, which means the case level is not generated. The contents of the case level are affected by the case + * first mode. A simple way to ignore accent differences in a string is to set the strength to PRIMARY and enable + * case level. *

    *

    - * See the section on - * - * case level for more information. + * See the section on case + * level for more information. *

    - * @param flag true if case level sorting is required, false otherwise + * + * @param flag + * true if case level sorting is required, false otherwise * @stable ICU 2.8 * @see #setCaseLevelDefault * @see #isCaseLevel */ - public void setCaseLevel(boolean flag) - { + public void setCaseLevel(boolean flag) { m_isCaseLevel_ = flag; updateInternalState(); } /** *

    - * Sets this Collator's strength property. The strength property - * determines the minimum level of difference considered significant - * during comparison. + * Sets this Collator's strength property. The strength property determines the minimum level of difference + * considered significant during comparison. *

    - *

    See the Collator class description for an example of use.

    - * @param newStrength the new strength value. + *

    + * See the Collator class description for an example of use. + *

    + * + * @param newStrength + * the new strength value. * @see #getStrength * @see #setStrengthDefault * @see #PRIMARY @@ -575,48 +539,42 @@ public final class RuleBasedCollator extends Collator * @see #TERTIARY * @see #QUATERNARY * @see #IDENTICAL - * @exception IllegalArgumentException If the new strength value is not one - * of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL. + * @exception IllegalArgumentException + * If the new strength value is not one of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL. * @stable ICU 2.8 */ - public void setStrength(int newStrength) - { + public void setStrength(int newStrength) { super.setStrength(newStrength); updateInternalState(); } - - /** + + /** *

    - * Variable top is a two byte primary value which causes all the codepoints - * with primary values that are less or equal than the variable top to be - * shifted when alternate handling is set to SHIFTED. + * Variable top is a two byte primary value which causes all the codepoints with primary values that are less or + * equal than the variable top to be shifted when alternate handling is set to SHIFTED. *

    *

    * Sets the variable top to a collation element value of a string supplied. - *

    - * @param varTop one or more (if contraction) characters to which the - * variable top should be set - * @return a int value containing the value of the variable top in upper 16 - * bits. Lower 16 bits are undefined. - * @exception IllegalArgumentException is thrown if varTop argument is not - * a valid variable top element. A variable top element is - * invalid when - *
      - *
    • it is a contraction that does not exist in the - * Collation order - *
    • when the PRIMARY strength collation element for the - * variable top has more than two bytes - *
    • when the varTop argument is null or zero in length. - *
    + *

    + * + * @param varTop + * one or more (if contraction) characters to which the variable top should be set + * @return a int value containing the value of the variable top in upper 16 bits. Lower 16 bits are undefined. + * @exception IllegalArgumentException + * is thrown if varTop argument is not a valid variable top element. A variable top element is + * invalid when + *
      + *
    • it is a contraction that does not exist in the Collation order + *
    • when the PRIMARY strength collation element for the variable top has more than two bytes + *
    • when the varTop argument is null or zero in length. + *
    * @see #getVariableTop * @see RuleBasedCollator#setAlternateHandlingShifted * @stable ICU 2.6 */ - public int setVariableTop(String varTop) - { + public int setVariableTop(String varTop) { if (varTop == null || varTop.length() == 0) { - throw new IllegalArgumentException( - "Variable top argument string can not be null or zero in length."); + throw new IllegalArgumentException("Variable top argument string can not be null or zero in length."); } if (m_srcUtilIter_ == null) { initUtility(true); @@ -624,112 +582,124 @@ public final class RuleBasedCollator extends Collator m_srcUtilColEIter_.setText(varTop); int ce = m_srcUtilColEIter_.next(); - - // here we check if we have consumed all characters + + // here we check if we have consumed all characters // you can put in either one character or a contraction - // you shouldn't put more... - if (m_srcUtilColEIter_.getOffset() != varTop.length() - || ce == CollationElementIterator.NULLORDER) { - throw new IllegalArgumentException( - "Variable top argument string is a contraction that does not exist " - + "in the Collation order"); + // you shouldn't put more... 
+ if (m_srcUtilColEIter_.getOffset() != varTop.length() || ce == CollationElementIterator.NULLORDER) { + throw new IllegalArgumentException("Variable top argument string is a contraction that does not exist " + + "in the Collation order"); } - + int nextCE = m_srcUtilColEIter_.next(); - - if ((nextCE != CollationElementIterator.NULLORDER) - && (!isContinuation(nextCE) || (nextCE & CE_PRIMARY_MASK_) != 0)) { - throw new IllegalArgumentException( - "Variable top argument string can only have a single collation " - + "element that has less than or equal to two PRIMARY strength " - + "bytes"); + + if ((nextCE != CollationElementIterator.NULLORDER) + && (!isContinuation(nextCE) || (nextCE & CE_PRIMARY_MASK_) != 0)) { + throw new IllegalArgumentException("Variable top argument string can only have a single collation " + + "element that has less than or equal to two PRIMARY strength " + "bytes"); } - + m_variableTopValue_ = (ce & CE_PRIMARY_MASK_) >> 16; - + return ce & CE_PRIMARY_MASK_; } - - /** - * Sets the variable top to a collation element value supplied. - * Variable top is set to the upper 16 bits. - * Lower 16 bits are ignored. - * @param varTop Collation element value, as returned by setVariableTop or - * getVariableTop + + /** + * Sets the variable top to a collation element value supplied. Variable top is set to the upper 16 bits. Lower 16 + * bits are ignored. + * + * @param varTop + * Collation element value, as returned by setVariableTop or getVariableTop * @see #getVariableTop * @see #setVariableTop(String) * @stable ICU 2.6 */ - public void setVariableTop(int varTop) - { + public void setVariableTop(int varTop) { m_variableTopValue_ = (varTop & CE_PRIMARY_MASK_) >> 16; } - + /** - * When numeric collation is turned on, this Collator generates a collation - * key for the numeric value of substrings of digits. 
This is a way to get - * '100' to sort AFTER '2' - * @param flag true to turn numeric collation on and false to turn it off + * When numeric collation is turned on, this Collator generates a collation key for the numeric value of substrings + * of digits. This is a way to get '100' to sort AFTER '2' + * + * @param flag + * true to turn numeric collation on and false to turn it off * @see #getNumericCollation * @see #setNumericCollationDefault * @stable ICU 2.8 */ - public void setNumericCollation(boolean flag) - { + public void setNumericCollation(boolean flag) { // sort substrings of digits as numbers m_isNumericCollation_ = flag; updateInternalState(); } + /** + * Set the order for scripts to be ordered in. + * + * @param order + * the reordering of scripts + * @see #getScriptOrder + * @see #setScriptOrderDefault + * @stable + */ + public void setScriptOrder(int... order) { + if (order != null) { + m_scriptOrder_ = new int[order.length]; + for (int i = 0; i < order.length; i++) { + m_scriptOrder_[i] = order[i]; + } + } else { + m_scriptOrder_ = null; + } + buildPermutationTable(); + } + // public getters -------------------------------------------------------- /** - * Gets the collation rules for this RuleBasedCollator. - * Equivalent to String getRules(RuleOption.FULL_RULES). + * Gets the collation rules for this RuleBasedCollator. Equivalent to String getRules(RuleOption.FULL_RULES). + * * @return returns the collation rules * @see #getRules(boolean) * @stable ICU 2.8 */ - public String getRules() - { + public String getRules() { return m_rules_; } - + /** - * Returns current rules. The argument defines whether full rules - * (UCA + tailored) rules are returned or just the tailoring. - * @param fullrules true if the rules that defines the full set of - * collation order is required, otherwise false for returning only - * the tailored rules + * Returns current rules. 
The argument defines whether full rules (UCA + tailored) rules are returned or just the + * tailoring. + * + * @param fullrules + * true if the rules that defines the full set of collation order is required, otherwise false for + * returning only the tailored rules * @return the current rules that defines this Collator. * @see #getRules() * @stable ICU 2.6 */ - public String getRules(boolean fullrules) - { + public String getRules(boolean fullrules) { if (!fullrules) { return m_rules_; } - // take the UCA rules and append real rules at the end + // take the UCA rules and append real rules at the end return UCA_.m_rules_.concat(m_rules_); } /** - * Get an UnicodeSet that contains all the characters and sequences - * tailored in this collator. - * @return a pointer to a UnicodeSet object containing all the - * code points and sequences that may sort differently than - * in the UCA. + * Get an UnicodeSet that contains all the characters and sequences tailored in this collator. + * + * @return a pointer to a UnicodeSet object containing all the code points and sequences that may sort differently + * than in the UCA. * @stable ICU 2.4 */ - public UnicodeSet getTailoredSet() - { + public UnicodeSet getTailoredSet() { try { - CollationRuleParser src = new CollationRuleParser(getRules()); - return src.getTailoredSet(); - } catch(Exception e) { - throw new IllegalStateException("A tailoring rule should not " + - "have errors. Something is quite wrong!"); + CollationRuleParser src = new CollationRuleParser(getRules()); + return src.getTailoredSet(); + } catch (Exception e) { + throw new IllegalStateException("A tailoring rule should not " + "have errors. 
Something is quite wrong!"); } } @@ -738,8 +708,9 @@ public final class RuleBasedCollator extends Collator UnicodeSet contractions; UnicodeSet expansions; UnicodeSet removedContractions; - boolean addPrefixes; - contContext(RuleBasedCollator coll, UnicodeSet contractions, UnicodeSet expansions, + boolean addPrefixes; + + contContext(RuleBasedCollator coll, UnicodeSet contractions, UnicodeSet expansions, UnicodeSet removedContractions, boolean addPrefixes) { this.coll = coll; this.contractions = contractions; @@ -748,63 +719,64 @@ public final class RuleBasedCollator extends Collator this.addPrefixes = addPrefixes; } } - - private void - addSpecial(contContext c, StringBuilder buffer, int CE) - { + + private void addSpecial(contContext c, StringBuilder buffer, int CE) { StringBuilder b = new StringBuilder(); int offset = (CE & 0xFFFFFF) - c.coll.m_contractionOffset_; int newCE = c.coll.m_contractionCE_[offset]; // we might have a contraction that ends from previous level - if(newCE != CollationElementIterator.CE_NOT_FOUND_) { - if(isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_ - && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_ - && c.addPrefixes) { + if (newCE != CollationElementIterator.CE_NOT_FOUND_) { + if (isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_ && isSpecial(newCE) + && getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) { addSpecial(c, buffer, newCE); } - if(buffer.length() > 1) { - if(c.contractions != null) { + if (buffer.length() > 1) { + if (c.contractions != null) { c.contractions.add(buffer.toString()); } - if(c.expansions != null && isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) { + if (c.expansions != null && isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) { c.expansions.add(buffer.toString()); } } - } - + } + offset++; // check whether we're doing contraction or prefix - 
if(getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) { - while(c.coll.m_contractionIndex_[offset] != 0xFFFF) { + if (getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) { + while (c.coll.m_contractionIndex_[offset] != 0xFFFF) { b.delete(0, b.length()); b.append(buffer); newCE = c.coll.m_contractionCE_[offset]; b.insert(0, c.coll.m_contractionIndex_[offset]); - if(isSpecial(newCE) && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) { + if (isSpecial(newCE) + && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) { addSpecial(c, b, newCE); } else { - if(c.contractions != null) { + if (c.contractions != null) { c.contractions.add(b.toString()); } - if(c.expansions != null && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) { + if (c.expansions != null && isSpecial(newCE) + && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) { c.expansions.add(b.toString()); } } offset++; } - } else if(getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_) { - while(c.coll.m_contractionIndex_[offset] != 0xFFFF) { + } else if (getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_) { + while (c.coll.m_contractionIndex_[offset] != 0xFFFF) { b.delete(0, b.length()); b.append(buffer); newCE = c.coll.m_contractionCE_[offset]; b.append(c.coll.m_contractionIndex_[offset]); - if(isSpecial(newCE) && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) { + if (isSpecial(newCE) + && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) { addSpecial(c, b, newCE); } else { - if(c.contractions != null) { + if (c.contractions != null) { c.contractions.add(b.toString()); } - if(c.expansions != null && 
isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) { + if (c.expansions != null && isSpecial(newCE) + && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) { c.expansions.add(b.toString()); } } @@ -812,26 +784,23 @@ public final class RuleBasedCollator extends Collator } } } - - private - void processSpecials(contContext c) - { + + private void processSpecials(contContext c) { int internalBufferSize = 512; - TrieIterator trieiterator - = new TrieIterator(c.coll.m_trie_); + TrieIterator trieiterator = new TrieIterator(c.coll.m_trie_); RangeValueIterator.Element element = new RangeValueIterator.Element(); while (trieiterator.next(element)) { int start = element.start; int limit = element.limit; int CE = element.value; StringBuilder contraction = new StringBuilder(internalBufferSize); - - if(isSpecial(CE)) { - if(((getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) || getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_)) { - while(start < limit) { - // if there are suppressed contractions, we don't + + if (isSpecial(CE)) { + if (((getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) || getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_)) { + while (start < limit) { + // if there are suppressed contractions, we don't // want to add them. 
- if(c.removedContractions != null && c.removedContractions.contains(start)) { + if (c.removedContractions != null && c.removedContractions.contains(start)) { start++; continue; } @@ -841,69 +810,72 @@ public final class RuleBasedCollator extends Collator addSpecial(c, contraction, CE); start++; } - } else if(c.expansions != null && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) { - while(start < limit) { + } else if (c.expansions != null && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) { + while (start < limit) { c.expansions.add(start++); } } } } } - + /** * Gets unicode sets containing contractions and/or expansions of a collator - * @param contractions if not null, set to contain contractions - * @param expansions if not null, set to contain expansions - * @param addPrefixes add the prefix contextual elements to contractions - * @throws Exception Throws an exception if any errors occurs. + * + * @param contractions + * if not null, set to contain contractions + * @param expansions + * if not null, set to contain expansions + * @param addPrefixes + * add the prefix contextual elements to contractions + * @throws Exception + * Throws an exception if any errors occurs. 
* @stable ICU 3.4 */ - public void - getContractionsAndExpansions(UnicodeSet contractions, UnicodeSet expansions, - boolean addPrefixes) throws Exception { - if(contractions != null) { + public void getContractionsAndExpansions(UnicodeSet contractions, UnicodeSet expansions, boolean addPrefixes) + throws Exception { + if (contractions != null) { contractions.clear(); } - if(expansions != null) { + if (expansions != null) { expansions.clear(); } String rules = getRules(); try { CollationRuleParser src = new CollationRuleParser(rules); - contContext c = new contContext(RuleBasedCollator.UCA_, - contractions, expansions, src.m_removeSet_, addPrefixes); - + contContext c = new contContext(RuleBasedCollator.UCA_, contractions, expansions, src.m_removeSet_, + addPrefixes); + // Add the UCA contractions processSpecials(c); // This is collator specific. Add contractions from a collator c.coll = this; - c.removedContractions = null; + c.removedContractions = null; processSpecials(c); } catch (Exception e) { throw e; } } - + /** *

    - * Get a Collation key for the argument String source from this - * RuleBasedCollator. + * Get a Collation key for the argument String source from this RuleBasedCollator. *

    *

    * General recommendation:
    - * If comparison are to be done to the same String multiple times, it would - * be more efficient to generate CollationKeys for the Strings and use - * CollationKey.compareTo(CollationKey) for the comparisons. - * If the each Strings are compared to only once, using the method - * RuleBasedCollator.compare(String, String) will have a better performance. + * If comparison are to be done to the same String multiple times, it would be more efficient to generate + * CollationKeys for the Strings and use CollationKey.compareTo(CollationKey) for the comparisons. If the each + * Strings are compared to only once, using the method RuleBasedCollator.compare(String, String) will have a better + * performance. *

    *

    * See the class documentation for an explanation about CollationKeys. *

    - * @param source the text String to be transformed into a collation key. - * @return the CollationKey for the given String based on this - * RuleBasedCollator's collation rules. If the source String is - * null, a null CollationKey is returned. + * + * @param source + * the text String to be transformed into a collation key. + * @return the CollationKey for the given String based on this RuleBasedCollator's collation rules. If the source + * String is null, a null CollationKey is returned. * @see CollationKey * @see #compare(String, String) * @see #getRawCollationKey @@ -913,37 +885,33 @@ public final class RuleBasedCollator extends Collator if (source == null) { return null; } - m_utilRawCollationKey_ = getRawCollationKey(source, - m_utilRawCollationKey_); + m_utilRawCollationKey_ = getRawCollationKey(source, m_utilRawCollationKey_); return new CollationKey(source, m_utilRawCollationKey_); } - + /** - * Gets the simpler form of a CollationKey for the String source following - * the rules of this Collator and stores the result into the user provided - * argument key. - * If key has a internal byte array of length that's too small for the - * result, the internal byte array will be grown to the exact required - * size. - * @param source the text String to be transformed into a RawCollationKey - * @param key output RawCollationKey to store results - * @return If key is null, a new instance of RawCollationKey will be - * created and returned, otherwise the user provided key will be - * returned. - * @see #getCollationKey + * Gets the simpler form of a CollationKey for the String source following the rules of this Collator and stores the + * result into the user provided argument key. If key has a internal byte array of length that's too small for the + * result, the internal byte array will be grown to the exact required size. 
+ * + * @param source + * the text String to be transformed into a RawCollationKey + * @param key + * output RawCollationKey to store results + * @return If key is null, a new instance of RawCollationKey will be created and returned, otherwise the user + * provided key will be returned. + * @see #getCollationKey * @see #compare(String, String) * @see RawCollationKey * @stable ICU 2.8 */ - public RawCollationKey getRawCollationKey(String source, - RawCollationKey key) - { + public RawCollationKey getRawCollationKey(String source, RawCollationKey key) { if (source == null) { return null; } int strength = getStrength(); m_utilCompare0_ = m_isCaseLevel_; - //m_utilCompare1_ = true; + // m_utilCompare1_ = true; m_utilCompare2_ = strength >= SECONDARY; m_utilCompare3_ = strength >= TERTIARY; m_utilCompare4_ = strength >= QUATERNARY; @@ -954,13 +922,13 @@ public final class RuleBasedCollator extends Collator m_utilBytesCount2_ = 0; m_utilBytesCount3_ = 0; m_utilBytesCount4_ = 0; - //m_utilBytesCount5_ = 0; - //m_utilCount0_ = 0; - //m_utilCount1_ = 0; + // m_utilBytesCount5_ = 0; + // m_utilCount0_ = 0; + // m_utilCount1_ = 0; m_utilCount2_ = 0; m_utilCount3_ = 0; m_utilCount4_ = 0; - //m_utilCount5_ = 0; + // m_utilCount5_ = 0; boolean doFrench = m_isFrenchCollation_ && m_utilCompare2_; // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. // If we have no qShifted, we don't need to set UCOL_COMMON_BOT4 so @@ -969,28 +937,24 @@ public final class RuleBasedCollator extends Collator byte hiragana4 = 0; if (m_isHiragana4_ && m_utilCompare4_) { // allocate one more space for hiragana, value for hiragana - hiragana4 = (byte)commonBottom4; - commonBottom4 ++; + hiragana4 = (byte) commonBottom4; + commonBottom4++; } int bottomCount4 = 0xFF - commonBottom4; // If we need to normalize, we'll do it all at once at the beginning! 
- if (m_utilCompare5_ && Normalizer.quickCheck(source, Normalizer.NFD,0) - != Normalizer.YES) { + if (m_utilCompare5_ && Normalizer.quickCheck(source, Normalizer.NFD, 0) != Normalizer.YES) { // if it is identical strength, we have to normalize the string to // NFD so that it will be appended correctly to the end of the sort // key source = Normalizer.decompose(source, false); - } - else if (getDecomposition() != NO_DECOMPOSITION - && Normalizer.quickCheck(source, Normalizer.FCD,0) - != Normalizer.YES) { + } else if (getDecomposition() != NO_DECOMPOSITION + && Normalizer.quickCheck(source, Normalizer.FCD, 0) != Normalizer.YES) { // for the rest of the strength, if decomposition is on, FCD is // enough for us to work on. - source = Normalizer.normalize(source,Normalizer.FCD); + source = Normalizer.normalize(source, Normalizer.FCD); } - getSortKeyBytes(source, doFrench, hiragana4, commonBottom4, - bottomCount4); + getSortKeyBytes(source, doFrench, hiragana4, commonBottom4, bottomCount4); if (key == null) { key = new RawCollationKey(); } @@ -999,136 +963,172 @@ public final class RuleBasedCollator extends Collator } /** - * Return true if an uppercase character is sorted before the corresponding lowercase character. - * See setCaseFirst(boolean) for details. + * Return true if an uppercase character is sorted before the corresponding lowercase character. See + * setCaseFirst(boolean) for details. + * * @see #setUpperCaseFirst * @see #setLowerCaseFirst * @see #isLowerCaseFirst * @see #setCaseFirstDefault - * @return true if upper cased characters are sorted before lower cased - * characters, false otherwise + * @return true if upper cased characters are sorted before lower cased characters, false otherwise * @stable ICU 2.8 */ - public boolean isUpperCaseFirst() - { + public boolean isUpperCaseFirst() { return (m_caseFirst_ == AttributeValue.UPPER_FIRST_); - } - + } + /** - * Return true if a lowercase character is sorted before the corresponding uppercase character. 
- * See setCaseFirst(boolean) for details. + * Return true if a lowercase character is sorted before the corresponding uppercase character. See + * setCaseFirst(boolean) for details. + * * @see #setUpperCaseFirst * @see #setLowerCaseFirst * @see #isUpperCaseFirst * @see #setCaseFirstDefault - * @return true lower cased characters are sorted before upper cased - * characters, false otherwise + * @return true lower cased characters are sorted before upper cased characters, false otherwise * @stable ICU 2.8 */ - public boolean isLowerCaseFirst() - { + public boolean isLowerCaseFirst() { return (m_caseFirst_ == AttributeValue.LOWER_FIRST_); } /** - * Checks if the alternate handling behaviour is the UCA defined SHIFTED or - * NON_IGNORABLE. - * If return value is true, then the alternate handling attribute for the - * Collator is SHIFTED. Otherwise if return value is false, then the - * alternate handling attribute for the Collator is NON_IGNORABLE - * See setAlternateHandlingShifted(boolean) for more details. + * Checks if the alternate handling behaviour is the UCA defined SHIFTED or NON_IGNORABLE. If return value is true, + * then the alternate handling attribute for the Collator is SHIFTED. Otherwise if return value is false, then the + * alternate handling attribute for the Collator is NON_IGNORABLE See setAlternateHandlingShifted(boolean) for more + * details. + * * @return true or false * @see #setAlternateHandlingShifted(boolean) * @see #setAlternateHandlingDefault * @stable ICU 2.8 */ - public boolean isAlternateHandlingShifted() - { + public boolean isAlternateHandlingShifted() { return m_isAlternateHandlingShifted_; } /** - * Checks if case level is set to true. - * See setCaseLevel(boolean) for details. + * Checks if case level is set to true. See setCaseLevel(boolean) for details. 
+ * * @return the case level mode * @see #setCaseLevelDefault * @see #isCaseLevel * @see #setCaseLevel(boolean) * @stable ICU 2.8 */ - public boolean isCaseLevel() - { + public boolean isCaseLevel() { return m_isCaseLevel_; } /** - * Checks if French Collation is set to true. - * See setFrenchCollation(boolean) for details. + * Checks if French Collation is set to true. See setFrenchCollation(boolean) for details. + * * @return true if French Collation is set to true, false otherwise * @see #setFrenchCollation(boolean) * @see #setFrenchCollationDefault * @stable ICU 2.8 */ - public boolean isFrenchCollation() - { - return m_isFrenchCollation_; - } + public boolean isFrenchCollation() { + return m_isFrenchCollation_; + } /** - * Checks if the Hiragana Quaternary mode is set on. - * See setHiraganaQuaternary(boolean) for more details. + * Checks if the Hiragana Quaternary mode is set on. See setHiraganaQuaternary(boolean) for more details. + * * @return flag true if Hiragana Quaternary mode is on, false otherwise * @see #setHiraganaQuaternaryDefault * @see #setHiraganaQuaternary(boolean) * @stable ICU 2.8 */ - public boolean isHiraganaQuaternary() - { + public boolean isHiraganaQuaternary() { return m_isHiragana4_; } - /** - * Gets the variable top value of a Collator. - * Lower 16 bits are undefined and should be ignored. + /** + * Gets the variable top value of a Collator. Lower 16 bits are undefined and should be ignored. + * * @return the variable top value of a Collator. * @see #setVariableTop * @stable ICU 2.6 */ - public int getVariableTop() - { - return m_variableTopValue_ << 16; + public int getVariableTop() { + return m_variableTopValue_ << 16; } - - /** - * Method to retrieve the numeric collation value. - * When numeric collation is turned on, this Collator generates a collation - * key for the numeric value of substrings of digits. This is a way to get - * '100' to sort AFTER '2' + + /** + * Method to retrieve the numeric collation value. 
When numeric collation is turned on, this Collator generates a + * collation key for the numeric value of substrings of digits. This is a way to get '100' to sort AFTER '2' + * * @see #setNumericCollation * @see #setNumericCollationDefault * @return true if numeric collation is turned on, false otherwise * @stable ICU 2.8 */ - public boolean getNumericCollation() - { + public boolean getNumericCollation() { return m_isNumericCollation_; } - + + /** + * Method to retrieve the script reordering. + * + * @see #setScriptOrder + * @see #setScriptOrderDefault + * @return the ordering of the scripts if one has been set, null otherwise. + * @stable + */ + public int[] getScriptOrder() { + if (m_scriptOrder_ != null) { + int[] ret = new int[m_scriptOrder_.length]; + for (int i = 0; i < m_scriptOrder_.length; i++) { + ret[i] = m_scriptOrder_[i]; + } + return ret; + } else { + return null; + } + } + + /** + * Method to retrieve the scripts equivalent to the given script for reordering. Some scripts will share the same + * "lead byte" used for the collation codes and so must be reordered together. + * + * @see #setScriptOrder + * @see #setScriptOrderDefault + * @param reorderCode code for which equivalents to be retrieved + * @return the set of scripts equivalent to the given script including the script given. 
+ * @stable + */ + public static int[] getScriptEquivalentsForReordering(int reorderCode) { + Set equivalentScriptsSet = new HashSet(); + int[] leadBytes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getLeadBytesForReorderCode(reorderCode); + for (int leadByte : leadBytes) { + int[] scripts = RuleBasedCollator.LEADBYTE_CONSTANTS_.getReorderCodesForLeadByte(leadByte); + for (int script : scripts) { + equivalentScriptsSet.add(script); + } + } + int[] equivalentScripts = new int[equivalentScriptsSet.size()]; + int i = 0; + for (int script : equivalentScriptsSet) { + equivalentScripts[i++] = script; + } + return equivalentScripts; + } + // public other methods ------------------------------------------------- /** - * Compares the equality of two RuleBasedCollator objects. - * RuleBasedCollator objects are equal if they have the same collation - * rules and the same attributes. - * @param obj the RuleBasedCollator to be compared to. - * @return true if this RuleBasedCollator has exactly the same - * collation behaviour as obj, false otherwise. + * Compares the equality of two RuleBasedCollator objects. RuleBasedCollator objects are equal if they have the same + * collation rules and the same attributes. + * + * @param obj + * the RuleBasedCollator to be compared to. + * @return true if this RuleBasedCollator has exactly the same collation behaviour as obj, false otherwise. * @stable ICU 2.8 */ - public boolean equals(Object obj) - { + public boolean equals(Object obj) { if (obj == null) { - return false; // super does class check + return false; // super does class check } if (this == obj) { return true; @@ -1136,19 +1136,28 @@ public final class RuleBasedCollator extends Collator if (getClass() != obj.getClass()) { return false; } - RuleBasedCollator other = (RuleBasedCollator)obj; + RuleBasedCollator other = (RuleBasedCollator) obj; // all other non-transient information is also contained in rules. 
- if (getStrength() != other.getStrength() - || getDecomposition() != other.getDecomposition() - || other.m_caseFirst_ != m_caseFirst_ - || other.m_caseSwitch_ != m_caseSwitch_ - || other.m_isAlternateHandlingShifted_ - != m_isAlternateHandlingShifted_ - || other.m_isCaseLevel_ != m_isCaseLevel_ - || other.m_isFrenchCollation_ != m_isFrenchCollation_ - || other.m_isHiragana4_ != m_isHiragana4_) { + if (getStrength() != other.getStrength() || getDecomposition() != other.getDecomposition() + || other.m_caseFirst_ != m_caseFirst_ || other.m_caseSwitch_ != m_caseSwitch_ + || other.m_isAlternateHandlingShifted_ != m_isAlternateHandlingShifted_ + || other.m_isCaseLevel_ != m_isCaseLevel_ || other.m_isFrenchCollation_ != m_isFrenchCollation_ + || other.m_isHiragana4_ != m_isHiragana4_) { return false; } + if (m_scriptOrder_ != null ^ other.m_scriptOrder_ != null) { + return false; + } + if (m_scriptOrder_ != null) { + if (m_scriptOrder_.length != other.m_scriptOrder_.length) { + return false; + } + for (int i = 0; i < m_scriptOrder_.length; i++) { + if (m_scriptOrder_[i] != other.m_scriptOrder_[i]) { + return false; + } + } + } boolean rules = m_rules_ == other.m_rules_; if (!rules && (m_rules_ != null && other.m_rules_ != null)) { rules = m_rules_.equals(other.m_rules_); @@ -1156,24 +1165,18 @@ public final class RuleBasedCollator extends Collator if (!rules || !ICUDebug.enabled("collation")) { return rules; } - if (m_addition3_ != other.m_addition3_ - || m_bottom3_ != other.m_bottom3_ - || m_bottomCount3_ != other.m_bottomCount3_ - || m_common3_ != other.m_common3_ - || m_isSimple3_ != other.m_isSimple3_ - || m_mask3_ != other.m_mask3_ - || m_minContractionEnd_ != other.m_minContractionEnd_ - || m_minUnsafe_ != other.m_minUnsafe_ - || m_top3_ != other.m_top3_ - || m_topCount3_ != other.m_topCount3_ - || !Arrays.equals(m_unsafe_, other.m_unsafe_)) { + if (m_addition3_ != other.m_addition3_ || m_bottom3_ != other.m_bottom3_ + || m_bottomCount3_ != other.m_bottomCount3_ || 
m_common3_ != other.m_common3_ + || m_isSimple3_ != other.m_isSimple3_ || m_mask3_ != other.m_mask3_ + || m_minContractionEnd_ != other.m_minContractionEnd_ || m_minUnsafe_ != other.m_minUnsafe_ + || m_top3_ != other.m_top3_ || m_topCount3_ != other.m_topCount3_ + || !Arrays.equals(m_unsafe_, other.m_unsafe_)) { return false; } if (!m_trie_.equals(other.m_trie_)) { // we should use the trie iterator here, but then this part is // only used in the test. - for (int i = UCharacter.MAX_VALUE; i >= UCharacter.MIN_VALUE; i --) - { + for (int i = UCharacter.MAX_VALUE; i >= UCharacter.MIN_VALUE; i--) { int v = m_trie_.getCodePointValue(i); int otherv = other.m_trie_.getCodePointValue(i); if (v != otherv) { @@ -1184,8 +1187,7 @@ public final class RuleBasedCollator extends Collator if (mask == 0xf1000000) { v -= (m_expansionOffset_ << 4); otherv -= (other.m_expansionOffset_ << 4); - } - else if (mask == 0xf2000000) { + } else if (mask == 0xf2000000) { v -= m_contractionOffset_; otherv -= other.m_contractionOffset_; } @@ -1209,17 +1211,17 @@ public final class RuleBasedCollator extends Collator if (m_expansionEndCEMaxSize_[i] != other.m_expansionEndCEMaxSize_[i]) { return false; } - } + } return true; } /** * Generates a unique hash code for this RuleBasedCollator. + * * @return the unique hash code for this Collator * @stable ICU 2.8 */ - public int hashCode() - { + public int hashCode() { String rules = getRules(); if (rules == null) { rules = ""; @@ -1228,72 +1230,62 @@ public final class RuleBasedCollator extends Collator } /** - * Compares the source text String to the target text String according to - * the collation rules, strength and decomposition mode for this - * RuleBasedCollator. - * Returns an integer less than, - * equal to or greater than zero depending on whether the source String is - * less than, equal to or greater than the target String. See the Collator - * class description for an example of use. - *

    + * Compares the source text String to the target text String according to the collation rules, strength and + * decomposition mode for this RuleBasedCollator. Returns an integer less than, equal to or greater than zero + * depending on whether the source String is less than, equal to or greater than the target String. See the Collator + * class description for an example of use.

    *

    * General recommendation:
    - * If comparison are to be done to the same String multiple times, it would - * be more efficient to generate CollationKeys for the Strings and use - * CollationKey.compareTo(CollationKey) for the comparisons. - * If speed performance is critical and object instantiation is to be - * reduced, further optimization may be achieved by generating a simpler - * key of the form RawCollationKey and reusing this RawCollationKey - * object with the method RuleBasedCollator.getRawCollationKey. Internal - * byte representation can be directly accessed via RawCollationKey and - * stored for future use. Like CollationKey, RawCollationKey provides a - * method RawCollationKey.compareTo for key comparisons. - * If the each Strings are compared to only once, using the method - * RuleBasedCollator.compare(String, String) will have a better performance. + * If comparison are to be done to the same String multiple times, it would be more efficient to generate + * CollationKeys for the Strings and use CollationKey.compareTo(CollationKey) for the comparisons. If speed + * performance is critical and object instantiation is to be reduced, further optimization may be achieved by + * generating a simpler key of the form RawCollationKey and reusing this RawCollationKey object with the method + * RuleBasedCollator.getRawCollationKey. Internal byte representation can be directly accessed via RawCollationKey + * and stored for future use. Like CollationKey, RawCollationKey provides a method RawCollationKey.compareTo for key + * comparisons. If the each Strings are compared to only once, using the method RuleBasedCollator.compare(String, + * String) will have a better performance. *

    - * @param source the source text String. - * @param target the target text String. - * @return Returns an integer value. Value is less than zero if source is - * less than target, value is zero if source and target are equal, - * value is greater than zero if source is greater than target. + * + * @param source + * the source text String. + * @param target + * the target text String. + * @return Returns an integer value. Value is less than zero if source is less than target, value is zero if source + * and target are equal, value is greater than zero if source is greater than target. * @see CollationKey * @see #getCollationKey * @stable ICU 2.8 */ - public int compare(String source, String target) - { + public int compare(String source, String target) { if (source == target) { return 0; } // Find the length of any leading portion that is equal int offset = getFirstUnmatchedOffset(source, target); - //return compareRegular(source, target, offset); - if(latinOneUse_) { - if ((offset < source.length() - && source.charAt(offset) > ENDOFLATINONERANGE_) - || (offset < target.length() - && target.charAt(offset) > ENDOFLATINONERANGE_)) { - // source or target start with non-latin-1 - return compareRegular(source, target, offset); - } else { - return compareUseLatin1(source, target, offset); - } + // return compareRegular(source, target, offset); + if (latinOneUse_) { + if ((offset < source.length() && source.charAt(offset) > ENDOFLATINONERANGE_) + || (offset < target.length() && target.charAt(offset) > ENDOFLATINONERANGE_)) { + // source or target start with non-latin-1 + return compareRegular(source, target, offset); + } else { + return compareUseLatin1(source, target, offset); + } } else { - return compareRegular(source, target, offset); + return compareRegular(source, target, offset); } } - + // package private inner interfaces -------------------------------------- /** * Attribute values to be used when setting the Collator options */ - static interface 
AttributeValue - { + static interface AttributeValue { /** - * Indicates that the default attribute value will be used. - * See individual attribute for details on its default value. + * Indicates that the default attribute value will be used. See individual attribute for details on its default + * value. */ static final int DEFAULT_ = -1; /** @@ -1329,13 +1321,12 @@ public final class RuleBasedCollator extends Collator */ static final int STRENGTH_LIMIT_ = Collator.IDENTICAL + 1; /** - * Turn the feature off - works for FRENCH_COLLATION, CASE_LEVEL, - * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE + * Turn the feature off - works for FRENCH_COLLATION, CASE_LEVEL, HIRAGANA_QUATERNARY_MODE and + * DECOMPOSITION_MODE */ static final int OFF_ = 16; /** - * Turn the feature on - works for FRENCH_COLLATION, CASE_LEVEL, - * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE + * Turn the feature on - works for FRENCH_COLLATION, CASE_LEVEL, HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE */ static final int ON_ = 17; /** @@ -1343,8 +1334,7 @@ public final class RuleBasedCollator extends Collator */ static final int SHIFTED_ = 20; /** - * Valid for ALTERNATE_HANDLING. Alternate handling will be non - * ignorable + * Valid for ALTERNATE_HANDLING. Alternate handling will be non ignorable */ static final int NON_IGNORABLE_ = 21; /** @@ -1362,73 +1352,55 @@ public final class RuleBasedCollator extends Collator } /** - * Attributes that collation service understands. All the attributes can - * take DEFAULT value, as well as the values specific to each one. + * Attributes that collation service understands. All the attributes can take DEFAULT value, as well as the values + * specific to each one. */ - static interface Attribute - { + static interface Attribute { /** - * Attribute for direction of secondary weights - used in French. 
- * Acceptable values are ON, which results in secondary weights being - * considered backwards and OFF which treats secondary weights in the - * order they appear. + * Attribute for direction of secondary weights - used in French. Acceptable values are ON, which results in + * secondary weights being considered backwards and OFF which treats secondary weights in the order they appear. */ static final int FRENCH_COLLATION_ = 0; /** - * Attribute for handling variable elements. Acceptable values are - * NON_IGNORABLE (default) which treats all the codepoints with - * non-ignorable primary weights in the same way, and SHIFTED which - * causes codepoints with primary weights that are equal or below the - * variable top value to be ignored on primary level and moved to the - * quaternary level. + * Attribute for handling variable elements. Acceptable values are NON_IGNORABLE (default) which treats all the + * codepoints with non-ignorable primary weights in the same way, and SHIFTED which causes codepoints with + * primary weights that are equal or below the variable top value to be ignored on primary level and moved to + * the quaternary level. */ static final int ALTERNATE_HANDLING_ = 1; /** - * Controls the ordering of upper and lower case letters. Acceptable - * values are OFF (default), which orders upper and lower case letters - * in accordance to their tertiary weights, UPPER_FIRST which forces - * upper case letters to sort before lower case letters, and - * LOWER_FIRST which does the opposite. + * Controls the ordering of upper and lower case letters. Acceptable values are OFF (default), which orders + * upper and lower case letters in accordance to their tertiary weights, UPPER_FIRST which forces upper case + * letters to sort before lower case letters, and LOWER_FIRST which does the opposite. */ static final int CASE_FIRST_ = 2; /** - * Controls whether an extra case level (positioned before the third - * level) is generated or not. 
Acceptable values are OFF (default), - * when case level is not generated, and ON which causes the case - * level to be generated. Contents of the case level are affected by - * the value of CASE_FIRST attribute. A simple way to ignore accent - * differences in a string is to set the strength to PRIMARY and - * enable case level. + * Controls whether an extra case level (positioned before the third level) is generated or not. Acceptable + * values are OFF (default), when case level is not generated, and ON which causes the case level to be + * generated. Contents of the case level are affected by the value of CASE_FIRST attribute. A simple way to + * ignore accent differences in a string is to set the strength to PRIMARY and enable case level. */ static final int CASE_LEVEL_ = 3; /** - * Controls whether the normalization check and necessary - * normalizations are performed. When set to OFF (default) no - * normalization check is performed. The correctness of the result is - * guaranteed only if the input data is in so-called FCD form (see - * users manual for more info). When set to ON, an incremental check - * is performed to see whether the input data is in the FCD form. If - * the data is not in the FCD form, incremental NFD normalization is - * performed. + * Controls whether the normalization check and necessary normalizations are performed. When set to OFF + * (default) no normalization check is performed. The correctness of the result is guaranteed only if the input + * data is in so-called FCD form (see users manual for more info). When set to ON, an incremental check is + * performed to see whether the input data is in the FCD form. If the data is not in the FCD form, incremental + * NFD normalization is performed. */ static final int NORMALIZATION_MODE_ = 4; /** - * The strength attribute. Can be either PRIMARY, SECONDARY, TERTIARY, - * QUATERNARY or IDENTICAL. The usual strength for most locales - * (except Japanese) is tertiary. 
Quaternary strength is useful when - * combined with shifted setting for alternate handling attribute and - * for JIS x 4061 collation, when it is used to distinguish between - * Katakana and Hiragana (this is achieved by setting the - * HIRAGANA_QUATERNARY mode to on. Otherwise, quaternary level is - * affected only by the number of non ignorable code points in the - * string. Identical strength is rarely useful, as it amounts to - * codepoints of the NFD form of the string. + * The strength attribute. Can be either PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL. The usual + * strength for most locales (except Japanese) is tertiary. Quaternary strength is useful when combined with + * shifted setting for alternate handling attribute and for JIS x 4061 collation, when it is used to distinguish + * between Katakana and Hiragana (this is achieved by setting the HIRAGANA_QUATERNARY mode to on. Otherwise, + * quaternary level is affected only by the number of non ignorable code points in the string. Identical + * strength is rarely useful, as it amounts to codepoints of the NFD form of the string. */ static final int STRENGTH_ = 5; /** - * When turned on, this attribute positions Hiragana before all - * non-ignorables on quaternary level. This is a sneaky way to produce - * JIS sort order. + * When turned on, this attribute positions Hiragana before all non-ignorables on quaternary level. This is a + * sneaky way to produce JIS sort order. */ static final int HIRAGANA_QUATERNARY_MODE_ = 6; /** @@ -1440,19 +1412,18 @@ public final class RuleBasedCollator extends Collator /** * DataManipulate singleton */ - static class DataManipulate implements Trie.DataManipulate - { + static class DataManipulate implements Trie.DataManipulate { // public methods ---------------------------------------------------- /** - * Internal method called to parse a lead surrogate's ce for the offset - * to the next trail surrogate data. 
- * @param ce collation element of the lead surrogate + * Internal method called to parse a lead surrogate's ce for the offset to the next trail surrogate data. + * + * @param ce + * collation element of the lead surrogate * @return data offset or 0 for the next trail surrogate * @stable ICU 2.8 */ - public final int getFoldingOffset(int ce) - { + public final int getFoldingOffset(int ce) { if (isSpecial(ce) && getTag(ce) == CE_SURROGATE_TAG_) { return (ce & 0xFFFFFF); } @@ -1462,10 +1433,9 @@ public final class RuleBasedCollator extends Collator /** * Get singleton object */ - public static final DataManipulate getInstance() - { + public static final DataManipulate getInstance() { if (m_instance_ == null) { - m_instance_ = new DataManipulate(); + m_instance_ = new DataManipulate(); } return m_instance_; } @@ -1482,44 +1452,172 @@ public final class RuleBasedCollator extends Collator /** * private to prevent initialization */ - private DataManipulate() - { + private DataManipulate() { } } /** * UCAConstants */ - static final class UCAConstants - { - int FIRST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000 - int LAST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000 - int FIRST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x00008705 - int FIRST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000000 - int LAST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000500 - int LAST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x0000DD05 - int FIRST_VARIABLE_[] = new int[2]; // 0x05070505 - int LAST_VARIABLE_[] = new int[2]; // 0x13CF0505 - int FIRST_NON_VARIABLE_[] = new int[2]; // 0x16200505 - int LAST_NON_VARIABLE_[] = new int[2]; // 0x767C0505 - int RESET_TOP_VALUE_[] = new int[2]; // 0x9F000303 - int FIRST_IMPLICIT_[] = new int[2]; - int LAST_IMPLICIT_[] = new int[2]; - int FIRST_TRAILING_[] = new int[2]; - int LAST_TRAILING_[] = new int[2]; - int PRIMARY_TOP_MIN_; - int PRIMARY_IMPLICIT_MIN_; // 0xE8000000 - int PRIMARY_IMPLICIT_MAX_; // 0xF0000000 - int PRIMARY_TRAILING_MIN_; // 
0xE8000000 - int PRIMARY_TRAILING_MAX_; // 0xF0000000 - int PRIMARY_SPECIAL_MIN_; // 0xE8000000 - int PRIMARY_SPECIAL_MAX_; // 0xF0000000 + static final class UCAConstants { + int FIRST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000 + int LAST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000 + int FIRST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x00008705 + int FIRST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000000 + int LAST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000500 + int LAST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x0000DD05 + int FIRST_VARIABLE_[] = new int[2]; // 0x05070505 + int LAST_VARIABLE_[] = new int[2]; // 0x13CF0505 + int FIRST_NON_VARIABLE_[] = new int[2]; // 0x16200505 + int LAST_NON_VARIABLE_[] = new int[2]; // 0x767C0505 + int RESET_TOP_VALUE_[] = new int[2]; // 0x9F000303 + int FIRST_IMPLICIT_[] = new int[2]; + int LAST_IMPLICIT_[] = new int[2]; + int FIRST_TRAILING_[] = new int[2]; + int LAST_TRAILING_[] = new int[2]; + int PRIMARY_TOP_MIN_; + int PRIMARY_IMPLICIT_MIN_; // 0xE8000000 + int PRIMARY_IMPLICIT_MAX_; // 0xF0000000 + int PRIMARY_TRAILING_MIN_; // 0xE8000000 + int PRIMARY_TRAILING_MAX_; // 0xF0000000 + int PRIMARY_SPECIAL_MIN_; // 0xE8000000 + int PRIMARY_SPECIAL_MAX_; // 0xF0000000 + } + + /** + * Script to Lead Byte and Lead Byte to Script Data + * + */ + static final class LeadByteConstants { + private static final int DATA_MASK_FOR_INDEX = 0x8000; + private static final int[] EMPTY_INT_ARRAY = new int[0]; + + private int serializedSize = 0; + + private Map SCRIPT_TO_LEAD_BYTES_INDEX; + private byte[] SCRIPT_TO_LEAD_BYTES_DATA; + + private int[] LEAD_BYTE_TO_SCRIPTS_INDEX; + private byte[] LEAD_BYTE_TO_SCRIPTS_DATA; + + LeadByteConstants() { + } + + void read(DataInputStream dis) throws IOException { + int readcount = 0; + int indexCount; + int dataSize; + + // script to lead bytes + indexCount = dis.readShort(); + readcount += 2; + dataSize = dis.readShort(); + readcount += 2; + this.SCRIPT_TO_LEAD_BYTES_INDEX = new 
HashMap(); + //System.out.println("Script to Lead Bytes Index - Count = " + indexCount); + for (int index = 0; index < indexCount; index++) { + int reorderCode = dis.readShort(); // reorder code + readcount += 2; + int dataOffset = 0xffff & dis.readShort(); // data offset + readcount += 2; + // System.out.println("\t-------------"); + // System.out.println("\toffset = " + Integer.toHexString(readcount - 4)); + // System.out.println("\treorderCode = " + Integer.toHexString(reorderCode)); + // System.out.println("\tdataOffset = " + Integer.toHexString(dataOffset)); + this.SCRIPT_TO_LEAD_BYTES_INDEX.put(reorderCode, dataOffset); + } + + this.SCRIPT_TO_LEAD_BYTES_DATA = new byte[dataSize * 2]; + dis.readFully(this.SCRIPT_TO_LEAD_BYTES_DATA, 0, this.SCRIPT_TO_LEAD_BYTES_DATA.length); + readcount += this.SCRIPT_TO_LEAD_BYTES_DATA.length; + + // lead byte to scripts + indexCount = dis.readShort(); + readcount += 2; + dataSize = dis.readShort(); + readcount += 2; + this.LEAD_BYTE_TO_SCRIPTS_INDEX = new int[indexCount]; + //System.out.println("Lead Byte to Scripts Index - Count = " + indexCount); + for (int index = 0; index < indexCount; index++) { + this.LEAD_BYTE_TO_SCRIPTS_INDEX[index] = 0xffff & dis.readShort(); + readcount += 2; + // System.out.println("\t-------------"); + // System.out.println("\toffset = " + Integer.toHexString(readcount - 2)); + // System.out.println("\tindex = " + Integer.toHexString(index)); + // System.out.println("\tdataOffset = " + Integer.toHexString(this.LEAD_BYTE_TO_SCRIPTS_INDEX[index])); + } + + this.LEAD_BYTE_TO_SCRIPTS_DATA = new byte[dataSize * 2]; + dis.readFully(this.LEAD_BYTE_TO_SCRIPTS_DATA, 0, this.LEAD_BYTE_TO_SCRIPTS_DATA.length); + readcount += this.LEAD_BYTE_TO_SCRIPTS_DATA.length; + + this.serializedSize = readcount; + } + + int getSerializedDataSize() { + return this.serializedSize; + } + + int[] getReorderCodesForLeadByte(int leadByte) { + if (leadByte >= this.LEAD_BYTE_TO_SCRIPTS_INDEX.length) { + return EMPTY_INT_ARRAY; + 
} + int offset = this.LEAD_BYTE_TO_SCRIPTS_INDEX[leadByte]; + if (offset == 0) { + return EMPTY_INT_ARRAY; + } + if ((offset & DATA_MASK_FOR_INDEX) == DATA_MASK_FOR_INDEX) { + int[] reorderCodes = new int[1]; + reorderCodes[0] = offset & ~DATA_MASK_FOR_INDEX; + } + + int length = readShort(this.LEAD_BYTE_TO_SCRIPTS_DATA, offset); + offset++; + + int[] reorderCodes = new int[length]; + + for (int code = 0; code < length; code++, offset++) { + reorderCodes[code] = readShort(this.LEAD_BYTE_TO_SCRIPTS_DATA, offset); + } + return reorderCodes; + } + + int[] getLeadBytesForReorderCode(int reorderCode) { + if (!this.SCRIPT_TO_LEAD_BYTES_INDEX.containsKey(reorderCode)) { + return EMPTY_INT_ARRAY; + } + int offset = this.SCRIPT_TO_LEAD_BYTES_INDEX.get(reorderCode); + + if (offset == 0) { + return EMPTY_INT_ARRAY; + } + + int[] leadBytes; + if ((offset & DATA_MASK_FOR_INDEX) == DATA_MASK_FOR_INDEX) { + leadBytes = new int[1]; + leadBytes[0] = offset & ~DATA_MASK_FOR_INDEX; + } else { + + int length = readShort(this.SCRIPT_TO_LEAD_BYTES_DATA, offset); + offset++; + + leadBytes = new int[length]; + for (int leadByte = 0; leadByte < length; leadByte++, offset++) { + leadBytes[leadByte] = readShort(this.SCRIPT_TO_LEAD_BYTES_DATA, offset); + } + } + return leadBytes; + } + + private static int readShort(byte[] data, int offset) { + return data[offset * 2] << 8 | data[offset * 2 + 1]; + } } // package private data member ------------------------------------------- - static final byte BYTE_FIRST_TAILORED_ = (byte)0x04; - static final byte BYTE_COMMON_ = (byte)0x05; + static final byte BYTE_FIRST_TAILORED_ = (byte) 0x04; + static final byte BYTE_COMMON_ = (byte) 0x05; static final int COMMON_TOP_2_ = 0x86; // int for unsigness static final int COMMON_BOTTOM_2_ = BYTE_COMMON_; static final int COMMON_BOTTOM_3 = 0x05; @@ -1561,19 +1659,15 @@ public final class RuleBasedCollator extends Collator static final int CE_CONTINUATION_MARKER_ = 0xC0; /** - * Size of collator raw data headers 
and options before the expansion - * data. This is used when expansion ces are to be retrieved. ICU4C uses - * the expansion offset starting from UCollator.UColHeader, hence ICU4J - * will have to minus that off to get the right expansion ce offset. In - * number of ints. + * Size of collator raw data headers and options before the expansion data. This is used when expansion ces are to + * be retrieved. ICU4C uses the expansion offset starting from UCollator.UColHeader, hence ICU4J will have to minus + * that off to get the right expansion ce offset. In number of ints. */ int m_expansionOffset_; /** - * Size of collator raw data headers, options and expansions before - * contraction data. This is used when contraction ces are to be retrieved. - * ICU4C uses contraction offset starting from UCollator.UColHeader, hence - * ICU4J will have to minus that off to get the right contraction ce - * offset. In number of chars. + * Size of collator raw data headers, options and expansions before contraction data. This is used when contraction + * ces are to be retrieved. ICU4C uses contraction offset starting from UCollator.UColHeader, hence ICU4J will have + * to minus that off to get the right contraction ce offset. In number of chars. 
*/ int m_contractionOffset_; /** @@ -1582,7 +1676,7 @@ public final class RuleBasedCollator extends Collator boolean m_isJamoSpecial_; // Collator options ------------------------------------------------------ - + int m_defaultVariableTopValue_; boolean m_defaultIsFrenchCollation_; boolean m_defaultIsAlternateHandlingShifted_; @@ -1592,7 +1686,8 @@ public final class RuleBasedCollator extends Collator int m_defaultStrength_; boolean m_defaultIsHiragana4_; boolean m_defaultIsNumericCollation_; - + int[] m_defaultScriptOrder_; + /** * Value of the variable top */ @@ -1609,6 +1704,10 @@ public final class RuleBasedCollator extends Collator * Numeric collation option */ boolean m_isNumericCollation_; + /** + * Script order + */ + int[] m_scriptOrder_; // end Collator options -------------------------------------------------- @@ -1629,28 +1728,23 @@ public final class RuleBasedCollator extends Collator */ IntTrie m_trie_; /** - * Table to store all collation elements that are the last element of an - * expansion. This is for use in StringSearch. + * Table to store all collation elements that are the last element of an expansion. This is for use in StringSearch. */ int m_expansionEndCE_[]; /** - * Table to store the maximum size of any expansions that end with the - * corresponding collation element in m_expansionEndCE_. For use in - * StringSearch too + * Table to store the maximum size of any expansions that end with the corresponding collation element in + * m_expansionEndCE_. For use in StringSearch too */ byte m_expansionEndCEMaxSize_[]; /** - * Heuristic table to store information on whether a char character is - * considered "unsafe". "Unsafe" character are combining marks or those - * belonging to some contraction sequence from the offset 1 onwards. - * E.g. if "ABC" is the only contraction, then 'B' and 'C' are considered - * unsafe. If we have another contraction "ZA" with the one above, then - * 'A', 'B', 'C' are "unsafe" but 'Z' is not. 
+ * Heuristic table to store information on whether a char character is considered "unsafe". "Unsafe" character are + * combining marks or those belonging to some contraction sequence from the offset 1 onwards. E.g. if "ABC" is the + * only contraction, then 'B' and 'C' are considered unsafe. If we have another contraction "ZA" with the one above, + * then 'A', 'B', 'C' are "unsafe" but 'Z' is not. */ byte m_unsafe_[]; /** - * Table to store information on whether a codepoint can occur as the last - * character in a contraction + * Table to store information on whether a codepoint can occur as the last character in a contraction */ byte m_contractionEnd_[]; /** @@ -1677,7 +1771,11 @@ public final class RuleBasedCollator extends Collator * UCD version */ VersionInfo m_UCD_version_; - + /** + * Lead byte and script data + */ + int m_leadByteToScripts; + int m_scriptToLeadBytes; /** * UnicodeData.txt property object */ @@ -1686,6 +1784,10 @@ public final class RuleBasedCollator extends Collator * UCA Constants */ static final UCAConstants UCA_CONSTANTS_; + /** + * Lead Byte Constants + */ + static LeadByteConstants LEADBYTE_CONSTANTS_; /** * Table for UCA and builder use */ @@ -1700,108 +1802,106 @@ public final class RuleBasedCollator extends Collator static final byte SORT_LEVEL_TERMINATOR_ = 1; -// These are values from UCA required for -// implicit generation and supressing sort key compression -// they should regularly be in the UCA, but if one -// is running without UCA, it could be a problem - static final int maxRegularPrimary = 0x7A; - static final int minImplicitPrimary = 0xE0; - static final int maxImplicitPrimary = 0xE4; - + // These are values from UCA required for + // implicit generation and supressing sort key compression + // they should regularly be in the UCA, but if one + // is running without UCA, it could be a problem + static final int maxRegularPrimary = 0x7A; + static final int minImplicitPrimary = 0xE0; + static final int maxImplicitPrimary = 
0xE4; // block to initialise character property database - static - { + static { // take pains to let static class init succeed, otherwise the class itself won't exist and - // clients will get a NoClassDefFoundException. Instead, make the constructors fail if + // clients will get a NoClassDefFoundException. Instead, make the constructors fail if // we can't load the UCA data. RuleBasedCollator iUCA_ = null; UCAConstants iUCA_CONSTANTS_ = null; + LeadByteConstants iLEADBYTE_CONSTANTS = null; char iUCA_CONTRACTIONS_[] = null; ImplicitCEGenerator iimpCEGen_ = null; - try - { + try { // !!! note what's going on here... // even though the static init of the class is not yet complete, we - // instantiate an instance of the class. So we'd better be sure that + // instantiate an instance of the class. So we'd better be sure that // instantiation doesn't rely on the static initialization that's // not complete yet! iUCA_ = new RuleBasedCollator(); iUCA_CONSTANTS_ = new UCAConstants(); - iUCA_CONTRACTIONS_ = CollatorReader.read(iUCA_, iUCA_CONSTANTS_); + iLEADBYTE_CONSTANTS = new LeadByteConstants(); + iUCA_CONTRACTIONS_ = CollatorReader.read(iUCA_, iUCA_CONSTANTS_, iLEADBYTE_CONSTANTS); // called before doing canonical closure for the UCA. 
iimpCEGen_ = new ImplicitCEGenerator(minImplicitPrimary, maxImplicitPrimary); - //iimpCEGen_ = new ImplicitCEGenerator(iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_, iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_); + // iimpCEGen_ = new ImplicitCEGenerator(iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_, + // iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_); iUCA_.init(); - ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_COLLATION_BASE_NAME, ULocale.ENGLISH); - iUCA_.m_rules_ = (String)rb.getObject("UCARules"); - } - catch (MissingResourceException ex) - { -// throw ex; - } - catch (IOException e) - { - // e.printStackTrace(); -// throw new MissingResourceException(e.getMessage(),"",""); + ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance( + ICUResourceBundle.ICU_COLLATION_BASE_NAME, ULocale.ENGLISH); + iUCA_.m_rules_ = (String) rb.getObject("UCARules"); + } catch (MissingResourceException ex) { + int i =12; + // throw ex; + } catch (IOException e) { + int i =12; + // e.printStackTrace(); + // throw new MissingResourceException(e.getMessage(),"",""); } UCA_ = iUCA_; UCA_CONSTANTS_ = iUCA_CONSTANTS_; + LEADBYTE_CONSTANTS_ = iLEADBYTE_CONSTANTS; UCA_CONTRACTIONS_ = iUCA_CONTRACTIONS_; impCEGen_ = iimpCEGen_; UCA_INIT_COMPLETE = true; } - private static void checkUCA() throws MissingResourceException { if (UCA_INIT_COMPLETE && UCA_ == null) { throw new MissingResourceException("Collator UCA data unavailable", "", ""); } } - + // package private constructors ------------------------------------------ /** - *

    Private contructor for use by subclasses. - * Public access to creating Collators is handled by the API - * Collator.getInstance() or RuleBasedCollator(String rules). - *

    - *

    - * This constructor constructs the UCA collator internally - *

    - */ - RuleBasedCollator() - { + *

    + * Private contructor for use by subclasses. Public access to creating Collators is handled by the API + * Collator.getInstance() or RuleBasedCollator(String rules). + *

    + *

    + * This constructor constructs the UCA collator internally + *

    + */ + RuleBasedCollator() { checkUCA(); initUtility(false); } /** - * Constructors a RuleBasedCollator from the argument locale. - * If no resource bundle is associated with the locale, UCA is used - * instead. + * Constructors a RuleBasedCollator from the argument locale. If no resource bundle is associated with the locale, + * UCA is used instead. + * * @param locale */ - RuleBasedCollator(ULocale locale) - { + RuleBasedCollator(ULocale locale) { checkUCA(); - ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_COLLATION_BASE_NAME, locale); + ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance( + ICUResourceBundle.ICU_COLLATION_BASE_NAME, locale); initUtility(false); if (rb != null) { try { // Use keywords, if supplied for lookup String collkey = locale.getKeywordValue("collation"); - if(collkey == null) { - collkey = rb.getStringWithFallback("collations/default"); + if (collkey == null) { + collkey = rb.getStringWithFallback("collations/default"); } - + // collations/default will always give a string back // keyword for the real collation data - // if "collations/collkey" will return null if collkey == null + // if "collations/collkey" will return null if collkey == null ICUResourceBundle elements = rb.getWithFallback("collations/" + collkey); if (elements != null) { // TODO: Determine actual & valid locale correctly @@ -1811,44 +1911,31 @@ public final class RuleBasedCollator extends Collator m_rules_ = elements.getString("Sequence"); ByteBuffer buf = elements.get("%%CollationBin").getBinary(); // %%CollationBin - if(buf!=null){ - // m_rules_ = (String)rules[1][1]; + if (buf != null) { + // m_rules_ = (String)rules[1][1]; CollatorReader.initRBC(this, buf); /* - BufferedInputStream input = - new BufferedInputStream( - new ByteArrayInputStream(map)); - /* - CollatorReader reader = new CollatorReader(input, false); - if (map.length > MIN_BINARY_DATA_SIZE_) { - reader.read(this, null); - } 
- else { - reader.readHeader(this); - reader.readOptions(this); - // duplicating UCA_'s data - setWithUCATables(); - } - */ + * BufferedInputStream input = new BufferedInputStream( new ByteArrayInputStream(map)); /* + * CollatorReader reader = new CollatorReader(input, false); if (map.length > + * MIN_BINARY_DATA_SIZE_) { reader.read(this, null); } else { reader.readHeader(this); + * reader.readOptions(this); // duplicating UCA_'s data setWithUCATables(); } + */ // at this point, we have read in the collator // now we need to check whether the binary image has // the right UCA and other versions - if(!m_UCA_version_.equals(UCA_.m_UCA_version_) || - !m_UCD_version_.equals(UCA_.m_UCD_version_)) { + if (!m_UCA_version_.equals(UCA_.m_UCA_version_) || !m_UCD_version_.equals(UCA_.m_UCD_version_)) { init(m_rules_); return; } init(); return; - } - else { + } else { init(m_rules_); return; } } - } - catch (Exception e) { - // e.printStackTrace(); + } catch (Exception e) { + e.printStackTrace(); // if failed use UCA. } } @@ -1858,11 +1945,9 @@ public final class RuleBasedCollator extends Collator // package private methods ----------------------------------------------- /** - * Sets this collator to use the tables in UCA. Note options not taken - * care of here. + * Sets this collator to use the tables in UCA. Note options not taken care of here. */ - final void setWithUCATables() - { + final void setWithUCATables() { m_contractionOffset_ = UCA_.m_contractionOffset_; m_expansionOffset_ = UCA_.m_expansionOffset_; m_expansion_ = UCA_.m_expansion_; @@ -1880,8 +1965,7 @@ public final class RuleBasedCollator extends Collator /** * Sets this collator to use the all options and tables in UCA. 
*/ - final void setWithUCAData() - { + final void setWithUCAData() { latinOneFailed_ = true; m_addition3_ = UCA_.m_addition3_; @@ -1894,8 +1978,7 @@ public final class RuleBasedCollator extends Collator setDecomposition(UCA_.getDecomposition()); m_defaultCaseFirst_ = UCA_.m_defaultCaseFirst_; m_defaultDecomposition_ = UCA_.m_defaultDecomposition_; - m_defaultIsAlternateHandlingShifted_ - = UCA_.m_defaultIsAlternateHandlingShifted_; + m_defaultIsAlternateHandlingShifted_ = UCA_.m_defaultIsAlternateHandlingShifted_; m_defaultIsCaseLevel_ = UCA_.m_defaultIsCaseLevel_; m_defaultIsFrenchCollation_ = UCA_.m_defaultIsFrenchCollation_; m_defaultIsHiragana4_ = UCA_.m_defaultIsHiragana4_; @@ -1923,25 +2006,23 @@ public final class RuleBasedCollator extends Collator } /** - * Test whether a char character is potentially "unsafe" for use as a - * collation starting point. "Unsafe" characters are combining marks or - * those belonging to some contraction sequence from the offset 1 onwards. - * E.g. if "ABC" is the only contraction, then 'B' and - * 'C' are considered unsafe. If we have another contraction "ZA" with - * the one above, then 'A', 'B', 'C' are "unsafe" but 'Z' is not. - * @param ch character to determin + * Test whether a char character is potentially "unsafe" for use as a collation starting point. "Unsafe" characters + * are combining marks or those belonging to some contraction sequence from the offset 1 onwards. E.g. if "ABC" is + * the only contraction, then 'B' and 'C' are considered unsafe. If we have another contraction "ZA" with the one + * above, then 'A', 'B', 'C' are "unsafe" but 'Z' is not. 
+ * + * @param ch + * character to determin * @return true if ch is unsafe, false otherwise */ - final boolean isUnsafe(char ch) - { + final boolean isUnsafe(char ch) { if (ch < m_minUnsafe_) { return false; } - + if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) { - if (UTF16.isLeadSurrogate(ch) - || UTF16.isTrailSurrogate(ch)) { - // Trail surrogate are always considered unsafe. + if (UTF16.isLeadSurrogate(ch) || UTF16.isTrailSurrogate(ch)) { + // Trail surrogate are always considered unsafe. return true; } ch &= HEURISTIC_OVERFLOW_MASK_; @@ -1952,13 +2033,13 @@ public final class RuleBasedCollator extends Collator } /** - * Approximate determination if a char character is at a contraction end. - * Guaranteed to be true if a character is at the end of a contraction, - * otherwise it is not deterministic. - * @param ch character to be determined + * Approximate determination if a char character is at a contraction end. Guaranteed to be true if a character is at + * the end of a contraction, otherwise it is not deterministic. 
+ * + * @param ch + * character to be determined */ - final boolean isContractionEnd(char ch) - { + final boolean isContractionEnd(char ch) { if (UTF16.isTrailSurrogate(ch)) { return true; } @@ -1977,33 +2058,35 @@ public final class RuleBasedCollator extends Collator /** * Retrieve the tag of a special ce - * @param ce ce to test + * + * @param ce + * ce to test * @return tag of ce */ - static int getTag(int ce) - { + static int getTag(int ce) { return (ce & CE_TAG_MASK_) >> CE_TAG_SHIFT_; } /** * Checking if ce is special - * @param ce to check + * + * @param ce + * to check * @return true if ce is special */ - static boolean isSpecial(int ce) - { + static boolean isSpecial(int ce) { return (ce & CE_SPECIAL_FLAG_) == CE_SPECIAL_FLAG_; } /** * Checks if the argument ce is a continuation - * @param ce collation element to test + * + * @param ce + * collation element to test * @return true if ce is a continuation */ - static final boolean isContinuation(int ce) - { - return ce != CollationElementIterator.NULLORDER - && (ce & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_; + static final boolean isContinuation(int ce) { + return ce != CollationElementIterator.NULLORDER && (ce & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_; } // private inner classes ------------------------------------------------ @@ -2011,20 +2094,16 @@ public final class RuleBasedCollator extends Collator // private variables ----------------------------------------------------- /** - * The smallest natural unsafe or contraction end char character before - * tailoring. - * This is a combining mark. + * The smallest natural unsafe or contraction end char character before tailoring. This is a combining mark. */ private static final int DEFAULT_MIN_HEURISTIC_ = 0x300; /** - * Heuristic table table size. Size is 32 bytes, 1 bit for each - * latin 1 char, and some power of two for hashing the rest of the chars. - * Size in bytes. + * Heuristic table table size. 
Size is 32 bytes, 1 bit for each latin 1 char, and some power of two for hashing the + * rest of the chars. Size in bytes. */ private static final char HEURISTIC_SIZE_ = 1056; /** - * Mask value down to "some power of two" - 1, - * number of bits, not num of bytes. + * Mask value down to "some power of two" - 1, number of bits, not num of bytes. */ private static final char HEURISTIC_OVERFLOW_MASK_ = 0x1fff; /** @@ -2032,8 +2111,7 @@ public final class RuleBasedCollator extends Collator */ private static final int HEURISTIC_SHIFT_ = 3; /** - * Unsafe character addition for character too large, it has to be folded - * then incremented. + * Unsafe character addition for character too large, it has to be folded then incremented. */ private static final char HEURISTIC_OVERFLOW_OFFSET_ = 256; /** @@ -2058,6 +2136,10 @@ public final class RuleBasedCollator extends Collator private int m_bottom3_; private int m_topCount3_; private int m_bottomCount3_; + /** + * Script reordering table + */ + private byte[] m_leadBytePermutationTable_; /** * Case first constants */ @@ -2081,20 +2163,19 @@ public final class RuleBasedCollator extends Collator // These values come from the UCA ---------------------------------------- /** - * This is an enum that lists magic special byte values from the - * fractional UCA + * This is an enum that lists magic special byte values from the fractional UCA */ - //private static final byte BYTE_ZERO_ = 0x0; - //private static final byte BYTE_LEVEL_SEPARATOR_ = (byte)0x01; - //private static final byte BYTE_SORTKEY_GLUE_ = (byte)0x02; - private static final byte BYTE_SHIFT_PREFIX_ = (byte)0x03; - /*private*/ static final byte BYTE_UNSHIFTED_MIN_ = BYTE_SHIFT_PREFIX_; - //private static final byte BYTE_FIRST_UCA_ = BYTE_COMMON_; - // TODO: Make the following values dynamic since they change with almost every UCA version. 
+ // private static final byte BYTE_ZERO_ = 0x0; + // private static final byte BYTE_LEVEL_SEPARATOR_ = (byte)0x01; + // private static final byte BYTE_SORTKEY_GLUE_ = (byte)0x02; + private static final byte BYTE_SHIFT_PREFIX_ = (byte) 0x03; + /* private */static final byte BYTE_UNSHIFTED_MIN_ = BYTE_SHIFT_PREFIX_; + // private static final byte BYTE_FIRST_UCA_ = BYTE_COMMON_; + // TODO: Make the following values dynamic since they change with almost every UCA version. static final byte CODAN_PLACEHOLDER = 0x12; - private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte)0x5B; + private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte) 0x5B; - private static final byte BYTE_UNSHIFTED_MAX_ = (byte)0xFF; + private static final byte BYTE_UNSHIFTED_MAX_ = (byte) 0xFF; private static final int TOTAL_2_ = COMMON_TOP_2_ - COMMON_BOTTOM_2_ - 1; private static final int FLAG_BIT_MASK_CASE_SWITCH_OFF_ = 0x80; private static final int FLAG_BIT_MASK_CASE_SWITCH_ON_ = 0x40; @@ -2103,22 +2184,18 @@ public final class RuleBasedCollator extends Collator private static final int COMMON_TOP_CASE_SWITCH_UPPER_3_ = 0xC5; private static final int COMMON_BOTTOM_3_ = 0x05; private static final int COMMON_BOTTOM_CASE_SWITCH_UPPER_3_ = 0x86; - private static final int COMMON_BOTTOM_CASE_SWITCH_LOWER_3_ = - COMMON_BOTTOM_3_; - private static final int TOP_COUNT_2_ = (int)(PROPORTION_2_ * TOTAL_2_); + private static final int COMMON_BOTTOM_CASE_SWITCH_LOWER_3_ = COMMON_BOTTOM_3_; + private static final int TOP_COUNT_2_ = (int) (PROPORTION_2_ * TOTAL_2_); private static final int BOTTOM_COUNT_2_ = TOTAL_2_ - TOP_COUNT_2_; private static final int COMMON_2_ = COMMON_BOTTOM_2_; private static final int COMMON_UPPER_FIRST_3_ = 0xC5; private static final int COMMON_NORMAL_3_ = COMMON_BOTTOM_3_; - //private static final int COMMON_4_ = (byte)0xFF; - - + // private static final int COMMON_4_ = (byte)0xFF; /* - * Minimum size required for the binary collation data in bytes. 
- * Size of UCA header + size of options to 4 bytes + * Minimum size required for the binary collation data in bytes. Size of UCA header + size of options to 4 bytes */ - //private static final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2; + // private static final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2; /** * If this collator is to generate only simple tertiaries for fast path @@ -2130,9 +2207,8 @@ public final class RuleBasedCollator extends Collator */ private boolean m_isFrenchCollation_; /** - * Flag indicating if shifted is requested for Quaternary alternate - * handling. If this is not true, the default for alternate handling will - * be non-ignorable. + * Flag indicating if shifted is requested for Quaternary alternate handling. If this is not true, the default for + * alternate handling will be non-ignorable. */ private boolean m_isAlternateHandlingShifted_; /** @@ -2141,12 +2217,10 @@ public final class RuleBasedCollator extends Collator private boolean m_isCaseLevel_; private static final int SORT_BUFFER_INIT_SIZE_ = 128; - private static final int SORT_BUFFER_INIT_SIZE_1_ = - SORT_BUFFER_INIT_SIZE_ << 3; + private static final int SORT_BUFFER_INIT_SIZE_1_ = SORT_BUFFER_INIT_SIZE_ << 3; private static final int SORT_BUFFER_INIT_SIZE_2_ = SORT_BUFFER_INIT_SIZE_; private static final int SORT_BUFFER_INIT_SIZE_3_ = SORT_BUFFER_INIT_SIZE_; - private static final int SORT_BUFFER_INIT_SIZE_CASE_ = - SORT_BUFFER_INIT_SIZE_ >> 2; + private static final int SORT_BUFFER_INIT_SIZE_CASE_ = SORT_BUFFER_INIT_SIZE_ >> 2; private static final int SORT_BUFFER_INIT_SIZE_4_ = SORT_BUFFER_INIT_SIZE_; private static final int CE_CONTINUATION_TAG_ = 0xC0; @@ -2154,11 +2228,11 @@ public final class RuleBasedCollator extends Collator private static final int LAST_BYTE_MASK_ = 0xFF; - //private static final int CE_RESET_TOP_VALUE_ = 0x9F000303; - //private static final int CE_NEXT_TOP_VALUE_ = 0xE8960303; + // private static final int CE_RESET_TOP_VALUE_ = 0x9F000303; + // 
private static final int CE_NEXT_TOP_VALUE_ = 0xE8960303; - private static final byte SORT_CASE_BYTE_START_ = (byte)0x80; - private static final byte SORT_CASE_SHIFT_START_ = (byte)7; + private static final byte SORT_CASE_BYTE_START_ = (byte) 0x80; + private static final byte SORT_CASE_SHIFT_START_ = (byte) 7; /** * CE buffer size @@ -2166,9 +2240,9 @@ public final class RuleBasedCollator extends Collator private static final int CE_BUFFER_SIZE_ = 512; // variables for Latin-1 processing - boolean latinOneUse_ = false; + boolean latinOneUse_ = false; boolean latinOneRegenTable_ = false; - boolean latinOneFailed_ = false; + boolean latinOneFailed_ = false; int latinOneTableLen_ = 0; int latinOneCEs_[] = null; @@ -2183,7 +2257,7 @@ public final class RuleBasedCollator extends Collator * Utility comparison flags */ private boolean m_utilCompare0_; - //private boolean m_utilCompare1_; + // private boolean m_utilCompare1_; private boolean m_utilCompare2_; private boolean m_utilCompare3_; private boolean m_utilCompare4_; @@ -2196,7 +2270,7 @@ public final class RuleBasedCollator extends Collator private byte m_utilBytes2_[]; private byte m_utilBytes3_[]; private byte m_utilBytes4_[]; - //private byte m_utilBytes5_[]; + // private byte m_utilBytes5_[]; private RawCollationKey m_utilRawCollationKey_; private int m_utilBytesCount0_; @@ -2204,13 +2278,13 @@ public final class RuleBasedCollator extends Collator private int m_utilBytesCount2_; private int m_utilBytesCount3_; private int m_utilBytesCount4_; - //private int m_utilBytesCount5_; - //private int m_utilCount0_; - //private int m_utilCount1_; + // private int m_utilBytesCount5_; + // private int m_utilCount0_; + // private int m_utilCount1_; private int m_utilCount2_; private int m_utilCount3_; private int m_utilCount4_; - //private int m_utilCount5_; + // private int m_utilCount5_; private int m_utilFrenchStart_; private int m_utilFrenchEnd_; @@ -2231,17 +2305,16 @@ public final class RuleBasedCollator extends 
Collator // private methods ------------------------------------------------------- - private void init(String rules) throws Exception - { + private void init(String rules) throws Exception { setWithUCAData(); - CollationParsedRuleBuilder builder - = new CollationParsedRuleBuilder(rules); + CollationParsedRuleBuilder builder = new CollationParsedRuleBuilder(rules); builder.setRules(this); m_rules_ = rules; init(); + buildPermutationTable(); initUtility(false); } - + private final int compareRegular(String source, String target, int offset) { if (m_srcUtilIter_ == null) { initUtility(true); @@ -2249,7 +2322,7 @@ public final class RuleBasedCollator extends Collator int strength = getStrength(); // setting up the collator parameters m_utilCompare0_ = m_isCaseLevel_; - //m_utilCompare1_ = true; + // m_utilCompare1_ = true; m_utilCompare2_ = strength >= SECONDARY; m_utilCompare3_ = strength >= TERTIARY; m_utilCompare4_ = strength >= QUATERNARY; @@ -2265,14 +2338,11 @@ public final class RuleBasedCollator extends Collator } // This is the lowest primary value that will not be ignored if shifted - int lowestpvalue = m_isAlternateHandlingShifted_ - ? m_variableTopValue_ << 16 : 0; + int lowestpvalue = m_isAlternateHandlingShifted_ ? 
m_variableTopValue_ << 16 : 0; m_srcUtilCEBufferSize_ = 0; m_tgtUtilCEBufferSize_ = 0; - int result = doPrimaryCompare(doHiragana4, lowestpvalue, source, - target, offset); - if (m_srcUtilCEBufferSize_ == -1 - && m_tgtUtilCEBufferSize_ == -1) { + int result = doPrimaryCompare(doHiragana4, lowestpvalue, source, target, offset); + if (m_srcUtilCEBufferSize_ == -1 && m_tgtUtilCEBufferSize_ == -1) { // since the cebuffer is cleared when we have determined that // either source is greater than target or vice versa, the return // result is the comparison result and not the hiragana result @@ -2302,13 +2372,12 @@ public final class RuleBasedCollator extends Collator } } - if (doShift4) { // checkQuad + if (doShift4) { // checkQuad result = doQuaternaryCompare(lowestpvalue); if (result != 0) { return result; } - } - else if (doHiragana4 && hiraganaresult != 0) { + } else if (doHiragana4 && hiraganaresult != 0) { // If we're fine on quaternaries, we might be different // on Hiragana. This, however, might fail us in shifted. return hiraganaresult; @@ -2316,7 +2385,7 @@ public final class RuleBasedCollator extends Collator // For IDENTICAL comparisons, we use a bitwise character comparison // as a tiebreaker if all else is equal. - // Getting here should be quite rare - strings are not identical - + // Getting here should be quite rare - strings are not identical - // that is checked first, but compared == through all other checks. if (m_utilCompare5_) { return doIdenticalCompare(source, target, offset, true); @@ -2327,236 +2396,203 @@ public final class RuleBasedCollator extends Collator // Is this primary weight compressible? // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit). // TODO: This should use per-lead-byte flags from FractionalUCA.txt. 
- static boolean - isCompressible(int primary1) { + static boolean isCompressible(int primary1) { return BYTE_FIRST_NON_LATIN_PRIMARY_ <= primary1 && primary1 <= maxRegularPrimary; } /** * Gets the 2 bytes of primary order and adds it to the primary byte array - * @param ce current ce - * @param notIsContinuation flag indicating if the current bytes belong to - * a continuation ce - * @param doShift flag indicating if ce is to be shifted - * @param leadPrimary lead primary used for compression - * @param commonBottom4 common byte value for Quaternary - * @param bottomCount4 smallest byte value for Quaternary + * + * @param ce + * current ce + * @param notIsContinuation + * flag indicating if the current bytes belong to a continuation ce + * @param doShift + * flag indicating if ce is to be shifted + * @param leadPrimary + * lead primary used for compression + * @param commonBottom4 + * common byte value for Quaternary + * @param bottomCount4 + * smallest byte value for Quaternary * @return the new lead primary for compression */ - private final int doPrimaryBytes(int ce, boolean notIsContinuation, - boolean doShift, int leadPrimary, - int commonBottom4, int bottomCount4) - { + private final int doPrimaryBytes(int ce, boolean notIsContinuation, boolean doShift, int leadPrimary, + int commonBottom4, int bottomCount4) { int p2 = (ce >>>= 16) & LAST_BYTE_MASK_; // in ints for unsigned - int p1 = ce >>> 8; // comparison - if (doShift) { - if (m_utilCount4_ > 0) { - while (m_utilCount4_ > bottomCount4) { - m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, - (byte)(commonBottom4 + bottomCount4)); - m_utilBytesCount4_ ++; - m_utilCount4_ -= bottomCount4; - } - m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, - (byte)(commonBottom4 - + (m_utilCount4_ - 1))); - m_utilBytesCount4_ ++; - m_utilCount4_ = 0; - } - // dealing with a variable and we're treating them as shifted - // This is a shifted ignorable - if (p1 != 0) { - // we need to check this since we 
could be in continuation - m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, - (byte)p1); - m_utilBytesCount4_ ++; - } - if (p2 != 0) { - m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, - (byte)p2); - m_utilBytesCount4_ ++; - } - } - else { - // Note: This code assumes that the table is well built - // i.e. not having 0 bytes where they are not supposed to be. - // Usually, we'll have non-zero primary1 & primary2, except - // in cases of LatinOne and friends, when primary2 will be - // regular and simple sortkey calc - if (p1 != CollationElementIterator.IGNORABLE) { - if (notIsContinuation) { - if (leadPrimary == p1) { - m_utilBytes1_ = append(m_utilBytes1_, - m_utilBytesCount1_, (byte)p2); - m_utilBytesCount1_ ++; - } - else { - if (leadPrimary != 0) { - m_utilBytes1_ = append(m_utilBytes1_, - m_utilBytesCount1_, - ((p1 > leadPrimary) - ? BYTE_UNSHIFTED_MAX_ - : BYTE_UNSHIFTED_MIN_)); - m_utilBytesCount1_ ++; - } - if (p2 == CollationElementIterator.IGNORABLE) { - // one byter, not compressed - m_utilBytes1_ = append(m_utilBytes1_, - m_utilBytesCount1_, - (byte)p1); - m_utilBytesCount1_ ++; - leadPrimary = 0; - } - else if (isCompressible(p1)) { - // compress - leadPrimary = p1; - m_utilBytes1_ = append(m_utilBytes1_, - m_utilBytesCount1_, - (byte)p1); - m_utilBytesCount1_ ++; - m_utilBytes1_ = append(m_utilBytes1_, - m_utilBytesCount1_, - (byte)p2); - m_utilBytesCount1_ ++; - } - else { - leadPrimary = 0; - m_utilBytes1_ = append(m_utilBytes1_, - m_utilBytesCount1_, - (byte)p1); - m_utilBytesCount1_ ++; - m_utilBytes1_ = append(m_utilBytes1_, - m_utilBytesCount1_, - (byte)p2); - m_utilBytesCount1_ ++; - } - } - } - else { - // continuation, add primary to the key, no compression - m_utilBytes1_ = append(m_utilBytes1_, - m_utilBytesCount1_, (byte)p1); - m_utilBytesCount1_ ++; - if (p2 != CollationElementIterator.IGNORABLE) { - m_utilBytes1_ = append(m_utilBytes1_, - m_utilBytesCount1_, (byte)p2); - // second part - m_utilBytesCount1_ ++; - } - } 
- } - } - return leadPrimary; + int p1 = ce >>> 8; // comparison + if (doShift) { + if (m_utilCount4_ > 0) { + while (m_utilCount4_ > bottomCount4) { + m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, (byte) (commonBottom4 + bottomCount4)); + m_utilBytesCount4_++; + m_utilCount4_ -= bottomCount4; + } + m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, (byte) (commonBottom4 + (m_utilCount4_ - 1))); + m_utilBytesCount4_++; + m_utilCount4_ = 0; + } + // dealing with a variable and we're treating them as shifted + // This is a shifted ignorable + if (p1 != 0) { + // we need to check this since we could be in continuation + m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, (byte) p1); + m_utilBytesCount4_++; + } + if (p2 != 0) { + m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, (byte) p2); + m_utilBytesCount4_++; + } + } else { + // Note: This code assumes that the table is well built + // i.e. not having 0 bytes where they are not supposed to be. + // Usually, we'll have non-zero primary1 & primary2, except + // in cases of LatinOne and friends, when primary2 will be + // regular and simple sortkey calc + if (p1 != CollationElementIterator.IGNORABLE) { + if (notIsContinuation) { + if (leadPrimary == p1) { + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) p2); + m_utilBytesCount1_++; + } else { + if (leadPrimary != 0) { + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, + ((p1 > leadPrimary) ? 
BYTE_UNSHIFTED_MAX_ : BYTE_UNSHIFTED_MIN_)); + m_utilBytesCount1_++; + } + if (p2 == CollationElementIterator.IGNORABLE) { + // one byter, not compressed + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) p1); + m_utilBytesCount1_++; + leadPrimary = 0; + } else if (isCompressible(p1)) { + // compress + leadPrimary = p1; + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) p1); + m_utilBytesCount1_++; + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) p2); + m_utilBytesCount1_++; + } else { + leadPrimary = 0; + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) p1); + m_utilBytesCount1_++; + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) p2); + m_utilBytesCount1_++; + } + } + } else { + // continuation, add primary to the key, no compression + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) p1); + m_utilBytesCount1_++; + if (p2 != CollationElementIterator.IGNORABLE) { + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) p2); + // second part + m_utilBytesCount1_++; + } + } + } + } + return leadPrimary; } /** * Gets the secondary byte and adds it to the secondary byte array - * @param ce current ce - * @param notIsContinuation flag indicating if the current bytes belong to - * a continuation ce - * @param doFrench flag indicator if french sort is to be performed + * + * @param ce + * current ce + * @param notIsContinuation + * flag indicating if the current bytes belong to a continuation ce + * @param doFrench + * flag indicator if french sort is to be performed */ - private final void doSecondaryBytes(int ce, boolean notIsContinuation, - boolean doFrench) - { + private final void doSecondaryBytes(int ce, boolean notIsContinuation, boolean doFrench) { int s = (ce >>= 8) & LAST_BYTE_MASK_; // int for comparison if (s != 0) { if (!doFrench) { // This is compression code. 
if (s == COMMON_2_ && notIsContinuation) { - m_utilCount2_ ++; - } - else { + m_utilCount2_++; + } else { if (m_utilCount2_ > 0) { if (s > COMMON_2_) { // not necessary for 4th level. while (m_utilCount2_ > TOP_COUNT_2_) { - m_utilBytes2_ = append(m_utilBytes2_, - m_utilBytesCount2_, - (byte)(COMMON_TOP_2_ - TOP_COUNT_2_)); - m_utilBytesCount2_ ++; + m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_, + (byte) (COMMON_TOP_2_ - TOP_COUNT_2_)); + m_utilBytesCount2_++; m_utilCount2_ -= TOP_COUNT_2_; } - m_utilBytes2_ = append(m_utilBytes2_, - m_utilBytesCount2_, - (byte)(COMMON_TOP_2_ - - (m_utilCount2_ - 1))); - m_utilBytesCount2_ ++; - } - else { + m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_, + (byte) (COMMON_TOP_2_ - (m_utilCount2_ - 1))); + m_utilBytesCount2_++; + } else { while (m_utilCount2_ > BOTTOM_COUNT_2_) { - m_utilBytes2_ = append(m_utilBytes2_, - m_utilBytesCount2_, - (byte)(COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_)); - m_utilBytesCount2_ ++; + m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_, + (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_)); + m_utilBytesCount2_++; m_utilCount2_ -= BOTTOM_COUNT_2_; } - m_utilBytes2_ = append(m_utilBytes2_, - m_utilBytesCount2_, - (byte)(COMMON_BOTTOM_2_ - + (m_utilCount2_ - 1))); - m_utilBytesCount2_ ++; + m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_, + (byte) (COMMON_BOTTOM_2_ + (m_utilCount2_ - 1))); + m_utilBytesCount2_++; } m_utilCount2_ = 0; } - m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_, - (byte)s); - m_utilBytesCount2_ ++; + m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_, (byte) s); + m_utilBytesCount2_++; + } + } else { + m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_, (byte) s); + m_utilBytesCount2_++; + // Do the special handling for French secondaries + // We need to get continuation elements and do intermediate + // restore + // abc1c2c3de with french secondaries need to be edc1c2c3ba + // NOT edc3c2c1ba + if (notIsContinuation) { + if 
(m_utilFrenchStart_ != -1) { + // reverse secondaries from frenchStartPtr up to + // frenchEndPtr + reverseBuffer(m_utilBytes2_); + m_utilFrenchStart_ = -1; + } + } else { + if (m_utilFrenchStart_ == -1) { + m_utilFrenchStart_ = m_utilBytesCount2_ - 2; + } + m_utilFrenchEnd_ = m_utilBytesCount2_ - 1; } - } - else { - m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_, - (byte)s); - m_utilBytesCount2_ ++; - // Do the special handling for French secondaries - // We need to get continuation elements and do intermediate - // restore - // abc1c2c3de with french secondaries need to be edc1c2c3ba - // NOT edc3c2c1ba - if (notIsContinuation) { - if (m_utilFrenchStart_ != -1) { - // reverse secondaries from frenchStartPtr up to - // frenchEndPtr - reverseBuffer(m_utilBytes2_); - m_utilFrenchStart_ = -1; - } - } - else { - if (m_utilFrenchStart_ == -1) { - m_utilFrenchStart_ = m_utilBytesCount2_ - 2; - } - m_utilFrenchEnd_ = m_utilBytesCount2_ - 1; - } } } } /** * Reverse the argument buffer - * @param buffer to reverse + * + * @param buffer + * to reverse */ - private void reverseBuffer(byte buffer[]) - { + private void reverseBuffer(byte buffer[]) { int start = m_utilFrenchStart_; int end = m_utilFrenchEnd_; while (start < end) { byte b = buffer[start]; - buffer[start ++] = buffer[end]; - buffer[end --] = b; + buffer[start++] = buffer[end]; + buffer[end--] = b; } } /** * Insert the case shifting byte if required - * @param caseshift value + * + * @param caseshift + * value * @return new caseshift value */ - private final int doCaseShift(int caseshift) - { - if (caseshift == 0) { - m_utilBytes0_ = append(m_utilBytes0_, m_utilBytesCount0_, - SORT_CASE_BYTE_START_); - m_utilBytesCount0_ ++; + private final int doCaseShift(int caseshift) { + if (caseshift == 0) { + m_utilBytes0_ = append(m_utilBytes0_, m_utilBytesCount0_, SORT_CASE_BYTE_START_); + m_utilBytesCount0_++; caseshift = SORT_CASE_SHIFT_START_; } return caseshift; @@ -2564,42 +2600,35 @@ public final class 
RuleBasedCollator extends Collator /** * Performs the casing sort - * @param tertiary byte in ints for easy comparison - * @param notIsContinuation flag indicating if the current bytes belong to - * a continuation ce + * + * @param tertiary + * byte in ints for easy comparison + * @param notIsContinuation + * flag indicating if the current bytes belong to a continuation ce * @param caseshift * @return the new value of case shift */ - private final int doCaseBytes(int tertiary, boolean notIsContinuation, - int caseshift) - { + private final int doCaseBytes(int tertiary, boolean notIsContinuation, int caseshift) { caseshift = doCaseShift(caseshift); if (notIsContinuation && tertiary != 0) { - byte casebits = (byte)(tertiary & 0xC0); + byte casebits = (byte) (tertiary & 0xC0); if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) { if (casebits == 0) { - m_utilBytes0_[m_utilBytesCount0_ - 1] - |= (1 << (-- caseshift)); + m_utilBytes0_[m_utilBytesCount0_ - 1] |= (1 << (--caseshift)); + } else { + // second bit + caseshift = doCaseShift(caseshift - 1); + m_utilBytes0_[m_utilBytesCount0_ - 1] |= ((casebits >> 6) & 1) << (--caseshift); } - else { - // second bit - caseshift = doCaseShift(caseshift - 1); - m_utilBytes0_[m_utilBytesCount0_ - 1] - |= ((casebits >> 6) & 1) << (-- caseshift); - } - } - else { + } else { if (casebits != 0) { - m_utilBytes0_[m_utilBytesCount0_ - 1] - |= 1 << (-- caseshift); + m_utilBytes0_[m_utilBytesCount0_ - 1] |= 1 << (--caseshift); // second bit caseshift = doCaseShift(caseshift); - m_utilBytes0_[m_utilBytesCount0_ - 1] - |= ((casebits >> 7) & 1) << (-- caseshift); - } - else { - caseshift --; + m_utilBytes0_[m_utilBytesCount0_ - 1] |= ((casebits >> 7) & 1) << (--caseshift); + } else { + caseshift--; } } } @@ -2609,114 +2638,102 @@ public final class RuleBasedCollator extends Collator /** * Gets the tertiary byte and adds it to the tertiary byte array - * @param tertiary byte in int for easy comparison - * @param notIsContinuation flag indicating 
if the current bytes belong to - * a continuation ce + * + * @param tertiary + * byte in int for easy comparison + * @param notIsContinuation + * flag indicating if the current bytes belong to a continuation ce */ - private final void doTertiaryBytes(int tertiary, boolean notIsContinuation) - { + private final void doTertiaryBytes(int tertiary, boolean notIsContinuation) { if (tertiary != 0) { // This is compression code. // sequence size check is included in the if clause if (tertiary == m_common3_ && notIsContinuation) { - m_utilCount3_ ++; - } - else { + m_utilCount3_++; + } else { int common3 = m_common3_ & LAST_BYTE_MASK_; if (tertiary > common3 && m_common3_ == COMMON_NORMAL_3_) { tertiary += m_addition3_; - } - else if (tertiary <= common3 - && m_common3_ == COMMON_UPPER_FIRST_3_) { + } else if (tertiary <= common3 && m_common3_ == COMMON_UPPER_FIRST_3_) { tertiary -= m_addition3_; } if (m_utilCount3_ > 0) { if (tertiary > common3) { while (m_utilCount3_ > m_topCount3_) { - m_utilBytes3_ = append(m_utilBytes3_, - m_utilBytesCount3_, - (byte)(m_top3_ - m_topCount3_)); - m_utilBytesCount3_ ++; + m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_, (byte) (m_top3_ - m_topCount3_)); + m_utilBytesCount3_++; m_utilCount3_ -= m_topCount3_; } - m_utilBytes3_ = append(m_utilBytes3_, - m_utilBytesCount3_, - (byte)(m_top3_ - - (m_utilCount3_ - 1))); - m_utilBytesCount3_ ++; - } - else { + m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_, + (byte) (m_top3_ - (m_utilCount3_ - 1))); + m_utilBytesCount3_++; + } else { while (m_utilCount3_ > m_bottomCount3_) { - m_utilBytes3_ = append(m_utilBytes3_, - m_utilBytesCount3_, - (byte)(m_bottom3_ + m_bottomCount3_)); - m_utilBytesCount3_ ++; + m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_, + (byte) (m_bottom3_ + m_bottomCount3_)); + m_utilBytesCount3_++; m_utilCount3_ -= m_bottomCount3_; } - m_utilBytes3_ = append(m_utilBytes3_, - m_utilBytesCount3_, - (byte)(m_bottom3_ - + (m_utilCount3_ - 1))); - 
m_utilBytesCount3_ ++; + m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_, + (byte) (m_bottom3_ + (m_utilCount3_ - 1))); + m_utilBytesCount3_++; } m_utilCount3_ = 0; } - m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_, - (byte)tertiary); - m_utilBytesCount3_ ++; + m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_, (byte) tertiary); + m_utilBytesCount3_++; } } } /** * Gets the Quaternary byte and adds it to the Quaternary byte array - * @param isCodePointHiragana flag indicator if the previous codepoint - * we dealt with was Hiragana - * @param commonBottom4 smallest common Quaternary byte - * @param bottomCount4 smallest Quaternary byte - * @param hiragana4 hiragana Quaternary byte + * + * @param isCodePointHiragana + * flag indicator if the previous codepoint we dealt with was Hiragana + * @param commonBottom4 + * smallest common Quaternary byte + * @param bottomCount4 + * smallest Quaternary byte + * @param hiragana4 + * hiragana Quaternary byte */ - private final void doQuaternaryBytes(boolean isCodePointHiragana, - int commonBottom4, int bottomCount4, - byte hiragana4) - { + private final void doQuaternaryBytes(boolean isCodePointHiragana, int commonBottom4, int bottomCount4, + byte hiragana4) { if (isCodePointHiragana) { // This was Hiragana, need to note it if (m_utilCount4_ > 0) { // Close this part while (m_utilCount4_ > bottomCount4) { - m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, - (byte)(commonBottom4 - + bottomCount4)); - m_utilBytesCount4_ ++; + m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, (byte) (commonBottom4 + bottomCount4)); + m_utilBytesCount4_++; m_utilCount4_ -= bottomCount4; } - m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, - (byte)(commonBottom4 - + (m_utilCount4_ - 1))); - m_utilBytesCount4_ ++; + m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, (byte) (commonBottom4 + (m_utilCount4_ - 1))); + m_utilBytesCount4_++; m_utilCount4_ = 0; } - m_utilBytes4_ = 
append(m_utilBytes4_, m_utilBytesCount4_, - hiragana4); // Add the Hiragana - m_utilBytesCount4_ ++; - } - else { // This wasn't Hiragana, so we can continue adding stuff - m_utilCount4_ ++; + m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, hiragana4); // Add the Hiragana + m_utilBytesCount4_++; + } else { // This wasn't Hiragana, so we can continue adding stuff + m_utilCount4_++; } } /** - * Iterates through the argument string for all ces. - * Split the ces into their relevant primaries, secondaries etc. - * @param source normalized string - * @param doFrench flag indicator if special handling of French has to be - * done - * @param hiragana4 offset for Hiragana quaternary - * @param commonBottom4 smallest common quaternary byte - * @param bottomCount4 smallest quaternary byte + * Iterates through the argument string for all ces. Split the ces into their relevant primaries, secondaries etc. + * + * @param source + * normalized string + * @param doFrench + * flag indicator if special handling of French has to be done + * @param hiragana4 + * offset for Hiragana quaternary + * @param commonBottom4 + * smallest common quaternary byte + * @param bottomCount4 + * smallest quaternary byte */ - private final void getSortKeyBytes(String source, boolean doFrench, - byte hiragana4, int commonBottom4, - int bottomCount4) + private final void getSortKeyBytes(String source, boolean doFrench, byte hiragana4, int commonBottom4, + int bottomCount4) { if (m_srcUtilIter_ == null) { @@ -2750,22 +2767,18 @@ public final class RuleBasedCollator extends Collator notIsContinuation = !isContinuation(ce); - /* - * if (notIsContinuation) { - if (scriptOrder != NULL) { - primary1 = scriptOrder[primary1]; - } - }*/ + if (notIsContinuation) { + if (m_leadBytePermutationTable_ != null) { + ce = (m_leadBytePermutationTable_[((ce >> 24) + 256) % 256] << 24) | (ce & 0x00FFFFFF); + } + } boolean isPrimaryByteIgnorable = (ce & CE_PRIMARY_MASK_) == 0; // actually we can just check that 
the first byte is 0 // generation stuffs the order left first - boolean isSmallerThanVariableTop = (ce >>> CE_PRIMARY_SHIFT_) - <= m_variableTopValue_; + boolean isSmallerThanVariableTop = (ce >>> CE_PRIMARY_SHIFT_) <= m_variableTopValue_; doShift = (m_isAlternateHandlingShifted_ - && ((notIsContinuation && isSmallerThanVariableTop - && !isPrimaryByteIgnorable) // primary byte not 0 - || (!notIsContinuation && doShift)) - || (doShift && isPrimaryByteIgnorable)); + && ((notIsContinuation && isSmallerThanVariableTop && !isPrimaryByteIgnorable) // primary byte not 0 + || (!notIsContinuation && doShift)) || (doShift && isPrimaryByteIgnorable)); if (doShift && isPrimaryByteIgnorable) { // amendment to the UCA says that primary ignorables and other // ignorables should be removed if following a shifted code @@ -2774,9 +2787,7 @@ public final class RuleBasedCollator extends Collator // we should just completely ignore it continue; } - leadPrimary = doPrimaryBytes(ce, notIsContinuation, doShift, - leadPrimary, commonBottom4, - bottomCount4); + leadPrimary = doPrimaryBytes(ce, notIsContinuation, doShift, leadPrimary, commonBottom4, bottomCount4); if (doShift) { continue; } @@ -2792,11 +2803,10 @@ public final class RuleBasedCollator extends Collator if (m_utilCompare0_ && (!isPrimaryByteIgnorable || m_utilCompare2_)) { // do the case level if we need to do it. 
We don't want to calculate // case level for primary ignorables if we have only primary strength and case level - // otherwise we would break well formedness of CEs + // otherwise we would break well formedness of CEs caseShift = doCaseBytes(t, notIsContinuation, caseShift); - } - else if (notIsContinuation) { - t ^= m_caseSwitch_; + } else if (notIsContinuation) { + t ^= m_caseSwitch_; } t &= m_mask3_; @@ -2806,8 +2816,7 @@ public final class RuleBasedCollator extends Collator } if (m_utilCompare4_ && notIsContinuation) { // compare quad - doQuaternaryBytes(m_srcUtilColEIter_.m_isCodePointHiragana_, - commonBottom4, bottomCount4, hiragana4); + doQuaternaryBytes(m_srcUtilColEIter_.m_isCodePointHiragana_, commonBottom4, bottomCount4, hiragana4); } } setDecomposition(backupDecomposition); // reverts to original @@ -2818,20 +2827,21 @@ public final class RuleBasedCollator extends Collator } /** - * From the individual strength byte results the final compact sortkey - * will be calculated. - * @param source text string - * @param doFrench flag indicating that special handling of French has to - * be done - * @param commonBottom4 smallest common quaternary byte - * @param bottomCount4 smallest quaternary byte - * @param key output RawCollationKey to store results, key cannot be null + * From the individual strength byte results the final compact sortkey will be calculated. 
+ * + * @param source + * text string + * @param doFrench + * flag indicating that special handling of French has to be done + * @param commonBottom4 + * smallest common quaternary byte + * @param bottomCount4 + * smallest quaternary byte + * @param key + * output RawCollationKey to store results, key cannot be null */ - private final void getSortKey(String source, boolean doFrench, - int commonBottom4, - int bottomCount4, - RawCollationKey key) - { + private final void getSortKey(String source, boolean doFrench, int commonBottom4, int bottomCount4, + RawCollationKey key) { // we have done all the CE's, now let's put them together to form // a key if (m_utilCompare2_) { @@ -2851,8 +2861,8 @@ public final class RuleBasedCollator extends Collator } } - m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte)0); - m_utilBytesCount1_ ++; + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) 0); + m_utilBytesCount1_++; key.set(m_utilBytes1_, 0, m_utilBytesCount1_); } @@ -2860,116 +2870,97 @@ public final class RuleBasedCollator extends Collator /** * Packs the French bytes */ - private final void doFrench() - { - for (int i = 0; i < m_utilBytesCount2_; i ++) { + private final void doFrench() { + for (int i = 0; i < m_utilBytesCount2_; i++) { byte s = m_utilBytes2_[m_utilBytesCount2_ - i - 1]; // This is compression code. if (s == COMMON_2_) { - ++ m_utilCount2_; - } - else { + ++m_utilCount2_; + } else { if (m_utilCount2_ > 0) { // getting the unsigned value if ((s & LAST_BYTE_MASK_) > COMMON_2_) { // not necessary for 4th level. 
while (m_utilCount2_ > TOP_COUNT_2_) { - m_utilBytes1_ = append(m_utilBytes1_, - m_utilBytesCount1_, - (byte)(COMMON_TOP_2_ - TOP_COUNT_2_)); - m_utilBytesCount1_ ++; + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, + (byte) (COMMON_TOP_2_ - TOP_COUNT_2_)); + m_utilBytesCount1_++; m_utilCount2_ -= TOP_COUNT_2_; } - m_utilBytes1_ = append(m_utilBytes1_, - m_utilBytesCount1_, - (byte)(COMMON_TOP_2_ - - (m_utilCount2_ - 1))); - m_utilBytesCount1_ ++; - } - else { + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, + (byte) (COMMON_TOP_2_ - (m_utilCount2_ - 1))); + m_utilBytesCount1_++; + } else { while (m_utilCount2_ > BOTTOM_COUNT_2_) { - m_utilBytes1_ = append(m_utilBytes1_, - m_utilBytesCount1_, - (byte)(COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_)); - m_utilBytesCount1_ ++; + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, + (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_)); + m_utilBytesCount1_++; m_utilCount2_ -= BOTTOM_COUNT_2_; } - m_utilBytes1_ = append(m_utilBytes1_, - m_utilBytesCount1_, - (byte)(COMMON_BOTTOM_2_ - + (m_utilCount2_ - 1))); - m_utilBytesCount1_ ++; + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, + (byte) (COMMON_BOTTOM_2_ + (m_utilCount2_ - 1))); + m_utilBytesCount1_++; } m_utilCount2_ = 0; } m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, s); - m_utilBytesCount1_ ++; + m_utilBytesCount1_++; } } if (m_utilCount2_ > 0) { while (m_utilCount2_ > BOTTOM_COUNT_2_) { - m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, - (byte)(COMMON_BOTTOM_2_ - + BOTTOM_COUNT_2_)); - m_utilBytesCount1_ ++; + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_)); + m_utilBytesCount1_++; m_utilCount2_ -= BOTTOM_COUNT_2_; } - m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, - (byte)(COMMON_BOTTOM_2_ - + (m_utilCount2_ - 1))); - m_utilBytesCount1_ ++; + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) (COMMON_BOTTOM_2_ + (m_utilCount2_ - 1))); + 
m_utilBytesCount1_++; } } /** * Compacts the secondary bytes and stores them into the primary array - * @param doFrench flag indicator that French has to be handled specially + * + * @param doFrench + * flag indicator that French has to be handled specially */ - private final void doSecondary(boolean doFrench) - { + private final void doSecondary(boolean doFrench) { if (m_utilCount2_ > 0) { while (m_utilCount2_ > BOTTOM_COUNT_2_) { - m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_, - (byte)(COMMON_BOTTOM_2_ - + BOTTOM_COUNT_2_)); - m_utilBytesCount2_ ++; + m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_, (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_)); + m_utilBytesCount2_++; m_utilCount2_ -= BOTTOM_COUNT_2_; } - m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_, - (byte)(COMMON_BOTTOM_2_ + - (m_utilCount2_ - 1))); - m_utilBytesCount2_ ++; + m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_, (byte) (COMMON_BOTTOM_2_ + (m_utilCount2_ - 1))); + m_utilBytesCount2_++; } - m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, - SORT_LEVEL_TERMINATOR_); - m_utilBytesCount1_ ++; + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_); + m_utilBytesCount1_++; if (doFrench) { // do the reverse copy doFrench(); - } - else { - if (m_utilBytes1_.length <= m_utilBytesCount1_ - + m_utilBytesCount2_) { - m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_, - m_utilBytesCount2_); + } else { + if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount2_) { + m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_, m_utilBytesCount2_); } - System.arraycopy(m_utilBytes2_, 0, m_utilBytes1_, - m_utilBytesCount1_, m_utilBytesCount2_); + System.arraycopy(m_utilBytes2_, 0, m_utilBytes1_, m_utilBytesCount1_, m_utilBytesCount2_); m_utilBytesCount1_ += m_utilBytesCount2_; } } /** * Increase buffer size - * @param buffer array of bytes - * @param size of the byte array - * @param incrementsize size to increase + * + 
* @param buffer + * array of bytes + * @param size + * of the byte array + * @param incrementsize + * size to increase * @return the new buffer */ - private static final byte[] increase(byte buffer[], int size, - int incrementsize) - { + private static final byte[] increase(byte buffer[], int size, int incrementsize) { byte result[] = new byte[buffer.length + incrementsize]; System.arraycopy(buffer, 0, result, 0, size); return result; @@ -2977,14 +2968,16 @@ public final class RuleBasedCollator extends Collator /** * Increase buffer size - * @param buffer array of ints - * @param size of the byte array - * @param incrementsize size to increase + * + * @param buffer + * array of ints + * @param size + * of the byte array + * @param incrementsize + * size to increase * @return the new buffer */ - private static final int[] increase(int buffer[], int size, - int incrementsize) - { + private static final int[] increase(int buffer[], int size, int incrementsize) { int result[] = new int[buffer.length + incrementsize]; System.arraycopy(buffer, 0, result, 0, size); return result; @@ -2993,123 +2986,97 @@ public final class RuleBasedCollator extends Collator /** * Compacts the case bytes and stores them into the primary array */ - private final void doCase() - { - m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, - SORT_LEVEL_TERMINATOR_); - m_utilBytesCount1_ ++; + private final void doCase() { + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_); + m_utilBytesCount1_++; if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount0_) { - m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_, - m_utilBytesCount0_); + m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_, m_utilBytesCount0_); } - System.arraycopy(m_utilBytes0_, 0, m_utilBytes1_, m_utilBytesCount1_, - m_utilBytesCount0_); + System.arraycopy(m_utilBytes0_, 0, m_utilBytes1_, m_utilBytesCount1_, m_utilBytesCount0_); m_utilBytesCount1_ += m_utilBytesCount0_; 
} /** * Compacts the tertiary bytes and stores them into the primary array */ - private final void doTertiary() - { + private final void doTertiary() { if (m_utilCount3_ > 0) { if (m_common3_ != COMMON_BOTTOM_3_) { while (m_utilCount3_ >= m_topCount3_) { - m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_, - (byte)(m_top3_ - m_topCount3_)); - m_utilBytesCount3_ ++; + m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_, (byte) (m_top3_ - m_topCount3_)); + m_utilBytesCount3_++; m_utilCount3_ -= m_topCount3_; } - m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_, - (byte)(m_top3_ - m_utilCount3_)); - m_utilBytesCount3_ ++; - } - else { + m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_, (byte) (m_top3_ - m_utilCount3_)); + m_utilBytesCount3_++; + } else { while (m_utilCount3_ > m_bottomCount3_) { - m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_, - (byte)(m_bottom3_ - + m_bottomCount3_)); - m_utilBytesCount3_ ++; + m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_, (byte) (m_bottom3_ + m_bottomCount3_)); + m_utilBytesCount3_++; m_utilCount3_ -= m_bottomCount3_; } - m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_, - (byte)(m_bottom3_ - + (m_utilCount3_ - 1))); - m_utilBytesCount3_ ++; + m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_, (byte) (m_bottom3_ + (m_utilCount3_ - 1))); + m_utilBytesCount3_++; } } - m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, - SORT_LEVEL_TERMINATOR_); - m_utilBytesCount1_ ++; + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_); + m_utilBytesCount1_++; if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount3_) { - m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_, - m_utilBytesCount3_); + m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_, m_utilBytesCount3_); } - System.arraycopy(m_utilBytes3_, 0, m_utilBytes1_, m_utilBytesCount1_, - m_utilBytesCount3_); + System.arraycopy(m_utilBytes3_, 0, m_utilBytes1_, 
m_utilBytesCount1_, m_utilBytesCount3_); m_utilBytesCount1_ += m_utilBytesCount3_; } /** * Compacts the quaternary bytes and stores them into the primary array */ - private final void doQuaternary(int commonbottom4, int bottomcount4) - { + private final void doQuaternary(int commonbottom4, int bottomcount4) { if (m_utilCount4_ > 0) { while (m_utilCount4_ > bottomcount4) { - m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, - (byte)(commonbottom4 + bottomcount4)); - m_utilBytesCount4_ ++; + m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, (byte) (commonbottom4 + bottomcount4)); + m_utilBytesCount4_++; m_utilCount4_ -= bottomcount4; } - m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, - (byte)(commonbottom4 - + (m_utilCount4_ - 1))); - m_utilBytesCount4_ ++; + m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, (byte) (commonbottom4 + (m_utilCount4_ - 1))); + m_utilBytesCount4_++; } - m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, - SORT_LEVEL_TERMINATOR_); - m_utilBytesCount1_ ++; + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_); + m_utilBytesCount1_++; if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount4_) { - m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_, - m_utilBytesCount4_); + m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_, m_utilBytesCount4_); } - System.arraycopy(m_utilBytes4_, 0, m_utilBytes1_, m_utilBytesCount1_, - m_utilBytesCount4_); + System.arraycopy(m_utilBytes4_, 0, m_utilBytes1_, m_utilBytesCount1_, m_utilBytesCount4_); m_utilBytesCount1_ += m_utilBytesCount4_; } /** - * Deals with the identical sort. - * Appends the BOCSU version of the source string to the ends of the - * byte buffer. - * @param source text string + * Deals with the identical sort. Appends the BOCSU version of the source string to the ends of the byte buffer. 
+ * + * @param source + * text string */ - private final void doIdentical(String source) - { + private final void doIdentical(String source) { int isize = BOCU.getCompressionLength(source); - m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, - SORT_LEVEL_TERMINATOR_); - m_utilBytesCount1_ ++; + m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_); + m_utilBytesCount1_++; if (m_utilBytes1_.length <= m_utilBytesCount1_ + isize) { - m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_, - 1 + isize); + m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_, 1 + isize); } - m_utilBytesCount1_ = BOCU.compress(source, m_utilBytes1_, - m_utilBytesCount1_); + m_utilBytesCount1_ = BOCU.compress(source, m_utilBytes1_, m_utilBytesCount1_); } /** - * Gets the offset of the first unmatched characters in source and target. - * This method returns the offset of the start of a contraction or a - * combining sequence, if the first difference is in the middle of such a - * sequence. - * @param source string - * @param target string + * Gets the offset of the first unmatched characters in source and target. This method returns the offset of the + * start of a contraction or a combining sequence, if the first difference is in the middle of such a sequence. + * + * @param source + * string + * @param target + * string * @return offset of the first unmatched characters in source and target. 
*/ - private final int getFirstUnmatchedOffset(String source, String target) - { + private final int getFirstUnmatchedOffset(String source, String target) { int result = 0; int slength = source.length(); int tlength = target.length(); @@ -3117,9 +3084,8 @@ public final class RuleBasedCollator extends Collator if (minlength > tlength) { minlength = tlength; } - while (result < minlength - && source.charAt(result) == target.charAt(result)) { - result ++; + while (result < minlength && source.charAt(result) == target.charAt(result)) { + result++; } if (result > 0) { // There is an identical portion at the beginning of the two @@ -3131,24 +3097,19 @@ public final class RuleBasedCollator extends Collator if (result < minlength) { schar = source.charAt(result); // first differing chars tchar = target.charAt(result); - } - else { + } else { schar = source.charAt(minlength - 1); if (isUnsafe(schar)) { tchar = schar; - } - else if (slength == tlength) { - return result; - } - else if (slength < tlength) { + } else if (slength == tlength) { + return result; + } else if (slength < tlength) { tchar = target.charAt(result); - } - else { + } else { schar = source.charAt(result); } } - if (isUnsafe(schar) || isUnsafe(tchar)) - { + if (isUnsafe(schar) || isUnsafe(tchar)) { // We are stopped in the middle of a contraction or combining // sequence. // Look backwards for the part of the string for the start of @@ -3156,30 +3117,28 @@ public final class RuleBasedCollator extends Collator // It doesn't matter which string we scan, since they are the // same in this region. 
do { - result --; - } - while (result > 0 && isUnsafe(source.charAt(result))); + result--; + } while (result > 0 && isUnsafe(source.charAt(result))); } } return result; } /** - * Appending an byte to an array of bytes and increases it if we run out of - * space - * @param array of byte arrays - * @param appendindex index in the byte array to append - * @param value to append - * @return array if array size can accomodate the new value, otherwise - * a bigger array will be created and returned + * Appending an byte to an array of bytes and increases it if we run out of space + * + * @param array + * of byte arrays + * @param appendindex + * index in the byte array to append + * @param value + * to append + * @return array if array size can accomodate the new value, otherwise a bigger array will be created and returned */ - private static final byte[] append(byte array[], int appendindex, - byte value) - { + private static final byte[] append(byte array[], int appendindex, byte value) { try { array[appendindex] = value; - } - catch (ArrayIndexOutOfBoundsException e) { + } catch (ArrayIndexOutOfBoundsException e) { array = increase(array, appendindex, SORT_BUFFER_INIT_SIZE_); array[appendindex] = value; } @@ -3187,45 +3146,44 @@ public final class RuleBasedCollator extends Collator } /** - * This is a trick string compare function that goes in and uses sortkeys - * to compare. It is used when compare gets in trouble and needs to bail - * out. - * @param source text string - * @param target text string + * This is a trick string compare function that goes in and uses sortkeys to compare. It is used when compare gets + * in trouble and needs to bail out. 
+ * + * @param source + * text string + * @param target + * text string */ private final int compareBySortKeys(String source, String target) { - m_utilRawCollationKey_ = getRawCollationKey(source, - m_utilRawCollationKey_); + m_utilRawCollationKey_ = getRawCollationKey(source, m_utilRawCollationKey_); // this method is very seldom called RawCollationKey targetkey = getRawCollationKey(target, null); return m_utilRawCollationKey_.compareTo(targetkey); } /** - * Performs the primary comparisons, and fills up the CE buffer at the - * same time. - * The return value toggles between the comparison result and the hiragana - * result. If either the source is greater than target or vice versa, the - * return result is the comparison result, ie 1 or -1, furthermore the - * cebuffers will be cleared when that happens. If the primary comparisons - * are equal, we'll have to continue with secondary comparison. In this case - * the cebuffer will not be cleared and the return result will be the - * hiragana result. - * @param doHiragana4 flag indicator that Hiragana Quaternary has to be - * observed - * @param lowestpvalue the lowest primary value that will not be ignored if - * alternate handling is shifted - * @param source text string - * @param target text string - * @param textoffset offset in text to start the comparison - * @return comparion result if a primary difference is found, otherwise - * hiragana result + * Performs the primary comparisons, and fills up the CE buffer at the same time. The return value toggles between + * the comparison result and the hiragana result. If either the source is greater than target or vice versa, the + * return result is the comparison result, ie 1 or -1, furthermore the cebuffers will be cleared when that happens. + * If the primary comparisons are equal, we'll have to continue with secondary comparison. In this case the cebuffer + * will not be cleared and the return result will be the hiragana result. 
+ * + * @param doHiragana4 + * flag indicator that Hiragana Quaternary has to be observed + * @param lowestpvalue + * the lowest primary value that will not be ignored if alternate handling is shifted + * @param source + * text string + * @param target + * text string + * @param textoffset + * offset in text to start the comparison + * @return comparion result if a primary difference is found, otherwise hiragana result */ - private final int doPrimaryCompare(boolean doHiragana4, int lowestpvalue, - String source, String target, - int textoffset) + private final int doPrimaryCompare(boolean doHiragana4, int lowestpvalue, String source, String target, + int textoffset) { // Preparing the context objects for iterating over strings @@ -3242,72 +3200,62 @@ public final class RuleBasedCollator extends Collator // We fetch CEs until we hit a non ignorable primary or end. do { sorder = m_srcUtilColEIter_.next(); - m_srcUtilCEBuffer_ = append(m_srcUtilCEBuffer_, - m_srcUtilCEBufferSize_, sorder); - m_srcUtilCEBufferSize_ ++; + m_srcUtilCEBuffer_ = append(m_srcUtilCEBuffer_, m_srcUtilCEBufferSize_, sorder); + m_srcUtilCEBufferSize_++; sorder &= CE_PRIMARY_MASK_; } while (sorder == CollationElementIterator.IGNORABLE); int torder = 0; do { torder = m_tgtUtilColEIter_.next(); - m_tgtUtilCEBuffer_ = append(m_tgtUtilCEBuffer_, - m_tgtUtilCEBufferSize_, torder); - m_tgtUtilCEBufferSize_ ++; + m_tgtUtilCEBuffer_ = append(m_tgtUtilCEBuffer_, m_tgtUtilCEBufferSize_, torder); + m_tgtUtilCEBufferSize_++; torder &= CE_PRIMARY_MASK_; } while (torder == CollationElementIterator.IGNORABLE); + if (!isContinuation(sorder) && m_leadBytePermutationTable_ != null) { + sorder = (m_leadBytePermutationTable_[((sorder >> 24) + 256) % 256] << 24) | (sorder & 0x00FFFFFF); + torder = (m_leadBytePermutationTable_[((torder >> 24) + 256) % 256] << 24) | (torder & 0x00FFFFFF); + } + // if both primaries are the same if (sorder == torder) { // and there are no more CEs, we advance to the next level // see 
if we are at the end of either string - if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1] - == CollationElementIterator.NULLORDER) { - if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1] - != CollationElementIterator.NULLORDER) { + if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) { + if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1] != CollationElementIterator.NULLORDER) { return -1; } break; - } - else if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1] - == CollationElementIterator.NULLORDER) { + } else if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) { return 1; } if (doHiragana4 && hiraganaresult == 0 - && m_srcUtilColEIter_.m_isCodePointHiragana_ != - m_tgtUtilColEIter_.m_isCodePointHiragana_) { + && m_srcUtilColEIter_.m_isCodePointHiragana_ != m_tgtUtilColEIter_.m_isCodePointHiragana_) { if (m_srcUtilColEIter_.m_isCodePointHiragana_) { hiraganaresult = -1; - } - else { + } else { hiraganaresult = 1; } } - } - else { + } else { // if two primaries are different, we are done return endPrimaryCompare(sorder, torder); } } // no primary difference... 
do the rest from the buffers return hiraganaresult; - } - else { // shifted - do a slightly more complicated processing :) + } else { // shifted - do a slightly more complicated processing :) while (true) { - int sorder = getPrimaryShiftedCompareCE(m_srcUtilColEIter_, - lowestpvalue, true); - int torder = getPrimaryShiftedCompareCE(m_tgtUtilColEIter_, - lowestpvalue, false); + int sorder = getPrimaryShiftedCompareCE(m_srcUtilColEIter_, lowestpvalue, true); + int torder = getPrimaryShiftedCompareCE(m_tgtUtilColEIter_, lowestpvalue, false); if (sorder == torder) { - if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1] - == CollationElementIterator.NULLORDER) { + if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) { break; - } - else { + } else { continue; } - } - else { + } else { return endPrimaryCompare(sorder, torder); } } // no primary difference... do the rest from the buffers @@ -3316,24 +3264,20 @@ public final class RuleBasedCollator extends Collator } /** - * This is used only for primary strength when we know that sorder is - * already different from torder. - * Compares sorder and torder, returns -1 if sorder is less than torder. - * Clears the cebuffer at the same time. - * @param sorder source strength order - * @param torder target strength order + * This is used only for primary strength when we know that sorder is already different from torder. Compares sorder + * and torder, returns -1 if sorder is less than torder. Clears the cebuffer at the same time. 
+ * + * @param sorder + * source strength order + * @param torder + * target strength order * @return the comparison result of sorder and torder */ - private final int endPrimaryCompare(int sorder, int torder) - { + private final int endPrimaryCompare(int sorder, int torder) { // if we reach here, the ce offset accessed is the last ce // appended to the buffer - boolean isSourceNullOrder = (m_srcUtilCEBuffer_[ - m_srcUtilCEBufferSize_ - 1] - == CollationElementIterator.NULLORDER); - boolean isTargetNullOrder = (m_tgtUtilCEBuffer_[ - m_tgtUtilCEBufferSize_ - 1] - == CollationElementIterator.NULLORDER); + boolean isSourceNullOrder = (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER); + boolean isTargetNullOrder = (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER); m_srcUtilCEBufferSize_ = -1; m_tgtUtilCEBufferSize_ = -1; if (isSourceNullOrder) { @@ -3344,26 +3288,25 @@ public final class RuleBasedCollator extends Collator } // getting rid of the sign sorder >>>= CE_PRIMARY_SHIFT_; - torder >>>= CE_PRIMARY_SHIFT_; - if (sorder < torder) { - return -1; - } - return 1; + torder >>>= CE_PRIMARY_SHIFT_; + if (sorder < torder) { + return -1; + } + return 1; } /** - * Calculates the next primary shifted value and fills up cebuffer with the - * next non-ignorable ce. - * @param coleiter collation element iterator - * @param doHiragana4 flag indicator if hiragana quaternary is to be - * handled - * @param lowestpvalue lowest primary shifted value that will not be - * ignored + * Calculates the next primary shifted value and fills up cebuffer with the next non-ignorable ce. 
+ * + * @param coleiter + * collation element iterator + * @param doHiragana4 + * flag indicator if hiragana quaternary is to be handled + * @param lowestpvalue + * lowest primary shifted value that will not be ignored * @return result next modified ce */ - private final int getPrimaryShiftedCompareCE( - CollationElementIterator coleiter, - int lowestpvalue, boolean isSrc) + private final int getPrimaryShiftedCompareCE(CollationElementIterator coleiter, int lowestpvalue, boolean isSrc) { boolean shifted = false; @@ -3378,60 +3321,48 @@ public final class RuleBasedCollator extends Collator result = coleiter.next(); if (result == CollationElementIterator.NULLORDER) { cebuffer = append(cebuffer, cebuffersize, result); - cebuffersize ++; + cebuffersize++; break; - } - else if (result == CollationElementIterator.IGNORABLE - || (shifted - && (result & CE_PRIMARY_MASK_) - == CollationElementIterator.IGNORABLE)) { + } else if (result == CollationElementIterator.IGNORABLE + || (shifted && (result & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE)) { // UCA amendment - ignore ignorables that follow shifted code // points continue; - } - else if (isContinuation(result)) { - if ((result & CE_PRIMARY_MASK_) - != CollationElementIterator.IGNORABLE) { + } else if (isContinuation(result)) { + if ((result & CE_PRIMARY_MASK_) != CollationElementIterator.IGNORABLE) { // There is primary value if (shifted) { - result = (result & CE_PRIMARY_MASK_) - | CE_CONTINUATION_MARKER_; + result = (result & CE_PRIMARY_MASK_) | CE_CONTINUATION_MARKER_; // preserve interesting continuation cebuffer = append(cebuffer, cebuffersize, result); - cebuffersize ++; + cebuffersize++; continue; - } - else { + } else { cebuffer = append(cebuffer, cebuffersize, result); - cebuffersize ++; + cebuffersize++; break; } - } - else { // Just lower level values + } else { // Just lower level values if (!shifted) { cebuffer = append(cebuffer, cebuffersize, result); - cebuffersize ++; + cebuffersize++; } } - } 
- else { // regular - if (Utility.compareUnsigned(result & CE_PRIMARY_MASK_, - lowestpvalue) > 0) { + } else { // regular + if (Utility.compareUnsigned(result & CE_PRIMARY_MASK_, lowestpvalue) > 0) { cebuffer = append(cebuffer, cebuffersize, result); - cebuffersize ++; + cebuffersize++; break; - } - else { + } else { if ((result & CE_PRIMARY_MASK_) != 0) { shifted = true; result &= CE_PRIMARY_MASK_; cebuffer = append(cebuffer, cebuffersize, result); - cebuffersize ++; + cebuffersize++; continue; - } - else { + } else { cebuffer = append(cebuffer, cebuffersize, result); - cebuffersize ++; + cebuffersize++; shifted = false; continue; } @@ -3441,8 +3372,7 @@ public final class RuleBasedCollator extends Collator if (isSrc) { m_srcUtilCEBuffer_ = cebuffer; m_srcUtilCEBufferSize_ = cebuffersize; - } - else { + } else { m_tgtUtilCEBuffer_ = cebuffer; m_tgtUtilCEBufferSize_ = cebuffersize; } @@ -3451,16 +3381,17 @@ public final class RuleBasedCollator extends Collator } /** - * Appending an int to an array of ints and increases it if we run out of - * space - * @param array of int arrays - * @param appendindex index at which value will be appended - * @param value to append - * @return array if size is not increased, otherwise a new array will be - * returned + * Appending an int to an array of ints and increases it if we run out of space + * + * @param array + * of int arrays + * @param appendindex + * index at which value will be appended + * @param value + * to append + * @return array if size is not increased, otherwise a new array will be returned */ - private static final int[] append(int array[], int appendindex, int value) - { + private static final int[] append(int array[], int appendindex, int value) { if (appendindex + 1 >= array.length) { array = increase(array, appendindex, CE_BUFFER_SIZE_); } @@ -3470,11 +3401,12 @@ public final class RuleBasedCollator extends Collator /** * Does secondary strength comparison based on the collected ces. 
- * @param doFrench flag indicates if French ordering is to be done + * + * @param doFrench + * flag indicates if French ordering is to be done * @return the secondary strength comparison result */ - private final int doSecondaryCompare(boolean doFrench) - { + private final int doSecondaryCompare(boolean doFrench) { // now, we're gonna reexamine collected CEs if (!doFrench) { // normal int soffset = 0; @@ -3482,43 +3414,33 @@ public final class RuleBasedCollator extends Collator while (true) { int sorder = CollationElementIterator.IGNORABLE; while (sorder == CollationElementIterator.IGNORABLE) { - sorder = m_srcUtilCEBuffer_[soffset ++] - & CE_SECONDARY_MASK_; + sorder = m_srcUtilCEBuffer_[soffset++] & CE_SECONDARY_MASK_; } int torder = CollationElementIterator.IGNORABLE; while (torder == CollationElementIterator.IGNORABLE) { - torder = m_tgtUtilCEBuffer_[toffset ++] - & CE_SECONDARY_MASK_; + torder = m_tgtUtilCEBuffer_[toffset++] & CE_SECONDARY_MASK_; } if (sorder == torder) { - if (m_srcUtilCEBuffer_[soffset - 1] - == CollationElementIterator.NULLORDER) { - if (m_tgtUtilCEBuffer_[toffset - 1] - != CollationElementIterator.NULLORDER) { + if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { + if (m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) { return -1; } break; - } - else if (m_tgtUtilCEBuffer_[toffset - 1] - == CollationElementIterator.NULLORDER) { + } else if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) { return 1; } - } - else { - if (m_srcUtilCEBuffer_[soffset - 1] == - CollationElementIterator.NULLORDER) { + } else { + if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { return -1; } - if (m_tgtUtilCEBuffer_[toffset - 1] == - CollationElementIterator.NULLORDER) { + if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) { return 1; } return (sorder < torder) ? 
-1 : 1; } } - } - else { // do the French + } else { // do the French m_srcUtilContOffset_ = 0; m_tgtUtilContOffset_ = 0; m_srcUtilOffset_ = m_srcUtilCEBufferSize_ - 2; @@ -3528,13 +3450,10 @@ public final class RuleBasedCollator extends Collator int torder = getSecondaryFrenchCE(false); if (sorder == torder) { if ((m_srcUtilOffset_ < 0 && m_tgtUtilOffset_ < 0) - || (m_srcUtilOffset_ >= 0 - && m_srcUtilCEBuffer_[m_srcUtilOffset_] - == CollationElementIterator.NULLORDER)) { + || (m_srcUtilOffset_ >= 0 && m_srcUtilCEBuffer_[m_srcUtilOffset_] == CollationElementIterator.NULLORDER)) { break; } - } - else { + } else { return (sorder < torder) ? -1 : 1; } } @@ -3544,11 +3463,12 @@ public final class RuleBasedCollator extends Collator /** * Calculates the next secondary french CE. - * @param isSrc flag indicator if we are calculating the src ces + * + * @param isSrc + * flag indicator if we are calculating the src ces * @return result next modified ce */ - private final int getSecondaryFrenchCE(boolean isSrc) - { + private final int getSecondaryFrenchCE(boolean isSrc) { int result = CollationElementIterator.IGNORABLE; int offset = m_srcUtilOffset_; int continuationoffset = m_srcUtilContOffset_; @@ -3559,11 +3479,10 @@ public final class RuleBasedCollator extends Collator cebuffer = m_tgtUtilCEBuffer_; } - while (result == CollationElementIterator.IGNORABLE - && offset >= 0) { + while (result == CollationElementIterator.IGNORABLE && offset >= 0) { if (continuationoffset == 0) { result = cebuffer[offset]; - while (isContinuation(cebuffer[offset --])){ + while (isContinuation(cebuffer[offset--])) { } // after this, sorder is at the start of continuation, // and offset points before that @@ -3572,9 +3491,8 @@ public final class RuleBasedCollator extends Collator continuationoffset = offset; offset += 2; } - } - else { - result = cebuffer[offset ++]; + } else { + result = cebuffer[offset++]; if (!isContinuation(result)) { // we have finished with this continuation offset = 
continuationoffset; @@ -3588,8 +3506,7 @@ public final class RuleBasedCollator extends Collator if (isSrc) { m_srcUtilOffset_ = offset; m_srcUtilContOffset_ = continuationoffset; - } - else { + } else { m_tgtUtilOffset_ = offset; m_tgtUtilContOffset_ = continuationoffset; } @@ -3598,39 +3515,35 @@ public final class RuleBasedCollator extends Collator /** * Does case strength comparison based on the collected ces. + * * @return the case strength comparison result */ - private final int doCaseCompare() - { + private final int doCaseCompare() { int soffset = 0; int toffset = 0; while (true) { int sorder = CollationElementIterator.IGNORABLE; int torder = CollationElementIterator.IGNORABLE; - while ((sorder & CE_REMOVE_CASE_) - == CollationElementIterator.IGNORABLE) { - sorder = m_srcUtilCEBuffer_[soffset ++]; + while ((sorder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) { + sorder = m_srcUtilCEBuffer_[soffset++]; if (!isContinuation(sorder) && ((sorder & CE_PRIMARY_MASK_) != 0 || m_utilCompare2_ == true)) { // primary ignorables should not be considered on the case level when the strength is primary // otherwise, the CEs stop being well-formed sorder &= CE_CASE_MASK_3_; sorder ^= m_caseSwitch_; - } - else { + } else { sorder = CollationElementIterator.IGNORABLE; } } - while ((torder & CE_REMOVE_CASE_) - == CollationElementIterator.IGNORABLE) { - torder = m_tgtUtilCEBuffer_[toffset ++]; + while ((torder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) { + torder = m_tgtUtilCEBuffer_[toffset++]; if (!isContinuation(torder) && ((torder & CE_PRIMARY_MASK_) != 0 || m_utilCompare2_ == true)) { // primary ignorables should not be considered on the case level when the strength is primary // otherwise, the CEs stop being well-formed torder &= CE_CASE_MASK_3_; torder ^= m_caseSwitch_; - } - else { + } else { torder = CollationElementIterator.IGNORABLE; } } @@ -3639,26 +3552,19 @@ public final class RuleBasedCollator extends Collator torder &= 
CE_CASE_BIT_MASK_; if (sorder == torder) { // checking end of strings - if (m_srcUtilCEBuffer_[soffset - 1] - == CollationElementIterator.NULLORDER) { - if (m_tgtUtilCEBuffer_[toffset - 1] - != CollationElementIterator.NULLORDER) { + if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { + if (m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) { return -1; } break; - } - else if (m_tgtUtilCEBuffer_[toffset - 1] - == CollationElementIterator.NULLORDER) { + } else if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) { return 1; } - } - else { - if (m_srcUtilCEBuffer_[soffset - 1] - == CollationElementIterator.NULLORDER) { + } else { + if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { return -1; } - if (m_tgtUtilCEBuffer_[soffset - 1] - == CollationElementIterator.NULLORDER) { + if (m_tgtUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { return 1; } return (sorder < torder) ? -1 : 1; @@ -3669,58 +3575,47 @@ public final class RuleBasedCollator extends Collator /** * Does tertiary strength comparison based on the collected ces. 
+ * * @return the tertiary strength comparison result */ - private final int doTertiaryCompare() - { + private final int doTertiaryCompare() { int soffset = 0; int toffset = 0; while (true) { int sorder = CollationElementIterator.IGNORABLE; int torder = CollationElementIterator.IGNORABLE; - while ((sorder & CE_REMOVE_CASE_) - == CollationElementIterator.IGNORABLE) { - sorder = m_srcUtilCEBuffer_[soffset ++] & m_mask3_; + while ((sorder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) { + sorder = m_srcUtilCEBuffer_[soffset++] & m_mask3_; if (!isContinuation(sorder)) { sorder ^= m_caseSwitch_; - } - else { + } else { sorder &= CE_REMOVE_CASE_; } } - while ((torder & CE_REMOVE_CASE_) - == CollationElementIterator.IGNORABLE) { - torder = m_tgtUtilCEBuffer_[toffset ++] & m_mask3_; + while ((torder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) { + torder = m_tgtUtilCEBuffer_[toffset++] & m_mask3_; if (!isContinuation(torder)) { torder ^= m_caseSwitch_; - } - else { + } else { torder &= CE_REMOVE_CASE_; } } if (sorder == torder) { - if (m_srcUtilCEBuffer_[soffset - 1] - == CollationElementIterator.NULLORDER) { - if (m_tgtUtilCEBuffer_[toffset - 1] - != CollationElementIterator.NULLORDER) { + if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { + if (m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) { return -1; } break; - } - else if (m_tgtUtilCEBuffer_[toffset - 1] - == CollationElementIterator.NULLORDER) { + } else if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) { return 1; } - } - else { - if (m_srcUtilCEBuffer_[soffset - 1] == - CollationElementIterator.NULLORDER) { + } else { + if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { return -1; } - if (m_tgtUtilCEBuffer_[toffset - 1] == - CollationElementIterator.NULLORDER) { + if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) { return 1; } return (sorder < torder) ? 
-1 : 1; @@ -3731,12 +3626,12 @@ public final class RuleBasedCollator extends Collator /** * Does quaternary strength comparison based on the collected ces. - * @param lowestpvalue the lowest primary value that will not be ignored if - * alternate handling is shifted + * + * @param lowestpvalue + * the lowest primary value that will not be ignored if alternate handling is shifted * @return the quaternary strength comparison result */ - private final int doQuaternaryCompare(int lowestpvalue) - { + private final int doQuaternaryCompare(int lowestpvalue) { boolean sShifted = true; boolean tShifted = true; int soffset = 0; @@ -3744,100 +3639,84 @@ public final class RuleBasedCollator extends Collator while (true) { int sorder = CollationElementIterator.IGNORABLE; int torder = CollationElementIterator.IGNORABLE; - while (sorder == CollationElementIterator.IGNORABLE - || (isContinuation(sorder) && !sShifted)) { - sorder = m_srcUtilCEBuffer_[soffset ++]; + while (sorder == CollationElementIterator.IGNORABLE || (isContinuation(sorder) && !sShifted)) { + sorder = m_srcUtilCEBuffer_[soffset++]; if (isContinuation(sorder)) { if (!sShifted) { continue; } - } - else if (Utility.compareUnsigned(sorder, lowestpvalue) > 0 - || (sorder & CE_PRIMARY_MASK_) - == CollationElementIterator.IGNORABLE) { + } else if (Utility.compareUnsigned(sorder, lowestpvalue) > 0 + || (sorder & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE) { // non continuation sorder = CE_PRIMARY_MASK_; sShifted = false; - } - else { + } else { sShifted = true; } } sorder >>>= CE_PRIMARY_SHIFT_; - while (torder == CollationElementIterator.IGNORABLE - || (isContinuation(torder) && !tShifted)) { - torder = m_tgtUtilCEBuffer_[toffset ++]; - if (isContinuation(torder)) { - if (!tShifted) { - continue; + while (torder == CollationElementIterator.IGNORABLE || (isContinuation(torder) && !tShifted)) { + torder = m_tgtUtilCEBuffer_[toffset++]; + if (isContinuation(torder)) { + if (!tShifted) { + continue; + } + } 
else if (Utility.compareUnsigned(torder, lowestpvalue) > 0 + || (torder & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE) { + // non continuation + torder = CE_PRIMARY_MASK_; + tShifted = false; + } else { + tShifted = true; + } } - } - else if (Utility.compareUnsigned(torder, lowestpvalue) > 0 - || (torder & CE_PRIMARY_MASK_) - == CollationElementIterator.IGNORABLE) { - // non continuation - torder = CE_PRIMARY_MASK_; - tShifted = false; - } - else { - tShifted = true; - } - } - torder >>>= CE_PRIMARY_SHIFT_; + torder >>>= CE_PRIMARY_SHIFT_; - if (sorder == torder) { - if (m_srcUtilCEBuffer_[soffset - 1] - == CollationElementIterator.NULLORDER) { - if (m_tgtUtilCEBuffer_[toffset - 1] - != CollationElementIterator.NULLORDER) { - return -1; + if (sorder == torder) { + if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { + if (m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) { + return -1; + } + break; + } else if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) { + return 1; + } + } else { + if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { + return -1; + } + if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) { + return 1; + } + return (sorder < torder) ? -1 : 1; } - break; - } - else if (m_tgtUtilCEBuffer_[toffset - 1] - == CollationElementIterator.NULLORDER) { - return 1; - } - } - else { - if (m_srcUtilCEBuffer_[soffset - 1] == - CollationElementIterator.NULLORDER) { - return -1; - } - if (m_tgtUtilCEBuffer_[toffset - 1] == - CollationElementIterator.NULLORDER) { - return 1; - } - return (sorder < torder) ? -1 : 1; - } } return 0; } /** - * Internal function. Does byte level string compare. Used by strcoll if - * strength == identical and strings are otherwise equal. This is a rare - * case. Comparison must be done on NFD normalized strings. FCD is not good - * enough. 
- * @param source text - * @param target text - * @param offset of the first difference in the text strings - * @param normalize flag indicating if we are to normalize the text before - * comparison + * Internal function. Does byte level string compare. Used by strcoll if strength == identical and strings are + * otherwise equal. This is a rare case. Comparison must be done on NFD normalized strings. FCD is not good enough. + * + * @param source + * text + * @param target + * text + * @param offset + * of the first difference in the text strings + * @param normalize + * flag indicating if we are to normalize the text before comparison * @return 1 if source is greater than target, -1 less than and 0 if equals */ - private static final int doIdenticalCompare(String source, String target, - int offset, boolean normalize) + private static final int doIdenticalCompare(String source, String target, int offset, boolean normalize) { if (normalize) { - if (Normalizer.quickCheck(source, Normalizer.NFD,0) - != Normalizer.YES) { + if (Normalizer.quickCheck(source, Normalizer.NFD, 0) != Normalizer.YES) { source = Normalizer.decompose(source, false); } - if (Normalizer.quickCheck(target, Normalizer.NFD,0) - != Normalizer.YES) { + if (Normalizer.quickCheck(target, Normalizer.NFD, 0) != Normalizer.YES) { target = Normalizer.decompose(target, false); } offset = 0; @@ -3847,18 +3726,18 @@ public final class RuleBasedCollator extends Collator } /** - * Compares string for their codepoint order. - * This comparison handles surrogate characters and place them after the + * Compares string for their codepoint order. This comparison handles surrogate characters and place them after the * all non surrogate characters. 
- * @param source text - * @param target text - * @param offset start offset for comparison + * + * @param source + * text + * @param target + * text + * @param offset + * start offset for comparison * @return 1 if source is greater than target, -1 less than and 0 if equals */ - private static final int doStringCompare(String source, - String target, - int offset) - { + private static final int doStringCompare(String source, String target, int offset) { // compare identical prefixes - they do not need to be fixed up char schar = 0; char tchar = 0; @@ -3867,7 +3746,7 @@ public final class RuleBasedCollator extends Collator int minlength = Math.min(slength, tlength); while (offset < minlength) { schar = source.charAt(offset); - tchar = target.charAt(offset ++); + tchar = target.charAt(offset++); if (schar != tchar) { break; } @@ -3883,9 +3762,8 @@ public final class RuleBasedCollator extends Collator return 0; } - // if both values are in or above the surrogate range, Fix them up. - if (schar >= UTF16.LEAD_SURROGATE_MIN_VALUE - && tchar >= UTF16.LEAD_SURROGATE_MIN_VALUE) { + // if both values are in or above the surrogate range, Fix them up. 
+ if (schar >= UTF16.LEAD_SURROGATE_MIN_VALUE && tchar >= UTF16.LEAD_SURROGATE_MIN_VALUE) { schar = fixupUTF16(schar); tchar = fixupUTF16(tchar); } @@ -3897,26 +3775,138 @@ public final class RuleBasedCollator extends Collator /** * Rotate surrogates to the top to get code point order */ - private static final char fixupUTF16(char ch) - { + private static final char fixupUTF16(char ch) { if (ch >= 0xe000) { ch -= 0x800; - } - else { + } else { ch += 0x2000; } return ch; } + private static final int UCOL_REORDER_CODE_IGNORE = CollationReorderCodes.LIMIT + 1; + /** + * Builds the lead byte permuatation table + */ + private void buildPermutationTable() { + if (m_scriptOrder_ == null) { + m_leadBytePermutationTable_ = null; + return; + } + + // TODO - these need to be read in from the UCA data file + // The lowest byte that hasn't been assigned a mapping + int toBottom = 0x03; + // The highest byte that hasn't been assigned a mapping + int toTop = 0xe4; + + // filled slots in the output m_scriptOrder_ + boolean[] permutationSlotFilled = new boolean[256]; + + // used lead bytes + boolean[] newLeadByteUsed = new boolean[256]; + + if (m_leadBytePermutationTable_ == null) { + m_leadBytePermutationTable_ = new byte[256]; + } + + // prefill the reordering codes with the leading entries + int[] internalReorderCodes = new int[m_scriptOrder_.length + 5]; // TODO - replace 5 with the reorder codes prefix size + for (int codeIndex = 0; codeIndex < CollationReorderCodes.LIMIT - CollationReorderCodes.FIRST; codeIndex++) { + internalReorderCodes[codeIndex] = CollationReorderCodes.FIRST + codeIndex; + } + for (int codeIndex = 0; codeIndex < m_scriptOrder_.length; codeIndex++) { + internalReorderCodes[codeIndex + (CollationReorderCodes.LIMIT - CollationReorderCodes.FIRST)] = m_scriptOrder_[codeIndex]; + if (m_scriptOrder_[codeIndex] >= CollationReorderCodes.FIRST && m_scriptOrder_[codeIndex] < CollationReorderCodes.LIMIT) { + internalReorderCodes[m_scriptOrder_[codeIndex] - 
CollationReorderCodes.FIRST] = UCOL_REORDER_CODE_IGNORE; + } + } + + /* + * Start from the front of the list and place each script we encounter at the earliest possible locatation + * in the permutation table. If we encounter UNKNOWN, start processing from the back, and place each script + * in the last possible location. At each step, we also need to make sure that any scripts that need to not + * be moved are copied to their same location in the final table. + */ + boolean fromTheBottom = true; + for (int reorderCodesIndex = 0; reorderCodesIndex < internalReorderCodes.length; reorderCodesIndex++) { + int next = internalReorderCodes[reorderCodesIndex]; + if (next == UCOL_REORDER_CODE_IGNORE) { + continue; + } + if (next == UScript.UNKNOWN) { + if (fromTheBottom == false) { + // double turnaround + //*status = U_ILLEGAL_ARGUMENT_ERROR; + // TODO - exception + m_leadBytePermutationTable_ = null; + return; + } + fromTheBottom = false; + continue; + } + + int[] leadBytes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getLeadBytesForReorderCode(next); + if (fromTheBottom) { + for (int leadByte : leadBytes) { + // don't place a lead byte twice in the permutation table + if (permutationSlotFilled[leadByte]) { + // lead byte already used + //*status = U_ILLEGAL_ARGUMENT_ERROR; + // TODO - exception? + m_leadBytePermutationTable_ = null; + return; + } + m_leadBytePermutationTable_[leadByte] = (byte) toBottom; + newLeadByteUsed[toBottom] = true; + permutationSlotFilled[leadByte] = true; + toBottom++; + } + } else { + for (int leadByteIndex = leadBytes.length - 1; leadByteIndex >= 0; leadByteIndex--) { + int leadByte = leadBytes[leadByteIndex]; + // don't place a lead byte twice in the permutation table + if (permutationSlotFilled[leadByte]) { + // lead byte already used + //*status = U_ILLEGAL_ARGUMENT_ERROR; + // TODO - exception? 
+ m_leadBytePermutationTable_ = null; + return; + } + + m_leadBytePermutationTable_[leadByte] = (byte) toTop; + newLeadByteUsed[toTop] = true; + permutationSlotFilled[leadByte] = true; + toTop--; + } + } + } + + /* Copy everything that's left over */ + int reorderCode = 0; + for (int i = 0; i < 256; i++) { + if (!permutationSlotFilled[i]) { + while (reorderCode < 256 && newLeadByteUsed[reorderCode]) { + reorderCode++; + } + m_leadBytePermutationTable_[i] = (byte) reorderCode; + permutationSlotFilled[i] = true; + newLeadByteUsed[reorderCode] = true; + } + } + + // for (int i = 0; i < 256; i++){ + // System.out.println(Integer.toString(i, 16) + " -> " + Integer.toString(m_scriptReorderTable_[i], 16)); + // } + } + /** * Resets the internal case data members and compression values. */ - private void updateInternalState() - { + private void updateInternalState() { if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) { m_caseSwitch_ = CASE_SWITCH_; - } - else { + } else { m_caseSwitch_ = NO_CASE_SWITCH_; } @@ -3926,8 +3916,7 @@ public final class RuleBasedCollator extends Collator m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_OFF_; m_top3_ = COMMON_TOP_CASE_SWITCH_OFF_3_; m_bottom3_ = COMMON_BOTTOM_3_; - } - else { + } else { m_mask3_ = CE_KEEP_CASE_; m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_ON_; if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) { @@ -3944,31 +3933,30 @@ public final class RuleBasedCollator extends Collator // Set the compression values int total3 = m_top3_ - COMMON_BOTTOM_3_ - 1; // we multilply double with int, but need only int - m_topCount3_ = (int)(PROPORTION_3_ * total3); + m_topCount3_ = (int) (PROPORTION_3_ * total3); m_bottomCount3_ = total3 - m_topCount3_; - if (!m_isCaseLevel_ && getStrength() == AttributeValue.TERTIARY_ - && !m_isFrenchCollation_ && !m_isAlternateHandlingShifted_) { + if (!m_isCaseLevel_ && getStrength() == AttributeValue.TERTIARY_ && !m_isFrenchCollation_ + && !m_isAlternateHandlingShifted_) { m_isSimple3_ = true; - } - else { + } 
else { m_isSimple3_ = false; } - if(!m_isCaseLevel_ && getStrength() <= AttributeValue.TERTIARY_ && !m_isNumericCollation_ - && !m_isAlternateHandlingShifted_ && !latinOneFailed_) { - if(latinOneCEs_ == null || latinOneRegenTable_) { - if(setUpLatinOne()) { // if we succeed in building latin1 table, we'll use it - latinOneUse_ = true; - } else { - latinOneUse_ = false; - latinOneFailed_ = true; + if (!m_isCaseLevel_ && getStrength() <= AttributeValue.TERTIARY_ && !m_isNumericCollation_ + && !m_isAlternateHandlingShifted_ && !latinOneFailed_) { + if (latinOneCEs_ == null || latinOneRegenTable_) { + if (setUpLatinOne()) { // if we succeed in building latin1 table, we'll use it + latinOneUse_ = true; + } else { + latinOneUse_ = false; + latinOneFailed_ = true; + } + latinOneRegenTable_ = false; + } else { // latin1Table exists and it doesn't need to be regenerated, just use it + latinOneUse_ = true; } - latinOneRegenTable_ = false; - } else { // latin1Table exists and it doesn't need to be regenerated, just use it - latinOneUse_ = true; - } } else { - latinOneUse_ = false; + latinOneUse_ = false; } } @@ -3976,19 +3964,15 @@ public final class RuleBasedCollator extends Collator /** * Initializes the RuleBasedCollator */ - private final void init() - { - for (m_minUnsafe_ = 0; m_minUnsafe_ < DEFAULT_MIN_HEURISTIC_; - m_minUnsafe_ ++) { + private final void init() { + for (m_minUnsafe_ = 0; m_minUnsafe_ < DEFAULT_MIN_HEURISTIC_; m_minUnsafe_++) { // Find the smallest unsafe char. if (isUnsafe(m_minUnsafe_)) { break; } } - for (m_minContractionEnd_ = 0; - m_minContractionEnd_ < DEFAULT_MIN_HEURISTIC_; - m_minContractionEnd_ ++) { + for (m_minContractionEnd_ = 0; m_minContractionEnd_ < DEFAULT_MIN_HEURISTIC_; m_minContractionEnd_++) { // Find the smallest contraction-ending char. 
if (isContractionEnd(m_minContractionEnd_)) { break; @@ -4005,11 +3989,19 @@ public final class RuleBasedCollator extends Collator m_isHiragana4_ = m_defaultIsHiragana4_; m_isNumericCollation_ = m_defaultIsNumericCollation_; latinOneFailed_ = false; + if (m_defaultScriptOrder_ != null) { + m_scriptOrder_ = new int[m_defaultScriptOrder_.length]; + for (int i = 0; i < m_defaultScriptOrder_.length; i++) { + m_scriptOrder_[i] = m_defaultScriptOrder_[i]; + } + } else { + m_scriptOrder_ = null; + } updateInternalState(); } /** - * Initializes utility iterators and byte buffer used by compare + * Initializes utility iterators and byte buffer used by compare */ private final void initUtility(boolean allocate) { if (allocate) { @@ -4022,7 +4014,7 @@ public final class RuleBasedCollator extends Collator m_utilBytes1_ = new byte[SORT_BUFFER_INIT_SIZE_1_]; // primary m_utilBytes2_ = new byte[SORT_BUFFER_INIT_SIZE_2_]; // secondary m_utilBytes3_ = new byte[SORT_BUFFER_INIT_SIZE_3_]; // tertiary - m_utilBytes4_ = new byte[SORT_BUFFER_INIT_SIZE_4_]; // Quaternary + m_utilBytes4_ = new byte[SORT_BUFFER_INIT_SIZE_4_]; // Quaternary m_srcUtilCEBuffer_ = new int[CE_BUFFER_SIZE_]; m_tgtUtilCEBuffer_ = new int[CE_BUFFER_SIZE_]; } @@ -4043,10 +4035,10 @@ public final class RuleBasedCollator extends Collator // Consts for Latin-1 special processing private static final int ENDOFLATINONERANGE_ = 0xFF; - private static final int LATINONETABLELEN_ = (ENDOFLATINONERANGE_+50); - private static final int BAIL_OUT_CE_ = 0xFF000000; + private static final int LATINONETABLELEN_ = (ENDOFLATINONERANGE_ + 50); + private static final int BAIL_OUT_CE_ = 0xFF000000; - /** + /** * Generate latin-1 tables */ @@ -4056,211 +4048,211 @@ public final class RuleBasedCollator extends Collator int terShift = 24; } - private final void - addLatinOneEntry(char ch, int CE, shiftValues sh) { - int primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0; - boolean reverseSecondary = false; - if(!isContinuation(CE)) 
{ - tertiary = ((CE & m_mask3_)); - tertiary ^= m_caseSwitch_; - reverseSecondary = true; - } else { - tertiary = (byte)((CE & CE_REMOVE_CONTINUATION_MASK_)); - tertiary &= CE_REMOVE_CASE_; - reverseSecondary = false; - } - - secondary = ((CE >>>= 8) & LAST_BYTE_MASK_); - primary2 = ((CE >>>= 8) & LAST_BYTE_MASK_); - primary1 = (CE >>> 8); - - if(primary1 != 0) { - latinOneCEs_[ch] |= (primary1 << sh.primShift); - sh.primShift -= 8; - } - if(primary2 != 0) { - if(sh.primShift < 0) { - latinOneCEs_[ch] = BAIL_OUT_CE_; - latinOneCEs_[latinOneTableLen_+ch] = BAIL_OUT_CE_; - latinOneCEs_[2*latinOneTableLen_+ch] = BAIL_OUT_CE_; - return; + private final void addLatinOneEntry(char ch, int CE, shiftValues sh) { + int primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0; + boolean reverseSecondary = false; + if (!isContinuation(CE)) { + tertiary = ((CE & m_mask3_)); + tertiary ^= m_caseSwitch_; + reverseSecondary = true; + } else { + tertiary = (byte) ((CE & CE_REMOVE_CONTINUATION_MASK_)); + tertiary &= CE_REMOVE_CASE_; + reverseSecondary = false; } - latinOneCEs_[ch] |= (primary2 << sh.primShift); - sh.primShift -= 8; - } - if(secondary != 0) { - if(reverseSecondary && m_isFrenchCollation_) { // reverse secondary - latinOneCEs_[latinOneTableLen_+ch] >>>= 8; // make space for secondary - latinOneCEs_[latinOneTableLen_+ch] |= (secondary << 24); - } else { // normal case - latinOneCEs_[latinOneTableLen_+ch] |= (secondary << sh.secShift); + + secondary = ((CE >>>= 8) & LAST_BYTE_MASK_); + primary2 = ((CE >>>= 8) & LAST_BYTE_MASK_); + primary1 = (CE >>> 8); + + if (primary1 != 0) { + latinOneCEs_[ch] |= (primary1 << sh.primShift); + sh.primShift -= 8; + } + if (primary2 != 0) { + if (sh.primShift < 0) { + latinOneCEs_[ch] = BAIL_OUT_CE_; + latinOneCEs_[latinOneTableLen_ + ch] = BAIL_OUT_CE_; + latinOneCEs_[2 * latinOneTableLen_ + ch] = BAIL_OUT_CE_; + return; + } + latinOneCEs_[ch] |= (primary2 << sh.primShift); + sh.primShift -= 8; + } + if (secondary != 0) { + if 
(reverseSecondary && m_isFrenchCollation_) { // reverse secondary + latinOneCEs_[latinOneTableLen_ + ch] >>>= 8; // make space for secondary + latinOneCEs_[latinOneTableLen_ + ch] |= (secondary << 24); + } else { // normal case + latinOneCEs_[latinOneTableLen_ + ch] |= (secondary << sh.secShift); + } + sh.secShift -= 8; + } + if (tertiary != 0) { + latinOneCEs_[2 * latinOneTableLen_ + ch] |= (tertiary << sh.terShift); + sh.terShift -= 8; } - sh.secShift -= 8; - } - if(tertiary != 0) { - latinOneCEs_[2*latinOneTableLen_+ch] |= (tertiary << sh.terShift); - sh.terShift -= 8; - } } - private final void - resizeLatinOneTable(int newSize) { - int newTable[] = new int[3*newSize]; - int sizeToCopy = ((newSize> 4) - m_expansionOffset_; //it.getExpansionOffset(this, CE); - int size = CE & 0xF; // getExpansionCount(CE); - //CE = *CEOffset++; - if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ - for(i = 0; i> 4) - m_expansionOffset_; // it.getExpansionOffset(this, + // CE); + int size = CE & 0xF; // getExpansionCount(CE); + // CE = *CEOffset++; + if (size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ + for (i = 0; i < size; i++) { + if (s.primShift < 0 || s.secShift < 0 || s.terShift < 0) { + latinOneCEs_[contractionOffset] = BAIL_OUT_CE_; + latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_; + latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_; + break; + } + addLatinOneEntry(contractionOffset, m_expansion_[offset + i], s); + } + } else { /* else, we do */ + while (m_expansion_[offset] != 0) { + if (s.primShift < 0 || s.secShift < 0 || s.terShift < 0) { + latinOneCEs_[contractionOffset] = BAIL_OUT_CE_; + latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_; + latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_; + break; + } + addLatinOneEntry(contractionOffset, m_expansion_[offset++], s); + } + } + contractionOffset++; + } else if 
(!isSpecial(CE)) { + addLatinOneEntry(contractionOffset++, CE, s); + } else { + latinOneCEs_[contractionOffset] = BAIL_OUT_CE_; + latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_; + latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_; + contractionOffset++; + } + UCharOffset++; + s.primShift = 24; + s.secShift = 24; + s.terShift = 24; + if (contractionOffset == latinOneTableLen_) { // we need to reallocate + resizeLatinOneTable(2 * latinOneTableLen_); + } + } while (m_contractionIndex_[UCharOffset] != 0xFFFF); + } + break; + case CollationElementIterator.CE_SPEC_PROC_TAG_: { + // 0xB7 is a precontext character defined in UCA5.1, a special + // handle is implemeted in order to save LatinOne table for + // most locales. + if (ch == 0xb7) { + addLatinOneEntry(ch, CE, s); + } else { + latinOneFailed_ = true; + return false; + } + } + break; + default: + latinOneFailed_ = true; + return false; + } } - break; - case CollationElementIterator.CE_SPEC_PROC_TAG_: - { - // 0xB7 is a precontext character defined in UCA5.1, a special - // handle is implemeted in order to save LatinOne table for - // most locales. 
- if (ch == 0xb7) { - addLatinOneEntry(ch, CE, s); - } - else { - latinOneFailed_ = true; - return false; - } - } - break; - default: - latinOneFailed_ = true; - return false; - } } - } - // compact table - if(contractionOffset < latinOneTableLen_) { - resizeLatinOneTable(contractionOffset); - } - return true; + // compact table + if (contractionOffset < latinOneTableLen_) { + resizeLatinOneTable(contractionOffset); + } + return true; } private class ContractionInfo { @@ -4269,71 +4261,59 @@ public final class RuleBasedCollator extends Collator ContractionInfo m_ContInfo_; - private int - getLatinOneContraction(int strength, int CE, String s) { - //int strength, int CE, String s, Integer ind) { - int len = s.length(); - //const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); - int UCharOffset = (CE & 0xFFF) - m_contractionOffset_; - int offset = 1; - int latinOneOffset = (CE & 0x00FFF000) >>> 12; - char schar = 0, tchar = 0; + private int getLatinOneContraction(int strength, int CE, String s) { + // int strength, int CE, String s, Integer ind) { + int len = s.length(); + // const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); + int UCharOffset = (CE & 0xFFF) - m_contractionOffset_; + int offset = 1; + int latinOneOffset = (CE & 0x00FFF000) >>> 12; + char schar = 0, tchar = 0; - for(;;) { - /* - if(len == -1) { - if(s[*index] == 0) { // end of string - return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); - } else { - schar = s[*index]; - } - } else { - */ - if(m_ContInfo_.index == len) { - return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset]); - } else { - schar = s.charAt(m_ContInfo_.index); - } - //} + for (;;) { + /* + * if(len == -1) { if(s[*index] == 0) { // end of string + * return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); } else { schar = s[*index]; } + * } else { + */ + if (m_ContInfo_.index == len) { + return (latinOneCEs_[strength * latinOneTableLen_ + 
latinOneOffset]); + } else { + schar = s.charAt(m_ContInfo_.index); + } + // } - while(schar > (tchar = m_contractionIndex_[UCharOffset+offset]/**(UCharOffset+offset)*/)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ - offset++; - } + while (schar > (tchar = m_contractionIndex_[UCharOffset + offset]/** (UCharOffset+offset) */ + )) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ + offset++; + } - if (schar == tchar) { - m_ContInfo_.index++; - return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset+offset]); - } - else - { - if(schar > ENDOFLATINONERANGE_ /*& 0xFF00*/) { - return BAIL_OUT_CE_; - } - // skip completely ignorables - int isZeroCE = m_trie_.getLeadValue(schar); //UTRIE_GET32_FROM_LEAD(coll->mapping, schar); - if(isZeroCE == 0) { // we have to ignore completely ignorables - m_ContInfo_.index++; - continue; - } + if (schar == tchar) { + m_ContInfo_.index++; + return (latinOneCEs_[strength * latinOneTableLen_ + latinOneOffset + offset]); + } else { + if (schar > ENDOFLATINONERANGE_ /* & 0xFF00 */) { + return BAIL_OUT_CE_; + } + // skip completely ignorables + int isZeroCE = m_trie_.getLeadValue(schar); // UTRIE_GET32_FROM_LEAD(coll->mapping, schar); + if (isZeroCE == 0) { // we have to ignore completely ignorables + m_ContInfo_.index++; + continue; + } - return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset]); - } - } + return (latinOneCEs_[strength * latinOneTableLen_ + latinOneOffset]); + } + } } - /** - * This is a fast strcoll, geared towards text in Latin-1. - * It supports contractions of size two, French secondaries - * and case switching. You can use it with strengths primary - * to tertiary. It does not support shifted and case level. - * It relies on the table build by setupLatin1Table. If it - * doesn't understand something, it will go to the regular - * strcoll. + * This is a fast strcoll, geared towards text in Latin-1. 
It supports contractions of size two, French secondaries + * and case switching. You can use it with strengths primary to tertiary. It does not support shifted and case + * level. It relies on the table build by setupLatin1Table. If it doesn't understand something, it will go to the + * regular strcoll. */ - private final int - compareUseLatin1(String source, String target, int startOffset) - { + private final int compareUseLatin1(String source, String target, int startOffset) { int sLen = source.length(); int tLen = target.length(); @@ -4341,318 +4321,328 @@ public final class RuleBasedCollator extends Collator int sIndex = startOffset, tIndex = startOffset; char sChar = 0, tChar = 0; - int sOrder=0, tOrder=0; + int sOrder = 0, tOrder = 0; boolean endOfSource = false; - //uint32_t *elements = coll->latinOneCEs; + // uint32_t *elements = coll->latinOneCEs; boolean haveContractions = false; // if we have contractions in our string - // we cannot do French secondary + // we cannot do French secondary int offset = latinOneTableLen_; // Do the primary level - primLoop: - for(;;) { - while(sOrder==0) { // this loop skips primary ignorables - // sOrder=getNextlatinOneCE(source); - if(sIndex==sLen) { - endOfSource = true; - break; - } - sChar=source.charAt(sIndex++); //[sIndex++]; - //} - if(sChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out - //fprintf(stderr, "R"); - return compareRegular(source, target, startOffset); + primLoop: for (;;) { + while (sOrder == 0) { // this loop skips primary ignorables + // sOrder=getNextlatinOneCE(source); + if (sIndex == sLen) { + endOfSource = true; + break; + } + sChar = source.charAt(sIndex++); // [sIndex++]; + // } + if (sChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out + // fprintf(stderr, "R"); + return compareRegular(source, target, startOffset); + } + sOrder = latinOneCEs_[sChar]; + if (isSpecial(sOrder)) { // if we got a special + // specials can basically be either contractions 
or bail-out signs. If we get anything + // else, we'll bail out anywasy + if (getTag(sOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) { + m_ContInfo_.index = sIndex; + sOrder = getLatinOneContraction(0, sOrder, source); + sIndex = m_ContInfo_.index; + haveContractions = true; // if there are contractions, we cannot do French secondary + // However, if there are contractions in the table, but we always use just one char, + // we might be able to do French. This should be checked out. + } + if (isSpecial(sOrder) /* == UCOL_BAIL_OUT_CE */) { + // fprintf(stderr, "S"); + return compareRegular(source, target, startOffset); + } + } } - sOrder = latinOneCEs_[sChar]; - if(isSpecial(sOrder)) { // if we got a special - // specials can basically be either contractions or bail-out signs. If we get anything - // else, we'll bail out anywasy - if(getTag(sOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) { - m_ContInfo_.index = sIndex; - sOrder = getLatinOneContraction(0, sOrder, source); - sIndex = m_ContInfo_.index; - haveContractions = true; // if there are contractions, we cannot do French secondary - // However, if there are contractions in the table, but we always use just one char, - // we might be able to do French. This should be checked out. 
- } - if(isSpecial(sOrder) /*== UCOL_BAIL_OUT_CE*/) { - //fprintf(stderr, "S"); - return compareRegular(source, target, startOffset); - } - } - } - while(tOrder==0) { // this loop skips primary ignorables - // tOrder=getNextlatinOneCE(target); - if(tIndex==tLen) { - if(endOfSource) { - break primLoop; - } else { - return 1; - } + while (tOrder == 0) { // this loop skips primary ignorables + // tOrder=getNextlatinOneCE(target); + if (tIndex == tLen) { + if (endOfSource) { + break primLoop; + } else { + return 1; + } + } + tChar = target.charAt(tIndex++); // [tIndex++]; + if (tChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out + // fprintf(stderr, "R"); + return compareRegular(source, target, startOffset); + } + tOrder = latinOneCEs_[tChar]; + if (isSpecial(tOrder)) { + // Handling specials, see the comments for source + if (getTag(tOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) { + m_ContInfo_.index = tIndex; + tOrder = getLatinOneContraction(0, tOrder, target); + tIndex = m_ContInfo_.index; + haveContractions = true; + } + if (isSpecial(tOrder)/* == UCOL_BAIL_OUT_CE */) { + // fprintf(stderr, "S"); + return compareRegular(source, target, startOffset); + } + } } - tChar=target.charAt(tIndex++); //[tIndex++]; - if(tChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out - //fprintf(stderr, "R"); - return compareRegular(source, target, startOffset); - } - tOrder = latinOneCEs_[tChar]; - if(isSpecial(tOrder)) { - // Handling specials, see the comments for source - if(getTag(tOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) { - m_ContInfo_.index = tIndex; - tOrder = getLatinOneContraction(0, tOrder, target); - tIndex = m_ContInfo_.index; - haveContractions = true; - } - if(isSpecial(tOrder)/*== UCOL_BAIL_OUT_CE*/) { - //fprintf(stderr, "S"); - return compareRegular(source, target, startOffset); - } - } - } - if(endOfSource) { // source is finished, but target is not, say the result. 
- return -1; - } - - if(sOrder == tOrder) { // if we have same CEs, we continue the loop - sOrder = 0; tOrder = 0; - continue; - } else { - // compare current top bytes - if(((sOrder^tOrder)&0xFF000000)!=0) { - // top bytes differ, return difference - if(sOrder >>> 8 < tOrder >>> 8) { + if (endOfSource) { // source is finished, but target is not, say the result. return -1; - } else { - return 1; - } - // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24); - // since we must return enum value } - // top bytes match, continue with following bytes - sOrder<<=8; - tOrder<<=8; - } + if (!isContinuation(sOrder) && m_leadBytePermutationTable_ != null) { + sOrder = (m_leadBytePermutationTable_[((sOrder >> 24) + 256) % 256] << 24) | (sOrder & 0x00FFFFFF); + tOrder = (m_leadBytePermutationTable_[((tOrder >> 24) + 256) % 256] << 24) | (tOrder & 0x00FFFFFF); + } + + if (sOrder == tOrder) { // if we have same CEs, we continue the loop + sOrder = 0; + tOrder = 0; + continue; + } else { + // compare current top bytes + if (((sOrder ^ tOrder) & 0xFF000000) != 0) { + // top bytes differ, return difference + if (sOrder >>> 8 < tOrder >>> 8) { + return -1; + } else { + return 1; + } + // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24); + // since we must return enum value + } + + // top bytes match, continue with following bytes + sOrder <<= 8; + tOrder <<= 8; + } } // after primary loop, we definitely know the sizes of strings, // so we set it and use simpler loop for secondaries and tertiaries - //sLen = sIndex; tLen = tIndex; - if(strength >= SECONDARY) { - // adjust the table beggining - //latinOneCEs_ += coll->latinOneTableLen; - endOfSource = false; + // sLen = sIndex; tLen = tIndex; + if (strength >= SECONDARY) { + // adjust the table beggining + // latinOneCEs_ += coll->latinOneTableLen; + endOfSource = false; - if(!m_isFrenchCollation_) { // non French - // This loop is a simplified copy of primary loop - // at this point we know that whole strings 
are latin-1, so we don't - // check for that. We also know that we only have contractions as - // specials. - //sIndex = 0; tIndex = 0; - sIndex = startOffset; tIndex = startOffset; - secLoop: - for(;;) { - while(sOrder==0) { - if(sIndex==sLen) { - endOfSource = true; - break; - } - sChar=source.charAt(sIndex++); //[sIndex++]; - sOrder = latinOneCEs_[offset+sChar]; - if(isSpecial(sOrder)) { - m_ContInfo_.index = sIndex; - sOrder = getLatinOneContraction(1, sOrder, source); - sIndex = m_ContInfo_.index; - } - } + if (!m_isFrenchCollation_) { // non French + // This loop is a simplified copy of primary loop + // at this point we know that whole strings are latin-1, so we don't + // check for that. We also know that we only have contractions as + // specials. + // sIndex = 0; tIndex = 0; + sIndex = startOffset; + tIndex = startOffset; + secLoop: for (;;) { + while (sOrder == 0) { + if (sIndex == sLen) { + endOfSource = true; + break; + } + sChar = source.charAt(sIndex++); // [sIndex++]; + sOrder = latinOneCEs_[offset + sChar]; + if (isSpecial(sOrder)) { + m_ContInfo_.index = sIndex; + sOrder = getLatinOneContraction(1, sOrder, source); + sIndex = m_ContInfo_.index; + } + } - while(tOrder==0) { - if(tIndex==tLen) { - if(endOfSource) { - break secLoop; - } else { - return 1; - } - } - tChar=target.charAt(tIndex++); //[tIndex++]; - tOrder = latinOneCEs_[offset+tChar]; - if(isSpecial(tOrder)) { - m_ContInfo_.index = tIndex; - tOrder = getLatinOneContraction(1, tOrder, target); - tIndex = m_ContInfo_.index; - } - } - if(endOfSource) { - return -1; - } + while (tOrder == 0) { + if (tIndex == tLen) { + if (endOfSource) { + break secLoop; + } else { + return 1; + } + } + tChar = target.charAt(tIndex++); // [tIndex++]; + tOrder = latinOneCEs_[offset + tChar]; + if (isSpecial(tOrder)) { + m_ContInfo_.index = tIndex; + tOrder = getLatinOneContraction(1, tOrder, target); + tIndex = m_ContInfo_.index; + } + } + if (endOfSource) { + return -1; + } - if(sOrder == tOrder) { - sOrder 
= 0; tOrder = 0; - continue; - } else { - // see primary loop for comments on this - if(((sOrder^tOrder)&0xFF000000)!=0) { - if(sOrder >>> 8 < tOrder >>> 8) { - return -1; - } else { - return 1; - } + if (sOrder == tOrder) { + sOrder = 0; + tOrder = 0; + continue; + } else { + // see primary loop for comments on this + if (((sOrder ^ tOrder) & 0xFF000000) != 0) { + if (sOrder >>> 8 < tOrder >>> 8) { + return -1; + } else { + return 1; + } + } + sOrder <<= 8; + tOrder <<= 8; + } + } + } else { // French + if (haveContractions) { // if we have contractions, we have to bail out + // since we don't really know how to handle them here + return compareRegular(source, target, startOffset); + } + // For French, we go backwards + sIndex = sLen; + tIndex = tLen; + secFLoop: for (;;) { + while (sOrder == 0) { + if (sIndex == startOffset) { + endOfSource = true; + break; + } + sChar = source.charAt(--sIndex); // [--sIndex]; + sOrder = latinOneCEs_[offset + sChar]; + // don't even look for contractions + } + + while (tOrder == 0) { + if (tIndex == startOffset) { + if (endOfSource) { + break secFLoop; + } else { + return 1; + } + } + tChar = target.charAt(--tIndex); // [--tIndex]; + tOrder = latinOneCEs_[offset + tChar]; + // don't even look for contractions + } + if (endOfSource) { + return -1; + } + + if (sOrder == tOrder) { + sOrder = 0; + tOrder = 0; + continue; + } else { + // see the primary loop for comments + if (((sOrder ^ tOrder) & 0xFF000000) != 0) { + if (sOrder >>> 8 < tOrder >>> 8) { + return -1; + } else { + return 1; + } + } + sOrder <<= 8; + tOrder <<= 8; + } } - sOrder<<=8; - tOrder<<=8; - } } - } else { // French - if(haveContractions) { // if we have contractions, we have to bail out - // since we don't really know how to handle them here - return compareRegular(source, target, startOffset); - } - // For French, we go backwards - sIndex = sLen; tIndex = tLen; - secFLoop: - for(;;) { - while(sOrder==0) { - if(sIndex==startOffset) { - endOfSource = true; - 
break; - } - sChar=source.charAt(--sIndex); //[--sIndex]; - sOrder = latinOneCEs_[offset+sChar]; - // don't even look for contractions - } - - while(tOrder==0) { - if(tIndex==startOffset) { - if(endOfSource) { - break secFLoop; - } else { - return 1; - } - } - tChar=target.charAt(--tIndex); //[--tIndex]; - tOrder = latinOneCEs_[offset+tChar]; - // don't even look for contractions - } - if(endOfSource) { - return -1; - } - - if(sOrder == tOrder) { - sOrder = 0; tOrder = 0; - continue; - } else { - // see the primary loop for comments - if(((sOrder^tOrder)&0xFF000000)!=0) { - if(sOrder >>> 8 < tOrder >>> 8) { - return -1; - } else { - return 1; - } - } - sOrder<<=8; - tOrder<<=8; - } - } - } } - if(strength >= TERTIARY) { - // tertiary loop is the same as secondary (except no French) - offset += latinOneTableLen_; - //sIndex = 0; tIndex = 0; - sIndex = startOffset; tIndex = startOffset; - endOfSource = false; - for(;;) { - while(sOrder==0) { - if(sIndex==sLen) { - endOfSource = true; - break; - } - sChar=source.charAt(sIndex++); //[sIndex++]; - sOrder = latinOneCEs_[offset+sChar]; - if(isSpecial(sOrder)) { - m_ContInfo_.index = sIndex; - sOrder = getLatinOneContraction(2, sOrder, source); - sIndex = m_ContInfo_.index; - } - } - while(tOrder==0) { - if(tIndex==tLen) { - if(endOfSource) { - return 0; // if both strings are at the end, they are equal - } else { - return 1; + if (strength >= TERTIARY) { + // tertiary loop is the same as secondary (except no French) + offset += latinOneTableLen_; + // sIndex = 0; tIndex = 0; + sIndex = startOffset; + tIndex = startOffset; + endOfSource = false; + for (;;) { + while (sOrder == 0) { + if (sIndex == sLen) { + endOfSource = true; + break; + } + sChar = source.charAt(sIndex++); // [sIndex++]; + sOrder = latinOneCEs_[offset + sChar]; + if (isSpecial(sOrder)) { + m_ContInfo_.index = sIndex; + sOrder = getLatinOneContraction(2, sOrder, source); + sIndex = m_ContInfo_.index; + } } - } - tChar=target.charAt(tIndex++); //[tIndex++]; 
- tOrder = latinOneCEs_[offset+tChar]; - if(isSpecial(tOrder)) { - m_ContInfo_.index = tIndex; - tOrder = getLatinOneContraction(2, tOrder, target); - tIndex = m_ContInfo_.index; - } - } - if(endOfSource) { - return -1; - } - if(sOrder == tOrder) { - sOrder = 0; tOrder = 0; - continue; - } else { - if(((sOrder^tOrder)&0xff000000)!=0) { - if(sOrder >>> 8 < tOrder >>> 8) { - return -1; - } else { - return 1; + while (tOrder == 0) { + if (tIndex == tLen) { + if (endOfSource) { + return 0; // if both strings are at the end, they are equal + } else { + return 1; + } + } + tChar = target.charAt(tIndex++); // [tIndex++]; + tOrder = latinOneCEs_[offset + tChar]; + if (isSpecial(tOrder)) { + m_ContInfo_.index = tIndex; + tOrder = getLatinOneContraction(2, tOrder, target); + tIndex = m_ContInfo_.index; + } + } + if (endOfSource) { + return -1; + } + if (sOrder == tOrder) { + sOrder = 0; + tOrder = 0; + continue; + } else { + if (((sOrder ^ tOrder) & 0xff000000) != 0) { + if (sOrder >>> 8 < tOrder >>> 8) { + return -1; + } else { + return 1; + } + } + sOrder <<= 8; + tOrder <<= 8; } - } - sOrder<<=8; - tOrder<<=8; } - } } return 0; } - /** + + /** * Get the version of this collator object. + * * @return the version object associated with this collator * @stable ICU 2.8 */ public VersionInfo getVersion() { - /* RunTime version */ + /* RunTime version */ int rtVersion = VersionInfo.UCOL_RUNTIME_VERSION.getMajor(); - /* Builder version*/ + /* Builder version */ int bdVersion = m_version_.getMajor(); - /* Charset Version. Need to get the version from cnv files - * makeconv should populate cnv files with version and + /* + * Charset Version. 
Need to get the version from cnv files makeconv should populate cnv files with version and * an api has to be provided in ucnv.h to obtain this version */ int csVersion = 0; /* combine the version info */ - int cmbVersion = ((rtVersion<<11) | (bdVersion<<6) | (csVersion)) & 0xFFFF; - - /* Tailoring rules */ - return VersionInfo.getInstance(cmbVersion>>8, - cmbVersion & 0xFF, - m_version_.getMinor(), - UCA_.m_UCA_version_.getMajor()); + int cmbVersion = ((rtVersion << 11) | (bdVersion << 6) | (csVersion)) & 0xFFFF; -// versionInfo[0] = (uint8_t)(cmbVersion>>8); -// versionInfo[1] = (uint8_t)cmbVersion; -// versionInfo[2] = coll->image->version[1]; -// versionInfo[3] = coll->UCA->image->UCAVersion[0]; + /* Tailoring rules */ + return VersionInfo.getInstance(cmbVersion >> 8, cmbVersion & 0xFF, m_version_.getMinor(), + UCA_.m_UCA_version_.getMajor()); + + // versionInfo[0] = (uint8_t)(cmbVersion>>8); + // versionInfo[1] = (uint8_t)cmbVersion; + // versionInfo[2] = coll->image->version[1]; + // versionInfo[3] = coll->UCA->image->UCAVersion[0]; } - - /** + + /** * Get the UCA version of this collator object. + * * @return the version object associated with this collator * @stable ICU 2.8 */