From 1fac4c690bf47e3ca739b0a10c431a1ec92f2679 Mon Sep 17 00:00:00 2001
From: Stuart Gill
Date: Mon, 8 Nov 2010 18:57:42 +0000
Subject: [PATCH] ICU-3984 initial commit of the collation reordering
X-SVN-Rev: 29015
---
.../icu/text/CollationParsedRuleBuilder.java | 1 +
.../com/ibm/icu/text/CollationRuleParser.java | 59 +-
.../src/com/ibm/icu/text/Collator.java | 34 +
.../src/com/ibm/icu/text/CollatorReader.java | 546 +-
.../com/ibm/icu/text/RuleBasedCollator.java | 4442 ++++++++---------
5 files changed, 2606 insertions(+), 2476 deletions(-)
diff --git a/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationParsedRuleBuilder.java b/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationParsedRuleBuilder.java
index 14d8c23a97..2ef16ba685 100644
--- a/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationParsedRuleBuilder.java
+++ b/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationParsedRuleBuilder.java
@@ -3631,6 +3631,7 @@ final class CollationParsedRuleBuilder {
collator.m_isHiragana4_ = option.m_isHiragana4_;
collator.setStrength(option.m_strength_);
collator.m_variableTopValue_ = option.m_variableTopValue_;
+ collator.m_scriptOrder_ = option.m_scriptOrder_;
collator.latinOneFailed_ = false;
}
diff --git a/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationRuleParser.java b/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationRuleParser.java
index 69e715404a..e8d8da220d 100644
--- a/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationRuleParser.java
+++ b/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationRuleParser.java
@@ -7,6 +7,7 @@
package com.ibm.icu.text;
import java.text.ParseException;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
@@ -16,6 +17,7 @@ import com.ibm.icu.util.UResourceBundle;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.impl.UCharacterProperty;
import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UScript;
/**
* Class for parsing collation rules, produces a list of tokens that will be
@@ -89,6 +91,14 @@ final class CollationRuleParser
m_decomposition_ = collator.getDecomposition();
m_strength_ = collator.getStrength();
m_isHiragana4_ = collator.m_isHiragana4_;
+
+ if(collator.m_scriptOrder_ != null){
+ m_scriptOrder_ = new int[collator.m_scriptOrder_.length];
+ for(int i = 0; i < m_scriptOrder_.length; i++){
+ m_scriptOrder_[i] = collator.m_scriptOrder_[i];
+ }
+ }
+
}
// package private data members --------------------------------------
@@ -119,6 +129,11 @@ final class CollationRuleParser
* attribute for special Hiragana
*/
boolean m_isHiragana4_;
+
+ /**
+ * the ordering of the scripts
+ */
+ int[] m_scriptOrder_;
}
/**
@@ -291,6 +306,14 @@ final class CollationRuleParser
collator.m_defaultCaseFirst_ = m_options_.m_caseFirst_;
collator.m_defaultIsHiragana4_ = m_options_.m_isHiragana4_;
collator.m_defaultVariableTopValue_ = m_options_.m_variableTopValue_;
+ if(m_options_.m_scriptOrder_ != null) {
+ collator.m_defaultScriptOrder_ = new int[m_options_.m_scriptOrder_.length];
+ for (int i = 0; i < m_options_.m_scriptOrder_.length; i++) {
+ collator.m_defaultScriptOrder_[i] = m_options_.m_scriptOrder_[i];
+ }
+ } else {
+ collator.m_defaultScriptOrder_ = null;
+ }
}
// private inner classes -------------------------------------------------
@@ -662,7 +685,7 @@ final class CollationRuleParser
RULES_OPTIONS_[15] = new TokenOption("undefined",
RuleBasedCollator.Attribute.LIMIT_,
null, null);
- RULES_OPTIONS_[16] = new TokenOption("scriptOrder",
+ RULES_OPTIONS_[16] = new TokenOption("reorder",
RuleBasedCollator.Attribute.LIMIT_,
null, null);
RULES_OPTIONS_[17] = new TokenOption("charsetname",
@@ -2028,7 +2051,6 @@ final class CollationRuleParser
return new UnicodeSet(source.substring(start, start+current)); //uset_openPattern(start, current);
}
-
/** in C, optionarg is passed by reference to function.
* We use a private int to simulate this.
*/
@@ -2061,6 +2083,7 @@ final class CollationRuleParser
}
return i;
}
+
/**
* Reads and set collation options
* @return TOKEN_SUCCESS if option is set correct, 0 otherwise
@@ -2152,6 +2175,11 @@ final class CollationRuleParser
m_optionEnd_ = m_current_-1;
return TOKEN_SUCCESS_MASK_;
}
+ else if(i == 16) {
+ m_current_ = m_optionarg_; // skip opening brace and name
+ parseScriptReorder();
+ return TOKEN_SUCCESS_MASK_;
+ }
else {
throwParseException(m_rules_, optionarg);
}
@@ -2282,4 +2310,31 @@ final class CollationRuleParser
}
return rules;
}
+
+    /**
+     * Parses the argument list of a "[reorder ...]" option: a whitespace-separated sequence of
+     * four-character script codes terminated by ']'. The caller is expected to have positioned
+     * m_current_ on the first script code. On success m_current_ is left on the closing ']' and
+     * the resolved codes are stored in m_options_.m_scriptOrder_ (an empty array for an empty list).
+     *
+     * @throws ParseException if the closing ']' is missing, if a token is not exactly four
+     *         characters long, or if UScript.getCode does not recognize a token
+     */
+    private void parseScriptReorder() throws ParseException {
+        int end = m_rules_.indexOf(']', m_current_);
+        if (end < 0) {
+            // unterminated option: report it instead of silently accepting an empty order
+            throw new ParseException(m_rules_, m_current_);
+        }
+        ArrayList<Integer> tempOrder = new ArrayList<Integer>();
+        while (m_current_ < end) {
+            int tokenEnd = m_current_ + 4;
+            // A token must be exactly 4 characters and be followed by whitespace or the closing ']'.
+            // Checking tokenEnd against end first keeps charAt/substring inside the option text.
+            if (tokenEnd > end
+                    || (tokenEnd < end && !UCharacter.isWhitespace(m_rules_.charAt(tokenEnd)))) {
+                throw new ParseException(m_rules_, m_current_);
+            }
+            int[] script = UScript.getCode(m_rules_.substring(m_current_, tokenEnd));
+            // getCode reports an unknown name with null, not with an empty array
+            if (script == null || script.length == 0) {
+                throw new ParseException(m_rules_, m_current_);
+            }
+            tempOrder.add(Integer.valueOf(script[0]));
+            m_current_ = tokenEnd;
+            // eat whitespace between tokens
+            while (m_current_ < end && UCharacter.isWhitespace(m_rules_.charAt(m_current_))) {
+                m_current_++;
+            }
+        }
+        int[] order = new int[tempOrder.size()];
+        for (int i = 0; i < order.length; i++) {
+            order[i] = tempOrder.get(i).intValue();
+        }
+        m_options_.m_scriptOrder_ = order;
+    }
}
diff --git a/icu4j/main/classes/collate/src/com/ibm/icu/text/Collator.java b/icu4j/main/classes/collate/src/com/ibm/icu/text/Collator.java
index b0c1fcf8c5..19b7dcc243 100644
--- a/icu4j/main/classes/collate/src/com/ibm/icu/text/Collator.java
+++ b/icu4j/main/classes/collate/src/com/ibm/icu/text/Collator.java
@@ -225,6 +225,18 @@ public abstract class Collator implements Comparator
*
* This class is not subclassable
*
+ *
* @author Syn Wee Quek
* @stable ICU 2.8
*/
-public final class RuleBasedCollator extends Collator
-{
+public final class RuleBasedCollator extends Collator {
// public constructors ---------------------------------------------------
/**
*
- * Constructor that takes the argument rules for
- * customization. The collator will be based on UCA,
- * with the attributes and re-ordering of the characters specified in the
- * argument rules.
+ * Constructor that takes the argument rules for customization. The collator will be based on UCA, with the
+ * attributes and re-ordering of the characters specified in the argument rules.
*
- * @param rules the collation rules to build the collation table from.
- * @exception ParseException and IOException thrown. ParseException thrown
- * when argument rules have an invalid syntax. IOException
- * thrown when an error occured while reading internal data.
+ *
+ * @param rules
+ * the collation rules to build the collation table from.
+ * @exception ParseException
+ * and IOException thrown. ParseException thrown when argument rules have an invalid syntax.
+ * IOException thrown when an error occurred while reading internal data.
* @stable ICU 2.8
*/
- public RuleBasedCollator(String rules) throws Exception
- {
+ public RuleBasedCollator(String rules) throws Exception {
checkUCA();
if (rules == null) {
- throw new IllegalArgumentException(
- "Collation rules can not be null");
+ throw new IllegalArgumentException("Collation rules can not be null");
}
init(rules);
}
@@ -219,12 +205,12 @@ public final class RuleBasedCollator extends Collator
/**
* Clones the RuleBasedCollator
+ *
* @return a new instance of this RuleBasedCollator object
* @stable ICU 2.8
*/
- public Object clone() throws CloneNotSupportedException
- {
- RuleBasedCollator result = (RuleBasedCollator)super.clone();
+ public Object clone() throws CloneNotSupportedException {
+ RuleBasedCollator result = (RuleBasedCollator) super.clone();
if (latinOneCEs_ != null) {
result.m_reallocLatinOneCEs_ = true;
result.m_ContInfo_ = new ContractionInfo();
@@ -232,107 +218,97 @@ public final class RuleBasedCollator extends Collator
// since all collation data in the RuleBasedCollator do not change
// we can safely assign the result.fields to this collator
- result.initUtility(false); // let the new clone have their own util
- // iterators
+ result.initUtility(false); // let the new clone have their own util
+ // iterators
return result;
}
/**
* Return a CollationElementIterator for the given String.
+ *
* @see CollationElementIterator
* @stable ICU 2.8
*/
- public CollationElementIterator getCollationElementIterator(String source)
- {
+ public CollationElementIterator getCollationElementIterator(String source) {
return new CollationElementIterator(source, this);
}
/**
- * Return a CollationElementIterator for the given CharacterIterator.
- * The source iterator's integrity will be preserved since a new copy
- * will be created for use.
+ * Return a CollationElementIterator for the given CharacterIterator. The source iterator's integrity will be
+ * preserved since a new copy will be created for use.
+ *
* @see CollationElementIterator
* @stable ICU 2.8
*/
- public CollationElementIterator getCollationElementIterator(
- CharacterIterator source)
- {
- CharacterIterator newsource = (CharacterIterator)source.clone();
+ public CollationElementIterator getCollationElementIterator(CharacterIterator source) {
+ CharacterIterator newsource = (CharacterIterator) source.clone();
return new CollationElementIterator(newsource, this);
}
-
+
/**
- * Return a CollationElementIterator for the given UCharacterIterator.
- * The source iterator's integrity will be preserved since a new copy
- * will be created for use.
+ * Return a CollationElementIterator for the given UCharacterIterator. The source iterator's integrity will be
+ * preserved since a new copy will be created for use.
+ *
* @see CollationElementIterator
* @stable ICU 2.8
*/
- public CollationElementIterator getCollationElementIterator(
- UCharacterIterator source)
- {
+ public CollationElementIterator getCollationElementIterator(UCharacterIterator source) {
return new CollationElementIterator(source, this);
}
// public setters --------------------------------------------------------
/**
- * Sets the Hiragana Quaternary mode to be on or off.
- * When the Hiragana Quaternary mode is turned on, the collator
- * positions Hiragana characters before all non-ignorable characters in
- * QUATERNARY strength. This is to produce a correct JIS collation order,
- * distinguishing between Katakana and Hiragana characters.
- * @param flag true if Hiragana Quaternary mode is to be on, false
- * otherwise
+ * Sets the Hiragana Quaternary mode to be on or off. When the Hiragana Quaternary mode is turned on, the collator
+ * positions Hiragana characters before all non-ignorable characters in QUATERNARY strength. This is to produce a
+ * correct JIS collation order, distinguishing between Katakana and Hiragana characters.
+ *
+ * @param flag
+ * true if Hiragana Quaternary mode is to be on, false otherwise
* @see #setHiraganaQuaternaryDefault
* @see #isHiraganaQuaternary
* @stable ICU 2.8
*/
- public void setHiraganaQuaternary(boolean flag)
- {
+ public void setHiraganaQuaternary(boolean flag) {
m_isHiragana4_ = flag;
- updateInternalState();
+ updateInternalState();
}
/**
- * Sets the Hiragana Quaternary mode to the initial mode set during
- * construction of the RuleBasedCollator.
- * See setHiraganaQuaternary(boolean) for more details.
+ * Sets the Hiragana Quaternary mode to the initial mode set during construction of the RuleBasedCollator. See
+ * setHiraganaQuaternary(boolean) for more details.
+ *
* @see #setHiraganaQuaternary(boolean)
* @see #isHiraganaQuaternary
* @stable ICU 2.8
*/
- public void setHiraganaQuaternaryDefault()
- {
+ public void setHiraganaQuaternaryDefault() {
m_isHiragana4_ = m_defaultIsHiragana4_;
updateInternalState();
}
/**
- * Sets whether uppercase characters sort before lowercase
- * characters or vice versa, in strength TERTIARY. The default
- * mode is false, and so lowercase characters sort before uppercase
- * characters.
- * If true, sort upper case characters first.
- * @param upperfirst true to sort uppercase characters before
- * lowercase characters, false to sort lowercase
- * characters before uppercase characters
+ * Sets whether uppercase characters sort before lowercase characters or vice versa, in strength TERTIARY. The
+ * default mode is false, and so lowercase characters sort before uppercase characters. If true, sort upper case
+ * characters first.
+ *
+ * @param upperfirst
+ * true to sort uppercase characters before lowercase characters, false to sort lowercase characters
+ * before uppercase characters
* @see #isLowerCaseFirst
* @see #isUpperCaseFirst
* @see #setLowerCaseFirst
* @see #setCaseFirstDefault
* @stable ICU 2.8
*/
- public void setUpperCaseFirst(boolean upperfirst)
- {
+ public void setUpperCaseFirst(boolean upperfirst) {
if (upperfirst) {
- if(m_caseFirst_ != AttributeValue.UPPER_FIRST_) {
+ if (m_caseFirst_ != AttributeValue.UPPER_FIRST_) {
latinOneRegenTable_ = true;
}
m_caseFirst_ = AttributeValue.UPPER_FIRST_;
- }
- else {
- if(m_caseFirst_ != AttributeValue.OFF_) {
+ } else {
+ if (m_caseFirst_ != AttributeValue.OFF_) {
latinOneRegenTable_ = true;
}
m_caseFirst_ = AttributeValue.OFF_;
@@ -341,53 +317,46 @@ public final class RuleBasedCollator extends Collator
}
/**
- * Sets the orders of lower cased characters to sort before upper cased
- * characters, in strength TERTIARY. The default
- * mode is false.
- * If true is set, the RuleBasedCollator will sort lower cased characters
- * before the upper cased ones.
- * Otherwise, if false is set, the RuleBasedCollator will ignore case
- * preferences.
- * @param lowerfirst true for sorting lower cased characters before
- * upper cased characters, false to ignore case
- * preferences.
+ * Sets the orders of lower cased characters to sort before upper cased characters, in strength TERTIARY. The
+ * default mode is false. If true is set, the RuleBasedCollator will sort lower cased characters before the upper
+ * cased ones. Otherwise, if false is set, the RuleBasedCollator will ignore case preferences.
+ *
+ * @param lowerfirst
+ * true for sorting lower cased characters before upper cased characters, false to ignore case
+ * preferences.
* @see #isLowerCaseFirst
* @see #isUpperCaseFirst
* @see #setUpperCaseFirst
* @see #setCaseFirstDefault
* @stable ICU 2.8
*/
- public void setLowerCaseFirst(boolean lowerfirst)
- {
+ public void setLowerCaseFirst(boolean lowerfirst) {
if (lowerfirst) {
- if(m_caseFirst_ != AttributeValue.LOWER_FIRST_) {
- latinOneRegenTable_ = true;
- }
- m_caseFirst_ = AttributeValue.LOWER_FIRST_;
- }
- else {
- if(m_caseFirst_ != AttributeValue.OFF_) {
- latinOneRegenTable_ = true;
- }
- m_caseFirst_ = AttributeValue.OFF_;
+ if (m_caseFirst_ != AttributeValue.LOWER_FIRST_) {
+ latinOneRegenTable_ = true;
}
+ m_caseFirst_ = AttributeValue.LOWER_FIRST_;
+ } else {
+ if (m_caseFirst_ != AttributeValue.OFF_) {
+ latinOneRegenTable_ = true;
+ }
+ m_caseFirst_ = AttributeValue.OFF_;
+ }
updateInternalState();
}
/**
- * Sets the case first mode to the initial mode set during
- * construction of the RuleBasedCollator.
- * See setUpperCaseFirst(boolean) and setLowerCaseFirst(boolean) for more
- * details.
+ * Sets the case first mode to the initial mode set during construction of the RuleBasedCollator. See
+ * setUpperCaseFirst(boolean) and setLowerCaseFirst(boolean) for more details.
+ *
* @see #isLowerCaseFirst
* @see #isUpperCaseFirst
* @see #setLowerCaseFirst(boolean)
* @see #setUpperCaseFirst(boolean)
* @stable ICU 2.8
*/
- public final void setCaseFirstDefault()
- {
- if(m_caseFirst_ != m_defaultCaseFirst_) {
+ public final void setCaseFirstDefault() {
+ if (m_caseFirst_ != m_defaultCaseFirst_) {
latinOneRegenTable_ = true;
}
m_caseFirst_ = m_defaultCaseFirst_;
@@ -395,58 +364,54 @@ public final class RuleBasedCollator extends Collator
}
/**
- * Sets the alternate handling mode to the initial mode set during
- * construction of the RuleBasedCollator.
- * See setAlternateHandling(boolean) for more details.
+ * Sets the alternate handling mode to the initial mode set during construction of the RuleBasedCollator. See
+ * setAlternateHandling(boolean) for more details.
+ *
* @see #setAlternateHandlingShifted(boolean)
* @see #isAlternateHandlingShifted()
* @stable ICU 2.8
*/
- public void setAlternateHandlingDefault()
- {
+ public void setAlternateHandlingDefault() {
m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;
updateInternalState();
}
/**
- * Sets the case level mode to the initial mode set during
- * construction of the RuleBasedCollator.
- * See setCaseLevel(boolean) for more details.
+ * Sets the case level mode to the initial mode set during construction of the RuleBasedCollator. See
+ * setCaseLevel(boolean) for more details.
+ *
* @see #setCaseLevel(boolean)
* @see #isCaseLevel
* @stable ICU 2.8
*/
- public void setCaseLevelDefault()
- {
+ public void setCaseLevelDefault() {
m_isCaseLevel_ = m_defaultIsCaseLevel_;
updateInternalState();
}
/**
- * Sets the decomposition mode to the initial mode set during construction
- * of the RuleBasedCollator.
- * See setDecomposition(int) for more details.
+ * Sets the decomposition mode to the initial mode set during construction of the RuleBasedCollator. See
+ * setDecomposition(int) for more details.
+ *
* @see #getDecomposition
* @see #setDecomposition(int)
* @stable ICU 2.8
*/
- public void setDecompositionDefault()
- {
+ public void setDecompositionDefault() {
setDecomposition(m_defaultDecomposition_);
- updateInternalState();
+ updateInternalState();
}
/**
- * Sets the French collation mode to the initial mode set during
- * construction of the RuleBasedCollator.
- * See setFrenchCollation(boolean) for more details.
+ * Sets the French collation mode to the initial mode set during construction of the RuleBasedCollator. See
+ * setFrenchCollation(boolean) for more details.
+ *
* @see #isFrenchCollation
* @see #setFrenchCollation(boolean)
* @stable ICU 2.8
*/
- public void setFrenchCollationDefault()
- {
- if(m_isFrenchCollation_ != m_defaultIsFrenchCollation_) {
+ public void setFrenchCollationDefault() {
+ if (m_isFrenchCollation_ != m_defaultIsFrenchCollation_) {
latinOneRegenTable_ = true;
}
m_isFrenchCollation_ = m_defaultIsFrenchCollation_;
@@ -454,51 +419,57 @@ public final class RuleBasedCollator extends Collator
}
/**
- * Sets the collation strength to the initial mode set during the
- * construction of the RuleBasedCollator.
- * See setStrength(int) for more details.
+ * Sets the collation strength to the initial mode set during the construction of the RuleBasedCollator. See
+ * setStrength(int) for more details.
+ *
* @see #setStrength(int)
* @see #getStrength
* @stable ICU 2.8
*/
- public void setStrengthDefault()
- {
+ public void setStrengthDefault() {
setStrength(m_defaultStrength_);
- updateInternalState();
+ updateInternalState();
}
-
+
/**
- * Method to set numeric collation to its default value.
- * When numeric collation is turned on, this Collator generates a collation
- * key for the numeric value of substrings of digits. This is a way to get
- * '100' to sort AFTER '2'
+ * Method to set numeric collation to its default value. When numeric collation is turned on, this Collator
+ * generates a collation key for the numeric value of substrings of digits. This is a way to get '100' to sort AFTER
+ * '2'
+ *
* @see #getNumericCollation
* @see #setNumericCollation
* @stable ICU 2.8
*/
- public void setNumericCollationDefault()
- {
+ public void setNumericCollationDefault() {
setNumericCollation(m_defaultIsNumericCollation_);
- updateInternalState();
+ updateInternalState();
}
/**
- * Sets the mode for the direction of SECONDARY weights to be used in
- * French collation.
- * The default value is false, which treats SECONDARY weights in the order
- * they appear.
- * If set to true, the SECONDARY weights will be sorted backwards.
- * See the section on
- *
+ * Resets the script order to this collator's default, m_defaultScriptOrder_ (which may be null), by
+ * delegating to setScriptOrder.
+ *
+ * @see #getScriptOrder
+ * @see #setScriptOrder
+ * @stable
+ */
+ public void setScriptOrderDefault() {
+ // setScriptOrder copies the array (or stores null) and rebuilds the permutation table
+ setScriptOrder(m_defaultScriptOrder_);
+ }
+
+ /**
+ * Sets the mode for the direction of SECONDARY weights to be used in French collation. The default value is false,
+ * which treats SECONDARY weights in the order they appear. If set to true, the SECONDARY weights will be sorted
+ * backwards. See the section on
* French collation for more information.
- * @param flag true to set the French collation on, false to set it off
+ *
+ * @param flag
+ * true to set the French collation on, false to set it off
* @stable ICU 2.8
* @see #isFrenchCollation
* @see #setFrenchCollationDefault
*/
- public void setFrenchCollation(boolean flag)
- {
- if(m_isFrenchCollation_ != flag) {
+ public void setFrenchCollation(boolean flag) {
+ if (m_isFrenchCollation_ != flag) {
latinOneRegenTable_ = true;
}
m_isFrenchCollation_ = flag;
@@ -506,68 +477,61 @@ public final class RuleBasedCollator extends Collator
}
/**
- * Sets the alternate handling for QUATERNARY strength to be either
- * shifted or non-ignorable.
- * See the UCA definition on
- *
- * Alternate Weighting.
- * This attribute will only be effective when QUATERNARY strength is set.
- * The default value for this mode is false, corresponding to the
- * NON_IGNORABLE mode in UCA. In the NON-IGNORABLE mode, the
- * RuleBasedCollator will treats all the codepoints with non-ignorable
- * primary weights in the same way.
- * If the mode is set to true, the behaviour corresponds to SHIFTED defined
- * in UCA, this causes codepoints with PRIMARY orders that are equal or
- * below the variable top value to be ignored in PRIMARY order and
- * moved to the QUATERNARY order.
- * @param shifted true if SHIFTED behaviour for alternate handling is
- * desired, false for the NON_IGNORABLE behaviour.
+ * Sets the alternate handling for QUATERNARY strength to be either shifted or non-ignorable. See the UCA definition
+ * on Alternate Weighting. This
+ * attribute will only be effective when QUATERNARY strength is set. The default value for this mode is false,
+ * corresponding to the NON_IGNORABLE mode in UCA. In the NON-IGNORABLE mode, the RuleBasedCollator will treat all
+ * the codepoints with non-ignorable primary weights in the same way. If the mode is set to true, the behaviour
+ * corresponds to SHIFTED defined in UCA, this causes codepoints with PRIMARY orders that are equal or below the
+ * variable top value to be ignored in PRIMARY order and moved to the QUATERNARY order.
+ *
+ * @param shifted
+ * true if SHIFTED behaviour for alternate handling is desired, false for the NON_IGNORABLE behaviour.
* @see #isAlternateHandlingShifted
* @see #setAlternateHandlingDefault
* @stable ICU 2.8
*/
- public void setAlternateHandlingShifted(boolean shifted)
- {
+ public void setAlternateHandlingShifted(boolean shifted) {
m_isAlternateHandlingShifted_ = shifted;
updateInternalState();
}
/**
*
- * When case level is set to true, an additional weight is formed
- * between the SECONDARY and TERTIARY weight, known as the case level.
- * The case level is used to distinguish large and small Japanese Kana
- * characters. Case level could also be used in other situations.
- * For example to distinguish certain Pinyin characters.
- * The default value is false, which means the case level is not generated.
- * The contents of the case level are affected by the case first
- * mode. A simple way to ignore accent differences in a string is to set
- * the strength to PRIMARY and enable case level.
+ * When case level is set to true, an additional weight is formed between the SECONDARY and TERTIARY weight, known
+ * as the case level. The case level is used to distinguish large and small Japanese Kana characters. Case level
+ * could also be used in other situations. For example to distinguish certain Pinyin characters. The default value
+ * is false, which means the case level is not generated. The contents of the case level are affected by the case
+ * first mode. A simple way to ignore accent differences in a string is to set the strength to PRIMARY and enable
+ * case level.
*
*
- * See the section on
- *
- * case level for more information.
+ * See the section on case
+ * level for more information.
*
- * @param flag true if case level sorting is required, false otherwise
+ *
+ * @param flag
+ * true if case level sorting is required, false otherwise
* @stable ICU 2.8
* @see #setCaseLevelDefault
* @see #isCaseLevel
*/
- public void setCaseLevel(boolean flag)
- {
+ public void setCaseLevel(boolean flag) {
m_isCaseLevel_ = flag;
updateInternalState();
}
/**
*
- * Sets this Collator's strength property. The strength property
- * determines the minimum level of difference considered significant
- * during comparison.
+ * Sets this Collator's strength property. The strength property determines the minimum level of difference
+ * considered significant during comparison.
*
- *
See the Collator class description for an example of use.
- * @param newStrength the new strength value.
+ *
+ * See the Collator class description for an example of use.
+ *
+ *
+ * @param newStrength
+ * the new strength value.
* @see #getStrength
* @see #setStrengthDefault
* @see #PRIMARY
@@ -575,48 +539,42 @@ public final class RuleBasedCollator extends Collator
* @see #TERTIARY
* @see #QUATERNARY
* @see #IDENTICAL
- * @exception IllegalArgumentException If the new strength value is not one
- * of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL.
+ * @exception IllegalArgumentException
+ * If the new strength value is not one of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL.
* @stable ICU 2.8
*/
- public void setStrength(int newStrength)
- {
+ public void setStrength(int newStrength) {
super.setStrength(newStrength);
updateInternalState();
}
-
- /**
+
+ /**
*
- * Variable top is a two byte primary value which causes all the codepoints
- * with primary values that are less or equal than the variable top to be
- * shifted when alternate handling is set to SHIFTED.
+ * Variable top is a two byte primary value which causes all the codepoints with primary values that are less than or
+ * equal to the variable top to be shifted when alternate handling is set to SHIFTED.
*
*
* Sets the variable top to a collation element value of a string supplied.
- *
- * @param varTop one or more (if contraction) characters to which the
- * variable top should be set
- * @return a int value containing the value of the variable top in upper 16
- * bits. Lower 16 bits are undefined.
- * @exception IllegalArgumentException is thrown if varTop argument is not
- * a valid variable top element. A variable top element is
- * invalid when
- * <ul>
- * <li>it is a contraction that does not exist in the
- * Collation order
- * <li>when the PRIMARY strength collation element for the
- * variable top has more than two bytes
- * <li>when the varTop argument is null or zero in length.
- * </ul>
+ *
+ *
+ * @param varTop
+ * one or more (if contraction) characters to which the variable top should be set
+ * @return an int value containing the value of the variable top in upper 16 bits. Lower 16 bits are undefined.
+ * @exception IllegalArgumentException
+ * is thrown if varTop argument is not a valid variable top element. A variable top element is
+ * invalid when
+ * <ul>
+ * <li>it is a contraction that does not exist in the Collation order
+ * <li>when the PRIMARY strength collation element for the variable top has more than two bytes
+ * <li>when the varTop argument is null or zero in length.
+ * </ul>
* @see #getVariableTop
* @see RuleBasedCollator#setAlternateHandlingShifted
* @stable ICU 2.6
*/
- public int setVariableTop(String varTop)
- {
+ public int setVariableTop(String varTop) {
if (varTop == null || varTop.length() == 0) {
- throw new IllegalArgumentException(
- "Variable top argument string can not be null or zero in length.");
+ throw new IllegalArgumentException("Variable top argument string can not be null or zero in length.");
}
if (m_srcUtilIter_ == null) {
initUtility(true);
@@ -624,112 +582,124 @@ public final class RuleBasedCollator extends Collator
m_srcUtilColEIter_.setText(varTop);
int ce = m_srcUtilColEIter_.next();
-
- // here we check if we have consumed all characters
+
+ // here we check if we have consumed all characters
// you can put in either one character or a contraction
- // you shouldn't put more...
- if (m_srcUtilColEIter_.getOffset() != varTop.length()
- || ce == CollationElementIterator.NULLORDER) {
- throw new IllegalArgumentException(
- "Variable top argument string is a contraction that does not exist "
- + "in the Collation order");
+ // you shouldn't put more...
+ if (m_srcUtilColEIter_.getOffset() != varTop.length() || ce == CollationElementIterator.NULLORDER) {
+ throw new IllegalArgumentException("Variable top argument string is a contraction that does not exist "
+ + "in the Collation order");
}
-
+
int nextCE = m_srcUtilColEIter_.next();
-
- if ((nextCE != CollationElementIterator.NULLORDER)
- && (!isContinuation(nextCE) || (nextCE & CE_PRIMARY_MASK_) != 0)) {
- throw new IllegalArgumentException(
- "Variable top argument string can only have a single collation "
- + "element that has less than or equal to two PRIMARY strength "
- + "bytes");
+
+ if ((nextCE != CollationElementIterator.NULLORDER)
+ && (!isContinuation(nextCE) || (nextCE & CE_PRIMARY_MASK_) != 0)) {
+ throw new IllegalArgumentException("Variable top argument string can only have a single collation "
+ + "element that has less than or equal to two PRIMARY strength " + "bytes");
}
-
+
m_variableTopValue_ = (ce & CE_PRIMARY_MASK_) >> 16;
-
+
return ce & CE_PRIMARY_MASK_;
}
-
- /**
- * Sets the variable top to a collation element value supplied.
- * Variable top is set to the upper 16 bits.
- * Lower 16 bits are ignored.
- * @param varTop Collation element value, as returned by setVariableTop or
- * getVariableTop
+
+ /**
+ * Sets the variable top to a collation element value supplied. Variable top is set to the upper 16 bits. Lower 16
+ * bits are ignored.
+ *
+ * @param varTop
+ * Collation element value, as returned by setVariableTop or getVariableTop
* @see #getVariableTop
* @see #setVariableTop(String)
* @stable ICU 2.6
*/
- public void setVariableTop(int varTop)
- {
+ public void setVariableTop(int varTop) {
m_variableTopValue_ = (varTop & CE_PRIMARY_MASK_) >> 16;
}
-
+
/**
- * When numeric collation is turned on, this Collator generates a collation
- * key for the numeric value of substrings of digits. This is a way to get
- * '100' to sort AFTER '2'
- * @param flag true to turn numeric collation on and false to turn it off
+ * When numeric collation is turned on, this Collator generates a collation key for the numeric value of substrings
+ * of digits. This is a way to get '100' to sort AFTER '2'
+ *
+ * @param flag
+ * true to turn numeric collation on and false to turn it off
* @see #getNumericCollation
* @see #setNumericCollationDefault
* @stable ICU 2.8
*/
- public void setNumericCollation(boolean flag)
- {
+ public void setNumericCollation(boolean flag) {
// sort substrings of digits as numbers
m_isNumericCollation_ = flag;
updateInternalState();
}
+    /**
+     * Sets the order in which scripts are sorted by this collator. The argument is copied, so later
+     * changes to the caller's array do not affect the collator; the permutation table is rebuilt
+     * once the new order has been stored.
+     *
+     * @param order
+     *            the reordering of scripts, or null to store no order; presumably UScript codes,
+     *            since the rule parser fills the same field from UScript.getCode
+     * @see #getScriptOrder
+     * @see #setScriptOrderDefault
+     * @stable
+     */
+    public void setScriptOrder(int... order) {
+        m_scriptOrder_ = (order == null) ? null : order.clone();
+        buildPermutationTable();
+    }
+ }
+
// public getters --------------------------------------------------------
/**
- * Gets the collation rules for this RuleBasedCollator.
- * Equivalent to String getRules(RuleOption.FULL_RULES).
+ * Gets the collation rules for this RuleBasedCollator. Equivalent to String getRules(RuleOption.FULL_RULES).
+ *
* @return returns the collation rules
* @see #getRules(boolean)
* @stable ICU 2.8
*/
- public String getRules()
- {
+ public String getRules() {
return m_rules_;
}
-
+
/**
- * Returns current rules. The argument defines whether full rules
- * (UCA + tailored) rules are returned or just the tailoring.
- * @param fullrules true if the rules that defines the full set of
- * collation order is required, otherwise false for returning only
- * the tailored rules
+ * Returns current rules. The argument defines whether the full rules (UCA + tailored) are returned or just the
+ * tailoring.
+ *
+ * @param fullrules
+ * true if the rules that defines the full set of collation order is required, otherwise false for
+ * returning only the tailored rules
* @return the current rules that defines this Collator.
* @see #getRules()
* @stable ICU 2.6
*/
- public String getRules(boolean fullrules)
- {
+ public String getRules(boolean fullrules) {
if (!fullrules) {
return m_rules_;
}
- // take the UCA rules and append real rules at the end
+ // take the UCA rules and append real rules at the end
return UCA_.m_rules_.concat(m_rules_);
}
/**
- * Get an UnicodeSet that contains all the characters and sequences
- * tailored in this collator.
- * @return a pointer to a UnicodeSet object containing all the
- * code points and sequences that may sort differently than
- * in the UCA.
+ * Get an UnicodeSet that contains all the characters and sequences tailored in this collator.
+ *
+ * @return a pointer to a UnicodeSet object containing all the code points and sequences that may sort differently
+ * than in the UCA.
* @stable ICU 2.4
*/
- public UnicodeSet getTailoredSet()
- {
+ public UnicodeSet getTailoredSet() {
try {
- CollationRuleParser src = new CollationRuleParser(getRules());
- return src.getTailoredSet();
- } catch(Exception e) {
- throw new IllegalStateException("A tailoring rule should not " +
- "have errors. Something is quite wrong!");
+ CollationRuleParser src = new CollationRuleParser(getRules());
+ return src.getTailoredSet();
+ } catch (Exception e) {
+ throw new IllegalStateException("A tailoring rule should not " + "have errors. Something is quite wrong!");
}
}
@@ -738,8 +708,9 @@ public final class RuleBasedCollator extends Collator
UnicodeSet contractions;
UnicodeSet expansions;
UnicodeSet removedContractions;
- boolean addPrefixes;
- contContext(RuleBasedCollator coll, UnicodeSet contractions, UnicodeSet expansions,
+ boolean addPrefixes;
+
+ contContext(RuleBasedCollator coll, UnicodeSet contractions, UnicodeSet expansions,
UnicodeSet removedContractions, boolean addPrefixes) {
this.coll = coll;
this.contractions = contractions;
@@ -748,63 +719,64 @@ public final class RuleBasedCollator extends Collator
this.addPrefixes = addPrefixes;
}
}
-
- private void
- addSpecial(contContext c, StringBuilder buffer, int CE)
- {
+
+ private void addSpecial(contContext c, StringBuilder buffer, int CE) {
StringBuilder b = new StringBuilder();
int offset = (CE & 0xFFFFFF) - c.coll.m_contractionOffset_;
int newCE = c.coll.m_contractionCE_[offset];
// we might have a contraction that ends from previous level
- if(newCE != CollationElementIterator.CE_NOT_FOUND_) {
- if(isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_
- && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_
- && c.addPrefixes) {
+ if (newCE != CollationElementIterator.CE_NOT_FOUND_) {
+ if (isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_ && isSpecial(newCE)
+ && getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) {
addSpecial(c, buffer, newCE);
}
- if(buffer.length() > 1) {
- if(c.contractions != null) {
+ if (buffer.length() > 1) {
+ if (c.contractions != null) {
c.contractions.add(buffer.toString());
}
- if(c.expansions != null && isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
+ if (c.expansions != null && isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
c.expansions.add(buffer.toString());
}
}
- }
-
+ }
+
offset++;
// check whether we're doing contraction or prefix
- if(getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) {
- while(c.coll.m_contractionIndex_[offset] != 0xFFFF) {
+ if (getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) {
+ while (c.coll.m_contractionIndex_[offset] != 0xFFFF) {
b.delete(0, b.length());
b.append(buffer);
newCE = c.coll.m_contractionCE_[offset];
b.insert(0, c.coll.m_contractionIndex_[offset]);
- if(isSpecial(newCE) && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
+ if (isSpecial(newCE)
+ && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
addSpecial(c, b, newCE);
} else {
- if(c.contractions != null) {
+ if (c.contractions != null) {
c.contractions.add(b.toString());
}
- if(c.expansions != null && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
+ if (c.expansions != null && isSpecial(newCE)
+ && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
c.expansions.add(b.toString());
}
}
offset++;
}
- } else if(getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_) {
- while(c.coll.m_contractionIndex_[offset] != 0xFFFF) {
+ } else if (getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_) {
+ while (c.coll.m_contractionIndex_[offset] != 0xFFFF) {
b.delete(0, b.length());
b.append(buffer);
newCE = c.coll.m_contractionCE_[offset];
b.append(c.coll.m_contractionIndex_[offset]);
- if(isSpecial(newCE) && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
+ if (isSpecial(newCE)
+ && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
addSpecial(c, b, newCE);
} else {
- if(c.contractions != null) {
+ if (c.contractions != null) {
c.contractions.add(b.toString());
}
- if(c.expansions != null && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
+ if (c.expansions != null && isSpecial(newCE)
+ && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
c.expansions.add(b.toString());
}
}
@@ -812,26 +784,23 @@ public final class RuleBasedCollator extends Collator
}
}
}
-
- private
- void processSpecials(contContext c)
- {
+
+ private void processSpecials(contContext c) {
int internalBufferSize = 512;
- TrieIterator trieiterator
- = new TrieIterator(c.coll.m_trie_);
+ TrieIterator trieiterator = new TrieIterator(c.coll.m_trie_);
RangeValueIterator.Element element = new RangeValueIterator.Element();
while (trieiterator.next(element)) {
int start = element.start;
int limit = element.limit;
int CE = element.value;
StringBuilder contraction = new StringBuilder(internalBufferSize);
-
- if(isSpecial(CE)) {
- if(((getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) || getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_)) {
- while(start < limit) {
- // if there are suppressed contractions, we don't
+
+ if (isSpecial(CE)) {
+ if (((getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) || getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_)) {
+ while (start < limit) {
+ // if there are suppressed contractions, we don't
// want to add them.
- if(c.removedContractions != null && c.removedContractions.contains(start)) {
+ if (c.removedContractions != null && c.removedContractions.contains(start)) {
start++;
continue;
}
@@ -841,69 +810,72 @@ public final class RuleBasedCollator extends Collator
addSpecial(c, contraction, CE);
start++;
}
- } else if(c.expansions != null && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
- while(start < limit) {
+ } else if (c.expansions != null && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
+ while (start < limit) {
c.expansions.add(start++);
}
}
}
}
}
-
+
/**
* Gets unicode sets containing contractions and/or expansions of a collator
- * @param contractions if not null, set to contain contractions
- * @param expansions if not null, set to contain expansions
- * @param addPrefixes add the prefix contextual elements to contractions
- * @throws Exception Throws an exception if any errors occurs.
+ *
+ * @param contractions
+ * if not null, set to contain contractions
+ * @param expansions
+ * if not null, set to contain expansions
+ * @param addPrefixes
+ * add the prefix contextual elements to contractions
+ * @throws Exception
+ * Throws an exception if any errors occurs.
* @stable ICU 3.4
*/
- public void
- getContractionsAndExpansions(UnicodeSet contractions, UnicodeSet expansions,
- boolean addPrefixes) throws Exception {
- if(contractions != null) {
+ public void getContractionsAndExpansions(UnicodeSet contractions, UnicodeSet expansions, boolean addPrefixes)
+ throws Exception {
+ if (contractions != null) {
contractions.clear();
}
- if(expansions != null) {
+ if (expansions != null) {
expansions.clear();
}
String rules = getRules();
try {
CollationRuleParser src = new CollationRuleParser(rules);
- contContext c = new contContext(RuleBasedCollator.UCA_,
- contractions, expansions, src.m_removeSet_, addPrefixes);
-
+ contContext c = new contContext(RuleBasedCollator.UCA_, contractions, expansions, src.m_removeSet_,
+ addPrefixes);
+
// Add the UCA contractions
processSpecials(c);
// This is collator specific. Add contractions from a collator
c.coll = this;
- c.removedContractions = null;
+ c.removedContractions = null;
processSpecials(c);
} catch (Exception e) {
throw e;
}
}
-
+
/**
*
- * Get a Collation key for the argument String source from this
- * RuleBasedCollator.
+ * Get a Collation key for the argument String source from this RuleBasedCollator.
*
*
* General recommendation:
- * If comparison are to be done to the same String multiple times, it would
- * be more efficient to generate CollationKeys for the Strings and use
- * CollationKey.compareTo(CollationKey) for the comparisons.
- * If the each Strings are compared to only once, using the method
- * RuleBasedCollator.compare(String, String) will have a better performance.
+ * If comparisons are to be done on the same String multiple times, it would be more efficient to generate
+ * CollationKeys for the Strings and use CollationKey.compareTo(CollationKey) for the comparisons. If each
+ * String is compared only once, using the method RuleBasedCollator.compare(String, String) will have better
+ * performance.
*
*
* See the class documentation for an explanation about CollationKeys.
*
- * @param source the text String to be transformed into a collation key.
- * @return the CollationKey for the given String based on this
- * RuleBasedCollator's collation rules. If the source String is
- * null, a null CollationKey is returned.
+ *
+ * @param source
+ * the text String to be transformed into a collation key.
+ * @return the CollationKey for the given String based on this RuleBasedCollator's collation rules. If the source
+ * String is null, a null CollationKey is returned.
* @see CollationKey
* @see #compare(String, String)
* @see #getRawCollationKey
@@ -913,37 +885,33 @@ public final class RuleBasedCollator extends Collator
if (source == null) {
return null;
}
- m_utilRawCollationKey_ = getRawCollationKey(source,
- m_utilRawCollationKey_);
+ m_utilRawCollationKey_ = getRawCollationKey(source, m_utilRawCollationKey_);
return new CollationKey(source, m_utilRawCollationKey_);
}
-
+
/**
- * Gets the simpler form of a CollationKey for the String source following
- * the rules of this Collator and stores the result into the user provided
- * argument key.
- * If key has a internal byte array of length that's too small for the
- * result, the internal byte array will be grown to the exact required
- * size.
- * @param source the text String to be transformed into a RawCollationKey
- * @param key output RawCollationKey to store results
- * @return If key is null, a new instance of RawCollationKey will be
- * created and returned, otherwise the user provided key will be
- * returned.
- * @see #getCollationKey
+ * Gets the simpler form of a CollationKey for the String source following the rules of this Collator and stores the
+ * result into the user provided argument key. If key has a internal byte array of length that's too small for the
+ * result, the internal byte array will be grown to the exact required size.
+ *
+ * @param source
+ * the text String to be transformed into a RawCollationKey
+ * @param key
+ * output RawCollationKey to store results
+ * @return If key is null, a new instance of RawCollationKey will be created and returned, otherwise the user
+ * provided key will be returned.
+ * @see #getCollationKey
* @see #compare(String, String)
* @see RawCollationKey
* @stable ICU 2.8
*/
- public RawCollationKey getRawCollationKey(String source,
- RawCollationKey key)
- {
+ public RawCollationKey getRawCollationKey(String source, RawCollationKey key) {
if (source == null) {
return null;
}
int strength = getStrength();
m_utilCompare0_ = m_isCaseLevel_;
- //m_utilCompare1_ = true;
+ // m_utilCompare1_ = true;
m_utilCompare2_ = strength >= SECONDARY;
m_utilCompare3_ = strength >= TERTIARY;
m_utilCompare4_ = strength >= QUATERNARY;
@@ -954,13 +922,13 @@ public final class RuleBasedCollator extends Collator
m_utilBytesCount2_ = 0;
m_utilBytesCount3_ = 0;
m_utilBytesCount4_ = 0;
- //m_utilBytesCount5_ = 0;
- //m_utilCount0_ = 0;
- //m_utilCount1_ = 0;
+ // m_utilBytesCount5_ = 0;
+ // m_utilCount0_ = 0;
+ // m_utilCount1_ = 0;
m_utilCount2_ = 0;
m_utilCount3_ = 0;
m_utilCount4_ = 0;
- //m_utilCount5_ = 0;
+ // m_utilCount5_ = 0;
boolean doFrench = m_isFrenchCollation_ && m_utilCompare2_;
// TODO: UCOL_COMMON_BOT4 should be a function of qShifted.
// If we have no qShifted, we don't need to set UCOL_COMMON_BOT4 so
@@ -969,28 +937,24 @@ public final class RuleBasedCollator extends Collator
byte hiragana4 = 0;
if (m_isHiragana4_ && m_utilCompare4_) {
// allocate one more space for hiragana, value for hiragana
- hiragana4 = (byte)commonBottom4;
- commonBottom4 ++;
+ hiragana4 = (byte) commonBottom4;
+ commonBottom4++;
}
int bottomCount4 = 0xFF - commonBottom4;
// If we need to normalize, we'll do it all at once at the beginning!
- if (m_utilCompare5_ && Normalizer.quickCheck(source, Normalizer.NFD,0)
- != Normalizer.YES) {
+ if (m_utilCompare5_ && Normalizer.quickCheck(source, Normalizer.NFD, 0) != Normalizer.YES) {
// if it is identical strength, we have to normalize the string to
// NFD so that it will be appended correctly to the end of the sort
// key
source = Normalizer.decompose(source, false);
- }
- else if (getDecomposition() != NO_DECOMPOSITION
- && Normalizer.quickCheck(source, Normalizer.FCD,0)
- != Normalizer.YES) {
+ } else if (getDecomposition() != NO_DECOMPOSITION
+ && Normalizer.quickCheck(source, Normalizer.FCD, 0) != Normalizer.YES) {
// for the rest of the strength, if decomposition is on, FCD is
// enough for us to work on.
- source = Normalizer.normalize(source,Normalizer.FCD);
+ source = Normalizer.normalize(source, Normalizer.FCD);
}
- getSortKeyBytes(source, doFrench, hiragana4, commonBottom4,
- bottomCount4);
+ getSortKeyBytes(source, doFrench, hiragana4, commonBottom4, bottomCount4);
if (key == null) {
key = new RawCollationKey();
}
@@ -999,136 +963,172 @@ public final class RuleBasedCollator extends Collator
}
/**
- * Return true if an uppercase character is sorted before the corresponding lowercase character.
- * See setCaseFirst(boolean) for details.
+ * Return true if an uppercase character is sorted before the corresponding lowercase character. See
+ * setCaseFirst(boolean) for details.
+ *
* @see #setUpperCaseFirst
* @see #setLowerCaseFirst
* @see #isLowerCaseFirst
* @see #setCaseFirstDefault
- * @return true if upper cased characters are sorted before lower cased
- * characters, false otherwise
+ * @return true if upper cased characters are sorted before lower cased characters, false otherwise
* @stable ICU 2.8
*/
- public boolean isUpperCaseFirst()
- {
+ public boolean isUpperCaseFirst() {
return (m_caseFirst_ == AttributeValue.UPPER_FIRST_);
- }
-
+ }
+
/**
- * Return true if a lowercase character is sorted before the corresponding uppercase character.
- * See setCaseFirst(boolean) for details.
+ * Return true if a lowercase character is sorted before the corresponding uppercase character. See
+ * setCaseFirst(boolean) for details.
+ *
* @see #setUpperCaseFirst
* @see #setLowerCaseFirst
* @see #isUpperCaseFirst
* @see #setCaseFirstDefault
- * @return true lower cased characters are sorted before upper cased
- * characters, false otherwise
+ * @return true lower cased characters are sorted before upper cased characters, false otherwise
* @stable ICU 2.8
*/
- public boolean isLowerCaseFirst()
- {
+ public boolean isLowerCaseFirst() {
return (m_caseFirst_ == AttributeValue.LOWER_FIRST_);
}
/**
- * Checks if the alternate handling behaviour is the UCA defined SHIFTED or
- * NON_IGNORABLE.
- * If return value is true, then the alternate handling attribute for the
- * Collator is SHIFTED. Otherwise if return value is false, then the
- * alternate handling attribute for the Collator is NON_IGNORABLE
- * See setAlternateHandlingShifted(boolean) for more details.
+ * Checks if the alternate handling behaviour is the UCA defined SHIFTED or NON_IGNORABLE. If return value is true,
+ * then the alternate handling attribute for the Collator is SHIFTED. Otherwise if return value is false, then the
+ * alternate handling attribute for the Collator is NON_IGNORABLE See setAlternateHandlingShifted(boolean) for more
+ * details.
+ *
* @return true or false
* @see #setAlternateHandlingShifted(boolean)
* @see #setAlternateHandlingDefault
* @stable ICU 2.8
*/
- public boolean isAlternateHandlingShifted()
- {
+ public boolean isAlternateHandlingShifted() {
return m_isAlternateHandlingShifted_;
}
/**
- * Checks if case level is set to true.
- * See setCaseLevel(boolean) for details.
+ * Checks if case level is set to true. See setCaseLevel(boolean) for details.
+ *
* @return the case level mode
* @see #setCaseLevelDefault
* @see #isCaseLevel
* @see #setCaseLevel(boolean)
* @stable ICU 2.8
*/
- public boolean isCaseLevel()
- {
+ public boolean isCaseLevel() {
return m_isCaseLevel_;
}
/**
- * Checks if French Collation is set to true.
- * See setFrenchCollation(boolean) for details.
+ * Checks if French Collation is set to true. See setFrenchCollation(boolean) for details.
+ *
* @return true if French Collation is set to true, false otherwise
* @see #setFrenchCollation(boolean)
* @see #setFrenchCollationDefault
* @stable ICU 2.8
*/
- public boolean isFrenchCollation()
- {
- return m_isFrenchCollation_;
- }
+ public boolean isFrenchCollation() {
+ return m_isFrenchCollation_;
+ }
/**
- * Checks if the Hiragana Quaternary mode is set on.
- * See setHiraganaQuaternary(boolean) for more details.
+ * Checks if the Hiragana Quaternary mode is set on. See setHiraganaQuaternary(boolean) for more details.
+ *
* @return flag true if Hiragana Quaternary mode is on, false otherwise
* @see #setHiraganaQuaternaryDefault
* @see #setHiraganaQuaternary(boolean)
* @stable ICU 2.8
*/
- public boolean isHiraganaQuaternary()
- {
+ public boolean isHiraganaQuaternary() {
return m_isHiragana4_;
}
- /**
- * Gets the variable top value of a Collator.
- * Lower 16 bits are undefined and should be ignored.
+ /**
+ * Gets the variable top value of a Collator. Lower 16 bits are undefined and should be ignored.
+ *
* @return the variable top value of a Collator.
* @see #setVariableTop
* @stable ICU 2.6
*/
- public int getVariableTop()
- {
- return m_variableTopValue_ << 16;
+ public int getVariableTop() {
+ return m_variableTopValue_ << 16;
}
-
- /**
- * Method to retrieve the numeric collation value.
- * When numeric collation is turned on, this Collator generates a collation
- * key for the numeric value of substrings of digits. This is a way to get
- * '100' to sort AFTER '2'
+
+ /**
+ * Method to retrieve the numeric collation value. When numeric collation is turned on, this Collator generates a
+ * collation key for the numeric value of substrings of digits. This is a way to get '100' to sort AFTER '2'
+ *
* @see #setNumericCollation
* @see #setNumericCollationDefault
* @return true if numeric collation is turned on, false otherwise
* @stable ICU 2.8
*/
- public boolean getNumericCollation()
- {
+ public boolean getNumericCollation() {
return m_isNumericCollation_;
}
-
+
+    /**
+     * Retrieves the script reordering currently stored in this collator. The caller receives a newly
+     * allocated copy of the internal array, so modifying the result does not change this collator.
+     *
+     * @see #setScriptOrder
+     * @see #setScriptOrderDefault
+     * @return a copy of the ordering of the scripts if one has been set, null otherwise.
+     * @stable
+     */
+    public int[] getScriptOrder() {
+        // Guard clause: with no ordering stored there is nothing to copy.
+        if (m_scriptOrder_ == null) {
+            return null;
+        }
+        final int count = m_scriptOrder_.length;
+        int[] snapshot = new int[count];
+        System.arraycopy(m_scriptOrder_, 0, snapshot, 0, count);
+        return snapshot;
+    }
+
+    /**
+     * Retrieves the scripts equivalent to the given script for reordering. Some scripts share the same "lead byte"
+     * used for the collation codes and so must be reordered together. Each code is reported once, in no fixed order.
+     *
+     * @see #setScriptOrder
+     * @see #setScriptOrderDefault
+     * @param reorderCode code for which equivalents are to be retrieved
+     * @return the set of scripts equivalent to the given script including the script given.
+     * @stable
+     */
+    public static int[] getScriptEquivalentsForReordering(int reorderCode) {
+        Set<Integer> equivalentScriptsSet = new HashSet<Integer>();
+        int[] leadBytes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getLeadBytesForReorderCode(reorderCode);
+        for (int leadByte : leadBytes) {
+            int[] scripts = RuleBasedCollator.LEADBYTE_CONSTANTS_.getReorderCodesForLeadByte(leadByte);
+            for (int script : scripts) {
+                equivalentScriptsSet.add(script);
+            }
+        }
+        int[] equivalentScripts = new int[equivalentScriptsSet.size()];
+        int i = 0;
+        for (int script : equivalentScriptsSet) {
+            equivalentScripts[i++] = script;
+        }
+        return equivalentScripts;
+    }
+
// public other methods -------------------------------------------------
/**
- * Compares the equality of two RuleBasedCollator objects.
- * RuleBasedCollator objects are equal if they have the same collation
- * rules and the same attributes.
- * @param obj the RuleBasedCollator to be compared to.
- * @return true if this RuleBasedCollator has exactly the same
- * collation behaviour as obj, false otherwise.
+ * Compares the equality of two RuleBasedCollator objects. RuleBasedCollator objects are equal if they have the same
+ * collation rules and the same attributes.
+ *
+ * @param obj
+ * the RuleBasedCollator to be compared to.
+ * @return true if this RuleBasedCollator has exactly the same collation behaviour as obj, false otherwise.
* @stable ICU 2.8
*/
- public boolean equals(Object obj)
- {
+ public boolean equals(Object obj) {
if (obj == null) {
- return false; // super does class check
+ return false; // super does class check
}
if (this == obj) {
return true;
@@ -1136,19 +1136,28 @@ public final class RuleBasedCollator extends Collator
if (getClass() != obj.getClass()) {
return false;
}
- RuleBasedCollator other = (RuleBasedCollator)obj;
+ RuleBasedCollator other = (RuleBasedCollator) obj;
// all other non-transient information is also contained in rules.
- if (getStrength() != other.getStrength()
- || getDecomposition() != other.getDecomposition()
- || other.m_caseFirst_ != m_caseFirst_
- || other.m_caseSwitch_ != m_caseSwitch_
- || other.m_isAlternateHandlingShifted_
- != m_isAlternateHandlingShifted_
- || other.m_isCaseLevel_ != m_isCaseLevel_
- || other.m_isFrenchCollation_ != m_isFrenchCollation_
- || other.m_isHiragana4_ != m_isHiragana4_) {
+ if (getStrength() != other.getStrength() || getDecomposition() != other.getDecomposition()
+ || other.m_caseFirst_ != m_caseFirst_ || other.m_caseSwitch_ != m_caseSwitch_
+ || other.m_isAlternateHandlingShifted_ != m_isAlternateHandlingShifted_
+ || other.m_isCaseLevel_ != m_isCaseLevel_ || other.m_isFrenchCollation_ != m_isFrenchCollation_
+ || other.m_isHiragana4_ != m_isHiragana4_) {
return false;
}
+ if (m_scriptOrder_ != null ^ other.m_scriptOrder_ != null) {
+ return false;
+ }
+ if (m_scriptOrder_ != null) {
+ if (m_scriptOrder_.length != other.m_scriptOrder_.length) {
+ return false;
+ }
+ for (int i = 0; i < m_scriptOrder_.length; i++) {
+ if (m_scriptOrder_[i] != other.m_scriptOrder_[i]) {
+ return false;
+ }
+ }
+ }
boolean rules = m_rules_ == other.m_rules_;
if (!rules && (m_rules_ != null && other.m_rules_ != null)) {
rules = m_rules_.equals(other.m_rules_);
@@ -1156,24 +1165,18 @@ public final class RuleBasedCollator extends Collator
if (!rules || !ICUDebug.enabled("collation")) {
return rules;
}
- if (m_addition3_ != other.m_addition3_
- || m_bottom3_ != other.m_bottom3_
- || m_bottomCount3_ != other.m_bottomCount3_
- || m_common3_ != other.m_common3_
- || m_isSimple3_ != other.m_isSimple3_
- || m_mask3_ != other.m_mask3_
- || m_minContractionEnd_ != other.m_minContractionEnd_
- || m_minUnsafe_ != other.m_minUnsafe_
- || m_top3_ != other.m_top3_
- || m_topCount3_ != other.m_topCount3_
- || !Arrays.equals(m_unsafe_, other.m_unsafe_)) {
+ if (m_addition3_ != other.m_addition3_ || m_bottom3_ != other.m_bottom3_
+ || m_bottomCount3_ != other.m_bottomCount3_ || m_common3_ != other.m_common3_
+ || m_isSimple3_ != other.m_isSimple3_ || m_mask3_ != other.m_mask3_
+ || m_minContractionEnd_ != other.m_minContractionEnd_ || m_minUnsafe_ != other.m_minUnsafe_
+ || m_top3_ != other.m_top3_ || m_topCount3_ != other.m_topCount3_
+ || !Arrays.equals(m_unsafe_, other.m_unsafe_)) {
return false;
}
if (!m_trie_.equals(other.m_trie_)) {
// we should use the trie iterator here, but then this part is
// only used in the test.
- for (int i = UCharacter.MAX_VALUE; i >= UCharacter.MIN_VALUE; i --)
- {
+ for (int i = UCharacter.MAX_VALUE; i >= UCharacter.MIN_VALUE; i--) {
int v = m_trie_.getCodePointValue(i);
int otherv = other.m_trie_.getCodePointValue(i);
if (v != otherv) {
@@ -1184,8 +1187,7 @@ public final class RuleBasedCollator extends Collator
if (mask == 0xf1000000) {
v -= (m_expansionOffset_ << 4);
otherv -= (other.m_expansionOffset_ << 4);
- }
- else if (mask == 0xf2000000) {
+ } else if (mask == 0xf2000000) {
v -= m_contractionOffset_;
otherv -= other.m_contractionOffset_;
}
@@ -1209,17 +1211,17 @@ public final class RuleBasedCollator extends Collator
if (m_expansionEndCEMaxSize_[i] != other.m_expansionEndCEMaxSize_[i]) {
return false;
}
- }
+ }
return true;
}
/**
* Generates a unique hash code for this RuleBasedCollator.
+ *
* @return the unique hash code for this Collator
* @stable ICU 2.8
*/
- public int hashCode()
- {
+ public int hashCode() {
String rules = getRules();
if (rules == null) {
rules = "";
@@ -1228,72 +1230,62 @@ public final class RuleBasedCollator extends Collator
}
/**
- * Compares the source text String to the target text String according to
- * the collation rules, strength and decomposition mode for this
- * RuleBasedCollator.
- * Returns an integer less than,
- * equal to or greater than zero depending on whether the source String is
- * less than, equal to or greater than the target String. See the Collator
- * class description for an example of use.
- *
+ * Compares the source text String to the target text String according to the collation rules, strength and
+ * decomposition mode for this RuleBasedCollator. Returns an integer less than, equal to or greater than zero
+ * depending on whether the source String is less than, equal to or greater than the target String. See the Collator
+ * class description for an example of use.
*
* General recommendation:
- * If comparison are to be done to the same String multiple times, it would
- * be more efficient to generate CollationKeys for the Strings and use
- * CollationKey.compareTo(CollationKey) for the comparisons.
- * If speed performance is critical and object instantiation is to be
- * reduced, further optimization may be achieved by generating a simpler
- * key of the form RawCollationKey and reusing this RawCollationKey
- * object with the method RuleBasedCollator.getRawCollationKey. Internal
- * byte representation can be directly accessed via RawCollationKey and
- * stored for future use. Like CollationKey, RawCollationKey provides a
- * method RawCollationKey.compareTo for key comparisons.
- * If the each Strings are compared to only once, using the method
- * RuleBasedCollator.compare(String, String) will have a better performance.
+ * If comparison are to be done to the same String multiple times, it would be more efficient to generate
+ * CollationKeys for the Strings and use CollationKey.compareTo(CollationKey) for the comparisons. If speed
+ * performance is critical and object instantiation is to be reduced, further optimization may be achieved by
+ * generating a simpler key of the form RawCollationKey and reusing this RawCollationKey object with the method
+ * RuleBasedCollator.getRawCollationKey. Internal byte representation can be directly accessed via RawCollationKey
+ * and stored for future use. Like CollationKey, RawCollationKey provides a method RawCollationKey.compareTo for key
+ * comparisons. If each String is compared only once, using the method RuleBasedCollator.compare(String,
+ * String) will have better performance.
*
- * @param source the source text String.
- * @param target the target text String.
- * @return Returns an integer value. Value is less than zero if source is
- * less than target, value is zero if source and target are equal,
- * value is greater than zero if source is greater than target.
+ *
+ * @param source
+ * the source text String.
+ * @param target
+ * the target text String.
+ * @return Returns an integer value. Value is less than zero if source is less than target, value is zero if source
+ * and target are equal, value is greater than zero if source is greater than target.
* @see CollationKey
* @see #getCollationKey
* @stable ICU 2.8
*/
- public int compare(String source, String target)
- {
+ public int compare(String source, String target) {
if (source == target) {
return 0;
}
// Find the length of any leading portion that is equal
int offset = getFirstUnmatchedOffset(source, target);
- //return compareRegular(source, target, offset);
- if(latinOneUse_) {
- if ((offset < source.length()
- && source.charAt(offset) > ENDOFLATINONERANGE_)
- || (offset < target.length()
- && target.charAt(offset) > ENDOFLATINONERANGE_)) {
- // source or target start with non-latin-1
- return compareRegular(source, target, offset);
- } else {
- return compareUseLatin1(source, target, offset);
- }
+ // return compareRegular(source, target, offset);
+ if (latinOneUse_) {
+ if ((offset < source.length() && source.charAt(offset) > ENDOFLATINONERANGE_)
+ || (offset < target.length() && target.charAt(offset) > ENDOFLATINONERANGE_)) {
+ // source or target start with non-latin-1
+ return compareRegular(source, target, offset);
+ } else {
+ return compareUseLatin1(source, target, offset);
+ }
} else {
- return compareRegular(source, target, offset);
+ return compareRegular(source, target, offset);
}
}
-
+
// package private inner interfaces --------------------------------------
/**
* Attribute values to be used when setting the Collator options
*/
- static interface AttributeValue
- {
+ static interface AttributeValue {
/**
- * Indicates that the default attribute value will be used.
- * See individual attribute for details on its default value.
+ * Indicates that the default attribute value will be used. See individual attribute for details on its default
+ * value.
*/
static final int DEFAULT_ = -1;
/**
@@ -1329,13 +1321,12 @@ public final class RuleBasedCollator extends Collator
*/
static final int STRENGTH_LIMIT_ = Collator.IDENTICAL + 1;
/**
- * Turn the feature off - works for FRENCH_COLLATION, CASE_LEVEL,
- * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE
+ * Turn the feature off - works for FRENCH_COLLATION, CASE_LEVEL, HIRAGANA_QUATERNARY_MODE and
+ * DECOMPOSITION_MODE
*/
static final int OFF_ = 16;
/**
- * Turn the feature on - works for FRENCH_COLLATION, CASE_LEVEL,
- * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE
+ * Turn the feature on - works for FRENCH_COLLATION, CASE_LEVEL, HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE
*/
static final int ON_ = 17;
/**
@@ -1343,8 +1334,7 @@ public final class RuleBasedCollator extends Collator
*/
static final int SHIFTED_ = 20;
/**
- * Valid for ALTERNATE_HANDLING. Alternate handling will be non
- * ignorable
+ * Valid for ALTERNATE_HANDLING. Alternate handling will be non ignorable
*/
static final int NON_IGNORABLE_ = 21;
/**
@@ -1362,73 +1352,55 @@ public final class RuleBasedCollator extends Collator
}
/**
- * Attributes that collation service understands. All the attributes can
- * take DEFAULT value, as well as the values specific to each one.
+ * Attributes that collation service understands. All the attributes can take DEFAULT value, as well as the values
+ * specific to each one.
*/
- static interface Attribute
- {
+ static interface Attribute {
/**
- * Attribute for direction of secondary weights - used in French.
- * Acceptable values are ON, which results in secondary weights being
- * considered backwards and OFF which treats secondary weights in the
- * order they appear.
+ * Attribute for direction of secondary weights - used in French. Acceptable values are ON, which results in
+ * secondary weights being considered backwards and OFF which treats secondary weights in the order they appear.
*/
static final int FRENCH_COLLATION_ = 0;
/**
- * Attribute for handling variable elements. Acceptable values are
- * NON_IGNORABLE (default) which treats all the codepoints with
- * non-ignorable primary weights in the same way, and SHIFTED which
- * causes codepoints with primary weights that are equal or below the
- * variable top value to be ignored on primary level and moved to the
- * quaternary level.
+ * Attribute for handling variable elements. Acceptable values are NON_IGNORABLE (default) which treats all the
+ * codepoints with non-ignorable primary weights in the same way, and SHIFTED which causes codepoints with
+ * primary weights that are equal or below the variable top value to be ignored on primary level and moved to
+ * the quaternary level.
*/
static final int ALTERNATE_HANDLING_ = 1;
/**
- * Controls the ordering of upper and lower case letters. Acceptable
- * values are OFF (default), which orders upper and lower case letters
- * in accordance to their tertiary weights, UPPER_FIRST which forces
- * upper case letters to sort before lower case letters, and
- * LOWER_FIRST which does the opposite.
+ * Controls the ordering of upper and lower case letters. Acceptable values are OFF (default), which orders
+ * upper and lower case letters in accordance to their tertiary weights, UPPER_FIRST which forces upper case
+ * letters to sort before lower case letters, and LOWER_FIRST which does the opposite.
*/
static final int CASE_FIRST_ = 2;
/**
- * Controls whether an extra case level (positioned before the third
- * level) is generated or not. Acceptable values are OFF (default),
- * when case level is not generated, and ON which causes the case
- * level to be generated. Contents of the case level are affected by
- * the value of CASE_FIRST attribute. A simple way to ignore accent
- * differences in a string is to set the strength to PRIMARY and
- * enable case level.
+ * Controls whether an extra case level (positioned before the third level) is generated or not. Acceptable
+ * values are OFF (default), when case level is not generated, and ON which causes the case level to be
+ * generated. Contents of the case level are affected by the value of CASE_FIRST attribute. A simple way to
+ * ignore accent differences in a string is to set the strength to PRIMARY and enable case level.
*/
static final int CASE_LEVEL_ = 3;
/**
- * Controls whether the normalization check and necessary
- * normalizations are performed. When set to OFF (default) no
- * normalization check is performed. The correctness of the result is
- * guaranteed only if the input data is in so-called FCD form (see
- * users manual for more info). When set to ON, an incremental check
- * is performed to see whether the input data is in the FCD form. If
- * the data is not in the FCD form, incremental NFD normalization is
- * performed.
+ * Controls whether the normalization check and necessary normalizations are performed. When set to OFF
+ * (default) no normalization check is performed. The correctness of the result is guaranteed only if the input
+ * data is in so-called FCD form (see users manual for more info). When set to ON, an incremental check is
+ * performed to see whether the input data is in the FCD form. If the data is not in the FCD form, incremental
+ * NFD normalization is performed.
*/
static final int NORMALIZATION_MODE_ = 4;
/**
- * The strength attribute. Can be either PRIMARY, SECONDARY, TERTIARY,
- * QUATERNARY or IDENTICAL. The usual strength for most locales
- * (except Japanese) is tertiary. Quaternary strength is useful when
- * combined with shifted setting for alternate handling attribute and
- * for JIS x 4061 collation, when it is used to distinguish between
- * Katakana and Hiragana (this is achieved by setting the
- * HIRAGANA_QUATERNARY mode to on. Otherwise, quaternary level is
- * affected only by the number of non ignorable code points in the
- * string. Identical strength is rarely useful, as it amounts to
- * codepoints of the NFD form of the string.
+ * The strength attribute. Can be either PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL. The usual
+ * strength for most locales (except Japanese) is tertiary. Quaternary strength is useful when combined with
+ * shifted setting for alternate handling attribute and for JIS x 4061 collation, when it is used to distinguish
+ * between Katakana and Hiragana (this is achieved by setting the HIRAGANA_QUATERNARY mode to on). Otherwise,
+ * quaternary level is affected only by the number of non ignorable code points in the string. Identical
+ * strength is rarely useful, as it amounts to codepoints of the NFD form of the string.
*/
static final int STRENGTH_ = 5;
/**
- * When turned on, this attribute positions Hiragana before all
- * non-ignorables on quaternary level. This is a sneaky way to produce
- * JIS sort order.
+ * When turned on, this attribute positions Hiragana before all non-ignorables on quaternary level. This is a
+ * sneaky way to produce JIS sort order.
*/
static final int HIRAGANA_QUATERNARY_MODE_ = 6;
/**
@@ -1440,19 +1412,18 @@ public final class RuleBasedCollator extends Collator
/**
* DataManipulate singleton
*/
- static class DataManipulate implements Trie.DataManipulate
- {
+ static class DataManipulate implements Trie.DataManipulate {
// public methods ----------------------------------------------------
/**
- * Internal method called to parse a lead surrogate's ce for the offset
- * to the next trail surrogate data.
- * @param ce collation element of the lead surrogate
+ * Internal method called to parse a lead surrogate's ce for the offset to the next trail surrogate data.
+ *
+ * @param ce
+ * collation element of the lead surrogate
* @return data offset or 0 for the next trail surrogate
* @stable ICU 2.8
*/
- public final int getFoldingOffset(int ce)
- {
+ public final int getFoldingOffset(int ce) {
if (isSpecial(ce) && getTag(ce) == CE_SURROGATE_TAG_) {
return (ce & 0xFFFFFF);
}
@@ -1462,10 +1433,9 @@ public final class RuleBasedCollator extends Collator
/**
* Get singleton object
*/
- public static final DataManipulate getInstance()
- {
+ public static final DataManipulate getInstance() {
if (m_instance_ == null) {
- m_instance_ = new DataManipulate();
+ m_instance_ = new DataManipulate();
}
return m_instance_;
}
@@ -1482,44 +1452,172 @@ public final class RuleBasedCollator extends Collator
/**
* private to prevent initialization
*/
- private DataManipulate()
- {
+ private DataManipulate() {
}
}
/**
* UCAConstants
*/
- static final class UCAConstants
- {
- int FIRST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000
- int LAST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000
- int FIRST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x00008705
- int FIRST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000000
- int LAST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000500
- int LAST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x0000DD05
- int FIRST_VARIABLE_[] = new int[2]; // 0x05070505
- int LAST_VARIABLE_[] = new int[2]; // 0x13CF0505
- int FIRST_NON_VARIABLE_[] = new int[2]; // 0x16200505
- int LAST_NON_VARIABLE_[] = new int[2]; // 0x767C0505
- int RESET_TOP_VALUE_[] = new int[2]; // 0x9F000303
- int FIRST_IMPLICIT_[] = new int[2];
- int LAST_IMPLICIT_[] = new int[2];
- int FIRST_TRAILING_[] = new int[2];
- int LAST_TRAILING_[] = new int[2];
- int PRIMARY_TOP_MIN_;
- int PRIMARY_IMPLICIT_MIN_; // 0xE8000000
- int PRIMARY_IMPLICIT_MAX_; // 0xF0000000
- int PRIMARY_TRAILING_MIN_; // 0xE8000000
- int PRIMARY_TRAILING_MAX_; // 0xF0000000
- int PRIMARY_SPECIAL_MIN_; // 0xE8000000
- int PRIMARY_SPECIAL_MAX_; // 0xF0000000
+ static final class UCAConstants {
+ int FIRST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000
+ int LAST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000
+ int FIRST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x00008705
+ int FIRST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000000
+ int LAST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000500
+ int LAST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x0000DD05
+ int FIRST_VARIABLE_[] = new int[2]; // 0x05070505
+ int LAST_VARIABLE_[] = new int[2]; // 0x13CF0505
+ int FIRST_NON_VARIABLE_[] = new int[2]; // 0x16200505
+ int LAST_NON_VARIABLE_[] = new int[2]; // 0x767C0505
+ int RESET_TOP_VALUE_[] = new int[2]; // 0x9F000303
+ int FIRST_IMPLICIT_[] = new int[2];
+ int LAST_IMPLICIT_[] = new int[2];
+ int FIRST_TRAILING_[] = new int[2];
+ int LAST_TRAILING_[] = new int[2];
+ int PRIMARY_TOP_MIN_;
+ int PRIMARY_IMPLICIT_MIN_; // 0xE8000000
+ int PRIMARY_IMPLICIT_MAX_; // 0xF0000000
+ int PRIMARY_TRAILING_MIN_; // 0xE8000000
+ int PRIMARY_TRAILING_MAX_; // 0xF0000000
+ int PRIMARY_SPECIAL_MIN_; // 0xE8000000
+ int PRIMARY_SPECIAL_MAX_; // 0xF0000000
+ }
+
+    /**
+     * Script (reorder code) to lead byte, and lead byte to script (reorder code), lookup data.
+     * <p>
+     * Both directions share one layout: an index of 16-bit entries plus a data area of big-endian 16-bit words. An
+     * index entry of 0 means "no data"; an entry with {@code DATA_MASK_FOR_INDEX} set carries a single value inline
+     * in its low 15 bits; any other entry is a word offset into the data area, where a length word is followed by
+     * that many value words.
+     */
+    static final class LeadByteConstants {
+        /** Flag bit of an index entry: the remaining bits are the single value itself, not a data offset. */
+        private static final int DATA_MASK_FOR_INDEX = 0x8000;
+        /** Shared result for "nothing found"; zero-length, so it cannot be modified by callers. */
+        private static final int[] EMPTY_INT_ARRAY = new int[0];
+
+        /** Number of bytes consumed by the most recent call to {@code read}. */
+        private int serializedSize = 0;
+
+        /** Reorder code to index entry (see the class comment for the entry encoding). */
+        private Map<Integer, Integer> SCRIPT_TO_LEAD_BYTES_INDEX;
+        private byte[] SCRIPT_TO_LEAD_BYTES_DATA;
+
+        /** Index entry per lead byte, addressed by the lead byte value. */
+        private int[] LEAD_BYTE_TO_SCRIPTS_INDEX;
+        private byte[] LEAD_BYTE_TO_SCRIPTS_DATA;
+
+        LeadByteConstants() {
+        }
+
+        /**
+         * Reads both lookup tables from the stream and records how many bytes were consumed.
+         *
+         * @param dis
+         *            stream positioned at the start of the script-to-lead-bytes table
+         * @throws IOException
+         *             if the stream cannot be read or ends before both tables are complete
+         */
+        void read(DataInputStream dis) throws IOException {
+            int readcount = 0;
+
+            // script to lead bytes: two counts, then (reorder code, index entry) pairs, then the data words.
+            // Counts and sizes are read unsigned so that values >= 0x8000 cannot turn into negative array sizes.
+            int indexCount = dis.readUnsignedShort();
+            int dataSize = dis.readUnsignedShort();
+            readcount += 4;
+            this.SCRIPT_TO_LEAD_BYTES_INDEX = new HashMap<Integer, Integer>();
+            for (int index = 0; index < indexCount; index++) {
+                int reorderCode = dis.readShort(); // kept signed, exactly as before
+                int indexEntry = dis.readUnsignedShort();
+                readcount += 4;
+                this.SCRIPT_TO_LEAD_BYTES_INDEX.put(reorderCode, indexEntry);
+            }
+
+            this.SCRIPT_TO_LEAD_BYTES_DATA = new byte[dataSize * 2]; // dataSize counts 16-bit words
+            dis.readFully(this.SCRIPT_TO_LEAD_BYTES_DATA);
+            readcount += this.SCRIPT_TO_LEAD_BYTES_DATA.length;
+
+            // lead byte to scripts: two counts, then one index entry per lead byte, then the data words
+            indexCount = dis.readUnsignedShort();
+            dataSize = dis.readUnsignedShort();
+            readcount += 4;
+            this.LEAD_BYTE_TO_SCRIPTS_INDEX = new int[indexCount];
+            for (int index = 0; index < indexCount; index++) {
+                this.LEAD_BYTE_TO_SCRIPTS_INDEX[index] = dis.readUnsignedShort();
+                readcount += 2;
+            }
+
+            this.LEAD_BYTE_TO_SCRIPTS_DATA = new byte[dataSize * 2];
+            dis.readFully(this.LEAD_BYTE_TO_SCRIPTS_DATA);
+            readcount += this.LEAD_BYTE_TO_SCRIPTS_DATA.length;
+
+            this.serializedSize = readcount;
+        }
+
+        /**
+         * @return the number of bytes consumed by the most recent call to {@code read}, or 0 if it was never called
+         */
+        int getSerializedDataSize() {
+            return this.serializedSize;
+        }
+
+        /**
+         * Looks up the reorder codes recorded for a lead byte.
+         *
+         * @param leadByte
+         *            lead byte value, used as an index into the lead-byte table
+         * @return the reorder codes, or an empty array if the lead byte is outside the table or has no data
+         */
+        int[] getReorderCodesForLeadByte(int leadByte) {
+            if (leadByte < 0 || leadByte >= this.LEAD_BYTE_TO_SCRIPTS_INDEX.length) {
+                return EMPTY_INT_ARRAY;
+            }
+            return decodeIndexEntry(this.LEAD_BYTE_TO_SCRIPTS_INDEX[leadByte], this.LEAD_BYTE_TO_SCRIPTS_DATA);
+        }
+
+        /**
+         * Looks up the lead bytes recorded for a reorder code.
+         *
+         * @param reorderCode
+         *            reorder code as stored in the script-to-lead-bytes index
+         * @return the lead bytes, or an empty array if the reorder code is unknown or has no data
+         */
+        int[] getLeadBytesForReorderCode(int reorderCode) {
+            Integer indexEntry = this.SCRIPT_TO_LEAD_BYTES_INDEX.get(reorderCode);
+            if (indexEntry == null) {
+                return EMPTY_INT_ARRAY;
+            }
+            return decodeIndexEntry(indexEntry.intValue(), this.SCRIPT_TO_LEAD_BYTES_DATA);
+        }
+
+        /**
+         * Expands one index entry into its values; shared by both lookup directions so they cannot drift apart.
+         */
+        private static int[] decodeIndexEntry(int indexEntry, byte[] data) {
+            if (indexEntry == 0) {
+                return EMPTY_INT_ARRAY;
+            }
+            if ((indexEntry & DATA_MASK_FOR_INDEX) == DATA_MASK_FOR_INDEX) {
+                // Single value stored inline. This must return here: the entry is not a valid data offset.
+                return new int[] { indexEntry & ~DATA_MASK_FOR_INDEX };
+            }
+
+            int length = readShort(data, indexEntry);
+            int[] values = new int[length];
+            for (int i = 0; i < length; i++) {
+                values[i] = readShort(data, indexEntry + 1 + i);
+            }
+            return values;
+        }
+
+        /**
+         * Reads the unsigned big-endian 16-bit word at the given word offset.
+         */
+        private static int readShort(byte[] data, int offset) {
+            // Mask each byte: Java bytes are signed, and an unmasked byte >= 0x80 would sign-extend over the result.
+            return ((data[offset * 2] & 0xff) << 8) | (data[offset * 2 + 1] & 0xff);
+        }
+    }
// package private data member -------------------------------------------
- static final byte BYTE_FIRST_TAILORED_ = (byte)0x04;
- static final byte BYTE_COMMON_ = (byte)0x05;
+ static final byte BYTE_FIRST_TAILORED_ = (byte) 0x04;
+ static final byte BYTE_COMMON_ = (byte) 0x05;
static final int COMMON_TOP_2_ = 0x86; // int for unsigness
static final int COMMON_BOTTOM_2_ = BYTE_COMMON_;
static final int COMMON_BOTTOM_3 = 0x05;
@@ -1561,19 +1659,15 @@ public final class RuleBasedCollator extends Collator
static final int CE_CONTINUATION_MARKER_ = 0xC0;
/**
- * Size of collator raw data headers and options before the expansion
- * data. This is used when expansion ces are to be retrieved. ICU4C uses
- * the expansion offset starting from UCollator.UColHeader, hence ICU4J
- * will have to minus that off to get the right expansion ce offset. In
- * number of ints.
+ * Size of collator raw data headers and options before the expansion data. This is used when expansion ces are to
+ * be retrieved. ICU4C uses the expansion offset starting from UCollator.UColHeader, hence ICU4J will have to minus
+ * that off to get the right expansion ce offset. In number of ints.
*/
int m_expansionOffset_;
/**
- * Size of collator raw data headers, options and expansions before
- * contraction data. This is used when contraction ces are to be retrieved.
- * ICU4C uses contraction offset starting from UCollator.UColHeader, hence
- * ICU4J will have to minus that off to get the right contraction ce
- * offset. In number of chars.
+ * Size of collator raw data headers, options and expansions before contraction data. This is used when contraction
+ * ces are to be retrieved. ICU4C uses contraction offset starting from UCollator.UColHeader, hence ICU4J will have
+ * to minus that off to get the right contraction ce offset. In number of chars.
*/
int m_contractionOffset_;
/**
@@ -1582,7 +1676,7 @@ public final class RuleBasedCollator extends Collator
boolean m_isJamoSpecial_;
// Collator options ------------------------------------------------------
-
+
int m_defaultVariableTopValue_;
boolean m_defaultIsFrenchCollation_;
boolean m_defaultIsAlternateHandlingShifted_;
@@ -1592,7 +1686,8 @@ public final class RuleBasedCollator extends Collator
int m_defaultStrength_;
boolean m_defaultIsHiragana4_;
boolean m_defaultIsNumericCollation_;
-
+ int[] m_defaultScriptOrder_;
+
/**
* Value of the variable top
*/
@@ -1609,6 +1704,10 @@ public final class RuleBasedCollator extends Collator
* Numeric collation option
*/
boolean m_isNumericCollation_;
+ /**
+ * Script order
+ */
+ int[] m_scriptOrder_;
// end Collator options --------------------------------------------------
@@ -1629,28 +1728,23 @@ public final class RuleBasedCollator extends Collator
*/
IntTrie m_trie_;
/**
- * Table to store all collation elements that are the last element of an
- * expansion. This is for use in StringSearch.
+ * Table to store all collation elements that are the last element of an expansion. This is for use in StringSearch.
*/
int m_expansionEndCE_[];
/**
- * Table to store the maximum size of any expansions that end with the
- * corresponding collation element in m_expansionEndCE_. For use in
- * StringSearch too
+ * Table to store the maximum size of any expansions that end with the corresponding collation element in
+ * m_expansionEndCE_. For use in StringSearch too
*/
byte m_expansionEndCEMaxSize_[];
/**
- * Heuristic table to store information on whether a char character is
- * considered "unsafe". "Unsafe" character are combining marks or those
- * belonging to some contraction sequence from the offset 1 onwards.
- * E.g. if "ABC" is the only contraction, then 'B' and 'C' are considered
- * unsafe. If we have another contraction "ZA" with the one above, then
- * 'A', 'B', 'C' are "unsafe" but 'Z' is not.
+ * Heuristic table to store information on whether a char character is considered "unsafe". "Unsafe" character are
+ * combining marks or those belonging to some contraction sequence from the offset 1 onwards. E.g. if "ABC" is the
+ * only contraction, then 'B' and 'C' are considered unsafe. If we have another contraction "ZA" with the one above,
+ * then 'A', 'B', 'C' are "unsafe" but 'Z' is not.
*/
byte m_unsafe_[];
/**
- * Table to store information on whether a codepoint can occur as the last
- * character in a contraction
+ * Table to store information on whether a codepoint can occur as the last character in a contraction
*/
byte m_contractionEnd_[];
/**
@@ -1677,7 +1771,11 @@ public final class RuleBasedCollator extends Collator
* UCD version
*/
VersionInfo m_UCD_version_;
-
+ /**
+ * Lead byte and script data
+ */
+ int m_leadByteToScripts;
+ int m_scriptToLeadBytes;
/**
* UnicodeData.txt property object
*/
@@ -1686,6 +1784,10 @@ public final class RuleBasedCollator extends Collator
* UCA Constants
*/
static final UCAConstants UCA_CONSTANTS_;
+ /**
+ * Lead Byte Constants
+ */
+ static LeadByteConstants LEADBYTE_CONSTANTS_;
/**
* Table for UCA and builder use
*/
@@ -1700,108 +1802,106 @@ public final class RuleBasedCollator extends Collator
static final byte SORT_LEVEL_TERMINATOR_ = 1;
-// These are values from UCA required for
-// implicit generation and supressing sort key compression
-// they should regularly be in the UCA, but if one
-// is running without UCA, it could be a problem
- static final int maxRegularPrimary = 0x7A;
- static final int minImplicitPrimary = 0xE0;
- static final int maxImplicitPrimary = 0xE4;
-
+ // These are values from UCA required for
+ // implicit generation and suppressing sort key compression
+ // they should regularly be in the UCA, but if one
+ // is running without UCA, it could be a problem
+ static final int maxRegularPrimary = 0x7A;
+ static final int minImplicitPrimary = 0xE0;
+ static final int maxImplicitPrimary = 0xE4;
// block to initialise character property database
- static
- {
+ static {
// take pains to let static class init succeed, otherwise the class itself won't exist and
- // clients will get a NoClassDefFoundException. Instead, make the constructors fail if
+ // clients will get a NoClassDefFoundException. Instead, make the constructors fail if
// we can't load the UCA data.
RuleBasedCollator iUCA_ = null;
UCAConstants iUCA_CONSTANTS_ = null;
+ LeadByteConstants iLEADBYTE_CONSTANTS = null;
char iUCA_CONTRACTIONS_[] = null;
ImplicitCEGenerator iimpCEGen_ = null;
- try
- {
+ try {
// !!! note what's going on here...
// even though the static init of the class is not yet complete, we
- // instantiate an instance of the class. So we'd better be sure that
+ // instantiate an instance of the class. So we'd better be sure that
// instantiation doesn't rely on the static initialization that's
// not complete yet!
iUCA_ = new RuleBasedCollator();
iUCA_CONSTANTS_ = new UCAConstants();
- iUCA_CONTRACTIONS_ = CollatorReader.read(iUCA_, iUCA_CONSTANTS_);
+ iLEADBYTE_CONSTANTS = new LeadByteConstants();
+ iUCA_CONTRACTIONS_ = CollatorReader.read(iUCA_, iUCA_CONSTANTS_, iLEADBYTE_CONSTANTS);
// called before doing canonical closure for the UCA.
iimpCEGen_ = new ImplicitCEGenerator(minImplicitPrimary, maxImplicitPrimary);
- //iimpCEGen_ = new ImplicitCEGenerator(iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_, iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_);
+ // iimpCEGen_ = new ImplicitCEGenerator(iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_,
+ // iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_);
iUCA_.init();
- ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_COLLATION_BASE_NAME, ULocale.ENGLISH);
- iUCA_.m_rules_ = (String)rb.getObject("UCARules");
- }
- catch (MissingResourceException ex)
- {
-// throw ex;
- }
- catch (IOException e)
- {
- // e.printStackTrace();
-// throw new MissingResourceException(e.getMessage(),"","");
+ ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance(
+ ICUResourceBundle.ICU_COLLATION_BASE_NAME, ULocale.ENGLISH);
+ iUCA_.m_rules_ = (String) rb.getObject("UCARules");
+ } catch (MissingResourceException ex) {
+ // Deliberately ignored: static initialization must not fail (see the note at the top of this block).
+ // throw ex;
+ } catch (IOException e) {
+ // Deliberately ignored: static initialization must not fail (see the note at the top of this block).
+ // e.printStackTrace();
+ // throw new MissingResourceException(e.getMessage(),"","");
}
UCA_ = iUCA_;
UCA_CONSTANTS_ = iUCA_CONSTANTS_;
+ LEADBYTE_CONSTANTS_ = iLEADBYTE_CONSTANTS;
UCA_CONTRACTIONS_ = iUCA_CONTRACTIONS_;
impCEGen_ = iimpCEGen_;
UCA_INIT_COMPLETE = true;
}
-
private static void checkUCA() throws MissingResourceException {
if (UCA_INIT_COMPLETE && UCA_ == null) {
throw new MissingResourceException("Collator UCA data unavailable", "", "");
}
}
-
+
// package private constructors ------------------------------------------
/**
- *
Private contructor for use by subclasses.
- * Public access to creating Collators is handled by the API
- * Collator.getInstance() or RuleBasedCollator(String rules).
- *
- *
- * This constructor constructs the UCA collator internally
- *
- */
- RuleBasedCollator()
- {
+ *
+ * Private constructor for use by subclasses. Public access to creating Collators is handled by the API
+ * Collator.getInstance() or RuleBasedCollator(String rules).
+ *
+ *
+ * This constructor constructs the UCA collator internally
+ *
+ */
+ RuleBasedCollator() {
checkUCA();
initUtility(false);
}
/**
- * Constructors a RuleBasedCollator from the argument locale.
- * If no resource bundle is associated with the locale, UCA is used
- * instead.
+ * Constructs a RuleBasedCollator from the argument locale. If no resource bundle is associated with the locale,
+ * UCA is used instead.
+ *
* @param locale
*/
- RuleBasedCollator(ULocale locale)
- {
+ RuleBasedCollator(ULocale locale) {
checkUCA();
- ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_COLLATION_BASE_NAME, locale);
+ ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance(
+ ICUResourceBundle.ICU_COLLATION_BASE_NAME, locale);
initUtility(false);
if (rb != null) {
try {
// Use keywords, if supplied for lookup
String collkey = locale.getKeywordValue("collation");
- if(collkey == null) {
- collkey = rb.getStringWithFallback("collations/default");
+ if (collkey == null) {
+ collkey = rb.getStringWithFallback("collations/default");
}
-
+
// collations/default will always give a string back
// keyword for the real collation data
- // if "collations/collkey" will return null if collkey == null
+ // if "collations/collkey" will return null if collkey == null
ICUResourceBundle elements = rb.getWithFallback("collations/" + collkey);
if (elements != null) {
// TODO: Determine actual & valid locale correctly
@@ -1811,44 +1911,31 @@ public final class RuleBasedCollator extends Collator
m_rules_ = elements.getString("Sequence");
ByteBuffer buf = elements.get("%%CollationBin").getBinary();
// %%CollationBin
- if(buf!=null){
- // m_rules_ = (String)rules[1][1];
+ if (buf != null) {
+ // m_rules_ = (String)rules[1][1];
CollatorReader.initRBC(this, buf);
/*
- BufferedInputStream input =
- new BufferedInputStream(
- new ByteArrayInputStream(map));
- /*
- CollatorReader reader = new CollatorReader(input, false);
- if (map.length > MIN_BINARY_DATA_SIZE_) {
- reader.read(this, null);
- }
- else {
- reader.readHeader(this);
- reader.readOptions(this);
- // duplicating UCA_'s data
- setWithUCATables();
- }
- */
+ * BufferedInputStream input = new BufferedInputStream( new ByteArrayInputStream(map)); /*
+ * CollatorReader reader = new CollatorReader(input, false); if (map.length >
+ * MIN_BINARY_DATA_SIZE_) { reader.read(this, null); } else { reader.readHeader(this);
+ * reader.readOptions(this); // duplicating UCA_'s data setWithUCATables(); }
+ */
// at this point, we have read in the collator
// now we need to check whether the binary image has
// the right UCA and other versions
- if(!m_UCA_version_.equals(UCA_.m_UCA_version_) ||
- !m_UCD_version_.equals(UCA_.m_UCD_version_)) {
+ if (!m_UCA_version_.equals(UCA_.m_UCA_version_) || !m_UCD_version_.equals(UCA_.m_UCD_version_)) {
init(m_rules_);
return;
}
init();
return;
- }
- else {
+ } else {
init(m_rules_);
return;
}
}
- }
- catch (Exception e) {
- // e.printStackTrace();
+ } catch (Exception e) {
+ e.printStackTrace();
// if failed use UCA.
}
}
@@ -1858,11 +1945,9 @@ public final class RuleBasedCollator extends Collator
// package private methods -----------------------------------------------
/**
- * Sets this collator to use the tables in UCA. Note options not taken
- * care of here.
+ * Sets this collator to use the tables in UCA. Note options not taken care of here.
*/
- final void setWithUCATables()
- {
+ final void setWithUCATables() {
m_contractionOffset_ = UCA_.m_contractionOffset_;
m_expansionOffset_ = UCA_.m_expansionOffset_;
m_expansion_ = UCA_.m_expansion_;
@@ -1880,8 +1965,7 @@ public final class RuleBasedCollator extends Collator
/**
* Sets this collator to use the all options and tables in UCA.
*/
- final void setWithUCAData()
- {
+ final void setWithUCAData() {
latinOneFailed_ = true;
m_addition3_ = UCA_.m_addition3_;
@@ -1894,8 +1978,7 @@ public final class RuleBasedCollator extends Collator
setDecomposition(UCA_.getDecomposition());
m_defaultCaseFirst_ = UCA_.m_defaultCaseFirst_;
m_defaultDecomposition_ = UCA_.m_defaultDecomposition_;
- m_defaultIsAlternateHandlingShifted_
- = UCA_.m_defaultIsAlternateHandlingShifted_;
+ m_defaultIsAlternateHandlingShifted_ = UCA_.m_defaultIsAlternateHandlingShifted_;
m_defaultIsCaseLevel_ = UCA_.m_defaultIsCaseLevel_;
m_defaultIsFrenchCollation_ = UCA_.m_defaultIsFrenchCollation_;
m_defaultIsHiragana4_ = UCA_.m_defaultIsHiragana4_;
@@ -1923,25 +2006,23 @@ public final class RuleBasedCollator extends Collator
}
/**
- * Test whether a char character is potentially "unsafe" for use as a
- * collation starting point. "Unsafe" characters are combining marks or
- * those belonging to some contraction sequence from the offset 1 onwards.
- * E.g. if "ABC" is the only contraction, then 'B' and
- * 'C' are considered unsafe. If we have another contraction "ZA" with
- * the one above, then 'A', 'B', 'C' are "unsafe" but 'Z' is not.
- * @param ch character to determin
+ * Test whether a char character is potentially "unsafe" for use as a collation starting point. "Unsafe" characters
+ * are combining marks or those belonging to some contraction sequence from the offset 1 onwards. E.g. if "ABC" is
+ * the only contraction, then 'B' and 'C' are considered unsafe. If we have another contraction "ZA" with the one
+ * above, then 'A', 'B', 'C' are "unsafe" but 'Z' is not.
+ *
+ * @param ch
+ * character to test
* @return true if ch is unsafe, false otherwise
*/
- final boolean isUnsafe(char ch)
- {
+ final boolean isUnsafe(char ch) {
if (ch < m_minUnsafe_) {
return false;
}
-
+
if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {
- if (UTF16.isLeadSurrogate(ch)
- || UTF16.isTrailSurrogate(ch)) {
- // Trail surrogate are always considered unsafe.
+ if (UTF16.isLeadSurrogate(ch) || UTF16.isTrailSurrogate(ch)) {
+ // Lead and trail surrogates are always considered unsafe.
return true;
}
ch &= HEURISTIC_OVERFLOW_MASK_;
@@ -1952,13 +2033,13 @@ public final class RuleBasedCollator extends Collator
}
/**
- * Approximate determination if a char character is at a contraction end.
- * Guaranteed to be true if a character is at the end of a contraction,
- * otherwise it is not deterministic.
- * @param ch character to be determined
+ * Approximate determination if a char character is at a contraction end. Guaranteed to be true if a character is at
+ * the end of a contraction, otherwise it is not deterministic.
+ *
+ * @param ch
+ * character to be determined
*/
- final boolean isContractionEnd(char ch)
- {
+ final boolean isContractionEnd(char ch) {
if (UTF16.isTrailSurrogate(ch)) {
return true;
}
@@ -1977,33 +2058,35 @@ public final class RuleBasedCollator extends Collator
/**
* Retrieve the tag of a special ce
- * @param ce ce to test
+ *
+ * @param ce
+ * ce to test
* @return tag of ce
*/
- static int getTag(int ce)
- {
+ static int getTag(int ce) {
return (ce & CE_TAG_MASK_) >> CE_TAG_SHIFT_;
}
/**
* Checking if ce is special
- * @param ce to check
+ *
+ * @param ce
+ * to check
* @return true if ce is special
*/
- static boolean isSpecial(int ce)
- {
+ static boolean isSpecial(int ce) {
return (ce & CE_SPECIAL_FLAG_) == CE_SPECIAL_FLAG_;
}
/**
* Checks if the argument ce is a continuation
- * @param ce collation element to test
+ *
+ * @param ce
+ * collation element to test
* @return true if ce is a continuation
*/
- static final boolean isContinuation(int ce)
- {
- return ce != CollationElementIterator.NULLORDER
- && (ce & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_;
+ static final boolean isContinuation(int ce) {
+ return ce != CollationElementIterator.NULLORDER && (ce & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_;
}
// private inner classes ------------------------------------------------
@@ -2011,20 +2094,16 @@ public final class RuleBasedCollator extends Collator
// private variables -----------------------------------------------------
/**
- * The smallest natural unsafe or contraction end char character before
- * tailoring.
- * This is a combining mark.
+ * The smallest natural unsafe or contraction end char character before tailoring. This is a combining mark.
*/
private static final int DEFAULT_MIN_HEURISTIC_ = 0x300;
/**
- * Heuristic table table size. Size is 32 bytes, 1 bit for each
- * latin 1 char, and some power of two for hashing the rest of the chars.
- * Size in bytes.
+ * Heuristic table size. Size is 32 bytes, 1 bit for each latin 1 char, and some power of two for hashing the
+ * rest of the chars. Size in bytes.
*/
private static final char HEURISTIC_SIZE_ = 1056;
/**
- * Mask value down to "some power of two" - 1,
- * number of bits, not num of bytes.
+ * Mask value down to "some power of two" - 1, number of bits, not num of bytes.
*/
private static final char HEURISTIC_OVERFLOW_MASK_ = 0x1fff;
/**
@@ -2032,8 +2111,7 @@ public final class RuleBasedCollator extends Collator
*/
private static final int HEURISTIC_SHIFT_ = 3;
/**
- * Unsafe character addition for character too large, it has to be folded
- * then incremented.
+ * Unsafe character addition for character too large, it has to be folded then incremented.
*/
private static final char HEURISTIC_OVERFLOW_OFFSET_ = 256;
/**
@@ -2058,6 +2136,10 @@ public final class RuleBasedCollator extends Collator
private int m_bottom3_;
private int m_topCount3_;
private int m_bottomCount3_;
+ /**
+ * Script reordering table
+ */
+ private byte[] m_leadBytePermutationTable_;
/**
* Case first constants
*/
@@ -2081,20 +2163,19 @@ public final class RuleBasedCollator extends Collator
// These values come from the UCA ----------------------------------------
/**
- * This is an enum that lists magic special byte values from the
- * fractional UCA
+ * This is an enum that lists magic special byte values from the fractional UCA
*/
- //private static final byte BYTE_ZERO_ = 0x0;
- //private static final byte BYTE_LEVEL_SEPARATOR_ = (byte)0x01;
- //private static final byte BYTE_SORTKEY_GLUE_ = (byte)0x02;
- private static final byte BYTE_SHIFT_PREFIX_ = (byte)0x03;
- /*private*/ static final byte BYTE_UNSHIFTED_MIN_ = BYTE_SHIFT_PREFIX_;
- //private static final byte BYTE_FIRST_UCA_ = BYTE_COMMON_;
- // TODO: Make the following values dynamic since they change with almost every UCA version.
+ // private static final byte BYTE_ZERO_ = 0x0;
+ // private static final byte BYTE_LEVEL_SEPARATOR_ = (byte)0x01;
+ // private static final byte BYTE_SORTKEY_GLUE_ = (byte)0x02;
+ private static final byte BYTE_SHIFT_PREFIX_ = (byte) 0x03;
+ /* private */static final byte BYTE_UNSHIFTED_MIN_ = BYTE_SHIFT_PREFIX_;
+ // private static final byte BYTE_FIRST_UCA_ = BYTE_COMMON_;
+ // TODO: Make the following values dynamic since they change with almost every UCA version.
static final byte CODAN_PLACEHOLDER = 0x12;
- private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte)0x5B;
+ private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte) 0x5B;
- private static final byte BYTE_UNSHIFTED_MAX_ = (byte)0xFF;
+ private static final byte BYTE_UNSHIFTED_MAX_ = (byte) 0xFF;
private static final int TOTAL_2_ = COMMON_TOP_2_ - COMMON_BOTTOM_2_ - 1;
private static final int FLAG_BIT_MASK_CASE_SWITCH_OFF_ = 0x80;
private static final int FLAG_BIT_MASK_CASE_SWITCH_ON_ = 0x40;
@@ -2103,22 +2184,18 @@ public final class RuleBasedCollator extends Collator
private static final int COMMON_TOP_CASE_SWITCH_UPPER_3_ = 0xC5;
private static final int COMMON_BOTTOM_3_ = 0x05;
private static final int COMMON_BOTTOM_CASE_SWITCH_UPPER_3_ = 0x86;
- private static final int COMMON_BOTTOM_CASE_SWITCH_LOWER_3_ =
- COMMON_BOTTOM_3_;
- private static final int TOP_COUNT_2_ = (int)(PROPORTION_2_ * TOTAL_2_);
+ private static final int COMMON_BOTTOM_CASE_SWITCH_LOWER_3_ = COMMON_BOTTOM_3_;
+ private static final int TOP_COUNT_2_ = (int) (PROPORTION_2_ * TOTAL_2_);
private static final int BOTTOM_COUNT_2_ = TOTAL_2_ - TOP_COUNT_2_;
private static final int COMMON_2_ = COMMON_BOTTOM_2_;
private static final int COMMON_UPPER_FIRST_3_ = 0xC5;
private static final int COMMON_NORMAL_3_ = COMMON_BOTTOM_3_;
- //private static final int COMMON_4_ = (byte)0xFF;
-
-
+ // private static final int COMMON_4_ = (byte)0xFF;
/*
- * Minimum size required for the binary collation data in bytes.
- * Size of UCA header + size of options to 4 bytes
+ * Minimum size required for the binary collation data in bytes. Size of UCA header + size of options to 4 bytes
*/
- //private static final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;
+ // private static final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;
/**
* If this collator is to generate only simple tertiaries for fast path
@@ -2130,9 +2207,8 @@ public final class RuleBasedCollator extends Collator
*/
private boolean m_isFrenchCollation_;
/**
- * Flag indicating if shifted is requested for Quaternary alternate
- * handling. If this is not true, the default for alternate handling will
- * be non-ignorable.
+ * Flag indicating if shifted is requested for Quaternary alternate handling. If this is not true, the default for
+ * alternate handling will be non-ignorable.
*/
private boolean m_isAlternateHandlingShifted_;
/**
@@ -2141,12 +2217,10 @@ public final class RuleBasedCollator extends Collator
private boolean m_isCaseLevel_;
private static final int SORT_BUFFER_INIT_SIZE_ = 128;
- private static final int SORT_BUFFER_INIT_SIZE_1_ =
- SORT_BUFFER_INIT_SIZE_ << 3;
+ private static final int SORT_BUFFER_INIT_SIZE_1_ = SORT_BUFFER_INIT_SIZE_ << 3;
private static final int SORT_BUFFER_INIT_SIZE_2_ = SORT_BUFFER_INIT_SIZE_;
private static final int SORT_BUFFER_INIT_SIZE_3_ = SORT_BUFFER_INIT_SIZE_;
- private static final int SORT_BUFFER_INIT_SIZE_CASE_ =
- SORT_BUFFER_INIT_SIZE_ >> 2;
+ private static final int SORT_BUFFER_INIT_SIZE_CASE_ = SORT_BUFFER_INIT_SIZE_ >> 2;
private static final int SORT_BUFFER_INIT_SIZE_4_ = SORT_BUFFER_INIT_SIZE_;
private static final int CE_CONTINUATION_TAG_ = 0xC0;
@@ -2154,11 +2228,11 @@ public final class RuleBasedCollator extends Collator
private static final int LAST_BYTE_MASK_ = 0xFF;
- //private static final int CE_RESET_TOP_VALUE_ = 0x9F000303;
- //private static final int CE_NEXT_TOP_VALUE_ = 0xE8960303;
+ // private static final int CE_RESET_TOP_VALUE_ = 0x9F000303;
+ // private static final int CE_NEXT_TOP_VALUE_ = 0xE8960303;
- private static final byte SORT_CASE_BYTE_START_ = (byte)0x80;
- private static final byte SORT_CASE_SHIFT_START_ = (byte)7;
+ private static final byte SORT_CASE_BYTE_START_ = (byte) 0x80;
+ private static final byte SORT_CASE_SHIFT_START_ = (byte) 7;
/**
* CE buffer size
@@ -2166,9 +2240,9 @@ public final class RuleBasedCollator extends Collator
private static final int CE_BUFFER_SIZE_ = 512;
// variables for Latin-1 processing
- boolean latinOneUse_ = false;
+ boolean latinOneUse_ = false;
boolean latinOneRegenTable_ = false;
- boolean latinOneFailed_ = false;
+ boolean latinOneFailed_ = false;
int latinOneTableLen_ = 0;
int latinOneCEs_[] = null;
@@ -2183,7 +2257,7 @@ public final class RuleBasedCollator extends Collator
* Utility comparison flags
*/
private boolean m_utilCompare0_;
- //private boolean m_utilCompare1_;
+ // private boolean m_utilCompare1_;
private boolean m_utilCompare2_;
private boolean m_utilCompare3_;
private boolean m_utilCompare4_;
@@ -2196,7 +2270,7 @@ public final class RuleBasedCollator extends Collator
private byte m_utilBytes2_[];
private byte m_utilBytes3_[];
private byte m_utilBytes4_[];
- //private byte m_utilBytes5_[];
+ // private byte m_utilBytes5_[];
private RawCollationKey m_utilRawCollationKey_;
private int m_utilBytesCount0_;
@@ -2204,13 +2278,13 @@ public final class RuleBasedCollator extends Collator
private int m_utilBytesCount2_;
private int m_utilBytesCount3_;
private int m_utilBytesCount4_;
- //private int m_utilBytesCount5_;
- //private int m_utilCount0_;
- //private int m_utilCount1_;
+ // private int m_utilBytesCount5_;
+ // private int m_utilCount0_;
+ // private int m_utilCount1_;
private int m_utilCount2_;
private int m_utilCount3_;
private int m_utilCount4_;
- //private int m_utilCount5_;
+ // private int m_utilCount5_;
private int m_utilFrenchStart_;
private int m_utilFrenchEnd_;
@@ -2231,17 +2305,16 @@ public final class RuleBasedCollator extends Collator
// private methods -------------------------------------------------------
- private void init(String rules) throws Exception
- {
+ private void init(String rules) throws Exception {
setWithUCAData();
- CollationParsedRuleBuilder builder
- = new CollationParsedRuleBuilder(rules);
+ CollationParsedRuleBuilder builder = new CollationParsedRuleBuilder(rules);
builder.setRules(this);
m_rules_ = rules;
init();
+ buildPermutationTable();
initUtility(false);
}
-
+
private final int compareRegular(String source, String target, int offset) {
if (m_srcUtilIter_ == null) {
initUtility(true);
@@ -2249,7 +2322,7 @@ public final class RuleBasedCollator extends Collator
int strength = getStrength();
// setting up the collator parameters
m_utilCompare0_ = m_isCaseLevel_;
- //m_utilCompare1_ = true;
+ // m_utilCompare1_ = true;
m_utilCompare2_ = strength >= SECONDARY;
m_utilCompare3_ = strength >= TERTIARY;
m_utilCompare4_ = strength >= QUATERNARY;
@@ -2265,14 +2338,11 @@ public final class RuleBasedCollator extends Collator
}
// This is the lowest primary value that will not be ignored if shifted
- int lowestpvalue = m_isAlternateHandlingShifted_
- ? m_variableTopValue_ << 16 : 0;
+ int lowestpvalue = m_isAlternateHandlingShifted_ ? m_variableTopValue_ << 16 : 0;
m_srcUtilCEBufferSize_ = 0;
m_tgtUtilCEBufferSize_ = 0;
- int result = doPrimaryCompare(doHiragana4, lowestpvalue, source,
- target, offset);
- if (m_srcUtilCEBufferSize_ == -1
- && m_tgtUtilCEBufferSize_ == -1) {
+ int result = doPrimaryCompare(doHiragana4, lowestpvalue, source, target, offset);
+ if (m_srcUtilCEBufferSize_ == -1 && m_tgtUtilCEBufferSize_ == -1) {
// since the cebuffer is cleared when we have determined that
// either source is greater than target or vice versa, the return
// result is the comparison result and not the hiragana result
@@ -2302,13 +2372,12 @@ public final class RuleBasedCollator extends Collator
}
}
- if (doShift4) { // checkQuad
+ if (doShift4) { // checkQuad
result = doQuaternaryCompare(lowestpvalue);
if (result != 0) {
return result;
}
- }
- else if (doHiragana4 && hiraganaresult != 0) {
+ } else if (doHiragana4 && hiraganaresult != 0) {
// If we're fine on quaternaries, we might be different
// on Hiragana. This, however, might fail us in shifted.
return hiraganaresult;
@@ -2316,7 +2385,7 @@ public final class RuleBasedCollator extends Collator
// For IDENTICAL comparisons, we use a bitwise character comparison
// as a tiebreaker if all else is equal.
- // Getting here should be quite rare - strings are not identical -
+ // Getting here should be quite rare - strings are not identical -
// that is checked first, but compared == through all other checks.
if (m_utilCompare5_) {
return doIdenticalCompare(source, target, offset, true);
@@ -2327,236 +2396,203 @@ public final class RuleBasedCollator extends Collator
// Is this primary weight compressible?
// Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
// TODO: This should use per-lead-byte flags from FractionalUCA.txt.
- static boolean
- isCompressible(int primary1) {
+ static boolean isCompressible(int primary1) {
return BYTE_FIRST_NON_LATIN_PRIMARY_ <= primary1 && primary1 <= maxRegularPrimary;
}
/**
* Gets the 2 bytes of primary order and adds it to the primary byte array
- * @param ce current ce
- * @param notIsContinuation flag indicating if the current bytes belong to
- * a continuation ce
- * @param doShift flag indicating if ce is to be shifted
- * @param leadPrimary lead primary used for compression
- * @param commonBottom4 common byte value for Quaternary
- * @param bottomCount4 smallest byte value for Quaternary
+ *
+ * @param ce
+ * current ce
+ * @param notIsContinuation
+ * flag indicating if the current bytes belong to a continuation ce
+ * @param doShift
+ * flag indicating if ce is to be shifted
+ * @param leadPrimary
+ * lead primary used for compression
+ * @param commonBottom4
+ * common byte value for Quaternary
+ * @param bottomCount4
+ * smallest byte value for Quaternary
* @return the new lead primary for compression
*/
- private final int doPrimaryBytes(int ce, boolean notIsContinuation,
- boolean doShift, int leadPrimary,
- int commonBottom4, int bottomCount4)
- {
+ private final int doPrimaryBytes(int ce, boolean notIsContinuation, boolean doShift, int leadPrimary,
+ int commonBottom4, int bottomCount4) {
int p2 = (ce >>>= 16) & LAST_BYTE_MASK_; // in ints for unsigned
- int p1 = ce >>> 8; // comparison
- if (doShift) {
- if (m_utilCount4_ > 0) {
- while (m_utilCount4_ > bottomCount4) {
- m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
- (byte)(commonBottom4 + bottomCount4));
- m_utilBytesCount4_ ++;
- m_utilCount4_ -= bottomCount4;
- }
- m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
- (byte)(commonBottom4
- + (m_utilCount4_ - 1)));
- m_utilBytesCount4_ ++;
- m_utilCount4_ = 0;
- }
- // dealing with a variable and we're treating them as shifted
- // This is a shifted ignorable
- if (p1 != 0) {
- // we need to check this since we could be in continuation
- m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
- (byte)p1);
- m_utilBytesCount4_ ++;
- }
- if (p2 != 0) {
- m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
- (byte)p2);
- m_utilBytesCount4_ ++;
- }
- }
- else {
- // Note: This code assumes that the table is well built
- // i.e. not having 0 bytes where they are not supposed to be.
- // Usually, we'll have non-zero primary1 & primary2, except
- // in cases of LatinOne and friends, when primary2 will be
- // regular and simple sortkey calc
- if (p1 != CollationElementIterator.IGNORABLE) {
- if (notIsContinuation) {
- if (leadPrimary == p1) {
- m_utilBytes1_ = append(m_utilBytes1_,
- m_utilBytesCount1_, (byte)p2);
- m_utilBytesCount1_ ++;
- }
- else {
- if (leadPrimary != 0) {
- m_utilBytes1_ = append(m_utilBytes1_,
- m_utilBytesCount1_,
- ((p1 > leadPrimary)
- ? BYTE_UNSHIFTED_MAX_
- : BYTE_UNSHIFTED_MIN_));
- m_utilBytesCount1_ ++;
- }
- if (p2 == CollationElementIterator.IGNORABLE) {
- // one byter, not compressed
- m_utilBytes1_ = append(m_utilBytes1_,
- m_utilBytesCount1_,
- (byte)p1);
- m_utilBytesCount1_ ++;
- leadPrimary = 0;
- }
- else if (isCompressible(p1)) {
- // compress
- leadPrimary = p1;
- m_utilBytes1_ = append(m_utilBytes1_,
- m_utilBytesCount1_,
- (byte)p1);
- m_utilBytesCount1_ ++;
- m_utilBytes1_ = append(m_utilBytes1_,
- m_utilBytesCount1_,
- (byte)p2);
- m_utilBytesCount1_ ++;
- }
- else {
- leadPrimary = 0;
- m_utilBytes1_ = append(m_utilBytes1_,
- m_utilBytesCount1_,
- (byte)p1);
- m_utilBytesCount1_ ++;
- m_utilBytes1_ = append(m_utilBytes1_,
- m_utilBytesCount1_,
- (byte)p2);
- m_utilBytesCount1_ ++;
- }
- }
- }
- else {
- // continuation, add primary to the key, no compression
- m_utilBytes1_ = append(m_utilBytes1_,
- m_utilBytesCount1_, (byte)p1);
- m_utilBytesCount1_ ++;
- if (p2 != CollationElementIterator.IGNORABLE) {
- m_utilBytes1_ = append(m_utilBytes1_,
- m_utilBytesCount1_, (byte)p2);
- // second part
- m_utilBytesCount1_ ++;
- }
- }
- }
- }
- return leadPrimary;
+ int p1 = ce >>> 8; // comparison
+ if (doShift) {
+ if (m_utilCount4_ > 0) {
+ while (m_utilCount4_ > bottomCount4) {
+ m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, (byte) (commonBottom4 + bottomCount4));
+ m_utilBytesCount4_++;
+ m_utilCount4_ -= bottomCount4;
+ }
+ m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, (byte) (commonBottom4 + (m_utilCount4_ - 1)));
+ m_utilBytesCount4_++;
+ m_utilCount4_ = 0;
+ }
+ // dealing with a variable and we're treating them as shifted
+ // This is a shifted ignorable
+ if (p1 != 0) {
+ // we need to check this since we could be in continuation
+ m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, (byte) p1);
+ m_utilBytesCount4_++;
+ }
+ if (p2 != 0) {
+ m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, (byte) p2);
+ m_utilBytesCount4_++;
+ }
+ } else {
+ // Note: This code assumes that the table is well built
+ // i.e. not having 0 bytes where they are not supposed to be.
+ // Usually, we'll have non-zero primary1 & primary2, except
+ // in cases of LatinOne and friends, when primary2 will be
+ // regular and simple sortkey calc
+ if (p1 != CollationElementIterator.IGNORABLE) {
+ if (notIsContinuation) {
+ if (leadPrimary == p1) {
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) p2);
+ m_utilBytesCount1_++;
+ } else {
+ if (leadPrimary != 0) {
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
+ ((p1 > leadPrimary) ? BYTE_UNSHIFTED_MAX_ : BYTE_UNSHIFTED_MIN_));
+ m_utilBytesCount1_++;
+ }
+ if (p2 == CollationElementIterator.IGNORABLE) {
+ // one byter, not compressed
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) p1);
+ m_utilBytesCount1_++;
+ leadPrimary = 0;
+ } else if (isCompressible(p1)) {
+ // compress
+ leadPrimary = p1;
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) p1);
+ m_utilBytesCount1_++;
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) p2);
+ m_utilBytesCount1_++;
+ } else {
+ leadPrimary = 0;
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) p1);
+ m_utilBytesCount1_++;
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) p2);
+ m_utilBytesCount1_++;
+ }
+ }
+ } else {
+ // continuation, add primary to the key, no compression
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) p1);
+ m_utilBytesCount1_++;
+ if (p2 != CollationElementIterator.IGNORABLE) {
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) p2);
+ // second part
+ m_utilBytesCount1_++;
+ }
+ }
+ }
+ }
+ return leadPrimary;
}
/**
* Gets the secondary byte and adds it to the secondary byte array
- * @param ce current ce
- * @param notIsContinuation flag indicating if the current bytes belong to
- * a continuation ce
- * @param doFrench flag indicator if french sort is to be performed
+ *
+ * @param ce
+ * current ce
+ * @param notIsContinuation
+ * flag indicating if the current bytes belong to a continuation ce
+ * @param doFrench
+ * flag indicating whether French sort is to be performed
*/
- private final void doSecondaryBytes(int ce, boolean notIsContinuation,
- boolean doFrench)
- {
+ private final void doSecondaryBytes(int ce, boolean notIsContinuation, boolean doFrench) {
int s = (ce >>= 8) & LAST_BYTE_MASK_; // int for comparison
if (s != 0) {
if (!doFrench) {
// This is compression code.
if (s == COMMON_2_ && notIsContinuation) {
- m_utilCount2_ ++;
- }
- else {
+ m_utilCount2_++;
+ } else {
if (m_utilCount2_ > 0) {
if (s > COMMON_2_) { // not necessary for 4th level.
while (m_utilCount2_ > TOP_COUNT_2_) {
- m_utilBytes2_ = append(m_utilBytes2_,
- m_utilBytesCount2_,
- (byte)(COMMON_TOP_2_ - TOP_COUNT_2_));
- m_utilBytesCount2_ ++;
+ m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
+ (byte) (COMMON_TOP_2_ - TOP_COUNT_2_));
+ m_utilBytesCount2_++;
m_utilCount2_ -= TOP_COUNT_2_;
}
- m_utilBytes2_ = append(m_utilBytes2_,
- m_utilBytesCount2_,
- (byte)(COMMON_TOP_2_
- - (m_utilCount2_ - 1)));
- m_utilBytesCount2_ ++;
- }
- else {
+ m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
+ (byte) (COMMON_TOP_2_ - (m_utilCount2_ - 1)));
+ m_utilBytesCount2_++;
+ } else {
while (m_utilCount2_ > BOTTOM_COUNT_2_) {
- m_utilBytes2_ = append(m_utilBytes2_,
- m_utilBytesCount2_,
- (byte)(COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
- m_utilBytesCount2_ ++;
+ m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
+ (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
+ m_utilBytesCount2_++;
m_utilCount2_ -= BOTTOM_COUNT_2_;
}
- m_utilBytes2_ = append(m_utilBytes2_,
- m_utilBytesCount2_,
- (byte)(COMMON_BOTTOM_2_
- + (m_utilCount2_ - 1)));
- m_utilBytesCount2_ ++;
+ m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
+ (byte) (COMMON_BOTTOM_2_ + (m_utilCount2_ - 1)));
+ m_utilBytesCount2_++;
}
m_utilCount2_ = 0;
}
- m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
- (byte)s);
- m_utilBytesCount2_ ++;
+ m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_, (byte) s);
+ m_utilBytesCount2_++;
+ }
+ } else {
+ m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_, (byte) s);
+ m_utilBytesCount2_++;
+ // Do the special handling for French secondaries
+ // We need to get continuation elements and do intermediate
+ // restore
+ // abc1c2c3de with French secondaries needs to be edc1c2c3ba
+ // NOT edc3c2c1ba
+ if (notIsContinuation) {
+ if (m_utilFrenchStart_ != -1) {
+ // reverse secondaries from frenchStartPtr up to
+ // frenchEndPtr
+ reverseBuffer(m_utilBytes2_);
+ m_utilFrenchStart_ = -1;
+ }
+ } else {
+ if (m_utilFrenchStart_ == -1) {
+ m_utilFrenchStart_ = m_utilBytesCount2_ - 2;
+ }
+ m_utilFrenchEnd_ = m_utilBytesCount2_ - 1;
}
- }
- else {
- m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
- (byte)s);
- m_utilBytesCount2_ ++;
- // Do the special handling for French secondaries
- // We need to get continuation elements and do intermediate
- // restore
- // abc1c2c3de with french secondaries need to be edc1c2c3ba
- // NOT edc3c2c1ba
- if (notIsContinuation) {
- if (m_utilFrenchStart_ != -1) {
- // reverse secondaries from frenchStartPtr up to
- // frenchEndPtr
- reverseBuffer(m_utilBytes2_);
- m_utilFrenchStart_ = -1;
- }
- }
- else {
- if (m_utilFrenchStart_ == -1) {
- m_utilFrenchStart_ = m_utilBytesCount2_ - 2;
- }
- m_utilFrenchEnd_ = m_utilBytesCount2_ - 1;
- }
}
}
}
/**
* Reverse the argument buffer
- * @param buffer to reverse
+ *
+ * @param buffer
+ * to reverse
*/
- private void reverseBuffer(byte buffer[])
- {
+ private void reverseBuffer(byte buffer[]) {
int start = m_utilFrenchStart_;
int end = m_utilFrenchEnd_;
while (start < end) {
byte b = buffer[start];
- buffer[start ++] = buffer[end];
- buffer[end --] = b;
+ buffer[start++] = buffer[end];
+ buffer[end--] = b;
}
}
/**
* Insert the case shifting byte if required
- * @param caseshift value
+ *
+ * @param caseshift
+ * value
* @return new caseshift value
*/
- private final int doCaseShift(int caseshift)
- {
- if (caseshift == 0) {
- m_utilBytes0_ = append(m_utilBytes0_, m_utilBytesCount0_,
- SORT_CASE_BYTE_START_);
- m_utilBytesCount0_ ++;
+ private final int doCaseShift(int caseshift) {
+ if (caseshift == 0) {
+ m_utilBytes0_ = append(m_utilBytes0_, m_utilBytesCount0_, SORT_CASE_BYTE_START_);
+ m_utilBytesCount0_++;
caseshift = SORT_CASE_SHIFT_START_;
}
return caseshift;
@@ -2564,42 +2600,35 @@ public final class RuleBasedCollator extends Collator
/**
* Performs the casing sort
- * @param tertiary byte in ints for easy comparison
- * @param notIsContinuation flag indicating if the current bytes belong to
- * a continuation ce
+ *
+ * @param tertiary
+ * byte in ints for easy comparison
+ * @param notIsContinuation
+ * flag indicating if the current bytes belong to a continuation ce
* @param caseshift
* @return the new value of case shift
*/
- private final int doCaseBytes(int tertiary, boolean notIsContinuation,
- int caseshift)
- {
+ private final int doCaseBytes(int tertiary, boolean notIsContinuation, int caseshift) {
caseshift = doCaseShift(caseshift);
if (notIsContinuation && tertiary != 0) {
- byte casebits = (byte)(tertiary & 0xC0);
+ byte casebits = (byte) (tertiary & 0xC0);
if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
if (casebits == 0) {
- m_utilBytes0_[m_utilBytesCount0_ - 1]
- |= (1 << (-- caseshift));
+ m_utilBytes0_[m_utilBytesCount0_ - 1] |= (1 << (--caseshift));
+ } else {
+ // second bit
+ caseshift = doCaseShift(caseshift - 1);
+ m_utilBytes0_[m_utilBytesCount0_ - 1] |= ((casebits >> 6) & 1) << (--caseshift);
}
- else {
- // second bit
- caseshift = doCaseShift(caseshift - 1);
- m_utilBytes0_[m_utilBytesCount0_ - 1]
- |= ((casebits >> 6) & 1) << (-- caseshift);
- }
- }
- else {
+ } else {
if (casebits != 0) {
- m_utilBytes0_[m_utilBytesCount0_ - 1]
- |= 1 << (-- caseshift);
+ m_utilBytes0_[m_utilBytesCount0_ - 1] |= 1 << (--caseshift);
// second bit
caseshift = doCaseShift(caseshift);
- m_utilBytes0_[m_utilBytesCount0_ - 1]
- |= ((casebits >> 7) & 1) << (-- caseshift);
- }
- else {
- caseshift --;
+ m_utilBytes0_[m_utilBytesCount0_ - 1] |= ((casebits >> 7) & 1) << (--caseshift);
+ } else {
+ caseshift--;
}
}
}
@@ -2609,114 +2638,102 @@ public final class RuleBasedCollator extends Collator
/**
* Gets the tertiary byte and adds it to the tertiary byte array
- * @param tertiary byte in int for easy comparison
- * @param notIsContinuation flag indicating if the current bytes belong to
- * a continuation ce
+ *
+ * @param tertiary
+ * byte in int for easy comparison
+ * @param notIsContinuation
+ * flag indicating if the current bytes belong to a continuation ce
*/
- private final void doTertiaryBytes(int tertiary, boolean notIsContinuation)
- {
+ private final void doTertiaryBytes(int tertiary, boolean notIsContinuation) {
if (tertiary != 0) {
// This is compression code.
// sequence size check is included in the if clause
if (tertiary == m_common3_ && notIsContinuation) {
- m_utilCount3_ ++;
- }
- else {
+ m_utilCount3_++;
+ } else {
int common3 = m_common3_ & LAST_BYTE_MASK_;
if (tertiary > common3 && m_common3_ == COMMON_NORMAL_3_) {
tertiary += m_addition3_;
- }
- else if (tertiary <= common3
- && m_common3_ == COMMON_UPPER_FIRST_3_) {
+ } else if (tertiary <= common3 && m_common3_ == COMMON_UPPER_FIRST_3_) {
tertiary -= m_addition3_;
}
if (m_utilCount3_ > 0) {
if (tertiary > common3) {
while (m_utilCount3_ > m_topCount3_) {
- m_utilBytes3_ = append(m_utilBytes3_,
- m_utilBytesCount3_,
- (byte)(m_top3_ - m_topCount3_));
- m_utilBytesCount3_ ++;
+ m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_, (byte) (m_top3_ - m_topCount3_));
+ m_utilBytesCount3_++;
m_utilCount3_ -= m_topCount3_;
}
- m_utilBytes3_ = append(m_utilBytes3_,
- m_utilBytesCount3_,
- (byte)(m_top3_
- - (m_utilCount3_ - 1)));
- m_utilBytesCount3_ ++;
- }
- else {
+ m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
+ (byte) (m_top3_ - (m_utilCount3_ - 1)));
+ m_utilBytesCount3_++;
+ } else {
while (m_utilCount3_ > m_bottomCount3_) {
- m_utilBytes3_ = append(m_utilBytes3_,
- m_utilBytesCount3_,
- (byte)(m_bottom3_ + m_bottomCount3_));
- m_utilBytesCount3_ ++;
+ m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
+ (byte) (m_bottom3_ + m_bottomCount3_));
+ m_utilBytesCount3_++;
m_utilCount3_ -= m_bottomCount3_;
}
- m_utilBytes3_ = append(m_utilBytes3_,
- m_utilBytesCount3_,
- (byte)(m_bottom3_
- + (m_utilCount3_ - 1)));
- m_utilBytesCount3_ ++;
+ m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
+ (byte) (m_bottom3_ + (m_utilCount3_ - 1)));
+ m_utilBytesCount3_++;
}
m_utilCount3_ = 0;
}
- m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
- (byte)tertiary);
- m_utilBytesCount3_ ++;
+ m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_, (byte) tertiary);
+ m_utilBytesCount3_++;
}
}
}
/**
* Gets the Quaternary byte and adds it to the Quaternary byte array
- * @param isCodePointHiragana flag indicator if the previous codepoint
- * we dealt with was Hiragana
- * @param commonBottom4 smallest common Quaternary byte
- * @param bottomCount4 smallest Quaternary byte
- * @param hiragana4 hiragana Quaternary byte
+ *
+ * @param isCodePointHiragana
+ * flag indicating whether the previous codepoint we dealt with was Hiragana
+ * @param commonBottom4
+ * smallest common Quaternary byte
+ * @param bottomCount4
+ * smallest Quaternary byte
+ * @param hiragana4
+ * hiragana Quaternary byte
*/
- private final void doQuaternaryBytes(boolean isCodePointHiragana,
- int commonBottom4, int bottomCount4,
- byte hiragana4)
- {
+ private final void doQuaternaryBytes(boolean isCodePointHiragana, int commonBottom4, int bottomCount4,
+ byte hiragana4) {
if (isCodePointHiragana) { // This was Hiragana, need to note it
if (m_utilCount4_ > 0) { // Close this part
while (m_utilCount4_ > bottomCount4) {
- m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
- (byte)(commonBottom4
- + bottomCount4));
- m_utilBytesCount4_ ++;
+ m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, (byte) (commonBottom4 + bottomCount4));
+ m_utilBytesCount4_++;
m_utilCount4_ -= bottomCount4;
}
- m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
- (byte)(commonBottom4
- + (m_utilCount4_ - 1)));
- m_utilBytesCount4_ ++;
+ m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, (byte) (commonBottom4 + (m_utilCount4_ - 1)));
+ m_utilBytesCount4_++;
m_utilCount4_ = 0;
}
- m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
- hiragana4); // Add the Hiragana
- m_utilBytesCount4_ ++;
- }
- else { // This wasn't Hiragana, so we can continue adding stuff
- m_utilCount4_ ++;
+ m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, hiragana4); // Add the Hiragana
+ m_utilBytesCount4_++;
+ } else { // This wasn't Hiragana, so we can continue adding stuff
+ m_utilCount4_++;
}
}
/**
- * Iterates through the argument string for all ces.
- * Split the ces into their relevant primaries, secondaries etc.
- * @param source normalized string
- * @param doFrench flag indicator if special handling of French has to be
- * done
- * @param hiragana4 offset for Hiragana quaternary
- * @param commonBottom4 smallest common quaternary byte
- * @param bottomCount4 smallest quaternary byte
+ * Iterates through the argument string for all ces. Split the ces into their relevant primaries, secondaries etc.
+ *
+ * @param source
+ * normalized string
+ * @param doFrench
+ * flag indicating whether special handling of French has to be done
+ * @param hiragana4
+ * offset for Hiragana quaternary
+ * @param commonBottom4
+ * smallest common quaternary byte
+ * @param bottomCount4
+ * smallest quaternary byte
*/
- private final void getSortKeyBytes(String source, boolean doFrench,
- byte hiragana4, int commonBottom4,
- int bottomCount4)
+ private final void getSortKeyBytes(String source, boolean doFrench, byte hiragana4, int commonBottom4,
+ int bottomCount4)
{
if (m_srcUtilIter_ == null) {
@@ -2750,22 +2767,18 @@ public final class RuleBasedCollator extends Collator
notIsContinuation = !isContinuation(ce);
- /*
- * if (notIsContinuation) {
- if (scriptOrder != NULL) {
- primary1 = scriptOrder[primary1];
- }
- }*/
+ if (notIsContinuation) {
+ if (m_leadBytePermutationTable_ != null) {
+ ce = (m_leadBytePermutationTable_[((ce >> 24) + 256) % 256] << 24) | (ce & 0x00FFFFFF);
+ }
+ }
boolean isPrimaryByteIgnorable = (ce & CE_PRIMARY_MASK_) == 0;
// actually we can just check that the first byte is 0
// generation stuffs the order left first
- boolean isSmallerThanVariableTop = (ce >>> CE_PRIMARY_SHIFT_)
- <= m_variableTopValue_;
+ boolean isSmallerThanVariableTop = (ce >>> CE_PRIMARY_SHIFT_) <= m_variableTopValue_;
doShift = (m_isAlternateHandlingShifted_
- && ((notIsContinuation && isSmallerThanVariableTop
- && !isPrimaryByteIgnorable) // primary byte not 0
- || (!notIsContinuation && doShift))
- || (doShift && isPrimaryByteIgnorable));
+ && ((notIsContinuation && isSmallerThanVariableTop && !isPrimaryByteIgnorable) // primary byte not 0
+ || (!notIsContinuation && doShift)) || (doShift && isPrimaryByteIgnorable));
if (doShift && isPrimaryByteIgnorable) {
// amendment to the UCA says that primary ignorables and other
// ignorables should be removed if following a shifted code
@@ -2774,9 +2787,7 @@ public final class RuleBasedCollator extends Collator
// we should just completely ignore it
continue;
}
- leadPrimary = doPrimaryBytes(ce, notIsContinuation, doShift,
- leadPrimary, commonBottom4,
- bottomCount4);
+ leadPrimary = doPrimaryBytes(ce, notIsContinuation, doShift, leadPrimary, commonBottom4, bottomCount4);
if (doShift) {
continue;
}
@@ -2792,11 +2803,10 @@ public final class RuleBasedCollator extends Collator
if (m_utilCompare0_ && (!isPrimaryByteIgnorable || m_utilCompare2_)) {
// do the case level if we need to do it. We don't want to calculate
// case level for primary ignorables if we have only primary strength and case level
- // otherwise we would break well formedness of CEs
+ // otherwise we would break well formedness of CEs
caseShift = doCaseBytes(t, notIsContinuation, caseShift);
- }
- else if (notIsContinuation) {
- t ^= m_caseSwitch_;
+ } else if (notIsContinuation) {
+ t ^= m_caseSwitch_;
}
t &= m_mask3_;
@@ -2806,8 +2816,7 @@ public final class RuleBasedCollator extends Collator
}
if (m_utilCompare4_ && notIsContinuation) { // compare quad
- doQuaternaryBytes(m_srcUtilColEIter_.m_isCodePointHiragana_,
- commonBottom4, bottomCount4, hiragana4);
+ doQuaternaryBytes(m_srcUtilColEIter_.m_isCodePointHiragana_, commonBottom4, bottomCount4, hiragana4);
}
}
setDecomposition(backupDecomposition); // reverts to original
@@ -2818,20 +2827,21 @@ public final class RuleBasedCollator extends Collator
}
/**
- * From the individual strength byte results the final compact sortkey
- * will be calculated.
- * @param source text string
- * @param doFrench flag indicating that special handling of French has to
- * be done
- * @param commonBottom4 smallest common quaternary byte
- * @param bottomCount4 smallest quaternary byte
- * @param key output RawCollationKey to store results, key cannot be null
+ * From the individual strength byte results the final compact sortkey will be calculated.
+ *
+ * @param source
+ * text string
+ * @param doFrench
+ * flag indicating that special handling of French has to be done
+ * @param commonBottom4
+ * smallest common quaternary byte
+ * @param bottomCount4
+ * smallest quaternary byte
+ * @param key
+ * output RawCollationKey to store results, key cannot be null
*/
- private final void getSortKey(String source, boolean doFrench,
- int commonBottom4,
- int bottomCount4,
- RawCollationKey key)
- {
+ private final void getSortKey(String source, boolean doFrench, int commonBottom4, int bottomCount4,
+ RawCollationKey key) {
// we have done all the CE's, now let's put them together to form
// a key
if (m_utilCompare2_) {
@@ -2851,8 +2861,8 @@ public final class RuleBasedCollator extends Collator
}
}
- m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte)0);
- m_utilBytesCount1_ ++;
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) 0);
+ m_utilBytesCount1_++;
key.set(m_utilBytes1_, 0, m_utilBytesCount1_);
}
@@ -2860,116 +2870,97 @@ public final class RuleBasedCollator extends Collator
/**
* Packs the French bytes
*/
- private final void doFrench()
- {
- for (int i = 0; i < m_utilBytesCount2_; i ++) {
+ private final void doFrench() {
+ for (int i = 0; i < m_utilBytesCount2_; i++) {
byte s = m_utilBytes2_[m_utilBytesCount2_ - i - 1];
// This is compression code.
if (s == COMMON_2_) {
- ++ m_utilCount2_;
- }
- else {
+ ++m_utilCount2_;
+ } else {
if (m_utilCount2_ > 0) {
// getting the unsigned value
if ((s & LAST_BYTE_MASK_) > COMMON_2_) {
// not necessary for 4th level.
while (m_utilCount2_ > TOP_COUNT_2_) {
- m_utilBytes1_ = append(m_utilBytes1_,
- m_utilBytesCount1_,
- (byte)(COMMON_TOP_2_ - TOP_COUNT_2_));
- m_utilBytesCount1_ ++;
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
+ (byte) (COMMON_TOP_2_ - TOP_COUNT_2_));
+ m_utilBytesCount1_++;
m_utilCount2_ -= TOP_COUNT_2_;
}
- m_utilBytes1_ = append(m_utilBytes1_,
- m_utilBytesCount1_,
- (byte)(COMMON_TOP_2_
- - (m_utilCount2_ - 1)));
- m_utilBytesCount1_ ++;
- }
- else {
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
+ (byte) (COMMON_TOP_2_ - (m_utilCount2_ - 1)));
+ m_utilBytesCount1_++;
+ } else {
while (m_utilCount2_ > BOTTOM_COUNT_2_) {
- m_utilBytes1_ = append(m_utilBytes1_,
- m_utilBytesCount1_,
- (byte)(COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
- m_utilBytesCount1_ ++;
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
+ (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
+ m_utilBytesCount1_++;
m_utilCount2_ -= BOTTOM_COUNT_2_;
}
- m_utilBytes1_ = append(m_utilBytes1_,
- m_utilBytesCount1_,
- (byte)(COMMON_BOTTOM_2_
- + (m_utilCount2_ - 1)));
- m_utilBytesCount1_ ++;
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
+ (byte) (COMMON_BOTTOM_2_ + (m_utilCount2_ - 1)));
+ m_utilBytesCount1_++;
}
m_utilCount2_ = 0;
}
m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, s);
- m_utilBytesCount1_ ++;
+ m_utilBytesCount1_++;
}
}
if (m_utilCount2_ > 0) {
while (m_utilCount2_ > BOTTOM_COUNT_2_) {
- m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
- (byte)(COMMON_BOTTOM_2_
- + BOTTOM_COUNT_2_));
- m_utilBytesCount1_ ++;
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
+ m_utilBytesCount1_++;
m_utilCount2_ -= BOTTOM_COUNT_2_;
}
- m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
- (byte)(COMMON_BOTTOM_2_
- + (m_utilCount2_ - 1)));
- m_utilBytesCount1_ ++;
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte) (COMMON_BOTTOM_2_ + (m_utilCount2_ - 1)));
+ m_utilBytesCount1_++;
}
}
/**
* Compacts the secondary bytes and stores them into the primary array
- * @param doFrench flag indicator that French has to be handled specially
+ *
+ * @param doFrench
+ * flag indicator that French has to be handled specially
*/
- private final void doSecondary(boolean doFrench)
- {
+ private final void doSecondary(boolean doFrench) {
if (m_utilCount2_ > 0) {
while (m_utilCount2_ > BOTTOM_COUNT_2_) {
- m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
- (byte)(COMMON_BOTTOM_2_
- + BOTTOM_COUNT_2_));
- m_utilBytesCount2_ ++;
+ m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_, (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
+ m_utilBytesCount2_++;
m_utilCount2_ -= BOTTOM_COUNT_2_;
}
- m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
- (byte)(COMMON_BOTTOM_2_ +
- (m_utilCount2_ - 1)));
- m_utilBytesCount2_ ++;
+ m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_, (byte) (COMMON_BOTTOM_2_ + (m_utilCount2_ - 1)));
+ m_utilBytesCount2_++;
}
- m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
- SORT_LEVEL_TERMINATOR_);
- m_utilBytesCount1_ ++;
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
+ m_utilBytesCount1_++;
if (doFrench) { // do the reverse copy
doFrench();
- }
- else {
- if (m_utilBytes1_.length <= m_utilBytesCount1_
- + m_utilBytesCount2_) {
- m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
- m_utilBytesCount2_);
+ } else {
+ if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount2_) {
+ m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_, m_utilBytesCount2_);
}
- System.arraycopy(m_utilBytes2_, 0, m_utilBytes1_,
- m_utilBytesCount1_, m_utilBytesCount2_);
+ System.arraycopy(m_utilBytes2_, 0, m_utilBytes1_, m_utilBytesCount1_, m_utilBytesCount2_);
m_utilBytesCount1_ += m_utilBytesCount2_;
}
}
/**
* Increase buffer size
- * @param buffer array of bytes
- * @param size of the byte array
- * @param incrementsize size to increase
+ *
+ * @param buffer
+ * array of bytes
+ * @param size
+ * of the byte array
+ * @param incrementsize
+ * size to increase
* @return the new buffer
*/
- private static final byte[] increase(byte buffer[], int size,
- int incrementsize)
- {
+ private static final byte[] increase(byte buffer[], int size, int incrementsize) {
byte result[] = new byte[buffer.length + incrementsize];
System.arraycopy(buffer, 0, result, 0, size);
return result;
@@ -2977,14 +2968,16 @@ public final class RuleBasedCollator extends Collator
/**
* Increase buffer size
- * @param buffer array of ints
- * @param size of the byte array
- * @param incrementsize size to increase
+ *
+ * @param buffer
+ * array of ints
+ * @param size
+ * number of leading ints to copy into the new buffer
+ * @param incrementsize
+ * size to increase
* @return the new buffer
*/
- private static final int[] increase(int buffer[], int size,
- int incrementsize)
- {
+ private static final int[] increase(int buffer[], int size, int incrementsize) {
int result[] = new int[buffer.length + incrementsize];
System.arraycopy(buffer, 0, result, 0, size);
return result;
@@ -2993,123 +2986,97 @@ public final class RuleBasedCollator extends Collator
/**
* Compacts the case bytes and stores them into the primary array
*/
- private final void doCase()
- {
- m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
- SORT_LEVEL_TERMINATOR_);
- m_utilBytesCount1_ ++;
+ private final void doCase() {
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
+ m_utilBytesCount1_++;
if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount0_) {
- m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
- m_utilBytesCount0_);
+ m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_, m_utilBytesCount0_);
}
- System.arraycopy(m_utilBytes0_, 0, m_utilBytes1_, m_utilBytesCount1_,
- m_utilBytesCount0_);
+ System.arraycopy(m_utilBytes0_, 0, m_utilBytes1_, m_utilBytesCount1_, m_utilBytesCount0_);
m_utilBytesCount1_ += m_utilBytesCount0_;
}
/**
* Compacts the tertiary bytes and stores them into the primary array
*/
- private final void doTertiary()
- {
+ private final void doTertiary() {
if (m_utilCount3_ > 0) {
if (m_common3_ != COMMON_BOTTOM_3_) {
while (m_utilCount3_ >= m_topCount3_) {
- m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
- (byte)(m_top3_ - m_topCount3_));
- m_utilBytesCount3_ ++;
+ m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_, (byte) (m_top3_ - m_topCount3_));
+ m_utilBytesCount3_++;
m_utilCount3_ -= m_topCount3_;
}
- m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
- (byte)(m_top3_ - m_utilCount3_));
- m_utilBytesCount3_ ++;
- }
- else {
+ m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_, (byte) (m_top3_ - m_utilCount3_));
+ m_utilBytesCount3_++;
+ } else {
while (m_utilCount3_ > m_bottomCount3_) {
- m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
- (byte)(m_bottom3_
- + m_bottomCount3_));
- m_utilBytesCount3_ ++;
+ m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_, (byte) (m_bottom3_ + m_bottomCount3_));
+ m_utilBytesCount3_++;
m_utilCount3_ -= m_bottomCount3_;
}
- m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
- (byte)(m_bottom3_
- + (m_utilCount3_ - 1)));
- m_utilBytesCount3_ ++;
+ m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_, (byte) (m_bottom3_ + (m_utilCount3_ - 1)));
+ m_utilBytesCount3_++;
}
}
- m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
- SORT_LEVEL_TERMINATOR_);
- m_utilBytesCount1_ ++;
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
+ m_utilBytesCount1_++;
if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount3_) {
- m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
- m_utilBytesCount3_);
+ m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_, m_utilBytesCount3_);
}
- System.arraycopy(m_utilBytes3_, 0, m_utilBytes1_, m_utilBytesCount1_,
- m_utilBytesCount3_);
+ System.arraycopy(m_utilBytes3_, 0, m_utilBytes1_, m_utilBytesCount1_, m_utilBytesCount3_);
m_utilBytesCount1_ += m_utilBytesCount3_;
}
/**
* Compacts the quaternary bytes and stores them into the primary array
*/
- private final void doQuaternary(int commonbottom4, int bottomcount4)
- {
+ private final void doQuaternary(int commonbottom4, int bottomcount4) {
if (m_utilCount4_ > 0) {
while (m_utilCount4_ > bottomcount4) {
- m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
- (byte)(commonbottom4 + bottomcount4));
- m_utilBytesCount4_ ++;
+ m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, (byte) (commonbottom4 + bottomcount4));
+ m_utilBytesCount4_++;
m_utilCount4_ -= bottomcount4;
}
- m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
- (byte)(commonbottom4
- + (m_utilCount4_ - 1)));
- m_utilBytesCount4_ ++;
+ m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_, (byte) (commonbottom4 + (m_utilCount4_ - 1)));
+ m_utilBytesCount4_++;
}
- m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
- SORT_LEVEL_TERMINATOR_);
- m_utilBytesCount1_ ++;
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
+ m_utilBytesCount1_++;
if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount4_) {
- m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
- m_utilBytesCount4_);
+ m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_, m_utilBytesCount4_);
}
- System.arraycopy(m_utilBytes4_, 0, m_utilBytes1_, m_utilBytesCount1_,
- m_utilBytesCount4_);
+ System.arraycopy(m_utilBytes4_, 0, m_utilBytes1_, m_utilBytesCount1_, m_utilBytesCount4_);
m_utilBytesCount1_ += m_utilBytesCount4_;
}
/**
- * Deals with the identical sort.
- * Appends the BOCSU version of the source string to the ends of the
- * byte buffer.
- * @param source text string
+ * Deals with the identical sort. Appends the BOCSU version of the source string to the end of the byte buffer.
+ *
+ * @param source
+ * text string
*/
- private final void doIdentical(String source)
- {
+ private final void doIdentical(String source) {
int isize = BOCU.getCompressionLength(source);
- m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
- SORT_LEVEL_TERMINATOR_);
- m_utilBytesCount1_ ++;
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
+ m_utilBytesCount1_++;
if (m_utilBytes1_.length <= m_utilBytesCount1_ + isize) {
- m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
- 1 + isize);
+ m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_, 1 + isize);
}
- m_utilBytesCount1_ = BOCU.compress(source, m_utilBytes1_,
- m_utilBytesCount1_);
+ m_utilBytesCount1_ = BOCU.compress(source, m_utilBytes1_, m_utilBytesCount1_);
}
/**
- * Gets the offset of the first unmatched characters in source and target.
- * This method returns the offset of the start of a contraction or a
- * combining sequence, if the first difference is in the middle of such a
- * sequence.
- * @param source string
- * @param target string
+ * Gets the offset of the first unmatched characters in source and target. This method returns the offset of the
+ * start of a contraction or a combining sequence, if the first difference is in the middle of such a sequence.
+ *
+ * @param source
+ * string
+ * @param target
+ * string
* @return offset of the first unmatched characters in source and target.
*/
- private final int getFirstUnmatchedOffset(String source, String target)
- {
+ private final int getFirstUnmatchedOffset(String source, String target) {
int result = 0;
int slength = source.length();
int tlength = target.length();
@@ -3117,9 +3084,8 @@ public final class RuleBasedCollator extends Collator
if (minlength > tlength) {
minlength = tlength;
}
- while (result < minlength
- && source.charAt(result) == target.charAt(result)) {
- result ++;
+ while (result < minlength && source.charAt(result) == target.charAt(result)) {
+ result++;
}
if (result > 0) {
// There is an identical portion at the beginning of the two
@@ -3131,24 +3097,19 @@ public final class RuleBasedCollator extends Collator
if (result < minlength) {
schar = source.charAt(result); // first differing chars
tchar = target.charAt(result);
- }
- else {
+ } else {
schar = source.charAt(minlength - 1);
if (isUnsafe(schar)) {
tchar = schar;
- }
- else if (slength == tlength) {
- return result;
- }
- else if (slength < tlength) {
+ } else if (slength == tlength) {
+ return result;
+ } else if (slength < tlength) {
tchar = target.charAt(result);
- }
- else {
+ } else {
schar = source.charAt(result);
}
}
- if (isUnsafe(schar) || isUnsafe(tchar))
- {
+ if (isUnsafe(schar) || isUnsafe(tchar)) {
// We are stopped in the middle of a contraction or combining
// sequence.
// Look backwards for the part of the string for the start of
@@ -3156,30 +3117,28 @@ public final class RuleBasedCollator extends Collator
// It doesn't matter which string we scan, since they are the
// same in this region.
do {
- result --;
- }
- while (result > 0 && isUnsafe(source.charAt(result)));
+ result--;
+ } while (result > 0 && isUnsafe(source.charAt(result)));
}
}
return result;
}
/**
- * Appending an byte to an array of bytes and increases it if we run out of
- * space
- * @param array of byte arrays
- * @param appendindex index in the byte array to append
- * @param value to append
- * @return array if array size can accomodate the new value, otherwise
- * a bigger array will be created and returned
+ * Appends a byte to an array of bytes, growing the array if we run out of space.
+ *
+ * @param array
+ * of byte arrays
+ * @param appendindex
+ * index in the byte array to append
+ * @param value
+ * to append
+ * @return array if array size can accommodate the new value, otherwise a bigger array will be created and returned
*/
- private static final byte[] append(byte array[], int appendindex,
- byte value)
- {
+ private static final byte[] append(byte array[], int appendindex, byte value) {
try {
array[appendindex] = value;
- }
- catch (ArrayIndexOutOfBoundsException e) {
+ } catch (ArrayIndexOutOfBoundsException e) {
array = increase(array, appendindex, SORT_BUFFER_INIT_SIZE_);
array[appendindex] = value;
}
@@ -3187,45 +3146,44 @@ public final class RuleBasedCollator extends Collator
}
/**
- * This is a trick string compare function that goes in and uses sortkeys
- * to compare. It is used when compare gets in trouble and needs to bail
- * out.
- * @param source text string
- * @param target text string
+ * This is a trick string compare function that goes in and uses sortkeys to compare. It is used when compare gets
+ * in trouble and needs to bail out.
+ *
+ * @param source
+ * text string
+ * @param target
+ * text string
*/
private final int compareBySortKeys(String source, String target)
{
- m_utilRawCollationKey_ = getRawCollationKey(source,
- m_utilRawCollationKey_);
+ m_utilRawCollationKey_ = getRawCollationKey(source, m_utilRawCollationKey_);
// this method is very seldom called
RawCollationKey targetkey = getRawCollationKey(target, null);
return m_utilRawCollationKey_.compareTo(targetkey);
}
/**
- * Performs the primary comparisons, and fills up the CE buffer at the
- * same time.
- * The return value toggles between the comparison result and the hiragana
- * result. If either the source is greater than target or vice versa, the
- * return result is the comparison result, ie 1 or -1, furthermore the
- * cebuffers will be cleared when that happens. If the primary comparisons
- * are equal, we'll have to continue with secondary comparison. In this case
- * the cebuffer will not be cleared and the return result will be the
- * hiragana result.
- * @param doHiragana4 flag indicator that Hiragana Quaternary has to be
- * observed
- * @param lowestpvalue the lowest primary value that will not be ignored if
- * alternate handling is shifted
- * @param source text string
- * @param target text string
- * @param textoffset offset in text to start the comparison
- * @return comparion result if a primary difference is found, otherwise
- * hiragana result
+ * Performs the primary comparisons, and fills up the CE buffer at the same time. The return value toggles between
+ * the comparison result and the hiragana result. If either the source is greater than target or vice versa, the
+ * return result is the comparison result, i.e. 1 or -1; furthermore the cebuffers will be cleared when that happens.
+ * If the primary comparisons are equal, we'll have to continue with secondary comparison. In this case the cebuffer
+ * will not be cleared and the return result will be the hiragana result.
+ *
+ * @param doHiragana4
+ * flag indicator that Hiragana Quaternary has to be observed
+ * @param lowestpvalue
+ * the lowest primary value that will not be ignored if alternate handling is shifted
+ * @param source
+ * text string
+ * @param target
+ * text string
+ * @param textoffset
+ * offset in text to start the comparison
+ * @return comparison result if a primary difference is found, otherwise hiragana result
*/
- private final int doPrimaryCompare(boolean doHiragana4, int lowestpvalue,
- String source, String target,
- int textoffset)
+ private final int doPrimaryCompare(boolean doHiragana4, int lowestpvalue, String source, String target,
+ int textoffset)
{
// Preparing the context objects for iterating over strings
@@ -3242,72 +3200,62 @@ public final class RuleBasedCollator extends Collator
// We fetch CEs until we hit a non ignorable primary or end.
do {
sorder = m_srcUtilColEIter_.next();
- m_srcUtilCEBuffer_ = append(m_srcUtilCEBuffer_,
- m_srcUtilCEBufferSize_, sorder);
- m_srcUtilCEBufferSize_ ++;
+ m_srcUtilCEBuffer_ = append(m_srcUtilCEBuffer_, m_srcUtilCEBufferSize_, sorder);
+ m_srcUtilCEBufferSize_++;
sorder &= CE_PRIMARY_MASK_;
} while (sorder == CollationElementIterator.IGNORABLE);
int torder = 0;
do {
torder = m_tgtUtilColEIter_.next();
- m_tgtUtilCEBuffer_ = append(m_tgtUtilCEBuffer_,
- m_tgtUtilCEBufferSize_, torder);
- m_tgtUtilCEBufferSize_ ++;
+ m_tgtUtilCEBuffer_ = append(m_tgtUtilCEBuffer_, m_tgtUtilCEBufferSize_, torder);
+ m_tgtUtilCEBufferSize_++;
torder &= CE_PRIMARY_MASK_;
} while (torder == CollationElementIterator.IGNORABLE);
+ if (!isContinuation(sorder) && m_leadBytePermutationTable_ != null) {
+ sorder = (m_leadBytePermutationTable_[((sorder >> 24) + 256) % 256] << 24) | (sorder & 0x00FFFFFF);
+ torder = (m_leadBytePermutationTable_[((torder >> 24) + 256) % 256] << 24) | (torder & 0x00FFFFFF);
+ }
+
// if both primaries are the same
if (sorder == torder) {
// and there are no more CEs, we advance to the next level
// see if we are at the end of either string
- if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1]
- == CollationElementIterator.NULLORDER) {
- if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1]
- != CollationElementIterator.NULLORDER) {
+ if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) {
+ if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1] != CollationElementIterator.NULLORDER) {
return -1;
}
break;
- }
- else if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1]
- == CollationElementIterator.NULLORDER) {
+ } else if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) {
return 1;
}
if (doHiragana4 && hiraganaresult == 0
- && m_srcUtilColEIter_.m_isCodePointHiragana_ !=
- m_tgtUtilColEIter_.m_isCodePointHiragana_) {
+ && m_srcUtilColEIter_.m_isCodePointHiragana_ != m_tgtUtilColEIter_.m_isCodePointHiragana_) {
if (m_srcUtilColEIter_.m_isCodePointHiragana_) {
hiraganaresult = -1;
- }
- else {
+ } else {
hiraganaresult = 1;
}
}
- }
- else {
+ } else {
// if two primaries are different, we are done
return endPrimaryCompare(sorder, torder);
}
}
// no primary difference... do the rest from the buffers
return hiraganaresult;
- }
- else { // shifted - do a slightly more complicated processing :)
+ } else { // shifted - do a slightly more complicated processing :)
while (true) {
- int sorder = getPrimaryShiftedCompareCE(m_srcUtilColEIter_,
- lowestpvalue, true);
- int torder = getPrimaryShiftedCompareCE(m_tgtUtilColEIter_,
- lowestpvalue, false);
+ int sorder = getPrimaryShiftedCompareCE(m_srcUtilColEIter_, lowestpvalue, true);
+ int torder = getPrimaryShiftedCompareCE(m_tgtUtilColEIter_, lowestpvalue, false);
if (sorder == torder) {
- if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1]
- == CollationElementIterator.NULLORDER) {
+ if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) {
break;
- }
- else {
+ } else {
continue;
}
- }
- else {
+ } else {
return endPrimaryCompare(sorder, torder);
}
} // no primary difference... do the rest from the buffers
@@ -3316,24 +3264,20 @@ public final class RuleBasedCollator extends Collator
}
/**
- * This is used only for primary strength when we know that sorder is
- * already different from torder.
- * Compares sorder and torder, returns -1 if sorder is less than torder.
- * Clears the cebuffer at the same time.
- * @param sorder source strength order
- * @param torder target strength order
+ * This is used only for primary strength when we know that sorder is already different from torder. Compares sorder
+ * and torder, returns -1 if sorder is less than torder. Clears the cebuffer at the same time.
+ *
+ * @param sorder
+ * source strength order
+ * @param torder
+ * target strength order
* @return the comparison result of sorder and torder
*/
- private final int endPrimaryCompare(int sorder, int torder)
- {
+ private final int endPrimaryCompare(int sorder, int torder) {
// if we reach here, the ce offset accessed is the last ce
// appended to the buffer
- boolean isSourceNullOrder = (m_srcUtilCEBuffer_[
- m_srcUtilCEBufferSize_ - 1]
- == CollationElementIterator.NULLORDER);
- boolean isTargetNullOrder = (m_tgtUtilCEBuffer_[
- m_tgtUtilCEBufferSize_ - 1]
- == CollationElementIterator.NULLORDER);
+ boolean isSourceNullOrder = (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER);
+ boolean isTargetNullOrder = (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER);
m_srcUtilCEBufferSize_ = -1;
m_tgtUtilCEBufferSize_ = -1;
if (isSourceNullOrder) {
@@ -3344,26 +3288,25 @@ public final class RuleBasedCollator extends Collator
}
// getting rid of the sign
sorder >>>= CE_PRIMARY_SHIFT_;
- torder >>>= CE_PRIMARY_SHIFT_;
- if (sorder < torder) {
- return -1;
- }
- return 1;
+ torder >>>= CE_PRIMARY_SHIFT_;
+ if (sorder < torder) {
+ return -1;
+ }
+ return 1;
}
/**
- * Calculates the next primary shifted value and fills up cebuffer with the
- * next non-ignorable ce.
- * @param coleiter collation element iterator
- * @param doHiragana4 flag indicator if hiragana quaternary is to be
- * handled
- * @param lowestpvalue lowest primary shifted value that will not be
- * ignored
+ * Calculates the next primary shifted value and fills up cebuffer with the next non-ignorable ce.
+ *
+ * @param coleiter
+ * collation element iterator
+ * @param isSrc
+ * true to store the updated CE buffer and its size in the source fields, false to store them in the target fields
+ * @param lowestpvalue
+ * lowest primary shifted value that will not be ignored
* @return result next modified ce
*/
- private final int getPrimaryShiftedCompareCE(
- CollationElementIterator coleiter,
- int lowestpvalue, boolean isSrc)
+ private final int getPrimaryShiftedCompareCE(CollationElementIterator coleiter, int lowestpvalue, boolean isSrc)
{
boolean shifted = false;
@@ -3378,60 +3321,48 @@ public final class RuleBasedCollator extends Collator
result = coleiter.next();
if (result == CollationElementIterator.NULLORDER) {
cebuffer = append(cebuffer, cebuffersize, result);
- cebuffersize ++;
+ cebuffersize++;
break;
- }
- else if (result == CollationElementIterator.IGNORABLE
- || (shifted
- && (result & CE_PRIMARY_MASK_)
- == CollationElementIterator.IGNORABLE)) {
+ } else if (result == CollationElementIterator.IGNORABLE
+ || (shifted && (result & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE)) {
// UCA amendment - ignore ignorables that follow shifted code
// points
continue;
- }
- else if (isContinuation(result)) {
- if ((result & CE_PRIMARY_MASK_)
- != CollationElementIterator.IGNORABLE) {
+ } else if (isContinuation(result)) {
+ if ((result & CE_PRIMARY_MASK_) != CollationElementIterator.IGNORABLE) {
// There is primary value
if (shifted) {
- result = (result & CE_PRIMARY_MASK_)
- | CE_CONTINUATION_MARKER_;
+ result = (result & CE_PRIMARY_MASK_) | CE_CONTINUATION_MARKER_;
// preserve interesting continuation
cebuffer = append(cebuffer, cebuffersize, result);
- cebuffersize ++;
+ cebuffersize++;
continue;
- }
- else {
+ } else {
cebuffer = append(cebuffer, cebuffersize, result);
- cebuffersize ++;
+ cebuffersize++;
break;
}
- }
- else { // Just lower level values
+ } else { // Just lower level values
if (!shifted) {
cebuffer = append(cebuffer, cebuffersize, result);
- cebuffersize ++;
+ cebuffersize++;
}
}
- }
- else { // regular
- if (Utility.compareUnsigned(result & CE_PRIMARY_MASK_,
- lowestpvalue) > 0) {
+ } else { // regular
+ if (Utility.compareUnsigned(result & CE_PRIMARY_MASK_, lowestpvalue) > 0) {
cebuffer = append(cebuffer, cebuffersize, result);
- cebuffersize ++;
+ cebuffersize++;
break;
- }
- else {
+ } else {
if ((result & CE_PRIMARY_MASK_) != 0) {
shifted = true;
result &= CE_PRIMARY_MASK_;
cebuffer = append(cebuffer, cebuffersize, result);
- cebuffersize ++;
+ cebuffersize++;
continue;
- }
- else {
+ } else {
cebuffer = append(cebuffer, cebuffersize, result);
- cebuffersize ++;
+ cebuffersize++;
shifted = false;
continue;
}
@@ -3441,8 +3372,7 @@ public final class RuleBasedCollator extends Collator
if (isSrc) {
m_srcUtilCEBuffer_ = cebuffer;
m_srcUtilCEBufferSize_ = cebuffersize;
- }
- else {
+ } else {
m_tgtUtilCEBuffer_ = cebuffer;
m_tgtUtilCEBufferSize_ = cebuffersize;
}
@@ -3451,16 +3381,17 @@ public final class RuleBasedCollator extends Collator
}
/**
- * Appending an int to an array of ints and increases it if we run out of
- * space
- * @param array of int arrays
- * @param appendindex index at which value will be appended
- * @param value to append
- * @return array if size is not increased, otherwise a new array will be
- * returned
+ * Appends an int to an array of ints, growing the array if we run out of space.
+ *
+ * @param array
+ * of int arrays
+ * @param appendindex
+ * index at which value will be appended
+ * @param value
+ * to append
+ * @return array if size is not increased, otherwise a new array will be returned
*/
- private static final int[] append(int array[], int appendindex, int value)
- {
+ private static final int[] append(int array[], int appendindex, int value) {
if (appendindex + 1 >= array.length) {
array = increase(array, appendindex, CE_BUFFER_SIZE_);
}
@@ -3470,11 +3401,12 @@ public final class RuleBasedCollator extends Collator
/**
* Does secondary strength comparison based on the collected ces.
- * @param doFrench flag indicates if French ordering is to be done
+ *
+ * @param doFrench
+ * flag indicates if French ordering is to be done
* @return the secondary strength comparison result
*/
- private final int doSecondaryCompare(boolean doFrench)
- {
+ private final int doSecondaryCompare(boolean doFrench) {
// now, we're gonna reexamine collected CEs
if (!doFrench) { // normal
int soffset = 0;
@@ -3482,43 +3414,33 @@ public final class RuleBasedCollator extends Collator
while (true) {
int sorder = CollationElementIterator.IGNORABLE;
while (sorder == CollationElementIterator.IGNORABLE) {
- sorder = m_srcUtilCEBuffer_[soffset ++]
- & CE_SECONDARY_MASK_;
+ sorder = m_srcUtilCEBuffer_[soffset++] & CE_SECONDARY_MASK_;
}
int torder = CollationElementIterator.IGNORABLE;
while (torder == CollationElementIterator.IGNORABLE) {
- torder = m_tgtUtilCEBuffer_[toffset ++]
- & CE_SECONDARY_MASK_;
+ torder = m_tgtUtilCEBuffer_[toffset++] & CE_SECONDARY_MASK_;
}
if (sorder == torder) {
- if (m_srcUtilCEBuffer_[soffset - 1]
- == CollationElementIterator.NULLORDER) {
- if (m_tgtUtilCEBuffer_[toffset - 1]
- != CollationElementIterator.NULLORDER) {
+ if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
+ if (m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
return -1;
}
break;
- }
- else if (m_tgtUtilCEBuffer_[toffset - 1]
- == CollationElementIterator.NULLORDER) {
+ } else if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
return 1;
}
- }
- else {
- if (m_srcUtilCEBuffer_[soffset - 1] ==
- CollationElementIterator.NULLORDER) {
+ } else {
+ if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
return -1;
}
- if (m_tgtUtilCEBuffer_[toffset - 1] ==
- CollationElementIterator.NULLORDER) {
+ if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
return 1;
}
return (sorder < torder) ? -1 : 1;
}
}
- }
- else { // do the French
+ } else { // do the French
m_srcUtilContOffset_ = 0;
m_tgtUtilContOffset_ = 0;
m_srcUtilOffset_ = m_srcUtilCEBufferSize_ - 2;
@@ -3528,13 +3450,10 @@ public final class RuleBasedCollator extends Collator
int torder = getSecondaryFrenchCE(false);
if (sorder == torder) {
if ((m_srcUtilOffset_ < 0 && m_tgtUtilOffset_ < 0)
- || (m_srcUtilOffset_ >= 0
- && m_srcUtilCEBuffer_[m_srcUtilOffset_]
- == CollationElementIterator.NULLORDER)) {
+ || (m_srcUtilOffset_ >= 0 && m_srcUtilCEBuffer_[m_srcUtilOffset_] == CollationElementIterator.NULLORDER)) {
break;
}
- }
- else {
+ } else {
return (sorder < torder) ? -1 : 1;
}
}
@@ -3544,11 +3463,12 @@ public final class RuleBasedCollator extends Collator
/**
* Calculates the next secondary french CE.
- * @param isSrc flag indicator if we are calculating the src ces
+ *
+ * @param isSrc
+ * flag indicator if we are calculating the src ces
* @return result next modified ce
*/
- private final int getSecondaryFrenchCE(boolean isSrc)
- {
+ private final int getSecondaryFrenchCE(boolean isSrc) {
int result = CollationElementIterator.IGNORABLE;
int offset = m_srcUtilOffset_;
int continuationoffset = m_srcUtilContOffset_;
@@ -3559,11 +3479,10 @@ public final class RuleBasedCollator extends Collator
cebuffer = m_tgtUtilCEBuffer_;
}
- while (result == CollationElementIterator.IGNORABLE
- && offset >= 0) {
+ while (result == CollationElementIterator.IGNORABLE && offset >= 0) {
if (continuationoffset == 0) {
result = cebuffer[offset];
- while (isContinuation(cebuffer[offset --])){
+ while (isContinuation(cebuffer[offset--])) {
}
// after this, sorder is at the start of continuation,
// and offset points before that
@@ -3572,9 +3491,8 @@ public final class RuleBasedCollator extends Collator
continuationoffset = offset;
offset += 2;
}
- }
- else {
- result = cebuffer[offset ++];
+ } else {
+ result = cebuffer[offset++];
if (!isContinuation(result)) {
// we have finished with this continuation
offset = continuationoffset;
@@ -3588,8 +3506,7 @@ public final class RuleBasedCollator extends Collator
if (isSrc) {
m_srcUtilOffset_ = offset;
m_srcUtilContOffset_ = continuationoffset;
- }
- else {
+ } else {
m_tgtUtilOffset_ = offset;
m_tgtUtilContOffset_ = continuationoffset;
}
@@ -3598,39 +3515,35 @@ public final class RuleBasedCollator extends Collator
/**
* Does case strength comparison based on the collected ces.
+ *
* @return the case strength comparison result
*/
- private final int doCaseCompare()
- {
+ private final int doCaseCompare() {
int soffset = 0;
int toffset = 0;
while (true) {
int sorder = CollationElementIterator.IGNORABLE;
int torder = CollationElementIterator.IGNORABLE;
- while ((sorder & CE_REMOVE_CASE_)
- == CollationElementIterator.IGNORABLE) {
- sorder = m_srcUtilCEBuffer_[soffset ++];
+ while ((sorder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) {
+ sorder = m_srcUtilCEBuffer_[soffset++];
if (!isContinuation(sorder) && ((sorder & CE_PRIMARY_MASK_) != 0 || m_utilCompare2_ == true)) {
// primary ignorables should not be considered on the case level when the strength is primary
// otherwise, the CEs stop being well-formed
sorder &= CE_CASE_MASK_3_;
sorder ^= m_caseSwitch_;
- }
- else {
+ } else {
sorder = CollationElementIterator.IGNORABLE;
}
}
- while ((torder & CE_REMOVE_CASE_)
- == CollationElementIterator.IGNORABLE) {
- torder = m_tgtUtilCEBuffer_[toffset ++];
+ while ((torder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) {
+ torder = m_tgtUtilCEBuffer_[toffset++];
if (!isContinuation(torder) && ((torder & CE_PRIMARY_MASK_) != 0 || m_utilCompare2_ == true)) {
// primary ignorables should not be considered on the case level when the strength is primary
// otherwise, the CEs stop being well-formed
torder &= CE_CASE_MASK_3_;
torder ^= m_caseSwitch_;
- }
- else {
+ } else {
torder = CollationElementIterator.IGNORABLE;
}
}
@@ -3639,26 +3552,19 @@ public final class RuleBasedCollator extends Collator
torder &= CE_CASE_BIT_MASK_;
if (sorder == torder) {
// checking end of strings
- if (m_srcUtilCEBuffer_[soffset - 1]
- == CollationElementIterator.NULLORDER) {
- if (m_tgtUtilCEBuffer_[toffset - 1]
- != CollationElementIterator.NULLORDER) {
+ if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
+ if (m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
return -1;
}
break;
- }
- else if (m_tgtUtilCEBuffer_[toffset - 1]
- == CollationElementIterator.NULLORDER) {
+ } else if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
return 1;
}
- }
- else {
- if (m_srcUtilCEBuffer_[soffset - 1]
- == CollationElementIterator.NULLORDER) {
+ } else {
+ if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
return -1;
}
- if (m_tgtUtilCEBuffer_[soffset - 1]
- == CollationElementIterator.NULLORDER) {
+ if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) { // target buffer is indexed by toffset
return 1;
}
return (sorder < torder) ? -1 : 1;
@@ -3669,58 +3575,47 @@ public final class RuleBasedCollator extends Collator
/**
* Does tertiary strength comparison based on the collected ces.
+ *
* @return the tertiary strength comparison result
*/
- private final int doTertiaryCompare()
- {
+ private final int doTertiaryCompare() {
int soffset = 0;
int toffset = 0;
while (true) {
int sorder = CollationElementIterator.IGNORABLE;
int torder = CollationElementIterator.IGNORABLE;
- while ((sorder & CE_REMOVE_CASE_)
- == CollationElementIterator.IGNORABLE) {
- sorder = m_srcUtilCEBuffer_[soffset ++] & m_mask3_;
+ while ((sorder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) {
+ sorder = m_srcUtilCEBuffer_[soffset++] & m_mask3_;
if (!isContinuation(sorder)) {
sorder ^= m_caseSwitch_;
- }
- else {
+ } else {
sorder &= CE_REMOVE_CASE_;
}
}
- while ((torder & CE_REMOVE_CASE_)
- == CollationElementIterator.IGNORABLE) {
- torder = m_tgtUtilCEBuffer_[toffset ++] & m_mask3_;
+ while ((torder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) {
+ torder = m_tgtUtilCEBuffer_[toffset++] & m_mask3_;
if (!isContinuation(torder)) {
torder ^= m_caseSwitch_;
- }
- else {
+ } else {
torder &= CE_REMOVE_CASE_;
}
}
if (sorder == torder) {
- if (m_srcUtilCEBuffer_[soffset - 1]
- == CollationElementIterator.NULLORDER) {
- if (m_tgtUtilCEBuffer_[toffset - 1]
- != CollationElementIterator.NULLORDER) {
+ if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
+ if (m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
return -1;
}
break;
- }
- else if (m_tgtUtilCEBuffer_[toffset - 1]
- == CollationElementIterator.NULLORDER) {
+ } else if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
return 1;
}
- }
- else {
- if (m_srcUtilCEBuffer_[soffset - 1] ==
- CollationElementIterator.NULLORDER) {
+ } else {
+ if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
return -1;
}
- if (m_tgtUtilCEBuffer_[toffset - 1] ==
- CollationElementIterator.NULLORDER) {
+ if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
return 1;
}
return (sorder < torder) ? -1 : 1;
@@ -3731,12 +3626,12 @@ public final class RuleBasedCollator extends Collator
/**
* Does quaternary strength comparison based on the collected ces.
- * @param lowestpvalue the lowest primary value that will not be ignored if
- * alternate handling is shifted
+ *
+ * @param lowestpvalue
+ * the lowest primary value that will not be ignored if alternate handling is shifted
* @return the quaternary strength comparison result
*/
- private final int doQuaternaryCompare(int lowestpvalue)
- {
+ private final int doQuaternaryCompare(int lowestpvalue) {
boolean sShifted = true;
boolean tShifted = true;
int soffset = 0;
@@ -3744,100 +3639,84 @@ public final class RuleBasedCollator extends Collator
while (true) {
int sorder = CollationElementIterator.IGNORABLE;
int torder = CollationElementIterator.IGNORABLE;
- while (sorder == CollationElementIterator.IGNORABLE
- || (isContinuation(sorder) && !sShifted)) {
- sorder = m_srcUtilCEBuffer_[soffset ++];
+ while (sorder == CollationElementIterator.IGNORABLE || (isContinuation(sorder) && !sShifted)) {
+ sorder = m_srcUtilCEBuffer_[soffset++];
if (isContinuation(sorder)) {
if (!sShifted) {
continue;
}
- }
- else if (Utility.compareUnsigned(sorder, lowestpvalue) > 0
- || (sorder & CE_PRIMARY_MASK_)
- == CollationElementIterator.IGNORABLE) {
+ } else if (Utility.compareUnsigned(sorder, lowestpvalue) > 0
+ || (sorder & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE) {
// non continuation
sorder = CE_PRIMARY_MASK_;
sShifted = false;
- }
- else {
+ } else {
sShifted = true;
}
}
sorder >>>= CE_PRIMARY_SHIFT_;
- while (torder == CollationElementIterator.IGNORABLE
- || (isContinuation(torder) && !tShifted)) {
- torder = m_tgtUtilCEBuffer_[toffset ++];
- if (isContinuation(torder)) {
- if (!tShifted) {
- continue;
+ while (torder == CollationElementIterator.IGNORABLE || (isContinuation(torder) && !tShifted)) {
+ torder = m_tgtUtilCEBuffer_[toffset++];
+ if (isContinuation(torder)) {
+ if (!tShifted) {
+ continue;
+ }
+ } else if (Utility.compareUnsigned(torder, lowestpvalue) > 0
+ || (torder & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE) {
+ // non continuation
+ torder = CE_PRIMARY_MASK_;
+ tShifted = false;
+ } else {
+ tShifted = true;
+ }
}
- }
- else if (Utility.compareUnsigned(torder, lowestpvalue) > 0
- || (torder & CE_PRIMARY_MASK_)
- == CollationElementIterator.IGNORABLE) {
- // non continuation
- torder = CE_PRIMARY_MASK_;
- tShifted = false;
- }
- else {
- tShifted = true;
- }
- }
- torder >>>= CE_PRIMARY_SHIFT_;
+ torder >>>= CE_PRIMARY_SHIFT_;
- if (sorder == torder) {
- if (m_srcUtilCEBuffer_[soffset - 1]
- == CollationElementIterator.NULLORDER) {
- if (m_tgtUtilCEBuffer_[toffset - 1]
- != CollationElementIterator.NULLORDER) {
- return -1;
+ if (sorder == torder) {
+ if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
+ if (m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
+ return -1;
+ }
+ break;
+ } else if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
+ return 1;
+ }
+ } else {
+ if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
+ return -1;
+ }
+ if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
+ return 1;
+ }
+ return (sorder < torder) ? -1 : 1;
}
- break;
- }
- else if (m_tgtUtilCEBuffer_[toffset - 1]
- == CollationElementIterator.NULLORDER) {
- return 1;
- }
- }
- else {
- if (m_srcUtilCEBuffer_[soffset - 1] ==
- CollationElementIterator.NULLORDER) {
- return -1;
- }
- if (m_tgtUtilCEBuffer_[toffset - 1] ==
- CollationElementIterator.NULLORDER) {
- return 1;
- }
- return (sorder < torder) ? -1 : 1;
- }
}
return 0;
}
/**
- * Internal function. Does byte level string compare. Used by strcoll if
- * strength == identical and strings are otherwise equal. This is a rare
- * case. Comparison must be done on NFD normalized strings. FCD is not good
- * enough.
- * @param source text
- * @param target text
- * @param offset of the first difference in the text strings
- * @param normalize flag indicating if we are to normalize the text before
- * comparison
+ * Internal function. Does byte level string compare. Used by strcoll if strength == identical and strings are
+ * otherwise equal. This is a rare case. Comparison must be done on NFD normalized strings. FCD is not good enough.
+ *
+ * @param source
+ * text
+ * @param target
+ * text
+ * @param offset
+ * of the first difference in the text strings
+ * @param normalize
+ * flag indicating if we are to normalize the text before comparison
* @return 1 if source is greater than target, -1 less than and 0 if equals
*/
- private static final int doIdenticalCompare(String source, String target,
- int offset, boolean normalize)
+ private static final int doIdenticalCompare(String source, String target, int offset, boolean normalize)
{
if (normalize) {
- if (Normalizer.quickCheck(source, Normalizer.NFD,0)
- != Normalizer.YES) {
+ if (Normalizer.quickCheck(source, Normalizer.NFD, 0) != Normalizer.YES) {
source = Normalizer.decompose(source, false);
}
- if (Normalizer.quickCheck(target, Normalizer.NFD,0)
- != Normalizer.YES) {
+ if (Normalizer.quickCheck(target, Normalizer.NFD, 0) != Normalizer.YES) {
target = Normalizer.decompose(target, false);
}
offset = 0;
@@ -3847,18 +3726,18 @@ public final class RuleBasedCollator extends Collator
}
/**
- * Compares string for their codepoint order.
- * This comparison handles surrogate characters and place them after the
+ * Compares strings by their codepoint order. This comparison handles surrogate characters and places them after the
* all non surrogate characters.
- * @param source text
- * @param target text
- * @param offset start offset for comparison
+ *
+ * @param source
+ * text
+ * @param target
+ * text
+ * @param offset
+ * start offset for comparison
* @return 1 if source is greater than target, -1 less than and 0 if equals
*/
- private static final int doStringCompare(String source,
- String target,
- int offset)
- {
+ private static final int doStringCompare(String source, String target, int offset) {
// compare identical prefixes - they do not need to be fixed up
char schar = 0;
char tchar = 0;
@@ -3867,7 +3746,7 @@ public final class RuleBasedCollator extends Collator
int minlength = Math.min(slength, tlength);
while (offset < minlength) {
schar = source.charAt(offset);
- tchar = target.charAt(offset ++);
+ tchar = target.charAt(offset++);
if (schar != tchar) {
break;
}
@@ -3883,9 +3762,8 @@ public final class RuleBasedCollator extends Collator
return 0;
}
- // if both values are in or above the surrogate range, Fix them up.
- if (schar >= UTF16.LEAD_SURROGATE_MIN_VALUE
- && tchar >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
+ // if both values are in or above the surrogate range, Fix them up.
+ if (schar >= UTF16.LEAD_SURROGATE_MIN_VALUE && tchar >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
schar = fixupUTF16(schar);
tchar = fixupUTF16(tchar);
}
@@ -3897,26 +3775,138 @@ public final class RuleBasedCollator extends Collator
/**
* Rotate surrogates to the top to get code point order
*/
- private static final char fixupUTF16(char ch)
- {
+ private static final char fixupUTF16(char ch) {
if (ch >= 0xe000) {
ch -= 0x800;
- }
- else {
+ } else {
ch += 0x2000;
}
return ch;
}
+ private static final int UCOL_REORDER_CODE_IGNORE = CollationReorderCodes.LIMIT + 1;
+ /**
+ * Builds the lead byte permutation table
+ */
+ private void buildPermutationTable() {
+ if (m_scriptOrder_ == null) {
+ m_leadBytePermutationTable_ = null;
+ return;
+ }
+
+ // TODO - these need to be read in from the UCA data file
+ // The lowest byte that hasn't been assigned a mapping
+ int toBottom = 0x03;
+ // The highest byte that hasn't been assigned a mapping
+ int toTop = 0xe4;
+
+ // filled slots in the output m_scriptOrder_
+ boolean[] permutationSlotFilled = new boolean[256];
+
+ // used lead bytes
+ boolean[] newLeadByteUsed = new boolean[256];
+
+ if (m_leadBytePermutationTable_ == null) {
+ m_leadBytePermutationTable_ = new byte[256];
+ }
+
+ // prefill the reordering codes with the leading entries
+ int[] internalReorderCodes = new int[m_scriptOrder_.length + (CollationReorderCodes.LIMIT - CollationReorderCodes.FIRST)]; // prefix of fixed reorder codes + requested order
+ for (int codeIndex = 0; codeIndex < CollationReorderCodes.LIMIT - CollationReorderCodes.FIRST; codeIndex++) {
+ internalReorderCodes[codeIndex] = CollationReorderCodes.FIRST + codeIndex;
+ }
+ for (int codeIndex = 0; codeIndex < m_scriptOrder_.length; codeIndex++) {
+ internalReorderCodes[codeIndex + (CollationReorderCodes.LIMIT - CollationReorderCodes.FIRST)] = m_scriptOrder_[codeIndex];
+ if (m_scriptOrder_[codeIndex] >= CollationReorderCodes.FIRST && m_scriptOrder_[codeIndex] < CollationReorderCodes.LIMIT) {
+ internalReorderCodes[m_scriptOrder_[codeIndex] - CollationReorderCodes.FIRST] = UCOL_REORDER_CODE_IGNORE;
+ }
+ }
+
+ /*
+ * Start from the front of the list and place each script we encounter at the earliest possible location
+ * in the permutation table. If we encounter UNKNOWN, start processing from the back, and place each script
+ * in the last possible location. At each step, we also need to make sure that any scripts that need to not
+ * be moved are copied to their same location in the final table.
+ */
+ boolean fromTheBottom = true;
+ for (int reorderCodesIndex = 0; reorderCodesIndex < internalReorderCodes.length; reorderCodesIndex++) {
+ int next = internalReorderCodes[reorderCodesIndex];
+ if (next == UCOL_REORDER_CODE_IGNORE) {
+ continue;
+ }
+ if (next == UScript.UNKNOWN) {
+ if (fromTheBottom == false) {
+ // double turnaround
+ //*status = U_ILLEGAL_ARGUMENT_ERROR;
+ // TODO - exception
+ m_leadBytePermutationTable_ = null;
+ return;
+ }
+ fromTheBottom = false;
+ continue;
+ }
+
+ int[] leadBytes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getLeadBytesForReorderCode(next);
+ if (fromTheBottom) {
+ for (int leadByte : leadBytes) {
+ // don't place a lead byte twice in the permutation table
+ if (permutationSlotFilled[leadByte]) {
+ // lead byte already used
+ //*status = U_ILLEGAL_ARGUMENT_ERROR;
+ // TODO - exception?
+ m_leadBytePermutationTable_ = null;
+ return;
+ }
+ m_leadBytePermutationTable_[leadByte] = (byte) toBottom;
+ newLeadByteUsed[toBottom] = true;
+ permutationSlotFilled[leadByte] = true;
+ toBottom++;
+ }
+ } else {
+ for (int leadByteIndex = leadBytes.length - 1; leadByteIndex >= 0; leadByteIndex--) {
+ int leadByte = leadBytes[leadByteIndex];
+ // don't place a lead byte twice in the permutation table
+ if (permutationSlotFilled[leadByte]) {
+ // lead byte already used
+ //*status = U_ILLEGAL_ARGUMENT_ERROR;
+ // TODO - exception?
+ m_leadBytePermutationTable_ = null;
+ return;
+ }
+
+ m_leadBytePermutationTable_[leadByte] = (byte) toTop;
+ newLeadByteUsed[toTop] = true;
+ permutationSlotFilled[leadByte] = true;
+ toTop--;
+ }
+ }
+ }
+
+ /* Copy everything that's left over */
+ int reorderCode = 0;
+ for (int i = 0; i < 256; i++) {
+ if (!permutationSlotFilled[i]) {
+ while (reorderCode < 256 && newLeadByteUsed[reorderCode]) {
+ reorderCode++;
+ }
+ m_leadBytePermutationTable_[i] = (byte) reorderCode;
+ permutationSlotFilled[i] = true;
+ newLeadByteUsed[reorderCode] = true;
+ }
+ }
+
+ // for (int i = 0; i < 256; i++){
+ // System.out.println(Integer.toString(i, 16) + " -> " + Integer.toString(m_scriptReorderTable_[i], 16));
+ // }
+ }
+
/**
* Resets the internal case data members and compression values.
*/
- private void updateInternalState()
- {
+ private void updateInternalState() {
if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
m_caseSwitch_ = CASE_SWITCH_;
- }
- else {
+ } else {
m_caseSwitch_ = NO_CASE_SWITCH_;
}
@@ -3926,8 +3916,7 @@ public final class RuleBasedCollator extends Collator
m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_OFF_;
m_top3_ = COMMON_TOP_CASE_SWITCH_OFF_3_;
m_bottom3_ = COMMON_BOTTOM_3_;
- }
- else {
+ } else {
m_mask3_ = CE_KEEP_CASE_;
m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_ON_;
if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
@@ -3944,31 +3933,30 @@ public final class RuleBasedCollator extends Collator
// Set the compression values
int total3 = m_top3_ - COMMON_BOTTOM_3_ - 1;
// we multilply double with int, but need only int
- m_topCount3_ = (int)(PROPORTION_3_ * total3);
+ m_topCount3_ = (int) (PROPORTION_3_ * total3);
m_bottomCount3_ = total3 - m_topCount3_;
- if (!m_isCaseLevel_ && getStrength() == AttributeValue.TERTIARY_
- && !m_isFrenchCollation_ && !m_isAlternateHandlingShifted_) {
+ if (!m_isCaseLevel_ && getStrength() == AttributeValue.TERTIARY_ && !m_isFrenchCollation_
+ && !m_isAlternateHandlingShifted_) {
m_isSimple3_ = true;
- }
- else {
+ } else {
m_isSimple3_ = false;
}
- if(!m_isCaseLevel_ && getStrength() <= AttributeValue.TERTIARY_ && !m_isNumericCollation_
- && !m_isAlternateHandlingShifted_ && !latinOneFailed_) {
- if(latinOneCEs_ == null || latinOneRegenTable_) {
- if(setUpLatinOne()) { // if we succeed in building latin1 table, we'll use it
- latinOneUse_ = true;
- } else {
- latinOneUse_ = false;
- latinOneFailed_ = true;
+ if (!m_isCaseLevel_ && getStrength() <= AttributeValue.TERTIARY_ && !m_isNumericCollation_
+ && !m_isAlternateHandlingShifted_ && !latinOneFailed_) {
+ if (latinOneCEs_ == null || latinOneRegenTable_) {
+ if (setUpLatinOne()) { // if we succeed in building latin1 table, we'll use it
+ latinOneUse_ = true;
+ } else {
+ latinOneUse_ = false;
+ latinOneFailed_ = true;
+ }
+ latinOneRegenTable_ = false;
+ } else { // latin1Table exists and it doesn't need to be regenerated, just use it
+ latinOneUse_ = true;
}
- latinOneRegenTable_ = false;
- } else { // latin1Table exists and it doesn't need to be regenerated, just use it
- latinOneUse_ = true;
- }
} else {
- latinOneUse_ = false;
+ latinOneUse_ = false;
}
}
@@ -3976,19 +3964,15 @@ public final class RuleBasedCollator extends Collator
/**
* Initializes the RuleBasedCollator
*/
- private final void init()
- {
- for (m_minUnsafe_ = 0; m_minUnsafe_ < DEFAULT_MIN_HEURISTIC_;
- m_minUnsafe_ ++) {
+ private final void init() {
+ for (m_minUnsafe_ = 0; m_minUnsafe_ < DEFAULT_MIN_HEURISTIC_; m_minUnsafe_++) {
// Find the smallest unsafe char.
if (isUnsafe(m_minUnsafe_)) {
break;
}
}
- for (m_minContractionEnd_ = 0;
- m_minContractionEnd_ < DEFAULT_MIN_HEURISTIC_;
- m_minContractionEnd_ ++) {
+ for (m_minContractionEnd_ = 0; m_minContractionEnd_ < DEFAULT_MIN_HEURISTIC_; m_minContractionEnd_++) {
// Find the smallest contraction-ending char.
if (isContractionEnd(m_minContractionEnd_)) {
break;
@@ -4005,11 +3989,19 @@ public final class RuleBasedCollator extends Collator
m_isHiragana4_ = m_defaultIsHiragana4_;
m_isNumericCollation_ = m_defaultIsNumericCollation_;
latinOneFailed_ = false;
+ if (m_defaultScriptOrder_ != null) {
+ m_scriptOrder_ = new int[m_defaultScriptOrder_.length];
+ for (int i = 0; i < m_defaultScriptOrder_.length; i++) {
+ m_scriptOrder_[i] = m_defaultScriptOrder_[i];
+ }
+ } else {
+ m_scriptOrder_ = null;
+ }
updateInternalState();
}
/**
- * Initializes utility iterators and byte buffer used by compare
+ * Initializes utility iterators and byte buffer used by compare
*/
private final void initUtility(boolean allocate) {
if (allocate) {
@@ -4022,7 +4014,7 @@ public final class RuleBasedCollator extends Collator
m_utilBytes1_ = new byte[SORT_BUFFER_INIT_SIZE_1_]; // primary
m_utilBytes2_ = new byte[SORT_BUFFER_INIT_SIZE_2_]; // secondary
m_utilBytes3_ = new byte[SORT_BUFFER_INIT_SIZE_3_]; // tertiary
- m_utilBytes4_ = new byte[SORT_BUFFER_INIT_SIZE_4_]; // Quaternary
+ m_utilBytes4_ = new byte[SORT_BUFFER_INIT_SIZE_4_]; // Quaternary
m_srcUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
m_tgtUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
}
@@ -4043,10 +4035,10 @@ public final class RuleBasedCollator extends Collator
// Consts for Latin-1 special processing
private static final int ENDOFLATINONERANGE_ = 0xFF;
- private static final int LATINONETABLELEN_ = (ENDOFLATINONERANGE_+50);
- private static final int BAIL_OUT_CE_ = 0xFF000000;
+ private static final int LATINONETABLELEN_ = (ENDOFLATINONERANGE_ + 50);
+ private static final int BAIL_OUT_CE_ = 0xFF000000;
- /**
+ /**
* Generate latin-1 tables
*/
@@ -4056,211 +4048,211 @@ public final class RuleBasedCollator extends Collator
int terShift = 24;
}
- private final void
- addLatinOneEntry(char ch, int CE, shiftValues sh) {
- int primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
- boolean reverseSecondary = false;
- if(!isContinuation(CE)) {
- tertiary = ((CE & m_mask3_));
- tertiary ^= m_caseSwitch_;
- reverseSecondary = true;
- } else {
- tertiary = (byte)((CE & CE_REMOVE_CONTINUATION_MASK_));
- tertiary &= CE_REMOVE_CASE_;
- reverseSecondary = false;
- }
-
- secondary = ((CE >>>= 8) & LAST_BYTE_MASK_);
- primary2 = ((CE >>>= 8) & LAST_BYTE_MASK_);
- primary1 = (CE >>> 8);
-
- if(primary1 != 0) {
- latinOneCEs_[ch] |= (primary1 << sh.primShift);
- sh.primShift -= 8;
- }
- if(primary2 != 0) {
- if(sh.primShift < 0) {
- latinOneCEs_[ch] = BAIL_OUT_CE_;
- latinOneCEs_[latinOneTableLen_+ch] = BAIL_OUT_CE_;
- latinOneCEs_[2*latinOneTableLen_+ch] = BAIL_OUT_CE_;
- return;
+ private final void addLatinOneEntry(char ch, int CE, shiftValues sh) {
+ int primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
+ boolean reverseSecondary = false;
+ if (!isContinuation(CE)) {
+ tertiary = ((CE & m_mask3_));
+ tertiary ^= m_caseSwitch_;
+ reverseSecondary = true;
+ } else {
+ tertiary = (byte) ((CE & CE_REMOVE_CONTINUATION_MASK_));
+ tertiary &= CE_REMOVE_CASE_;
+ reverseSecondary = false;
}
- latinOneCEs_[ch] |= (primary2 << sh.primShift);
- sh.primShift -= 8;
- }
- if(secondary != 0) {
- if(reverseSecondary && m_isFrenchCollation_) { // reverse secondary
- latinOneCEs_[latinOneTableLen_+ch] >>>= 8; // make space for secondary
- latinOneCEs_[latinOneTableLen_+ch] |= (secondary << 24);
- } else { // normal case
- latinOneCEs_[latinOneTableLen_+ch] |= (secondary << sh.secShift);
+
+ secondary = ((CE >>>= 8) & LAST_BYTE_MASK_);
+ primary2 = ((CE >>>= 8) & LAST_BYTE_MASK_);
+ primary1 = (CE >>> 8);
+
+ if (primary1 != 0) {
+ latinOneCEs_[ch] |= (primary1 << sh.primShift);
+ sh.primShift -= 8;
+ }
+ if (primary2 != 0) {
+ if (sh.primShift < 0) {
+ latinOneCEs_[ch] = BAIL_OUT_CE_;
+ latinOneCEs_[latinOneTableLen_ + ch] = BAIL_OUT_CE_;
+ latinOneCEs_[2 * latinOneTableLen_ + ch] = BAIL_OUT_CE_;
+ return;
+ }
+ latinOneCEs_[ch] |= (primary2 << sh.primShift);
+ sh.primShift -= 8;
+ }
+ if (secondary != 0) {
+ if (reverseSecondary && m_isFrenchCollation_) { // reverse secondary
+ latinOneCEs_[latinOneTableLen_ + ch] >>>= 8; // make space for secondary
+ latinOneCEs_[latinOneTableLen_ + ch] |= (secondary << 24);
+ } else { // normal case
+ latinOneCEs_[latinOneTableLen_ + ch] |= (secondary << sh.secShift);
+ }
+ sh.secShift -= 8;
+ }
+ if (tertiary != 0) {
+ latinOneCEs_[2 * latinOneTableLen_ + ch] |= (tertiary << sh.terShift);
+ sh.terShift -= 8;
}
- sh.secShift -= 8;
- }
- if(tertiary != 0) {
- latinOneCEs_[2*latinOneTableLen_+ch] |= (tertiary << sh.terShift);
- sh.terShift -= 8;
- }
}
- private final void
- resizeLatinOneTable(int newSize) {
- int newTable[] = new int[3*newSize];
- int sizeToCopy = ((newSize> 4) - m_expansionOffset_; //it.getExpansionOffset(this, CE);
- int size = CE & 0xF; // getExpansionCount(CE);
- //CE = *CEOffset++;
- if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
- for(i = 0; i> 4) - m_expansionOffset_; // it.getExpansionOffset(this,
+ // CE);
+ int size = CE & 0xF; // getExpansionCount(CE);
+ // CE = *CEOffset++;
+ if (size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
+ for (i = 0; i < size; i++) {
+ if (s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
+ latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
+ latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
+ latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
+ break;
+ }
+ addLatinOneEntry(contractionOffset, m_expansion_[offset + i], s);
+ }
+ } else { /* else, we do */
+ while (m_expansion_[offset] != 0) {
+ if (s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
+ latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
+ latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
+ latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
+ break;
+ }
+ addLatinOneEntry(contractionOffset, m_expansion_[offset++], s);
+ }
+ }
+ contractionOffset++;
+ } else if (!isSpecial(CE)) {
+ addLatinOneEntry(contractionOffset++, CE, s);
+ } else {
+ latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
+ latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
+ latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
+ contractionOffset++;
+ }
+ UCharOffset++;
+ s.primShift = 24;
+ s.secShift = 24;
+ s.terShift = 24;
+ if (contractionOffset == latinOneTableLen_) { // we need to reallocate
+ resizeLatinOneTable(2 * latinOneTableLen_);
+ }
+ } while (m_contractionIndex_[UCharOffset] != 0xFFFF);
+ }
+ break;
+ case CollationElementIterator.CE_SPEC_PROC_TAG_: {
+ // 0xB7 is a precontext character defined in UCA5.1, a special
+ // handle is implemented in order to save LatinOne table for
+ // most locales.
+ if (ch == 0xb7) {
+ addLatinOneEntry(ch, CE, s);
+ } else {
+ latinOneFailed_ = true;
+ return false;
+ }
+ }
+ break;
+ default:
+ latinOneFailed_ = true;
+ return false;
+ }
}
- break;
- case CollationElementIterator.CE_SPEC_PROC_TAG_:
- {
- // 0xB7 is a precontext character defined in UCA5.1, a special
- // handle is implemeted in order to save LatinOne table for
- // most locales.
- if (ch == 0xb7) {
- addLatinOneEntry(ch, CE, s);
- }
- else {
- latinOneFailed_ = true;
- return false;
- }
- }
- break;
- default:
- latinOneFailed_ = true;
- return false;
- }
}
- }
- // compact table
- if(contractionOffset < latinOneTableLen_) {
- resizeLatinOneTable(contractionOffset);
- }
- return true;
+ // compact table
+ if (contractionOffset < latinOneTableLen_) {
+ resizeLatinOneTable(contractionOffset);
+ }
+ return true;
}
private class ContractionInfo {
@@ -4269,71 +4261,59 @@ public final class RuleBasedCollator extends Collator
ContractionInfo m_ContInfo_;
- private int
- getLatinOneContraction(int strength, int CE, String s) {
- //int strength, int CE, String s, Integer ind) {
- int len = s.length();
- //const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
- int UCharOffset = (CE & 0xFFF) - m_contractionOffset_;
- int offset = 1;
- int latinOneOffset = (CE & 0x00FFF000) >>> 12;
- char schar = 0, tchar = 0;
+ private int getLatinOneContraction(int strength, int CE, String s) {
+ // int strength, int CE, String s, Integer ind) {
+ int len = s.length();
+ // const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
+ int UCharOffset = (CE & 0xFFF) - m_contractionOffset_;
+ int offset = 1;
+ int latinOneOffset = (CE & 0x00FFF000) >>> 12;
+ char schar = 0, tchar = 0;
- for(;;) {
- /*
- if(len == -1) {
- if(s[*index] == 0) { // end of string
- return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
- } else {
- schar = s[*index];
- }
- } else {
- */
- if(m_ContInfo_.index == len) {
- return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset]);
- } else {
- schar = s.charAt(m_ContInfo_.index);
- }
- //}
+ for (;;) {
+ /*
+ * if(len == -1) { if(s[*index] == 0) { // end of string
+ * return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); } else { schar = s[*index]; }
+ * } else {
+ */
+ if (m_ContInfo_.index == len) {
+ return (latinOneCEs_[strength * latinOneTableLen_ + latinOneOffset]);
+ } else {
+ schar = s.charAt(m_ContInfo_.index);
+ }
+ // }
- while(schar > (tchar = m_contractionIndex_[UCharOffset+offset]/**(UCharOffset+offset)*/)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
- offset++;
- }
+ while (schar > (tchar = m_contractionIndex_[UCharOffset + offset]/** (UCharOffset+offset) */
+ )) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
+ offset++;
+ }
- if (schar == tchar) {
- m_ContInfo_.index++;
- return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset+offset]);
- }
- else
- {
- if(schar > ENDOFLATINONERANGE_ /*& 0xFF00*/) {
- return BAIL_OUT_CE_;
- }
- // skip completely ignorables
- int isZeroCE = m_trie_.getLeadValue(schar); //UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
- if(isZeroCE == 0) { // we have to ignore completely ignorables
- m_ContInfo_.index++;
- continue;
- }
+ if (schar == tchar) {
+ m_ContInfo_.index++;
+ return (latinOneCEs_[strength * latinOneTableLen_ + latinOneOffset + offset]);
+ } else {
+ if (schar > ENDOFLATINONERANGE_ /* & 0xFF00 */) {
+ return BAIL_OUT_CE_;
+ }
+ // skip completely ignorables
+ int isZeroCE = m_trie_.getLeadValue(schar); // UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
+ if (isZeroCE == 0) { // we have to ignore completely ignorables
+ m_ContInfo_.index++;
+ continue;
+ }
- return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset]);
- }
- }
+ return (latinOneCEs_[strength * latinOneTableLen_ + latinOneOffset]);
+ }
+ }
}
-
/**
- * This is a fast strcoll, geared towards text in Latin-1.
- * It supports contractions of size two, French secondaries
- * and case switching. You can use it with strengths primary
- * to tertiary. It does not support shifted and case level.
- * It relies on the table build by setupLatin1Table. If it
- * doesn't understand something, it will go to the regular
- * strcoll.
+ * This is a fast strcoll, geared towards text in Latin-1. It supports contractions of size two, French secondaries
+ * and case switching. You can use it with strengths primary to tertiary. It does not support shifted and case
+ * level. It relies on the table build by setupLatin1Table. If it doesn't understand something, it will go to the
+ * regular strcoll.
*/
- private final int
- compareUseLatin1(String source, String target, int startOffset)
- {
+ private final int compareUseLatin1(String source, String target, int startOffset) {
int sLen = source.length();
int tLen = target.length();
@@ -4341,318 +4321,328 @@ public final class RuleBasedCollator extends Collator
int sIndex = startOffset, tIndex = startOffset;
char sChar = 0, tChar = 0;
- int sOrder=0, tOrder=0;
+ int sOrder = 0, tOrder = 0;
boolean endOfSource = false;
- //uint32_t *elements = coll->latinOneCEs;
+ // uint32_t *elements = coll->latinOneCEs;
boolean haveContractions = false; // if we have contractions in our string
- // we cannot do French secondary
+ // we cannot do French secondary
int offset = latinOneTableLen_;
// Do the primary level
- primLoop:
- for(;;) {
- while(sOrder==0) { // this loop skips primary ignorables
- // sOrder=getNextlatinOneCE(source);
- if(sIndex==sLen) {
- endOfSource = true;
- break;
- }
- sChar=source.charAt(sIndex++); //[sIndex++];
- //}
- if(sChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out
- //fprintf(stderr, "R");
- return compareRegular(source, target, startOffset);
+ primLoop: for (;;) {
+ while (sOrder == 0) { // this loop skips primary ignorables
+ // sOrder=getNextlatinOneCE(source);
+ if (sIndex == sLen) {
+ endOfSource = true;
+ break;
+ }
+ sChar = source.charAt(sIndex++); // [sIndex++];
+ // }
+ if (sChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out
+ // fprintf(stderr, "R");
+ return compareRegular(source, target, startOffset);
+ }
+ sOrder = latinOneCEs_[sChar];
+ if (isSpecial(sOrder)) { // if we got a special
+ // specials can basically be either contractions or bail-out signs. If we get anything
+ // else, we'll bail out anyway
+ if (getTag(sOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {
+ m_ContInfo_.index = sIndex;
+ sOrder = getLatinOneContraction(0, sOrder, source);
+ sIndex = m_ContInfo_.index;
+ haveContractions = true; // if there are contractions, we cannot do French secondary
+ // However, if there are contractions in the table, but we always use just one char,
+ // we might be able to do French. This should be checked out.
+ }
+ if (isSpecial(sOrder) /* == UCOL_BAIL_OUT_CE */) {
+ // fprintf(stderr, "S");
+ return compareRegular(source, target, startOffset);
+ }
+ }
}
- sOrder = latinOneCEs_[sChar];
- if(isSpecial(sOrder)) { // if we got a special
- // specials can basically be either contractions or bail-out signs. If we get anything
- // else, we'll bail out anywasy
- if(getTag(sOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {
- m_ContInfo_.index = sIndex;
- sOrder = getLatinOneContraction(0, sOrder, source);
- sIndex = m_ContInfo_.index;
- haveContractions = true; // if there are contractions, we cannot do French secondary
- // However, if there are contractions in the table, but we always use just one char,
- // we might be able to do French. This should be checked out.
- }
- if(isSpecial(sOrder) /*== UCOL_BAIL_OUT_CE*/) {
- //fprintf(stderr, "S");
- return compareRegular(source, target, startOffset);
- }
- }
- }
- while(tOrder==0) { // this loop skips primary ignorables
- // tOrder=getNextlatinOneCE(target);
- if(tIndex==tLen) {
- if(endOfSource) {
- break primLoop;
- } else {
- return 1;
- }
+ while (tOrder == 0) { // this loop skips primary ignorables
+ // tOrder=getNextlatinOneCE(target);
+ if (tIndex == tLen) {
+ if (endOfSource) {
+ break primLoop;
+ } else {
+ return 1;
+ }
+ }
+ tChar = target.charAt(tIndex++); // [tIndex++];
+ if (tChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out
+ // fprintf(stderr, "R");
+ return compareRegular(source, target, startOffset);
+ }
+ tOrder = latinOneCEs_[tChar];
+ if (isSpecial(tOrder)) {
+ // Handling specials, see the comments for source
+ if (getTag(tOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {
+ m_ContInfo_.index = tIndex;
+ tOrder = getLatinOneContraction(0, tOrder, target);
+ tIndex = m_ContInfo_.index;
+ haveContractions = true;
+ }
+ if (isSpecial(tOrder)/* == UCOL_BAIL_OUT_CE */) {
+ // fprintf(stderr, "S");
+ return compareRegular(source, target, startOffset);
+ }
+ }
}
- tChar=target.charAt(tIndex++); //[tIndex++];
- if(tChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out
- //fprintf(stderr, "R");
- return compareRegular(source, target, startOffset);
- }
- tOrder = latinOneCEs_[tChar];
- if(isSpecial(tOrder)) {
- // Handling specials, see the comments for source
- if(getTag(tOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {
- m_ContInfo_.index = tIndex;
- tOrder = getLatinOneContraction(0, tOrder, target);
- tIndex = m_ContInfo_.index;
- haveContractions = true;
- }
- if(isSpecial(tOrder)/*== UCOL_BAIL_OUT_CE*/) {
- //fprintf(stderr, "S");
- return compareRegular(source, target, startOffset);
- }
- }
- }
- if(endOfSource) { // source is finished, but target is not, say the result.
- return -1;
- }
-
- if(sOrder == tOrder) { // if we have same CEs, we continue the loop
- sOrder = 0; tOrder = 0;
- continue;
- } else {
- // compare current top bytes
- if(((sOrder^tOrder)&0xFF000000)!=0) {
- // top bytes differ, return difference
- if(sOrder >>> 8 < tOrder >>> 8) {
+ if (endOfSource) { // source is finished, but target is not, say the result.
return -1;
- } else {
- return 1;
- }
- // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
- // since we must return enum value
}
- // top bytes match, continue with following bytes
- sOrder<<=8;
- tOrder<<=8;
- }
+ if (!isContinuation(sOrder) && m_leadBytePermutationTable_ != null) {
+ sOrder = (m_leadBytePermutationTable_[((sOrder >> 24) + 256) % 256] << 24) | (sOrder & 0x00FFFFFF);
+ tOrder = (m_leadBytePermutationTable_[((tOrder >> 24) + 256) % 256] << 24) | (tOrder & 0x00FFFFFF);
+ }
+
+ if (sOrder == tOrder) { // if we have same CEs, we continue the loop
+ sOrder = 0;
+ tOrder = 0;
+ continue;
+ } else {
+ // compare current top bytes
+ if (((sOrder ^ tOrder) & 0xFF000000) != 0) {
+ // top bytes differ, return difference
+ if (sOrder >>> 8 < tOrder >>> 8) {
+ return -1;
+ } else {
+ return 1;
+ }
+ // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
+ // since we must return enum value
+ }
+
+ // top bytes match, continue with following bytes
+ sOrder <<= 8;
+ tOrder <<= 8;
+ }
}
// after primary loop, we definitely know the sizes of strings,
// so we set it and use simpler loop for secondaries and tertiaries
- //sLen = sIndex; tLen = tIndex;
- if(strength >= SECONDARY) {
- // adjust the table beggining
- //latinOneCEs_ += coll->latinOneTableLen;
- endOfSource = false;
+ // sLen = sIndex; tLen = tIndex;
+ if (strength >= SECONDARY) {
+ // adjust the table beginning
+ // latinOneCEs_ += coll->latinOneTableLen;
+ endOfSource = false;
- if(!m_isFrenchCollation_) { // non French
- // This loop is a simplified copy of primary loop
- // at this point we know that whole strings are latin-1, so we don't
- // check for that. We also know that we only have contractions as
- // specials.
- //sIndex = 0; tIndex = 0;
- sIndex = startOffset; tIndex = startOffset;
- secLoop:
- for(;;) {
- while(sOrder==0) {
- if(sIndex==sLen) {
- endOfSource = true;
- break;
- }
- sChar=source.charAt(sIndex++); //[sIndex++];
- sOrder = latinOneCEs_[offset+sChar];
- if(isSpecial(sOrder)) {
- m_ContInfo_.index = sIndex;
- sOrder = getLatinOneContraction(1, sOrder, source);
- sIndex = m_ContInfo_.index;
- }
- }
+ if (!m_isFrenchCollation_) { // non French
+ // This loop is a simplified copy of primary loop
+ // at this point we know that whole strings are latin-1, so we don't
+ // check for that. We also know that we only have contractions as
+ // specials.
+ // sIndex = 0; tIndex = 0;
+ sIndex = startOffset;
+ tIndex = startOffset;
+ secLoop: for (;;) {
+ while (sOrder == 0) {
+ if (sIndex == sLen) {
+ endOfSource = true;
+ break;
+ }
+ sChar = source.charAt(sIndex++); // [sIndex++];
+ sOrder = latinOneCEs_[offset + sChar];
+ if (isSpecial(sOrder)) {
+ m_ContInfo_.index = sIndex;
+ sOrder = getLatinOneContraction(1, sOrder, source);
+ sIndex = m_ContInfo_.index;
+ }
+ }
- while(tOrder==0) {
- if(tIndex==tLen) {
- if(endOfSource) {
- break secLoop;
- } else {
- return 1;
- }
- }
- tChar=target.charAt(tIndex++); //[tIndex++];
- tOrder = latinOneCEs_[offset+tChar];
- if(isSpecial(tOrder)) {
- m_ContInfo_.index = tIndex;
- tOrder = getLatinOneContraction(1, tOrder, target);
- tIndex = m_ContInfo_.index;
- }
- }
- if(endOfSource) {
- return -1;
- }
+ while (tOrder == 0) {
+ if (tIndex == tLen) {
+ if (endOfSource) {
+ break secLoop;
+ } else {
+ return 1;
+ }
+ }
+ tChar = target.charAt(tIndex++); // [tIndex++];
+ tOrder = latinOneCEs_[offset + tChar];
+ if (isSpecial(tOrder)) {
+ m_ContInfo_.index = tIndex;
+ tOrder = getLatinOneContraction(1, tOrder, target);
+ tIndex = m_ContInfo_.index;
+ }
+ }
+ if (endOfSource) {
+ return -1;
+ }
- if(sOrder == tOrder) {
- sOrder = 0; tOrder = 0;
- continue;
- } else {
- // see primary loop for comments on this
- if(((sOrder^tOrder)&0xFF000000)!=0) {
- if(sOrder >>> 8 < tOrder >>> 8) {
- return -1;
- } else {
- return 1;
- }
+ if (sOrder == tOrder) {
+ sOrder = 0;
+ tOrder = 0;
+ continue;
+ } else {
+ // see primary loop for comments on this
+ if (((sOrder ^ tOrder) & 0xFF000000) != 0) {
+ if (sOrder >>> 8 < tOrder >>> 8) {
+ return -1;
+ } else {
+ return 1;
+ }
+ }
+ sOrder <<= 8;
+ tOrder <<= 8;
+ }
+ }
+ } else { // French
+ if (haveContractions) { // if we have contractions, we have to bail out
+ // since we don't really know how to handle them here
+ return compareRegular(source, target, startOffset);
+ }
+ // For French, we go backwards
+ sIndex = sLen;
+ tIndex = tLen;
+ secFLoop: for (;;) {
+ while (sOrder == 0) {
+ if (sIndex == startOffset) {
+ endOfSource = true;
+ break;
+ }
+ sChar = source.charAt(--sIndex); // [--sIndex];
+ sOrder = latinOneCEs_[offset + sChar];
+ // don't even look for contractions
+ }
+
+ while (tOrder == 0) {
+ if (tIndex == startOffset) {
+ if (endOfSource) {
+ break secFLoop;
+ } else {
+ return 1;
+ }
+ }
+ tChar = target.charAt(--tIndex); // [--tIndex];
+ tOrder = latinOneCEs_[offset + tChar];
+ // don't even look for contractions
+ }
+ if (endOfSource) {
+ return -1;
+ }
+
+ if (sOrder == tOrder) {
+ sOrder = 0;
+ tOrder = 0;
+ continue;
+ } else {
+ // see the primary loop for comments
+ if (((sOrder ^ tOrder) & 0xFF000000) != 0) {
+ if (sOrder >>> 8 < tOrder >>> 8) {
+ return -1;
+ } else {
+ return 1;
+ }
+ }
+ sOrder <<= 8;
+ tOrder <<= 8;
+ }
}
- sOrder<<=8;
- tOrder<<=8;
- }
}
- } else { // French
- if(haveContractions) { // if we have contractions, we have to bail out
- // since we don't really know how to handle them here
- return compareRegular(source, target, startOffset);
- }
- // For French, we go backwards
- sIndex = sLen; tIndex = tLen;
- secFLoop:
- for(;;) {
- while(sOrder==0) {
- if(sIndex==startOffset) {
- endOfSource = true;
- break;
- }
- sChar=source.charAt(--sIndex); //[--sIndex];
- sOrder = latinOneCEs_[offset+sChar];
- // don't even look for contractions
- }
-
- while(tOrder==0) {
- if(tIndex==startOffset) {
- if(endOfSource) {
- break secFLoop;
- } else {
- return 1;
- }
- }
- tChar=target.charAt(--tIndex); //[--tIndex];
- tOrder = latinOneCEs_[offset+tChar];
- // don't even look for contractions
- }
- if(endOfSource) {
- return -1;
- }
-
- if(sOrder == tOrder) {
- sOrder = 0; tOrder = 0;
- continue;
- } else {
- // see the primary loop for comments
- if(((sOrder^tOrder)&0xFF000000)!=0) {
- if(sOrder >>> 8 < tOrder >>> 8) {
- return -1;
- } else {
- return 1;
- }
- }
- sOrder<<=8;
- tOrder<<=8;
- }
- }
- }
}
- if(strength >= TERTIARY) {
- // tertiary loop is the same as secondary (except no French)
- offset += latinOneTableLen_;
- //sIndex = 0; tIndex = 0;
- sIndex = startOffset; tIndex = startOffset;
- endOfSource = false;
- for(;;) {
- while(sOrder==0) {
- if(sIndex==sLen) {
- endOfSource = true;
- break;
- }
- sChar=source.charAt(sIndex++); //[sIndex++];
- sOrder = latinOneCEs_[offset+sChar];
- if(isSpecial(sOrder)) {
- m_ContInfo_.index = sIndex;
- sOrder = getLatinOneContraction(2, sOrder, source);
- sIndex = m_ContInfo_.index;
- }
- }
- while(tOrder==0) {
- if(tIndex==tLen) {
- if(endOfSource) {
- return 0; // if both strings are at the end, they are equal
- } else {
- return 1;
+ if (strength >= TERTIARY) {
+ // tertiary loop is the same as secondary (except no French)
+ offset += latinOneTableLen_;
+ // sIndex = 0; tIndex = 0;
+ sIndex = startOffset;
+ tIndex = startOffset;
+ endOfSource = false;
+ for (;;) {
+ while (sOrder == 0) {
+ if (sIndex == sLen) {
+ endOfSource = true;
+ break;
+ }
+ sChar = source.charAt(sIndex++); // [sIndex++];
+ sOrder = latinOneCEs_[offset + sChar];
+ if (isSpecial(sOrder)) {
+ m_ContInfo_.index = sIndex;
+ sOrder = getLatinOneContraction(2, sOrder, source);
+ sIndex = m_ContInfo_.index;
+ }
}
- }
- tChar=target.charAt(tIndex++); //[tIndex++];
- tOrder = latinOneCEs_[offset+tChar];
- if(isSpecial(tOrder)) {
- m_ContInfo_.index = tIndex;
- tOrder = getLatinOneContraction(2, tOrder, target);
- tIndex = m_ContInfo_.index;
- }
- }
- if(endOfSource) {
- return -1;
- }
- if(sOrder == tOrder) {
- sOrder = 0; tOrder = 0;
- continue;
- } else {
- if(((sOrder^tOrder)&0xff000000)!=0) {
- if(sOrder >>> 8 < tOrder >>> 8) {
- return -1;
- } else {
- return 1;
+ while (tOrder == 0) {
+ if (tIndex == tLen) {
+ if (endOfSource) {
+ return 0; // if both strings are at the end, they are equal
+ } else {
+ return 1;
+ }
+ }
+ tChar = target.charAt(tIndex++); // [tIndex++];
+ tOrder = latinOneCEs_[offset + tChar];
+ if (isSpecial(tOrder)) {
+ m_ContInfo_.index = tIndex;
+ tOrder = getLatinOneContraction(2, tOrder, target);
+ tIndex = m_ContInfo_.index;
+ }
+ }
+ if (endOfSource) {
+ return -1;
+ }
+ if (sOrder == tOrder) {
+ sOrder = 0;
+ tOrder = 0;
+ continue;
+ } else {
+ if (((sOrder ^ tOrder) & 0xff000000) != 0) {
+ if (sOrder >>> 8 < tOrder >>> 8) {
+ return -1;
+ } else {
+ return 1;
+ }
+ }
+ sOrder <<= 8;
+ tOrder <<= 8;
}
- }
- sOrder<<=8;
- tOrder<<=8;
}
- }
}
return 0;
}
- /**
+
+ /**
* Get the version of this collator object.
+ *
* @return the version object associated with this collator
* @stable ICU 2.8
*/
public VersionInfo getVersion() {
- /* RunTime version */
+ /* RunTime version */
int rtVersion = VersionInfo.UCOL_RUNTIME_VERSION.getMajor();
- /* Builder version*/
+ /* Builder version */
int bdVersion = m_version_.getMajor();
- /* Charset Version. Need to get the version from cnv files
- * makeconv should populate cnv files with version and
+ /*
+ * Charset Version. Need to get the version from cnv files; makeconv should populate cnv files with version and
* an api has to be provided in ucnv.h to obtain this version
*/
int csVersion = 0;
/* combine the version info */
- int cmbVersion = ((rtVersion<<11) | (bdVersion<<6) | (csVersion)) & 0xFFFF;
-
- /* Tailoring rules */
- return VersionInfo.getInstance(cmbVersion>>8,
- cmbVersion & 0xFF,
- m_version_.getMinor(),
- UCA_.m_UCA_version_.getMajor());
+ int cmbVersion = ((rtVersion << 11) | (bdVersion << 6) | (csVersion)) & 0xFFFF;
-// versionInfo[0] = (uint8_t)(cmbVersion>>8);
-// versionInfo[1] = (uint8_t)cmbVersion;
-// versionInfo[2] = coll->image->version[1];
-// versionInfo[3] = coll->UCA->image->UCAVersion[0];
+ /* Tailoring rules */
+ return VersionInfo.getInstance(cmbVersion >> 8, cmbVersion & 0xFF, m_version_.getMinor(),
+ UCA_.m_UCA_version_.getMajor());
+
+ // versionInfo[0] = (uint8_t)(cmbVersion>>8);
+ // versionInfo[1] = (uint8_t)cmbVersion;
+ // versionInfo[2] = coll->image->version[1];
+ // versionInfo[3] = coll->UCA->image->UCAVersion[0];
}
-
- /**
+
+ /**
* Get the UCA version of this collator object.
+ *
* @return the version object associated with this collator
* @stable ICU 2.8
*/