ICU-1897
builder completed but not fully tested. and documentation updates. X-SVN-Rev: 8914
This commit is contained in:
parent
4a6e11bcba
commit
1cef5c4d34
@ -6,53 +6,77 @@ import com.ibm.icu.impl.NormalizerImpl;
|
||||
import com.ibm.icu.impl.UCharacterProperty;
|
||||
|
||||
/**
|
||||
* <p>The <code>CollationElementIterator</code> class is used as an iterator
|
||||
* to walk through each character of an international string. Use the iterator
|
||||
* to return the ordering priority of the positioned character. The ordering
|
||||
* priority of a character, which we refer to as a key, defines how a
|
||||
* character is collated in the given collation object.</p>
|
||||
* <p>For example, consider the following in Spanish:
|
||||
* <p>
|
||||
* The <code>CollationElementIterator</code> object is an iterator created
|
||||
* by a RuleBasedCollator to walk through an international string. The return
|
||||
* result of each iteration is a 32 bit collation element that defines the
|
||||
* ordering priority of the next sequence of characters in the source string.
|
||||
* </p>
|
||||
* <p>For better illustration, consider the following in Spanish:
|
||||
* <blockquote>
|
||||
* <pre>
|
||||
* "ca" -> the first key is key('c') and second key is key('a').
|
||||
* "cha" -> the first key is key('ch') and second key is key('a').
|
||||
* "ca" -> the first collation element is collation_element('c') and second
|
||||
* collation element is collation_element('a').
|
||||
*
|
||||
* Since "ch" in Spanish sorts as one entity, the below example returns one
|
||||
* collation element for the 2 characters 'c' and 'h'
|
||||
*
|
||||
* "cha" -> the first collation element is collation_element('ch') and second
|
||||
* collation element is collation_element('a').
|
||||
* </pre>
|
||||
* </blockquote>
|
||||
* And in German,
|
||||
* <blockquote>
|
||||
* <pre>
|
||||
* "\u00e4b"-> the first key is key('a'), the second key is key('e'), and
|
||||
* the third key is key('b').
|
||||
* Since the character 'æ' is a composed character of 'a' and 'e', the
|
||||
* below example returns 2 collation elements for the single character
|
||||
* 'æ'
|
||||
*
|
||||
* "æb" -> the first collation element is collation_element('a'), the
|
||||
* second collation element is collation_element('e'), and the
|
||||
* third collation element is collation_element('b').
|
||||
* </pre>
|
||||
* </blockquote>
|
||||
* </p>
|
||||
* <p>The key of a character is an integer composed of primary order(short),
|
||||
* secondary order(byte), and tertiary order(byte). Java strictly defines
|
||||
* the size and signedness of its primitive data types. Therefore, the static
|
||||
* functions <code>primaryOrder</code>, <code>secondaryOrder</code>, and
|
||||
* <code>tertiaryOrder</code> return <code>int</code>, <code>short</code>,
|
||||
* and <code>short</code> respectively to ensure the correctness of the key
|
||||
* value.</p>
|
||||
* <p>
|
||||
* Example of the iterator usage,
|
||||
* For collation ordering comparison, the collation element results can not be
|
||||
* compared simply by using basic arithmetic operators, e.g. <, == or >;
|
||||
* further processing has to be done. Details can be found in the ICU
|
||||
* <a href="http://oss.software.ibm.com/icu/userguide/Collate_ServiceArchitecture.html">
|
||||
* user guide</a>. An example of using the CollationElementIterator for
|
||||
* collation ordering comparison is the class <a href=StringSearch.html>
|
||||
* com.ibm.icu.text.StringSearch</a>.
|
||||
* </p>
|
||||
* <p>
|
||||
* To construct a CollationElementIterator object, users would have to call the
|
||||
* factory method getCollationElementIterator() in a RuleBasedCollator object
|
||||
* that defines the sorting order that is desired.
|
||||
* </p>
|
||||
* <p>
|
||||
* Example:
|
||||
* <blockquote>
|
||||
* <pre>
|
||||
* String testString = "This is a test";
|
||||
* RuleBasedCollator ruleBasedCollator = (RuleBasedCollator)Collator.getInstance();
|
||||
* CollationElementIterator collationElementIterator = ruleBasedCollator.getCollationElementIterator(testString);
|
||||
* int primaryOrder = CollationElementIterator.primaryOrder(collationElementIterator.next());
|
||||
* RuleBasedCollator rbc = new RuleBasedCollator("&a<b");
|
||||
* CollationElementIterator collationElementIterator = rbc.getCollationElementIterator(testString);
|
||||
* int primaryOrder = CollationElementIterator.IGNORABLE;
|
||||
* while (primaryOrder != CollationElementIterator.NULLORDER) {
|
||||
* int order = collationElementIterator.next();
|
||||
* if (order != CollationElementIterator.IGNORABLE &&
|
||||
* order != CollationElementIterator.NULLORDER) {
|
||||
* // order is valid, not ignorable and we have not passed the end
|
||||
* // of the iteration, we do something
|
||||
* primaryOrder = CollationElementIterator.primaryOrder(order);
|
||||
* System.out.println("Next primary order 0x" + Integer.toHexString(primaryOrder));
|
||||
* }
|
||||
* }
|
||||
* </pre>
|
||||
* </blockquote>
|
||||
* </p>
|
||||
* <p>
|
||||
* <code>CollationElementIterator.next</code> returns the collation order
|
||||
* of the next character. A collation order consists of primary order,
|
||||
* secondary order and tertiary order. The data type of the collation
|
||||
* order is <strong>int</strong>. The first 16 bits of a collation order
|
||||
* is its primary order; the next 8 bits is the secondary order and the
|
||||
* last 8 bits is the tertiary order.</p>
|
||||
* @see Collator
|
||||
* @see RuleBasedCollator
|
||||
* @see Collator
|
||||
* @see RuleBasedCollator
|
||||
* @see StringSearch
|
||||
* @author Syn Wee Quek
|
||||
* @since release 2.2, April 18 2002
|
||||
* @draft 2.2
|
||||
@ -62,12 +86,22 @@ public final class CollationElementIterator
|
||||
// public data members --------------------------------------------------
|
||||
|
||||
/**
|
||||
* Null order which indicates the end of string is reached
|
||||
* <p>This constant is returned by the iterator in the methods next() and
|
||||
* previous() when the end or the beginning of the source string has been
|
||||
* reached, and there are no more valid collation elements to return.</p>
|
||||
* <p>See class documentation for an example of use.</p>
|
||||
* @draft 2.2
|
||||
* @see #next
|
||||
* @see #previous
|
||||
*/
|
||||
public final static int NULLORDER = 0xffffffff;
|
||||
/**
|
||||
* Ignorable collation element order.
|
||||
* <p>This constant is returned by the iterator in the methods next() and
|
||||
* previous() when a collation element result is to be ignored.</p>
|
||||
* <p>See class documentation for an example of use.</p>
|
||||
* @draft 2.2
|
||||
* @see #next
|
||||
* @see #previous
|
||||
*/
|
||||
public static final int IGNORABLE = 0;
|
||||
|
||||
@ -76,24 +110,25 @@ public final class CollationElementIterator
|
||||
// public getters -------------------------------------------------------
|
||||
|
||||
/**
|
||||
* <p>Returns the character offset in the original text corresponding to
|
||||
* the next collation element. (That is, getOffset() returns the position
|
||||
* in the text corresponding to the collation element that will be
|
||||
* returned by the next call to next().) This value could be either
|
||||
* <p>Returns the character offset in the source string corresponding to
|
||||
* the next collation element. i.e. getOffset() returns the position
|
||||
* in source string corresponding to the collation element that will be
|
||||
* returned by the next call to next(). This value could be either
|
||||
* <ul>
|
||||
* <li>index of the <b>first</b> character corresponding to the next
|
||||
* <li> Index of the <b>first</b> character corresponding to the next
|
||||
* collation element. This means that if <code>setOffset(offset)</code>
|
||||
* sets the index in the middle of a contraction, <code>getOffset()</code>
|
||||
* returns the index of the first character in the contraction, which
|
||||
* may not be equals to offset.
|
||||
* <li>if normalization is on, <code>getOffset()</code> may return the
|
||||
* may not be equal to the original offset that was set. Hence calling
|
||||
* getOffset() immediately after setOffset(offset) does not guarantee that
|
||||
* the original offset set will be returned.
|
||||
* <li> If normalization is on, <code>getOffset()</code> may return the
|
||||
* index of the <b>immediate</b> subsequent character, or composite
|
||||
* character with the first character, having a combining class of 0.
|
||||
* <li> the length of the source string if iteration has reached the end.
|
||||
* </ul>
|
||||
* </p>
|
||||
* <p>Note calling getOffset() immediately after setOffset(offset) may not
|
||||
* return the value offset.</p>
|
||||
* @return The character offset in the original text corresponding to the
|
||||
* @return The character offset in the source string corresponding to the
|
||||
* collation element that will be returned by the next call to
|
||||
* next().
|
||||
* @draft 2.2
|
||||
@ -111,8 +146,11 @@ public final class CollationElementIterator
|
||||
|
||||
|
||||
/**
|
||||
* Return the maximum length of any expansion sequences that end with the
|
||||
* specified collation element.
|
||||
* <p>
|
||||
* Returns the maximum length of any expansion sequence that ends with
|
||||
* the argument collation element ce. If there is no expansion with the
|
||||
* argument ce as the last element, 1 is returned.
|
||||
* </p>
|
||||
* @param ce a collation element returned by previous() or next().
|
||||
* @return the maximum length of any expansion sequences ending
|
||||
* with the specified collation element.
|
||||
@ -122,9 +160,11 @@ public final class CollationElementIterator
|
||||
{
|
||||
int start = 0;
|
||||
int limit = m_collator_.m_expansionEndCE_.length;
|
||||
long unsignedce = ce & 0xFFFFFFFFl;
|
||||
while (start < limit - 1) {
|
||||
int mid = start + ((limit - start) >> 1);
|
||||
if (ce <= m_collator_.m_expansionEndCE_[mid]) {
|
||||
long midce = m_collator_.m_expansionEndCE_[mid] & 0xFFFFFFFFl;
|
||||
if (unsignedce <= midce) {
|
||||
limit = mid;
|
||||
}
|
||||
else {
|
||||
@ -135,7 +175,8 @@ public final class CollationElementIterator
|
||||
if (m_collator_.m_expansionEndCE_[start] == ce) {
|
||||
result = m_collator_.m_expansionEndCEMaxSize_[start];
|
||||
}
|
||||
else if (m_collator_.m_expansionEndCE_[limit] == ce) {
|
||||
else if (limit < m_collator_.m_expansionEndCE_.length &&
|
||||
m_collator_.m_expansionEndCE_[limit] == ce) {
|
||||
result = m_collator_.m_expansionEndCEMaxSize_[limit];
|
||||
}
|
||||
else if ((ce & 0xFFFF) == 0x00C0) {
|
||||
@ -147,34 +188,49 @@ public final class CollationElementIterator
|
||||
// public other methods -------------------------------------------------
|
||||
|
||||
/**
|
||||
* <p>Resets the cursor to the beginning of the string. The next call
|
||||
* to next() will return the first collation element in the string.</p>
|
||||
* <p>
|
||||
* Resets the cursor to the beginning of the string. The next call
|
||||
* to next() and previous() will return the first and last collation element
|
||||
* in the string respectively.
|
||||
* </p>
|
||||
* <p>
|
||||
* If the RuleBasedCollator used in this iterator has its attributes
|
||||
* changed, calling reset() will reinitialize the iterator to use the new
|
||||
* RuleBasedCollator attributes.
|
||||
* </p>
|
||||
* @draft 2.2
|
||||
*/
|
||||
public synchronized void reset()
|
||||
public void reset()
|
||||
{
|
||||
m_source_.setIndex(0);
|
||||
m_source_.setIndex(m_source_.getBeginIndex());
|
||||
updateInternalState();
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Get the next collation element in the string.</p>
|
||||
* <p>This iterator iterates over a sequence of collation elements that
|
||||
* were built from the string. Because there isn't necessarily a
|
||||
* one-to-one mapping from characters to collation elements, this doesn't
|
||||
* mean the same thing as "return the collation element [or ordering
|
||||
* priority] of the next character in the string".</p>
|
||||
* <p>This function returns the collation element that the iterator is
|
||||
* <p>
|
||||
* Get the next collation element in the source string.
|
||||
* </p>
|
||||
* <p>
|
||||
* This iterator iterates over a sequence of collation elements that were
|
||||
* built from the string. Because there isn't necessarily a one-to-one
|
||||
* mapping from characters to collation elements, this doesn't mean the
|
||||
* same thing as "return the collation element [or ordering priority] of
|
||||
* the next character in the string".
|
||||
* </p>
|
||||
* <p>
|
||||
* This function returns the collation element that the iterator is
|
||||
* currently pointing to and then updates the internal pointer to point to
|
||||
* the next element. previous() updates the pointer first and then
|
||||
* returns the element. This means that when you change direction while
|
||||
* iterating (i.e., call next() and then call previous(), or call
|
||||
* previous() and then call next()), you'll get back the same element
|
||||
* twice.</p>
|
||||
* @return the next collation element
|
||||
* twice.
|
||||
* </p>
|
||||
* @return the next collation element or NULLORDER if the end of the
|
||||
* iteration has been reached.
|
||||
* @draft 2.2
|
||||
*/
|
||||
public synchronized int next()
|
||||
public int next()
|
||||
{
|
||||
m_isForwards_ = true;
|
||||
if (m_CEBufferSize_ > 0) {
|
||||
@ -230,24 +286,30 @@ public final class CollationElementIterator
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Get the previous collation element in the string.</p>
|
||||
* <p>This iterator iterates over a sequence of collation elements that
|
||||
* <p>
|
||||
* Get the previous collation element in the source string.
|
||||
* </p>
|
||||
* <p>
|
||||
* This iterator iterates over a sequence of collation elements that
|
||||
* were built from the string. Because there isn't necessarily a
|
||||
* one-to-one mapping from characters to collation elements, this doesn't
|
||||
* mean the same thing as "return the collation element [or ordering
|
||||
* priority] of the previous character in the string".</p>
|
||||
* <p>This function updates the iterator's internal pointer to point to
|
||||
* priority] of the previous character in the string".
|
||||
* </p>
|
||||
* <p>
|
||||
* This function updates the iterator's internal pointer to point to
|
||||
* the collation element preceding the one it's currently pointing to and
|
||||
* then returns that element, while next() returns the current element and
|
||||
* then updates the pointer. This means that when you change direction
|
||||
* while iterating (i.e., call next() and then call previous(), or call
|
||||
* previous() and then call next()), you'll get back the same element
|
||||
* twice.</p>
|
||||
* twice.
|
||||
* </p>
|
||||
* @return the previous collation element, or NULLORDER when the start of
|
||||
* the iteration has been reached.
|
||||
* the iteration has been reached.
|
||||
* @draft 2.2
|
||||
*/
|
||||
public synchronized int previous()
|
||||
public int previous()
|
||||
{
|
||||
if (m_source_.getIndex() <= 0 && m_isForwards_) {
|
||||
// if iterator is new or reset, we can immediately perform backwards
|
||||
@ -317,50 +379,66 @@ public final class CollationElementIterator
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the primary strength of a collation element.
|
||||
* Return the primary order of a collation element ce.
|
||||
* i.e. the first 16 bits of the argument ce.
|
||||
* @param ce the collation element
|
||||
* @return the element's primary strength
|
||||
* @return the element's 16 bits primary order.
|
||||
* @draft 2.2
|
||||
*/
|
||||
public final static int primaryOrder(int ce)
|
||||
{
|
||||
return (ce & RuleBasedCollator.CE_PRIMARY_MASK_) >> CE_PRIMARY_SHIFT_;
|
||||
return (ce & RuleBasedCollator.CE_PRIMARY_MASK_)
|
||||
>>> RuleBasedCollator.CE_PRIMARY_SHIFT_;
|
||||
}
|
||||
/**
|
||||
* Return the secondary strength of a collation element.
|
||||
* Return the secondary order of a collation element ce.
|
||||
* i.e. the 8 bits that follow the 16 primary order bits in the argument ce.
|
||||
* @param ce the collation element
|
||||
* @return the element's secondary strength
|
||||
* @return the element's 8 bits secondary order
|
||||
* @draft 2.2
|
||||
*/
|
||||
public final static short secondaryOrder(int ce)
|
||||
public final static int secondaryOrder(int ce)
|
||||
{
|
||||
return (short)((ce & RuleBasedCollator.CE_SECONDARY_MASK_)
|
||||
>> CE_SECONDARY_SHIFT_);
|
||||
return (ce & RuleBasedCollator.CE_SECONDARY_MASK_)
|
||||
>> RuleBasedCollator.CE_SECONDARY_SHIFT_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the tertiary strength of a collation element.
|
||||
* @param colelem the collation element
|
||||
* @return the element's tertiary strength
|
||||
* Return the tertiary order of a collation element ce. i.e. the last
|
||||
* 8 bits in the argument ce.
|
||||
* @param ce the collation element
|
||||
* @return the element's 8 bits tertiary order
|
||||
* @draft 2.2
|
||||
*/
|
||||
public final static short tertiaryOrder(int ce)
|
||||
public final static int tertiaryOrder(int ce)
|
||||
{
|
||||
return (short)(ce & RuleBasedCollator.CE_TERTIARY_MASK_);
|
||||
return ce & RuleBasedCollator.CE_TERTIARY_MASK_;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Sets the iterator to point to the collation element corresponding to
|
||||
* the specified character (the parameter is a CHARACTER offset in the
|
||||
* original string, not an offset into its corresponding sequence of
|
||||
* collation elements). The value returned by the next call to next()
|
||||
* will be the collation element corresponding to the specified position
|
||||
* in the text. If that position is in the middle of a contracting
|
||||
* character sequence, the result of the next call to next() is the
|
||||
* collation element for that sequence. This means that getOffset()
|
||||
* is not guaranteed to return the same value as was passed to a preceding
|
||||
* call to setOffset().</p>
|
||||
* @param offset new character offset into the original text to set.
|
||||
* <p>
|
||||
* Sets the iterator to point to the collation element corresponding to
|
||||
* the specified character argument offset. The value returned by the next
|
||||
* call to next() will be the collation element corresponding to the
|
||||
* characters at argument offset.
|
||||
* </p>
|
||||
* <p>
|
||||
* If argument offset is in the middle of a contracting character sequence,
|
||||
* the iterator is adjusted to the start of the contracting sequence. This
|
||||
* means that getOffset() is not guaranteed to return the same value as
|
||||
* the argument offset.
|
||||
* </p>
|
||||
* <p>
|
||||
* If the decomposition mode is on and argument offset is in the middle of
|
||||
* a decomposable range of source text, the iterator may not render a
|
||||
* correct result for
|
||||
* the next forwards or backwards iteration. User has to ensure that the
|
||||
* argument offset does not fall in the middle of a decomposable range in
|
||||
* source text.
|
||||
* </p>
|
||||
* @param offset character offset into the original source string to
|
||||
* set. Note this argument is not an offset into the corresponding
|
||||
* sequence of collation elements
|
||||
* @draft 2.2
|
||||
*/
|
||||
public void setOffset(int offset)
|
||||
@ -388,7 +466,7 @@ public final class CollationElementIterator
|
||||
}
|
||||
updateInternalState();
|
||||
int prevoffset = 0;
|
||||
while (m_source_.getIndex() < offset) {
|
||||
while (m_source_.getIndex() <= offset) {
|
||||
prevoffset = m_source_.getIndex();
|
||||
next();
|
||||
}
|
||||
@ -399,59 +477,36 @@ public final class CollationElementIterator
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Set a new string over which to iterate.</p>
|
||||
* <p>Iteration will start from the start of source.</p>
|
||||
* @param source the new source text.
|
||||
* <p>
|
||||
* Sets a new source string for iteration and restarts the iteration to
|
||||
* start from the beginning of the argument source.
|
||||
* </p>
|
||||
* @param source the new source string for iteration.
|
||||
* @draft 2.2
|
||||
*/
|
||||
public synchronized void setText(String source)
|
||||
public void setText(String source)
|
||||
{
|
||||
m_source_ = new StringCharacterIterator(source);
|
||||
updateInternalState();
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Set a new string iterator over which to iterate.</p>
|
||||
* <p>Iteration will start from the start of source.</p>
|
||||
* @param source the new source text.
|
||||
* <p>
|
||||
* Sets a new source string iterator for iteration and restarts the
|
||||
* iteration to start from the beginning of the argument source.
|
||||
* </p>
|
||||
* @param source the new source string iterator for iteration.
|
||||
* @draft 2.2
|
||||
*/
|
||||
public synchronized void setText(CharacterIterator source)
|
||||
public void setText(CharacterIterator source)
|
||||
{
|
||||
m_source_ = source;
|
||||
m_source_.setIndex(0);
|
||||
m_source_.setIndex(m_source_.getBeginIndex());
|
||||
updateInternalState();
|
||||
}
|
||||
|
||||
// public miscellaneous methods -----------------------------------------
|
||||
|
||||
// protected data members -----------------------------------------------
|
||||
|
||||
/**
|
||||
* true if current codepoint was Hiragana
|
||||
*/
|
||||
protected boolean m_isCodePointHiragana_;
|
||||
/**
|
||||
* Position in the original string that starts with a non-FCD sequence
|
||||
*/
|
||||
protected int m_FCDStart_;
|
||||
/**
|
||||
* This is the CE from CEs buffer that should be returned.
|
||||
* Initial value is 0.
|
||||
* Forwards iteration will end with m_CEBufferOffset_ == m_CEBufferSize_,
|
||||
* backwards will end with m_CEBufferOffset_ == 0.
|
||||
* The next/previous after we reach the end/beginning of the m_CEBuffer_
|
||||
* will cause this value to be reset to 0.
|
||||
*/
|
||||
protected int m_CEBufferOffset_;
|
||||
/**
|
||||
* This is the position to which we have stored processed CEs.
|
||||
* Initial value is 0.
|
||||
* The next/previous after we reach the end/beginning of the m_CEBuffer_
|
||||
* will cause this value to be reset to 0.
|
||||
*/
|
||||
protected int m_CEBufferSize_;
|
||||
|
||||
// protected constructors -----------------------------------------------
|
||||
|
||||
/**
|
||||
@ -493,29 +548,95 @@ public final class CollationElementIterator
|
||||
updateInternalState();
|
||||
}
|
||||
|
||||
// protected methods ----------------------------------------------------
|
||||
// package private data members -----------------------------------------
|
||||
|
||||
/**
|
||||
* true if current codepoint was Hiragana
|
||||
*/
|
||||
boolean m_isCodePointHiragana_;
|
||||
/**
|
||||
* Position in the original string that starts with a non-FCD sequence
|
||||
*/
|
||||
int m_FCDStart_;
|
||||
/**
|
||||
* This is the CE from CEs buffer that should be returned.
|
||||
* Initial value is 0.
|
||||
* Forwards iteration will end with m_CEBufferOffset_ == m_CEBufferSize_,
|
||||
* backwards will end with m_CEBufferOffset_ == 0.
|
||||
* The next/previous after we reach the end/beginning of the m_CEBuffer_
|
||||
* will cause this value to be reset to 0.
|
||||
*/
|
||||
int m_CEBufferOffset_;
|
||||
/**
|
||||
* This is the position to which we have stored processed CEs.
|
||||
* Initial value is 0.
|
||||
* The next/previous after we reach the end/beginning of the m_CEBuffer_
|
||||
* will cause this value to be reset to 0.
|
||||
*/
|
||||
int m_CEBufferSize_;
|
||||
|
||||
/**
|
||||
* Checks if iterator is in the buffer zone
|
||||
* @return true if iterator is in buffer zone, false otherwise
|
||||
*/
|
||||
protected boolean isInBuffer()
|
||||
{
|
||||
return m_bufferOffset_ != -1;
|
||||
}
|
||||
// package private methods ----------------------------------------------
|
||||
|
||||
/**
|
||||
* Sets the collator used.
|
||||
* Internal use, all data members will be reset to the default values
|
||||
* @param collator to set
|
||||
*/
|
||||
protected void setCollator(RuleBasedCollator collator)
|
||||
void setCollator(RuleBasedCollator collator)
|
||||
{
|
||||
m_collator_ = collator;
|
||||
updateInternalState();
|
||||
}
|
||||
|
||||
// private data members -------------------------------------------------
|
||||
/**
|
||||
* <p>Sets the iterator to point to the collation element corresponding to
|
||||
* the specified character (the parameter is a CHARACTER offset in the
|
||||
* original string, not an offset into its corresponding sequence of
|
||||
* collation elements). The value returned by the next call to next()
|
||||
* will be the collation element corresponding to the specified position
|
||||
* in the text. Unlike the public method setOffset(int), this method does
|
||||
* not try to readjust the offset to the start of a contracting sequence.
|
||||
* getOffset() is guaranteed to return the same value as was passed to a
|
||||
* preceding call to setOffset().</p>
|
||||
* @param offset new character offset into the original text to set.
|
||||
* @draft 2.2
|
||||
*/
|
||||
void setExactOffset(int offset)
|
||||
{
|
||||
m_source_.setIndex(offset);
|
||||
updateInternalState();
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if iterator is in the buffer zone
|
||||
* @return true if iterator is in buffer zone, false otherwise
|
||||
*/
|
||||
boolean isInBuffer()
|
||||
{
|
||||
return m_bufferOffset_ != -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine if a character is a Thai or Lao prevowel, which sorts after its base
|
||||
* consonant.
|
||||
* @param ch character to test
|
||||
* @return true if ch is a Thai prevowel, false otherwise
|
||||
*/
|
||||
static final boolean isThaiPreVowel(char ch)
|
||||
{
|
||||
return (ch >= 0xe40 && ch <= 0xe44) || (ch >= 0xec0 && ch <= 0xec4);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine if a character is a Thai base consonant, which sorts before
|
||||
* its prevowel
|
||||
* @param ch character to test
|
||||
* @return true if ch is a Thai base consonant, false otherwise
|
||||
*/
|
||||
static final boolean isThaiBaseConsonant(char ch)
|
||||
{
|
||||
return ch >= 0xe01 && ch <= 0xe2e;
|
||||
}
|
||||
|
||||
// private inner class --------------------------------------------------
|
||||
|
||||
@ -675,8 +796,6 @@ public final class CollationElementIterator
|
||||
private static final int CE_LONG_PRIMARY_TAG_ = 12;
|
||||
private static final int CE_CE_TAGS_COUNT = 13;
|
||||
private static final int CE_BYTE_COMMON_ = 0x05;
|
||||
private static final int CE_PRIMARY_SHIFT_ = 16;
|
||||
private static final int CE_SECONDARY_SHIFT_ = 8;
|
||||
|
||||
// end special ce values and tags ---------------------------------------
|
||||
|
||||
@ -773,21 +892,19 @@ public final class CollationElementIterator
|
||||
* Source offsets points to the current processing character.
|
||||
* </p>
|
||||
*/
|
||||
private void normalize()
|
||||
private void normalize()
|
||||
{
|
||||
/* synwee todo normalize to 1 before fcd
|
||||
try {
|
||||
decompose(m_buffer_, m_source_, m_FCDStart_, m_FCDLimit_,
|
||||
m_collator_.m_decomposition_);
|
||||
}
|
||||
catch (ArrayOutOfBoundsException e) {
|
||||
// increase the size of the buffer
|
||||
m_buffer_ = new char[m_buffer_.length << 1];
|
||||
decompose(m_buffer_, m_source_, m_FCDStart_, m_FCDLimit_,
|
||||
m_collator_.m_decomposition_);
|
||||
}
|
||||
*/
|
||||
m_bufferOffset_ = 0;
|
||||
int size = m_FCDLimit_ - m_FCDStart_;
|
||||
m_buffer_.delete(0, m_buffer_.length());
|
||||
m_source_.setIndex(m_FCDStart_);
|
||||
for (int i = 0; i < size; i ++) {
|
||||
m_buffer_.append(m_source_.current());
|
||||
m_source_.next();
|
||||
}
|
||||
String decomp = Normalizer.decompose(m_buffer_.toString(), false);
|
||||
m_buffer_.delete(0, m_buffer_.length());
|
||||
m_buffer_.append(decomp);
|
||||
m_bufferOffset_ = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -811,24 +928,22 @@ public final class CollationElementIterator
|
||||
{
|
||||
boolean result = true;
|
||||
|
||||
// srcP = collationSource->pos-1;
|
||||
|
||||
// Get the trailing combining class of the current character.
|
||||
// Get the trailing combining class of the current character.
|
||||
// If it's zero, we are OK.
|
||||
m_FCDStart_ = offset;
|
||||
m_source_.setIndex(offset);
|
||||
// trie access
|
||||
char fcd = 0; // synwee todo: unorm_getFCD16(ch);
|
||||
char fcd = NormalizerImpl.getFCD16(ch);
|
||||
if (fcd != 0 && UTF16.isLeadSurrogate(ch)) {
|
||||
ch = m_source_.next(); // CharacterIterator.DONE has 0 fcd
|
||||
if (UTF16.isTrailSurrogate(ch)) {
|
||||
fcd = 0xFFFF; // unorm_getFCD16FromSurrogatePair(fcd, ch);
|
||||
fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, ch);
|
||||
} else {
|
||||
fcd = 0;
|
||||
}
|
||||
}
|
||||
|
||||
byte prevTrailCC = (byte)(fcd & LAST_BYTE_MASK_);
|
||||
int prevTrailCC = fcd & LAST_BYTE_MASK_;
|
||||
|
||||
if (prevTrailCC != 0) {
|
||||
// The current char has a non-zero trailing CC. Scan forward until
|
||||
@ -839,16 +954,16 @@ public final class CollationElementIterator
|
||||
break;
|
||||
}
|
||||
// trie access
|
||||
fcd = 0; // unorm_getFCD16(ch);
|
||||
fcd = NormalizerImpl.getFCD16(ch);
|
||||
if (fcd != 0 && UTF16.isLeadSurrogate(ch)) {
|
||||
ch = m_source_.next();
|
||||
if (UTF16.isTrailSurrogate(ch)) {
|
||||
fcd = 0xFFFF; // unorm_getFCD16FromSurrogatePair(fcd, ch);
|
||||
fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, ch);
|
||||
} else {
|
||||
fcd = 0;
|
||||
}
|
||||
}
|
||||
byte leadCC = (byte)(fcd >> SECOND_LAST_BYTE_SHIFT_);
|
||||
int leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_;
|
||||
if (leadCC == 0) {
|
||||
// this is a base character, we stop the FCD checks
|
||||
break;
|
||||
@ -858,12 +973,12 @@ public final class CollationElementIterator
|
||||
result = false;
|
||||
}
|
||||
|
||||
prevTrailCC = (byte)(fcd & LAST_BYTE_MASK_);
|
||||
prevTrailCC = fcd & LAST_BYTE_MASK_;
|
||||
}
|
||||
}
|
||||
m_FCDLimit_ = m_source_.getIndex();
|
||||
m_source_.setIndex(m_FCDStart_);
|
||||
m_source_.next();
|
||||
m_FCDLimit_ = m_source_.getIndex();
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -885,8 +1000,7 @@ public final class CollationElementIterator
|
||||
}
|
||||
else {
|
||||
// we are in the buffer, buffer offset will never be 0 here
|
||||
result = m_buffer_.charAt(m_bufferOffset_ ++);
|
||||
if (result == 0) {
|
||||
if (m_bufferOffset_ >= m_buffer_.length()) {
|
||||
// Null marked end of buffer, revert to the source string and
|
||||
// loop back to top to try again to get a character.
|
||||
m_source_.setIndex(m_FCDLimit_);
|
||||
@ -894,10 +1008,10 @@ public final class CollationElementIterator
|
||||
m_buffer_.delete(0, m_buffer_.length());
|
||||
return nextChar();
|
||||
}
|
||||
return result;
|
||||
return m_buffer_.charAt(m_bufferOffset_ ++);
|
||||
}
|
||||
|
||||
if (m_collator_.m_decomposition_ == Collator.NO_DECOMPOSITION
|
||||
if (m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
|
||||
|| m_bufferOffset_ != -1 || m_FCDLimit_ > startoffset
|
||||
// skip the fcd checks
|
||||
|| result < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_
|
||||
@ -934,20 +1048,10 @@ public final class CollationElementIterator
|
||||
* the buffer.
|
||||
* Source offsets points to the current processing character.</p>
|
||||
*/
|
||||
public void normalizeBackwards()
|
||||
private void normalizeBackwards()
|
||||
{
|
||||
int start = m_FCDStart_;
|
||||
int size = 0;
|
||||
/* synwee todo normalize including fcd
|
||||
try {
|
||||
size = decompose(m_buffer_, m_source_, start, m_FCDLimit_);
|
||||
}
|
||||
catch (ArrayOutOfBoundsException .) {
|
||||
m_buffer_ = new char[m_buffer_.length << 1];
|
||||
size = decompose(m_buffer_, m_source_, start, m_FCDLimit);
|
||||
}
|
||||
*/
|
||||
m_bufferOffset_ = size - 1;
|
||||
normalize();
|
||||
m_bufferOffset_ = m_buffer_.length();
|
||||
}
|
||||
|
||||
/**
|
||||
@ -972,18 +1076,20 @@ public final class CollationElementIterator
|
||||
{
|
||||
boolean result = true;
|
||||
char fcd = 0;
|
||||
m_FCDLimit_ = offset;
|
||||
m_FCDLimit_ = offset + 1;
|
||||
m_source_.setIndex(offset);
|
||||
if (!UTF16.isSurrogate(ch)) {
|
||||
fcd = 0; // synwee todo unorm_getFCD16(fcdTrieIndex, c);
|
||||
fcd = NormalizerImpl.getFCD16(ch);
|
||||
}
|
||||
else if (UTF16.isTrailSurrogate(ch) && m_FCDLimit_ > 0) {
|
||||
// note trail surrogate characters gets 0 fcd
|
||||
char trailch = ch;
|
||||
ch = m_source_.previous();
|
||||
if (UTF16.isLeadSurrogate(ch)) {
|
||||
fcd = 0; // unorm_getFCD16(fcdTrieIndex, c2);
|
||||
fcd = NormalizerImpl.getFCD16(ch);
|
||||
if (fcd != 0) {
|
||||
fcd = 0; // unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
|
||||
fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd,
|
||||
trailch);
|
||||
}
|
||||
}
|
||||
else {
|
||||
@ -991,44 +1097,47 @@ public final class CollationElementIterator
|
||||
}
|
||||
}
|
||||
|
||||
byte leadCC = (byte)(fcd >> SECOND_LAST_BYTE_SHIFT_);
|
||||
if (leadCC != 0) {
|
||||
// The current char has a non-zero leading combining class.
|
||||
// Scan backward until we find a char with a trailing cc of zero.
|
||||
while (true) {
|
||||
if (m_source_.getIndex() == 0) {
|
||||
break;
|
||||
}
|
||||
int leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_;
|
||||
// The current char has a non-zero leading combining class.
|
||||
// Scan backward until we find a char with a trailing cc of zero.
|
||||
|
||||
while (leadCC != 0) {
|
||||
offset = m_source_.getIndex();
|
||||
if (offset == 0) {
|
||||
break;
|
||||
}
|
||||
ch = m_source_.previous();
|
||||
if (!UTF16.isSurrogate(ch)) {
|
||||
fcd = NormalizerImpl.getFCD16(ch);
|
||||
}
|
||||
else if (UTF16.isTrailSurrogate(ch) && m_source_.getIndex() > 0) {
|
||||
char trail = ch;
|
||||
ch = m_source_.previous();
|
||||
if (!UTF16.isSurrogate(ch)) {
|
||||
fcd = 0; //unorm_getFCD16(fcdTrieIndex, c);
|
||||
}
|
||||
else {
|
||||
if (UTF16.isTrailSurrogate(ch) && m_source_.getIndex() > 0)
|
||||
{
|
||||
ch = m_source_.previous();
|
||||
if (UTF16.isLeadSurrogate(ch)) {
|
||||
fcd = 0; // unorm_getFCD16(fcdTrieIndex, c2);
|
||||
}
|
||||
if (fcd != 0) {
|
||||
fcd = 0; // unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
|
||||
}
|
||||
} else {
|
||||
fcd = 0; // unpaired surrogate
|
||||
}
|
||||
byte prevTrailCC = (byte)(fcd & LAST_BYTE_MASK_);
|
||||
if (prevTrailCC == 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (leadCC < prevTrailCC) {
|
||||
result = false;
|
||||
}
|
||||
leadCC = (byte)(fcd >> SECOND_LAST_BYTE_SHIFT_);
|
||||
}
|
||||
}
|
||||
if (UTF16.isLeadSurrogate(ch)) {
|
||||
fcd = NormalizerImpl.getFCD16(ch);
|
||||
}
|
||||
if (fcd != 0) {
|
||||
fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, trail);
|
||||
}
|
||||
}
|
||||
else {
|
||||
fcd = 0; // unpaired surrogate
|
||||
}
|
||||
int prevTrailCC = fcd & LAST_BYTE_MASK_;
|
||||
if (leadCC < prevTrailCC) {
|
||||
result = false;
|
||||
}
|
||||
leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_;
|
||||
}
|
||||
m_FCDStart_ = m_source_.getIndex(); // character with 0 lead/trail fcd
|
||||
|
||||
// storing character with 0 lead fcd or the 1st accent with a base
|
||||
// character before it
|
||||
if (fcd == 0) {
|
||||
m_FCDStart_ = offset;
|
||||
}
|
||||
else {
|
||||
m_FCDStart_ = m_source_.getIndex();
|
||||
}
|
||||
m_source_.setIndex(m_FCDLimit_);
|
||||
return result;
|
||||
}
|
||||
@ -1062,7 +1171,7 @@ public final class CollationElementIterator
|
||||
char result = m_source_.previous();
|
||||
int startoffset = m_source_.getIndex();
|
||||
if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_
|
||||
|| m_collator_.m_decomposition_ == Collator.NO_DECOMPOSITION
|
||||
|| m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
|
||||
|| m_FCDStart_ <= startoffset || m_source_.getIndex() == 0) {
|
||||
return result;
|
||||
}
|
||||
@ -1073,7 +1182,7 @@ public final class CollationElementIterator
|
||||
return result;
|
||||
}
|
||||
// Need a more complete FCD check and possible normalization.
|
||||
if (!FCDCheckBackwards(ch, startoffset)) {
|
||||
if (!FCDCheckBackwards(result, startoffset)) {
|
||||
normalizeBackwards();
|
||||
m_bufferOffset_ --;
|
||||
result = m_buffer_.charAt(m_bufferOffset_);
|
||||
@ -1085,52 +1194,17 @@ public final class CollationElementIterator
|
||||
* Determines if it is at the start of source iteration
|
||||
* @return true if iterator at the start, false otherwise
|
||||
*/
|
||||
private boolean isBackwardsStart()
|
||||
private final boolean isBackwardsStart()
|
||||
{
|
||||
return (m_bufferOffset_ < 0 && m_source_.getIndex() == 0)
|
||||
|| (m_bufferOffset_ == 0 && m_FCDStart_ <= 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine if a character is a Thai vowel, which sorts after its base
|
||||
* consonant.
|
||||
* @param ch character to test
|
||||
* @return true if ch is a Thai prevowel, false otherwise
|
||||
*/
|
||||
private boolean isThaiPreVowel(char ch)
|
||||
{
|
||||
return (ch >= 0xe40 && ch <= 0xe44) || (ch >= 0xec0 && ch <= 0xec4);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine if a character is a Thai base consonant, which sorts before
|
||||
* its prevowel
|
||||
* @param ch character to test
|
||||
* @return true if ch is a Thai base consonant, false otherwise
|
||||
*/
|
||||
private boolean isThaiBaseConsonant(char ch)
|
||||
{
|
||||
return ch >= 0xe01 && ch <= 0xe2e;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Determine if a character is a Jamo
|
||||
* @param ch character to test
|
||||
* @return true if ch is a Jamo, false otherwise
|
||||
*/
|
||||
private boolean isJamo(char ch)
|
||||
{
|
||||
return (ch - 0x1100 <= 0x1112 - 0x1100)
|
||||
|| (ch - 0x1161 <= 0x1175 - 0x1161)
|
||||
|| (ch - 0x11A8 <= 0x11C2 - 0x11A8);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if iterator is at the end of its source string.
|
||||
* @return true if it is at the end, false otherwise
|
||||
*/
|
||||
private boolean isEnd()
|
||||
private final boolean isEnd()
|
||||
{
|
||||
if (m_bufferOffset_ >= 0) {
|
||||
if (m_bufferOffset_ != m_buffer_.length()) {
|
||||
@ -1155,7 +1229,8 @@ public final class CollationElementIterator
|
||||
* @param trail character
|
||||
* @return next CE for the surrogate characters
|
||||
*/
|
||||
private int nextSurrogate(RuleBasedCollator collator, int ce, char trail)
|
||||
private final int nextSurrogate(RuleBasedCollator collator, int ce,
|
||||
char trail)
|
||||
{
|
||||
if (!UTF16.isTrailSurrogate(trail)) {
|
||||
updateInternalState(m_backup_);
|
||||
@ -1188,7 +1263,7 @@ public final class CollationElementIterator
|
||||
* @param ch current character
|
||||
* @return next CE for Thai characters
|
||||
*/
|
||||
private int nextThai(RuleBasedCollator collator, int ce, char ch)
|
||||
private int nextThai(RuleBasedCollator collator, int ce, char ch)
|
||||
{
|
||||
if (m_bufferOffset_ != -1 // already swapped
|
||||
|| isEnd() || !isThaiBaseConsonant(m_source_.current())) {
|
||||
@ -1430,6 +1505,7 @@ public final class CollationElementIterator
|
||||
* @param collator collator to use
|
||||
* @param ce current ce
|
||||
* @param entrybackup entry backup iterator status
|
||||
* @return ce of the next contraction
|
||||
*/
|
||||
private int nextContraction(RuleBasedCollator collator, int ce)
|
||||
{
|
||||
@ -1895,7 +1971,7 @@ public final class CollationElementIterator
|
||||
return collator.m_contractionCE_[entryoffset];
|
||||
}
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
while (collator.isUnsafe(ch)) {
|
||||
while (collator.isUnsafe(ch) || isThaiBaseConsonant(ch)) {
|
||||
buffer.insert(0, ch);
|
||||
ch = previousChar();
|
||||
if (isBackwardsStart()) {
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CollationKey.java,v $
|
||||
* $Date: 2002/05/16 20:04:49 $
|
||||
* $Revision: 1.5 $
|
||||
* $Date: 2002/06/21 23:56:44 $
|
||||
* $Revision: 1.6 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -15,28 +15,42 @@ package com.ibm.icu.text;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* <p>A <code>CollationKey</code> represents a <code>String</code> under the
|
||||
* <p>
|
||||
* A <code>CollationKey</code> represents a <code>String</code> under the
|
||||
* rules of a specific <code>Collator</code> object. Comparing two
|
||||
* <code>CollationKey</code>s returns the relative order of the
|
||||
* <code>String</code>s they represent. Using <code>CollationKey</code>s to
|
||||
* compare <code>String</code>s is generally faster than using
|
||||
* <code>Collator.compare</code>. Thus, when the <code>String</code>s must be
|
||||
* compared multiple times, for example when sorting a list of
|
||||
* <code>String</code>s. It's more efficient to use <code>CollationKey</code>s.
|
||||
* <code>String</code>s they represent.
|
||||
* </p>
|
||||
* <p>
|
||||
* <code>CollationKey</code> instances cannot be created directly. Rather,
|
||||
* they are generated by calling <code>Collator.getCollationKey(String)</code>.
|
||||
* Since the rule set of each <code>Collator</code> differs, the sort orders of
|
||||
* the same string under two unique <code>Collator</code> may not be the same.
|
||||
* Hence comparing <code>CollationKey</code>s generated from different
|
||||
* <code>Collator</code> objects may not give the right results.
|
||||
* </p>
|
||||
* <p>
|
||||
* Similar to <code>CollationKey.compareTo(CollationKey)</code>,
|
||||
* the method <code>RuleBasedCollator.compare(String, String)</code> compares
|
||||
* two strings and returns the relative order. During the construction
|
||||
* of a <code>CollationKey</code> object, the entire source string is examined
|
||||
* and processed into a series of bits that are stored in the
|
||||
* <code>CollationKey</code> object. Bitwise comparison on the bit sequences
|
||||
* is then performed during <code>CollationKey.compareTo(CollationKey)</code>.
|
||||
* This comparison could incur expensive startup costs while creating
|
||||
* the <code>CollationKey</code> object, but once the objects are created,
|
||||
* binary comparisons are fast, and are recommended when the same strings are
|
||||
* to be compared over and over again.
|
||||
* On the other hand <code>Collator.compare(String, String)</code> examines
|
||||
* and processes the string only until the first characters differing in order,
|
||||
* and is recommended for use if the <code>String</code>s are to be compared only
|
||||
* once.
|
||||
* </p>
|
||||
* <p>
|
||||
* Details of the composition of the bit sequence are located at
|
||||
* <a href="http://oss.software.ibm.com/icu/userguide/Collate_ServiceArchitecture.html">
|
||||
* user guide</a>.
|
||||
* </p>
|
||||
* <p>You can not create <code>CollationKey</code>s directly. Rather, generate
|
||||
* them by calling <code>Collator.getCollationKey(String)</code>. You can only
|
||||
* compare <code>CollationKey</code>s generated from the same
|
||||
* <code>Collator</code> object.</p>
|
||||
* <p>Generating a <code>CollationKey</code> for a <code>String</code>
|
||||
* involves examining the entire <code>String</code> and converting it to
|
||||
* series of bits that can be compared bitwise. This allows fast comparisons
|
||||
* once the keys are generated. The cost of generating keys is recouped in
|
||||
* faster comparisons when <code>String</code>s need to be compared many
|
||||
* times. On the other hand, the result of a comparison is often determined by
|
||||
* the first couple of characters of each <code>String</code>.
|
||||
* <code>Collator.compare(String, String)</code> examines only as many characters as it needs
|
||||
* which allows it to be faster when doing single comparisons.</p>
|
||||
* <p>The following example shows how <code>CollationKey</code>s might be used
|
||||
* to sort a list of <code>String</code>s.</p>
|
||||
* <blockquote>
|
||||
@ -63,7 +77,7 @@ import java.util.Arrays;
|
||||
* System.out.println( keys[2].getSourceString() );
|
||||
* </pre>
|
||||
* </blockquote>
|
||||
*
|
||||
* </p>
|
||||
* @see Collator
|
||||
* @see RuleBasedCollator
|
||||
* @author Syn Wee Quek
|
||||
@ -77,7 +91,7 @@ public final class CollationKey implements Comparable
|
||||
// public getters -------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns the String that this CollationKey represents.
|
||||
* Returns the source string that this CollationKey represents.
|
||||
* @return source string that this CollationKey represents
|
||||
* @draft 2.2
|
||||
*/
|
||||
@ -87,11 +101,44 @@ public final class CollationKey implements Comparable
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Duplicates and returns the value of this CollationKey as a sequence
|
||||
* of big-endian bytes.</p>
|
||||
* <p>If two CollationKeys could be legitimately compared, then one could
|
||||
* compare the byte arrays of each to obtain the same result.</p>
|
||||
* @return CollationKey value in a sequence of big-endian byte bytes.
|
||||
* <p>
|
||||
* Duplicates and returns the value of this CollationKey as a sequence
|
||||
* of big-endian bytes terminated by a null.
|
||||
* </p>
|
||||
* <p>
|
||||
* If two CollationKeys could be legitimately compared, then one could
|
||||
* compare the byte arrays of each to obtain the same result.
|
||||
* <pre>
|
||||
* byte key1[] = collationkey1.toByteArray();
|
||||
* byte key2[] = collationkey2.toByteArray();
|
||||
* int i = 0;
|
||||
* while (key1[i] != 0 && key2[i] != 0) {
|
||||
* int key = key1[i] & 0xFF;
|
||||
* int targetkey = key2[i] & 0xFF;
|
||||
* if (key < targetkey) {
|
||||
* System.out.println("String 1 is less than string 2");
|
||||
* return;
|
||||
* }
|
||||
* if (targetkey < key) {
|
||||
* System.out.println("String 1 is more than string 2");
* return;
|
||||
* }
|
||||
* i ++;
|
||||
* }
|
||||
* int key = key1[i] & 0xFF;
|
||||
* int targetkey = key2[i] & 0xFF;
|
||||
* if (key < targetkey) {
|
||||
* System.out.println("String 1 is less than string 2");
|
||||
* return;
|
||||
* }
|
||||
* if (targetkey < key) {
|
||||
* System.out.println("String 1 is more than string 2");
|
||||
* return;
|
||||
* }
|
||||
* System.out.println("String 1 is equal to string 2");
|
||||
* </pre>
|
||||
* </p>
|
||||
* @return CollationKey value in a sequence of big-endian bytes
|
||||
* terminated by a null.
|
||||
* @draft 2.2
|
||||
*/
|
||||
public byte[] toByteArray()
|
||||
@ -112,15 +159,22 @@ public final class CollationKey implements Comparable
|
||||
// public other methods -------------------------------------------------
|
||||
|
||||
/**
|
||||
* <p>Compare this CollationKey to the target CollationKey. The collation
|
||||
* rules of the Collator object which created these keys are applied.</p>
|
||||
* <p><strong>Note:</strong> CollationKeys created by different Collators
|
||||
* can not be compared.</p>
|
||||
* <p>
|
||||
* Compare this CollationKey to the argument target CollationKey.
|
||||
* The collation
|
||||
* rules of the Collator object which created these keys are applied.
|
||||
* </p>
|
||||
* <p>
|
||||
* <strong>Note:</strong> Comparison between CollationKeys created by
|
||||
* different Collators may not return the correct result. See class
|
||||
* documentation.
|
||||
* </p>
|
||||
* @param target target CollationKey
|
||||
* @return an integer value, if value is less than zero this CollationKey
|
||||
* is less than target, value is zero if they are equal
|
||||
* and value is greater than zero if this CollationKey is greater
|
||||
* than target.
|
||||
* @exception NullPointerException thrown when argument is null.
|
||||
* @see Collator#compare(String, String)
|
||||
* @draft 2.2
|
||||
*/
|
||||
@ -151,13 +205,21 @@ public final class CollationKey implements Comparable
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Compares this CollationKey with the specified Object.</p>
|
||||
* <p>
|
||||
* Compares this CollationKey with the specified Object.
|
||||
* The collation
|
||||
* rules of the Collator object which created these objects are applied.
|
||||
* </p>
|
||||
* <p>
|
||||
* See note in compareTo(CollationKey) for warnings of incorrect results
|
||||
* </p>
|
||||
* @param obj the Object to be compared.
|
||||
* @return Returns a negative integer, zero, or a positive integer
|
||||
* respectively if this CollationKey is less than, equal to, or
|
||||
* greater than the given Object.
|
||||
* @exception ClassCastException thrown when the specified Object is not a
|
||||
* CollationKey.
|
||||
* @exception ClassCastException thrown when the specified argument is not
|
||||
* a CollationKey. NullPointerException thrown when argument
|
||||
* is null.
|
||||
* @see #compareTo(CollationKey)
|
||||
* @draft 2.2
|
||||
*/
|
||||
@ -167,22 +229,52 @@ public final class CollationKey implements Comparable
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Compare this CollationKey and the target CollationKey for equality.
|
||||
* <p>
|
||||
* Compare this CollationKey and the argument target object for equality.
|
||||
* The collation
|
||||
* rules of the Collator object which created these objects are applied.
|
||||
* </p>
|
||||
* <p>The collation rules of the Collator object which created these keys
|
||||
* are applied.</p>
|
||||
* <p><strong>Note:</strong> CollationKeys created by different Collators
|
||||
* can not be compared.</p>
|
||||
* @param target the CollationKey to compare to.
|
||||
* <p>
|
||||
* See note in compareTo(CollationKey) for warnings of incorrect results
|
||||
* </p>
|
||||
* @param target the object to compare to.
|
||||
* @return true if two objects are equal, false otherwise.
|
||||
* @see #compareTo(CollationKey)
|
||||
* @exception ClassCastException thrown when the specified argument is not
|
||||
* a CollationKey. NullPointerException thrown when argument
|
||||
* is null.
|
||||
* @draft 2.2
|
||||
*/
|
||||
public boolean equals(Object target)
|
||||
{
|
||||
if (!(target instanceof CollationKey)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return equals((CollationKey)target);
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Compare this CollationKey and the argument target CollationKey for
|
||||
* equality.
|
||||
* The collation
|
||||
* rules of the Collator object which created these objects are applied.
|
||||
* </p>
|
||||
* <p>
|
||||
* See note in compareTo(CollationKey) for warnings of incorrect results
|
||||
* </p>
|
||||
* @param target the CollationKey to compare to.
|
||||
* @return true if two objects are equal, false otherwise.
|
||||
* @exception NullPointerException thrown when argument is null.
|
||||
* @draft 2.2
|
||||
*/
|
||||
public boolean equals(CollationKey target)
|
||||
{
|
||||
if (this == target) {
|
||||
return true;
|
||||
}
|
||||
if (target == null || !(target instanceof CollationKey)) {
|
||||
if (target == null) {
|
||||
return false;
|
||||
}
|
||||
CollationKey other = (CollationKey)target;
|
||||
@ -200,12 +292,13 @@ public final class CollationKey implements Comparable
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Creates a hash code for this CollationKey. The hash value is
|
||||
* calculated on the key itself, not the String from which the key was
|
||||
* created. Thus if x and y are CollationKeys, then
|
||||
* x.hashCode(x) == y.hashCode() if x.equals(y) is true. This allows
|
||||
* language-sensitive comparison in a hash table.</p>
|
||||
* <p>See the CollatinKey class description for an example.</p>
|
||||
* <p>
|
||||
* Creates a hash code for this CollationKey. The hash value is calculated
|
||||
* on the key itself, not the String from which the key was created. Thus
|
||||
* if x and y are CollationKeys, then x.hashCode() == y.hashCode()
|
||||
* if x.equals(y) is true. This allows language-sensitive comparison in a
|
||||
* hash table.
|
||||
* </p>
|
||||
* @return the hash value.
|
||||
* @draft 2.2
|
||||
*/
|
||||
|
3487
icu4j/src/com/ibm/icu/text/CollationParsedRuleBuilder.java
Normal file
3487
icu4j/src/com/ibm/icu/text/CollationParsedRuleBuilder.java
Normal file
File diff suppressed because it is too large
Load Diff
1724
icu4j/src/com/ibm/icu/text/CollationRuleParser.java
Normal file
1724
icu4j/src/com/ibm/icu/text/CollationRuleParser.java
Normal file
File diff suppressed because it is too large
Load Diff
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Collator.java,v $
|
||||
* $Date: 2002/05/20 23:43:01 $
|
||||
* $Revision: 1.6 $
|
||||
* $Date: 2002/06/21 23:56:44 $
|
||||
* $Revision: 1.7 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -15,57 +15,103 @@ package com.ibm.icu.text;
|
||||
import java.util.Locale;
|
||||
|
||||
/**
|
||||
* <p>The Collator class performs locale-sensitive String comparison.
|
||||
* You use this class to build searching and sorting routines for natural
|
||||
* language text.</p>
|
||||
* <p>Collator is an abstract base class. Subclasses implement specific
|
||||
* collation strategies. One subclass, RuleBasedCollator, is currently
|
||||
* provided and is applicable to a wide set of languages. Other subclasses
|
||||
* may be created to handle more specialized needs.</p>
|
||||
* <p>Like other locale-sensitive classes, you can use the static factory
|
||||
* method, getInstance, to obtain the appropriate Collator object for a given
|
||||
* locale. You will only need to look at the subclasses of Collator if you need
|
||||
* to understand the details of a particular collation strategy or if you need
|
||||
* to modify that strategy. </p>
|
||||
* <p>The following example shows how to compare two strings using the Collator
|
||||
* for the default locale.
|
||||
* <p>
|
||||
* Collator is an abstract base class, its subclasses perform
|
||||
* locale-sensitive String comparison. A concrete subclass, RuleBasedCollator,
|
||||
* is provided and it allows customization of the collation ordering by the use
|
||||
* of rule sets.
|
||||
* </p>
|
||||
* <p>
|
||||
* Following the
|
||||
* <a href=http://www.unicode.org>Unicode Consortium</a>'s specifications for
|
||||
* the <a href=http://www.unicode.org/unicode/reports/tr10/>
|
||||
* Unicode Collation Algorithm (UCA)</a>, there are
|
||||
* 5 different levels of strength used in comparisons.
|
||||
* <ul>
|
||||
* <li>PRIMARY strength: Typically, this is used to denote differences between
|
||||
* base characters (for example, "a" < "b").
|
||||
* It is the strongest difference. For example, dictionaries are divided
|
||||
* into different sections by base character.
|
||||
* <li>SECONDARY strength: Accents in the characters are considered secondary
|
||||
* differences (for example, "as" < "às" < "at"). Other
|
||||
* differences
|
||||
* between letters can also be considered secondary differences, depending
|
||||
* on the language. A secondary difference is ignored when there is a
|
||||
* primary difference anywhere in the strings.
|
||||
* <li>TERTIARY strength: Upper and lower case differences in characters are
|
||||
* distinguished at tertiary strength (for example, "ao" < "Ao" <
|
||||
* "aò"). In addition, a variant of a letter differs from the base
|
||||
* form on the tertiary strength (such as "A" and "Ⓐ"). Another
|
||||
* example is the
|
||||
* difference between large and small Kana. A tertiary difference is ignored
|
||||
* when there is a primary or secondary difference anywhere in the strings.
|
||||
* <li>QUATERNARY strength: When punctuation is ignored
|
||||
* <a href=http://www-124.ibm.com/icu/userguide/Collate_Concepts.html#Ignoring_Punctuation>
|
||||
* (see Ignoring Punctuations in the user guide)</a> at PRIMARY to TERTIARY
|
||||
* strength, an additional strength level can
|
||||
* be used to distinguish words with and without punctuation (for example,
|
||||
* "ab" < "a-b" < "aB").
|
||||
* This difference is ignored when there is a PRIMARY, SECONDARY or TERTIARY
|
||||
* difference. The QUATERNARY strength should only be used if ignoring
|
||||
* punctuation is required.
|
||||
* <li>IDENTICAL strength:
|
||||
* When all other strengths are equal, the IDENTICAL strength is used as a
|
||||
* tiebreaker. The Unicode code point values of the NFD form of each string
|
||||
* are compared, just in case there is no difference.
|
||||
* For example, Hebrew cantillation marks are only distinguished at this
|
||||
* strength. This strength should be used sparingly, as only code point
|
||||
* values differences between two strings is an extremely rare occurrence.
|
||||
* Using this strength substantially decreases the performance for both
|
||||
* comparison and collation key generation APIs. This strength also
|
||||
* increases the size of the collation key.
|
||||
* </ul>
|
||||
* Unlike the JDK, ICU4J's Collator deals only with 2 decomposition modes,
|
||||
* the canonical decomposition mode and one that does not use any decomposition.
|
||||
* The compatibility decomposition mode, java.text.Collator.FULL_DECOMPOSITION
|
||||
* is not supported here. If the canonical
|
||||
* decomposition mode is set, the Collator handles un-normalized text properly,
|
||||
* producing the same results as if the text were normalized in NFD. If
|
||||
* canonical decomposition is turned off, it is the user's responsibility to
|
||||
* ensure that all text is already in the appropriate form before performing
|
||||
* a comparison or before getting a CollationKey.
|
||||
* </p>
|
||||
* <p>
|
||||
* For more information about the collation service see the
|
||||
* <a href="http://oss.software.ibm.com/icu/userguide/Collate_Intro.html">users
|
||||
* guide</a>.
|
||||
* </p>
|
||||
* <p>
|
||||
* Examples of use
|
||||
* <pre>
|
||||
* // Compare two strings in the default locale
|
||||
* Collator myCollator = Collator.getInstance();
|
||||
* if (myCollator.compare("abc", "ABC") < 0) {
|
||||
* System.out.println("abc is less than ABC");
|
||||
* }
|
||||
* else {
|
||||
* System.out.println("abc is greater than or equal to ABC");
|
||||
* }
|
||||
* </pre>
|
||||
* <p>You can set a <code>Collator</code>'s <em>strength</em> property to
|
||||
* determine the level of difference considered significant in comparisons.
|
||||
* Four strengths are provided: <code>PRIMARY</code>, <code>SECONDARY</code>,
|
||||
* <code>TERTIARY</code>, and <code>IDENTICAL</code>. The exact assignment of
|
||||
* strengths to language features is locale dependant. For example, in Czech,
|
||||
* "e" and "f" are considered primary differences, while "e" and "\u00EA" are
|
||||
* secondary differences, "e" and "E" are tertiary differences and "e" and "e"
|
||||
* are identical. The following shows how both case and accents could be
|
||||
* ignored for US English.</p>
|
||||
* <pre>
|
||||
* //Get the Collator for US English and set its strength to PRIMARY
|
||||
* // Get the Collator for US English and set its strength to PRIMARY
|
||||
* Collator usCollator = Collator.getInstance(Locale.US);
|
||||
* usCollator.setStrength(Collator.PRIMARY);
|
||||
* if (usCollator.compare("abc", "ABC") == 0) {
|
||||
* System.out.println("Strings are equivalent");
|
||||
* }
|
||||
*
|
||||
* The following example shows how to compare two strings using the Collator
|
||||
* for the default locale.
|
||||
* // Compare two strings in the default locale
|
||||
* Collator myCollator = Collator.getInstance();
|
||||
* myCollator.setDecomposition(NO_DECOMPOSITION);
|
||||
* if (myCollator.compare("à\u0325", "a\u0325̀") != 0) {
|
||||
* System.out.println("à\u0325 is not equals to a\u0325̀ without decomposition");
|
||||
* myCollator.setDecomposition(CANONICAL_DECOMPOSITION);
|
||||
* if (myCollator.compare("à\u0325", "a\u0325̀") != 0) {
|
||||
* System.out.println("Error: à\u0325 should be equals to a\u0325̀ with decomposition");
|
||||
* }
|
||||
* else {
|
||||
* System.out.println("à\u0325 is equals to a\u0325̀ with decomposition");
|
||||
* }
|
||||
* }
|
||||
* else {
|
||||
* System.out.println("Error: à\u0325 should be not equals to a\u0325̀ without decomposition");
|
||||
* }
|
||||
* </pre>
|
||||
* <p>For comparing Strings exactly once, the compare method provides the best
|
||||
* performance. When sorting a list of Strings however, it is generally
|
||||
* necessary to compare each String multiple times. In this case,
|
||||
* CollationKeys provide better performance. The CollationKey class converts a
|
||||
* String to a series of bits that can be compared bitwise against other
|
||||
* CollationKeys. A CollationKey is created by a Collator object for a given
|
||||
* String.</p>
|
||||
* <p>Note: CollationKeys from different Collators can not be compared. See the
|
||||
* class description for CollationKey for an example using CollationKeys.
|
||||
* </p>
|
||||
* @see RuleBasedCollator
|
||||
* @see CollationKey
|
||||
* @author Syn Wee Quek
|
||||
* @since release 2.2, April 18 2002
|
||||
* @draft 2.2
|
||||
@ -76,92 +122,92 @@ public abstract class Collator
|
||||
// public data members ---------------------------------------------------
|
||||
|
||||
/**
|
||||
* Collator strength value. When set, only PRIMARY differences are
|
||||
* considered significant during comparison. The assignment of strengths
|
||||
* to language features is locale dependant. A common example is for
|
||||
* different base letters ("a" vs "b") to be considered a PRIMARY
|
||||
* difference.
|
||||
* Strongest collator strength value. Typically, used to denote differences
|
||||
* between base characters.
|
||||
* See class documentation for more explanation.
|
||||
* @see #setStrength
|
||||
* @see #getStrength
|
||||
* @draft 2.2
|
||||
*/
|
||||
public final static int PRIMARY
|
||||
= RuleBasedCollator.AttributeValue.PRIMARY_;
|
||||
public final static int PRIMARY = 0;
|
||||
/**
|
||||
* Collator strength value. When set, only SECONDARY and above
|
||||
* differences are considered significant during comparison. The
|
||||
* assignment of strengths to language features is locale dependant. A
|
||||
* common example is for different accented forms of the same base letter
|
||||
* ("a" vs "\u00E4") to be considered a SECONDARY difference.
|
||||
* Second level collator strength value.
|
||||
* Accents in the characters are considered secondary differences.
|
||||
* Other differences between letters can also be considered secondary
|
||||
* differences, depending on the language.
|
||||
* See class documentation for more explanation.
|
||||
* @see #setStrength
|
||||
* @see #getStrength
|
||||
* @draft 2.2
|
||||
*/
|
||||
public final static int SECONDARY
|
||||
= RuleBasedCollator.AttributeValue.SECONDARY_;
|
||||
public final static int SECONDARY = 1;
|
||||
/**
|
||||
* Collator strength value. When set, only TERTIARY and above differences
|
||||
* are considered significant during comparison. The assignment of
|
||||
* strengths to language features is locale dependant. A common example is
|
||||
* for case differences ("a" vs "A") to be considered a TERTIARY
|
||||
* difference.
|
||||
* Third level collator strength value.
|
||||
* Upper and lower case differences in characters are distinguished at this
|
||||
* strength level. In addition, a variant of a letter differs from the base
|
||||
* form on the tertiary level.
|
||||
* See class documentation for more explanation.
|
||||
* @see #setStrength
|
||||
* @see #getStrength
|
||||
* @draft 2.2
|
||||
*/
|
||||
public final static int TERTIARY
|
||||
= RuleBasedCollator.AttributeValue.TERTIARY_;
|
||||
|
||||
public final static int TERTIARY = 2;
|
||||
/**
|
||||
* Collator strength value. When set, only QUATERNARY and above differences
|
||||
* are considered significant during comparison. The assignment of
|
||||
* strengths to language features is locale dependent.
|
||||
* difference.
|
||||
* Fourth level collator strength value.
|
||||
* When punctuation is ignored
|
||||
* <a href="http://www-124.ibm.com/icu/userguide/Collate_Concepts.html#Ignoring_Punctuation">
|
||||
* (see Ignoring Punctuations in the user guide)</a> at PRIMARY to TERTIARY
|
||||
* strength, an additional strength level can
|
||||
* be used to distinguish words with and without punctuation.
|
||||
* See class documentation for more explanation.
|
||||
* @see #setStrength
|
||||
* @see #getStrength
|
||||
* @draft 2.2
|
||||
*/
|
||||
public final static int QUATERNARY
|
||||
= RuleBasedCollator.AttributeValue.QUATERNARY_;
|
||||
|
||||
public final static int QUATERNARY = 3;
|
||||
/**
|
||||
* <p>Collator strength value. When set, all differences are considered
|
||||
* significant during comparison. The assignment of strengths to language
|
||||
* features is locale dependent. A common example is for control
|
||||
* characters ("\u0001" vs "\u0002") to be considered equal at
|
||||
* the PRIMARY, SECONDARY, and TERTIARY levels but different at the
|
||||
* IDENTICAL level. Additionally, differences between pre-composed
|
||||
* accents such as "\u00C0" (A-grave) and combining accents such as
|
||||
* "A\u0300" (A, combining-grave) will be considered significant at
|
||||
* the tertiary level if decomposition is set to NO_DECOMPOSITION.
|
||||
* <p>
|
||||
* Smallest Collator strength value. When all other strengths are equal,
|
||||
* the IDENTICAL strength is used as a tiebreaker. The Unicode code point
|
||||
* values of the NFD form of each string are compared, just in case there
|
||||
* is no difference.
|
||||
* See class documentation for more explanation.
|
||||
* </p>
|
||||
* <p>
|
||||
* Note this value is different from JDK's
|
||||
* </p>
|
||||
* <p>Note this value is different from JDK's</p>
|
||||
* @draft 2.2
|
||||
*/
|
||||
public final static int IDENTICAL
|
||||
= RuleBasedCollator.AttributeValue.IDENTICAL_;
|
||||
public final static int IDENTICAL = 15;
|
||||
|
||||
/**
|
||||
* <p>Decomposition mode value. With NO_DECOMPOSITION set, accented
|
||||
* characters will not be decomposed for collation. This is the default
|
||||
* setting and provides the fastest collation but will only produce
|
||||
* correct results for languages that do not use accents.</p>
|
||||
* <p>Note this value is different from JDK's</p>
|
||||
* <p>
|
||||
* Decomposition mode value. With NO_DECOMPOSITION set, Strings will not be
|
||||
* decomposed for collation. This is the default
|
||||
* decomposition setting unless otherwise specified by the locale used
|
||||
* to create the Collator.
|
||||
* </p>
|
||||
* <p>
|
||||
* Note this value is different from JDK's
|
||||
* </p>
|
||||
* @see #CANONICAL_DECOMPOSITION
|
||||
* @see #getDecomposition
|
||||
* @see #setDecomposition
|
||||
* @draft 2.2
|
||||
*/
|
||||
public final static int NO_DECOMPOSITION
|
||||
= RuleBasedCollator.AttributeValue.OFF_;
|
||||
|
||||
public final static int NO_DECOMPOSITION = 16;
|
||||
/**
|
||||
* <p>Decomposition mode value. With CANONICAL_DECOMPOSITION set,
|
||||
* <p>
|
||||
* Decomposition mode value. With CANONICAL_DECOMPOSITION set,
|
||||
* characters that are canonical variants according to Unicode 2.0 will be
|
||||
* decomposed for collation. This should be used to get correct collation
|
||||
* of accented characters.</p>
|
||||
* <p>CANONICAL_DECOMPOSITION corresponds to Normalization Form D as
|
||||
* decomposed for collation.
|
||||
* </p>
|
||||
* <p>
|
||||
* CANONICAL_DECOMPOSITION corresponds to Normalization Form D as
|
||||
* described in <a href="http://www.unicode.org/unicode/reports/tr15/">
|
||||
* Unicode Technical Report #15</a>.</p>
|
||||
* Unicode Technical Report #15</a>.
|
||||
* </p>
|
||||
* @see #NO_DECOMPOSITION
|
||||
* @see #getDecomposition
|
||||
* @see #setDecomposition
|
||||
* @draft 2.2
|
||||
@ -173,9 +219,15 @@ public abstract class Collator
|
||||
// public setters --------------------------------------------------------
|
||||
|
||||
/**
|
||||
* <p>Sets this Collator's strength property. The strength property
|
||||
* <p>
|
||||
* Sets this Collator's strength property. The strength property
|
||||
* determines the minimum level of difference considered significant
|
||||
* during comparison.</p>
|
||||
* during comparison.
|
||||
* </p>
|
||||
* <p>
|
||||
* The default strength for the Collator is TERTIARY, unless specified
|
||||
* otherwise by the locale used to create the Collator.
|
||||
* </p>
|
||||
* <p>See the Collator class description for an example of use.</p>
|
||||
* @param newStrength the new strength value.
|
||||
* @see #getStrength
|
||||
@ -185,10 +237,11 @@ public abstract class Collator
|
||||
* @see #QUATERNARY
|
||||
* @see #IDENTICAL
|
||||
* @exception IllegalArgumentException If the new strength value is not one
|
||||
* of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL.
|
||||
* of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL.
|
||||
* @draft 2.2
|
||||
*/
|
||||
public void setStrength(int newStrength) {
|
||||
public void setStrength(int newStrength)
|
||||
{
|
||||
if ((newStrength != PRIMARY) &&
|
||||
(newStrength != SECONDARY) &&
|
||||
(newStrength != TERTIARY) &&
|
||||
@ -200,18 +253,38 @@ public abstract class Collator
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the decomposition mode of this Collator. See getDecomposition
|
||||
* for a description of decomposition mode.
|
||||
* <p>
|
||||
* Set the decomposition mode of this Collator.
|
||||
* Setting this decomposition property with CANONICAL_DECOMPOSITION allows
|
||||
* the Collator to handle
|
||||
* un-normalized text properly, producing the same results as if the text
|
||||
* were normalized. If NO_DECOMPOSITION is set, it is the user's
|
||||
* responsibility to ensure that all text is already in the appropriate
|
||||
* form before a comparison or before getting a CollationKey. Adjusting
|
||||
* decomposition mode allows the user to select between faster and more
|
||||
* complete collation behavior.
|
||||
* </p>
|
||||
* <p>
|
||||
* Since the great majority of the world's languages do not require text
|
||||
* normalization, most locales have NO_DECOMPOSITION as the default
|
||||
* decomposition mode.
|
||||
* <p>
|
||||
* The default decomposition mode for the Collator is NO_DECOMPOSITION,
|
||||
* unless specified otherwise by the locale used to create the Collator.
|
||||
* </p>
|
||||
* <p>
|
||||
* See getDecomposition for a description of decomposition mode.
|
||||
* </p>
|
||||
* @param decomposition the new decomposition mode
|
||||
* @see #getDecomposition
|
||||
* @see #NO_DECOMPOSITION
|
||||
* @see #CANONICAL_DECOMPOSITION
|
||||
* @see #FULL_DECOMPOSITION
|
||||
* @exception IllegalArgumentException If the given value is not a valid decomposition
|
||||
* mode.
|
||||
* @exception IllegalArgumentException If the given value is not a valid
|
||||
* decomposition mode.
|
||||
* @draft 2.2
|
||||
*/
|
||||
public void setDecomposition(int decomposition) {
|
||||
public void setDecomposition(int decomposition)
|
||||
{
|
||||
if ((decomposition != NO_DECOMPOSITION) &&
|
||||
(decomposition != CANONICAL_DECOMPOSITION)) {
|
||||
throw new IllegalArgumentException("Wrong decomposition mode.");
|
||||
@ -225,9 +298,11 @@ public abstract class Collator
|
||||
* Gets the Collator for the current default locale.
|
||||
* The default locale is determined by java.util.Locale.getDefault().
|
||||
* @return the Collator for the default locale (for example, en_US) if it
|
||||
* is created successfully, otherwise if there is a failure,
|
||||
* null will be returned.
|
||||
* is created successfully. Otherwise if there is no Collator
|
||||
* associated with the current locale, the default UCA collator
|
||||
* will be returned.
|
||||
* @see java.util.Locale#getDefault
|
||||
* @see #getInstance(Locale)
|
||||
* @draft 2.2
|
||||
*/
|
||||
public static final Collator getInstance()
|
||||
@ -238,11 +313,13 @@ public abstract class Collator
|
||||
/**
|
||||
* Gets the Collator for the desired locale.
|
||||
* @param locale the desired locale.
|
||||
* @return Collator for the desired locale if it is created successfully,
|
||||
* otherwise if there is a failure, the default UCA collator will
|
||||
* be returned.
|
||||
* @return Collator for the desired locale if it is created successfully.
|
||||
* Otherwise if there is no Collator
|
||||
* associated with the current locale, the default UCA collator
|
||||
* will be returned.
|
||||
* @see java.util.Locale
|
||||
* @see java.util.ResourceBundle
|
||||
* @see #getInstance()
|
||||
* @draft 2.2
|
||||
*/
|
||||
public static final Collator getInstance(Locale locale)
|
||||
@ -256,15 +333,19 @@ public abstract class Collator
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Returns this Collator's strength property. The strength property
|
||||
* determines the minimum level of difference considered significant
|
||||
* during comparison.</p>
|
||||
* <p>See the Collator class description for an example of use.</p>
|
||||
* <p>
|
||||
* Returns this Collator's strength property. The strength property
|
||||
* determines the minimum level of difference considered significant.
|
||||
* </p>
|
||||
* <p>
|
||||
* See the Collator class description for more details.
|
||||
* </p>
|
||||
* @return this Collator's current strength property.
|
||||
* @see #setStrength
|
||||
* @see #PRIMARY
|
||||
* @see #SECONDARY
|
||||
* @see #TERTIARY
|
||||
* @see #QUATERNARY
|
||||
* @see #IDENTICAL
|
||||
* @draft 2.2
|
||||
*/
|
||||
@ -274,24 +355,17 @@ public abstract class Collator
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Get the decomposition mode of this Collator. Decomposition mode
|
||||
* determines how Unicode composed characters are handled. Adjusting
|
||||
* decomposition mode allows the user to select between faster and more
|
||||
* complete collation behavior.
|
||||
* <p>The three values for decomposition mode are:
|
||||
* <UL>
|
||||
* <LI>NO_DECOMPOSITION,
|
||||
* <LI>CANONICAL_DECOMPOSITION
|
||||
* <LI>FULL_DECOMPOSITION.
|
||||
* </UL>
|
||||
* See the documentation for these three constants for a description
|
||||
* of their meaning.
|
||||
* <p>
|
||||
* Get the decomposition mode of this Collator. Decomposition mode
|
||||
* determines how Unicode composed characters are handled.
|
||||
* </p>
|
||||
* <p>
|
||||
* See the Collator class description for more details.
|
||||
* </p>
|
||||
* @return the decomposition mode
|
||||
* @see #setDecomposition
|
||||
* @see #NO_DECOMPOSITION
|
||||
* @see #CANONICAL_DECOMPOSITION
|
||||
* @see #FULL_DECOMPOSITION
|
||||
* @draft 2.2
|
||||
*/
|
||||
public int getDecomposition()
|
||||
@ -302,91 +376,68 @@ public abstract class Collator
|
||||
// public other methods -------------------------------------------------
|
||||
|
||||
/**
|
||||
* Convenience method for comparing the equality of two strings based on
|
||||
* this Collator's collation rules.
|
||||
* Convenience method for comparing the equality of two text Strings based
|
||||
* on this Collator's collation rules, strength and decomposition mode.
|
||||
* @param source the source string to be compared with.
|
||||
* @param target the target string to be compared with.
|
||||
* @return true if the strings are equal according to the collation
|
||||
* rules. false, otherwise.
|
||||
* @see #compare
|
||||
* @exception NullPointerException thrown if either argument is null.
|
||||
* @draft 2.2
|
||||
*/
|
||||
public boolean equals(String source, String target)
|
||||
public boolean equals(String source, String target)
|
||||
{
|
||||
return (compare(source, target) == 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Cloning this Collator.
|
||||
* @return a cloned Collator of this object
|
||||
* @draft 2.2
|
||||
*/
|
||||
public Object clone()
|
||||
{
|
||||
try {
|
||||
return (Collator)super.clone();
|
||||
} catch (CloneNotSupportedException e) {
|
||||
throw new InternalError();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares the equality of two Collators.
|
||||
* @param that the Collator to be compared with this.
|
||||
* @return true if this Collator is the same as that Collator;
|
||||
* false otherwise.
|
||||
* false otherwise.
|
||||
* @draft 2.2
|
||||
*/
|
||||
public boolean equals(Object that)
|
||||
{
|
||||
if (this == that) {
|
||||
return true;
|
||||
}
|
||||
if (that == null || getClass() != that.getClass()) {
|
||||
return false;
|
||||
}
|
||||
Collator other = (Collator) that;
|
||||
return ((m_strength_ == other.m_strength_) &&
|
||||
(m_decomposition_ == other.m_decomposition_));
|
||||
}
|
||||
public abstract boolean equals(Object that);
|
||||
|
||||
// public abstract methods -----------------------------------------------
|
||||
|
||||
/**
|
||||
* Generates the hash code for this Collator.
|
||||
* Generates a unique hash code for this Collator.
|
||||
* @draft 2.2
|
||||
* @return 32 bit unique hash code
|
||||
*/
|
||||
public abstract int hashCode();
|
||||
|
||||
/**
|
||||
* <p>Compares the source string to the target string according to the
|
||||
* collation rules for this Collator. Returns an integer less than, equal
|
||||
* to or greater than zero depending on whether the source String is less
|
||||
* than, equal to or greater than the target string. See the Collator
|
||||
* class description for an example of use.</p>
|
||||
* <p>For a one time comparison, this method has the best performance. If
|
||||
* a given String will be involved in multiple comparisons,
|
||||
* CollationKey.compareTo() has the best performance. See the Collator
|
||||
* class description for an example using CollationKeys.</p>
|
||||
* @param source the source string.
|
||||
* @param target the target string.
|
||||
* <p>
|
||||
* Compares the source text String to the target text String according to
|
||||
* the collation rules, strength and decomposition mode for this Collator.
|
||||
* Returns an integer less than,
|
||||
* equal to or greater than zero depending on whether the source String is
|
||||
* less than, equal to or greater than the target String. See the Collator
|
||||
* class description for an example of use.
|
||||
* </p>
|
||||
* @param source the source String.
|
||||
* @param target the target String.
|
||||
* @return Returns an integer value. Value is less than zero if source is
|
||||
* less than target, value is zero if source and target are equal,
|
||||
* value is greater than zero if source is greater than target.
|
||||
* @see CollationKey
|
||||
* @see #getCollationKey
|
||||
* @exception NullPointerException thrown if either argument is null.
|
||||
* @draft 2.2
|
||||
*/
|
||||
public abstract int compare(String source, String target);
|
||||
|
||||
/**
|
||||
* <p>Transforms the String into a series of bits that can be compared
|
||||
* bitwise to other CollationKeys. CollationKeys provide better
|
||||
* performance than Collator.compare() when Strings are involved in
|
||||
* multiple comparisons.</p>
|
||||
* <p>See the Collator class description for an example using
|
||||
* CollationKeys.</p>
|
||||
* @param source the string to be transformed into a collation key.
|
||||
* <p>
|
||||
* Transforms the String into a series of bits that can be compared
|
||||
* bitwise to other CollationKeys. The bits generated depend on the collation
|
||||
* rules, strength and decomposition mode.
|
||||
* </p>
|
||||
* <p>See the CollationKey class documentation for more information.</p>
|
||||
* @param source the string to be transformed into a CollationKey.
|
||||
* @return the CollationKey for the given String based on this Collator's
|
||||
* collation rules. If the source String is null, a null
|
||||
* CollationKey is returned.
|
||||
@ -396,35 +447,18 @@ public abstract class Collator
|
||||
*/
|
||||
public abstract CollationKey getCollationKey(String source);
|
||||
|
||||
// protected data members ------------------------------------------------
|
||||
// protected constructor -------------------------------------------------
|
||||
|
||||
|
||||
// private data members --------------------------------------------------
|
||||
|
||||
/**
|
||||
* Collation strength
|
||||
*/
|
||||
protected int m_strength_;
|
||||
private int m_strength_ = TERTIARY;
|
||||
/**
|
||||
* Decomposition mode
|
||||
*/
|
||||
protected int m_decomposition_;
|
||||
|
||||
// protected constructor -------------------------------------------------
|
||||
|
||||
/**
|
||||
* <p>Protected constructor for use by subclasses.
|
||||
* Public access to creating Collators is handled by the API getInstance().
|
||||
* </p>
|
||||
* @draft 2.2
|
||||
*/
|
||||
protected Collator() throws Exception
|
||||
{
|
||||
m_strength_ = TERTIARY;
|
||||
m_decomposition_ = CANONICAL_DECOMPOSITION;
|
||||
}
|
||||
|
||||
// protected methods -----------------------------------------------------
|
||||
|
||||
// private variables -----------------------------------------------------
|
||||
|
||||
// private methods -------------------------------------------------------
|
||||
private int m_decomposition_ = CANONICAL_DECOMPOSITION;
|
||||
}
|
||||
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CollatorReader.java,v $
|
||||
* $Date: 2002/05/16 20:04:49 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2002/06/21 23:56:47 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -140,26 +140,28 @@ final class CollatorReader
|
||||
* @exception IOException thrown when there's a data error.
|
||||
* @draft 2.2
|
||||
*/
|
||||
public void readOptions(RuleBasedCollator rbc) throws IOException
|
||||
protected void readOptions(RuleBasedCollator rbc) throws IOException
|
||||
{
|
||||
rbc.m_variableTopValue_ = m_dataInputStream_.readInt();
|
||||
rbc.setAttributeDefault(RuleBasedCollator.Attribute.FRENCH_COLLATION_,
|
||||
m_dataInputStream_.readInt());
|
||||
rbc.setAttributeDefault(
|
||||
RuleBasedCollator.Attribute.ALTERNATE_HANDLING_,
|
||||
m_dataInputStream_.readInt());
|
||||
rbc.setAttributeDefault(RuleBasedCollator.Attribute.CASE_FIRST_,
|
||||
m_dataInputStream_.readInt());
|
||||
rbc.setAttributeDefault(RuleBasedCollator.Attribute.CASE_LEVEL_,
|
||||
m_dataInputStream_.readInt());
|
||||
rbc.setAttributeDefault(
|
||||
RuleBasedCollator.Attribute.NORMALIZATION_MODE_,
|
||||
m_dataInputStream_.readInt());
|
||||
rbc.setAttributeDefault(RuleBasedCollator.Attribute.STRENGTH_,
|
||||
m_dataInputStream_.readInt());
|
||||
rbc.setAttributeDefault(
|
||||
RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_,
|
||||
m_dataInputStream_.readInt());
|
||||
rbc.m_defaultIsFrenchCollation_ = (m_dataInputStream_.readInt()
|
||||
== RuleBasedCollator.AttributeValue.ON_);
|
||||
rbc.m_defaultIsAlternateHandlingShifted_
|
||||
= (m_dataInputStream_.readInt() ==
|
||||
RuleBasedCollator.AttributeValue.SHIFTED_);
|
||||
rbc.m_defaultCaseFirst_ = m_dataInputStream_.readInt();
|
||||
rbc.m_defaultIsCaseLevel_ = (m_dataInputStream_.readInt()
|
||||
== RuleBasedCollator.AttributeValue.ON_);
|
||||
int value = m_dataInputStream_.readInt();
|
||||
if (value == RuleBasedCollator.AttributeValue.ON_) {
|
||||
value = Collator.CANONICAL_DECOMPOSITION;
|
||||
}
|
||||
else {
|
||||
value = Collator.NO_DECOMPOSITION;
|
||||
}
|
||||
rbc.m_defaultDecomposition_ = value;
|
||||
rbc.m_defaultStrength_ = m_dataInputStream_.readInt();
|
||||
rbc.m_defaultIsHiragana4_ = (m_dataInputStream_.readInt()
|
||||
== RuleBasedCollator.AttributeValue.ON_);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -169,7 +171,7 @@ final class CollatorReader
|
||||
* @exception IOException thrown when there's a data error.
|
||||
* @draft 2.2
|
||||
*/
|
||||
public void read(RuleBasedCollator rbc) throws IOException
|
||||
protected void read(RuleBasedCollator rbc) throws IOException
|
||||
{
|
||||
readHeader(rbc);
|
||||
readOptions(rbc);
|
||||
@ -188,7 +190,8 @@ final class CollatorReader
|
||||
for (int i = 0; i < m_contractionCESize_; i ++) {
|
||||
rbc.m_contractionCE_[i] = m_dataInputStream_.readInt();
|
||||
}
|
||||
rbc.m_trie_ = new IntTrie(m_dataInputStream_, rbc);
|
||||
rbc.m_trie_ = new IntTrie(m_dataInputStream_,
|
||||
RuleBasedCollator.DataManipulate.getInstance());
|
||||
if (!rbc.m_trie_.isLatin1Linear()) {
|
||||
throw new IOException("Data corrupted, "
|
||||
+ "Collator Tries expected to have linear "
|
||||
@ -213,6 +216,43 @@ final class CollatorReader
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads in the inverse uca data
|
||||
* @param input input stream with the inverse uca data
|
||||
* @return an object containing the inverse uca data
|
||||
* @exception IOException thrown when error occurs while reading the
|
||||
* inverse uca
|
||||
*/
|
||||
protected static CollationParsedRuleBuilder.InverseUCA readInverseUCA(
|
||||
InputStream inputStream)
|
||||
throws IOException
|
||||
{
|
||||
ICUBinary.readHeader(inputStream, INVERSE_UCA_DATA_FORMAT_ID_,
|
||||
DATA_FORMAT_VERSION_, UNICODE_VERSION_);
|
||||
CollationParsedRuleBuilder.InverseUCA result =
|
||||
new CollationParsedRuleBuilder.InverseUCA();
|
||||
DataInputStream input = new DataInputStream(inputStream);
|
||||
int bytesize = input.readInt();
|
||||
int tablesize = input.readInt(); // in int size
|
||||
int contsize = input.readInt(); // in char size
|
||||
int table = input.readInt(); // in bytes
|
||||
int conts = input.readInt(); // in bytes
|
||||
int size = tablesize * 3; // one column for each strength
|
||||
result.m_table_ = new int[size];
|
||||
result.m_continuations_ = new char[contsize];
|
||||
|
||||
for (int i = 0; i < size; i ++) {
|
||||
result.m_table_[i] = input.readInt();
|
||||
}
|
||||
for (int i = 0; i < contsize; i ++) {
|
||||
result.m_continuations_[i] = input.readChar();
|
||||
}
|
||||
input.close();
|
||||
return result;
|
||||
}
|
||||
|
||||
// private inner class -----------------------------------------------
|
||||
|
||||
// private variables -------------------------------------------------
|
||||
|
||||
/**
|
||||
@ -231,6 +271,14 @@ final class CollatorReader
|
||||
private static final byte UNICODE_VERSION_[] = {(byte)0x3, (byte)0x0,
|
||||
(byte)0x0, (byte)0x0};
|
||||
/**
|
||||
* Inverse UCA file format version and id that this class understands.
|
||||
* No guarantees are made if an older version is used.
|
||||
*/
|
||||
private static final byte INVERSE_UCA_DATA_FORMAT_ID_[] = {(byte)0x49,
|
||||
(byte)0x6e,
|
||||
(byte)0x76,
|
||||
(byte)0x43};
|
||||
/**
|
||||
* Corrupted error string
|
||||
*/
|
||||
private static final String CORRUPTED_DATA_ERROR_ =
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user