builder completed but not fully tested.
and documentation updates.

X-SVN-Rev: 8914
This commit is contained in:
Syn Wee Quek 2002-06-21 23:57:56 +00:00
parent 4a6e11bcba
commit 1cef5c4d34
9 changed files with 10637 additions and 2318 deletions

View File

@ -6,53 +6,77 @@ import com.ibm.icu.impl.NormalizerImpl;
import com.ibm.icu.impl.UCharacterProperty;
/**
* <p>The <code>CollationElementIterator</code> class is used as an iterator
* to walk through each character of an international string. Use the iterator
* to return the ordering priority of the positioned character. The ordering
* priority of a character, which we refer to as a key, defines how a
* character is collated in the given collation object.</p>
* <p>For example, consider the following in Spanish:
* <p>
* The <code>CollationElementIterator</code> object is an iterator created
* by a RuleBasedCollator to walk through an international string. The return
* result of each iteration is a 32 bit collation element that defines the
* ordering priority of the next sequence of characters in the source string.
* </p>
* <p>For better illustration, consider the following in Spanish:
* <blockquote>
* <pre>
* "ca" -> the first key is key('c') and second key is key('a').
* "cha" -> the first key is key('ch') and second key is key('a').
* "ca" -> the first collation element is collation_element('c') and second
* collation element is collation_element('a').
*
* Since "ch" in Spanish sorts as one entity, the below example returns one
* collation element for the 2 characters 'c' and 'h'
*
* "cha" -> the first collation element is collation_element('ch') and second
* collation element is collation_element('a').
* </pre>
* </blockquote>
* And in German,
* <blockquote>
* <pre>
* "\u00e4b"-> the first key is key('a'), the second key is key('e'), and
* the third key is key('b').
* Since the character '&#230;' is a composed character of 'a' and 'e', the
* below example returns 2 collation elements for the single character
* '&#230;'
*
* "&#230;b" -> the first collation element is collation_element('a'), the
* second collation element is collation_element('e'), and the
* third collation element is collation_element('b').
* </pre>
* </blockquote>
* </p>
* <p>The key of a character is an integer composed of primary order(short),
* secondary order(byte), and tertiary order(byte). Java strictly defines
* the size and signedness of its primitive data types. Therefore, the static
* functions <code>primaryOrder</code>, <code>secondaryOrder</code>, and
* <code>tertiaryOrder</code> return <code>int</code>, <code>short</code>,
* and <code>short</code> respectively to ensure the correctness of the key
* value.</p>
* <p>
* Example of the iterator usage,
* For collation ordering comparison, the collation element results can not be
* compared simply by using basic arithmetic operators, e.g. &lt;, == or &gt;,
* further processing has to be done. Details can be found in the ICU
* <a href=http://oss.software.ibm.com/icu/userguide/Collate_ServiceArchitecture.html>
* user guide</a>. An example of using the CollationElementIterator for
* collation ordering comparison is the class <a href=StringSearch.html>
* com.ibm.icu.text.StringSearch</a>.
* </p>
* <p>
* To construct a CollationElementIterator object, users would have to call the
* factory method getCollationElementIterator() in a RuleBasedCollator object
* that defines the sorting order that is desired.
* </p>
* <p>
* Example:
* <blockquote>
* <pre>
* String testString = "This is a test";
* RuleBasedCollator ruleBasedCollator = (RuleBasedCollator)Collator.getInstance();
* CollationElementIterator collationElementIterator = ruleBasedCollator.getCollationElementIterator(testString);
* int primaryOrder = CollationElementIterator.primaryOrder(collationElementIterator.next());
* RuleBasedCollator rbc = new RuleBasedCollator("&amp;a&lt;b");
* CollationElementIterator collationElementIterator = rbc.getCollationElementIterator(testString);
* int primaryOrder = CollationElementIterator.IGNORABLE;
* while (primaryOrder != CollationElementIterator.NULLORDER) {
* int order = collationElementIterator.next();
* if (order != CollationElementIterator.IGNORABLE &&
* order != CollationElementIterator.NULLORDER) {
* // order is valid, not ignorable and we have not passed the end
* // of the iteration, we do something
* primaryOrder = CollationElementIterator.primaryOrder(order);
* System.out.println("Next primary order 0x" + Integer.toHexString(primaryOrder));
* }
* }
* </pre>
* </blockquote>
* </p>
* <p>
* <code>CollationElementIterator.next</code> returns the collation order
* of the next character. A collation order consists of primary order,
* secondary order and tertiary order. The data type of the collation
* order is <strong>int</strong>. The first 16 bits of a collation order
* is its primary order; the next 8 bits is the secondary order and the
* last 8 bits is the tertiary order.</p>
* @see Collator
* @see RuleBasedCollator
* @see Collator
* @see RuleBasedCollator
* @see StringSearch
* @author Syn Wee Quek
* @since release 2.2, April 18 2002
* @draft 2.2
@ -62,12 +86,22 @@ public final class CollationElementIterator
// public data members --------------------------------------------------
/**
* Null order which indicates the end of string is reached
* <p>This constant is returned by the iterator in the methods next() and
* previous() when the end or the beginning of the source string has been
* reached, and there are no more valid collation elements to return.</p>
* <p>See class documentation for an example of use.</p>
* @draft 2.2
* @see #next
* @see #previous
*/
public final static int NULLORDER = 0xffffffff;
/**
* Ignorable collation element order.
* <p>This constant is returned by the iterator in the methods next() and
* previous() when a collation element result is to be ignored.</p>
* <p>See class documentation for an example of use.</p>
* @draft 2.2
* @see #next
* @see #previous
*/
public static final int IGNORABLE = 0;
@ -76,24 +110,25 @@ public final class CollationElementIterator
// public getters -------------------------------------------------------
/**
* <p>Returns the character offset in the original text corresponding to
* the next collation element. (That is, getOffset() returns the position
* in the text corresponding to the collation element that will be
* returned by the next call to next().) This value could be either
* <p>Returns the character offset in the source string corresponding to
* the next collation element. i.e. getOffset() returns the position
* in source string corresponding to the collation element that will be
* returned by the next call to next(). This value could be either
* <ul>
* <li>index of the <b>first</b> character corresponding to the next
* <li> Index of the <b>first</b> character corresponding to the next
* collation element. This means that if <code>setOffset(offset)</code>
* sets the index in the middle of a contraction, <code>getOffset()</code>
* returns the index of the first character in the contraction, which
* may not be equals to offset.
* <li>if normalization is on, <code>getOffset()</code> may return the
* may not be equal to the original offset that was set. Hence calling
* getOffset() immediately after setOffset(offset) does not guarantee that
* the original offset set will be returned.
* <li> If normalization is on, <code>getOffset()</code> may return the
* index of the <b>immediate</b> subsequent character, or composite
* character with the first character, having a combining class of 0.
* <li> the length of the source string if iteration has reached the end.
* </ul>
* </p>
* <p>Note calling getOffset() immediately after setOffset(offset) may not
* return the value offset.</p>
* @return The character offset in the original text corresponding to the
* @return The character offset in the source string corresponding to the
* collation element that will be returned by the next call to
* next().
* @draft 2.2
@ -111,8 +146,11 @@ public final class CollationElementIterator
/**
* Return the maximum length of any expansion sequences that end with the
* specified collation element.
* <p>
* Returns the maximum length of any expansion sequence that ends with
* the argument collation element ce. If there is no expansion with the
* argument ce as the last element, 1 is returned.
* </p>
* @param ce a collation element returned by previous() or next().
* @return the maximum length of any expansion sequences ending
* with the specified collation element.
@ -122,9 +160,11 @@ public final class CollationElementIterator
{
int start = 0;
int limit = m_collator_.m_expansionEndCE_.length;
long unsignedce = ce & 0xFFFFFFFFl;
while (start < limit - 1) {
int mid = start + ((limit - start) >> 1);
if (ce <= m_collator_.m_expansionEndCE_[mid]) {
long midce = m_collator_.m_expansionEndCE_[mid] & 0xFFFFFFFFl;
if (unsignedce <= midce) {
limit = mid;
}
else {
@ -135,7 +175,8 @@ public final class CollationElementIterator
if (m_collator_.m_expansionEndCE_[start] == ce) {
result = m_collator_.m_expansionEndCEMaxSize_[start];
}
else if (m_collator_.m_expansionEndCE_[limit] == ce) {
else if (limit < m_collator_.m_expansionEndCE_.length &&
m_collator_.m_expansionEndCE_[limit] == ce) {
result = m_collator_.m_expansionEndCEMaxSize_[limit];
}
else if ((ce & 0xFFFF) == 0x00C0) {
@ -147,34 +188,49 @@ public final class CollationElementIterator
// public other methods -------------------------------------------------
/**
* <p>Resets the cursor to the beginning of the string. The next call
* to next() will return the first collation element in the string.</p>
* <p>
* Resets the cursor to the beginning of the string. The next call
* to next() and previous() will return the first and last collation
* elements in the string respectively.
* </p>
* <p>
* If the RuleBasedCollator used in this iterator has its attributes
* changed, calling reset() will reinitialize the iterator to use the new
* RuleBasedCollator attributes.
* </p>
* @draft 2.2
*/
public synchronized void reset()
public void reset()
{
m_source_.setIndex(0);
m_source_.setIndex(m_source_.getBeginIndex());
updateInternalState();
}
/**
* <p>Get the next collation element in the string.</p>
* <p>This iterator iterates over a sequence of collation elements that
* were built from the string. Because there isn't necessarily a
* one-to-one mapping from characters to collation elements, this doesn't
* mean the same thing as "return the collation element [or ordering
* priority] of the next character in the string".</p>
* <p>This function returns the collation element that the iterator is
* <p>
* Get the next collation element in the source string.
* </p>
* <p>
* This iterator iterates over a sequence of collation elements that were
* built from the string. Because there isn't necessarily a one-to-one
* mapping from characters to collation elements, this doesn't mean the
* same thing as "return the collation element [or ordering priority] of
* the next character in the string".
* </p>
* <p>
* This function returns the collation element that the iterator is
* currently pointing to and then updates the internal pointer to point to
* the next element. previous() updates the pointer first and then
* returns the element. This means that when you change direction while
* iterating (i.e., call next() and then call previous(), or call
* previous() and then call next()), you'll get back the same element
* twice.</p>
* @return the next collation element
* twice.
* </p>
* @return the next collation element or NULLORDER if the end of the
* iteration has been reached.
* @draft 2.2
*/
public synchronized int next()
public int next()
{
m_isForwards_ = true;
if (m_CEBufferSize_ > 0) {
@ -230,24 +286,30 @@ public final class CollationElementIterator
}
/**
* <p>Get the previous collation element in the string.</p>
* <p>This iterator iterates over a sequence of collation elements that
* <p>
* Get the previous collation element in the source string.
* </p>
* <p>
* This iterator iterates over a sequence of collation elements that
* were built from the string. Because there isn't necessarily a
* one-to-one mapping from characters to collation elements, this doesn't
* mean the same thing as "return the collation element [or ordering
* priority] of the previous character in the string".</p>
* <p>This function updates the iterator's internal pointer to point to
* priority] of the previous character in the string".
* </p>
* <p>
* This function updates the iterator's internal pointer to point to
* the collation element preceding the one it's currently pointing to and
* then returns that element, while next() returns the current element and
* then updates the pointer. This means that when you change direction
* while iterating (i.e., call next() and then call previous(), or call
* previous() and then call next()), you'll get back the same element
* twice.</p>
* twice.
* </p>
* @return the previous collation element, or NULLORDER when the start of
* the iteration has been reached.
* the iteration has been reached.
* @draft 2.2
*/
public synchronized int previous()
public int previous()
{
if (m_source_.getIndex() <= 0 && m_isForwards_) {
// if iterator is new or reset, we can immediately perform backwards
@ -317,50 +379,66 @@ public final class CollationElementIterator
}
/**
* Return the primary strength of a collation element.
* Return the primary order of a collation element ce.
* i.e. the first 16 bits of the argument ce.
* @param ce the collation element
* @return the element's primary strength
* @return the element's 16 bits primary order.
* @draft 2.2
*/
public final static int primaryOrder(int ce)
{
return (ce & RuleBasedCollator.CE_PRIMARY_MASK_) >> CE_PRIMARY_SHIFT_;
return (ce & RuleBasedCollator.CE_PRIMARY_MASK_)
>>> RuleBasedCollator.CE_PRIMARY_SHIFT_;
}
/**
* Return the secondary strength of a collation element.
* Return the secondary order of a collation element ce.
* i.e. the 8 bits that follow the 16 bit primary order in the argument ce.
* @param ce the collation element
* @return the element's secondary strength
* @return the element's 8 bits secondary order
* @draft 2.2
*/
public final static short secondaryOrder(int ce)
public final static int secondaryOrder(int ce)
{
return (short)((ce & RuleBasedCollator.CE_SECONDARY_MASK_)
>> CE_SECONDARY_SHIFT_);
return (ce & RuleBasedCollator.CE_SECONDARY_MASK_)
>> RuleBasedCollator.CE_SECONDARY_SHIFT_;
}
/**
* Return the tertiary strength of a collation element.
* @param colelem the collation element
* @return the element's tertiary strength
* Return the tertiary order of a collation element ce. i.e. the last
* 8 bits in the argument ce.
* @param ce the collation element
* @return the element's 8 bits tertiary order
* @draft 2.2
*/
public final static short tertiaryOrder(int ce)
public final static int tertiaryOrder(int ce)
{
return (short)(ce & RuleBasedCollator.CE_TERTIARY_MASK_);
return ce & RuleBasedCollator.CE_TERTIARY_MASK_;
}
/**
* <p>Sets the iterator to point to the collation element corresponding to
* the specified character (the parameter is a CHARACTER offset in the
* original string, not an offset into its corresponding sequence of
* collation elements). The value returned by the next call to next()
* will be the collation element corresponding to the specified position
* in the text. If that position is in the middle of a contracting
* character sequence, the result of the next call to next() is the
* collation element for that sequence. This means that getOffset()
* is not guaranteed to return the same value as was passed to a preceding
* call to setOffset().</p>
* @param offset new character offset into the original text to set.
* <p>
* Sets the iterator to point to the collation element corresponding to
* the specified character argument offset. The value returned by the next
* call to next() will be the collation element corresponding to the
* characters at argument offset.
* </p>
* <p>
* If argument offset is in the middle of a contracting character sequence,
* the iterator is adjusted to the start of the contracting sequence. This
* means that getOffset() is not guaranteed to return the same value as
* the argument offset.
* </p>
* <p>
* If the decomposition mode is on and argument offset is in the middle of
* a decomposable range of source text, the iterator may not render a
* correct result for the next forwards or backwards iteration. The user
* has to ensure that the argument offset does not fall in the middle of a
* decomposable range in the source text.
* </p>
* @param offset character offset into the original source string to
* set. Note this argument is not an offset into the corresponding
* sequence of collation elements
* @draft 2.2
*/
public void setOffset(int offset)
@ -388,7 +466,7 @@ public final class CollationElementIterator
}
updateInternalState();
int prevoffset = 0;
while (m_source_.getIndex() < offset) {
while (m_source_.getIndex() <= offset) {
prevoffset = m_source_.getIndex();
next();
}
@ -399,59 +477,36 @@ public final class CollationElementIterator
}
/**
* <p>Set a new string over which to iterate.</p>
* <p>Iteration will start from the start of source.</p>
* @param source the new source text.
* <p>
* Sets a new source string for iteration and restarts the iteration
* from the beginning of the argument source.
* </p>
* @param source the new source string for iteration.
* @draft 2.2
*/
public synchronized void setText(String source)
public void setText(String source)
{
m_source_ = new StringCharacterIterator(source);
updateInternalState();
}
/**
* <p>Set a new string iterator over which to iterate.</p>
* <p>Iteration will start from the start of source.</p>
* @param source the new source text.
* <p>
* Sets a new source string iterator for iteration and restarts the
* iteration from the beginning of the argument source.
* </p>
* @param source the new source string iterator for iteration.
* @draft 2.2
*/
public synchronized void setText(CharacterIterator source)
public void setText(CharacterIterator source)
{
m_source_ = source;
m_source_.setIndex(0);
m_source_.setIndex(m_source_.getBeginIndex());
updateInternalState();
}
// public miscellaneous methods -----------------------------------------
// protected data members -----------------------------------------------
/**
* true if current codepoint was Hiragana
*/
protected boolean m_isCodePointHiragana_;
/**
* Position in the original string that starts with a non-FCD sequence
*/
protected int m_FCDStart_;
/**
* This is the CE from CEs buffer that should be returned.
* Initial value is 0.
* Forwards iteration will end with m_CEBufferOffset_ == m_CEBufferSize_,
* backwards will end with m_CEBufferOffset_ == 0.
* The next/previous after we reach the end/beginning of the m_CEBuffer_
* will cause this value to be reset to 0.
*/
protected int m_CEBufferOffset_;
/**
* This is the position to which we have stored processed CEs.
* Initial value is 0.
* The next/previous after we reach the end/beginning of the m_CEBuffer_
* will cause this value to be reset to 0.
*/
protected int m_CEBufferSize_;
// protected constructors -----------------------------------------------
/**
@ -493,29 +548,95 @@ public final class CollationElementIterator
updateInternalState();
}
// protected methods ----------------------------------------------------
// package private data members -----------------------------------------
/**
* true if current codepoint was Hiragana
*/
boolean m_isCodePointHiragana_;
/**
* Position in the original string that starts with a non-FCD sequence
*/
int m_FCDStart_;
/**
* This is the CE from CEs buffer that should be returned.
* Initial value is 0.
* Forwards iteration will end with m_CEBufferOffset_ == m_CEBufferSize_,
* backwards will end with m_CEBufferOffset_ == 0.
* The next/previous after we reach the end/beginning of the m_CEBuffer_
* will cause this value to be reset to 0.
*/
int m_CEBufferOffset_;
/**
* This is the position to which we have stored processed CEs.
* Initial value is 0.
* The next/previous after we reach the end/beginning of the m_CEBuffer_
* will cause this value to be reset to 0.
*/
int m_CEBufferSize_;
/**
* Checks if iterator is in the buffer zone
* @return true if iterator is in buffer zone, false otherwise
*/
protected boolean isInBuffer()
{
return m_bufferOffset_ != -1;
}
// package private methods ----------------------------------------------
/**
* Sets the collator used.
* Internal use, all data members will be reset to the default values
* @param collator to set
*/
protected void setCollator(RuleBasedCollator collator)
void setCollator(RuleBasedCollator collator)
{
m_collator_ = collator;
updateInternalState();
}
// private data members -------------------------------------------------
/**
* <p>Positions the iterator at the given character offset of the source
* text without any adjustment (the parameter is a CHARACTER offset in the
* original string, not an offset into its corresponding sequence of
* collation elements).</p>
* <p>Unlike the public method setOffset(int), this method does not try to
* readjust the offset to the start of a contracting sequence; it simply
* moves the source iterator to the argument offset and refreshes the
* internal state. The intent is that getOffset() then reports the same
* value that was passed to the preceding call to setExactOffset()
* (NOTE(review): getOffset() is not visible here - confirm).</p>
* @param offset new character offset into the original text to set.
* @draft 2.2
*/
void setExactOffset(int offset)
{
// move the underlying source iterator first ...
m_source_.setIndex(offset);
// ... then refresh the iterator's internal state for the new position
updateInternalState();
}
/**
 * Tells whether the iterator is currently reading from its internal
 * buffer rather than directly from the source text.
 * @return true if the buffer offset is set (i.e. is not the -1 sentinel),
 *         false otherwise
 */
boolean isInBuffer()
{
    if (m_bufferOffset_ == -1) {
        // -1 is the "no buffer in use" sentinel
        return false;
    }
    return true;
}
/**
 * Tests whether a character is a prevowel, i.e. a vowel that is written
 * before its base consonant but sorts after it.
 * <p>Two ranges are accepted: U+0E40..U+0E44 (Thai) and U+0EC0..U+0EC4
 * (the matching range in the Lao block).</p>
 * @param ch character to test
 * @return true if ch lies in either prevowel range, false otherwise
 */
static final boolean isThaiPreVowel(char ch)
{
    if (ch < 0xe40 || ch > 0xec4) {
        // outside the span covered by both ranges
        return false;
    }
    // inside the overall span: accept only the two ends, reject the gap
    return ch <= 0xe44 || ch >= 0xec0;
}
/**
 * Tests whether a character is a Thai base consonant, which sorts before
 * its prevowel.
 * <p>The accepted range is U+0E01..U+0E2E inclusive.</p>
 * @param ch character to test
 * @return true if ch is a Thai base consonant, false otherwise
 */
static final boolean isThaiBaseConsonant(char ch)
{
    boolean belowRange = ch < 0xe01;
    boolean aboveRange = ch > 0xe2e;
    return !(belowRange || aboveRange);
}
// private inner class --------------------------------------------------
@ -675,8 +796,6 @@ public final class CollationElementIterator
private static final int CE_LONG_PRIMARY_TAG_ = 12;
private static final int CE_CE_TAGS_COUNT = 13;
private static final int CE_BYTE_COMMON_ = 0x05;
private static final int CE_PRIMARY_SHIFT_ = 16;
private static final int CE_SECONDARY_SHIFT_ = 8;
// end special ce values and tags ---------------------------------------
@ -773,21 +892,19 @@ public final class CollationElementIterator
* Source offsets points to the current processing character.
* </p>
*/
private void normalize()
private void normalize()
{
/* synwee todo normalize to 1 before fcd
try {
decompose(m_buffer_, m_source_, m_FCDStart_, m_FCDLimit_,
m_collator_.m_decomposition_);
}
catch (ArrayOutOfBoundsException e) {
// increase the size of the buffer
m_buffer_ = new char[m_buffer_.length << 1];
decompose(m_buffer_, m_source_, m_FCDStart_, m_FCDLimit_,
m_collator_.m_decomposition_);
}
*/
m_bufferOffset_ = 0;
int size = m_FCDLimit_ - m_FCDStart_;
m_buffer_.delete(0, m_buffer_.length());
m_source_.setIndex(m_FCDStart_);
for (int i = 0; i < size; i ++) {
m_buffer_.append(m_source_.current());
m_source_.next();
}
String decomp = Normalizer.decompose(m_buffer_.toString(), false);
m_buffer_.delete(0, m_buffer_.length());
m_buffer_.append(decomp);
m_bufferOffset_ = 0;
}
/**
@ -811,24 +928,22 @@ public final class CollationElementIterator
{
boolean result = true;
// srcP = collationSource->pos-1;
// Get the trailing combining class of the current character.
// Get the trailing combining class of the current character.
// If it's zero, we are OK.
m_FCDStart_ = offset;
m_source_.setIndex(offset);
// trie access
char fcd = 0; // synwee todo: unorm_getFCD16(ch);
char fcd = NormalizerImpl.getFCD16(ch);
if (fcd != 0 && UTF16.isLeadSurrogate(ch)) {
ch = m_source_.next(); // CharacterIterator.DONE has 0 fcd
if (UTF16.isTrailSurrogate(ch)) {
fcd = 0xFFFF; // unorm_getFCD16FromSurrogatePair(fcd, ch);
fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, ch);
} else {
fcd = 0;
}
}
byte prevTrailCC = (byte)(fcd & LAST_BYTE_MASK_);
int prevTrailCC = fcd & LAST_BYTE_MASK_;
if (prevTrailCC != 0) {
// The current char has a non-zero trailing CC. Scan forward until
@ -839,16 +954,16 @@ public final class CollationElementIterator
break;
}
// trie access
fcd = 0; // unorm_getFCD16(ch);
fcd = NormalizerImpl.getFCD16(ch);
if (fcd != 0 && UTF16.isLeadSurrogate(ch)) {
ch = m_source_.next();
if (UTF16.isTrailSurrogate(ch)) {
fcd = 0xFFFF; // unorm_getFCD16FromSurrogatePair(fcd, ch);
fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, ch);
} else {
fcd = 0;
}
}
byte leadCC = (byte)(fcd >> SECOND_LAST_BYTE_SHIFT_);
int leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_;
if (leadCC == 0) {
// this is a base character, we stop the FCD checks
break;
@ -858,12 +973,12 @@ public final class CollationElementIterator
result = false;
}
prevTrailCC = (byte)(fcd & LAST_BYTE_MASK_);
prevTrailCC = fcd & LAST_BYTE_MASK_;
}
}
m_FCDLimit_ = m_source_.getIndex();
m_source_.setIndex(m_FCDStart_);
m_source_.next();
m_FCDLimit_ = m_source_.getIndex();
return result;
}
@ -885,8 +1000,7 @@ public final class CollationElementIterator
}
else {
// we are in the buffer, buffer offset will never be 0 here
result = m_buffer_.charAt(m_bufferOffset_ ++);
if (result == 0) {
if (m_bufferOffset_ >= m_buffer_.length()) {
// Null marked end of buffer, revert to the source string and
// loop back to top to try again to get a character.
m_source_.setIndex(m_FCDLimit_);
@ -894,10 +1008,10 @@ public final class CollationElementIterator
m_buffer_.delete(0, m_buffer_.length());
return nextChar();
}
return result;
return m_buffer_.charAt(m_bufferOffset_ ++);
}
if (m_collator_.m_decomposition_ == Collator.NO_DECOMPOSITION
if (m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
|| m_bufferOffset_ != -1 || m_FCDLimit_ > startoffset
// skip the fcd checks
|| result < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_
@ -934,20 +1048,10 @@ public final class CollationElementIterator
* the buffer.
* Source offsets points to the current processing character.</p>
*/
public void normalizeBackwards()
private void normalizeBackwards()
{
int start = m_FCDStart_;
int size = 0;
/* synwee todo normalize including fcd
try {
size = decompose(m_buffer_, m_source_, start, m_FCDLimit_);
}
catch (ArrayOutOfBoundsException .) {
m_buffer_ = new char[m_buffer_.length << 1];
size = decompose(m_buffer_, m_source_, start, m_FCDLimit);
}
*/
m_bufferOffset_ = size - 1;
normalize();
m_bufferOffset_ = m_buffer_.length();
}
/**
@ -972,18 +1076,20 @@ public final class CollationElementIterator
{
boolean result = true;
char fcd = 0;
m_FCDLimit_ = offset;
m_FCDLimit_ = offset + 1;
m_source_.setIndex(offset);
if (!UTF16.isSurrogate(ch)) {
fcd = 0; // synwee todo unorm_getFCD16(fcdTrieIndex, c);
fcd = NormalizerImpl.getFCD16(ch);
}
else if (UTF16.isTrailSurrogate(ch) && m_FCDLimit_ > 0) {
// note trail surrogate characters gets 0 fcd
char trailch = ch;
ch = m_source_.previous();
if (UTF16.isLeadSurrogate(ch)) {
fcd = 0; // unorm_getFCD16(fcdTrieIndex, c2);
fcd = NormalizerImpl.getFCD16(ch);
if (fcd != 0) {
fcd = 0; // unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd,
trailch);
}
}
else {
@ -991,44 +1097,47 @@ public final class CollationElementIterator
}
}
byte leadCC = (byte)(fcd >> SECOND_LAST_BYTE_SHIFT_);
if (leadCC != 0) {
// The current char has a non-zero leading combining class.
// Scan backward until we find a char with a trailing cc of zero.
while (true) {
if (m_source_.getIndex() == 0) {
break;
}
int leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_;
// The current char has a non-zero leading combining class.
// Scan backward until we find a char with a trailing cc of zero.
while (leadCC != 0) {
offset = m_source_.getIndex();
if (offset == 0) {
break;
}
ch = m_source_.previous();
if (!UTF16.isSurrogate(ch)) {
fcd = NormalizerImpl.getFCD16(ch);
}
else if (UTF16.isTrailSurrogate(ch) && m_source_.getIndex() > 0) {
char trail = ch;
ch = m_source_.previous();
if (!UTF16.isSurrogate(ch)) {
fcd = 0; //unorm_getFCD16(fcdTrieIndex, c);
}
else {
if (UTF16.isTrailSurrogate(ch) && m_source_.getIndex() > 0)
{
ch = m_source_.previous();
if (UTF16.isLeadSurrogate(ch)) {
fcd = 0; // unorm_getFCD16(fcdTrieIndex, c2);
}
if (fcd != 0) {
fcd = 0; // unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
}
} else {
fcd = 0; // unpaired surrogate
}
byte prevTrailCC = (byte)(fcd & LAST_BYTE_MASK_);
if (prevTrailCC == 0) {
break;
}
if (leadCC < prevTrailCC) {
result = false;
}
leadCC = (byte)(fcd >> SECOND_LAST_BYTE_SHIFT_);
}
}
if (UTF16.isLeadSurrogate(ch)) {
fcd = NormalizerImpl.getFCD16(ch);
}
if (fcd != 0) {
fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, trail);
}
}
else {
fcd = 0; // unpaired surrogate
}
int prevTrailCC = fcd & LAST_BYTE_MASK_;
if (leadCC < prevTrailCC) {
result = false;
}
leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_;
}
m_FCDStart_ = m_source_.getIndex(); // character with 0 lead/trail fcd
// storing character with 0 lead fcd or the 1st accent with a base
// character before it
if (fcd == 0) {
m_FCDStart_ = offset;
}
else {
m_FCDStart_ = m_source_.getIndex();
}
m_source_.setIndex(m_FCDLimit_);
return result;
}
@ -1062,7 +1171,7 @@ public final class CollationElementIterator
char result = m_source_.previous();
int startoffset = m_source_.getIndex();
if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_
|| m_collator_.m_decomposition_ == Collator.NO_DECOMPOSITION
|| m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
|| m_FCDStart_ <= startoffset || m_source_.getIndex() == 0) {
return result;
}
@ -1073,7 +1182,7 @@ public final class CollationElementIterator
return result;
}
// Need a more complete FCD check and possible normalization.
if (!FCDCheckBackwards(ch, startoffset)) {
if (!FCDCheckBackwards(result, startoffset)) {
normalizeBackwards();
m_bufferOffset_ --;
result = m_buffer_.charAt(m_bufferOffset_);
@ -1085,52 +1194,17 @@ public final class CollationElementIterator
* Determines if it is at the start of source iteration
* @return true if iterator at the start, false otherwise
*/
private boolean isBackwardsStart()
private final boolean isBackwardsStart()
{
return (m_bufferOffset_ < 0 && m_source_.getIndex() == 0)
|| (m_bufferOffset_ == 0 && m_FCDStart_ <= 0);
}
/**
 * Checks whether a character is a prevowel: a vowel that precedes its
 * base consonant in the text but sorts after it.
 * @param ch character to test
 * @return true if ch is in U+0E40..U+0E44 or U+0EC0..U+0EC4, false
 *         otherwise
 */
private boolean isThaiPreVowel(char ch)
{
    boolean inFirstRange = 0xe40 <= ch && ch <= 0xe44;
    boolean inSecondRange = 0xec0 <= ch && ch <= 0xec4;
    return inFirstRange || inSecondRange;
}
/**
 * Checks whether a character is a Thai base consonant, which sorts
 * before its prevowel.
 * @param ch character to test
 * @return true if ch is in U+0E01..U+0E2E, false otherwise
 */
private boolean isThaiBaseConsonant(char ch)
{
    if (ch < 0xe01) {
        return false;
    }
    return ch <= 0xe2e;
}
/**
 * Determine if a character is a conjoining Hangul Jamo.
 * <p>Three ranges are accepted: U+1100..U+1112 (leading consonants),
 * U+1161..U+1175 (vowels) and U+11A8..U+11C2 (trailing consonants).</p>
 * <p>Each range is tested with an explicit lower and upper bound. The
 * single-comparison form <code>ch - LOW &lt;= HIGH - LOW</code> only works
 * with unsigned arithmetic; in Java the char operand is promoted to a
 * signed int, so every character below LOW yields a negative difference
 * and would wrongly be accepted.</p>
 * @param ch character to test
 * @return true if ch is a Jamo, false otherwise
 */
private boolean isJamo(char ch)
{
    return (ch >= 0x1100 && ch <= 0x1112)
        || (ch >= 0x1161 && ch <= 0x1175)
        || (ch >= 0x11A8 && ch <= 0x11C2);
}
/**
* Checks if iterator is at the end of its source string.
* @return true if it is at the end, false otherwise
*/
private boolean isEnd()
private final boolean isEnd()
{
if (m_bufferOffset_ >= 0) {
if (m_bufferOffset_ != m_buffer_.length()) {
@ -1155,7 +1229,8 @@ public final class CollationElementIterator
* @param trail character
* @return next CE for the surrogate characters
*/
private int nextSurrogate(RuleBasedCollator collator, int ce, char trail)
private final int nextSurrogate(RuleBasedCollator collator, int ce,
char trail)
{
if (!UTF16.isTrailSurrogate(trail)) {
updateInternalState(m_backup_);
@ -1188,7 +1263,7 @@ public final class CollationElementIterator
* @param ch current character
* @return next CE for Thai characters
*/
private int nextThai(RuleBasedCollator collator, int ce, char ch)
private int nextThai(RuleBasedCollator collator, int ce, char ch)
{
if (m_bufferOffset_ != -1 // already swapped
|| isEnd() || !isThaiBaseConsonant(m_source_.current())) {
@ -1430,6 +1505,7 @@ public final class CollationElementIterator
* @param collator collator to use
* @param ce current ce
* @param entrybackup entry backup iterator status
* @return ce of the next contraction
*/
private int nextContraction(RuleBasedCollator collator, int ce)
{
@ -1895,7 +1971,7 @@ public final class CollationElementIterator
return collator.m_contractionCE_[entryoffset];
}
StringBuffer buffer = new StringBuffer();
while (collator.isUnsafe(ch)) {
while (collator.isUnsafe(ch) || isThaiBaseConsonant(ch)) {
buffer.insert(0, ch);
ch = previousChar();
if (isBackwardsStart()) {

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CollationKey.java,v $
* $Date: 2002/05/16 20:04:49 $
* $Revision: 1.5 $
* $Date: 2002/06/21 23:56:44 $
* $Revision: 1.6 $
*
*******************************************************************************
*/
@ -15,28 +15,42 @@ package com.ibm.icu.text;
import java.util.Arrays;
/**
* <p>A <code>CollationKey</code> represents a <code>String</code> under the
* <p>
* A <code>CollationKey</code> represents a <code>String</code> under the
* rules of a specific <code>Collator</code> object. Comparing two
* <code>CollationKey</code>s returns the relative order of the
* <code>String</code>s they represent. Using <code>CollationKey</code>s to
* compare <code>String</code>s is generally faster than using
* <code>Collator.compare</code>. Thus, when the <code>String</code>s must be
* compared multiple times, for example when sorting a list of
* <code>String</code>s. It's more efficient to use <code>CollationKey</code>s.
* <code>String</code>s they represent.
* </p>
* <p>
* <code>CollationKey</code> instances cannot be created directly. Rather,
* they are generated by calling <code>Collator.getCollationKey(String)</code>.
* Since the rule set of each <code>Collator</code> differs, the sort order of
* the same string under two different <code>Collator</code>s may not be the same.
* Hence comparing <code>CollationKey</code>s generated from different
* <code>Collator</code> objects may not give the right results.
* </p>
* <p>
* Similar to <code>CollationKey.compareTo(CollationKey)</code>,
* the method <code>RuleBasedCollator.compare(String, String)</code> compares
* two strings and returns the relative order. During the construction
* of a <code>CollationKey</code> object, the entire source string is examined
* and processed into a series of bits that are stored in the
* <code>CollationKey</code> object. Bitwise comparison of the bit sequences
* is then performed during <code>CollationKey.compareTo(CollationKey)</code>.
* This approach can incur an expensive startup cost while creating
* the <code>CollationKey</code> objects, but once the objects are created,
* binary comparisons are fast. It is recommended when the same strings are
* to be compared over and over again.
* On the other hand, <code>Collator.compare(String, String)</code> examines
* and processes the strings only up to the first characters that differ in
* order, and is recommended if the <code>String</code>s are to be compared
* only once.
* </p>
* <p>
* Details of the composition of the bit sequence are located in the
* <a href="http://oss.software.ibm.com/icu/userguide/Collate_ServiceArchitecture.html">
* user guide</a>.
* </p>
* <p>You can not create <code>CollationKey</code>s directly. Rather, generate
* them by calling <code>Collator.getCollationKey(String)</code>. You can only
* compare <code>CollationKey</code>s generated from the same
* <code>Collator</code> object.</p>
* <p>Generating a <code>CollationKey</code> for a <code>String</code>
* involves examining the entire <code>String</code> and converting it to
* series of bits that can be compared bitwise. This allows fast comparisons
* once the keys are generated. The cost of generating keys is recouped in
* faster comparisons when <code>String</code>s need to be compared many
* times. On the other hand, the result of a comparison is often determined by
* the first couple of characters of each <code>String</code>.
* <code>Collator.compare(String, String)</code> examines only as many characters as it needs
* which allows it to be faster when doing single comparisons.</p>
* <p>The following example shows how <code>CollationKey</code>s might be used
* to sort a list of <code>String</code>s.</p>
* <blockquote>
@ -63,7 +77,7 @@ import java.util.Arrays;
* System.out.println( keys[2].getSourceString() );
* </pre>
* </blockquote>
*
* </p>
* @see Collator
* @see RuleBasedCollator
* @author Syn Wee Quek
@ -77,7 +91,7 @@ public final class CollationKey implements Comparable
// public getters -------------------------------------------------------
/**
* Returns the String that this CollationKey represents.
* Returns the source string that this CollationKey represents.
* @return source string that this CollationKey represents
* @draft 2.2
*/
@ -87,11 +101,44 @@ public final class CollationKey implements Comparable
}
/**
* <p>Duplicates and returns the value of this CollationKey as a sequence
* of big-endian bytes.</p>
* <p>If two CollationKeys could be legitimately compared, then one could
* compare the byte arrays of each to obtain the same result.</p>
* @return CollationKey value in a sequence of big-endian byte bytes.
* <p>
* Duplicates and returns the value of this CollationKey as a sequence
* of big-endian bytes terminated by a null.
* </p>
* <p>
* If two CollationKeys could be legitimately compared, then one could
* compare the byte arrays of each to obtain the same result.
* <pre>
* byte key1[] = collationkey1.toByteArray();
* byte key2[] = collationkey2.toByteArray();
* int i = 0;
* while (key1[i] != 0 && key2[i] != 0) {
* int key = key1[i] & 0xFF;
* int targetkey = key2[i] & 0xFF;
* if (key &lt; targetkey) {
* System.out.println("String 1 is less than string 2");
* return;
* }
* if (targetkey &lt; key) {
* System.out.println("String 1 is more than string 2");
* return;
* }
* i ++;
* }
* int key = key1[i] & 0xFF;
* int targetkey = key2[i] & 0xFF;
* if (key &lt; targetkey) {
* System.out.println("String 1 is less than string 2");
* return;
* }
* if (targetkey &lt; key) {
* System.out.println("String 1 is more than string 2");
* return;
* }
* System.out.println("String 1 is equal to string 2");
* </pre>
* </p>
* @return CollationKey value in a sequence of big-endian bytes
* terminated by a null.
* @draft 2.2
*/
public byte[] toByteArray()
@ -112,15 +159,22 @@ public final class CollationKey implements Comparable
// public other methods -------------------------------------------------
/**
* <p>Compare this CollationKey to the target CollationKey. The collation
* rules of the Collator object which created these keys are applied.</p>
* <p><strong>Note:</strong> CollationKeys created by different Collators
* can not be compared.</p>
* <p>
* Compare this CollationKey to the argument target CollationKey.
* The collation
* rules of the Collator object which created these keys are applied.
* </p>
* <p>
* <strong>Note:</strong> Comparison between CollationKeys created by
* different Collators may not return the correct result. See class
* documentation.
* </p>
* @param target target CollationKey
* @return an integer value. The value is less than zero if this
* CollationKey is less than target, zero if they are equal,
* and greater than zero if this CollationKey is greater
* than target.
* @exception NullPointerException thrown when argument is null.
* @see Collator#compare(String, String)
* @draft 2.2
*/
@ -151,13 +205,21 @@ public final class CollationKey implements Comparable
}
/**
* <p>Compares this CollationKey with the specified Object.</p>
* <p>
* Compares this CollationKey with the specified Object.
* The collation
* rules of the Collator object which created these objects are applied.
* </p>
* <p>
* See note in compareTo(CollationKey) for warnings of incorrect results
* </p>
* @param obj the Object to be compared.
* @return Returns a negative integer, zero, or a positive integer
* respectively if this CollationKey is less than, equal to, or
* greater than the given Object.
* @exception ClassCastException thrown when the specified Object is not a
* CollationKey.
* @exception ClassCastException thrown when the specified argument is not
* a CollationKey.
* @exception NullPointerException thrown when argument is null.
* @see #compareTo(CollationKey)
* @draft 2.2
*/
@ -167,22 +229,52 @@ public final class CollationKey implements Comparable
}
/**
* <p>Compare this CollationKey and the target CollationKey for equality.
* <p>
* Compare this CollationKey and the argument target object for equality.
* The collation
* rules of the Collator object which created these objects are applied.
* </p>
* <p>The collation rules of the Collator object which created these keys
* are applied.</p>
* <p><strong>Note:</strong> CollationKeys created by different Collators
* can not be compared.</p>
* @param target the CollationKey to compare to.
* <p>
* See note in compareTo(CollationKey) for warnings of incorrect results
* </p>
* @param target the object to compare to.
* @return true if the argument is a CollationKey equal to this one, false
* otherwise (including when the argument is null or is not a
* CollationKey; no exception is thrown in either case).
* @see #compareTo(CollationKey)
* @draft 2.2
*/
public boolean equals(Object target)
{
    // instanceof is false for null, so a null or foreign-typed argument
    // short-circuits to false; only a real CollationKey is handed to the
    // typed equals(CollationKey) overload.
    return target instanceof CollationKey
           && equals((CollationKey)target);
}
/**
* <p>
* Compare this CollationKey and the argument target CollationKey for
* equality.
* The collation
* rules of the Collator object which created these objects are applied.
* </p>
* <p>
* See note in compareTo(CollationKey) for warnings of incorrect results
* </p>
* @param target the CollationKey to compare to.
* @return true if two objects are equal, false otherwise (a null argument
* yields false rather than an exception).
* @draft 2.2
*/
public boolean equals(CollationKey target)
{
if (this == target) {
return true;
}
if (target == null || !(target instanceof CollationKey)) {
if (target == null) {
return false;
}
CollationKey other = (CollationKey)target;
@ -200,12 +292,13 @@ public final class CollationKey implements Comparable
}
/**
* <p>Creates a hash code for this CollationKey. The hash value is
* calculated on the key itself, not the String from which the key was
* created. Thus if x and y are CollationKeys, then
* x.hashCode(x) == y.hashCode() if x.equals(y) is true. This allows
* language-sensitive comparison in a hash table.</p>
* <p>See the CollatinKey class description for an example.</p>
* <p>
* Creates a hash code for this CollationKey. The hash value is calculated
* on the key itself, not the String from which the key was created. Thus
* if x and y are CollationKeys, then x.hashCode() == y.hashCode()
* if x.equals(y) is true. This allows language-sensitive comparison in a
* hash table.
* </p>
* @return the hash value.
* @draft 2.2
*/

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Collator.java,v $
* $Date: 2002/05/20 23:43:01 $
* $Revision: 1.6 $
* $Date: 2002/06/21 23:56:44 $
* $Revision: 1.7 $
*
*******************************************************************************
*/
@ -15,57 +15,103 @@ package com.ibm.icu.text;
import java.util.Locale;
/**
* <p>The Collator class performs locale-sensitive String comparison.
* You use this class to build searching and sorting routines for natural
* language text.</p>
* <p>Collator is an abstract base class. Subclasses implement specific
* collation strategies. One subclass, RuleBasedCollator, is currently
* provided and is applicable to a wide set of languages. Other subclasses
* may be created to handle more specialized needs.</p>
* <p>Like other locale-sensitive classes, you can use the static factory
* method, getInstance, to obtain the appropriate Collator object for a given
* locale. You will only need to look at the subclasses of Collator if you need
* to understand the details of a particular collation strategy or if you need
* to modify that strategy. </p>
* <p>The following example shows how to compare two strings using the Collator
* for the default locale.
* <p>
* Collator is an abstract base class; its subclasses perform
* locale-sensitive String comparison. A concrete subclass, RuleBasedCollator,
* is provided and it allows customization of the collation ordering by the use
* of rule sets.
* </p>
* <p>
* Following the
* <a href=http://www.unicode.org>Unicode Consortium</a>'s specifications for
* the <a href=http://www.unicode.org/unicode/reports/tr10/>
* Unicode Collation Algorithm (UCA)</a>, there are
* 5 different levels of strength used in comparisons.
* <ul>
* <li>PRIMARY strength: Typically, this is used to denote differences between
* base characters (for example, "a" &lt; "b").
* It is the strongest difference. For example, dictionaries are divided
* into different sections by base character.
* <li>SECONDARY strength: Accents in the characters are considered secondary
* differences (for example, "as" &lt; "&agrave;s" &lt; "at"). Other
* differences
* between letters can also be considered secondary differences, depending
* on the language. A secondary difference is ignored when there is a
* primary difference anywhere in the strings.
* <li>TERTIARY strength: Upper and lower case differences in characters are
* distinguished at tertiary strength (for example, "ao" &lt; "Ao" &lt;
* "a&ograve;"). In addition, a variant of a letter differs from the base
* form on the tertiary strength (such as "A" and "&#9398;"). Another
* example is the
* difference between large and small Kana. A tertiary difference is ignored
* when there is a primary or secondary difference anywhere in the strings.
* <li>QUATERNARY strength: When punctuation is ignored
* <a href=http://www-124.ibm.com/icu/userguide/Collate_Concepts.html#Ignoring_Punctuation>
* (see Ignoring Punctuations in the user guide)</a> at PRIMARY to TERTIARY
* strength, an additional strength level can
* be used to distinguish words with and without punctuation (for example,
* "ab" &lt; "a-b" &lt; "aB").
* This difference is ignored when there is a PRIMARY, SECONDARY or TERTIARY
* difference. The QUATERNARY strength should only be used if ignoring
* punctuation is required.
* <li>IDENTICAL strength:
* When all other strengths are equal, the IDENTICAL strength is used as a
* tiebreaker. The Unicode code point values of the NFD form of each string
* are compared when no difference is found at the other levels.
* For example, Hebrew cantillation marks are only distinguished at this
* strength. This strength should be used sparingly, as strings that differ
* only in their code point values are an extremely rare occurrence.
* Using this strength substantially decreases the performance for both
* comparison and collation key generation APIs. This strength also
* increases the size of the collation key.
* </ul>
* Unlike the JDK, ICU4J's Collator deals only with 2 decomposition modes,
* the canonical decomposition mode and one that does not use any decomposition.
* The compatibility decomposition mode, java.text.Collator.FULL_DECOMPOSITION
* is not supported here. If the canonical
* decomposition mode is set, the Collator handles un-normalized text properly,
* producing the same results as if the text were normalized in NFD. If
* canonical decomposition is turned off, it is the user's responsibility to
* ensure that all text is already in the appropriate form before performing
* a comparison or before getting a CollationKey.
* </p>
* <p>
* For more information about the collation service see the
* <a href="http://oss.software.ibm.com/icu/userguide/Collate_Intro.html">users
* guide</a>.
* </p>
* <p>
* Examples of use
* <pre>
* // Compare two strings in the default locale
* Collator myCollator = Collator.getInstance();
* if (myCollator.compare("abc", "ABC") < 0) {
* System.out.println("abc is less than ABC");
* }
* else {
* System.out.println("abc is greater than or equal to ABC");
* }
* </pre>
* <p>You can set a <code>Collator</code>'s <em>strength</em> property to
* determine the level of difference considered significant in comparisons.
* Four strengths are provided: <code>PRIMARY</code>, <code>SECONDARY</code>,
* <code>TERTIARY</code>, and <code>IDENTICAL</code>. The exact assignment of
* strengths to language features is locale dependant. For example, in Czech,
* "e" and "f" are considered primary differences, while "e" and "\u00EA" are
* secondary differences, "e" and "E" are tertiary differences and "e" and "e"
* are identical. The following shows how both case and accents could be
* ignored for US English.</p>
* <pre>
* //Get the Collator for US English and set its strength to PRIMARY
* // Get the Collator for US English and set its strength to PRIMARY
* Collator usCollator = Collator.getInstance(Locale.US);
* usCollator.setStrength(Collator.PRIMARY);
* if (usCollator.compare("abc", "ABC") == 0) {
* System.out.println("Strings are equivalent");
* }
*
* The following example shows how to compare two strings using the Collator
* for the default locale.
* // Compare two strings in the default locale
* Collator myCollator = Collator.getInstance();
* myCollator.setDecomposition(NO_DECOMPOSITION);
* if (myCollator.compare("&agrave;&#92;u0325", "a&#92;u0325&#768;") != 0) {
* System.out.println("&agrave;&#92;u0325 is not equals to a&#92;u0325&#768; without decomposition");
* myCollator.setDecomposition(CANONICAL_DECOMPOSITION);
* if (myCollator.compare("&agrave;&#92;u0325", "a&#92;u0325&#768;") != 0) {
* System.out.println("Error: &agrave;&#92;u0325 should be equals to a&#92;u0325&#768; with decomposition");
* }
* else {
* System.out.println("&agrave;&#92;u0325 is equals to a&#92;u0325&#768; with decomposition");
* }
* }
* else {
* System.out.println("Error: &agrave;&#92;u0325 should be not equals to a&#92;u0325&#768; without decomposition");
* }
* </pre>
* <p>For comparing Strings exactly once, the compare method provides the best
* performance. When sorting a list of Strings however, it is generally
* necessary to compare each String multiple times. In this case,
* CollationKeys provide better performance. The CollationKey class converts a
* String to a series of bits that can be compared bitwise against other
* CollationKeys. A CollationKey is created by a Collator object for a given
* String.</p>
* <p>Note: CollationKeys from different Collators can not be compared. See the
* class description for CollationKey for an example using CollationKeys.
* </p>
* @see RuleBasedCollator
* @see CollationKey
* @author Syn Wee Quek
* @since release 2.2, April 18 2002
* @draft 2.2
@ -76,92 +122,92 @@ public abstract class Collator
// public data members ---------------------------------------------------
/**
* Collator strength value. When set, only PRIMARY differences are
* considered significant during comparison. The assignment of strengths
* to language features is locale dependant. A common example is for
* different base letters ("a" vs "b") to be considered a PRIMARY
* difference.
* Strongest collator strength value. Typically, used to denote differences
* between base characters.
* See class documentation for more explanation.
* @see #setStrength
* @see #getStrength
* @draft 2.2
*/
public final static int PRIMARY
= RuleBasedCollator.AttributeValue.PRIMARY_;
public final static int PRIMARY = 0;
/**
* Collator strength value. When set, only SECONDARY and above
* differences are considered significant during comparison. The
* assignment of strengths to language features is locale dependant. A
* common example is for different accented forms of the same base letter
* ("a" vs "\u00E4") to be considered a SECONDARY difference.
* Second level collator strength value.
* Accents in the characters are considered secondary differences.
* Other differences between letters can also be considered secondary
* differences, depending on the language.
* See class documentation for more explanation.
* @see #setStrength
* @see #getStrength
* @draft 2.2
*/
public final static int SECONDARY
= RuleBasedCollator.AttributeValue.SECONDARY_;
public final static int SECONDARY = 1;
/**
* Collator strength value. When set, only TERTIARY and above differences
* are considered significant during comparison. The assignment of
* strengths to language features is locale dependant. A common example is
* for case differences ("a" vs "A") to be considered a TERTIARY
* difference.
* Third level collator strength value.
* Upper and lower case differences in characters are distinguished at this
* strength level. In addition, a variant of a letter differs from the base
* form on the tertiary level.
* See class documentation for more explanation.
* @see #setStrength
* @see #getStrength
* @draft 2.2
*/
public final static int TERTIARY
= RuleBasedCollator.AttributeValue.TERTIARY_;
public final static int TERTIARY = 2;
/**
* Collator strength value. When set, only QUARTENARY and above differences
* are considered significant during comparison. The assignment of
* strengths to language features is locale dependant.
* difference.
* Fourth level collator strength value.
* When punctuation is ignored
* <a href=http://www-124.ibm.com/icu/userguide/Collate_Concepts.html#Ignoring_Punctuation>
* (see Ignoring Punctuations in the user guide)</a> at PRIMARY to TERTIARY
* strength, an additional strength level can
* be used to distinguish words with and without punctuation
* See class documentation for more explanation.
* @see #setStrength
* @see #getStrength
* @draft 2.2
*/
public final static int QUATERNARY
= RuleBasedCollator.AttributeValue.QUATERNARY_;
public final static int QUATERNARY = 3;
/**
* <p>Collator strength value. When set, all differences are considered
* significant during comparison. The assignment of strengths to language
* features is locale dependant. A common example is for control
* characters ("&#092;u0001" vs "&#092;u0002") to be considered equal at
* the PRIMARY, SECONDARY, and TERTIARY levels but different at the
* IDENTICAL level. Additionally, differences between pre-composed
* accents such as "&#092;u00C0" (A-grave) and combining accents such as
* "A&#092;u0300" (A, combining-grave) will be considered significant at
* the tertiary level if decomposition is set to NO_DECOMPOSITION.
* <p>
* Smallest Collator strength value. When all other strengths are equal,
* the IDENTICAL strength is used as a tiebreaker. The Unicode code point
* values of the NFD form of each string are compared, just in case there
* is no difference.
* See class documentation for more explanation.
* </p>
* <p>
* Note this value is different from JDK's
* </p>
* <p>Note this value is different from JDK's</p>
* @draft 2.2
*/
public final static int IDENTICAL
= RuleBasedCollator.AttributeValue.IDENTICAL_;
public final static int IDENTICAL = 15;
/**
* <p>Decomposition mode value. With NO_DECOMPOSITION set, accented
* characters will not be decomposed for collation. This is the default
* setting and provides the fastest collation but will only produce
* correct results for languages that do not use accents.</p>
* <p>Note this value is different from JDK's</p>
* <p>
* Decomposition mode value. With NO_DECOMPOSITION set, Strings will not be
* decomposed for collation. This is the default
* decomposition setting unless otherwise specified by the locale used
* to create the Collator.
* </p>
* <p>
* Note this value is different from JDK's
* </p>
* @see #CANONICAL_DECOMPOSITION
* @see #getDecomposition
* @see #setDecomposition
* @draft 2.2
*/
public final static int NO_DECOMPOSITION
= RuleBasedCollator.AttributeValue.OFF_;
public final static int NO_DECOMPOSITION = 16;
/**
* <p>Decomposition mode value. With CANONICAL_DECOMPOSITION set,
* <p>
* Decomposition mode value. With CANONICAL_DECOMPOSITION set,
* characters that are canonical variants according to Unicode 2.0 will be
* decomposed for collation. This should be used to get correct collation
* of accented characters.</p>
* <p>CANONICAL_DECOMPOSITION corresponds to Normalization Form D as
* decomposed for collation.
* </p>
* <p>
* CANONICAL_DECOMPOSITION corresponds to Normalization Form D as
* described in <a href="http://www.unicode.org/unicode/reports/tr15/">
* Unicode Technical Report #15</a>.</p>
* Unicode Technical Report #15</a>.
* </p>
* @see #NO_DECOMPOSITION
* @see #getDecomposition
* @see #setDecomposition
* @draft 2.2
@ -173,9 +219,15 @@ public abstract class Collator
// public setters --------------------------------------------------------
/**
* <p>Sets this Collator's strength property. The strength property
* <p>
* Sets this Collator's strength property. The strength property
* determines the minimum level of difference considered significant
* during comparison.</p>
* during comparison.
* </p>
* <p>
* The default strength for the Collator is TERTIARY, unless specified
* otherwise by the locale used to create the Collator.
* </p>
* <p>See the Collator class description for an example of use.</p>
* @param newStrength the new strength value.
* @see #getStrength
@ -185,10 +237,11 @@ public abstract class Collator
* @see #QUATERNARY
* @see #IDENTICAL
* @exception IllegalArgumentException If the new strength value is not one
* of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL.
* of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL.
* @draft 2.2
*/
public void setStrength(int newStrength) {
public void setStrength(int newStrength)
{
if ((newStrength != PRIMARY) &&
(newStrength != SECONDARY) &&
(newStrength != TERTIARY) &&
@ -200,18 +253,38 @@ public abstract class Collator
}
/**
* Set the decomposition mode of this Collator. See getDecomposition
* for a description of decomposition mode.
* <p>
* Set the decomposition mode of this Collator.
* Setting this decomposition property with CANONICAL_DECOMPOSITION allows
* the Collator to handle
* un-normalized text properly, producing the same results as if the text
* were normalized. If NO_DECOMPOSITION is set, it is the user's
* responsibility to ensure that all text is already in the appropriate
* form before a comparison or before getting a CollationKey. Adjusting
* decomposition mode allows the user to select between faster and more
* complete collation behavior.
* </p>
* <p>
* Since the great majority of the world's languages do not require text
* normalization, most locales have NO_DECOMPOSITION as the default
* decomposition mode.
* </p>
* <p>
* The default decomposition mode for the Collator is NO_DECOMPOSITION,
* unless specified otherwise by the locale used to create the Collator.
* </p>
* <p>
* See getDecomposition for a description of decomposition mode.
* </p>
* @param decomposition the new decomposition mode
* @see #getDecomposition
* @see #NO_DECOMPOSITION
* @see #CANONICAL_DECOMPOSITION
* @see #FULL_DECOMPOSITION
* @exception IllegalArgumentException If the given value is not a valid decomposition
* mode.
* @exception IllegalArgumentException If the given value is not a valid
* decomposition mode.
* @draft 2.2
*/
public void setDecomposition(int decomposition) {
public void setDecomposition(int decomposition)
{
if ((decomposition != NO_DECOMPOSITION) &&
(decomposition != CANONICAL_DECOMPOSITION)) {
throw new IllegalArgumentException("Wrong decomposition mode.");
@ -225,9 +298,11 @@ public abstract class Collator
* Gets the Collator for the current default locale.
* The default locale is determined by java.util.Locale.getDefault().
* @return the Collator for the default locale (for example, en_US) if it
* is created successfully, otherwise if there is a failure,
* null will be returned.
* is created successfully. Otherwise if there is no Collator
* associated with the current locale, the default UCA collator
* will be returned.
* @see java.util.Locale#getDefault
* @see #getInstance(Locale)
* @draft 2.2
*/
public static final Collator getInstance()
@ -238,11 +313,13 @@ public abstract class Collator
/**
* Gets the Collator for the desired locale.
* @param locale the desired locale.
* @return Collator for the desired locale if it is created successfully,
* otherwise if there is a failure, the default UCA collator will
* be returned.
* @return Collator for the desired locale if it is created successfully.
* Otherwise if there is no Collator
* associated with the current locale, the default UCA collator
* will be returned.
* @see java.util.Locale
* @see java.util.ResourceBundle
* @see #getInstance()
* @draft 2.2
*/
public static final Collator getInstance(Locale locale)
@ -256,15 +333,19 @@ public abstract class Collator
}
/**
* <p>Returns this Collator's strength property. The strength property
* determines the minimum level of difference considered significant
* during comparison.</p>
* <p>See the Collator class description for an example of use.</p>
* <p>
* Returns this Collator's strength property. The strength property
* determines the minimum level of difference considered significant.
* </p>
* <p>
* See the Collator class description for more details.
* </p>
* @return this Collator's current strength property.
* @see #setStrength
* @see #PRIMARY
* @see #SECONDARY
* @see #TERTIARY
* @see #QUATERNARY
* @see #IDENTICAL
* @draft 2.2
*/
@ -274,24 +355,17 @@ public abstract class Collator
}
/**
* <p>Get the decomposition mode of this Collator. Decomposition mode
* determines how Unicode composed characters are handled. Adjusting
* decomposition mode allows the user to select between faster and more
* complete collation behavior.
* <p>The three values for decomposition mode are:
* <UL>
* <LI>NO_DECOMPOSITION,
* <LI>CANONICAL_DECOMPOSITION
* <LI>FULL_DECOMPOSITION.
* </UL>
* See the documentation for these three constants for a description
* of their meaning.
* <p>
* Get the decomposition mode of this Collator. Decomposition mode
* determines how Unicode composed characters are handled.
* </p>
* <p>
* See the Collator class description for more details.
* </p>
* @return the decomposition mode
* @see #setDecomposition
* @see #NO_DECOMPOSITION
* @see #CANONICAL_DECOMPOSITION
* @see #FULL_DECOMPOSITION
* @draft 2.2
*/
public int getDecomposition()
@ -302,91 +376,68 @@ public abstract class Collator
// public other methods -------------------------------------------------
/**
* Convenience method for comparing the equality of two strings based on
* this Collator's collation rules.
* Convenience method for comparing the equality of two text Strings based
* on this Collator's collation rules, strength and decomposition mode.
* @param source the source string to be compared with.
* @param target the target string to be compared with.
* @return true if the strings are equal according to the collation
* rules. false, otherwise.
* @see #compare
* @exception NullPointerException thrown if either argument is null.
* @draft 2.2
*/
public boolean equals(String source, String target)
public boolean equals(String source, String target)
{
    // Two strings are considered equal exactly when this Collator's
    // compare() reports no ordering difference between them.
    final int order = compare(source, target);
    return order == 0;
}
/**
* Cloning this Collator.
* @return a cloned Collator of this object
* @draft 2.2
*/
public Object clone()
{
    Object copy;
    try {
        // Delegate to the superclass clone().
        copy = super.clone();
    } catch (CloneNotSupportedException e) {
        // The superclass refused to clone; surface this as an
        // InternalError rather than a checked exception.
        throw new InternalError();
    }
    return (Collator)copy;
}
/**
 * Compares the equality of two Collators.
 * Two Collators are equal when they are the same object, or when they
 * have exactly the same runtime class and the same strength and
 * decomposition mode.
 * @param that the Collator to be compared with this.
 * @return true if this Collator is the same as that Collator;
 *         false otherwise.
 * @draft 2.2
 */
public boolean equals(Object that)
{
    if (that == this) {
        return true;
    }
    if (that != null && that.getClass() == getClass()) {
        // same runtime class, so the cast is safe
        Collator rhs = (Collator) that;
        boolean sameStrength = m_strength_ == rhs.m_strength_;
        return sameStrength && m_decomposition_ == rhs.m_decomposition_;
    }
    return false;
}
public abstract boolean equals(Object that);
// public abstract methods -----------------------------------------------
/**
* Generates the hash code for this Collator.
* Generates a hash code for this Collator. Equal Collators return the same
* hash code; distinct Collators are not guaranteed distinct hash codes.
* @draft 2.2
* @return 32 bit hash code
*/
public abstract int hashCode();
/**
* <p>Compares the source string to the target string according to the
* collation rules for this Collator. Returns an integer less than, equal
* to or greater than zero depending on whether the source String is less
* than, equal to or greater than the target string. See the Collator
* class description for an example of use.</p>
* <p>For a one time comparison, this method has the best performance. If
* a given String will be involved in multiple comparisons,
* CollationKey.compareTo() has the best performance. See the Collator
* class description for an example using CollationKeys.</p>
* @param source the source string.
* @param target the target string.
* <p>
* Compares the source text String to the target text String according to
* the collation rules, strength and decomposition mode for this Collator.
* Returns an integer less than,
* equal to or greater than zero depending on whether the source String is
* less than, equal to or greater than the target String. See the Collator
* class description for an example of use.
* </p>
* @param source the source String.
* @param target the target String.
* @return Returns an integer value. Value is less than zero if source is
* less than target, value is zero if source and target are equal,
* value is greater than zero if source is greater than target.
* @see CollationKey
* @see #getCollationKey
* @exception NullPointerException thrown if either argument is null.
* @draft 2.2
*/
public abstract int compare(String source, String target);
/**
* <p>Transforms the String into a series of bits that can be compared
* bitwise to other CollationKeys. CollationKeys provide better
* performance than Collator.compare() when Strings are involved in
* multiple comparisons.</p>
* <p>See the Collator class description for an example using
* CollationKeys.</p>
* @param source the string to be transformed into a collation key.
* <p>
* Transforms the String into a series of bits that can be compared
* bitwise to other CollationKeys. The bits generated depend on the collation
* rules, strength and decomposition mode.
* </p>
* <p>See the CollationKey class documentation for more information.</p>
* @param source the string to be transformed into a CollationKey.
* @return the CollationKey for the given String based on this Collator's
* collation rules. If the source String is null, a null
* CollationKey is returned.
@ -396,35 +447,18 @@ public abstract class Collator
*/
public abstract CollationKey getCollationKey(String source);
// protected data members ------------------------------------------------
// protected constructor -------------------------------------------------
// private data members --------------------------------------------------
/**
* Collation strength
*/
protected int m_strength_;
private int m_strength_ = TERTIARY;
/**
* Decomposition mode
*/
protected int m_decomposition_;
// protected constructor -------------------------------------------------
/**
 * <p>Protected constructor for use by subclasses; it initializes the
 * strength to TERTIARY and the decomposition mode to
 * CANONICAL_DECOMPOSITION.
 * Public access to creating Collators is handled by the API getInstance().
 * </p>
 * @draft 2.2
 */
protected Collator() throws Exception
{
    // defaults for a freshly constructed Collator
    this.m_decomposition_ = CANONICAL_DECOMPOSITION;
    this.m_strength_ = TERTIARY;
}
// protected methods -----------------------------------------------------
// private variables -----------------------------------------------------
// private methods -------------------------------------------------------
private int m_decomposition_ = CANONICAL_DECOMPOSITION;
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CollatorReader.java,v $
* $Date: 2002/05/16 20:04:49 $
* $Revision: 1.2 $
* $Date: 2002/06/21 23:56:47 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -140,26 +140,28 @@ final class CollatorReader
* @exception IOException thrown when there's a data error.
* @draft 2.2
*/
public void readOptions(RuleBasedCollator rbc) throws IOException
protected void readOptions(RuleBasedCollator rbc) throws IOException
{
// Reads the collator options from m_dataInputStream_ as consecutive ints
// and stores them on rbc. The first int is the variable top value.
rbc.m_variableTopValue_ = m_dataInputStream_.readInt();
// NOTE(review): the setAttributeDefault(...) calls below and the
// rbc.m_default*_ assignments that follow them each read one int per option,
// in the same order (French collation, alternate handling, case first,
// case level, normalization/decomposition, strength, Hiragana quaternary).
// They look like two alternative versions of the same logic; if both ran,
// the option ints would be consumed from the stream twice - confirm which
// version is intended to be live.
rbc.setAttributeDefault(RuleBasedCollator.Attribute.FRENCH_COLLATION_,
m_dataInputStream_.readInt());
rbc.setAttributeDefault(
RuleBasedCollator.Attribute.ALTERNATE_HANDLING_,
m_dataInputStream_.readInt());
rbc.setAttributeDefault(RuleBasedCollator.Attribute.CASE_FIRST_,
m_dataInputStream_.readInt());
rbc.setAttributeDefault(RuleBasedCollator.Attribute.CASE_LEVEL_,
m_dataInputStream_.readInt());
rbc.setAttributeDefault(
RuleBasedCollator.Attribute.NORMALIZATION_MODE_,
m_dataInputStream_.readInt());
rbc.setAttributeDefault(RuleBasedCollator.Attribute.STRENGTH_,
m_dataInputStream_.readInt());
rbc.setAttributeDefault(
RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_,
m_dataInputStream_.readInt());
// French collation default: true only when the stored int equals ON_.
rbc.m_defaultIsFrenchCollation_ = (m_dataInputStream_.readInt()
== RuleBasedCollator.AttributeValue.ON_);
// Alternate handling default: true only when the stored int equals SHIFTED_.
rbc.m_defaultIsAlternateHandlingShifted_
= (m_dataInputStream_.readInt() ==
RuleBasedCollator.AttributeValue.SHIFTED_);
// Case-first default is stored as the raw int, without translation.
rbc.m_defaultCaseFirst_ = m_dataInputStream_.readInt();
rbc.m_defaultIsCaseLevel_ = (m_dataInputStream_.readInt()
== RuleBasedCollator.AttributeValue.ON_);
// The next int is translated into a Collator decomposition mode:
// ON_ maps to CANONICAL_DECOMPOSITION, any other value to NO_DECOMPOSITION.
int value = m_dataInputStream_.readInt();
if (value == RuleBasedCollator.AttributeValue.ON_) {
value = Collator.CANONICAL_DECOMPOSITION;
}
else {
value = Collator.NO_DECOMPOSITION;
}
rbc.m_defaultDecomposition_ = value;
// Default strength is stored as the raw int, without translation.
rbc.m_defaultStrength_ = m_dataInputStream_.readInt();
rbc.m_defaultIsHiragana4_ = (m_dataInputStream_.readInt()
== RuleBasedCollator.AttributeValue.ON_);
}
/**
@ -169,7 +171,7 @@ final class CollatorReader
* @exception IOException thrown when there's a data error.
* @draft 2.2
*/
public void read(RuleBasedCollator rbc) throws IOException
protected void read(RuleBasedCollator rbc) throws IOException
{
readHeader(rbc);
readOptions(rbc);
@ -188,7 +190,8 @@ final class CollatorReader
for (int i = 0; i < m_contractionCESize_; i ++) {
rbc.m_contractionCE_[i] = m_dataInputStream_.readInt();
}
rbc.m_trie_ = new IntTrie(m_dataInputStream_, rbc);
rbc.m_trie_ = new IntTrie(m_dataInputStream_,
RuleBasedCollator.DataManipulate.getInstance());
if (!rbc.m_trie_.isLatin1Linear()) {
throw new IOException("Data corrupted, "
+ "Collator Tries expected to have linear "
@ -213,6 +216,43 @@ final class CollatorReader
}
}
/**
 * Reads in the inverse uca data.
 * <p>The stream is expected to start with an ICU binary header, followed
 * by five ints (byte size, table size, continuation size, table value,
 * continuation value), then the table ints and the continuation chars.
 * The table holds three ints per entry, one column for each strength.</p>
 * @param inputStream input stream with the inverse uca data. The stream
 *        is closed before this method returns, whether reading succeeds
 *        or fails.
 * @return an object containing the inverse uca data
 * @exception IOException thrown when error occurs while reading the
 *            inverse uca, or when the sizes stored in the data are
 *            negative or too large to be valid
 */
protected static CollationParsedRuleBuilder.InverseUCA readInverseUCA(
                                                      InputStream inputStream)
                                                      throws IOException
{
    DataInputStream input = new DataInputStream(inputStream);
    try {
        ICUBinary.readHeader(inputStream, INVERSE_UCA_DATA_FORMAT_ID_,
                             DATA_FORMAT_VERSION_, UNICODE_VERSION_);
        // The bytesize, table and conts fields are not needed to load the
        // data; they are read only to advance the stream past them.
        input.readInt();                  // bytesize
        int tablesize = input.readInt();  // in int size
        int contsize = input.readInt();   // in char size
        input.readInt();                  // table, in bytes
        input.readInt();                  // conts, in bytes

        // Reject sizes that cannot be valid before allocating, so corrupt
        // data surfaces as an IOException instead of a
        // NegativeArraySizeException or a silently overflowed length.
        if (tablesize < 0 || contsize < 0
            || tablesize > Integer.MAX_VALUE / 3) {
            throw new IOException("Data corrupted, "
                                  + "invalid inverse UCA table sizes");
        }

        int size = tablesize * 3; // one column for each strength
        CollationParsedRuleBuilder.InverseUCA result =
                                  new CollationParsedRuleBuilder.InverseUCA();
        result.m_table_ = new int[size];
        result.m_continuations_ = new char[contsize];
        for (int i = 0; i < size; i ++) {
            result.m_table_[i] = input.readInt();
        }
        for (int i = 0; i < contsize; i ++) {
            result.m_continuations_[i] = input.readChar();
        }
        return result;
    }
    finally {
        // Close on every path; the original only closed after a fully
        // successful read and leaked the stream on any exception.
        input.close();
    }
}
// private inner class -----------------------------------------------
// private variables -------------------------------------------------
/**
@ -231,6 +271,14 @@ final class CollatorReader
private static final byte UNICODE_VERSION_[] = {(byte)0x3, (byte)0x0,
(byte)0x0, (byte)0x0};
/**
* Inverse UCA file format version and id that this class understands.
* No guarantees are made if an older version is used.
*/
private static final byte INVERSE_UCA_DATA_FORMAT_ID_[] = {(byte)0x49,
(byte)0x6e,
(byte)0x76,
(byte)0x43};
/**
* Corrupted error string
*/
private static final String CORRUPTED_DATA_ERROR_ =

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff