From 1a8abc5b66b04ec41f5848da76b7455725d4b44e Mon Sep 17 00:00:00 2001 From: Syn Wee Quek Date: Wed, 8 Oct 2003 21:51:44 +0000 Subject: [PATCH] ICU-2212 Utilized UCharacterIterator in Collation X-SVN-Rev: 13361 --- .../dev/test/collator/CollationAPITest.java | 3 +- .../test/collator/CollationIteratorTest.java | 13 +- .../icu/impl/StringUCharacterIterator.java | 263 ++++++++++++++++++ .../icu/text/CollationElementIterator.java | 247 +++++++++------- .../com/ibm/icu/text/RuleBasedCollator.java | 32 ++- icu4j/src/com/ibm/icu/text/StringSearch.java | 8 +- 6 files changed, 458 insertions(+), 108 deletions(-) create mode 100644 icu4j/src/com/ibm/icu/impl/StringUCharacterIterator.java diff --git a/icu4j/src/com/ibm/icu/dev/test/collator/CollationAPITest.java b/icu4j/src/com/ibm/icu/dev/test/collator/CollationAPITest.java index 923daf5c77..dcfbc5e39a 100644 --- a/icu4j/src/com/ibm/icu/dev/test/collator/CollationAPITest.java +++ b/icu4j/src/com/ibm/icu/dev/test/collator/CollationAPITest.java @@ -353,7 +353,8 @@ public class CollationAPITest extends TestFmwk { CharacterIterator chariter=new StringCharacterIterator(testString1); // copy ctor CollationElementIterator iterator2 = ((RuleBasedCollator)col).getCollationElementIterator(chariter); - CollationElementIterator iterator3 = ((RuleBasedCollator)col).getCollationElementIterator(testString2); + UCharacterIterator uchariter=UCharacterIterator.getInstance(testString2); + CollationElementIterator iterator3 = ((RuleBasedCollator)col).getCollationElementIterator(uchariter); int offset = 0; offset = iterator1.getOffset(); diff --git a/icu4j/src/com/ibm/icu/dev/test/collator/CollationIteratorTest.java b/icu4j/src/com/ibm/icu/dev/test/collator/CollationIteratorTest.java index 619e415770..210a1d3128 100644 --- a/icu4j/src/com/ibm/icu/dev/test/collator/CollationIteratorTest.java +++ b/icu4j/src/com/ibm/icu/dev/test/collator/CollationIteratorTest.java @@ -426,12 +426,23 @@ public class CollationIteratorTest extends TestFmwk { //now 
use the overloaded setText(ChracterIterator&, UErrorCode) function to set the text CharacterIterator chariter = new StringCharacterIterator(test1); try { - iter2.setText(chariter); + iter2.setText(chariter); } catch (Exception e ) { errln("call to iter2->setText(chariter(test1)) failed."); return; } assertEqual(iter1, iter2); + + iter1.reset(); + //now use the overloaded setText(ChracterIterator&, UErrorCode) function to set the text + UCharacterIterator uchariter = UCharacterIterator.getInstance(test1); + try { + iter2.setText(uchariter); + } catch (Exception e ) { + errln("call to iter2->setText(uchariter(test1)) failed."); + return; + } + assertEqual(iter1, iter2); } /** diff --git a/icu4j/src/com/ibm/icu/impl/StringUCharacterIterator.java b/icu4j/src/com/ibm/icu/impl/StringUCharacterIterator.java new file mode 100644 index 0000000000..8c9501985a --- /dev/null +++ b/icu4j/src/com/ibm/icu/impl/StringUCharacterIterator.java @@ -0,0 +1,263 @@ +/* + ******************************************************************************* + * Copyright (C) 1996-2000, International Business Machines Corporation and * + * others. All Rights Reserved. * + ******************************************************************************* + * + * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/StringUCharacterIterator.java,v $ + * $Date: 2003/10/08 21:51:43 $ + * $Revision: 1.1 $ + * + ******************************************************************************* + */ +package com.ibm.icu.impl; + +import com.ibm.icu.text.UCharacterIterator; +import com.ibm.icu.text.UTF16; + +/** + * Used by Collation. UCharacterIterator on Strings. Can't use + * ReplaceableUCharacterIterator because it is not easy to do a fast setText. 
+ * @author synwee + */ +public final class StringUCharacterIterator extends UCharacterIterator +{ + + // public constructor ------------------------------------------------------ + + /** + * Public constructor + * @param str text which the iterator will be based on + */ + public StringUCharacterIterator(String str) + { + if (str == null) { + throw new IllegalArgumentException(); + } + m_text_ = str; + m_currentIndex_ = 0; + } + + /** + * Public default constructor + */ + public StringUCharacterIterator() + { + m_text_ = ""; + m_currentIndex_ = 0; + } + + // public methods ---------------------------------------------------------- + + /** + * Creates a copy of this iterator, does not clone the underlying + * Stringobject + * @return copy of this iterator + */ + public Object clone() + { + try { + return super.clone(); + } catch (CloneNotSupportedException e) { + return null; // never invoked + } + } + + /** + * Returns the current UTF16 character. + * @return current UTF16 character + */ + public int current() + { + if (m_currentIndex_ < m_text_.length()) { + return m_text_.charAt(m_currentIndex_); + } + return DONE; + } + + /** + * Returns the current codepoint + * @return current codepoint + */ + public int currentCodePoint() + { + // cannot use charAt due to it different + // behaviour when index is pointing at a + // trail surrogate, check for surrogates + + if (m_currentIndex_ >= m_text_.length()) { + return DONE; + } + char ch = m_text_.charAt(m_currentIndex_); + if (UTF16.isLeadSurrogate(ch)) { + // advance the index to get the next code point + m_currentIndex_ ++; + if (m_currentIndex_ < m_text_.length()) { + // due to post increment semantics current() after next() + // actually returns the next char which is what we want + char ch2 = m_text_.charAt(m_currentIndex_); + + if (UTF16.isTrailSurrogate(ch2)) { + // we found a surrogate pair + return UCharacterProperty.getRawSupplementary(ch, ch2); + } + } + // current should never change the current index so 
back off + m_currentIndex_ --; + } + return ch; + } + + /** + * Returns the length of the text + * @return length of the text + */ + public int getLength() + { + return m_text_.length(); + } + + /** + * Gets the current currentIndex in text. + * @return current currentIndex in text. + */ + public int getIndex() + { + return m_currentIndex_; + } + + /** + * Returns next UTF16 character and increments the iterator's currentIndex + * by 1. + * If the resulting currentIndex is greater or equal to the text length, + * the currentIndex is reset to the text length and a value of DONE is + * returned. + * @return next UTF16 character in text or DONE if the new currentIndex is + * off the end of the text range. + */ + public int next() + { + if (m_currentIndex_ < m_text_.length()) + { + return m_text_.charAt(m_currentIndex_ ++); + } + return DONE; + } + + + /** + * Returns previous UTF16 character and decrements the iterator's + * currentIndex by 1. + * If the resulting currentIndex is less than 0, the currentIndex is reset + * to 0 and a value of DONE is returned. + * @return next UTF16 character in text or DONE if the new currentIndex is + * off the start of the text range. + */ + public int previous() + { + if (m_currentIndex_ > 0) { + return m_text_.charAt(-- m_currentIndex_); + } + return DONE; + } + + /** + *

Sets the currentIndex to the specified currentIndex in the text. + * This method does not return the character at currentIndex. + * This assumes the text is stored as 16-bit code units.

+ * @param currentIndex the currentIndex within the text. + * @exception IndexOutOfBoundsException is thrown if an invalid currentIndex + * is supplied. i.e. currentIndex is less than 0 or greater than the + * length of the text. A currentIndex equal to the length of the text + * is accepted and leaves the iterator at the end of the text. + */ + public void setIndex(int currentIndex) throws IndexOutOfBoundsException + { + if (currentIndex < 0 || currentIndex > m_text_.length()) { + throw new IndexOutOfBoundsException(); + } + m_currentIndex_ = currentIndex; + } + + /** + * Fills the buffer with the underlying text storage of the iterator. + * If the buffer capacity is not enough an exception is thrown. The capacity + * of the fill in buffer should at least be equal to length of text in the + * iterator obtained by calling getLength(). Usage: + * + * + *
+     *         UCharacterIterator iter = UCharacterIterator.getInstance(text);
+     *         char[] buf = new char[iter.getLength()];
+     *         iter.getText(buf);
+     *         
+     *         OR
+     *         char[] buf= new char[1];
+     *         int len = 0;
+     *         for(;;){
+     *             try{
+     *                 len = iter.getText(buf);
+     *                 break;
+     *             }catch(IndexOutOfBoundsException e){
+     *                 buf = new char[iter.getLength()];
+     *             }
+     *         }
+     * 
+ *
+ * + * @param fillIn an array of chars to fill with the underlying UTF-16 code + * units. + * @param offset the position within the array to start putting the data. + * @return the number of code units added to fillIn, as a convenience + * @exception IndexOutOfBoundsException if there is not enough + * room after offset in the array, or if offset < 0. + */ + public int getText(char[] fillIn, int offset) + { + int length = m_text_.length(); + if (offset < 0 || offset + length > fillIn.length) { + throw new IndexOutOfBoundsException(Integer.toString(length)); + } + m_text_.getChars(0, length, fillIn, offset); + return length; + } + + /** + * Convenience method for returning the underlying text storage as a + * string + * @return the underlying text storage in the iterator as a string + */ + public String getText() + { + return m_text_; + } + + /** + * Reset this iterator to point to a new string. This method is used by + * other classes that want to avoid allocating new + * iterator objects every time their setText method + * is called. 
+ * @param text The String to be iterated over + */ + public void setText(String text) + { + if (text == null) { + throw new NullPointerException(); + } + m_text_ = text; + m_currentIndex_ = 0; + } + + // private data members ---------------------------------------------------- + + /** + * Text string object + */ + private String m_text_; + /** + * Current currentIndex + */ + private int m_currentIndex_; + +} diff --git a/icu4j/src/com/ibm/icu/text/CollationElementIterator.java b/icu4j/src/com/ibm/icu/text/CollationElementIterator.java index 2a3dda8973..f088d277fc 100755 --- a/icu4j/src/com/ibm/icu/text/CollationElementIterator.java +++ b/icu4j/src/com/ibm/icu/text/CollationElementIterator.java @@ -9,12 +9,17 @@ */ package com.ibm.icu.text; -import java.text.StringCharacterIterator; -import java.text.CharacterIterator; +/*** + * import java.text.StringCharacterIterator; + * import java.text.CharacterIterator; + */ import com.ibm.icu.impl.NormalizerImpl; import com.ibm.icu.impl.UCharacterProperty; -import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.impl.StringUCharacterIterator; +import com.ibm.icu.impl.CharacterIteratorWrapper; import com.ibm.icu.impl.ICUDebug; +import com.ibm.icu.lang.UCharacter; +import java.text.CharacterIterator; /** *

CollationElementIterator is an iterator created by @@ -218,7 +223,7 @@ public final class CollationElementIterator */ public void reset() { - m_source_.setIndex(m_source_.getBeginIndex()); + m_source_.setToStart(); updateInternalState(); } @@ -255,13 +260,13 @@ public final class CollationElementIterator m_CEBufferSize_ = 0; m_CEBufferOffset_ = 0; } - - char ch = nextChar(); - /* System.out.println("ch " + Integer.toHexString(ch) + " " + - Integer.toHexString(m_source_.current()));*/ - if (ch == CharacterIterator.DONE) { + + int ch_int = nextChar(); + + if (ch_int == UCharacterIterator.DONE) { return NULLORDER; } + char ch = (char)ch_int; if (m_collator_.m_isHiragana4_) { m_isCodePointHiragana_ = (ch >= 0x3040 && ch <= 0x309e) && !(ch > 0x3094 && ch < 0x309d); @@ -325,7 +330,7 @@ public final class CollationElementIterator if (m_source_.getIndex() <= 0 && m_isForwards_) { // if iterator is new or reset, we can immediate perform backwards // iteration even when the offset is not right. - m_source_.setIndex(m_source_.getEndIndex()); + m_source_.setToLimit(); updateInternalState(); } m_isForwards_ = false; @@ -337,10 +342,11 @@ public final class CollationElementIterator m_CEBufferSize_ = 0; m_CEBufferOffset_ = 0; } - char ch = previousChar(); - if (ch == CharacterIterator.DONE) { + int ch_int = previousChar(); + if (ch_int == UCharacterIterator.DONE) { return NULLORDER; } + char ch = (char)ch_int; if (m_collator_.m_isHiragana4_) { m_isCodePointHiragana_ = (ch >= 0x3040 && ch <= 0x309f); } @@ -477,13 +483,14 @@ public final class CollationElementIterator public void setOffset(int offset) { m_source_.setIndex(offset); - char ch = m_source_.current(); - if (ch != CharacterIterator.DONE && m_collator_.isUnsafe(ch)) { + int ch_int = m_source_.current(); + char ch = (char)ch_int; + if (ch_int != UCharacterIterator.DONE && m_collator_.isUnsafe(ch)) { // if it is unsafe we need to check if it is part of a contraction // or a surrogate character if 
(UTF16.isTrailSurrogate(ch)) { // if it is a surrogate pair we move up one character - char prevch = m_source_.previous(); + char prevch = (char)m_source_.previous(); if (!UTF16.isLeadSurrogate(prevch)) { m_source_.setIndex(offset); // go back to the same index } @@ -495,7 +502,7 @@ public final class CollationElementIterator if (!m_collator_.isUnsafe(ch)) { break; } - ch = m_source_.previous(); + ch = (char)m_source_.previous(); } updateInternalState(); int prevoffset = 0; @@ -510,12 +517,12 @@ public final class CollationElementIterator // direction code to prevent next and previous from returning a // character if we are already at the ends offset = m_source_.getIndex(); - if (offset == m_source_.getBeginIndex()) { + if (offset == 0/* m_source_.getBeginIndex() */) { // preventing previous() from returning characters from the end of // the string again if we are at the beginning m_isForwards_ = false; } - else if (offset == m_source_.getEndIndex()) { + else if (offset == m_source_.getLength()) { // preventing next() from returning characters from the start of // the string again if we are at the end m_isForwards_ = true; @@ -535,6 +542,22 @@ public final class CollationElementIterator m_source_ = m_srcUtilIter_; updateInternalState(); } + + /** + *

Set a new source string iterator for iteration, and reset the + * offset to the beginning of the text. + *

+ *

The source iterator's integrity will be preserved since a new copy + * will be created for use.

+ * @param source the new source string iterator for iteration. + * @draft ICU 2.8 + */ + public void setText(UCharacterIterator source) + { + m_srcUtilIter_.setText(source.getText()); + m_source_ = m_srcUtilIter_; + updateInternalState(); + } /** *

Set a new source string iterator for iteration, and reset the @@ -545,8 +568,8 @@ public final class CollationElementIterator */ public void setText(CharacterIterator source) { - m_source_ = source; - m_source_.setIndex(m_source_.getBeginIndex()); + m_source_ = new CharacterIteratorWrapper(source); + m_source_.setToStart(); updateInternalState(); } @@ -568,10 +591,13 @@ public final class CollationElementIterator if (that instanceof CollationElementIterator) { CollationElementIterator thatceiter = (CollationElementIterator)that; - if (m_collator_.equals(thatceiter.m_collator_) - && m_source_.equals(thatceiter.m_source_)) { - return true; + if (!m_collator_.equals(thatceiter.m_collator_)) { + return false; } + // checks the text + return m_source_.getIndex() == thatceiter.m_source_.getIndex() + && m_source_.getText().equals( + thatceiter.m_source_.getText()); } return false; } @@ -591,7 +617,7 @@ public final class CollationElementIterator */ CollationElementIterator(String source, RuleBasedCollator collator) { - m_srcUtilIter_ = new StringCharacterIterator(source); + m_srcUtilIter_ = new StringUCharacterIterator(source); m_utilStringBuffer_ = new StringBuffer(); m_source_ = m_srcUtilIter_; m_collator_ = collator; @@ -615,9 +641,34 @@ public final class CollationElementIterator CollationElementIterator(CharacterIterator source, RuleBasedCollator collator) { - m_srcUtilIter_ = new StringCharacterIterator(""); + m_srcUtilIter_ = new StringUCharacterIterator(); m_utilStringBuffer_ = new StringBuffer(); - m_source_ = source; + m_source_ = new CharacterIteratorWrapper(source); + m_collator_ = collator; + m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_]; + m_buffer_ = new StringBuffer(); + m_utilSpecialBackUp_ = new Backup(); + updateInternalState(); + } + + /** + *

CollationElementIterator constructor. This takes a source + * character iterator and a RuleBasedCollator. The iterator will + * walk through the source string based on the rules defined by + * the collator. If the source string is empty, NULLORDER will be + * returned on the first call to next().

+ * + * @param source the source string iterator. + * @param collator the RuleBasedCollator + * @draft ICU 2.2 + */ + CollationElementIterator(UCharacterIterator source, + RuleBasedCollator collator) + { + m_srcUtilIter_ = new StringUCharacterIterator(); + m_utilStringBuffer_ = new StringBuffer(); + m_srcUtilIter_.setText(source.getText()); + m_source_ = m_srcUtilIter_; m_collator_ = collator; m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_]; m_buffer_ = new StringBuffer(); @@ -717,11 +768,11 @@ public final class CollationElementIterator * @param ch character to test * @return true if ch is a Thai prevowel, false otherwise */ - static final boolean isThaiPreVowel(char ch) + static final boolean isThaiPreVowel(int ch) { return (ch >= 0xe40 && ch <= 0xe44) || (ch >= 0xec0 && ch <= 0xec4); } - + /** *

Sets the iterator to point to the collation element corresponding to * the specified character (the parameter is a CHARACTER offset in the @@ -736,9 +787,10 @@ public final class CollationElementIterator * @param source the new source string iterator for iteration. * @param offset to the source */ - void setText(CharacterIterator source, int offset) + void setText(UCharacterIterator source, int offset) { - m_source_ = source; + m_srcUtilIter_.setText(source.getText()); + m_source_ = m_srcUtilIter_; m_source_.setIndex(offset); updateInternalState(); } @@ -796,7 +848,7 @@ public final class CollationElementIterator /** * Source string iterator */ - private CharacterIterator m_source_; + private UCharacterIterator m_source_; /** * This is position to the m_buffer_, -1 if iterator is not in m_buffer_ */ @@ -846,7 +898,7 @@ public final class CollationElementIterator /** * Utility */ - private StringCharacterIterator m_srcUtilIter_; + private StringUCharacterIterator m_srcUtilIter_; private StringBuffer m_utilStringBuffer_; private StringBuffer m_utilSkippedBuffer_; private CollationElementIterator m_utilColEIter_; @@ -950,7 +1002,7 @@ public final class CollationElementIterator m_CEBufferOffset_ = 0; m_CEBufferSize_ = 0; m_FCDLimit_ = -1; - m_FCDStart_ = m_source_.getEndIndex(); + m_FCDStart_ = m_source_.getLength(); m_isHiragana4_ = m_collator_.m_isHiragana4_; m_isForwards_ = true; } @@ -1022,8 +1074,7 @@ public final class CollationElementIterator m_buffer_.setLength(0); m_source_.setIndex(m_FCDStart_); for (int i = 0; i < size; i ++) { - m_buffer_.append(m_source_.current()); - m_source_.next(); + m_buffer_.append((char)m_source_.next()); } String decomp = Normalizer.decompose(m_buffer_.toString(), false); m_buffer_.setLength(0); @@ -1059,7 +1110,9 @@ public final class CollationElementIterator // trie access char fcd = NormalizerImpl.getFCD16(ch); if (fcd != 0 && UTF16.isLeadSurrogate(ch)) { - ch = m_source_.next(); // CharacterIterator.DONE has 0 fcd + 
m_source_.next(); + ch = (char)m_source_.current(); + // UCharacterIterator.DONE has 0 fcd if (UTF16.isTrailSurrogate(ch)) { fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, ch); } else { @@ -1073,14 +1126,17 @@ public final class CollationElementIterator // The current char has a non-zero trailing CC. Scan forward until // we find a char with a leading cc of zero. while (true) { - ch = m_source_.next(); - if (ch == CharacterIterator.DONE) { + m_source_.next(); + int ch_int = m_source_.current(); + if (ch_int == UCharacterIterator.DONE) { break; } + ch = (char)ch_int; // trie access fcd = NormalizerImpl.getFCD16(ch); if (fcd != 0 && UTF16.isLeadSurrogate(ch)) { - ch = m_source_.next(); + m_source_.next(); + ch = (char)m_source_.current(); if (UTF16.isTrailSurrogate(ch)) { fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, ch); } else { @@ -1112,9 +1168,9 @@ public final class CollationElementIterator *

Offsets are returned at the next character.

* @return next fcd character */ - private char nextChar() + private int nextChar() { - char result; + int result; // loop handles the next character whether it is in the buffer or not. if (m_bufferOffset_ < 0) { @@ -1147,8 +1203,9 @@ public final class CollationElementIterator if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) { // We need to peek at the next character in order to tell if we are // FCD - char next = m_source_.next(); - if (next == CharacterIterator.DONE + m_source_.next(); + int next = m_source_.current(); + if (next == UCharacterIterator.DONE || next <= LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) { return result; // end of source string and if next character // starts with a base character is always fcd. @@ -1156,7 +1213,7 @@ public final class CollationElementIterator } // Need a more complete FCD check and possible normalization. - if (!FCDCheck(result, startoffset)) { + if (!FCDCheck((char)result, startoffset)) { normalize(); result = m_buffer_.charAt(0); m_bufferOffset_ = 1; @@ -1206,7 +1263,7 @@ public final class CollationElementIterator else if (UTF16.isTrailSurrogate(ch) && m_FCDLimit_ > 0) { // note trail surrogate characters gets 0 fcd char trailch = ch; - ch = m_source_.previous(); + ch = (char)m_source_.previous(); if (UTF16.isLeadSurrogate(ch)) { fcd = NormalizerImpl.getFCD16(ch); if (fcd != 0) { @@ -1228,13 +1285,13 @@ public final class CollationElementIterator if (offset == 0) { break; } - ch = m_source_.previous(); + ch = (char)m_source_.previous(); if (!UTF16.isSurrogate(ch)) { fcd = NormalizerImpl.getFCD16(ch); } else if (UTF16.isTrailSurrogate(ch) && m_source_.getIndex() > 0) { char trail = ch; - ch = m_source_.previous(); + ch = (char)m_source_.previous(); if (UTF16.isLeadSurrogate(ch)) { fcd = NormalizerImpl.getFCD16(ch); } @@ -1270,7 +1327,7 @@ public final class CollationElementIterator *

Offsets are returned at the current character.

* @return previous fcd character */ - private char previousChar() + private int previousChar() { if (m_bufferOffset_ >= 0) { m_bufferOffset_ --; @@ -1280,10 +1337,10 @@ public final class CollationElementIterator else { // At the start of buffer, route back to string. m_buffer_.setLength(0); - if (m_FCDStart_ == m_source_.getBeginIndex()) { + if (m_FCDStart_ == 0) { m_FCDStart_ = -1; - m_source_.setIndex(m_source_.getBeginIndex()); - return CharacterIterator.DONE; + m_source_.setIndex(0); + return UCharacterIterator.DONE; } else { m_FCDLimit_ = m_FCDStart_; @@ -1292,21 +1349,21 @@ public final class CollationElementIterator } } } - char result = m_source_.previous(); + int result = m_source_.previous(); int startoffset = m_source_.getIndex(); if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ || m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION || m_FCDStart_ <= startoffset || m_source_.getIndex() == 0) { return result; } - char ch = m_source_.previous(); + int ch = m_source_.previous(); if (ch < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_) { // if previous character is FCD m_source_.next(); return result; } // Need a more complete FCD check and possible normalization. - if (!FCDCheckBackwards(result, startoffset)) { + if (!FCDCheckBackwards((char)result, startoffset)) { normalizeBackwards(); m_bufferOffset_ --; result = m_buffer_.charAt(m_bufferOffset_); @@ -1340,10 +1397,10 @@ public final class CollationElementIterator } else { // at end of buffer. check if fcd is at the end - return m_FCDLimit_ == m_source_.getEndIndex(); + return m_FCDLimit_ == m_source_.getLength(); } } - return m_source_.getEndIndex() == m_source_.getIndex(); + return m_source_.getLength() == m_source_.getIndex(); } /** @@ -1408,12 +1465,12 @@ public final class CollationElementIterator // Note: this operation might activate the normalization buffer. We have to check for // that and act accordingly. 
m_FCDStart_ = m_source_.getIndex() - 1; - char thCh = nextChar(); + char thCh = (char)nextChar(); int cp = thCh; if (UTF16.isLeadSurrogate(thCh)) { if (!isEnd()) { backupInternalState(m_utilSpecialBackUp_); - char trailCh = nextChar(); + char trailCh = (char)nextChar(); if (UTF16.isTrailSurrogate(trailCh)) { cp = UCharacterProperty.getRawSupplementary( thCh, trailCh); @@ -1582,7 +1639,7 @@ public final class CollationElementIterator ce = collator.m_contractionCE_[offset]; break; } - char previous = previousChar(); + char previous = (char)previousChar(); while (previous > collator.m_contractionIndex_[offset]) { // contraction characters are ordered, skip smaller characters offset ++; @@ -1613,7 +1670,7 @@ public final class CollationElementIterator // 3. schar is a trail surrogate in a valid surrogate // sequence that is explicitly set to zero. if (!isBackwardsStart()) { - char lead = previousChar(); + char lead = (char)previousChar(); if (UTF16.isLeadSurrogate(lead)) { isZeroCE = collator.m_trie_.getLeadValue(lead); if (RuleBasedCollator.getTag(isZeroCE) @@ -1706,12 +1763,11 @@ public final class CollationElementIterator * Returns the current character for forward iteration * @return current character */ - private char currentChar() + private int currentChar() { if (m_bufferOffset_ < 0) { - char result = m_source_.previous(); - m_source_.next(); - return result; + m_source_.previous(); + return m_source_.next(); } // m_bufferOffset_ is never 0 in normal circumstances except after a @@ -1740,8 +1796,8 @@ public final class CollationElementIterator else { m_utilSkippedBuffer_.setLength(0); } - char ch = currentChar(); - m_utilSkippedBuffer_.append(currentChar()); + char ch = (char)currentChar(); + m_utilSkippedBuffer_.append((char)currentChar()); // accent after the first character if (m_utilSpecialDiscontiguousBackUp_ == null) { m_utilSpecialDiscontiguousBackUp_ = new Backup(); @@ -1750,14 +1806,15 @@ public final class CollationElementIterator char nextch = ch; 
while (true) { ch = nextch; - nextch = nextChar(); - if (nextch == CharacterIterator.DONE + int ch_int = nextChar(); + nextch = (char)ch_int; + if (ch_int == UCharacterIterator.DONE || getCombiningClass(nextch) == 0) { // if there are no more accents to move around // we don't have to shift previousChar, since we are resetting // the offset later if (multicontraction) { - if (nextch != CharacterIterator.DONE) { + if (ch_int != UCharacterIterator.DONE) { previousChar(); // backtrack } setDiscontiguous(m_utilSkippedBuffer_); @@ -1836,7 +1893,7 @@ public final class CollationElementIterator byte maxCC = (byte)(collator.m_contractionIndex_[offset] & 0xFF); // checks if all characters have the same combining class byte allSame = (byte)(collator.m_contractionIndex_[offset] >> 8); - char ch = nextChar(); + char ch = (char)nextChar(); offset ++; while (ch > collator.m_contractionIndex_[offset]) { // contraction characters are ordered, skip all smaller @@ -1859,7 +1916,7 @@ public final class CollationElementIterator else if (UTF16.isLeadSurrogate(ch)) { if (!isEnd()) { backupInternalState(m_utilSpecialBackUp_); - char trail = nextChar(); + char trail = (char)nextChar(); if (UTF16.isTrailSurrogate(trail)) { // do stuff with trail if (RuleBasedCollator.getTag(isZeroCE) @@ -1901,10 +1958,11 @@ public final class CollationElementIterator else { // Contraction is possibly discontiguous. // find the next character if ch is not a base character - char nextch = nextChar(); - if (nextch != CharacterIterator.DONE) { + int ch_int = nextChar(); + if (ch_int != UCharacterIterator.DONE) { previousChar(); } + char nextch = (char)ch_int; if (getCombiningClass(nextch) == 0) { previousChar(); // base character not part of discontiguous contraction @@ -2098,11 +2156,11 @@ public final class CollationElementIterator // Get next character. 
if (!isEnd()){ backupInternalState(m_utilSpecialBackUp_); - char ch = nextChar(); - int char32 = ch; + int char32 = nextChar(); + char ch = (char)char32; if (UTF16.isLeadSurrogate(ch)){ if (!isEnd()) { - char trail = nextChar(); + char trail = (char)nextChar(); if (UTF16.isTrailSurrogate(trail)) { char32 = UCharacterProperty.getRawSupplementary( ch, trail); @@ -2227,8 +2285,9 @@ public final class CollationElementIterator */ private int nextSurrogate(char ch) { - char nextch = nextChar(); - if (nextch != CharacterIterator.DONE && + int ch_int = nextChar(); + char nextch = (char)ch_int; + if (ch_int != UCharacterIterator.DONE && UTF16.isTrailSurrogate(nextch)) { int codepoint = UCharacterProperty.getRawSupplementary(ch, nextch); return nextImplicit(codepoint); @@ -2330,7 +2389,7 @@ public final class CollationElementIterator return IGNORABLE; } backupInternalState(m_utilSpecialBackUp_); - char trail = nextChar(); + char trail = (char)nextChar(); ce = nextSurrogate(collator, ce, trail); // calculate the supplementary code point value, // if surrogate was not tailored we go one more round @@ -2403,10 +2462,10 @@ public final class CollationElementIterator // check that ch is from the normalization buffer or not boolean innorm = m_bufferOffset_ >= 0; - char prevch = previousChar(); + int prevch = previousChar(); if (!isThaiPreVowel(prevch)) { // we now rearrange unconditionally do not check for base consonant - if (prevch != CharacterIterator.DONE) { + if (prevch != UCharacterIterator.DONE) { nextChar(); } // Treat Thai as a length one expansion @@ -2445,10 +2504,10 @@ public final class CollationElementIterator m_FCDLimit_ = m_FCDStart_ + 2; } if (reorder) { - m_buffer_.insert(1, prevch); + m_buffer_.insert(1, (char)prevch); } else { - m_buffer_.insert(0, prevch); + m_buffer_.insert(0, (char)prevch); } return IGNORABLE; } @@ -2475,7 +2534,7 @@ public final class CollationElementIterator ce = collator.m_contractionCE_[offset]; break; } - char prevch = previousChar(); + 
char prevch = (char)previousChar(); while (prevch > collator.m_contractionIndex_[offset]) { // since contraction codepoints are ordered, we skip all that // are smaller @@ -2505,7 +2564,7 @@ public final class CollationElementIterator // 3. schar is a trail surrogate in a valid surrogate // sequence that is explicitly set to zero. if (!isBackwardsStart()) { - char lead = previousChar(); + char lead = (char)previousChar(); if (UTF16.isLeadSurrogate(lead)) { isZeroCE = collator.m_trie_.getLeadValue(lead); if (RuleBasedCollator.getTag(isZeroCE) @@ -2563,7 +2622,7 @@ public final class CollationElementIterator m_utilStringBuffer_.setLength(0); // since we might encounter normalized characters (from the thai // processing) we can't use peekCharacter() here. - char prevch = previousChar(); + char prevch = (char)previousChar(); boolean atStart = false; while (collator.isUnsafe(ch) || isThaiPreVowel(prevch)) { m_utilStringBuffer_.insert(0, ch); @@ -2572,7 +2631,7 @@ public final class CollationElementIterator atStart = true; break; } - prevch = previousChar(); + prevch = (char)previousChar(); } if (!atStart) { // undo the previousChar() if we didn't reach the beginning @@ -2692,7 +2751,7 @@ public final class CollationElementIterator int char32 = ch; if (UTF16.isTrailSurrogate(ch)) { if (!isBackwardsStart()){ - char lead = previousChar(); + char lead = (char)previousChar(); if (UTF16.isLeadSurrogate(lead)) { char32 = UCharacterProperty.getRawSupplementary(lead, ch); @@ -2753,11 +2812,11 @@ public final class CollationElementIterator if (!isBackwardsStart()){ backupInternalState(m_utilSpecialBackUp_); - ch = previousChar(); - char32 = ch; + char32 = previousChar(); + ch = (char)char32; if (UTF16.isTrailSurrogate(ch)){ if (!isBackwardsStart()) { - char lead = previousChar(); + char lead = (char)previousChar(); if (UTF16.isLeadSurrogate(lead)) { char32 = UCharacterProperty.getRawSupplementary( @@ -2926,7 +2985,7 @@ public final class CollationElementIterator // we are at the
start of the string, wrong place to be at return IGNORABLE; } - char prevch = previousChar(); + char prevch = (char)previousChar(); // Handles Han and Supplementary characters here. if (UTF16.isLeadSurrogate(prevch)) { return previousImplicit( @@ -3099,12 +3158,12 @@ public final class CollationElementIterator if (offset != 0) { int currentoffset = m_source_.getIndex(); m_source_.setIndex(currentoffset + offset); - char result = m_source_.current(); + char result = (char)m_source_.current(); m_source_.setIndex(currentoffset); return result; } else { - return m_source_.current(); + return (char)m_source_.current(); } } diff --git a/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java b/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java index d867f5b9d3..8cad6e0621 100755 --- a/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java +++ b/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java,v $ -* $Date: 2003/09/22 06:24:20 $ -* $Revision: 1.47 $ +* $Date: 2003/10/08 21:51:44 $ +* $Revision: 1.48 $ * ******************************************************************************* */ @@ -19,7 +19,6 @@ import java.util.Locale; import java.util.ResourceBundle; import java.util.Arrays; import java.text.CharacterIterator; -import java.text.StringCharacterIterator; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.util.VersionInfo; import com.ibm.icu.impl.IntTrie; @@ -28,6 +27,7 @@ import com.ibm.icu.impl.ICULocaleData; import com.ibm.icu.impl.BOCU; import com.ibm.icu.impl.Utility; import com.ibm.icu.impl.ICUDebug; +import com.ibm.icu.impl.StringUCharacterIterator; /** *

RuleBasedCollator is a concrete subclass of Collator. It allows @@ -255,6 +255,19 @@ public final class RuleBasedCollator extends Collator CharacterIterator newsource = (CharacterIterator)source.clone(); return new CollationElementIterator(newsource, this); } + + /** + * Return a CollationElementIterator for the given UCharacterIterator. + * The source iterator's integrity will be preserved since a new copy + * will be created for use. + * @see CollationElementIterator + * @draft ICU 2.8 + */ + public CollationElementIterator getCollationElementIterator( + UCharacterIterator source) + { + return new CollationElementIterator(source, this); + } // public setters -------------------------------------------------------- @@ -1731,9 +1744,10 @@ public final class RuleBasedCollator extends Collator if (ch < m_minUnsafe_) { return false; } - + if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) { - if (UTF16.isLeadSurrogate(ch) || UTF16.isTrailSurrogate(ch)) { + if (UTF16.isLeadSurrogate(ch) + || UTF16.isTrailSurrogate(ch)) { // Trail surrogate are always considered unsafe. 
return true; } @@ -1966,9 +1980,9 @@ public final class RuleBasedCollator extends Collator /** * Bunch of utility iterators */ - private StringCharacterIterator m_srcUtilIter_; + private StringUCharacterIterator m_srcUtilIter_; private CollationElementIterator m_srcUtilColEIter_; - private StringCharacterIterator m_tgtUtilIter_; + private StringUCharacterIterator m_tgtUtilIter_; private CollationElementIterator m_tgtUtilColEIter_; /** * Utility comparison flags @@ -3787,9 +3801,9 @@ public final class RuleBasedCollator extends Collator * Initializes utility iterators and byte buffer used by compare */ private final void initUtility() { - m_srcUtilIter_ = new StringCharacterIterator(new String("")); + m_srcUtilIter_ = new StringUCharacterIterator(); m_srcUtilColEIter_ = new CollationElementIterator(m_srcUtilIter_, this); - m_tgtUtilIter_ = new StringCharacterIterator(new String("")); + m_tgtUtilIter_ = new StringUCharacterIterator(); m_tgtUtilColEIter_ = new CollationElementIterator(m_tgtUtilIter_, this); m_utilBytes0_ = new byte[SORT_BUFFER_INIT_SIZE_CASE_]; // case m_utilBytes1_ = new byte[SORT_BUFFER_INIT_SIZE_1_]; // primary diff --git a/icu4j/src/com/ibm/icu/text/StringSearch.java b/icu4j/src/com/ibm/icu/text/StringSearch.java index 22aff1214b..3ac08b73c1 100755 --- a/icu4j/src/com/ibm/icu/text/StringSearch.java +++ b/icu4j/src/com/ibm/icu/text/StringSearch.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/StringSearch.java,v $ - * $Date: 2003/07/31 19:51:12 $ - * $Revision: 1.25 $ + * $Date: 2003/10/08 21:51:44 $ + * $Revision: 1.26 $ * ***************************************************************************************** */ @@ -17,6 +17,7 @@ import java.text.CharacterIterator; import java.text.StringCharacterIterator; import java.util.Locale; import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.impl.CharacterIteratorWrapper; import 
com.ibm.icu.impl.NormalizerImpl; /** @@ -1088,7 +1089,8 @@ public final class StringSearch extends SearchIterator || breakIterator.following(end - 1) == end); if (result) { // iterates the individual ces - m_utilColEIter_.setText(targetText, start); + m_utilColEIter_.setText( + new CharacterIteratorWrapper(targetText), start); for (int count = 0; count < m_pattern_.m_CELength_; count ++) { int ce = getCE(m_utilColEIter_.next());