From 076095ae7ec5ad8a70da2151987f76f2a5f26817 Mon Sep 17 00:00:00 2001 From: Syn Wee Quek Date: Wed, 22 May 2002 01:14:38 +0000 Subject: [PATCH] ICU-1897 bugs solved * japanese case level * more stringent checks on ce buffer size. X-SVN-Rev: 8665 --- .../icu/text/CollationElementIterator.java | 45 ++++++----- .../com/ibm/icu/text/RuleBasedCollator.java | 79 ++++++++++++------- 2 files changed, 75 insertions(+), 49 deletions(-) diff --git a/icu4j/src/com/ibm/icu/text/CollationElementIterator.java b/icu4j/src/com/ibm/icu/text/CollationElementIterator.java index 4c149a9937..0ac52f78ed 100755 --- a/icu4j/src/com/ibm/icu/text/CollationElementIterator.java +++ b/icu4j/src/com/ibm/icu/text/CollationElementIterator.java @@ -800,12 +800,14 @@ public final class CollationElementIterator *
  • The leading combining class from the current character is 0 or the * trailing combining class of the previous char was zero. * - * Incoming source offsets points to the next processing character. + * Incoming source offsets points to the current processing character. * Return source offsets points to the current processing character. *

    + * @param ch current character + * @param offset current character offset * @return true if FCDCheck passes, false otherwise */ - private boolean FCDCheck() + private boolean FCDCheck(char ch, int offset) { boolean result = true; @@ -813,8 +815,8 @@ public final class CollationElementIterator // Get the trailing combining class of the current character. // If it's zero, we are OK. - char ch = m_source_.previous(); - m_FCDStart_ = m_source_.getIndex(); + m_FCDStart_ = offset; + m_source_.setIndex(offset); // trie access char fcd = 0; // synwee todo: unorm_getFCD16(ch); if (fcd != 0 && UTF16.isLeadSurrogate(ch)) { @@ -860,6 +862,7 @@ public final class CollationElementIterator } } m_source_.setIndex(m_FCDStart_); + m_source_.next(); m_FCDLimit_ = m_source_.getIndex(); return result; } @@ -873,7 +876,8 @@ public final class CollationElementIterator private char nextChar() { char result; - // loop handles the next character whether it is in the buffer or not. + int startoffset = m_source_.getIndex(); + // loop handles the next character whether it is in the buffer or not. if (m_bufferOffset_ == -1) { // we're working on the source and not normalizing. fast path. // note Thai pre-vowel reordering uses buffer too @@ -890,22 +894,24 @@ public final class CollationElementIterator m_buffer_.delete(0, m_buffer_.length()); return nextChar(); } + return result; } if (m_collator_.m_decomposition_ == Collator.NO_DECOMPOSITION - || m_bufferOffset_ != -1 || m_FCDLimit_ > m_source_.getIndex() + || m_bufferOffset_ != -1 || m_FCDLimit_ > startoffset // skip the fcd checks || result < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_ // Fast fcd safe path. trail combining class == 0. ) { - m_source_.next(); + + m_source_.next(); return result; } - if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) { + if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) { // We need to peek at the next character in order to tell if we are // FCD - char next = m_source_.next(); + char next = m_source_.next(); if (next == CharacterIterator.DONE || next == LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) { return result; // end of source string and if next character @@ -914,18 +920,17 @@ public final class CollationElementIterator } // Need a more complete FCD check and possible normalization. - if (!FCDCheck()) { + if (!FCDCheck(result, startoffset)) { normalize(); result = m_buffer_.charAt(0); m_bufferOffset_ = 1; } - m_source_.next(); return result; } /** *

    Incremental normalization, this is an essential optimization. - *7 Assuming FCD checks has been done, normalize the non-FCD characters into + * Assuming FCD checks has been done, normalize the non-FCD characters into * the buffer. * Source offsets points to the current processing character.

    */ @@ -959,14 +964,16 @@ public final class CollationElementIterator * Input source offsets points to the previous character. * Return source offsets points to the current processing character. *

    + * @param ch current character + * @param offset current character offset * @return true if FCDCheck passes, false otherwise - */ - private boolean FCDCheckBackwards() + */ + private boolean FCDCheckBackwards(char ch, int offset) { boolean result = true; - char ch = m_source_.next(); char fcd = 0; - m_FCDLimit_ = m_source_.getIndex(); + m_FCDLimit_ = offset; + m_source_.setIndex(offset); if (!UTF16.isSurrogate(ch)) { fcd = 0; // synwee todo unorm_getFCD16(fcdTrieIndex, c); } @@ -1053,10 +1060,10 @@ public final class CollationElementIterator } } char result = m_source_.previous(); + int startoffset = m_source_.getIndex(); if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ || m_collator_.m_decomposition_ == Collator.NO_DECOMPOSITION - || m_FCDStart_ <= m_source_.getIndex() - || m_source_.getIndex() == 0) { + || m_FCDStart_ <= startoffset || m_source_.getIndex() == 0) { return result; } char ch = m_source_.previous(); @@ -1066,7 +1073,7 @@ public final class CollationElementIterator return result; } // Need a more complete FCD check and possible normalization. - if (!FCDCheckBackwards()) { + if (!FCDCheckBackwards(ch, startoffset)) { normalizeBackwards(); m_bufferOffset_ --; result = m_buffer_.charAt(m_bufferOffset_); diff --git a/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java b/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java index fbfc790f33..9284ebb030 100755 --- a/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java +++ b/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java,v $ -* $Date: 2002/05/20 23:43:01 $ -* $Revision: 1.6 $ +* $Date: 2002/05/22 01:14:38 $ +* $Revision: 1.7 $ * ******************************************************************************* */ @@ -809,7 +809,7 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate } } - if (compare[4]) { // checkQuad + if (doShift4) { // checkQuad result = doQuaternaryCompare(cebuffer, lowestpvalue); if (result != 0) { return result; @@ -1320,7 +1320,7 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate protected void updateInternalState() { if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) { - m_caseSwitch_ = CASE_SWITCH_; + m_caseSwitch_ = (byte)CASE_SWITCH_; } else { m_caseSwitch_ = NO_CASE_SWITCH_; @@ -1334,7 +1334,7 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate m_bottom3_ = COMMON_BOTTOM_3_; } else { - m_mask3_ = CE_KEEP_CASE_; + m_mask3_ = (byte)CE_KEEP_CASE_; m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_ON_; if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) { m_common3_ = COMMON_UPPER_FIRST_3_; @@ -1518,18 +1518,18 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate /** * Case first constants */ - private static final byte CASE_SWITCH_ = (byte)0xC0; - private static final byte NO_CASE_SWITCH_ = 0; + private static final int CASE_SWITCH_ = 0xC0; + private static final int NO_CASE_SWITCH_ = 0; /** * Case level constants */ - private static final byte CE_REMOVE_CASE_ = (byte)0x3F; - private static final byte CE_KEEP_CASE_ = (byte)0xFF; + private static final int CE_REMOVE_CASE_ = 0x3F; + private static final int CE_KEEP_CASE_ = 0xFF; /** * Case strength mask */ - private static final byte CE_CASE_BIT_MASK_ = (byte)0xC0; - private static final byte CE_CASE_MASK_3_ = (byte)0xFF; + private static final int CE_CASE_BIT_MASK_ = 0xC0; + private static final int CE_CASE_MASK_3_ = 0xFF; /** * Sortkey size factor. Values can be changed. */ @@ -2320,7 +2320,8 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate if (bytes[1].length <= bytescount[1] + isize) { bytes[1] = increase(bytes[1], bytescount[1], 1 + isize); } - BOSCU.writeIdenticalLevelRun(source, bytes[1], bytescount[1]); + bytescount[1] = BOSCU.writeIdenticalLevelRun(source, bytes[1], + bytescount[1]); } /** @@ -2533,16 +2534,20 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate { // if we reach here, the ce offset accessed is the last ce // appended to the buffer - boolean isNullOrder = (cebuffer[0][cebuffersize[0] - 1] - == CollationElementIterator.NULLORDER); - + boolean isSourceNullOrder = (cebuffer[0][cebuffersize[0] - 1] + == CollationElementIterator.NULLORDER); + boolean isTargetNullOrder = (cebuffer[1][cebuffersize[1] - 1] + == CollationElementIterator.NULLORDER); cebuffer[0] = null; cebuffer[1] = null; cebuffersize[0] = 0; cebuffersize[1] = 0; - if (isNullOrder) { + if (isSourceNullOrder) { return -1; } + if (isTargetNullOrder) { + return 1; + } // getting rid of the sign sorder >>>= CE_PRIMARY_SHIFT_; torder >>>= CE_PRIMARY_SHIFT_; @@ -2685,6 +2690,10 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate CollationElementIterator.NULLORDER) { return -1; } + if (cebuffer[1][toffset - 1] == + CollationElementIterator.NULLORDER) { + return 1; + } return (sorder < torder) ? -1 : 1; } } @@ -2696,11 +2705,11 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate int sorder = getSecondaryFrenchCE(cebuffer, offset, continuationoffset, 0); int torder = getSecondaryFrenchCE(cebuffer, offset, - continuationoffset,1); + continuationoffset, 1); if (sorder == torder) { - if (cebuffer[0][offset[0] - 1] - == CollationElementIterator.NULLORDER - || (offset[0] < 0 && offset[1] < 0)) { + if ((offset[0] < 0 && offset[1] < 0) + || cebuffer[0][offset[0]] + == CollationElementIterator.NULLORDER) { break; } } @@ -2729,17 +2738,19 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate while (result == CollationElementIterator.IGNORABLE && offset[index] >= 0) { if (continuationoffset[index] == 0) { - while (isContinuation(cebuffer[0][offset[index] --])); - // after this, sorder is at the start of continuation, - // and offset points before that - if (isContinuation(cebuffer[0][offset[index] + 1])) { - // save offset for later - continuationoffset[index] = offset[index]; - offset[index] += 2; - } + result = cebuffer[index][offset[index]]; + while (isContinuation(cebuffer[index][offset[index] --])); + // after this, sorder is at the start of continuation, + // and offset points before that + if (isContinuation(cebuffer[index][offset[index] + 1])) { + // save offset for later + continuationoffset[index] = offset[index]; + offset[index] += 2; + } + //} } else { - result = cebuffer[0][offset[index] ++]; + result = cebuffer[index][offset[index] ++]; if (!isContinuation(result)) { // we have finished with this continuation offset[index] = continuationoffset[index]; @@ -2780,7 +2791,7 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate while ((torder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) { torder = cebuffer[1][toffset ++]; - if (!isContinuation(sorder)) { + if (!isContinuation(torder)) { torder &= CE_CASE_MASK_3_; torder ^= m_caseSwitch_; } @@ -2853,6 +2864,10 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate CollationElementIterator.NULLORDER) { return -1; } + if (cebuffer[1][toffset - 1] == + CollationElementIterator.NULLORDER) { + return 1; + } return (sorder < torder) ? -1 : 1; } } @@ -2927,6 +2942,10 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate CollationElementIterator.NULLORDER) { return -1; } + if (cebuffer[1][toffset - 1] == + CollationElementIterator.NULLORDER) { + return 1; + } return (sorder < torder) ? -1 : 1; } }