From 65d107bf3df79e44e33fd6938c5ea19e23571e1a Mon Sep 17 00:00:00 2001 From: Syn Wee Quek Date: Sat, 26 Oct 2002 05:50:40 +0000 Subject: [PATCH] ICU-2191 Added * hasMoreCodePointsThan * StringComparator with code unit/point comparison * Still has to implement case insensitive comparison X-SVN-Rev: 10068 --- .../com/ibm/icu/dev/test/lang/UTF16Test.java | 217 +++++- icu4j/src/com/ibm/icu/text/UTF16.java | 668 +++++++++++++----- 2 files changed, 710 insertions(+), 175 deletions(-) diff --git a/icu4j/src/com/ibm/icu/dev/test/lang/UTF16Test.java b/icu4j/src/com/ibm/icu/dev/test/lang/UTF16Test.java index 7a6ca84eaa..88b847e768 100755 --- a/icu4j/src/com/ibm/icu/dev/test/lang/UTF16Test.java +++ b/icu4j/src/com/ibm/icu/dev/test/lang/UTF16Test.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/lang/UTF16Test.java,v $ -* $Date: 2002/07/11 21:25:23 $ -* $Revision: 1.18 $ +* $Date: 2002/10/26 05:50:40 $ +* $Revision: 1.19 $ * ******************************************************************************* */ @@ -835,8 +835,8 @@ public final class UTF16Test extends TestFmwk String test2 = "test"; int testChar1 = 0x74; int testChar2 = 0x20402; - int testChar3 = 0xdc02; - int testChar4 = 0xd841; + // int testChar3 = 0xdc02; + // int testChar4 = 0xd841; String test3 = "\ud841\udc02\u0071\udc02\ud841\u0071\ud841\udc02\u0071\u0072\ud841\udc02\u0071\ud841\udc02\u0071\udc02\ud841\u0073"; String test4 = UCharacter.toString(testChar2); @@ -1042,7 +1042,7 @@ public final class UTF16Test extends TestFmwk if (UTF16.indexOf(INDEXOF_SUPPLEMENTARY_STRING_, ch, index) != expected || UTF16.indexOf(INDEXOF_SUPPLEMENTARY_STRING_, - UTF16.toString(ch), index) != + UCharacter.toString(ch), index) != expected) { errln("Failed finding index for supplementary 0x" + Integer.toHexString(ch)); @@ -1054,7 +1054,8 @@ public final class UTF16Test extends TestFmwk if 
(UTF16.lastIndexOf(INDEXOF_SUPPLEMENTARY_STRING_, ch, index) != expected || UTF16.lastIndexOf(INDEXOF_SUPPLEMENTARY_STRING_, - UTF16.toString(ch), index) != expected) + UCharacter.toString(ch), index) + != expected) { errln("Failed finding last index for supplementary 0x" + Integer.toHexString(ch)); @@ -1172,7 +1173,85 @@ public final class UTF16Test extends TestFmwk errln("reverse() failed with supplementary characters"); } } - + + /** + * Testing the setter and getter apis for StringComparator + */ + public void TestStringComparator() + { + UTF16.StringComparator compare = new UTF16.StringComparator(); + if (compare.getCodePointCompare() != false) { + errln("Default string comparator should be code unit compare"); + } + if (compare.getIgnoreCase() != false) { + errln("Default string comparator should be case sensitive compare"); + } + if (compare.getIgnoreCaseOption() + != UTF16.StringComparator.FOLD_CASE_DEFAULT) { + errln("Default string comparator should have fold case default compare"); + } + compare.setCodePointCompare(true); + if (compare.getCodePointCompare() != true) { + errln("Error setting code point compare"); + } + compare.setCodePointCompare(false); + if (compare.getCodePointCompare() != false) { + errln("Error setting code point compare"); + } + compare.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT); + if (compare.getIgnoreCase() != true + || compare.getIgnoreCaseOption() + != UTF16.StringComparator.FOLD_CASE_DEFAULT) { + errln("Error setting ignore case and options"); + } + compare.setIgnoreCase(false, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I); + if (compare.getIgnoreCase() != false + || compare.getIgnoreCaseOption() + != UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I) { + errln("Error setting ignore case and options"); + } + compare.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I); + if (compare.getIgnoreCase() != true + || compare.getIgnoreCaseOption() + != 
UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I) { + errln("Error setting ignore case and options"); + } + compare.setIgnoreCase(false, UTF16.StringComparator.FOLD_CASE_DEFAULT); + if (compare.getIgnoreCase() != false + || compare.getIgnoreCaseOption() + != UTF16.StringComparator.FOLD_CASE_DEFAULT) { + errln("Error setting ignore case and options"); + } + } + + public void TestCodePointCompare() + { + // these strings are in ascending order + String str[] = {"\u0061", "\u20ac\ud801", "\u20ac\ud800\udc00", + "\ud800", "\ud800\uff61", "\udfff", + "\uff61\udfff", "\uff61\ud800\udc02", "\ud800\udc02", + "\ud84d\udc56"}; + UTF16.StringComparator cpcompare + = new UTF16.StringComparator(true, false, + UTF16.StringComparator.FOLD_CASE_DEFAULT); + UTF16.StringComparator cucompare + = new UTF16.StringComparator(); + for (int i = 0; i < str.length - 1; ++ i) { + if (cpcompare.compare(str[i], str[i + 1]) >= 0) { + errln("error: compare() in code point order fails for string " + + Utility.hex(str[i]) + " and " + + Utility.hex(str[i + 1])); + } + // test code unit compare + if (cucompare.compare(str[i], str[i + 1]) + != str[i].compareTo(str[i + 1])) { + errln("error: compare() in code unit order fails for string " + + Utility.hex(str[i]) + " and " + + Utility.hex(str[i + 1])); + } + } + } + public void TestCaseCompare() { String mixed = "\u0061\u0042\u0131\u03a3\u00df\ufb03\ud93f\udfff"; @@ -1262,13 +1341,133 @@ public final class UTF16Test extends TestFmwk */ } + public void TestHasMoreCodePointsThan() + { + String str = "\u0061\u0062\ud800\udc00\ud801\udc01\u0063\ud802\u0064" + + "\udc03\u0065\u0066\ud804\udc04\ud805\udc05\u0067"; + int length = str.length(); + while (length >= 0) { + for (int i = 0; i <= length; ++ i) { + String s = str.substring(0, i); + for (int number = -1; number <= ((length - i) + 2); ++ number) { + boolean flag = UTF16.hasMoreCodePointsThan(s, number); + if (flag != (UTF16.countCodePoint(s) > number)) { + errln("hasMoreCodePointsThan(" + 
Utility.hex(s) + + ", " + number + ") = " + flag + " is wrong"); + } + } + } + -- length; + } + + // testing for null bad input + for(length = -1; length <= 1; ++ length) { + for (int i = 0; i <= length; ++ i) { + for (int number = -2; number <= 2; ++ number) { + boolean flag = UTF16.hasMoreCodePointsThan((String)null, + number); + if (flag != (UTF16.countCodePoint((String)null) > number)) { + errln("hasMoreCodePointsThan(null, " + number + ") = " + + flag + " is wrong"); + } + } + } + } + + length = str.length(); + while (length >= 0) { + for (int i = 0; i <= length; ++ i) { + StringBuffer s = new StringBuffer(str.substring(0, i)); + for (int number = -1; number <= ((length - i) + 2); ++ number) { + boolean flag = UTF16.hasMoreCodePointsThan(s, number); + if (flag != (UTF16.countCodePoint(s) > number)) { + errln("hasMoreCodePointsThan(" + Utility.hex(s) + + ", " + number + ") = " + flag + " is wrong"); + } + } + } + -- length; + } + + // testing for null bad input + for (length = -1; length <= 1; ++ length) { + for (int i = 0; i <= length; ++ i) { + for (int number = -2; number <= 2; ++ number) { + boolean flag = UTF16.hasMoreCodePointsThan( + (StringBuffer)null, number); + if (flag + != (UTF16.countCodePoint((StringBuffer)null) > number)) + { + errln("hasMoreCodePointsThan(null, " + number + ") = " + + flag + " is wrong"); + } + } + } + } + + char strarray[] = str.toCharArray(); + while (length >= 0) { + for (int limit = 0; limit <= length; ++ limit) { + for (int start = 0; start <= limit; ++ start) { + for (int number = -1; number <= ((limit - start) + 2); + ++ number) { + boolean flag = UTF16.hasMoreCodePointsThan(strarray, + start, limit, number); + if (flag != (UTF16.countCodePoint(strarray, start, + limit) > number)) { + errln("hasMoreCodePointsThan(" + + Utility.hex(str.substring(start, limit)) + + ", " + start + ", " + limit + ", " + number + + ") = " + flag + " is wrong"); + } + } + } + } + -- length; + } + + // testing for null bad input + for (length = 
-1; length <= 1; ++ length) { + for (int i = 0; i <= length; ++ i) { + for (int number = -2; number <= 2; ++ number) { + boolean flag = UTF16.hasMoreCodePointsThan( + (StringBuffer)null, number); + if (flag + != (UTF16.countCodePoint((StringBuffer)null) > number)) + { + errln("hasMoreCodePointsThan(null, " + number + ") = " + + flag + " is wrong"); + } + } + } + } + + // bad input + try { + UTF16.hasMoreCodePointsThan(strarray, -2, -1, 5); + errln("hasMoreCodePointsThan(chararray) with negative indexes has to throw an exception"); + } catch (Exception e) { + } + try { + UTF16.hasMoreCodePointsThan(strarray, 5, 2, 5); + errln("hasMoreCodePointsThan(chararray) with limit less than start index has to throw an exception"); + } catch (Exception e) { + } + try { + if (UTF16.hasMoreCodePointsThan(strarray, -2, 2, 5)) { + errln("hasMoreCodePointsThan(chararray) with negative start indexes can't return true"); + } + } catch (Exception e) { + } + } + public static void main(String[] arg) { try { UTF16Test test = new UTF16Test(); - // test.TestIndexOf(); test.run(arg); + // test.TestCodePointCompare(); } catch (Exception e) { @@ -1294,5 +1493,7 @@ public final class UTF16Test extends TestFmwk private final static String INDEXOF_SUPPLEMENTARY_STR_ = "\udc02\ud841"; private final static int INDEXOF_SUPPLEMENTARY_STR_INDEX_[] = {3, 16}; + + // private methods --------------------------------------------------- } diff --git a/icu4j/src/com/ibm/icu/text/UTF16.java b/icu4j/src/com/ibm/icu/text/UTF16.java index dacab271f3..3ad35710be 100755 --- a/icu4j/src/com/ibm/icu/text/UTF16.java +++ b/icu4j/src/com/ibm/icu/text/UTF16.java @@ -1,12 +1,12 @@ /** ******************************************************************************* -* Copyright (C) 1996-2001, International Business Machines Corporation and * +* Copyright (C) 1996-2002, International Business Machines Corporation and * * others. All Rights Reserved. 
* ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UTF16.java,v $ -* $Date: 2002/07/16 00:21:13 $ -* $Revision: 1.22 $ +* $Date: 2002/10/26 05:50:40 $ +* $Revision: 1.23 $ * ******************************************************************************* */ @@ -16,15 +16,15 @@ package com.ibm.icu.text; import com.ibm.icu.impl.UCharacterProperty; import com.ibm.icu.impl.NormalizerImpl; /** -* Standalone utility class providing UTF16 character conversions and indexing -* conversions. +*

Standalone utility class providing UTF16 character conversions and indexing +* conversions.

*

Code that uses strings alone rarely need modification. * By design, UTF-16 does not allow overlap, so searching for strings is a safe * operation. Similarly, concatenation is always safe. Substringing is safe if * the start and end are both on UTF-32 boundaries. In normal code, the values * for start and end are on those boundaries, since they arose from operations * like searching. If not, the nearest UTF-32 boundaries can be determined -* using bounds(). +* using bounds().

* Examples: *

The following examples illustrate use of some of these methods. *

@@ -393,30 +393,6 @@ public final class UTF16
 	    return single; // return unmatched surrogate
     }
       
-    /**
-    * Extract a single UTF-32 value from a string.
-    * If a validity check is required, use 
-    * 
-    * UCharacter.isLegal() on the return value.
-    * If tbe char retrieved is part of a surrogate pair, its supplementary 
-    * character will be returned. If a complete supplementary character is 
-    * not found the incomplete character will be returned
-    * @return UTF-32 value for the UTF-32 value that contains the char at 
-    *         offset16. The boundaries of that codepoint are the same as in 
-    *         bounds32(). 
-    * @param source array of UTF-16 chars
-    * @param offset32 UTF-32 offset to the start of the character.
-    * @return a single UTF32 value
-    * @exception IndexOutOfBoundsException if offset16 is out of bounds.
-    * @deprecated to be removed after the year 2002, replaced by 
-    *      UTF16.charAt(source, UTF16.findOffsetFromCodePoint(source, 
-    *                   offset32));
-    */
-    public static int charAtCodePointOffset(String source, int offset32) 
-    {
-        return charAt(source, findOffsetFromCodePoint(source, offset32));
-    }
-      
     /**
     * Determines how many chars this char32 requires.
     * If a validity check is required, use 
@@ -569,30 +545,7 @@ public final class UTF16
     }
 
     /**
-    * Returns the type of the boundaries around the char at offset32. Used 
-    * for random access.
-    * @param source string to analyse
-    * @param offset32 UTF32 offset
-    * @return
-    *     
    - *
  • SINGLE_CHAR_BOUNDARY : a single char - *
  • LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at - * offset32 - *
- * For bit-twiddlers, see - * bounds(java.lang.String, int) for information on the choice of the - * boundary values. - * @exception IndexOutOfBoundsException if offset16 is out of bounds. - * @deprecated will be removed after end of year 2002, replaced by - * UTF16.bounds(source, UTF16.findOffsetFromCodePoint(source, offset32)); - */ - public static int boundsAtCodePointOffset(String source, int offset32) - { - return bounds(source, findOffsetFromCodePoint(source, offset32)); - } - - /** - * Determines whether the code value is a surrogate. + * Determines whether the code value is a surrogate. * @param ch the input character. * @return true iff the input character is a surrogate. */ @@ -1146,26 +1099,6 @@ public final class UTF16 return findCodePointOffset(source, start, limit, limit - start); } - /** - * Sets a code point into a UTF32 position. - * Adjusts target according if we are replacing a non-supplementary - * codepoint with a supplementary and vice versa. - * @param target stringbuffer - * @param offset32 UTF32 position to insert into - * @exception IndexOutOfBoundsException if offset32 is out of bounds. - * @param char32 code point - * @deprecated to be removed after the year 2002, - * UTF16.setCharAt(target, - * findOffsetFromCodePoint(target.toString(), offset32), - * char32); - */ - public static void setCharAtCodePointOffset(StringBuffer target, - int offset32, int char32) - { - int offset16 = findOffsetFromCodePoint(target.toString(), offset32); - setCharAt(target, offset16, char32); - } - /** * Set a code point into a UTF16 position. * Adjusts target according if we are replacing a non-supplementary @@ -2116,106 +2049,506 @@ public final class UTF16 } return result; } - + /** - * Compare strings using Unicode code point order, instead of UTF-16 code - * unit order. + * Check if the string contains more Unicode code points than a certain + * number. 
This is more efficient than counting all code points in the + * entire string and comparing that number with a threshold. + * This function may not need to scan the string at all if the length is + * within a certain range, and never needs to count more than 'number + 1' + * code points. Logically equivalent to (countCodePoint(s) > number). A + * Unicode code point may occupy either one or two code units. + * @param source The input string. + * @param number The number of code points in the string is compared + * against the 'number' parameter. + * @return boolean value for whether the string contains more Unicode code + * points than 'number'. + * @draft 2.4 + */ + public static boolean hasMoreCodePointsThan(String source, int number) + { + if (number < 0) { + return true; + } + if (source == null) { + return false; + } + int length = source.length(); + + // length >= 0 known + // source contains at least (length + 1) / 2 code points: <= 2 + // chars per cp + if (((length + 1) >> 1) > number) { + return true; + } + + // check if source does not even contain enough chars + int maxsupplementary = length - number; + if (maxsupplementary <= 0) { + return false; + } + + // there are maxsupplementary = length - number more chars than + // asked-for code points + + // count code points until they exceed and also check that there are + // no more than maxsupplementary supplementary code points (char pairs) + int start = 0; + while (true) { + if (length == 0) { + return false; + } + if (number == 0) { + return true; + } + if (isLeadSurrogate(source.charAt(start ++)) && start != length + && isTrailSurrogate(source.charAt(start))) { + start ++; + if (-- maxsupplementary <= 0) { + // too many pairs - too few code points + return false; + } + } + -- number; + } + } + + /** + * Check if the sub-range of char array, from argument start to limit, + * contains more Unicode code points than a certain + * number. 
This is more efficient than counting all code points in the + * entire char array range and comparing that number with a threshold. + * This function may not need to scan the char array at all if start and + * limit is within a certain range, and never needs to count more than + * 'number + 1' code points. + * Logically equivalent to (countCodePoint(source, start, limit) > number). + * A Unicode code point may occupy either one or two code units. + * @param source array of UTF-16 chars + * @param start offset to substring in the source array for analyzing + * @param limit offset to substring in the source array for analyzing + * @param number The number of code points in the string is compared + * against the 'number' parameter. + * @return boolean value for whether the string contains more Unicode code + * points than 'number'. + * @exception IndexOutOfBoundsException thrown when limit < start + * @draft 2.4 + */ + public static boolean hasMoreCodePointsThan(char source[], int start, + int limit, int number) + { + int length = limit - start; + if (length < 0 || start < 0 || limit < 0) { + throw new IndexOutOfBoundsException( + "Start and limit indexes should be non-negative and start <= limit"); + } + if (number < 0) { + return true; + } + if (source == null) { + return false; + } + + // length >= 0 known + // source contains at least (length + 1) / 2 code points: <= 2 + // chars per cp + if (((length + 1) >> 1) > number) { + return true; + } + + // check if source does not even contain enough chars + int maxsupplementary = length - number; + if (maxsupplementary <= 0) { + return false; + } + + // there are maxsupplementary = length - number more chars than + // asked-for code points + + // count code points until they exceed and also check that there are + // no more than maxsupplementary supplementary code points (char pairs) + while (true) { + if (length == 0) { + return false; + } + if (number == 0) { + return true; + } + if (isLeadSurrogate(source[start ++]) 
&& start != limit + && isTrailSurrogate(source[start])) { + start ++; + if (-- maxsupplementary <= 0) { + // too many pairs - too few code points + return false; + } + } + -- number; + } + } + + /** + * Check if the string buffer contains more Unicode code points than a + * certain number. This is more efficient than counting all code points in + * the entire string buffer and comparing that number with a threshold. + * This function may not need to scan the string buffer at all if the + * length is within a certain range, and never needs to count more than + * 'number + 1' code points. Logically equivalent to + * (countCodePoint(s) > number). A Unicode code point may occupy either one + * or two code units. + * @param source The input string buffer. + * @param number The number of code points in the string buffer is compared + * against the 'number' parameter. + * @return boolean value for whether the string buffer contains more + * Unicode code points than 'number'. + * @draft 2.4 + */ + public static boolean hasMoreCodePointsThan(StringBuffer source, int number) + { + if (number < 0) { + return true; + } + if (source == null) { + return false; + } + int length = source.length(); + + // length >= 0 known + // source contains at least (length + 1) / 2 code points: <= 2 + // chars per cp + if (((length + 1) >> 1) > number) { + return true; + } + + // check if source does not even contain enough chars + int maxsupplementary = length - number; + if (maxsupplementary <= 0) { + return false; + } + + // there are maxsupplementary = length - number more chars than + // asked-for code points + + // count code points until they exceed and also check that there are + // no more than maxsupplementary supplementary code points (char pairs) + int start = 0; + while (true) { + if (length == 0) { + return false; + } + if (number == 0) { + return true; + } + if (isLeadSurrogate(source.charAt(start ++)) && start != length + && isTrailSurrogate(source.charAt(start))) { + start ++; 
+ if (-- maxsupplementary <= 0) { + // too many pairs - too few code points + return false; + } + } + -- number; + } + } + + /** + *

UTF16 string comparator class. + * Allows UTF16 string comparison to be done with the various modes

+ *
    + *
  • Code point comparison or code unit comparison + *
  • Case sensitive comparison, case insensitive comparison or case + * insensitive comparison with special handling for character 'i'. + *
+ *

Code unit and code point comparisons differ only when comparing + * supplementary code points (\u10000..\u10ffff) to BMP code points + * near the end of the BMP (i.e., \ue000..\uffff). In code unit + * comparison, high BMP code points sort after supplementary code points + * because supplementary code points are stored as pairs of surrogates, + * which are at \ud800..\udfff.

+ * @see #FOLD_CASE_DEFAULT + * @see #FOLD_CASE_EXCLUDE_SPECIAL_I + * @stable */ public static final class StringComparator implements java.util.Comparator { + // public constructor ------------------------------------------------ + /** - * Standard String compare. Only one small section is different, marked in - * the code. - */ - public int compare(Object a, Object b) + * Default constructor that does code unit comparison and case + * sensitive comparison. + */ + public StringComparator() { - if (a == b) { - return 0; - } - if (a == null) { - return -1; - } - if (b == null) { - return 1; - } - - String sa = (String) a; - String sb = (String) b; - int lena = sa.length(); - int lenb = sb.length(); - int len = lena; - if (len > lenb) { - len = lenb; - } - - for (int i = 0; i < len; ++i) - { - char ca = sa.charAt(i); - char cb = sb.charAt(i); - if (ca == cb) { - continue; // skip remap if equal - } - - // start of only different section - // if either code unit is below 0xd800, i.e., below the - // surrogate range, then nothing needs to be done - - // if both are >=0xd800 then special code adjusts code unit - // values so that all BMP code points (including single - // surrogate code points) sort below supplementary ones - - // this is necessary because surrogates are not at the end of - // the code unit range - if (ca >= LEAD_SURROGATE_MIN_VALUE - && cb >= LEAD_SURROGATE_MIN_VALUE) { - // subtract 0x2800 from BMP code points to make them - // smaller than supplementary ones - if ((ca <= LEAD_SURROGATE_MAX_VALUE && (i + 1) < lena - && isTrailSurrogate(sa.charAt(i + 1))) - || (isTrailSurrogate(ca) && i > 0 - && isLeadSurrogate(sa.charAt(i - 1)))) { - // part of a surrogate pair, leave >=d800 - } - else { - // BMP code point - may be surrogate code point - make - // 0 - && isLeadSurrogate(sb.charAt(i - 1)))) { - // part of a surrogate pair, leave >=d800 - } - else { - // BMP code point - may be surrogate code point - make - // < d800 - cb -= 0x2800; - } - } - - // end 
of only different section - - if (ca < cb) { - return -1; - } - - return 1; // wasn't equal, so return 1 - } - - if (lena < lenb) { - return -1; - } - - if (lena > lenb) { - return 1; - } - - return 0; + m_codePointCompare_ = false; + m_ignoreCase_ = false; + m_foldCase_ = FOLD_CASE_DEFAULT; } + /** + * Constructor that does comparison based on the argument options. + * @param codepointcompare flag to indicate true for code point + * comparison or false for code unit comparison. + * @param ignorecase false for case sensitive comparison, true for + * case-insensitive comparison + * @param foldcaseoption FOLD_CASE_DEFAULT or + * FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only when + * ignorecase is set to true. If ignorecase is false, this option + * is ignored. + * @see #FOLD_CASE_DEFAULT + * @see #FOLD_CASE_EXCLUDE_SPECIAL_I + * @throws IllegalArgumentException if foldcaseoption is out of range + */ + public StringComparator(boolean codepointcompare, + boolean ignorecase, + int foldcaseoption) + { + m_codePointCompare_ = codepointcompare; + m_ignoreCase_ = ignorecase; + if (foldcaseoption < FOLD_CASE_DEFAULT + || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) { + throw new IllegalArgumentException("Invalid fold case option"); + } + m_foldCase_ = foldcaseoption; + } + + // public data member ------------------------------------------------ + + /** + *

Option value for case folding comparison:

+ *

Comparison is case insensitive, strings are folded using default + * mappings defined in Unicode data file CaseFolding.txt, before + * comparison. + *

+ * @draft 2.4 + */ + public static final int FOLD_CASE_DEFAULT = 0; + /** + *

Option value for case folding comparison:

+ *

Comparison is case insensitive, strings are folded using modified + * mappings defined in Unicode data file CaseFolding.txt, before + * comparison. + *

+ *

The modified set of mappings is provided in a Unicode data file + * CaseFolding.txt to handle dotted I and dotless i appropriately for + * Turkic languages (tr, az).

+ *

Before Unicode 3.2, CaseFolding.txt contains mappings marked with + * 'I' that are to be included for default mappings and excluded for + * the Turkic-specific mappings.

+ *

Unicode 3.2 CaseFolding.txt instead contains mappings marked with + * 'T' that are to be excluded for default mappings and included for + * the Turkic-specific mappings.

+ * @draft 2.4 + */ + public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 0; + + // public methods ---------------------------------------------------- + + // public setters ---------------------------------------------------- + + /** + * Sets the comparison mode to code point compare if flag is true. + * Otherwise comparison mode is set to code unit compare + * @param flag true for code point compare, false for code unit compare + */ + public void setCodePointCompare(boolean flag) + { + m_codePointCompare_ = flag; + } + + /** + * Sets the Comparator to case-insensitive comparison mode if argument + * is true, otherwise case sensitive comparison mode if set to false. + * @param ignorecase true for case-insitive comparison, false for + * case sensitive comparison + * @param foldcaseoptions FOLD_CASE_DEFAULT or + * FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only when + * ignorecase is set to true. If ignorecase is false, this option + * is ignored. + * @see #FOLD_CASE_DEFAULT + * @see #FOLD_CASE_EXCLUDE_SPECIAL_I + */ + public void setIgnoreCase(boolean ignorecase, int foldcaseoption) + { + m_ignoreCase_ = ignorecase; + if (foldcaseoption < FOLD_CASE_DEFAULT + || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) { + throw new IllegalArgumentException("Invalid fold case option"); + } + m_foldCase_ = foldcaseoption; + } + + // public getters ---------------------------------------------------- + + /** + * Checks if the comparison mode is code point compare. + * @return true for code point compare, false for code unit compare + */ + public boolean getCodePointCompare() + { + return m_codePointCompare_; + } + + /** + * Checks if Comparator is in the case insensitive mode. + * @return true if Comparator performs case insensitive comparison, + * false otherwise + */ + public boolean getIgnoreCase() + { + return m_ignoreCase_; + } + + /** + * Gets the fold case options set in Comparator to be used with case + * insensitive comparison. 
+ * @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I + * @see #FOLD_CASE_DEFAULT + * @see #FOLD_CASE_EXCLUDE_SPECIAL_I + */ + public int getIgnoreCaseOption() + { + return m_foldCase_; + } + + // public other methods ---------------------------------------------- + + /** + * Compare two strings depending on the options selected during + * construction. + * @param a first source string. + * @param b second source string. + * @return 0 returned if a == b. If a < b, a negative value is returned. + * Otherwise if a > b, a positive value is returned. + * @exception ClassCastException thrown when either a or b is not a + * String object + * @draft 2.4 + */ + public int compare(Object a, Object b) + { + String str1 = (String)a; + String str2 = (String)b; + + if (str1 == str2) { + return 0; + } + if (str1 == null) { + return -1; + } + if (str2 == null) { + return 1; + } + + if (m_ignoreCase_) { + return compareCaseInsensitive(str1, str2); + } + return compareCaseSensitive(str1, str2); + } + + // private data member ---------------------------------------------- + + /** + * Code unit comparison flag. True if code unit comparison is required. + * False if code point comparison is required. + */ + private boolean m_codePointCompare_; + /** + * Fold case comparison option. + */ + private int m_foldCase_; + /** + * Flag indicator if ignore case is to be used during comparison + */ + private boolean m_ignoreCase_; + /** + * Code point order offset for surrogate characters + */ + private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800; + + // private method --------------------------------------------------- + + /** + * Compares case insensitive. This is a direct port of ICU4C, to make + * maintainence life easier. 
+ * @param s1 first string to compare + * @param s2 second string to compare + * @return -1 is s1 < s2, 0 if equals, + */ + private int compareCaseInsensitive(String s1, String s2) + { + return NormalizerImpl.cmpEquivFold(s1, s2, + m_foldCase_ | + Normalizer.COMPARE_IGNORE_CASE); + } + /** + * Compares case sensitive. This is a direct port of ICU4C, to make + * maintainence life easier. + * @param s1 first string to compare + * @param s2 second string to compare + * @return -1 is s1 < s2, 0 if equals, + */ + private int compareCaseSensitive(String s1, String s2) + { + // compare identical prefixes - they do not need to be fixed up + // limit1 = start1 + min(lenght1, length2) + int length1 = s1.length(); + int length2 = s2.length(); + int minlength = length1; + int result = 0; + if (length1 < length2) { + result = -1; + } + else if (length1 > length2) { + result = 1; + } + + char c1 = 0; + char c2 = 0; + int index = 0; + for (; index < minlength; index ++) { + c1 = s1.charAt(index); + c2 = s2.charAt(index); + // check pseudo-limit + if (c1 != c2) { + break; + } + } + + if (index == minlength) { + return result; + } + + // if both values are in or above the surrogate range, fix them up + if (c1 >= LEAD_SURROGATE_MIN_VALUE + && c2 >= LEAD_SURROGATE_MIN_VALUE && m_codePointCompare_) { + // subtract 0x2800 from BMP code points to make them smaller + // than supplementary ones + if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1 + && isTrailSurrogate(s1.charAt(index + 1))) + || (isTrailSurrogate(c1) && index != 0 + && isLeadSurrogate(s1.charAt(index - 1)))) { + // part of a surrogate pair, leave >=d800 + } + else { + // BMP code point - may be surrogate code point - make + // < d800 + c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_; + } + + if ((c2 <= LEAD_SURROGATE_MAX_VALUE + && (index + 1) != length2 + && isTrailSurrogate(s2.charAt(index + 1))) || + (isTrailSurrogate(c2) && index != 0 + && isLeadSurrogate(s2.charAt(index - 1)))) { + // part of a surrogate pair, 
leave >=d800 + } + else { + // BMP code point - may be surrogate code point - make > LEAD_SURROGATE_SHIFT_); - + >> LEAD_SURROGATE_SHIFT_); + // private methods ------------------------------------------------------ /** @@ -2248,6 +2581,7 @@ public final class UTF16 * points, 2 otherwise.

* @param ch code point * @return string representation of the code point + * @deprecated since 2.4, use UCharacter.toString(int) instead */ public static String toString(int ch) {