Added
* hasMoreCodePointsThan
* StringComparator with code unit/point comparison
* Still has to implement case insensitive comparison

X-SVN-Rev: 10068
This commit is contained in:
Syn Wee Quek 2002-10-26 05:50:40 +00:00
parent 130f5c120b
commit 65d107bf3d
2 changed files with 710 additions and 175 deletions

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/lang/UTF16Test.java,v $
* $Date: 2002/07/11 21:25:23 $
* $Revision: 1.18 $
* $Date: 2002/10/26 05:50:40 $
* $Revision: 1.19 $
*
*******************************************************************************
*/
@ -835,8 +835,8 @@ public final class UTF16Test extends TestFmwk
String test2 = "test";
int testChar1 = 0x74;
int testChar2 = 0x20402;
int testChar3 = 0xdc02;
int testChar4 = 0xd841;
// int testChar3 = 0xdc02;
// int testChar4 = 0xd841;
String test3 = "\ud841\udc02\u0071\udc02\ud841\u0071\ud841\udc02\u0071\u0072\ud841\udc02\u0071\ud841\udc02\u0071\udc02\ud841\u0073";
String test4 = UCharacter.toString(testChar2);
@ -1042,7 +1042,7 @@ public final class UTF16Test extends TestFmwk
if (UTF16.indexOf(INDEXOF_SUPPLEMENTARY_STRING_, ch, index) !=
expected ||
UTF16.indexOf(INDEXOF_SUPPLEMENTARY_STRING_,
UTF16.toString(ch), index) !=
UCharacter.toString(ch), index) !=
expected) {
errln("Failed finding index for supplementary 0x" +
Integer.toHexString(ch));
@ -1054,7 +1054,8 @@ public final class UTF16Test extends TestFmwk
if (UTF16.lastIndexOf(INDEXOF_SUPPLEMENTARY_STRING_, ch,
index) != expected ||
UTF16.lastIndexOf(INDEXOF_SUPPLEMENTARY_STRING_,
UTF16.toString(ch), index) != expected)
UCharacter.toString(ch), index)
!= expected)
{
errln("Failed finding last index for supplementary 0x" +
Integer.toHexString(ch));
@ -1172,7 +1173,85 @@ public final class UTF16Test extends TestFmwk
errln("reverse() failed with supplementary characters");
}
}
/**
 * Exercises the option setters and getters of UTF16.StringComparator:
 * first the defaults of a freshly constructed comparator, then every
 * setter followed by a read-back through the matching getter.
 */
public void TestStringComparator()
{
    UTF16.StringComparator comparator = new UTF16.StringComparator();

    // defaults of the no-argument constructor
    if (comparator.getCodePointCompare()) {
        errln("Default string comparator should be code unit compare");
    }
    if (comparator.getIgnoreCase()) {
        errln("Default string comparator should be case sensitive compare");
    }
    if (comparator.getIgnoreCaseOption()
        != UTF16.StringComparator.FOLD_CASE_DEFAULT) {
        errln("Default string comparator should have fold case default compare");
    }

    // code point / code unit switch, toggled both ways
    comparator.setCodePointCompare(true);
    if (!comparator.getCodePointCompare()) {
        errln("Error setting code point compare");
    }
    comparator.setCodePointCompare(false);
    if (comparator.getCodePointCompare()) {
        errln("Error setting code point compare");
    }

    // every (ignore case, fold option) combination, in the order they
    // are applied to the comparator
    boolean ignoreCase[] = {true, false, true, false};
    int foldOption[] = {UTF16.StringComparator.FOLD_CASE_DEFAULT,
                        UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I,
                        UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I,
                        UTF16.StringComparator.FOLD_CASE_DEFAULT};
    for (int i = 0; i < ignoreCase.length; ++ i) {
        comparator.setIgnoreCase(ignoreCase[i], foldOption[i]);
        if (comparator.getIgnoreCase() != ignoreCase[i]
            || comparator.getIgnoreCaseOption() != foldOption[i]) {
            errln("Error setting ignore case and options");
        }
    }
}
/**
 * Tests case sensitive UTF16.StringComparator comparison in both modes.
 * In code point mode every string of the table must sort strictly before
 * its successor. In code unit mode the result must agree in sign with
 * String.compareTo(); only the sign is compared because that is all a
 * comparator guarantees, not the magnitude of the returned value.
 */
public void TestCodePointCompare()
{
    // these strings are in ascending code point order
    String str[] = {"\u0061", "\u20ac\ud801", "\u20ac\ud800\udc00",
                    "\ud800", "\ud800\uff61", "\udfff",
                    "\uff61\udfff", "\uff61\ud800\udc02", "\ud800\udc02",
                    "\ud84d\udc56"};
    UTF16.StringComparator cpcompare
        = new UTF16.StringComparator(true, false,
                                 UTF16.StringComparator.FOLD_CASE_DEFAULT);
    UTF16.StringComparator cucompare
        = new UTF16.StringComparator();
    for (int i = 0; i < str.length - 1; ++ i) {
        if (cpcompare.compare(str[i], str[i + 1]) >= 0) {
            errln("error: compare() in code point order fails for string "
                  + Utility.hex(str[i]) + " and "
                  + Utility.hex(str[i + 1]));
        }
        // test code unit compare: same sign as String.compareTo()
        int actual = cucompare.compare(str[i], str[i + 1]);
        int expected = str[i].compareTo(str[i + 1]);
        if ((actual < 0) != (expected < 0)
            || (actual > 0) != (expected > 0)) {
            errln("error: compare() in code unit order fails for string "
                  + Utility.hex(str[i]) + " and "
                  + Utility.hex(str[i + 1]));
        }
    }
}
public void TestCaseCompare()
{
String mixed = "\u0061\u0042\u0131\u03a3\u00df\ufb03\ud93f\udfff";
@ -1262,13 +1341,133 @@ public final class UTF16Test extends TestFmwk
*/
}
/**
 * Checks the three UTF16.hasMoreCodePointsThan() overloads (String,
 * StringBuffer and char array range) against the equivalent expression
 * (UTF16.countCodePoint(...) &gt; number), and probes null sources and
 * invalid char array indexes.
 */
public void TestHasMoreCodePointsThan()
{
    String str = "\u0061\u0062\ud800\udc00\ud801\udc01\u0063\ud802\u0064"
                 + "\udc03\u0065\u0066\ud804\udc04\ud805\udc05\u0067";

    // String source: every prefix of str against a range of thresholds
    int length = str.length();
    while (length >= 0) {
        for (int i = 0; i <= length; ++ i) {
            String s = str.substring(0, i);
            for (int number = -1; number <= ((length - i) + 2); ++ number) {
                boolean flag = UTF16.hasMoreCodePointsThan(s, number);
                if (flag != (UTF16.countCodePoint(s) > number)) {
                    errln("hasMoreCodePointsThan(" + Utility.hex(s)
                          + ", " + number + ") = " + flag + " is wrong");
                }
            }
        }
        -- length;
    }

    // testing for null bad input
    for (length = -1; length <= 1; ++ length) {
        for (int i = 0; i <= length; ++ i) {
            for (int number = -2; number <= 2; ++ number) {
                boolean flag = UTF16.hasMoreCodePointsThan((String)null,
                                                           number);
                if (flag != (UTF16.countCodePoint((String)null) > number)) {
                    errln("hasMoreCodePointsThan(null, " + number + ") = "
                          + flag + " is wrong");
                }
            }
        }
    }

    // StringBuffer source
    length = str.length();
    while (length >= 0) {
        for (int i = 0; i <= length; ++ i) {
            StringBuffer s = new StringBuffer(str.substring(0, i));
            for (int number = -1; number <= ((length - i) + 2); ++ number) {
                boolean flag = UTF16.hasMoreCodePointsThan(s, number);
                if (flag != (UTF16.countCodePoint(s) > number)) {
                    errln("hasMoreCodePointsThan(" + Utility.hex(s)
                          + ", " + number + ") = " + flag + " is wrong");
                }
            }
        }
        -- length;
    }

    // testing for null bad input
    for (length = -1; length <= 1; ++ length) {
        for (int i = 0; i <= length; ++ i) {
            for (int number = -2; number <= 2; ++ number) {
                boolean flag = UTF16.hasMoreCodePointsThan(
                                               (StringBuffer)null, number);
                if (flag
                    != (UTF16.countCodePoint((StringBuffer)null) > number))
                {
                    errln("hasMoreCodePointsThan(null, " + number + ") = "
                          + flag + " is wrong");
                }
            }
        }
    }

    // char array source: every sub-range of the array.
    // length has to be restarted here; the null-input loop above leaves
    // it at 2, which would restrict this scan to the first two chars.
    char strarray[] = str.toCharArray();
    length = str.length();
    while (length >= 0) {
        for (int limit = 0; limit <= length; ++ limit) {
            for (int start = 0; start <= limit; ++ start) {
                for (int number = -1; number <= ((limit - start) + 2);
                     ++ number) {
                    boolean flag = UTF16.hasMoreCodePointsThan(strarray,
                                                   start, limit, number);
                    if (flag != (UTF16.countCodePoint(strarray, start,
                                                      limit) > number)) {
                        errln("hasMoreCodePointsThan("
                              + Utility.hex(str.substring(start, limit))
                              + ", " + start + ", " + limit + ", " + number
                              + ") = " + flag + " is wrong");
                    }
                }
            }
        }
        -- length;
    }

    // null char array with a valid empty range: the overload answers
    // true only for a negative threshold and false otherwise
    for (int number = -2; number <= 2; ++ number) {
        boolean flag = UTF16.hasMoreCodePointsThan((char[])null, 0, 0,
                                                   number);
        if (flag != (number < 0)) {
            errln("hasMoreCodePointsThan((char[])null, 0, 0, " + number
                  + ") = " + flag + " is wrong");
        }
    }

    // bad input
    try {
        UTF16.hasMoreCodePointsThan(strarray, -2, -1, 5);
        errln("hasMoreCodePointsThan(chararray) with negative indexes has to throw an exception");
    } catch (Exception e) {
        // expected: negative indexes are rejected
    }
    try {
        UTF16.hasMoreCodePointsThan(strarray, 5, 2, 5);
        errln("hasMoreCodePointsThan(chararray) with limit less than start index has to throw an exception");
    } catch (Exception e) {
        // expected: limit below start is rejected
    }
    try {
        if (UTF16.hasMoreCodePointsThan(strarray, -2, 2, 5)) {
            errln("hasMoreCodePointsThan(chararray) with negative start indexes can't return true");
        }
    } catch (Exception e) {
        // acceptable: throwing is as good as returning false here
    }
}
public static void main(String[] arg)
{
try
{
UTF16Test test = new UTF16Test();
// test.TestIndexOf();
test.run(arg);
// test.TestCodePointCompare();
}
catch (Exception e)
{
@ -1294,5 +1493,7 @@ public final class UTF16Test extends TestFmwk
private final static String INDEXOF_SUPPLEMENTARY_STR_ = "\udc02\ud841";
private final static int INDEXOF_SUPPLEMENTARY_STR_INDEX_[] =
{3, 16};
// private methods ---------------------------------------------------
}

View File

@ -1,12 +1,12 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* Copyright (C) 1996-2002, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UTF16.java,v $
* $Date: 2002/07/16 00:21:13 $
* $Revision: 1.22 $
* $Date: 2002/10/26 05:50:40 $
* $Revision: 1.23 $
*
*******************************************************************************
*/
@ -16,15 +16,15 @@ package com.ibm.icu.text;
import com.ibm.icu.impl.UCharacterProperty;
import com.ibm.icu.impl.NormalizerImpl;
/**
* Standalone utility class providing UTF16 character conversions and indexing
* conversions.
* <p>Standalone utility class providing UTF16 character conversions and indexing
* conversions.</p>
* <p>Code that uses strings alone rarely need modification.
* By design, UTF-16 does not allow overlap, so searching for strings is a safe
* operation. Similarly, concatenation is always safe. Substringing is safe if
* the start and end are both on UTF-32 boundaries. In normal code, the values
* for start and end are on those boundaries, since they arose from operations
* like searching. If not, the nearest UTF-32 boundaries can be determined
* using <code>bounds()</code>.
* using <code>bounds()</code>.</p>
* <strong>Examples:</strong>
* <p>The following examples illustrate use of some of these methods.
* <pre>
@ -393,30 +393,6 @@ public final class UTF16
return single; // return unmatched surrogate
}
/**
* Extract a single UTF-32 value from a string, addressed by a code point
* (UTF-32) offset instead of a UTF-16 offset.
* Delegates to <code>charAt(source, findOffsetFromCodePoint(source,
* offset32))</code>.
* If a validity check is required, use
* <code><a href="../UCharacter.html#isLegal(char)">
* UCharacter.isLegal()</a></code> on the return value.
* If the char retrieved is part of a surrogate pair, its supplementary
* character will be returned. If a complete supplementary character is
* not found the incomplete character will be returned.
* @param source string of UTF-16 chars
* @param offset32 UTF-32 offset to the start of the character.
* @return a single UTF32 value; the boundaries of that codepoint are the
*         same as in <code>bounds32()</code>.
* @exception IndexOutOfBoundsException if offset32 is out of bounds.
* @deprecated to be removed after the year 2002, replaced by
* UTF16.charAt(source, UTF16.findOffsetFromCodePoint(source,
* offset32));
*/
public static int charAtCodePointOffset(String source, int offset32)
{
return charAt(source, findOffsetFromCodePoint(source, offset32));
}
/**
* Determines how many chars this char32 requires.
* If a validity check is required, use <code>
@ -569,30 +545,7 @@ public final class UTF16
}
/**
* Returns the type of the boundaries around the char at offset32. Used
* for random access.
* Delegates to <code>bounds(source, findOffsetFromCodePoint(source,
* offset32))</code>.
* @param source string to analyse
* @param offset32 UTF32 offset
* @return
* <ul>
* <li> SINGLE_CHAR_BOUNDARY : a single char
* <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at
* offset32
* </ul>
* For bit-twiddlers, see <a href="#bounds(java.lang.String, int)">
* bounds(java.lang.String, int)</a> for information on the choice of the
* boundary values.
* @exception IndexOutOfBoundsException if offset32 is out of bounds.
* @deprecated will be removed after end of year 2002, replaced by
* UTF16.bounds(source, UTF16.findOffsetFromCodePoint(source, offset32));
*/
public static int boundsAtCodePointOffset(String source, int offset32)
{
return bounds(source, findOffsetFromCodePoint(source, offset32));
}
/**
* Determines whether the <b>code value is a surrogate.
* Determines whether the code value is a surrogate.
* @param ch the input character.
* @return true iff the input character is a surrogate.
*/
@ -1146,26 +1099,6 @@ public final class UTF16
return findCodePointOffset(source, start, limit, limit - start);
}
/**
* Sets a code point into a UTF32 position.
* Adjusts target accordingly if we are replacing a non-supplementary
* codepoint with a supplementary and vice versa.
* @param target stringbuffer
* @param offset32 UTF32 position to insert into
* @param char32 code point
* @exception IndexOutOfBoundsException if offset32 is out of bounds.
* @deprecated to be removed after the year 2002, replaced by
* UTF16.setCharAt(target,
* findOffsetFromCodePoint(target.toString(), offset32),
* char32);
*/
public static void setCharAtCodePointOffset(StringBuffer target,
int offset32, int char32)
{
// translate the code point offset into a UTF-16 offset, then delegate
int offset16 = findOffsetFromCodePoint(target.toString(), offset32);
setCharAt(target, offset16, char32);
}
/**
* Set a code point into a UTF16 position.
* Adjusts target according if we are replacing a non-supplementary
@ -2116,106 +2049,506 @@ public final class UTF16
}
return result;
}
/**
 * Check if the string contains more Unicode code points than a certain
 * number. This is more efficient than counting all code points in the
 * entire string and comparing that number with a threshold.
 * This function may not need to scan the string at all if the length is
 * within a certain range, and never needs to count more than 'number + 1'
 * code points. Logically equivalent to
 * (countCodePoint(source) &gt; number). A Unicode code point may occupy
 * either one or two code units.
 * @param source The input string. A null string is treated as containing
 *        no code points.
 * @param number The number of code points in the string is compared
 *        against the 'number' parameter. Any negative number yields
 *        true.
 * @return boolean value for whether the string contains more Unicode code
 *         points than 'number'.
 * @draft 2.4
 */
public static boolean hasMoreCodePointsThan(String source, int number)
{
    if (number < 0) {
        return true;
    }
    if (source == null) {
        return false;
    }
    int length = source.length();

    // length >= 0 known
    // source contains at least (length + 1) / 2 code points: <= 2
    // chars per cp
    if (((length + 1) >> 1) > number) {
        return true;
    }

    // check if source does not even contain enough chars
    int maxsupplementary = length - number;
    if (maxsupplementary <= 0) {
        return false;
    }

    // there are maxsupplementary = length - number more chars than
    // asked-for code points

    // count code points until they exceed and also check that there are
    // no more than maxsupplementary supplementary code points (char pairs)
    int start = 0;
    while (true) {
        // end-of-input guard on the scan cursor; the pair budget below
        // normally ends the scan before the cursor can get this far
        if (start == length) {
            return false;
        }
        if (number == 0) {
            // 'number' code points counted and chars are still left
            return true;
        }
        if (isLeadSurrogate(source.charAt(start ++)) && start != length
            && isTrailSurrogate(source.charAt(start))) {
            start ++;
            if (-- maxsupplementary <= 0) {
                // too many pairs - too few code points
                return false;
            }
        }
        -- number;
    }
}
/**
 * Check if the sub-range of char array, from argument start to limit,
 * contains more Unicode code points than a certain
 * number. This is more efficient than counting all code points in the
 * entire char array range and comparing that number with a threshold.
 * This function may not need to scan the char array at all if start and
 * limit is within a certain range, and never needs to count more than
 * 'number + 1' code points.
 * Logically equivalent to
 * (countCodePoint(source, start, limit) &gt; number).
 * A Unicode code point may occupy either one or two code units.
 * <p>The indexes are validated before anything else, so invalid indexes
 * throw even when source is null or number is negative. limit is not
 * checked against the array length here.</p>
 * @param source array of UTF-16 chars. A null array is treated as
 *        containing no code points.
 * @param start offset to substring in the source array for analyzing
 * @param limit offset to substring in the source array for analyzing
 * @param number The number of code points in the string is compared
 *        against the 'number' parameter. Any negative number yields
 *        true.
 * @return boolean value for whether the string contains more Unicode code
 *         points than 'number'.
 * @exception IndexOutOfBoundsException thrown when start or limit is
 *            negative, or when limit &lt; start
 * @draft 2.4
 */
public static boolean hasMoreCodePointsThan(char source[], int start,
                                            int limit, int number)
{
    int length = limit - start;
    if (length < 0 || start < 0 || limit < 0) {
        throw new IndexOutOfBoundsException(
            "Start and limit indexes should be non-negative and start <= limit");
    }
    if (number < 0) {
        return true;
    }
    if (source == null) {
        return false;
    }

    // length >= 0 known
    // source contains at least (length + 1) / 2 code points: <= 2
    // chars per cp
    if (((length + 1) >> 1) > number) {
        return true;
    }

    // check if source does not even contain enough chars
    int maxsupplementary = length - number;
    if (maxsupplementary <= 0) {
        return false;
    }

    // there are maxsupplementary = length - number more chars than
    // asked-for code points

    // count code points until they exceed and also check that there are
    // no more than maxsupplementary supplementary code points (char pairs)
    while (true) {
        // end-of-input guard on the scan cursor; the pair budget below
        // normally ends the scan before the cursor can get this far
        if (start == limit) {
            return false;
        }
        if (number == 0) {
            // 'number' code points counted and chars are still left
            return true;
        }
        if (isLeadSurrogate(source[start ++]) && start != limit
            && isTrailSurrogate(source[start])) {
            start ++;
            if (-- maxsupplementary <= 0) {
                // too many pairs - too few code points
                return false;
            }
        }
        -- number;
    }
}
/**
 * Check if the string buffer contains more Unicode code points than a
 * certain number. This is more efficient than counting all code points in
 * the entire string buffer and comparing that number with a threshold.
 * This function may not need to scan the string buffer at all if the
 * length is within a certain range, and never needs to count more than
 * 'number + 1' code points. Logically equivalent to
 * (countCodePoint(source) &gt; number). A Unicode code point may occupy
 * either one or two code units.
 * @param source The input string buffer. A null buffer is treated as
 *        containing no code points.
 * @param number The number of code points in the string buffer is compared
 *        against the 'number' parameter. Any negative number yields
 *        true.
 * @return boolean value for whether the string buffer contains more
 *         Unicode code points than 'number'.
 * @draft 2.4
 */
public static boolean hasMoreCodePointsThan(StringBuffer source, int number)
{
    if (number < 0) {
        return true;
    }
    if (source == null) {
        return false;
    }
    int length = source.length();

    // length >= 0 known
    // source contains at least (length + 1) / 2 code points: <= 2
    // chars per cp
    if (((length + 1) >> 1) > number) {
        return true;
    }

    // check if source does not even contain enough chars
    int maxsupplementary = length - number;
    if (maxsupplementary <= 0) {
        return false;
    }

    // there are maxsupplementary = length - number more chars than
    // asked-for code points

    // count code points until they exceed and also check that there are
    // no more than maxsupplementary supplementary code points (char pairs)
    int start = 0;
    while (true) {
        // end-of-input guard on the scan cursor; the pair budget below
        // normally ends the scan before the cursor can get this far
        if (start == length) {
            return false;
        }
        if (number == 0) {
            // 'number' code points counted and chars are still left
            return true;
        }
        if (isLeadSurrogate(source.charAt(start ++)) && start != length
            && isTrailSurrogate(source.charAt(start))) {
            start ++;
            if (-- maxsupplementary <= 0) {
                // too many pairs - too few code points
                return false;
            }
        }
        -- number;
    }
}
/**
* <p>UTF16 string comparator class.
* Allows UTF16 string comparison to be done with the various modes</p>
* <ul>
* <li> Code point comparison or code unit comparison
* <li> Case sensitive comparison, case insensitive comparison or case
* insensitive comparison with special handling for character 'i'.
* </ul>
* <p>The code unit or code point comparison differ only when comparing
* supplementary code points (&#92;u10000..&#92;u10ffff) to BMP code points
* near the end of the BMP (i.e., &#92;ue000..&#92;uffff). In code unit
* comparison, high BMP code points sort after supplementary code points
* because they are stored as pairs of surrogates which are at
* &#92;ud800..&#92;udfff.</p>
* @see #FOLD_CASE_DEFAULT
* @see #FOLD_CASE_EXCLUDE_SPECIAL_I
* @stable
*/
public static final class StringComparator implements java.util.Comparator
{
// public constructor ------------------------------------------------
/**
* Standard String compare. Only one small section is different, marked in
* the code.
*/
public int compare(Object a, Object b)
* Default constructor that does code unit comparison and case
* sensitive comparison.
*/
public StringComparator()
{
if (a == b) {
return 0;
}
if (a == null) {
return -1;
}
if (b == null) {
return 1;
}
String sa = (String) a;
String sb = (String) b;
int lena = sa.length();
int lenb = sb.length();
int len = lena;
if (len > lenb) {
len = lenb;
}
for (int i = 0; i < len; ++i)
{
char ca = sa.charAt(i);
char cb = sb.charAt(i);
if (ca == cb) {
continue; // skip remap if equal
}
// start of only different section
// if either code unit is below 0xd800, i.e., below the
// surrogate range, then nothing needs to be done
// if both are >=0xd800 then special code adjusts code unit
// values so that all BMP code points (including single
// surrogate code points) sort below supplementary ones
// this is necessary because surrogates are not at the end of
// the code unit range
if (ca >= LEAD_SURROGATE_MIN_VALUE
&& cb >= LEAD_SURROGATE_MIN_VALUE) {
// subtract 0x2800 from BMP code points to make them
// smaller than supplementary ones
if ((ca <= LEAD_SURROGATE_MAX_VALUE && (i + 1) < lena
&& isTrailSurrogate(sa.charAt(i + 1)))
|| (isTrailSurrogate(ca) && i > 0
&& isLeadSurrogate(sa.charAt(i - 1)))) {
// part of a surrogate pair, leave >=d800
}
else {
// BMP code point - may be surrogate code point - make
// <d800
ca -= 0x2800;
}
if ((cb <= LEAD_SURROGATE_MAX_VALUE && (i + 1) < lenb
&& isTrailSurrogate(sb.charAt(i + 1)))
|| (isTrailSurrogate(cb) && i > 0
&& isLeadSurrogate(sb.charAt(i - 1)))) {
// part of a surrogate pair, leave >=d800
}
else {
// BMP code point - may be surrogate code point - make
// < d800
cb -= 0x2800;
}
}
// end of only different section
if (ca < cb) {
return -1;
}
return 1; // wasn't equal, so return 1
}
if (lena < lenb) {
return -1;
}
if (lena > lenb) {
return 1;
}
return 0;
m_codePointCompare_ = false;
m_ignoreCase_ = false;
m_foldCase_ = FOLD_CASE_DEFAULT;
}
/**
 * Creates a comparator configured by the given options.
 * @param codepointcompare true for code point comparison, false for
 *        code unit comparison.
 * @param ignorecase false for case sensitive comparison, true for
 *        case-insensitive comparison
 * @param foldcaseoption FOLD_CASE_DEFAULT or
 *        FOLD_CASE_EXCLUDE_SPECIAL_I. The value is always range
 *        checked and stored, but it only influences comparison when
 *        ignorecase is true.
 * @see #FOLD_CASE_DEFAULT
 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
 * @throws IllegalArgumentException if foldcaseoption is out of range
 */
public StringComparator(boolean codepointcompare,
                        boolean ignorecase,
                        int foldcaseoption)
{
    // reject an unknown fold case option before touching any field
    boolean validOption = foldcaseoption >= FOLD_CASE_DEFAULT
                          && foldcaseoption <= FOLD_CASE_EXCLUDE_SPECIAL_I;
    if (!validOption) {
        throw new IllegalArgumentException("Invalid fold case option");
    }
    m_foldCase_ = foldcaseoption;
    m_ignoreCase_ = ignorecase;
    m_codePointCompare_ = codepointcompare;
}
// public data member ------------------------------------------------
/**
 * <p>Option value for case folding comparison:</p>
 * <p>Comparison is case insensitive, strings are folded using default
 * mappings defined in Unicode data file CaseFolding.txt, before
 * comparison.
 * </p>
 * @draft 2.4
 */
public static final int FOLD_CASE_DEFAULT = 0;
/**
 * <p>Option value for case folding comparison:</p>
 * <p>Comparison is case insensitive, strings are folded using modified
 * mappings defined in Unicode data file CaseFolding.txt, before
 * comparison.
 * </p>
 * <p>The modified set of mappings is provided in a Unicode data file
 * CaseFolding.txt to handle dotted I and dotless i appropriately for
 * Turkic languages (tr, az).</p>
 * <p>Before Unicode 3.2, CaseFolding.txt contains mappings marked with
 * 'I' that are to be included for default mappings and excluded for
 * the Turkic-specific mappings.</p>
 * <p>Unicode 3.2 CaseFolding.txt instead contains mappings marked with
 * 'T' that are to be excluded for default mappings and included for
 * the Turkic-specific mappings.</p>
 * <p>The value must differ from {@link #FOLD_CASE_DEFAULT}; otherwise the
 * option could be neither selected nor told apart from the default.</p>
 * @draft 2.4
 */
public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1;
// public methods ----------------------------------------------------
// public setters ----------------------------------------------------
/**
* Sets the comparison mode to code point compare if flag is true.
* Otherwise comparison mode is set to code unit compare.
* The flag is stored as is; no validation is needed.
* @param flag true for code point compare, false for code unit compare
*/
public void setCodePointCompare(boolean flag)
{
m_codePointCompare_ = flag;
}
/**
 * Sets the Comparator to case-insensitive comparison mode if argument
 * is true, otherwise case sensitive comparison mode if set to false.
 * The fold case option is validated before any state is changed, so a
 * rejected call leaves the comparator exactly as it was.
 * @param ignorecase true for case-insensitive comparison, false for
 *        case sensitive comparison
 * @param foldcaseoption FOLD_CASE_DEFAULT or
 *        FOLD_CASE_EXCLUDE_SPECIAL_I. The value is always range
 *        checked and stored, but it only influences comparison when
 *        ignorecase is true.
 * @throws IllegalArgumentException if foldcaseoption is out of range
 * @see #FOLD_CASE_DEFAULT
 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
 */
public void setIgnoreCase(boolean ignorecase, int foldcaseoption)
{
    if (foldcaseoption < FOLD_CASE_DEFAULT
        || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
        throw new IllegalArgumentException("Invalid fold case option");
    }
    // both fields change together, only after the option is accepted
    m_ignoreCase_ = ignorecase;
    m_foldCase_ = foldcaseoption;
}
// public getters ----------------------------------------------------
/**
* Checks if the comparison mode is code point compare.
* Reports the value last passed to the constructor or to
* setCodePointCompare(boolean).
* @return true for code point compare, false for code unit compare
*/
public boolean getCodePointCompare()
{
return m_codePointCompare_;
}
/**
* Checks if Comparator is in the case insensitive mode.
* Reports the value last passed to the constructor or to
* setIgnoreCase(boolean, int).
* @return true if Comparator performs case insensitive comparison,
* false otherwise
*/
public boolean getIgnoreCase()
{
return m_ignoreCase_;
}
/**
* Gets the fold case options set in Comparator to be used with case
* insensitive comparison.
* The stored option is returned even while the Comparator is in case
* sensitive mode.
* @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I
* @see #FOLD_CASE_DEFAULT
* @see #FOLD_CASE_EXCLUDE_SPECIAL_I
*/
public int getIgnoreCaseOption()
{
return m_foldCase_;
}
// public other methods ----------------------------------------------
/**
 * Compare two strings according to the options currently set on this
 * comparator. Both arguments are cast to String before anything else.
 * The same reference (including two nulls) compares equal, and a null
 * string sorts before any non-null string.
 * @param a first source string.
 * @param b second source string.
 * @return 0 returned if a == b. If a &lt; b, a negative value is
 *         returned. Otherwise if a &gt; b, a positive value is returned.
 * @exception ClassCastException thrown when either a or b is not a
 *            String object
 * @draft 2.4
 */
public int compare(Object a, Object b)
{
    final String first = (String)a;
    final String second = (String)b;
    if (first == second) {
        return 0;
    }
    if (first == null || second == null) {
        // exactly one side is null here; null sorts first
        return (first == null) ? -1 : 1;
    }
    return m_ignoreCase_ ? compareCaseInsensitive(first, second)
                         : compareCaseSensitive(first, second);
}
// private data member ----------------------------------------------
/**
* Code point comparison flag. True if code point comparison is required.
* False if code unit comparison is required.
*/
private boolean m_codePointCompare_;
/**
* Fold case comparison option, either FOLD_CASE_DEFAULT or
* FOLD_CASE_EXCLUDE_SPECIAL_I.
*/
private int m_foldCase_;
/**
* Flag indicator if ignore case is to be used during comparison
*/
private boolean m_ignoreCase_;
/**
* Code point order offset for surrogate characters. Subtracted from code
* units at or above the surrogate range that are not part of a surrogate
* pair, so that they compare below the code units of real pairs.
*/
private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800;
// private method ---------------------------------------------------
/**
 * Compares case insensitive by delegating to NormalizerImpl.cmpEquivFold
 * with the stored fold case option combined with
 * Normalizer.COMPARE_IGNORE_CASE. This is a direct port of ICU4C, to make
 * maintenance easier.
 * NOTE(review): the code point compare flag is not passed on here, so
 * case insensitive comparison looks unaffected by it - confirm.
 * @param s1 first string to compare
 * @param s2 second string to compare
 * @return the result of NormalizerImpl.cmpEquivFold; presumably negative
 *         if s1 &lt; s2, 0 if equal and positive otherwise - verify
 *         against NormalizerImpl
 */
private int compareCaseInsensitive(String s1, String s2)
{
    int options = m_foldCase_ | Normalizer.COMPARE_IGNORE_CASE;
    return NormalizerImpl.cmpEquivFold(s1, s2, options);
}
/**
 * Compares case sensitive, in code unit order or, when code point
 * compare is enabled, in code point order. This is a direct port of
 * ICU4C, to make maintenance easier.
 * <p>The common prefix is skipped first; it never needs fixing up. If one
 * string is a prefix of the other, the shorter one sorts first. Otherwise
 * the first differing code units decide, after an optional fix-up that
 * moves unpaired code units at or above the surrogate range below the
 * code units of real surrogate pairs.</p>
 * @param s1 first string to compare
 * @param s2 second string to compare
 * @return a negative value if s1 &lt; s2, 0 if they are equal, a positive
 *         value if s1 &gt; s2
 */
private int compareCaseSensitive(String s1, String s2)
{
    // compare identical prefixes - they do not need to be fixed up
    // minlength = min(length1, length2)
    int length1 = s1.length();
    int length2 = s2.length();
    int minlength = length1;
    int result = 0;
    if (length1 < length2) {
        result = -1;
    }
    else if (length1 > length2) {
        result = 1;
        // never scan past the end of the shorter string
        minlength = length2;
    }

    char c1 = 0;
    char c2 = 0;
    int index = 0;
    for (; index < minlength; index ++) {
        c1 = s1.charAt(index);
        c2 = s2.charAt(index);
        // check pseudo-limit
        if (c1 != c2) {
            break;
        }
    }

    if (index == minlength) {
        // one string is a prefix of the other, or they are equal
        return result;
    }

    // if both values are in or above the surrogate range, fix them up
    if (c1 >= LEAD_SURROGATE_MIN_VALUE
        && c2 >= LEAD_SURROGATE_MIN_VALUE && m_codePointCompare_) {
        // subtract 0x2800 from BMP code points to make them smaller
        // than supplementary ones
        if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1
             && isTrailSurrogate(s1.charAt(index + 1)))
            || (isTrailSurrogate(c1) && index != 0
                && isLeadSurrogate(s1.charAt(index - 1)))) {
            // part of a surrogate pair, leave >=d800
        }
        else {
            // BMP code point - may be surrogate code point - make
            // < d800
            c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
        }

        if ((c2 <= LEAD_SURROGATE_MAX_VALUE
             && (index + 1) != length2
             && isTrailSurrogate(s2.charAt(index + 1)))
            || (isTrailSurrogate(c2) && index != 0
                && isLeadSurrogate(s2.charAt(index - 1)))) {
            // part of a surrogate pair, leave >=d800
        }
        else {
            // BMP code point - may be surrogate code point - make <d800
            c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
        }
    }

    // now c1 and c2 are in UTF-32-compatible order; both are chars, so
    // the int difference cannot overflow
    return c1 - c2;
}
}
// private data members -------------------------------------------------
@ -2234,8 +2567,8 @@ public final class UTF16
private static final int LEAD_SURROGATE_OFFSET_ =
LEAD_SURROGATE_MIN_VALUE -
(SUPPLEMENTARY_MIN_VALUE
>> LEAD_SURROGATE_SHIFT_);
>> LEAD_SURROGATE_SHIFT_);
// private methods ------------------------------------------------------
/**
@ -2248,6 +2581,7 @@ public final class UTF16
* points, 2 otherwise.</p>
* @param ch code point
* @return string representation of the code point
* @deprecated since 2.4, use UCharacter.toString(int) instead
*/
public static String toString(int ch)
{