ICU-2191
Added * hasMoreCodePointsThan * StringComparator with code unit/point comparison * Still has to implement case insensitive comparison X-SVN-Rev: 10068
This commit is contained in:
parent
130f5c120b
commit
65d107bf3d
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/lang/UTF16Test.java,v $
|
||||
* $Date: 2002/07/11 21:25:23 $
|
||||
* $Revision: 1.18 $
|
||||
* $Date: 2002/10/26 05:50:40 $
|
||||
* $Revision: 1.19 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -835,8 +835,8 @@ public final class UTF16Test extends TestFmwk
|
||||
String test2 = "test";
|
||||
int testChar1 = 0x74;
|
||||
int testChar2 = 0x20402;
|
||||
int testChar3 = 0xdc02;
|
||||
int testChar4 = 0xd841;
|
||||
// int testChar3 = 0xdc02;
|
||||
// int testChar4 = 0xd841;
|
||||
String test3 = "\ud841\udc02\u0071\udc02\ud841\u0071\ud841\udc02\u0071\u0072\ud841\udc02\u0071\ud841\udc02\u0071\udc02\ud841\u0073";
|
||||
String test4 = UCharacter.toString(testChar2);
|
||||
|
||||
@ -1042,7 +1042,7 @@ public final class UTF16Test extends TestFmwk
|
||||
if (UTF16.indexOf(INDEXOF_SUPPLEMENTARY_STRING_, ch, index) !=
|
||||
expected ||
|
||||
UTF16.indexOf(INDEXOF_SUPPLEMENTARY_STRING_,
|
||||
UTF16.toString(ch), index) !=
|
||||
UCharacter.toString(ch), index) !=
|
||||
expected) {
|
||||
errln("Failed finding index for supplementary 0x" +
|
||||
Integer.toHexString(ch));
|
||||
@ -1054,7 +1054,8 @@ public final class UTF16Test extends TestFmwk
|
||||
if (UTF16.lastIndexOf(INDEXOF_SUPPLEMENTARY_STRING_, ch,
|
||||
index) != expected ||
|
||||
UTF16.lastIndexOf(INDEXOF_SUPPLEMENTARY_STRING_,
|
||||
UTF16.toString(ch), index) != expected)
|
||||
UCharacter.toString(ch), index)
|
||||
!= expected)
|
||||
{
|
||||
errln("Failed finding last index for supplementary 0x" +
|
||||
Integer.toHexString(ch));
|
||||
@ -1172,7 +1173,85 @@ public final class UTF16Test extends TestFmwk
|
||||
errln("reverse() failed with supplementary characters");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Testing the setter and getter apis for StringComparator
|
||||
*/
|
||||
public void TestStringComparator()
|
||||
{
|
||||
UTF16.StringComparator compare = new UTF16.StringComparator();
|
||||
if (compare.getCodePointCompare() != false) {
|
||||
errln("Default string comparator should be code unit compare");
|
||||
}
|
||||
if (compare.getIgnoreCase() != false) {
|
||||
errln("Default string comparator should be case sensitive compare");
|
||||
}
|
||||
if (compare.getIgnoreCaseOption()
|
||||
!= UTF16.StringComparator.FOLD_CASE_DEFAULT) {
|
||||
errln("Default string comparator should have fold case default compare");
|
||||
}
|
||||
compare.setCodePointCompare(true);
|
||||
if (compare.getCodePointCompare() != true) {
|
||||
errln("Error setting code point compare");
|
||||
}
|
||||
compare.setCodePointCompare(false);
|
||||
if (compare.getCodePointCompare() != false) {
|
||||
errln("Error setting code point compare");
|
||||
}
|
||||
compare.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT);
|
||||
if (compare.getIgnoreCase() != true
|
||||
|| compare.getIgnoreCaseOption()
|
||||
!= UTF16.StringComparator.FOLD_CASE_DEFAULT) {
|
||||
errln("Error setting ignore case and options");
|
||||
}
|
||||
compare.setIgnoreCase(false, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I);
|
||||
if (compare.getIgnoreCase() != false
|
||||
|| compare.getIgnoreCaseOption()
|
||||
!= UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I) {
|
||||
errln("Error setting ignore case and options");
|
||||
}
|
||||
compare.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I);
|
||||
if (compare.getIgnoreCase() != true
|
||||
|| compare.getIgnoreCaseOption()
|
||||
!= UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I) {
|
||||
errln("Error setting ignore case and options");
|
||||
}
|
||||
compare.setIgnoreCase(false, UTF16.StringComparator.FOLD_CASE_DEFAULT);
|
||||
if (compare.getIgnoreCase() != false
|
||||
|| compare.getIgnoreCaseOption()
|
||||
!= UTF16.StringComparator.FOLD_CASE_DEFAULT) {
|
||||
errln("Error setting ignore case and options");
|
||||
}
|
||||
}
|
||||
|
||||
public void TestCodePointCompare()
|
||||
{
|
||||
// these strings are in ascending order
|
||||
String str[] = {"\u0061", "\u20ac\ud801", "\u20ac\ud800\udc00",
|
||||
"\ud800", "\ud800\uff61", "\udfff",
|
||||
"\uff61\udfff", "\uff61\ud800\udc02", "\ud800\udc02",
|
||||
"\ud84d\udc56"};
|
||||
UTF16.StringComparator cpcompare
|
||||
= new UTF16.StringComparator(true, false,
|
||||
UTF16.StringComparator.FOLD_CASE_DEFAULT);
|
||||
UTF16.StringComparator cucompare
|
||||
= new UTF16.StringComparator();
|
||||
for (int i = 0; i < str.length - 1; ++ i) {
|
||||
if (cpcompare.compare(str[i], str[i + 1]) >= 0) {
|
||||
errln("error: compare() in code point order fails for string "
|
||||
+ Utility.hex(str[i]) + " and "
|
||||
+ Utility.hex(str[i + 1]));
|
||||
}
|
||||
// test code unit compare
|
||||
if (cucompare.compare(str[i], str[i + 1])
|
||||
!= str[i].compareTo(str[i + 1])) {
|
||||
errln("error: compare() in code unit order fails for string "
|
||||
+ Utility.hex(str[i]) + " and "
|
||||
+ Utility.hex(str[i + 1]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void TestCaseCompare()
|
||||
{
|
||||
String mixed = "\u0061\u0042\u0131\u03a3\u00df\ufb03\ud93f\udfff";
|
||||
@ -1262,13 +1341,133 @@ public final class UTF16Test extends TestFmwk
|
||||
*/
|
||||
}
|
||||
|
||||
public void TestHasMoreCodePointsThan()
|
||||
{
|
||||
String str = "\u0061\u0062\ud800\udc00\ud801\udc01\u0063\ud802\u0064"
|
||||
+ "\udc03\u0065\u0066\ud804\udc04\ud805\udc05\u0067";
|
||||
int length = str.length();
|
||||
while (length >= 0) {
|
||||
for (int i = 0; i <= length; ++ i) {
|
||||
String s = str.substring(0, i);
|
||||
for (int number = -1; number <= ((length - i) + 2); ++ number) {
|
||||
boolean flag = UTF16.hasMoreCodePointsThan(s, number);
|
||||
if (flag != (UTF16.countCodePoint(s) > number)) {
|
||||
errln("hasMoreCodePointsThan(" + Utility.hex(s)
|
||||
+ ", " + number + ") = " + flag + " is wrong");
|
||||
}
|
||||
}
|
||||
}
|
||||
-- length;
|
||||
}
|
||||
|
||||
// testing for null bad input
|
||||
for(length = -1; length <= 1; ++ length) {
|
||||
for (int i = 0; i <= length; ++ i) {
|
||||
for (int number = -2; number <= 2; ++ number) {
|
||||
boolean flag = UTF16.hasMoreCodePointsThan((String)null,
|
||||
number);
|
||||
if (flag != (UTF16.countCodePoint((String)null) > number)) {
|
||||
errln("hasMoreCodePointsThan(null, " + number + ") = "
|
||||
+ flag + " is wrong");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
length = str.length();
|
||||
while (length >= 0) {
|
||||
for (int i = 0; i <= length; ++ i) {
|
||||
StringBuffer s = new StringBuffer(str.substring(0, i));
|
||||
for (int number = -1; number <= ((length - i) + 2); ++ number) {
|
||||
boolean flag = UTF16.hasMoreCodePointsThan(s, number);
|
||||
if (flag != (UTF16.countCodePoint(s) > number)) {
|
||||
errln("hasMoreCodePointsThan(" + Utility.hex(s)
|
||||
+ ", " + number + ") = " + flag + " is wrong");
|
||||
}
|
||||
}
|
||||
}
|
||||
-- length;
|
||||
}
|
||||
|
||||
// testing for null bad input
|
||||
for (length = -1; length <= 1; ++ length) {
|
||||
for (int i = 0; i <= length; ++ i) {
|
||||
for (int number = -2; number <= 2; ++ number) {
|
||||
boolean flag = UTF16.hasMoreCodePointsThan(
|
||||
(StringBuffer)null, number);
|
||||
if (flag
|
||||
!= (UTF16.countCodePoint((StringBuffer)null) > number))
|
||||
{
|
||||
errln("hasMoreCodePointsThan(null, " + number + ") = "
|
||||
+ flag + " is wrong");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
char strarray[] = str.toCharArray();
|
||||
while (length >= 0) {
|
||||
for (int limit = 0; limit <= length; ++ limit) {
|
||||
for (int start = 0; start <= limit; ++ start) {
|
||||
for (int number = -1; number <= ((limit - start) + 2);
|
||||
++ number) {
|
||||
boolean flag = UTF16.hasMoreCodePointsThan(strarray,
|
||||
start, limit, number);
|
||||
if (flag != (UTF16.countCodePoint(strarray, start,
|
||||
limit) > number)) {
|
||||
errln("hasMoreCodePointsThan("
|
||||
+ Utility.hex(str.substring(start, limit))
|
||||
+ ", " + start + ", " + limit + ", " + number
|
||||
+ ") = " + flag + " is wrong");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
-- length;
|
||||
}
|
||||
|
||||
// testing for null bad input
|
||||
for (length = -1; length <= 1; ++ length) {
|
||||
for (int i = 0; i <= length; ++ i) {
|
||||
for (int number = -2; number <= 2; ++ number) {
|
||||
boolean flag = UTF16.hasMoreCodePointsThan(
|
||||
(StringBuffer)null, number);
|
||||
if (flag
|
||||
!= (UTF16.countCodePoint((StringBuffer)null) > number))
|
||||
{
|
||||
errln("hasMoreCodePointsThan(null, " + number + ") = "
|
||||
+ flag + " is wrong");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// bad input
|
||||
try {
|
||||
UTF16.hasMoreCodePointsThan(strarray, -2, -1, 5);
|
||||
errln("hasMoreCodePointsThan(chararray) with negative indexes has to throw an exception");
|
||||
} catch (Exception e) {
|
||||
}
|
||||
try {
|
||||
UTF16.hasMoreCodePointsThan(strarray, 5, 2, 5);
|
||||
errln("hasMoreCodePointsThan(chararray) with limit less than start index has to throw an exception");
|
||||
} catch (Exception e) {
|
||||
}
|
||||
try {
|
||||
if (UTF16.hasMoreCodePointsThan(strarray, -2, 2, 5)) {
|
||||
errln("hasMoreCodePointsThan(chararray) with negative start indexes can't return true");
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] arg)
|
||||
{
|
||||
try
|
||||
{
|
||||
UTF16Test test = new UTF16Test();
|
||||
// test.TestIndexOf();
|
||||
test.run(arg);
|
||||
// test.TestCodePointCompare();
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
@ -1294,5 +1493,7 @@ public final class UTF16Test extends TestFmwk
|
||||
private final static String INDEXOF_SUPPLEMENTARY_STR_ = "\udc02\ud841";
|
||||
private final static int INDEXOF_SUPPLEMENTARY_STR_INDEX_[] =
|
||||
{3, 16};
|
||||
|
||||
// private methods ---------------------------------------------------
|
||||
}
|
||||
|
||||
|
@ -1,12 +1,12 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* Copyright (C) 1996-2002, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UTF16.java,v $
|
||||
* $Date: 2002/07/16 00:21:13 $
|
||||
* $Revision: 1.22 $
|
||||
* $Date: 2002/10/26 05:50:40 $
|
||||
* $Revision: 1.23 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -16,15 +16,15 @@ package com.ibm.icu.text;
|
||||
import com.ibm.icu.impl.UCharacterProperty;
|
||||
import com.ibm.icu.impl.NormalizerImpl;
|
||||
/**
|
||||
* Standalone utility class providing UTF16 character conversions and indexing
|
||||
* conversions.
|
||||
* <p>Standalone utility class providing UTF16 character conversions and indexing
|
||||
* conversions.</p>
|
||||
* <p>Code that uses strings alone rarely need modification.
|
||||
* By design, UTF-16 does not allow overlap, so searching for strings is a safe
|
||||
* operation. Similarly, concatenation is always safe. Substringing is safe if
|
||||
* the start and end are both on UTF-32 boundaries. In normal code, the values
|
||||
* for start and end are on those boundaries, since they arose from operations
|
||||
* like searching. If not, the nearest UTF-32 boundaries can be determined
|
||||
* using <code>bounds()</code>.
|
||||
* using <code>bounds()</code>.</p>
|
||||
* <strong>Examples:</strong>
|
||||
* <p>The following examples illustrate use of some of these methods.
|
||||
* <pre>
|
||||
@ -393,30 +393,6 @@ public final class UTF16
|
||||
return single; // return unmatched surrogate
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract a single UTF-32 value from a string.
|
||||
* If a validity check is required, use
|
||||
* <code><a href="../UCharacter.html#isLegal(char)">
|
||||
* UCharacter.isLegal()</a></code> on the return value.
|
||||
* If tbe char retrieved is part of a surrogate pair, its supplementary
|
||||
* character will be returned. If a complete supplementary character is
|
||||
* not found the incomplete character will be returned
|
||||
* @return UTF-32 value for the UTF-32 value that contains the char at
|
||||
* offset16. The boundaries of that codepoint are the same as in
|
||||
* <code>bounds32()</code>.
|
||||
* @param source array of UTF-16 chars
|
||||
* @param offset32 UTF-32 offset to the start of the character.
|
||||
* @return a single UTF32 value
|
||||
* @exception IndexOutOfBoundsException if offset16 is out of bounds.
|
||||
* @deprecated to be removed after the year 2002, replaced by
|
||||
* UTF16.charAt(source, UTF16.findOffsetFromCodePoint(source,
|
||||
* offset32));
|
||||
*/
|
||||
public static int charAtCodePointOffset(String source, int offset32)
|
||||
{
|
||||
return charAt(source, findOffsetFromCodePoint(source, offset32));
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines how many chars this char32 requires.
|
||||
* If a validity check is required, use <code>
|
||||
@ -569,30 +545,7 @@ public final class UTF16
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the type of the boundaries around the char at offset32. Used
|
||||
* for random access.
|
||||
* @param source string to analyse
|
||||
* @param offset32 UTF32 offset
|
||||
* @return
|
||||
* <ul>
|
||||
* <li> SINGLE_CHAR_BOUNDARY : a single char
|
||||
* <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at
|
||||
* offset32
|
||||
* </ul>
|
||||
* For bit-twiddlers, see <a href=#bounds(java.lang.String, int)>
|
||||
* bounds(java.lang.String, int)</a> for information on the choice of the
|
||||
* boundary values.
|
||||
* @exception IndexOutOfBoundsException if offset16 is out of bounds.
|
||||
* @deprecated will be removed after end of year 2002, replaced by
|
||||
* UTF16.bounds(source, UTF16.findOffsetFromCodePoint(source, offset32));
|
||||
*/
|
||||
public static int boundsAtCodePointOffset(String source, int offset32)
|
||||
{
|
||||
return bounds(source, findOffsetFromCodePoint(source, offset32));
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the <b>code value is a surrogate.
|
||||
* Determines whether the code value is a surrogate.
|
||||
* @param ch the input character.
|
||||
* @return true iff the input character is a surrogate.
|
||||
*/
|
||||
@ -1146,26 +1099,6 @@ public final class UTF16
|
||||
return findCodePointOffset(source, start, limit, limit - start);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets a code point into a UTF32 position.
|
||||
* Adjusts target according if we are replacing a non-supplementary
|
||||
* codepoint with a supplementary and vice versa.
|
||||
* @param target stringbuffer
|
||||
* @param offset32 UTF32 position to insert into
|
||||
* @exception IndexOutOfBoundsException if offset32 is out of bounds.
|
||||
* @param char32 code point
|
||||
* @deprecated to be removed after the year 2002,
|
||||
* UTF16.setCharAt(target,
|
||||
* findOffsetFromCodePoint(target.toString(), offset32),
|
||||
* char32);
|
||||
*/
|
||||
public static void setCharAtCodePointOffset(StringBuffer target,
|
||||
int offset32, int char32)
|
||||
{
|
||||
int offset16 = findOffsetFromCodePoint(target.toString(), offset32);
|
||||
setCharAt(target, offset16, char32);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set a code point into a UTF16 position.
|
||||
* Adjusts target according if we are replacing a non-supplementary
|
||||
@ -2116,106 +2049,506 @@ public final class UTF16
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Compare strings using Unicode code point order, instead of UTF-16 code
|
||||
* unit order.
|
||||
* Check if the string contains more Unicode code points than a certain
|
||||
* number. This is more efficient than counting all code points in the
|
||||
* entire string and comparing that number with a threshold.
|
||||
* This function may not need to scan the string at all if the length is
|
||||
* within a certain range, and never needs to count more than 'number + 1'
|
||||
* code points. Logically equivalent to (countCodePoint(s) > number). A
|
||||
* Unicode code point may occupy either one or two code units.
|
||||
* @param source The input string.
|
||||
* @param number The number of code points in the string is compared
|
||||
* against the 'number' parameter.
|
||||
* @return boolean value for whether the string contains more Unicode code
|
||||
* points than 'number'.
|
||||
* @draft 2.4
|
||||
*/
|
||||
public static boolean hasMoreCodePointsThan(String source, int number)
|
||||
{
|
||||
if (number < 0) {
|
||||
return true;
|
||||
}
|
||||
if (source == null) {
|
||||
return false;
|
||||
}
|
||||
int length = source.length();
|
||||
|
||||
// length >= 0 known
|
||||
// source contains at least (length + 1) / 2 code points: <= 2
|
||||
// chars per cp
|
||||
if (((length + 1) >> 1) > number) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// check if source does not even contain enough chars
|
||||
int maxsupplementary = length - number;
|
||||
if (maxsupplementary <= 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// there are maxsupplementary = length - number more chars than
|
||||
// asked-for code points
|
||||
|
||||
// count code points until they exceed and also check that there are
|
||||
// no more than maxsupplementary supplementary code points (char pairs)
|
||||
int start = 0;
|
||||
while (true) {
|
||||
if (length == 0) {
|
||||
return false;
|
||||
}
|
||||
if (number == 0) {
|
||||
return true;
|
||||
}
|
||||
if (isLeadSurrogate(source.charAt(start ++)) && start != length
|
||||
&& isTrailSurrogate(source.charAt(start))) {
|
||||
start ++;
|
||||
if (-- maxsupplementary <= 0) {
|
||||
// too many pairs - too few code points
|
||||
return false;
|
||||
}
|
||||
}
|
||||
-- number;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the sub-range of char array, from argument start to limit,
|
||||
* contains more Unicode code points than a certain
|
||||
* number. This is more efficient than counting all code points in the
|
||||
* entire char array range and comparing that number with a threshold.
|
||||
* This function may not need to scan the char array at all if start and
|
||||
* limit is within a certain range, and never needs to count more than
|
||||
* 'number + 1' code points.
|
||||
* Logically equivalent to (countCodePoint(source, start, limit) > number).
|
||||
* A Unicode code point may occupy either one or two code units.
|
||||
* @param source array of UTF-16 chars
|
||||
* @param start offset of the first char in the source array to analyze
|
||||
* @param limit offset just past the last char in the source array to analyze
|
||||
* @param number The number of code points in the string is compared
|
||||
* against the 'number' parameter.
|
||||
* @return boolean value for whether the string contains more Unicode code
|
||||
* points than 'number'.
|
||||
* @exception IndexOutOfBoundsException thrown when start or limit is negative, or when limit < start
|
||||
* @draft 2.4
|
||||
*/
|
||||
public static boolean hasMoreCodePointsThan(char source[], int start,
|
||||
int limit, int number)
|
||||
{
|
||||
int length = limit - start;
|
||||
if (length < 0 || start < 0 || limit < 0) {
|
||||
throw new IndexOutOfBoundsException(
|
||||
"Start and limit indexes should be non-negative and start <= limit");
|
||||
}
|
||||
if (number < 0) {
|
||||
return true;
|
||||
}
|
||||
if (source == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// length >= 0 known
|
||||
// source contains at least (length + 1) / 2 code points: <= 2
|
||||
// chars per cp
|
||||
if (((length + 1) >> 1) > number) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// check if source does not even contain enough chars
|
||||
int maxsupplementary = length - number;
|
||||
if (maxsupplementary <= 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// there are maxsupplementary = length - number more chars than
|
||||
// asked-for code points
|
||||
|
||||
// count code points until they exceed and also check that there are
|
||||
// no more than maxsupplementary supplementary code points (char pairs)
|
||||
while (true) {
|
||||
if (length == 0) {
|
||||
return false;
|
||||
}
|
||||
if (number == 0) {
|
||||
return true;
|
||||
}
|
||||
if (isLeadSurrogate(source[start ++]) && start != limit
|
||||
&& isTrailSurrogate(source[start])) {
|
||||
start ++;
|
||||
if (-- maxsupplementary <= 0) {
|
||||
// too many pairs - too few code points
|
||||
return false;
|
||||
}
|
||||
}
|
||||
-- number;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the string buffer contains more Unicode code points than a
|
||||
* certain number. This is more efficient than counting all code points in
|
||||
* the entire string buffer and comparing that number with a threshold.
|
||||
* This function may not need to scan the string buffer at all if the
|
||||
* length is within a certain range, and never needs to count more than
|
||||
* 'number + 1' code points. Logically equivalent to
|
||||
* (countCodePoint(s) > number). A Unicode code point may occupy either one
|
||||
* or two code units.
|
||||
* @param source The input string buffer.
|
||||
* @param number The number of code points in the string buffer is compared
|
||||
* against the 'number' parameter.
|
||||
* @return boolean value for whether the string buffer contains more
|
||||
* Unicode code points than 'number'.
|
||||
* @draft 2.4
|
||||
*/
|
||||
public static boolean hasMoreCodePointsThan(StringBuffer source, int number)
|
||||
{
|
||||
if (number < 0) {
|
||||
return true;
|
||||
}
|
||||
if (source == null) {
|
||||
return false;
|
||||
}
|
||||
int length = source.length();
|
||||
|
||||
// length >= 0 known
|
||||
// source contains at least (length + 1) / 2 code points: <= 2
|
||||
// chars per cp
|
||||
if (((length + 1) >> 1) > number) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// check if source does not even contain enough chars
|
||||
int maxsupplementary = length - number;
|
||||
if (maxsupplementary <= 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// there are maxsupplementary = length - number more chars than
|
||||
// asked-for code points
|
||||
|
||||
// count code points until they exceed and also check that there are
|
||||
// no more than maxsupplementary supplementary code points (char pairs)
|
||||
int start = 0;
|
||||
while (true) {
|
||||
if (length == 0) {
|
||||
return false;
|
||||
}
|
||||
if (number == 0) {
|
||||
return true;
|
||||
}
|
||||
if (isLeadSurrogate(source.charAt(start ++)) && start != length
|
||||
&& isTrailSurrogate(source.charAt(start))) {
|
||||
start ++;
|
||||
if (-- maxsupplementary <= 0) {
|
||||
// too many pairs - too few code points
|
||||
return false;
|
||||
}
|
||||
}
|
||||
-- number;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>UTF16 string comparator class.
|
||||
* Allows UTF16 string comparison to be done with the various modes</p>
|
||||
* <ul>
|
||||
* <li> Code point comparison or code unit comparison
|
||||
* <li> Case sensitive comparison, case insensitive comparison or case
|
||||
* insensitive comparison with special handling for character 'i'.
|
||||
* </ul>
|
||||
* <p>The code unit or code point comparison differ only when comparing
|
||||
* supplementary code points (U+10000..U+10FFFF) to BMP code points
|
||||
* near the end of the BMP (i.e., U+E000..U+FFFF). In code unit
|
||||
* comparison, high BMP code points sort after supplementary code points
|
||||
* because supplementary code points are stored as pairs of surrogates,
|
||||
* which are at U+D800..U+DFFF.</p>
|
||||
* @see #FOLD_CASE_DEFAULT
|
||||
* @see #FOLD_CASE_EXCLUDE_SPECIAL_I
|
||||
* @stable
|
||||
*/
|
||||
public static final class StringComparator implements java.util.Comparator
|
||||
{
|
||||
// public constructor ------------------------------------------------
|
||||
|
||||
/**
|
||||
* Standard String compare. Only one small section is different, marked in
|
||||
* the code.
|
||||
*/
|
||||
public int compare(Object a, Object b)
|
||||
* Default constructor that does code unit comparison and case
|
||||
* sensitive comparison.
|
||||
*/
|
||||
public StringComparator()
|
||||
{
|
||||
if (a == b) {
|
||||
return 0;
|
||||
}
|
||||
if (a == null) {
|
||||
return -1;
|
||||
}
|
||||
if (b == null) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
String sa = (String) a;
|
||||
String sb = (String) b;
|
||||
int lena = sa.length();
|
||||
int lenb = sb.length();
|
||||
int len = lena;
|
||||
if (len > lenb) {
|
||||
len = lenb;
|
||||
}
|
||||
|
||||
for (int i = 0; i < len; ++i)
|
||||
{
|
||||
char ca = sa.charAt(i);
|
||||
char cb = sb.charAt(i);
|
||||
if (ca == cb) {
|
||||
continue; // skip remap if equal
|
||||
}
|
||||
|
||||
// start of only different section
|
||||
// if either code unit is below 0xd800, i.e., below the
|
||||
// surrogate range, then nothing needs to be done
|
||||
|
||||
// if both are >=0xd800 then special code adjusts code unit
|
||||
// values so that all BMP code points (including single
|
||||
// surrogate code points) sort below supplementary ones
|
||||
|
||||
// this is necessary because surrogates are not at the end of
|
||||
// the code unit range
|
||||
if (ca >= LEAD_SURROGATE_MIN_VALUE
|
||||
&& cb >= LEAD_SURROGATE_MIN_VALUE) {
|
||||
// subtract 0x2800 from BMP code points to make them
|
||||
// smaller than supplementary ones
|
||||
if ((ca <= LEAD_SURROGATE_MAX_VALUE && (i + 1) < lena
|
||||
&& isTrailSurrogate(sa.charAt(i + 1)))
|
||||
|| (isTrailSurrogate(ca) && i > 0
|
||||
&& isLeadSurrogate(sa.charAt(i - 1)))) {
|
||||
// part of a surrogate pair, leave >=d800
|
||||
}
|
||||
else {
|
||||
// BMP code point - may be surrogate code point - make
|
||||
// <d800
|
||||
ca -= 0x2800;
|
||||
}
|
||||
|
||||
if ((cb <= LEAD_SURROGATE_MAX_VALUE && (i + 1) < lenb
|
||||
&& isTrailSurrogate(sb.charAt(i + 1)))
|
||||
|| (isTrailSurrogate(cb) && i > 0
|
||||
&& isLeadSurrogate(sb.charAt(i - 1)))) {
|
||||
// part of a surrogate pair, leave >=d800
|
||||
}
|
||||
else {
|
||||
// BMP code point - may be surrogate code point - make
|
||||
// < d800
|
||||
cb -= 0x2800;
|
||||
}
|
||||
}
|
||||
|
||||
// end of only different section
|
||||
|
||||
if (ca < cb) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 1; // wasn't equal, so return 1
|
||||
}
|
||||
|
||||
if (lena < lenb) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (lena > lenb) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
m_codePointCompare_ = false;
|
||||
m_ignoreCase_ = false;
|
||||
m_foldCase_ = FOLD_CASE_DEFAULT;
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor that does comparison based on the argument options.
|
||||
* @param codepointcompare flag to indicate true for code point
|
||||
* comparison or false for code unit comparison.
|
||||
* @param ignorecase false for case sensitive comparison, true for
|
||||
* case-insensitive comparison
|
||||
* @param foldcaseoption FOLD_CASE_DEFAULT or
|
||||
* FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only when
|
||||
* ignorecase is set to true. If ignorecase is false, this option
|
||||
* is ignored.
|
||||
* @see #FOLD_CASE_DEFAULT
|
||||
* @see #FOLD_CASE_EXCLUDE_SPECIAL_I
|
||||
* @throws IllegalArgumentException if foldcaseoption is out of range
|
||||
*/
|
||||
public StringComparator(boolean codepointcompare,
|
||||
boolean ignorecase,
|
||||
int foldcaseoption)
|
||||
{
|
||||
m_codePointCompare_ = codepointcompare;
|
||||
m_ignoreCase_ = ignorecase;
|
||||
if (foldcaseoption < FOLD_CASE_DEFAULT
|
||||
|| foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
|
||||
throw new IllegalArgumentException("Invalid fold case option");
|
||||
}
|
||||
m_foldCase_ = foldcaseoption;
|
||||
}
|
||||
|
||||
// public data member ------------------------------------------------
|
||||
|
||||
/**
|
||||
* <p>Option value for case folding comparison:</p>
|
||||
* <p>Comparison is case insensitive, strings are folded using default
|
||||
* mappings defined in Unicode data file CaseFolding.txt, before
|
||||
* comparison.
|
||||
* </p>
|
||||
* @draft 2.4
|
||||
*/
|
||||
public static final int FOLD_CASE_DEFAULT = 0;
|
||||
/**
|
||||
* <p>Option value for case folding comparison:</p>
|
||||
* <p>Comparison is case insensitive, strings are folded using modified
|
||||
* mappings defined in Unicode data file CaseFolding.txt, before
|
||||
* comparison.
|
||||
* </p>
|
||||
* <p>The modified set of mappings is provided in a Unicode data file
|
||||
* CaseFolding.txt to handle dotted I and dotless i appropriately for
|
||||
* Turkic languages (tr, az).</p>
|
||||
* <p>Before Unicode 3.2, CaseFolding.txt contains mappings marked with
|
||||
* 'I' that are to be included for default mappings and excluded for
|
||||
* the Turkic-specific mappings.</p>
|
||||
* <p>Unicode 3.2 CaseFolding.txt instead contains mappings marked with
|
||||
* 'T' that are to be excluded for default mappings and included for
|
||||
* the Turkic-specific mappings.</p>
|
||||
* @draft 2.4
|
||||
*/
|
||||
public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1; // must differ from FOLD_CASE_DEFAULT (0) so the two fold case options are distinguishable
|
||||
|
||||
// public methods ----------------------------------------------------
|
||||
|
||||
// public setters ----------------------------------------------------
|
||||
|
||||
/**
|
||||
* Sets the comparison mode to code point compare if flag is true.
|
||||
* Otherwise comparison mode is set to code unit compare
|
||||
* @param flag true for code point compare, false for code unit compare
|
||||
*/
|
||||
public void setCodePointCompare(boolean flag)
|
||||
{
|
||||
m_codePointCompare_ = flag;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the Comparator to case-insensitive comparison mode if argument
|
||||
* is true, otherwise case sensitive comparison mode if set to false.
|
||||
* @param ignorecase true for case-insensitive comparison, false for
|
||||
* case sensitive comparison
|
||||
* @param foldcaseoption FOLD_CASE_DEFAULT or
|
||||
* FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only when
|
||||
* ignorecase is set to true. If ignorecase is false, this option
|
||||
* is ignored.
|
||||
* @see #FOLD_CASE_DEFAULT
|
||||
* @see #FOLD_CASE_EXCLUDE_SPECIAL_I
|
||||
*/
|
||||
public void setIgnoreCase(boolean ignorecase, int foldcaseoption)
|
||||
{
|
||||
m_ignoreCase_ = ignorecase;
|
||||
if (foldcaseoption < FOLD_CASE_DEFAULT
|
||||
|| foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
|
||||
throw new IllegalArgumentException("Invalid fold case option");
|
||||
}
|
||||
m_foldCase_ = foldcaseoption;
|
||||
}
|
||||
|
||||
// public getters ----------------------------------------------------
|
||||
|
||||
/**
|
||||
* Checks if the comparison mode is code point compare.
|
||||
* @return true for code point compare, false for code unit compare
|
||||
*/
|
||||
public boolean getCodePointCompare()
|
||||
{
|
||||
return m_codePointCompare_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if Comparator is in the case insensitive mode.
|
||||
* @return true if Comparator performs case insensitive comparison,
|
||||
* false otherwise
|
||||
*/
|
||||
public boolean getIgnoreCase()
|
||||
{
|
||||
return m_ignoreCase_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the fold case options set in Comparator to be used with case
|
||||
* insensitive comparison.
|
||||
* @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I
|
||||
* @see #FOLD_CASE_DEFAULT
|
||||
* @see #FOLD_CASE_EXCLUDE_SPECIAL_I
|
||||
*/
|
||||
public int getIgnoreCaseOption()
|
||||
{
|
||||
return m_foldCase_;
|
||||
}
|
||||
|
||||
// public other methods ----------------------------------------------
|
||||
|
||||
/**
|
||||
* Compare two strings depending on the options selected during
|
||||
* construction.
|
||||
* @param a first source string.
|
||||
* @param b second source string.
|
||||
* @return 0 returned if a == b. If a < b, a negative value is returned.
|
||||
* Otherwise if a > b, a positive value is returned.
|
||||
* @exception ClassCastException thrown when either a or b is not a
|
||||
* String object
|
||||
* @draft 2.4
|
||||
*/
|
||||
public int compare(Object a, Object b)
|
||||
{
|
||||
String str1 = (String)a;
|
||||
String str2 = (String)b;
|
||||
|
||||
if (str1 == str2) {
|
||||
return 0;
|
||||
}
|
||||
if (str1 == null) {
|
||||
return -1;
|
||||
}
|
||||
if (str2 == null) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (m_ignoreCase_) {
|
||||
return compareCaseInsensitive(str1, str2);
|
||||
}
|
||||
return compareCaseSensitive(str1, str2);
|
||||
}
|
||||
|
||||
// private data member ----------------------------------------------
|
||||
|
||||
/**
|
||||
* Code point comparison flag. True if code point comparison is required.
|
||||
* False if code unit comparison is required.
|
||||
*/
|
||||
private boolean m_codePointCompare_;
|
||||
/**
|
||||
* Fold case option, combined into the options of the case insensitive comparison.
|
||||
*/
|
||||
private int m_foldCase_;
|
||||
/**
|
||||
* Flag indicating whether case is to be ignored during comparison
|
||||
*/
|
||||
private boolean m_ignoreCase_;
|
||||
/**
|
||||
* Offset subtracted from unpaired code units in or above the surrogate range for code point order
|
||||
*/
|
||||
private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800;
|
||||
|
||||
// private method ---------------------------------------------------
|
||||
|
||||
/**
|
||||
* Compares case insensitive. This is a direct port of ICU4C, to make
|
||||
* maintainence life easier.
|
||||
* @param s1 first string to compare
|
||||
* @param s2 second string to compare
|
||||
* @return -1 is s1 < s2, 0 if equals,
|
||||
*/
|
||||
private int compareCaseInsensitive(String s1, String s2)
|
||||
{
|
||||
return NormalizerImpl.cmpEquivFold(s1, s2,
|
||||
m_foldCase_ |
|
||||
Normalizer.COMPARE_IGNORE_CASE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares case sensitive. This is a direct port of ICU4C, to make
|
||||
* maintainence life easier.
|
||||
* @param s1 first string to compare
|
||||
* @param s2 second string to compare
|
||||
* @return -1 is s1 < s2, 0 if equals,
|
||||
*/
|
||||
private int compareCaseSensitive(String s1, String s2)
|
||||
{
|
||||
// compare identical prefixes - they do not need to be fixed up
|
||||
// limit1 = start1 + min(lenght1, length2)
|
||||
int length1 = s1.length();
|
||||
int length2 = s2.length();
|
||||
int minlength = length1;
|
||||
int result = 0;
|
||||
if (length1 < length2) {
|
||||
result = -1;
|
||||
}
|
||||
else if (length1 > length2) {
|
||||
result = 1;
|
||||
}
|
||||
|
||||
char c1 = 0;
|
||||
char c2 = 0;
|
||||
int index = 0;
|
||||
for (; index < minlength; index ++) {
|
||||
c1 = s1.charAt(index);
|
||||
c2 = s2.charAt(index);
|
||||
// check pseudo-limit
|
||||
if (c1 != c2) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (index == minlength) {
|
||||
return result;
|
||||
}
|
||||
|
||||
// if both values are in or above the surrogate range, fix them up
|
||||
if (c1 >= LEAD_SURROGATE_MIN_VALUE
|
||||
&& c2 >= LEAD_SURROGATE_MIN_VALUE && m_codePointCompare_) {
|
||||
// subtract 0x2800 from BMP code points to make them smaller
|
||||
// than supplementary ones
|
||||
if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1
|
||||
&& isTrailSurrogate(s1.charAt(index + 1)))
|
||||
|| (isTrailSurrogate(c1) && index != 0
|
||||
&& isLeadSurrogate(s1.charAt(index - 1)))) {
|
||||
// part of a surrogate pair, leave >=d800
|
||||
}
|
||||
else {
|
||||
// BMP code point - may be surrogate code point - make
|
||||
// < d800
|
||||
c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
|
||||
}
|
||||
|
||||
if ((c2 <= LEAD_SURROGATE_MAX_VALUE
|
||||
&& (index + 1) != length2
|
||||
&& isTrailSurrogate(s2.charAt(index + 1))) ||
|
||||
(isTrailSurrogate(c2) && index != 0
|
||||
&& isLeadSurrogate(s2.charAt(index - 1)))) {
|
||||
// part of a surrogate pair, leave >=d800
|
||||
}
|
||||
else {
|
||||
// BMP code point - may be surrogate code point - make <d800
|
||||
c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
|
||||
}
|
||||
}
|
||||
|
||||
// now c1 and c2 are in UTF-32-compatible order
|
||||
return c1 - c2;
|
||||
}
|
||||
}
|
||||
|
||||
// private data members -------------------------------------------------
|
||||
@ -2234,8 +2567,8 @@ public final class UTF16
|
||||
private static final int LEAD_SURROGATE_OFFSET_ =
|
||||
LEAD_SURROGATE_MIN_VALUE -
|
||||
(SUPPLEMENTARY_MIN_VALUE
|
||||
>> LEAD_SURROGATE_SHIFT_);
|
||||
|
||||
>> LEAD_SURROGATE_SHIFT_);
|
||||
|
||||
// private methods ------------------------------------------------------
|
||||
|
||||
/**
|
||||
@ -2248,6 +2581,7 @@ public final class UTF16
|
||||
* points, 2 otherwise.</p>
|
||||
* @param ch code point
|
||||
* @return string representation of the code point
|
||||
* @deprecated since 2.4, use UCharacter.toString(int) instead
|
||||
*/
|
||||
public static String toString(int ch)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user