ICU-1542 fix indexOf(UChar32 surrogate), lastIndexOf(UChar32 surrogate), u_strchr32(surrogate) to match char32At()/UTF_GET_CHAR()

X-SVN-Rev: 7744
This commit is contained in:
Markus Scherer 2002-02-22 02:00:42 +00:00
parent 4178e90458
commit b24a8e910f
6 changed files with 269 additions and 50 deletions

View File

@ -847,6 +847,20 @@ public:
/**
* Locate in this the first occurrence of the code point <TT>c</TT>,
* using bitwise comparison.
*
* This function searches for code points rather than code units.
* For BMP code points, its results differ from indexOf(UChar c, ...) only for surrogates:
* While indexOf(UChar c, ...) finds any surrogate code units in a string,
* indexOf(UChar32 c, ...) finds only unmatched surrogate code points,
* i.e., only those that do not combine with an adjacent surrogate
* to form a supplementary code point.
* For example, in a string "\ud800\udc00" indexOf(UChar c, ...)
* will find code units U+d800 at 0 and U+dc00 at 1,
* but indexOf(UChar32 c, ...) will find neither because they
* combine to the code point U+10000.
* Either function will find U+d800 in "a\ud800b".
* This behavior ensures that char32At(indexOf(UChar32 c))==c.
*
* @param c The code point to search for.
* @return The offset into this of <TT>c</TT>, or -1 if not found.
* @stable
@ -867,6 +881,20 @@ public:
/**
* Locate in this the first occurrence of the code point <TT>c</TT>
* starting at offset <TT>start</TT>, using bitwise comparison.
*
* This function searches for code points rather than code units.
* For BMP code points, its results differ from indexOf(UChar c, ...) only for surrogates:
* While indexOf(UChar c, ...) finds any surrogate code units in a string,
* indexOf(UChar32 c, ...) finds only unmatched surrogate code points,
* i.e., only those that do not combine with an adjacent surrogate
* to form a supplementary code point.
* For example, in a string "\ud800\udc00" indexOf(UChar c, ...)
* will find code units U+d800 at 0 and U+dc00 at 1,
* but indexOf(UChar32 c, ...) will find neither because they
* combine to the code point U+10000.
* Either function will find U+d800 in "a\ud800b".
* This behavior ensures that char32At(indexOf(UChar32 c))==c.
*
* @param c The code point to search for.
* @param start The offset at which searching will start.
* @return The offset into this of <TT>c</TT>, or -1 if not found.
@ -893,6 +921,20 @@ public:
* Locate in this the first occurrence of the code point <TT>c</TT>
* in the range [<TT>start</TT>, <TT>start + length</TT>),
* using bitwise comparison.
*
* This function searches for code points rather than code units.
* For BMP code points, its results differ from indexOf(UChar c, ...) only for surrogates:
* While indexOf(UChar c, ...) finds any surrogate code units in a string,
* indexOf(UChar32 c, ...) finds only unmatched surrogate code points,
* i.e., only those that do not combine with an adjacent surrogate
* to form a supplementary code point.
* For example, in a string "\ud800\udc00" indexOf(UChar c, ...)
* will find code units U+d800 at 0 and U+dc00 at 1,
* but indexOf(UChar32 c, ...) will find neither because they
* combine to the code point U+10000.
* Either function will find U+d800 in "a\ud800b".
* This behavior ensures that char32At(indexOf(UChar32 c))==c.
*
* @param c The code point to search for.
* @param start the offset into this at which to start matching
* @param length the number of characters in this to search
@ -1027,6 +1069,20 @@ public:
/**
* Locate in this the last occurrence of the code point <TT>c</TT>,
* using bitwise comparison.
*
* This function searches for code points rather than code units.
* For BMP code points, its results differ from lastIndexOf(UChar c, ...) only for surrogates:
* While lastIndexOf(UChar c, ...) finds any surrogate code units in a string,
* lastIndexOf(UChar32 c, ...) finds only unmatched surrogate code points,
* i.e., only those that do not combine with an adjacent surrogate
* to form a supplementary code point.
* For example, in a string "\ud800\udc00" lastIndexOf(UChar c, ...)
* will find code units U+d800 at 0 and U+dc00 at 1,
* but lastIndexOf(UChar32 c, ...) will find neither because they
* combine to the code point U+10000.
* Either function will find U+d800 in "a\ud800b".
* This behavior ensures that char32At(lastIndexOf(UChar32 c))==c.
*
* @param c The code point to search for.
* @return The offset into this of <TT>c</TT>, or -1 if not found.
* @stable
@ -1047,6 +1103,20 @@ public:
/**
* Locate in this the last occurrence of the code point <TT>c</TT>
* starting at offset <TT>start</TT>, using bitwise comparison.
*
* This function searches for code points rather than code units.
* For BMP code points, its results differ from lastIndexOf(UChar c, ...) only for surrogates:
* While lastIndexOf(UChar c, ...) finds any surrogate code units in a string,
* lastIndexOf(UChar32 c, ...) finds only unmatched surrogate code points,
* i.e., only those that do not combine with an adjacent surrogate
* to form a supplementary code point.
* For example, in a string "\ud800\udc00" lastIndexOf(UChar c, ...)
* will find code units U+d800 at 0 and U+dc00 at 1,
* but lastIndexOf(UChar32 c, ...) will find neither because they
* combine to the code point U+10000.
* Either function will find U+d800 in "a\ud800b".
* This behavior ensures that char32At(lastIndexOf(UChar32 c))==c.
*
* @param c The code point to search for.
* @param start The offset at which searching will start.
* @return The offset into this of <TT>c</TT>, or -1 if not found.
@ -1073,6 +1143,20 @@ public:
* Locate in this the last occurrence of the code point <TT>c</TT>
* in the range [<TT>start</TT>, <TT>start + length</TT>),
* using bitwise comparison.
*
* This function searches for code points rather than code units.
* For BMP code points, its results differ from lastIndexOf(UChar c, ...) only for surrogates:
* While lastIndexOf(UChar c, ...) finds any surrogate code units in a string,
* lastIndexOf(UChar32 c, ...) finds only unmatched surrogate code points,
* i.e., only those that do not combine with an adjacent surrogate
* to form a supplementary code point.
* For example, in a string "\ud800\udc00" lastIndexOf(UChar c, ...)
* will find code units U+d800 at 0 and U+dc00 at 1,
* but lastIndexOf(UChar32 c, ...) will find neither because they
* combine to the code point U+10000.
* Either function will find U+d800 in "a\ud800b".
* This behavior ensures that char32At(lastIndexOf(UChar32 c))==c.
*
* @param c The code point to search for.
* @param start the offset into this at which to start matching
* @param length the number of characters in this to search
@ -2687,10 +2771,20 @@ private:
UTextOffset start,
int32_t length) const;
// Code point search, first occurrence in [start, start + length).
// only for c>=0xd800: the inline indexOf(UChar32 c, start, length)
// sends smaller values to the UChar overload instead.
UTextOffset doIndexOf(UChar32 c,
UTextOffset start,
int32_t length) const;
// Code unit search, last occurrence in [start, start + length).
UTextOffset doLastIndexOf(UChar c,
UTextOffset start,
int32_t length) const;
// Code point search, last occurrence in [start, start + length).
// only for c>=0xd800: the inline lastIndexOf(UChar32 c, start, length)
// sends smaller values to the UChar overload instead.
UTextOffset doLastIndexOf(UChar32 c,
UTextOffset start,
int32_t length) const;
void doExtract(UTextOffset start,
int32_t length,
UChar *dst,
@ -3161,14 +3255,7 @@ UnicodeString::indexOf(UChar c) const
// Locate the first occurrence of the code point c in the whole string.
// Searches the range [0, fLength) by delegating to
// indexOf(UChar32 c, start, length), so that all indexOf(UChar32 ...)
// overloads share a single implementation.
// Returns the offset of c, or -1 if not found.
inline UTextOffset
UnicodeString::indexOf(UChar32 c) const {
  return indexOf(c, 0, fLength);
}
inline UTextOffset
@ -3179,14 +3266,7 @@ UnicodeString::indexOf(UChar c,
// Locate the first occurrence of the code point c at or after start.
// Searches the range [start, fLength) by delegating to
// indexOf(UChar32 c, start, length), so that all indexOf(UChar32 ...)
// overloads share a single implementation.
// Returns the offset of c, or -1 if not found.
inline UTextOffset
UnicodeString::indexOf(UChar32 c,
               UTextOffset start) const {
  return indexOf(c, start, fLength - start);
}
inline UTextOffset
@ -3199,13 +3279,10 @@ inline UTextOffset
UnicodeString::indexOf(UChar32 c,
               UTextOffset start,
               int32_t length) const {
  // Locate the first occurrence of the code point c in
  // [start, start + length); returns its offset, or -1 if not found.
  if((uint32_t)c<0xd800) {
    // Below the surrogate range a code point is exactly one code unit,
    // so the plain code unit search is sufficient.
    return doIndexOf((UChar)c, start, length);
  } else {
    // Surrogates, the rest of the BMP and supplementary code points
    // (and, through the unsigned compare, negative values) go to the
    // out-of-line code point search.
    return doIndexOf(c, start, length);
  }
}
@ -3259,14 +3336,7 @@ UnicodeString::lastIndexOf(UChar c) const
// Locate the last occurrence of the code point c in the whole string.
// Searches the range [0, fLength) by delegating to
// lastIndexOf(UChar32 c, start, length), so that all
// lastIndexOf(UChar32 ...) overloads share a single implementation.
// Returns the offset of c, or -1 if not found.
inline UTextOffset
UnicodeString::lastIndexOf(UChar32 c) const {
  return lastIndexOf(c, 0, fLength);
}
inline UTextOffset
@ -3277,14 +3347,7 @@ UnicodeString::lastIndexOf(UChar c,
// Locate the last occurrence of the code point c at or after start.
// Searches the range [start, fLength) by delegating to
// lastIndexOf(UChar32 c, start, length), so that all
// lastIndexOf(UChar32 ...) overloads share a single implementation.
// Returns the offset of c, or -1 if not found.
inline UTextOffset
UnicodeString::lastIndexOf(UChar32 c,
               UTextOffset start) const {
  return lastIndexOf(c, start, fLength - start);
}
inline UTextOffset
@ -3297,13 +3360,10 @@ inline UTextOffset
UnicodeString::lastIndexOf(UChar32 c,
               UTextOffset start,
               int32_t length) const {
  // Locate the last occurrence of the code point c in
  // [start, start + length); returns its offset, or -1 if not found.
  if((uint32_t)c<0xd800) {
    // Below the surrogate range a code point is exactly one code unit,
    // so the plain code unit search is sufficient.
    return doLastIndexOf((UChar)c, start, length);
  } else {
    // Surrogates, the rest of the BMP and supplementary code points
    // (and, through the unsigned compare, negative values) go to the
    // out-of-line code point search.
    return doLastIndexOf(c, start, length);
  }
}

View File

@ -144,6 +144,19 @@ u_strstr(const UChar *s, const UChar *substring);
/**
* Find the first occurrence of a specified code point in a string.
*
* This function searches for code points rather than code units.
* For BMP code points, its results differ from u_strchr() only for surrogates:
* While u_strchr() finds any surrogate code units in a string,
* u_strchr32() finds only unmatched surrogate code points,
* i.e., only those that do not combine with an adjacent surrogate
* to form a supplementary code point.
* For example, in a string "\ud800\udc00" u_strchr()
* will find code units U+d800 at 0 and U+dc00 at 1,
* but u_strchr32() will find neither because they
* combine to the code point U+10000.
* Either function will find U+d800 in "a\ud800b".
* This behavior ensures that UTF_GET_CHAR(u_strchr32(c))==c.
*
* @param s The string to search.
* @param c The code point (0..0x10ffff) to find.
* @return A pointer to the first occurrence of <TT>c</TT> in <TT>s</TT>,

View File

@ -803,6 +803,49 @@ UnicodeString::doIndexOf(UChar c,
return -1;
}
// Find the first occurrence of the code point c in [start, start + length).
//
// A surrogate value of c matches only an unpaired surrogate code unit:
// occurrences that combine with a neighbouring code unit into a
// supplementary code point are skipped.
//
// @param c      the code point to search for
// @param start  offset at which the search range begins (pinned)
// @param length number of code units in the search range (pinned)
// @return the offset of the first match, or -1 if there is none
UTextOffset
UnicodeString::doIndexOf(UChar32 c,
             UTextOffset start,
             int32_t length) const {
  // Values outside 0..0x10ffff are not code points and can never occur.
  // Without this check a negative c would be truncated to a UChar below,
  // and c>0x10ffff would produce a bogus lead/trail pair; either could
  // report a false match.
  if((uint32_t)c>0x10ffff) {
    return -1;
  }

  // pin indices
  pinIndices(start, length);
  if(length == 0) {
    return -1;
  }

  if(c<0xd800 || (0xdfff<c && c<=0xffff)) {
    // non-surrogate BMP code point: a single code unit search suffices
    // (c<0xd800 is normally handled by the inline function
    // indexOf(UChar32 c, start, length) already)
    return doIndexOf((UChar)c, start, length);
  } else if(c<=0xdfff) {
    // surrogate code point
    int32_t index;

    while(length>0) {
      index=doIndexOf((UChar)c, start, length);
      if(index<0) {
        return index;
      }

      // The neighbours are tested against the whole string (fLength, 0),
      // not only against the search range.
      if(
        UTF_IS_SURROGATE_FIRST(c) ?
          ((index+1)<fLength && UTF_IS_TRAIL(fArray[index+1])) :
          (index>0 && UTF_IS_LEAD(fArray[index-1]))
      ) {
        // matched surrogate, not a surrogate code point, continue searching
        // in the remainder of the range, just behind this code unit
        length-=(index+1)-start;
        start=index+1;
      } else {
        return index;
      }
    }
    return -1;
  } else {
    // supplementary code point, search for string
    UChar buffer[2] = { UTF16_LEAD(c), UTF16_TRAIL(c) };
    return indexOf(buffer, 2, start, length);
  }
}
UTextOffset
UnicodeString::lastIndexOf(const UChar *srcChars,
UTextOffset srcStart,
@ -880,6 +923,48 @@ UnicodeString::doLastIndexOf(UChar c,
return -1;
}
// Find the last occurrence of the code point c in [start, start + length).
//
// A surrogate value of c matches only an unpaired surrogate code unit:
// occurrences that combine with a neighbouring code unit into a
// supplementary code point are skipped.
//
// @param c      the code point to search for
// @param start  offset at which the search range begins (pinned)
// @param length number of code units in the search range (pinned)
// @return the offset of the last match, or -1 if there is none
UTextOffset
UnicodeString::doLastIndexOf(UChar32 c,
                 UTextOffset start,
                 int32_t length) const {
  // Values outside 0..0x10ffff are not code points and can never occur.
  // Without this check a negative c would be truncated to a UChar below,
  // and c>0x10ffff would produce a bogus lead/trail pair; either could
  // report a false match.
  if((uint32_t)c>0x10ffff) {
    return -1;
  }

  // pin indices
  pinIndices(start, length);
  if(length == 0) {
    return -1;
  }

  if(c<0xd800 || (0xdfff<c && c<=0xffff)) {
    // non-surrogate BMP code point: a single code unit search suffices
    // (c<0xd800 is normally handled by the inline function
    // lastIndexOf(UChar32 c, start, length) already)
    return doLastIndexOf((UChar)c, start, length);
  } else if(c<=0xdfff) {
    // surrogate code point
    int32_t index;

    while(length>0) {
      index=doLastIndexOf((UChar)c, start, length);
      if(index<0) {
        return index;
      }

      // The neighbours are tested against the whole string (fLength, 0),
      // not only against the search range.
      if(
        UTF_IS_SURROGATE_FIRST(c) ?
          ((index+1)<fLength && UTF_IS_TRAIL(fArray[index+1])) :
          (index>0 && UTF_IS_LEAD(fArray[index-1]))
      ) {
        // matched surrogate, not a surrogate code point, continue searching
        // in the part of the range that lies before this code unit
        length=index-start;
      } else {
        return index;
      }
    }
    return -1;
  } else {
    // supplementary code point, search for string
    UChar buffer[2] = { UTF16_LEAD(c), UTF16_TRAIL(c) };
    return lastIndexOf(buffer, 2, start, length);
  }
}
UnicodeString&
UnicodeString::findAndReplace(UTextOffset start,
int32_t length,

View File

@ -81,13 +81,40 @@ u_strstr(const UChar *s, const UChar *substring) {
/*
 * Find the first occurrence of the code point c in the NUL-terminated
 * string s.
 *
 * A surrogate value of c matches only an unpaired surrogate code unit;
 * occurrences that are part of a lead/trail pair are skipped.
 * Values of c outside 0..0x10ffff are not code points and are never found.
 *
 * Returns a pointer to the match, or NULL if there is none.
 */
U_CAPI UChar * U_EXPORT2
u_strchr32(const UChar *s, UChar32 c) {
    if((uint32_t)c > 0x10ffff) {
        /*
         * not a code point: without this check a negative c would be
         * truncated to a UChar and c>0x10ffff would yield a bogus
         * surrogate pair, either of which could report a false match
         */
        return NULL;
    } else if(c < 0xd800 || (0xdfff < c && c <= 0xffff)) {
        /* non-surrogate BMP code point */
        return u_strchr(s, (UChar)c);
    } else if(c <= 0xdfff) {
        /* surrogate code point */
        UChar *t;

        for(;;) {
            t = u_strchr(s, (UChar)c);
            if(t == NULL) {
                return NULL;
            }
            /*
             * s<t keeps the look-behind inside the text; once s has been
             * advanced, *(s-1) is the trail surrogate that was just
             * skipped, which is not a lead, so the result is the same
             */
            if(
                UTF_IS_SURROGATE_FIRST(*t) ?
                    UTF_IS_TRAIL(*(t+1)) :
                    (s<t && UTF_IS_LEAD(*(t-1)))
            ) {
                /* matched surrogate, not a surrogate code point, continue searching */
                s = t + 1;
            } else {
                return t;
            }
        }
        return NULL;
    } else {
        /* supplementary code point, search for string */
        UChar buffer[3];

        buffer[0] = UTF16_LEAD(c);
        buffer[1] = UTF16_TRAIL(c);
        buffer[2] = 0;
        return u_strstr(s, buffer);
    }
}

View File

@ -1005,6 +1005,22 @@ static void TestStringFunctions()
dataTable[i][j][0] = saveVal; /* Put it back for the other tests */
}
/*
* test that u_strchr32()
* does not find surrogate code points when they are part of matched pairs
* (= part of supplementary code points)
* Jitterbug 1542
*/
{
UChar s[]={
/* 0 1 2 3 4 5 6 7 8 9 */
0x0061, 0xd841, 0xdc02, 0xd841, 0x0062, 0xdc02, 0xd841, 0xdc02, 0x0063, 0
};
if(u_strchr32(s, 0xd841)!=(s+3) || u_strchr32(s, 0xdc02)!=(s+5)) {
log_err("error: u_strchr32(surrogate) finds a partial supplementary code point\n");
}
}
log_verbose("Testing u_austrcpy()");
u_austrcpy(test,dataTable[0][0]);

View File

@ -855,12 +855,19 @@ UnicodeStringTest::TestSearching()
UChar testChar = 0x74;
UChar32 testChar32 = 0x20402;
UChar testData[]={0xd841, 0xdc02, 0x71, 0xdc02, 0xd841, 0x71, 0xd841, 0xdc02, 0x71, 0x72, 0xd841, 0xdc02, 0x71, 0xd841, 0xdc02, 0x71, 0xdc02, 0xd841, 0x73, 0x0000};
UChar testData[]={
// 0 1 2 3 4 5 6 7
0xd841, 0xdc02, 0x0071, 0xdc02, 0xd841, 0x0071, 0xd841, 0xdc02,
// 8 9 10 11 12 13 14 15
0x0071, 0x0072, 0xd841, 0xdc02, 0x0071, 0xd841, 0xdc02, 0x0071,
// 16 17 18 19
0xdc02, 0xd841, 0x0073, 0x0000
};
UnicodeString test3(testData);
UnicodeString test4(testChar32);
uint16_t occurrences = 0;
UTextOffset startPos = 0;
for ( ;
@ -984,6 +991,17 @@ UnicodeStringTest::TestSearching()
if (occurrences != 18)
errln((UnicodeString)"indexOf failed: expected to find 18 occurrences, found " + occurrences);
//---
// test that indexOf(UChar32) and lastIndexOf(UChar32)
// do not find surrogate code points when they are part of matched pairs
// (= part of supplementary code points)
// Jitterbug 1542
if(test3.indexOf((UChar32)0xd841) != 4 || test3.indexOf((UChar32)0xdc02) != 3) {
errln("error: UnicodeString::indexOf(UChar32 surrogate) finds a partial supplementary code point");
}
// Searching backwards in [0, 17): the U+d841 at 13, 10 and 6 are each
// followed by U+dc02 and must be skipped, leaving the unpaired one at 4;
// the U+dc02 at 16 follows U+0071 and is therefore unpaired.
if(test3.lastIndexOf((UChar32)0xd841, 0, 17) != 4 || test3.lastIndexOf((UChar32)0xdc02, 0, 17) != 16) {
    errln("error: UnicodeString::lastIndexOf(UChar32 surrogate) finds a partial supplementary code point");
}
}
void