ICU-1542 fix indexOf(UChar32 surrogate), lastIndexOf(UChar32 surrogate), u_strchr32(surrogate) to match char32At()/UTF_GET_CHAR()
X-SVN-Rev: 7744
This commit is contained in:
parent
4178e90458
commit
b24a8e910f
@ -847,6 +847,20 @@ public:
|
||||
/**
|
||||
* Locate in this the first occurrence of the code point <TT>c</TT>,
|
||||
* using bitwise comparison.
|
||||
*
|
||||
* This function finds code points; for BMP code points its behavior differs
|
||||
* from indexOf(UChar c, ...) only for surrogates:
|
||||
* While indexOf(UChar c, ...) finds any surrogate code units in a string,
|
||||
* indexOf(UChar32 c, ...) finds only unmatched surrogate code points,
|
||||
* i.e., only those that do not combine with an adjacent surrogate
|
||||
* to form a supplementary code point.
|
||||
* For example, in a string "\ud800\udc00" indexOf(UChar c, ...)
|
||||
* will find code units U+d800 at 0 and U+dc00 at 1,
|
||||
* but indexOf(UChar32 c, ...) will find neither because they
|
||||
* combine to the code point U+10000.
|
||||
* Either function will find U+d800 in "a\ud800b".
|
||||
* This behavior ensures that char32At(indexOf(UChar32 c))==c.
|
||||
*
|
||||
* @param c The code point to search for.
|
||||
* @return The offset into this of <TT>c</TT>, or -1 if not found.
|
||||
* @stable
|
||||
@ -867,6 +881,20 @@ public:
|
||||
/**
|
||||
* Locate in this the first occurrence of the code point <TT>c</TT>
|
||||
* starting at offset <TT>start</TT>, using bitwise comparison.
|
||||
*
|
||||
* This function finds code points; for BMP code points its behavior differs
|
||||
* from indexOf(UChar c, ...) only for surrogates:
|
||||
* While indexOf(UChar c, ...) finds any surrogate code units in a string,
|
||||
* indexOf(UChar32 c, ...) finds only unmatched surrogate code points,
|
||||
* i.e., only those that do not combine with an adjacent surrogate
|
||||
* to form a supplementary code point.
|
||||
* For example, in a string "\ud800\udc00" indexOf(UChar c, ...)
|
||||
* will find code units U+d800 at 0 and U+dc00 at 1,
|
||||
* but indexOf(UChar32 c, ...) will find neither because they
|
||||
* combine to the code point U+10000.
|
||||
* Either function will find U+d800 in "a\ud800b".
|
||||
* This behavior ensures that char32At(indexOf(UChar32 c))==c.
|
||||
*
|
||||
* @param c The code point to search for.
|
||||
* @param start The offset at which searching will start.
|
||||
* @return The offset into this of <TT>c</TT>, or -1 if not found.
|
||||
@ -893,6 +921,20 @@ public:
|
||||
* Locate in this the first occurrence of the code point <TT>c</TT>
|
||||
* in the range [<TT>start</TT>, <TT>start + length</TT>),
|
||||
* using bitwise comparison.
|
||||
*
|
||||
* This function finds code points; for BMP code points its behavior differs
|
||||
* from indexOf(UChar c, ...) only for surrogates:
|
||||
* While indexOf(UChar c, ...) finds any surrogate code units in a string,
|
||||
* indexOf(UChar32 c, ...) finds only unmatched surrogate code points,
|
||||
* i.e., only those that do not combine with an adjacent surrogate
|
||||
* to form a supplementary code point.
|
||||
* For example, in a string "\ud800\udc00" indexOf(UChar c, ...)
|
||||
* will find code units U+d800 at 0 and U+dc00 at 1,
|
||||
* but indexOf(UChar32 c, ...) will find neither because they
|
||||
* combine to the code point U+10000.
|
||||
* Either function will find U+d800 in "a\ud800b".
|
||||
* This behavior ensures that char32At(indexOf(UChar32 c))==c.
|
||||
*
|
||||
* @param c The code point to search for.
|
||||
* @param start the offset into this at which to start matching
|
||||
* @param length the number of characters in this to search
|
||||
@ -1027,6 +1069,20 @@ public:
|
||||
/**
|
||||
* Locate in this the last occurrence of the code point <TT>c</TT>,
|
||||
* using bitwise comparison.
|
||||
*
|
||||
* This function finds code points; for BMP code points its behavior differs
|
||||
* from lastIndexOf(UChar c, ...) only for surrogates:
|
||||
* While lastIndexOf(UChar c, ...) finds any surrogate code units in a string,
|
||||
* lastIndexOf(UChar32 c, ...) finds only unmatched surrogate code points,
|
||||
* i.e., only those that do not combine with an adjacent surrogate
|
||||
* to form a supplementary code point.
|
||||
* For example, in a string "\ud800\udc00" lastIndexOf(UChar c, ...)
|
||||
* will find code units U+d800 at 0 and U+dc00 at 1,
|
||||
* but lastIndexOf(UChar32 c, ...) will find neither because they
|
||||
* combine to the code point U+10000.
|
||||
* Either function will find U+d800 in "a\ud800b".
|
||||
* This behavior ensures that char32At(lastIndexOf(UChar32 c))==c.
|
||||
*
|
||||
* @param c The code point to search for.
|
||||
* @return The offset into this of <TT>c</TT>, or -1 if not found.
|
||||
* @stable
|
||||
@ -1047,6 +1103,20 @@ public:
|
||||
/**
|
||||
* Locate in this the last occurrence of the code point <TT>c</TT>
|
||||
* starting at offset <TT>start</TT>, using bitwise comparison.
|
||||
*
|
||||
* This function finds code points; for BMP code points its behavior differs
|
||||
* from lastIndexOf(UChar c, ...) only for surrogates:
|
||||
* While lastIndexOf(UChar c, ...) finds any surrogate code units in a string,
|
||||
* lastIndexOf(UChar32 c, ...) finds only unmatched surrogate code points,
|
||||
* i.e., only those that do not combine with an adjacent surrogate
|
||||
* to form a supplementary code point.
|
||||
* For example, in a string "\ud800\udc00" lastIndexOf(UChar c, ...)
|
||||
* will find code units U+d800 at 0 and U+dc00 at 1,
|
||||
* but lastIndexOf(UChar32 c, ...) will find neither because they
|
||||
* combine to the code point U+10000.
|
||||
* Either function will find U+d800 in "a\ud800b".
|
||||
* This behavior ensures that char32At(lastIndexOf(UChar32 c))==c.
|
||||
*
|
||||
* @param c The code point to search for.
|
||||
* @param start The offset at which searching will start.
|
||||
* @return The offset into this of <TT>c</TT>, or -1 if not found.
|
||||
@ -1073,6 +1143,20 @@ public:
|
||||
* Locate in this the last occurrence of the code point <TT>c</TT>
|
||||
* in the range [<TT>start</TT>, <TT>start + length</TT>),
|
||||
* using bitwise comparison.
|
||||
*
|
||||
* This function finds code points; for BMP code points its behavior differs
|
||||
* from lastIndexOf(UChar c, ...) only for surrogates:
|
||||
* While lastIndexOf(UChar c, ...) finds any surrogate code units in a string,
|
||||
* lastIndexOf(UChar32 c, ...) finds only unmatched surrogate code points,
|
||||
* i.e., only those that do not combine with an adjacent surrogate
|
||||
* to form a supplementary code point.
|
||||
* For example, in a string "\ud800\udc00" lastIndexOf(UChar c, ...)
|
||||
* will find code units U+d800 at 0 and U+dc00 at 1,
|
||||
* but lastIndexOf(UChar32 c, ...) will find neither because they
|
||||
* combine to the code point U+10000.
|
||||
* Either function will find U+d800 in "a\ud800b".
|
||||
* This behavior ensures that char32At(lastIndexOf(UChar32 c))==c.
|
||||
*
|
||||
* @param c The code point to search for.
|
||||
* @param start the offset into this at which to start matching
|
||||
* @param length the number of characters in this to search
|
||||
@ -2687,10 +2771,20 @@ private:
|
||||
UTextOffset start,
|
||||
int32_t length) const;
|
||||
|
||||
// only for c>=0xd800
|
||||
UTextOffset doIndexOf(UChar32 c,
|
||||
UTextOffset start,
|
||||
int32_t length) const;
|
||||
|
||||
UTextOffset doLastIndexOf(UChar c,
|
||||
UTextOffset start,
|
||||
int32_t length) const;
|
||||
|
||||
// only for c>=0xd800
|
||||
UTextOffset doLastIndexOf(UChar32 c,
|
||||
UTextOffset start,
|
||||
int32_t length) const;
|
||||
|
||||
void doExtract(UTextOffset start,
|
||||
int32_t length,
|
||||
UChar *dst,
|
||||
@ -3161,14 +3255,7 @@ UnicodeString::indexOf(UChar c) const
|
||||
|
||||
inline UTextOffset
|
||||
UnicodeString::indexOf(UChar32 c) const {
|
||||
if(!UTF_NEED_MULTIPLE_UCHAR(c)) {
|
||||
return doIndexOf((UChar)c, 0, fLength);
|
||||
} else {
|
||||
UChar buffer[UTF_MAX_CHAR_LENGTH];
|
||||
int32_t length = 0;
|
||||
UTF_APPEND_CHAR_UNSAFE(buffer, length, c);
|
||||
return indexOf(buffer, length, 0);
|
||||
}
|
||||
return indexOf(c, 0, fLength);
|
||||
}
|
||||
|
||||
inline UTextOffset
|
||||
@ -3179,14 +3266,7 @@ UnicodeString::indexOf(UChar c,
|
||||
inline UTextOffset
|
||||
UnicodeString::indexOf(UChar32 c,
|
||||
UTextOffset start) const {
|
||||
if(!UTF_NEED_MULTIPLE_UCHAR(c)) {
|
||||
return doIndexOf((UChar)c, start, fLength - start);
|
||||
} else {
|
||||
UChar buffer[UTF_MAX_CHAR_LENGTH];
|
||||
int32_t length = 0;
|
||||
UTF_APPEND_CHAR_UNSAFE(buffer, length, c);
|
||||
return indexOf(buffer, length, start);
|
||||
}
|
||||
return indexOf(c, start, fLength - start);
|
||||
}
|
||||
|
||||
inline UTextOffset
|
||||
@ -3199,13 +3279,10 @@ inline UTextOffset
|
||||
UnicodeString::indexOf(UChar32 c,
|
||||
UTextOffset start,
|
||||
int32_t length) const {
|
||||
if(!UTF_NEED_MULTIPLE_UCHAR(c)) {
|
||||
if((uint32_t)c<0xd800) {
|
||||
return doIndexOf((UChar)c, start, length);
|
||||
} else {
|
||||
UChar buffer[UTF_MAX_CHAR_LENGTH];
|
||||
int32_t cLength = 0;
|
||||
UTF_APPEND_CHAR_UNSAFE(buffer, cLength, c);
|
||||
return indexOf(buffer, cLength, start, length);
|
||||
return doIndexOf(c, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
@ -3259,14 +3336,7 @@ UnicodeString::lastIndexOf(UChar c) const
|
||||
|
||||
inline UTextOffset
|
||||
UnicodeString::lastIndexOf(UChar32 c) const {
|
||||
if(!UTF_NEED_MULTIPLE_UCHAR(c)) {
|
||||
return doLastIndexOf((UChar)c, 0, fLength);
|
||||
} else {
|
||||
UChar buffer[UTF_MAX_CHAR_LENGTH];
|
||||
int32_t count = 0;
|
||||
UTF_APPEND_CHAR_UNSAFE(buffer, count, c);
|
||||
return lastIndexOf(buffer, count, 0);
|
||||
}
|
||||
return lastIndexOf(c, 0, fLength);
|
||||
}
|
||||
|
||||
inline UTextOffset
|
||||
@ -3277,14 +3347,7 @@ UnicodeString::lastIndexOf(UChar c,
|
||||
inline UTextOffset
|
||||
UnicodeString::lastIndexOf(UChar32 c,
|
||||
UTextOffset start) const {
|
||||
if(!UTF_NEED_MULTIPLE_UCHAR(c)) {
|
||||
return doLastIndexOf((UChar)c, start, fLength - start);
|
||||
} else {
|
||||
UChar buffer[UTF_MAX_CHAR_LENGTH];
|
||||
int32_t count = 0;
|
||||
UTF_APPEND_CHAR_UNSAFE(buffer, count, c);
|
||||
return lastIndexOf(buffer, count, start);
|
||||
}
|
||||
return lastIndexOf(c, start, fLength - start);
|
||||
}
|
||||
|
||||
inline UTextOffset
|
||||
@ -3297,13 +3360,10 @@ inline UTextOffset
|
||||
UnicodeString::lastIndexOf(UChar32 c,
|
||||
UTextOffset start,
|
||||
int32_t length) const {
|
||||
if(!UTF_NEED_MULTIPLE_UCHAR(c)) {
|
||||
if((uint32_t)c<0xd800) {
|
||||
return doLastIndexOf((UChar)c, start, length);
|
||||
} else {
|
||||
UChar buffer[UTF_MAX_CHAR_LENGTH];
|
||||
int32_t count = 0;
|
||||
UTF_APPEND_CHAR_UNSAFE(buffer, count, c);
|
||||
return lastIndexOf(buffer, count, start, length);
|
||||
return doLastIndexOf(c, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -144,6 +144,19 @@ u_strstr(const UChar *s, const UChar *substring);
|
||||
/**
|
||||
* Find the first occurrence of a specified code point in a string.
|
||||
*
|
||||
* This function finds code points; for BMP code points its behavior differs
|
||||
* from u_strchr() only for surrogates:
|
||||
* While u_strchr() finds any surrogate code units in a string,
|
||||
* u_strchr32() finds only unmatched surrogate code points,
|
||||
* i.e., only those that do not combine with an adjacent surrogate
|
||||
* to form a supplementary code point.
|
||||
* For example, in a string "\ud800\udc00" u_strchr()
|
||||
* will find code units U+d800 at 0 and U+dc00 at 1,
|
||||
* but u_strchr32() will find neither because they
|
||||
* combine to the code point U+10000.
|
||||
* Either function will find U+d800 in "a\ud800b".
|
||||
* This behavior ensures that UTF_GET_CHAR(u_strchr32(c))==c.
|
||||
*
|
||||
* @param s The string to search.
|
||||
* @param c The code point (0..0x10ffff) to find.
|
||||
* @return A pointer to the first occurrence of <TT>c</TT> in <TT>s</TT>,
|
||||
|
@ -803,6 +803,49 @@ UnicodeString::doIndexOf(UChar c,
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Forward search for the code point c within the range given by start/length.
// Returns the index of the first match, or a negative value when there is
// none (-1 on the paths that return a literal here).
//
// Three cases are distinguished by the value of c:
//  - surrogate code point (c<=0xdfff): only code units that are NOT part of
//    a lead/trail pair count as a match;
//  - other BMP code point (c<=0xffff): plain code unit search;
//  - supplementary code point: search for its two-unit UTF-16 form.
// Values below 0xd800 are expected to be filtered out by the inline caller
// before this function is reached.
UTextOffset
|
||||
UnicodeString::doIndexOf(UChar32 c,
|
||||
UTextOffset start,
|
||||
int32_t length) const {
|
||||
// pin indices
// NOTE(review): pinIndices() is defined elsewhere; presumably it clamps
// start/length to the bounds of this string - confirm.
|
||||
pinIndices(start, length);
|
||||
if(length == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// c<0xd800 handled by inline function indexOf(UChar32 c, start, length)
// so reaching this branch with c<=0xdfff means c is a surrogate value
|
||||
if(c<=0xdfff) {
|
||||
// surrogate code point: find the code unit, then reject hits that are
// one half of a surrogate pair
|
||||
int32_t index;
|
||||
|
||||
while(length>0) {
|
||||
index=doIndexOf((UChar)c, start, length);
|
||||
if(index<0) {
|
||||
return index;
|
||||
}
|
||||
// A lead surrogate is "matched" when a trail unit follows it; a trail
// surrogate is "matched" when a lead unit precedes it. The neighbor is
// looked up against the whole string (bounds 0 and fLength), not against
// the search range, so a pair straddling the range edge is still a pair.
if(
|
||||
UTF_IS_SURROGATE_FIRST(c) ?
|
||||
((index+1)<fLength && UTF_IS_TRAIL(fArray[index+1])) :
|
||||
(index>0 && UTF_IS_LEAD(fArray[index-1]))
|
||||
) {
|
||||
// matched surrogate, not a surrogate code point, continue searching
// resume one unit past the hit; length shrinks by the units skipped
|
||||
length-=(index+1)-start;
|
||||
start=index+1;
|
||||
} else {
|
||||
return index;
|
||||
}
|
||||
}
|
||||
// range exhausted: every occurrence found was half of a pair
return -1;
|
||||
} else if(c<=0xffff) {
|
||||
// non-surrogate BMP code point
|
||||
return doIndexOf((UChar)c, start, length);
|
||||
} else {
|
||||
// supplementary code point, search for string
// (the lead and trail units are searched for as a two-unit substring)
|
||||
UChar buffer[2] = { UTF16_LEAD(c), UTF16_TRAIL(c) };
|
||||
return indexOf(buffer, 2, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
UTextOffset
|
||||
UnicodeString::lastIndexOf(const UChar *srcChars,
|
||||
UTextOffset srcStart,
|
||||
@ -880,6 +923,48 @@ UnicodeString::doLastIndexOf(UChar c,
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Backward search for the code point c within the range given by
// start/length. Returns the index of the last match, or a negative value
// when there is none (-1 on the paths that return a literal here).
//
// Three cases are distinguished by the value of c:
//  - surrogate code point (c<=0xdfff): only code units that are NOT part of
//    a lead/trail pair count as a match;
//  - other BMP code point (c<=0xffff): plain code unit search;
//  - supplementary code point: search for its two-unit UTF-16 form.
// Values below 0xd800 are expected to be filtered out by the inline caller
// before this function is reached.
UTextOffset
|
||||
UnicodeString::doLastIndexOf(UChar32 c,
|
||||
UTextOffset start,
|
||||
int32_t length) const {
|
||||
// pin indices
// NOTE(review): pinIndices() is defined elsewhere; presumably it clamps
// start/length to the bounds of this string - confirm.
|
||||
pinIndices(start, length);
|
||||
if(length == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// c<0xd800 handled by inline function lastIndexOf(UChar32 c, start, length)
// so reaching this branch with c<=0xdfff means c is a surrogate value
|
||||
if(c<=0xdfff) {
|
||||
// surrogate code point: find the code unit, then reject hits that are
// one half of a surrogate pair
|
||||
int32_t index;
|
||||
|
||||
while(length>0) {
|
||||
index=doLastIndexOf((UChar)c, start, length);
|
||||
if(index<0) {
|
||||
return index;
|
||||
}
|
||||
// A lead surrogate is "matched" when a trail unit follows it; a trail
// surrogate is "matched" when a lead unit precedes it. The neighbor is
// looked up against the whole string (bounds 0 and fLength), not against
// the search range, so a pair straddling the range edge is still a pair.
if(
|
||||
UTF_IS_SURROGATE_FIRST(c) ?
|
||||
((index+1)<fLength && UTF_IS_TRAIL(fArray[index+1])) :
|
||||
(index>0 && UTF_IS_LEAD(fArray[index-1]))
|
||||
) {
|
||||
// matched surrogate, not a surrogate code point, continue searching
// start stays fixed; the range is cut so that it ends just before the hit
|
||||
length=index-start;
|
||||
} else {
|
||||
return index;
|
||||
}
|
||||
}
|
||||
// range exhausted: every occurrence found was half of a pair
return -1;
|
||||
} else if(c<=0xffff) {
|
||||
// non-surrogate BMP code point
|
||||
return doLastIndexOf((UChar)c, start, length);
|
||||
} else {
|
||||
// supplementary code point, search for string
// (the lead and trail units are searched for as a two-unit substring)
|
||||
UChar buffer[2] = { UTF16_LEAD(c), UTF16_TRAIL(c) };
|
||||
return lastIndexOf(buffer, 2, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
UnicodeString&
|
||||
UnicodeString::findAndReplace(UTextOffset start,
|
||||
int32_t length,
|
||||
|
@ -81,13 +81,40 @@ u_strstr(const UChar *s, const UChar *substring) {
|
||||
|
||||
U_CAPI UChar * U_EXPORT2
|
||||
u_strchr32(const UChar *s, UChar32 c) {
|
||||
if(!UTF_NEED_MULTIPLE_UCHAR(c)) {
|
||||
if(c < 0xd800) {
|
||||
/* non-surrogate BMP code point */
|
||||
return u_strchr(s, (UChar)c);
|
||||
} else if(c <= 0xdfff) {
|
||||
/* surrogate code point */
|
||||
UChar *t;
|
||||
|
||||
for(;;) {
|
||||
t = u_strchr(s, (UChar)c);
|
||||
if(t == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
if(
|
||||
UTF_IS_SURROGATE_FIRST(*t) ?
|
||||
UTF_IS_TRAIL(*(t+1)) :
|
||||
(s<t && UTF_IS_LEAD(*(t-1)))
|
||||
) {
|
||||
/* matched surrogate, not a surrogate code point, continue searching */
|
||||
s = t + 1;
|
||||
} else {
|
||||
return t;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
} else if(c <= 0xffff) {
|
||||
/* non-surrogate BMP code point */
|
||||
return u_strchr(s, (UChar)c);
|
||||
} else {
|
||||
UChar buffer[UTF_MAX_CHAR_LENGTH + 1];
|
||||
UTextOffset i = 0;
|
||||
UTF_APPEND_CHAR_UNSAFE(buffer, i, c);
|
||||
buffer[i] = 0;
|
||||
/* supplementary code point, search for string */
|
||||
UChar buffer[3];
|
||||
|
||||
buffer[0] = UTF16_LEAD(c);
|
||||
buffer[1] = UTF16_TRAIL(c);
|
||||
buffer[2] = 0;
|
||||
return u_strstr(s, buffer);
|
||||
}
|
||||
}
|
||||
|
@ -1005,6 +1005,22 @@ static void TestStringFunctions()
|
||||
dataTable[i][j][0] = saveVal; /* Put it back for the other tests */
|
||||
}
|
||||
|
||||
/*
|
||||
* test that u_strchr32()
|
||||
* does not find surrogate code points when they are part of matched pairs
|
||||
* (= part of supplementary code points)
|
||||
* Jitterbug 1542
|
||||
*/
|
||||
{
|
||||
UChar s[]={
|
||||
/* 0 1 2 3 4 5 6 7 8 9 */
|
||||
0x0061, 0xd841, 0xdc02, 0xd841, 0x0062, 0xdc02, 0xd841, 0xdc02, 0x0063, 0
|
||||
};
|
||||
|
||||
if(u_strchr32(s, 0xd841)!=(s+3) || u_strchr32(s, 0xdc02)!=(s+5)) {
|
||||
log_err("error: u_strchr32(surrogate) finds a partial supplementary code point\n");
|
||||
}
|
||||
}
|
||||
|
||||
log_verbose("Testing u_austrcpy()");
|
||||
u_austrcpy(test,dataTable[0][0]);
|
||||
|
@ -855,12 +855,19 @@ UnicodeStringTest::TestSearching()
|
||||
UChar testChar = 0x74;
|
||||
|
||||
UChar32 testChar32 = 0x20402;
|
||||
UChar testData[]={0xd841, 0xdc02, 0x71, 0xdc02, 0xd841, 0x71, 0xd841, 0xdc02, 0x71, 0x72, 0xd841, 0xdc02, 0x71, 0xd841, 0xdc02, 0x71, 0xdc02, 0xd841, 0x73, 0x0000};
|
||||
UChar testData[]={
|
||||
// 0 1 2 3 4 5 6 7
|
||||
0xd841, 0xdc02, 0x0071, 0xdc02, 0xd841, 0x0071, 0xd841, 0xdc02,
|
||||
|
||||
// 8 9 10 11 12 13 14 15
|
||||
0x0071, 0x0072, 0xd841, 0xdc02, 0x0071, 0xd841, 0xdc02, 0x0071,
|
||||
|
||||
// 16 17 18 19
|
||||
0xdc02, 0xd841, 0x0073, 0x0000
|
||||
};
|
||||
UnicodeString test3(testData);
|
||||
UnicodeString test4(testChar32);
|
||||
|
||||
|
||||
|
||||
uint16_t occurrences = 0;
|
||||
UTextOffset startPos = 0;
|
||||
for ( ;
|
||||
@ -984,6 +991,17 @@ UnicodeStringTest::TestSearching()
|
||||
if (occurrences != 18)
|
||||
errln((UnicodeString)"indexOf failed: expected to find 18 occurrences, found " + occurrences);
|
||||
//---
|
||||
|
||||
// test that indexOf(UChar32) and lastIndexOf(UChar32)
|
||||
// do not find surrogate code points when they are part of matched pairs
|
||||
// (= part of supplementary code points)
|
||||
// Jitterbug 1542
|
||||
if(test3.indexOf((UChar32)0xd841) != 4 || test3.indexOf((UChar32)0xdc02) != 3) {
|
||||
errln("error: UnicodeString::indexOf(UChar32 surrogate) finds a partial supplementary code point");
|
||||
}
|
||||
if(test3.lastIndexOf((UChar32)0xd841, 0, 17) != 4 || test3.lastIndexOf((UChar32)0xdc02, 0, 17) != 16) {
|
||||
errln("error: UnicodeString::indexOf(UChar32 surrogate) finds a partial supplementary code point");
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
Loading…
Reference in New Issue
Block a user