ICU-1542 fix indexOf(UChar32 surrogate), lastIndexOf(UChar32 surrogate), u_strchr32(surrogate) to match char32At()/UTF_GET_CHAR()

X-SVN-Rev: 7744
2002-02-22 02:00:42 +00:00 · 2002-02-22 02:00:42 +00:00 · b24a8e910f
commit b24a8e910f
parent 4178e90458
6 changed files with 269 additions and 50 deletions
--- a/icu4c/source/common/unicode/unistr.h
+++ b/icu4c/source/common/unicode/unistr.h
@ -847,6 +847,20 @@ public:
  /**
   * Locate in this the first occurrence of the code point <TT>c</TT>, 
   * using bitwise comparison.
+   *
+   * This function finds code points, which differs for BMP code points
+   * from indexOf(UChar c, ...) only for surrogates:
+   * While indexOf(UChar c, ...) finds any surrogate code units in a string,
+   * indexOf(UChar32 c, ...) finds only unmatched surrogate code points,
+   * i.e., only those that do not combine with an adjacent surrogate
+   * to form a supplementary code point.
+   * For example, in a string "\ud800\udc00" indexOf(UChar c, ...)
+   * will find code units U+d800 at 0 and U+dc00 at 1,
+   * but indexOf(UChar32 c, ...) will find neither because they
+   * combine to the code point U+10000.
+   * Either function will find U+d800 in "a\ud800b".
+   * This behavior ensures that char32At(indexOf(UChar32 c))==c.
+   *
   * @param c The code point to search for.
   * @return The offset into this of <TT>c</TT>, or -1 if not found.
   * @stable
@ -867,6 +881,20 @@ public:
  /**
   * Locate in this the first occurrence of the code point <TT>c</TT>
   * starting at offset <TT>start</TT>, using bitwise comparison.
+   *
+   * This function finds code points, which differs for BMP code points
+   * from indexOf(UChar c, ...) only for surrogates:
+   * While indexOf(UChar c, ...) finds any surrogate code units in a string,
+   * indexOf(UChar32 c, ...) finds only unmatched surrogate code points,
+   * i.e., only those that do not combine with an adjacent surrogate
+   * to form a supplementary code point.
+   * For example, in a string "\ud800\udc00" indexOf(UChar c, ...)
+   * will find code units U+d800 at 0 and U+dc00 at 1,
+   * but indexOf(UChar32 c, ...) will find neither because they
+   * combine to the code point U+10000.
+   * Either function will find U+d800 in "a\ud800b".
+   * This behavior ensures that char32At(indexOf(UChar32 c))==c.
+   *
   * @param c The code point to search for.
   * @param start The offset at which searching will start.
   * @return The offset into this of <TT>c</TT>, or -1 if not found.
@ -893,6 +921,20 @@ public:
   * Locate in this the first occurrence of the code point <TT>c</TT> 
   * in the range [<TT>start</TT>, <TT>start + length</TT>), 
   * using bitwise comparison.   
+   *
+   * This function finds code points, which differs for BMP code points
+   * from indexOf(UChar c, ...) only for surrogates:
+   * While indexOf(UChar c, ...) finds any surrogate code units in a string,
+   * indexOf(UChar32 c, ...) finds only unmatched surrogate code points,
+   * i.e., only those that do not combine with an adjacent surrogate
+   * to form a supplementary code point.
+   * For example, in a string "\ud800\udc00" indexOf(UChar c, ...)
+   * will find code units U+d800 at 0 and U+dc00 at 1,
+   * but indexOf(UChar32 c, ...) will find neither because they
+   * combine to the code point U+10000.
+   * Either function will find U+d800 in "a\ud800b".
+   * This behavior ensures that char32At(indexOf(UChar32 c))==c.
+   *
   * @param c The code point to search for.
   * @param start the offset into this at which to start matching
   * @param length the number of characters in this to search
@ -1027,6 +1069,20 @@ public:
  /**
   * Locate in this the last occurrence of the code point <TT>c</TT>, 
   * using bitwise comparison.
+   *
+   * This function finds code points, which differs for BMP code points
+   * from lastIndexOf(UChar c, ...) only for surrogates:
+   * While lastIndexOf(UChar c, ...) finds any surrogate code units in a string,
+   * lastIndexOf(UChar32 c, ...) finds only unmatched surrogate code points,
+   * i.e., only those that do not combine with an adjacent surrogate
+   * to form a supplementary code point.
+   * For example, in a string "\ud800\udc00" lastIndexOf(UChar c, ...)
+   * will find code units U+d800 at 0 and U+dc00 at 1,
+   * but lastIndexOf(UChar32 c, ...) will find neither because they
+   * combine to the code point U+10000.
+   * Either function will find U+d800 in "a\ud800b".
+   * This behavior ensures that char32At(lastIndexOf(UChar32 c))==c.
+   *
   * @param c The code point to search for.
   * @return The offset into this of <TT>c</TT>, or -1 if not found.
   * @stable
@ -1047,6 +1103,20 @@ public:
  /**
   * Locate in this the last occurrence of the code point <TT>c</TT>
   * starting at offset <TT>start</TT>, using bitwise comparison.
+   *
+   * This function finds code points, which differs for BMP code points
+   * from lastIndexOf(UChar c, ...) only for surrogates:
+   * While lastIndexOf(UChar c, ...) finds any surrogate code units in a string,
+   * lastIndexOf(UChar32 c, ...) finds only unmatched surrogate code points,
+   * i.e., only those that do not combine with an adjacent surrogate
+   * to form a supplementary code point.
+   * For example, in a string "\ud800\udc00" lastIndexOf(UChar c, ...)
+   * will find code units U+d800 at 0 and U+dc00 at 1,
+   * but lastIndexOf(UChar32 c, ...) will find neither because they
+   * combine to the code point U+10000.
+   * Either function will find U+d800 in "a\ud800b".
+   * This behavior ensures that char32At(lastIndexOf(UChar32 c))==c.
+   *
   * @param c The code point to search for.
   * @param start The offset at which searching will start.
   * @return The offset into this of <TT>c</TT>, or -1 if not found.
@ -1073,6 +1143,20 @@ public:
   * Locate in this the last occurrence of the code point <TT>c</TT> 
   * in the range [<TT>start</TT>, <TT>start + length</TT>), 
   * using bitwise comparison.   
+   *
+   * This function finds code points, which differs for BMP code points
+   * from lastIndexOf(UChar c, ...) only for surrogates:
+   * While lastIndexOf(UChar c, ...) finds any surrogate code units in a string,
+   * lastIndexOf(UChar32 c, ...) finds only unmatched surrogate code points,
+   * i.e., only those that do not combine with an adjacent surrogate
+   * to form a supplementary code point.
+   * For example, in a string "\ud800\udc00" lastIndexOf(UChar c, ...)
+   * will find code units U+d800 at 0 and U+dc00 at 1,
+   * but lastIndexOf(UChar32 c, ...) will find neither because they
+   * combine to the code point U+10000.
+   * Either function will find U+d800 in "a\ud800b".
+   * This behavior ensures that char32At(lastIndexOf(UChar32 c))==c.
+   *
   * @param c The code point to search for.
   * @param start the offset into this at which to start matching
   * @param length the number of characters in this to search
@ -2687,10 +2771,20 @@ private:
            UTextOffset start,
            int32_t length) const;

+  // only for c>=0xd800 (the inline indexOf(UChar32 c, start, length) handles c<0xd800 itself)
+  UTextOffset doIndexOf(UChar32 c,
+                        UTextOffset start,
+                        int32_t length) const;
+
  UTextOffset doLastIndexOf(UChar c,
                UTextOffset start,
                int32_t length) const;

+  // only for c>=0xd800 (the inline lastIndexOf(UChar32 c, start, length) handles c<0xd800 itself)
+  UTextOffset doLastIndexOf(UChar32 c,
+                            UTextOffset start,
+                            int32_t length) const;
+
  void doExtract(UTextOffset start, 
         int32_t length, 
         UChar *dst, 
@ -3161,14 +3255,7 @@ UnicodeString::indexOf(UChar c) const

 inline UTextOffset 
 UnicodeString::indexOf(UChar32 c) const {
-  if(!UTF_NEED_MULTIPLE_UCHAR(c)) {
-    return doIndexOf((UChar)c, 0, fLength);
-  } else {
-    UChar buffer[UTF_MAX_CHAR_LENGTH];
-    int32_t length = 0;
-    UTF_APPEND_CHAR_UNSAFE(buffer, length, c);
-    return indexOf(buffer, length, 0);
-  }
+  return indexOf(c, 0, fLength); // whole string: delegate to the (c, start, length) overload with start 0 and length fLength
 }

 inline UTextOffset 
@ -3179,14 +3266,7 @@ UnicodeString::indexOf(UChar c,
 inline UTextOffset 
 UnicodeString::indexOf(UChar32 c,
               UTextOffset start) const {
-  if(!UTF_NEED_MULTIPLE_UCHAR(c)) {
-    return doIndexOf((UChar)c, start, fLength - start);
-  } else {
-    UChar buffer[UTF_MAX_CHAR_LENGTH];
-    int32_t length = 0;
-    UTF_APPEND_CHAR_UNSAFE(buffer, length, c);
-    return indexOf(buffer, length, start);
-  }
+  return indexOf(c, start, fLength - start); // from start to the end of the string, via the (c, start, length) overload
 }

 inline UTextOffset 
@ -3199,13 +3279,10 @@ inline UTextOffset
 UnicodeString::indexOf(UChar32 c,
               UTextOffset start,
               int32_t length) const {
-  if(!UTF_NEED_MULTIPLE_UCHAR(c)) {
+  if((uint32_t)c<0xd800) { // below the surrogates: c is one code unit, so the UChar search is enough
     return doIndexOf((UChar)c, start, length);
   } else {
-    UChar buffer[UTF_MAX_CHAR_LENGTH];
-    int32_t cLength = 0;
-    UTF_APPEND_CHAR_UNSAFE(buffer, cLength, c);
-    return indexOf(buffer, cLength, start, length);
+    return doIndexOf(c, start, length); // c>=0xd800, or negative (the unsigned compare above rejects it): code point search
   }
 }

@ -3259,14 +3336,7 @@ UnicodeString::lastIndexOf(UChar c) const

 inline UTextOffset 
 UnicodeString::lastIndexOf(UChar32 c) const {
-  if(!UTF_NEED_MULTIPLE_UCHAR(c)) {
-    return doLastIndexOf((UChar)c, 0, fLength);
-  } else {
-    UChar buffer[UTF_MAX_CHAR_LENGTH];
-    int32_t count = 0;
-    UTF_APPEND_CHAR_UNSAFE(buffer, count, c);
-    return lastIndexOf(buffer, count, 0);
-  }
+  return lastIndexOf(c, 0, fLength); // whole string: delegate to the (c, start, length) overload with start 0 and length fLength
 }

 inline UTextOffset 
@ -3277,14 +3347,7 @@ UnicodeString::lastIndexOf(UChar c,
 inline UTextOffset 
 UnicodeString::lastIndexOf(UChar32 c,
               UTextOffset start) const {
-  if(!UTF_NEED_MULTIPLE_UCHAR(c)) {
-    return doLastIndexOf((UChar)c, start, fLength - start);
-  } else {
-    UChar buffer[UTF_MAX_CHAR_LENGTH];
-    int32_t count = 0;
-    UTF_APPEND_CHAR_UNSAFE(buffer, count, c);
-    return lastIndexOf(buffer, count, start);
-  }
+  return lastIndexOf(c, start, fLength - start); // from start to the end of the string, via the (c, start, length) overload
 }

 inline UTextOffset 
@ -3297,13 +3360,10 @@ inline UTextOffset
 UnicodeString::lastIndexOf(UChar32 c,
               UTextOffset start,
               int32_t length) const {
-  if(!UTF_NEED_MULTIPLE_UCHAR(c)) {
+  if((uint32_t)c<0xd800) { // below the surrogates: c is one code unit, so the UChar search is enough
     return doLastIndexOf((UChar)c, start, length);
   } else {
-    UChar buffer[UTF_MAX_CHAR_LENGTH];
-    int32_t count = 0;
-    UTF_APPEND_CHAR_UNSAFE(buffer, count, c);
-    return lastIndexOf(buffer, count, start, length);
+    return doLastIndexOf(c, start, length); // c>=0xd800, or negative (the unsigned compare above rejects it): code point search
   }
 }

--- a/icu4c/source/common/unicode/ustring.h
+++ b/icu4c/source/common/unicode/ustring.h
@ -144,6 +144,19 @@ u_strstr(const UChar *s, const UChar *substring);
 /**
 * Find the first occurence of a specified code point in a string.
 *
+ * This function finds code points, which differs for BMP code points
+ * from u_strchr() only for surrogates:
+ * While u_strchr() finds any surrogate code units in a string,
+ * u_strchr32() finds only unmatched surrogate code points,
+ * i.e., only those that do not combine with an adjacent surrogate
+ * to form a supplementary code point.
+ * For example, in a string "\ud800\udc00" u_strchr()
+ * will find code units U+d800 at 0 and U+dc00 at 1,
+ * but u_strchr32() will find neither because they
+ * combine to the code point U+10000.
+ * Either function will find U+d800 in "a\ud800b".
+ * This behavior ensures that UTF_GET_CHAR(u_strchr32(c))==c.
+ *
 * @param s The string to search.
 * @param c The code point (0..0x10ffff) to find.
 * @return A pointer to the first occurrence of <TT>c</TT> in <TT>s</TT>,
--- a/icu4c/source/common/unistr.cpp
+++ b/icu4c/source/common/unistr.cpp
@ -803,6 +803,49 @@ UnicodeString::doIndexOf(UChar c,
  return -1;
 }

+UTextOffset
+UnicodeString::doIndexOf(UChar32 c,
+                         UTextOffset start,
+                         int32_t length) const {
+  // find the first occurrence of the code point c in the pinned range; returns its offset or -1
+  pinIndices(start, length);
+  if(length == 0 || (uint32_t)c > 0x10ffff) {
+    return -1; // empty range, or c is negative or above 0x10ffff: not a code point, so char32At() can never equal it
+  }
+
+  // c<0xd800 handled by inline function indexOf(UChar32 c, start, length)
+  if(c<=0xdfff) {
+    // surrogate code point: only an occurrence that is not half of a pair counts
+    int32_t index;
+
+    while(length>0) {
+      index=doIndexOf((UChar)c, start, length);
+      if(index<0) {
+        return index;
+      }
+      if(
+        UTF_IS_SURROGATE_FIRST(c) ?
+          ((index+1)<fLength && UTF_IS_TRAIL(fArray[index+1])) : // lead: paired if a trail follows anywhere in the string
+          (index>0 && UTF_IS_LEAD(fArray[index-1]))              // trail: paired if a lead precedes anywhere in the string
+      ) {
+        // matched surrogate, not a surrogate code point, continue searching
+        length-=(index+1)-start; // keep only the part of the range after index
+        start=index+1;
+      } else {
+        return index;
+      }
+    }
+    return -1;
+  } else if(c<=0xffff) {
+    // non-surrogate BMP code point
+    return doIndexOf((UChar)c, start, length);
+  } else {
+    // supplementary code point (0x10000..0x10ffff), search for its surrogate pair as a string
+    UChar buffer[2] = { UTF16_LEAD(c), UTF16_TRAIL(c) };
+    return indexOf(buffer, 2, start, length);
+  }
+}
+
 UTextOffset 
 UnicodeString::lastIndexOf(const UChar *srcChars,
               UTextOffset srcStart,
@ -880,6 +923,48 @@ UnicodeString::doLastIndexOf(UChar c,
  return -1;
 }

+UTextOffset
+UnicodeString::doLastIndexOf(UChar32 c,
+                             UTextOffset start,
+                             int32_t length) const {
+  // find the last occurrence of the code point c in the pinned range; returns its offset or -1
+  pinIndices(start, length);
+  if(length == 0 || (uint32_t)c > 0x10ffff) {
+    return -1; // empty range, or c is negative or above 0x10ffff: not a code point, so char32At() can never equal it
+  }
+
+  // c<0xd800 handled by inline function lastIndexOf(UChar32 c, start, length)
+  if(c<=0xdfff) {
+    // surrogate code point: only an occurrence that is not half of a pair counts
+    int32_t index;
+
+    while(length>0) {
+      index=doLastIndexOf((UChar)c, start, length);
+      if(index<0) {
+        return index;
+      }
+      if(
+        UTF_IS_SURROGATE_FIRST(c) ?
+          ((index+1)<fLength && UTF_IS_TRAIL(fArray[index+1])) : // lead: paired if a trail follows anywhere in the string
+          (index>0 && UTF_IS_LEAD(fArray[index-1]))              // trail: paired if a lead precedes anywhere in the string
+      ) {
+        // matched surrogate, not a surrogate code point, continue searching
+        length=index-start; // keep only the part of the range before index
+      } else {
+        return index;
+      }
+    }
+    return -1;
+  } else if(c<=0xffff) {
+    // non-surrogate BMP code point
+    return doLastIndexOf((UChar)c, start, length);
+  } else {
+    // supplementary code point (0x10000..0x10ffff), search for its surrogate pair as a string
+    UChar buffer[2] = { UTF16_LEAD(c), UTF16_TRAIL(c) };
+    return lastIndexOf(buffer, 2, start, length);
+  }
+}
+
 UnicodeString& 
 UnicodeString::findAndReplace(UTextOffset start,
                  int32_t length,
--- a/icu4c/source/common/ustring.c
+++ b/icu4c/source/common/ustring.c
@ -81,13 +81,40 @@ u_strstr(const UChar *s, const UChar *substring) {

 U_CAPI UChar * U_EXPORT2
 u_strchr32(const UChar *s, UChar32 c) {
-  if(!UTF_NEED_MULTIPLE_UCHAR(c)) {
+  if((uint32_t)c < 0xd800) {
+    /* non-surrogate BMP code point; the unsigned compare keeps negative c out of this branch */
+    return u_strchr(s, (UChar)c);
+  } else if((uint32_t)c <= 0xdfff) {
+    /* surrogate code point: only an occurrence that is not half of a pair counts */
+    UChar *t;
+
+    for(;;) {
+      t = u_strchr(s, (UChar)c);
+      if(t == NULL) {
+        return NULL;
+      }
+      if(
+        UTF_IS_SURROGATE_FIRST(*t) ?
+          UTF_IS_TRAIL(*(t+1)) :          /* lead: paired if a trail follows (t+1 is at worst the terminating NUL) */
+          (s<t && UTF_IS_LEAD(*(t-1)))    /* trail: paired if a lead precedes it at or after the current s */
+      ) {
+        /* matched surrogate, not a surrogate code point, continue searching */
+        s = t + 1;
+      } else {
+        return t;
+      }
+    }
+    return NULL;
+  } else if((uint32_t)c <= 0xffff) {
+    /* non-surrogate BMP code point */
     return u_strchr(s, (UChar)c);
   } else {
-    UChar buffer[UTF_MAX_CHAR_LENGTH + 1];
-    UTextOffset i = 0;
-    UTF_APPEND_CHAR_UNSAFE(buffer, i, c);
-    buffer[i] = 0;
+    /* supplementary code point (0x10000..0x10ffff), search for its surrogate pair as a string */
+    UChar buffer[3];
+    if((uint32_t)c > 0x10ffff) { return NULL; } /* negative or above 0x10ffff: not a code point, cannot be found */
+    buffer[0] = UTF16_LEAD(c);
+    buffer[1] = UTF16_TRAIL(c);
+    buffer[2] = 0;
     return u_strstr(s, buffer);
   }
 }
--- a/icu4c/source/test/cintltst/cucdtst.c
+++ b/icu4c/source/test/cintltst/cucdtst.c
@ -1005,6 +1005,22 @@ static void TestStringFunctions()
        dataTable[i][j][0] = saveVal;   /* Put it back for the other tests */
    }

+    /*
+     * test that u_strchr32()
+     * does not find surrogate code points when they are part of matched pairs
+     * (= part of supplementary code points); in s only the lead at 3 and the trail at 5 are unpaired
+     * Jitterbug 1542
+     */
+    {
+        UChar s[]={
+            /*   0       1       2       3       4       5       6       7       8  9 */
+            0x0061, 0xd841, 0xdc02, 0xd841, 0x0062, 0xdc02, 0xd841, 0xdc02, 0x0063, 0
+        };
+
+        if(u_strchr32(s, 0xd841)!=(s+3) || u_strchr32(s, 0xdc02)!=(s+5)) { /* the pairs at 1-2 and 6-7 must be skipped */
+            log_err("error: u_strchr32(surrogate) finds a partial supplementary code point\n");
+        }
+    }

    log_verbose("Testing u_austrcpy()");
    u_austrcpy(test,dataTable[0][0]);
--- a/icu4c/source/test/intltest/ustrtest.cpp
+++ b/icu4c/source/test/intltest/ustrtest.cpp
@ -855,12 +855,19 @@ UnicodeStringTest::TestSearching()
    UChar testChar = 0x74;
    
    UChar32 testChar32 = 0x20402;
-    UChar testData[]={0xd841, 0xdc02, 0x71, 0xdc02, 0xd841, 0x71, 0xd841, 0xdc02, 0x71, 0x72, 0xd841, 0xdc02, 0x71, 0xd841, 0xdc02, 0x71, 0xdc02, 0xd841, 0x73, 0x0000};
+    UChar testData[]={
+        //   0       1       2       3       4       5       6       7
+        0xd841, 0xdc02, 0x0071, 0xdc02, 0xd841, 0x0071, 0xd841, 0xdc02,
+
+        //   8       9      10      11      12      13      14      15
+        0x0071, 0x0072, 0xd841, 0xdc02, 0x0071, 0xd841, 0xdc02, 0x0071,
+
+        //  16      17      18      19
+        0xdc02, 0xd841, 0x0073, 0x0000
+    };
    UnicodeString test3(testData);
    UnicodeString test4(testChar32);

-
-
    uint16_t occurrences = 0;
    UTextOffset startPos = 0;
    for ( ;
@ -984,6 +991,17 @@ UnicodeStringTest::TestSearching()
    if (occurrences != 18)
        errln((UnicodeString)"indexOf failed: expected to find 18 occurrences, found " + occurrences);
    //---
+
+    // test that indexOf(UChar32) and lastIndexOf(UChar32)
+    // do not find surrogate code points when they are part of matched pairs
+    // (= part of supplementary code points)
+    // Jitterbug 1542
+    if(test3.indexOf((UChar32)0xd841) != 4 || test3.indexOf((UChar32)0xdc02) != 3) { // first unpaired lead at 4, first unpaired trail at 3
+        errln("error: UnicodeString::indexOf(UChar32 surrogate) finds a partial supplementary code point");
+    }
+    if(test3.lastIndexOf((UChar32)0xd841, 0, 17) != 4 || test3.lastIndexOf((UChar32)0xdc02, 0, 17) != 16) { // within [0, 17): last unpaired lead at 4, last unpaired trail at 16
+        errln("error: UnicodeString::lastIndexOf(UChar32 surrogate) finds a partial supplementary code point");
+    }
 }

 void