ICU-2906 make unescapeAt() handle \u-escaped surrogate pairs

X-SVN-Rev: 13340
2003-10-07 17:22:14 +00:00 · 2003-10-07 17:22:14 +00:00 · 94a17e18a5
commit 94a17e18a5
parent 402f683111
2 changed files with 45 additions and 4 deletions
--- a/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java,v $ 
- * $Date: 2003/09/29 23:20:36 $ 
- * $Revision: 1.53 $
+ * $Date: 2003/10/07 17:22:14 $ 
+ * $Revision: 1.54 $
 *
 *****************************************************************************************
 */
@ -1214,6 +1214,26 @@ public class UnicodeSetTest extends TestFmwk {
        }
    }

+    public void TestSurrogate() { // Verifies that three spellings of the same set pattern (a, b, c, U+10000) give equivalent sets.
+        String DATA[] = {
+            // These should all behave identically; only the spelling of U+10000 differs
+            "[abc\\uD800\\uDC00]", // pattern text holds two 4-digit escapes: lead surrogate, then trail surrogate
+            "[abc\uD800\uDC00]", // the Java compiler resolves these escapes, so the pattern holds the literal surrogate pair
+            "[abc\\U00010000]", // pattern text holds one 8-digit escape for the supplementary code point
+        };
+        for (int i=0; i<DATA.length; ++i) {
+            logln("Test pattern " + i + " :" + Utility.escape(DATA[i])); // log the escaped form of the pattern under test
+            UnicodeSet set = new UnicodeSet(DATA[i]);
+            expectContainment(set, // NOTE(review): presumably (set, chars expected in, chars expected out) - confirm against the helper
+                              CharsToUnicodeString("abc\\U00010000"), // presumably unescaped to a, b, c and U+10000 - confirm
+                              "\uD800;\uDC00"); // split apart surrogate-pair: lone lead, ';', lone trail
+            if (set.size() != 4) { // expected members: a, b, c and the single code point U+10000
+                errln(Utility.escape("FAIL: " + DATA[i] + ".size() == " + 
+                      set.size() + ", expected 4"));
+            }
+        }
+    }
+
    void _testComplement(int a) {
        UnicodeSet x = bitsToSet(a);
        UnicodeSet z = bitsToSet(a);
--- a/icu4j/src/com/ibm/icu/impl/Utility.java
+++ b/icu4j/src/com/ibm/icu/impl/Utility.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/Utility.java,v $
- * $Date: 2003/10/07 16:51:56 $
- * $Revision: 1.46 $
+ * $Date: 2003/10/07 17:22:14 $
+ * $Revision: 1.47 $
 *
 *****************************************************************************************
 */
@ -803,6 +803,27 @@ public final class Utility {
            if (result < 0 || result >= 0x110000) {
                return -1;
            }
+            // If a 'u' escape sequence (16-bit) specifies a lead
+            // surrogate, see if there is a trail surrogate after it,
+            // either as a 'u' escape or as a literal.  If so, join
+            // them up into a supplementary.
+            if (maxDig == 4 && offset < length &&
+                UTF16.isLeadSurrogate((char) result)) {
+                c = s.charAt(offset); // [sic] get 16-bit code unit
+                int ahead = offset+1;
+                // ONLY parse backslash 'u', nothing else
+                if (c == '\\' && (offset+1) < length &&
+                    s.charAt(offset+1) == 'u') {
+                    int o[] = new int[] { ahead };
+                    c = unescapeAt(s, o);
+                    ahead = o[0];
+                }
+                if (UTF16.isTrailSurrogate((char) c)) {
+                    offset = ahead;
+	            result = UCharacterProperty.getRawSupplementary(
+                                  (char) result, (char) c);
+                }
+            }
            offset16[0] = offset;
            return result;
        }