ICU-410 use unified unescape function(s)

X-SVN-Rev: 1897
2000-07-16 13:42:38 +00:00 · 2000-07-16 13:42:38 +00:00 · 5e9d055754
commit 5e9d055754
parent cb74716658
5 changed files with 21 additions and 143 deletions
--- a/icu4c/source/i18n/rbt_pars.cpp
+++ b/icu4c/source/i18n/rbt_pars.cpp
@ -240,27 +240,11 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit,
            if (pos == limit) {
                return syntaxError(RuleBasedTransliterator::TRAILING_BACKSLASH, rule, start);
            }
-
-            // UNLIKE THE JAVA version, we parse \uXXXX escapes.  We
-            // do not do this in Java because the compiler has already
-            // done it when the ResourceBundle file was compiled.
-            // Parse \uXXXX escapes
-            c = rule.charAt(pos++);
-            if (c == 0x0075/*u*/) {
-                if ((pos+4) > limit) {
+            UChar32 escaped = rule.unescapeAt(pos); // pos is already past '\\'
+            if (escaped == (UChar32) -1) {
                return syntaxError(RuleBasedTransliterator::MALFORMED_UNICODE_ESCAPE, rule, start);
            }
-                c = (UChar)0x0000;
-                for (int32_t plim=pos+4; pos<plim; ++pos) { // [sic]
-                    int32_t digit = Unicode::digit(rule.charAt(pos), 16);
-                    if (digit<0) {
-                        return syntaxError(RuleBasedTransliterator::MALFORMED_UNICODE_ESCAPE, rule, start);
-                    }
-                    c = (UChar) ((c << 4) | digit);
-                }
-            }
- 
-            buf.append(c);
+            buf.append((UChar) escaped);
            continue;
        }
        // Handle quoted matter
--- a/icu4c/source/i18n/uniset.cpp
+++ b/icu4c/source/i18n/uniset.cpp
@ -812,30 +812,15 @@ void UnicodeSet::applyPattern(const UnicodeString& pattern,
             * interpret '\\uxxxx' Unicode escapes here (as literals).
             */
            if (c == BACKSLASH) {
-                ++i;
-                if (i < pattern.length()) {
-                    c = pattern.charAt(i);
+                ++i; // Advance past '\\'
+                UChar32 escaped = pattern.unescapeAt(i);
+                if (escaped == (UChar32) -1) {
+                    status = U_ILLEGAL_ARGUMENT_ERROR;
+                    return;
+                }
                isLiteral = TRUE;
-                    if (c == 0x0075 /*u*/) {
-                        if ((i+4) >= pattern.length()) {
-                            status = U_ILLEGAL_ARGUMENT_ERROR;
-                            return;
-                        }
-                        c = (UChar)0x0000;
-                        for (int32_t j=(++i)+4; i<j; ++i) { // [sic]
-                            int32_t digit = Unicode::digit(pattern.charAt(i), 16);
-                            if (digit<0) {
-                                status = U_ILLEGAL_ARGUMENT_ERROR;
-                                return;
-                            }
-                            c = (UChar) ((c << 4) | digit);
-                        }
                --i; // Move i back to last parsed character
-                    }
-                } else {
-                    status = U_ILLEGAL_ARGUMENT_ERROR;
-                    return;
-                }
+                c = (UChar) escaped;
            }

            /* Parse variable references.  These are treated as literals.  If a
--- a/icu4c/source/test/cintltst/ccolltst.c
+++ b/icu4c/source/test/cintltst/ccolltst.c
@ -119,48 +119,10 @@ UChar* appendCompareResult(UCollationResult result, UChar* target)
    return target;
 }

-UChar* CharsToUChars(const char* chars)
-{
-    int unicode;
-    int i;
-    UChar *buffer;
-    UChar *alias;
-    int count = 0;
-
-    /* preflight */
-    for (i = 0; chars[i] != 0;) {
-        if ((chars[i] == '\\') && (chars[i+1] == 'u')) {
-            i += 6;
-        } else {
-            i++;
-        }
-        ++count;
-    }
-
-    alias = buffer = (UChar*) malloc(sizeof(UChar) * (count + 1));
-    
-    for (;;) {
-        /* search for \u or the end */
-        for(i = 0; chars[i] != 0 && !(chars[i] == '\\' && chars[i+1] == 'u'); ++i) {}
-
-        /* convert characters between escape sequences */
-        if(i > 0) {
-            u_charsToUChars(chars, alias, i);
-            chars += i;
-            alias += i;
-        }
-
-        /* did we reach the end or an escape sequence? */
-        if(*chars == 0) {
-            break;
-        }
-
-        /* unescape one character: we know that there is a \u sequence at chars[limit] */
-        chars += 2;
-        sscanf(chars, "%4X", &unicode);
-        *alias++ = (UChar)unicode;
-        chars += 4;
-    }
-    *alias = 0x0000;
-    return buffer;
+UChar* CharsToUChars(const char* str) {
+    /* Unescape str into a malloc()ed, NUL-terminated UChar string owned by the caller. (uprv_strlen() would be a cheaper preflight bound - liu) */
+    int32_t len = u_unescape(str, 0, 0) + 1; /* preflight; +1 because the returned length excludes the terminating NUL */
+    UChar *buf = (UChar*) malloc(sizeof(UChar) * len);
+    u_unescape(str, buf, len); /* capacity now covers the terminator, so the result is always NUL-terminated */
+    return buf;
 }
--- a/icu4c/source/test/intltest/intltest.cpp
+++ b/icu4c/source/test/intltest/intltest.cpp
@ -22,6 +22,7 @@
 #include "unicode/ures.h"
 #include "unicode/coll.h"
 #include "unicode/smpdtfmt.h"
+#include "unicode/ustring.h"

 #include "intltest.h"
 #include "itmajor.h"
@ -1008,37 +1009,8 @@ main(int argc, char* argv[])
 */
 UnicodeString CharsToUnicodeString(const char* chars)
 {
-    int unicode;
-    int i;
-    UnicodeString result;
-    UChar buffer[400];
-
-    for (;;) {
-        /* repeat the following according to the length of the buffer */
-        do {
-            /* search for \u or the end */
-            for(i = 0; i < 400 && chars[i] != 0 && !(chars[i] == '\\' && chars[i+1] == 'u'); ++i) {}
-
-            /* convert characters between escape sequences */
-            if(i > 0) {
-                u_charsToUChars(chars, buffer, i);
-                result.append(buffer, i);
-                chars += i;
-            }
-        } while(i == 400);
-
-        /* did we reach the end or an escape sequence? */
-        if(*chars == 0) {
-            break;
-        }
-
-        /* unescape one character: we know that there is a \u sequence at chars[limit] */
-        chars += 2;
-        sscanf(chars, "%4X", &unicode);
-        result.append((UChar)unicode);
-        chars += 4;
-    }
-    return result;
+    UnicodeString str(chars, ""); // Invariant conversion (requested by the empty "" codepage argument)
+    return str.unescape(); // Escape handling is delegated to the unified UnicodeString::unescape()
 }

 /*
--- a/icu4c/source/test/intltest/tstnorm.cpp
+++ b/icu4c/source/test/intltest/tstnorm.cpp
@ -43,33 +43,8 @@ void BasicNormalizerTest::runIndexedTest(int32_t index, UBool exec,
 */
 static UnicodeString str(const char *input)
 {
-  static const UnicodeString digitString1("0123456789ABCDEF");
-  static const UnicodeString digitString2("0123456789abcdef");
-  
-  UnicodeString result(input);
-  int index = 0;
-  
-  while ((index = result.indexOf("\\u")) != -1)
-    {
-      if (index + 6 <= result.length())
-    {
-      UChar c = 0;
-      for (int i = index + 2; i < index + 6; i++) {
-        UTextOffset value = digitString1.indexOf(result[i]);
-        
-        if (value == -1) {
-          value = digitString2.indexOf(result[i]);
-        }
-        c = (UChar)(c * 16 + value);
-      }
-      UnicodeString replace;
-      replace += c;
-      result.replace(index, 6, replace);
-    }
-      index += 1;
-    }
-
-  return result;
+    UnicodeString str(input, ""); // Invariant conversion; note this local shadows the enclosing function's name
+    return str.unescape(); // Escape handling is delegated to the unified UnicodeString::unescape()
 }