ICU-4790 spoof cleanups

X-SVN-Rev: 25876
2009-04-23 01:30:02 +00:00 · 2009-04-23 01:30:02 +00:00 · b08b9e8625
commit b08b9e8625
parent 1c326702cc
4 changed files with 82 additions and 39 deletions
--- a/icu4c/source/i18n/unicode/uspoof.h
+++ b/icu4c/source/i18n/unicode/uspoof.h
@ -603,21 +603,16 @@ uspoof_checkUnicodeString(const USpoofChecker *sc,
 * or whole script - are determined by the check options set for the
 * USpoofChecker.
 *
- * TODO: expand on the following
- * There are four possible types of comarisons:
- *    Mixed Script,  Lower Case
- *    Mixed Script,  Any Case
- *    Single Script, Lower Case
- *    Single Script, Any Case
- * Which tests are performed is controlled by the flags
+ * The tests to be performed are controlled by the flags
 *   USPOOF_SINGLE_SCRIPT_CONFUSABLE 
 *   USPOOF_MIXED_SCRIPT_CONFUSABLE  
- * One or both of these must be set.
+ *   USPOOF_WHOLE_SCRIPT_CONFUSABLE
+ * At least one of these tests must be selected.
 * 
- * USPOOF_ANY_CASE is a modifier.  Choose it if the identifiers
- *   are case-sensitive and may be of mixed case.
- * If identifiers are normalized to lower case for comparison or
- * display to the user, do not select the ANY_CASE option.
+ * USPOOF_ANY_CASE is a modifier for the tests.  Select it if the identifiers
+ *   may be of mixed case.
+ * If identifiers are case folded for comparison and
+ * display to the user, do not select the USPOOF_ANY_CASE option.
 *
 *
 * @param sc      The USpoofChecker
--- a/icu4c/source/i18n/uspoof.cpp
+++ b/icu4c/source/i18n/uspoof.cpp
@ -551,6 +551,12 @@ uspoof_getSkeleton(const USpoofChecker *sc,
                   UChar *dest, int32_t destCapacity,
                   UErrorCode *status) {

+    // TODO:  this function could be sped up a bit
+    //        Skip the input normalization when not needed, work from caller's data.
+    //        Put the initial skeleton straight into the caller's destination buffer.
+    //        It probably won't need normalization.
+    //        But these would make the structure more complicated.  
+
    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (U_FAILURE(*status)) {
        return 0;
@ -582,8 +588,8 @@ uspoof_getSkeleton(const USpoofChecker *sc,

    // NFKD transform of the user supplied input
    
-    UChar nfkdBuf[USPOOF_STACK_BUFFER_SIZE];
-    UChar *nfkdInput = nfkdBuf;
+    UChar nfkdStackBuf[USPOOF_STACK_BUFFER_SIZE];
+    UChar *nfkdInput = nfkdStackBuf;
    int32_t normalizedLen = unorm_normalize(
        s, length, UNORM_NFKD, 0, nfkdInput, USPOOF_STACK_BUFFER_SIZE, status);
    if (*status == U_BUFFER_OVERFLOW_ERROR) {
@ -597,47 +603,67 @@ uspoof_getSkeleton(const USpoofChecker *sc,
                                        nfkdInput, normalizedLen+1, status);
    }
    if (U_FAILURE(*status)) {
+        if (nfkdInput != nfkdStackBuf) {
+            uprv_free(nfkdInput);
+        }
        return 0;
    }

-    // buffer to hold the Unicode defined mappings for a single code point
+    // buffer to hold the Unicode defined skeleton mappings for a single code point
    UChar buf[USPOOF_MAX_SKELETON_EXPANSION];

-    // Apply the mapping to the NFKD form string
-    
+    // Apply the skeleton mapping to the NFKD normalized input string
+    // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
    int32_t inputIndex = 0;
-    int32_t resultLen = 0;
+    UnicodeString skelStr;
    while (inputIndex < normalizedLen) {
        UChar32 c;
        U16_NEXT(nfkdInput, inputIndex, normalizedLen, c);
        int32_t replaceLen = This->confusableLookup(c, tableMask, buf);
-        if (resultLen + replaceLen < destCapacity) {
-            int i;
-            for (i=0; i<replaceLen; i++) {
-                dest[resultLen++] = buf[i];
-            }
-        } else {
-            // Storing the transformed string would overflow the dest buffer.
-            //   Don't bother storing anything, just sum up the required buffer size.
-            //   (We dont guarantee that a truncated buffer is filled to it's end)
-            resultLen += replaceLen;
-        }
+        skelStr.append(buf, replaceLen);
    }
-    
-    if (resultLen < destCapacity) {
-        dest[resultLen] = 0;
-    } else if (resultLen == destCapacity) {
-        *status = U_STRING_NOT_TERMINATED_WARNING;
-    } else {
-        *status = U_BUFFER_OVERFLOW_ERROR;
-    }
-    if (nfkdInput != nfkdBuf) {
+
+    if (nfkdInput != nfkdStackBuf) {
        uprv_free(nfkdInput);
    }
-    return resultLen;
+    
+    const UChar *result = skelStr.getBuffer();
+    int32_t  resultLen  = skelStr.length();
+    UChar   *normedResult = NULL;
+
+    // Check the skeleton for NFKD and normalize it if needed; unnormalized results should be very rare.
+    if (!unorm_isNormalized(result, resultLen, UNORM_NFKD, status)) {
+        normalizedLen = unorm_normalize(result, resultLen, UNORM_NFKD, 0, NULL, 0, status);  // preflight on the skeleton, not on dest
+        if (*status == U_BUFFER_OVERFLOW_ERROR) { *status = U_ZERO_ERROR; }  // expected from the zero-capacity preflight
+        normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar)));  // outer pointer, so the uprv_free() below releases it
+        if (normedResult == NULL) {
+            *status = U_MEMORY_ALLOCATION_ERROR;
+            return 0;
+        }
+        unorm_normalize(result, resultLen, UNORM_NFKD, 0, normedResult, normalizedLen+1, status);
+        result = normedResult;
+        resultLen = normalizedLen;
+    }
+
+    // Copy the skeleton to the caller's buffer
+    if (U_SUCCESS(*status)) {
+        if (destCapacity == 0 || resultLen > destCapacity) {
+            *status = resultLen>destCapacity ? U_BUFFER_OVERFLOW_ERROR : U_STRING_NOT_TERMINATED_WARNING;
+        } else {
+            u_memcpy(dest, result, resultLen);
+            if (destCapacity > resultLen) {
+                dest[resultLen] = 0;
+            } else {
+                *status = U_STRING_NOT_TERMINATED_WARNING;
+            }
+        }
+     }       
+     uprv_free(normedResult);
+     return resultLen;
 }


+
 U_CAPI UnicodeString &  U_EXPORT2
 uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
                                uint32_t type,
--- a/icu4c/source/test/intltest/itspoof.cpp
+++ b/icu4c/source/test/intltest/itspoof.cpp
@ -67,6 +67,12 @@ void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name
                testSkeleton();
            }
            break;
+         case 2:
+            name = "TestAreConfusable";
+            if (exec) {
+                testAreConfusable();
+            }
+            break;
        default: name=""; break;
    }
 }
@ -199,4 +205,18 @@ void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type,
    }
 }

+void IntlTestSpoof::testAreConfusable() {  // Test: two long look-alike strings must be reported as single-script confusable.
+    UErrorCode status = U_ZERO_ERROR;
+    TEST_SETUP  // macro defined elsewhere in this file; presumably supplies the checker `sc` used below -- confirm
+        UnicodeString s1("A long string that will overflow stack buffers.  A long string that will overflow stack buffers. "  // reference text
+                         "A long string that will overflow stack buffers.  A long string that will overflow stack buffers. ");  // NOTE(review): presumably longer than the library's stack buffers -- confirm
+        UnicodeString s2("A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. "  // same text, but "wi11" (digit ones) replaces "will" in the 1st sentence...
+                         "A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. ");  // ...and in the 3rd sentence
+        TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, uspoof_areConfusableUnicodeString(sc, s1, s2, &status));  // expect exactly the single-script confusable result
+        TEST_ASSERT_SUCCESS(status);  // the comparison itself must not set an error status
+
+    TEST_TEARDOWN;  // macro defined elsewhere; counterpart of TEST_SETUP
+}
+
+
 #endif /* #if !UCONFIG_NO_SPOOF_DETECT*/
--- a/icu4c/source/test/intltest/itspoof.h
+++ b/icu4c/source/test/intltest/itspoof.h
@ -26,6 +26,8 @@ public:
    void  testSpoofAPI();

    void  testSkeleton();
+
+    void testAreConfusable();
    
    // Internal function to run a single skeleton test case.
    void  checkSkeleton(const USpoofChecker *sc, uint32_t flags,