ICU-4790 spoof cleanups

X-SVN-Rev: 25876
2009-04-23 01:30:02 +00:00 · 2009-04-23 01:30:02 +00:00 · b08b9e8625
commit b08b9e8625
parent 1c326702cc
4 changed files with 82 additions and 39 deletions
--- a/icu4c/source/i18n/unicode/uspoof.h
+++ b/icu4c/source/i18n/unicode/uspoof.h
@ -603,21 +603,16 @@ uspoof_checkUnicodeString(const USpoofChecker *sc,
 * or whole script - are determined by the check options set for the
 * USpoofChecker.
 *
- * TODO: expand on the following
+ * The tests to be performed are controlled by the flags
 * There are four possible types of comparisons:
 *    Mixed Script,  Lower Case
 *    Mixed Script,  Any Case
 *    Single Script, Lower Case
 *    Single Script, Any Case
 * Which tests are performed is controlled by the flags
 *   USPOOF_SINGLE_SCRIPT_CONFUSABLE 
 *   USPOOF_MIXED_SCRIPT_CONFUSABLE  
- * One or both of these must be set.
+ *   USPOOF_WHOLE_SCRIPT_CONFUSABLE
 * At least one of these tests must be selected.
 * 
- * USPOOF_ANY_CASE is a modifier.  Choose it if the identifiers
+ * USPOOF_ANY_CASE is a modifier for the tests.  Select it if the identifiers
- *   are case-sensitive and may be of mixed case.
+ *   may be of mixed case.
- * If identifiers are normalized to lower case for comparison or
+ * If identifiers are case folded for comparison and
- * display to the user, do not select the ANY_CASE option.
+ * display to the user, do not select the USPOOF_ANY_CASE option.
 *
 *
 * @param sc      The USpoofChecker
--- a/icu4c/source/i18n/uspoof.cpp
+++ b/icu4c/source/i18n/uspoof.cpp
@ -551,6 +551,12 @@ uspoof_getSkeleton(const USpoofChecker *sc,
                   UChar *dest, int32_t destCapacity,
                   UErrorCode *status) {
    // TODO:  this function could be sped up a bit
    //        Skip the input normalization when not needed, work from callers data.
    //        Put the initial skeleton straight into the caller's destination buffer.
    //        It probably won't need normalization.
    //        But these would make the structure more complicated.  
    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (U_FAILURE(*status)) {
        return 0;
@ -582,8 +588,8 @@ uspoof_getSkeleton(const USpoofChecker *sc,
    // NFKD transform of the user supplied input
-    UChar nfkdBuf[USPOOF_STACK_BUFFER_SIZE];
+    UChar nfkdStackBuf[USPOOF_STACK_BUFFER_SIZE];
-    UChar *nfkdInput = nfkdBuf;
+    UChar *nfkdInput = nfkdStackBuf;
    int32_t normalizedLen = unorm_normalize(
        s, length, UNORM_NFKD, 0, nfkdInput, USPOOF_STACK_BUFFER_SIZE, status);
    if (*status == U_BUFFER_OVERFLOW_ERROR) {
@ -597,47 +603,67 @@ uspoof_getSkeleton(const USpoofChecker *sc,
                                        nfkdInput, normalizedLen+1, status);
    }
    if (U_FAILURE(*status)) {
        if (nfkdInput != nfkdStackBuf) {
            uprv_free(nfkdInput);
        }
        return 0;
    }
-    // buffer to hold the Unicode defined mappings for a single code point
+    // buffer to hold the Unicode defined skeleton mappings for a single code point
    UChar buf[USPOOF_MAX_SKELETON_EXPANSION];
-    // Apply the mapping to the NFKD form string
+    // Apply the skeleton mapping to the NFKD normalized input string
-    
+    // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
    int32_t inputIndex = 0;
-    int32_t resultLen = 0;
+    UnicodeString skelStr;
    while (inputIndex < normalizedLen) {
        UChar32 c;
        U16_NEXT(nfkdInput, inputIndex, normalizedLen, c);
        int32_t replaceLen = This->confusableLookup(c, tableMask, buf);
-        if (resultLen + replaceLen < destCapacity) {
+        skelStr.append(buf, replaceLen);
            int i;
            for (i=0; i<replaceLen; i++) {
                dest[resultLen++] = buf[i];
            }
        } else {
            // Storing the transformed string would overflow the dest buffer.
            //   Don't bother storing anything, just sum up the required buffer size.
            //   (We don't guarantee that a truncated buffer is filled to its end)
            resultLen += replaceLen;
        }
    }
-    
+
-    if (resultLen < destCapacity) {
+    if (nfkdInput != nfkdStackBuf) {
        dest[resultLen] = 0;
    } else if (resultLen == destCapacity) {
        *status = U_STRING_NOT_TERMINATED_WARNING;
    } else {
        *status = U_BUFFER_OVERFLOW_ERROR;
    }
    if (nfkdInput != nfkdBuf) {
        uprv_free(nfkdInput);
    }
-    return resultLen;
+    
    const UChar *result = skelStr.getBuffer();
    int32_t  resultLen  = skelStr.length();
    UChar   *normedResult = NULL;
    // Check the skeleton for NFKD, normalize it if needed.
    // Unnormalized results should be very rare.
    if (!unorm_isNormalized(result, resultLen, UNORM_NFKD, status)) {
        normalizedLen = unorm_normalize(dest, resultLen, UNORM_NFKD, 0, NULL, 0, status);
        UChar *normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar)));
        if (normedResult == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        unorm_normalize(result, resultLen, UNORM_NFKD, 0, normedResult, normalizedLen+1, status);
        result = normedResult;
        resultLen = normalizedLen;
    }
    // Copy the skeleton to the caller's buffer
    if (U_SUCCESS(*status)) {
        if (destCapacity == 0 || resultLen > destCapacity) {
            *status = resultLen>destCapacity ? U_BUFFER_OVERFLOW_ERROR : U_STRING_NOT_TERMINATED_WARNING;
        } else {
            u_memcpy(dest, result, resultLen);
            if (destCapacity > resultLen) {
                dest[resultLen] = 0;
            } else {
                *status = U_STRING_NOT_TERMINATED_WARNING;
            }
        }
     }       
     uprv_free(normedResult);
     return resultLen;
 }
 U_CAPI UnicodeString &  U_EXPORT2
 uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
                                uint32_t type,
--- a/icu4c/source/test/intltest/itspoof.cpp
+++ b/icu4c/source/test/intltest/itspoof.cpp
@ -67,6 +67,12 @@ void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name
                testSkeleton();
            }
            break;
         case 2:
            name = "TestAreConfusable";
            if (exec) {
                testAreConfusable();
            }
            break;
        default: name=""; break;
    }
 }
@ -199,4 +205,18 @@ void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type,
    }
 }
 // Checks that uspoof_areConfusableUnicodeString() reports two long strings,
 // which differ only by "will" vs. "wi11", as USPOOF_SINGLE_SCRIPT_CONFUSABLE,
 // and that the call leaves status in a success state.
 // NOTE(review): the inputs appear to be deliberately long so that fixed-size
 // stack buffers in the implementation are exceeded -- confirm against uspoof.cpp.
 void IntlTestSpoof::testAreConfusable() {
    UErrorCode status = U_ZERO_ERROR;
    // NOTE(review): 'sc' used below is not declared in this function; presumably
    // TEST_SETUP creates it and TEST_TEARDOWN releases it -- verify the macros.
    TEST_SETUP
        // s1 is the reference text.
        UnicodeString s1("A long string that will overflow stack buffers.  A long string that will overflow stack buffers. "
                         "A long string that will overflow stack buffers.  A long string that will overflow stack buffers. ");
        // s2 is the same text with "will" written as "wi11" (digit one) in two places.
        UnicodeString s2("A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. "
                         "A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. ");
        TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, uspoof_areConfusableUnicodeString(sc, s1, s2, &status));
        TEST_ASSERT_SUCCESS(status);
    TEST_TEARDOWN;
 }
 #endif /* #if !UCONFIG_NO_SPOOF_DETECT*/
--- a/icu4c/source/test/intltest/itspoof.h
+++ b/icu4c/source/test/intltest/itspoof.h
@ -26,6 +26,8 @@ public:
    void  testSpoofAPI();
    void  testSkeleton();
    void testAreConfusable();
    // Internal function to run a single skeleton test case.
    void  checkSkeleton(const USpoofChecker *sc, uint32_t flags,