From b08b9e86257a09ee593e9d4a6d56995808b968a0 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Thu, 23 Apr 2009 01:30:02 +0000 Subject: [PATCH] ICU-4790 spoof cleanups X-SVN-Rev: 25876 --- icu4c/source/i18n/unicode/uspoof.h | 19 +++--- icu4c/source/i18n/uspoof.cpp | 80 +++++++++++++++++--------- icu4c/source/test/intltest/itspoof.cpp | 20 +++++++ icu4c/source/test/intltest/itspoof.h | 2 + 4 files changed, 82 insertions(+), 39 deletions(-) diff --git a/icu4c/source/i18n/unicode/uspoof.h b/icu4c/source/i18n/unicode/uspoof.h index 20b0106712..71838570ab 100644 --- a/icu4c/source/i18n/unicode/uspoof.h +++ b/icu4c/source/i18n/unicode/uspoof.h @@ -603,21 +603,16 @@ uspoof_checkUnicodeString(const USpoofChecker *sc, * or whole script - are determined by the check options set for the * USpoofChecker. * - * TODO: expand on the following - * There are four possible types of comarisons: - * Mixed Script, Lower Case - * Mixed Script, Any Case - * Single Script, Lower Case - * Single Script, Any Case - * Which tests are performed is controlled by the flags + * The tests to be performed are controlled by the flags * USPOOF_SINGLE_SCRIPT_CONFUSABLE * USPOOF_MIXED_SCRIPT_CONFUSABLE - * One or both of these must be set. + * USPOOF_WHOLE_SCRIPT_CONFUSABLE + * At least one of these tests must be selected. * - * USPOOF_ANY_CASE is a modifier. Choose it if the identifiers - * are case-sensitive and may be of mixed case. - * If identifiers are normalized to lower case for comparison or - * display to the user, do not select the ANY_CASE option. + * USPOOF_ANY_CASE is a modifier for the tests. Select it if the identifiers + * may be of mixed case. + * If identifiers are case folded for comparison and + * display to the user, do not select the USPOOF_ANY_CASE option. * * * @param sc The USpoofChecker diff --git a/icu4c/source/i18n/uspoof.cpp b/icu4c/source/i18n/uspoof.cpp index a493e8470b..2253d37b5c 100644 --- a/icu4c/source/i18n/uspoof.cpp +++ b/icu4c/source/i18n/uspoof.cpp @@ -551,6 +551,12 @@ uspoof_getSkeleton(const USpoofChecker *sc, UChar *dest, int32_t destCapacity, UErrorCode *status) { + // TODO: this function could be sped up a bit + // Skip the input normalization when not needed, work from callers data. + // Put the initial skeleton straight into the caller's destination buffer. + // It probably won't need normalization. + // But these would make the structure more complicated. + const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); if (U_FAILURE(*status)) { return 0; @@ -582,8 +588,8 @@ uspoof_getSkeleton(const USpoofChecker *sc, // NFKD transform of the user supplied input - UChar nfkdBuf[USPOOF_STACK_BUFFER_SIZE]; - UChar *nfkdInput = nfkdBuf; + UChar nfkdStackBuf[USPOOF_STACK_BUFFER_SIZE]; + UChar *nfkdInput = nfkdStackBuf; int32_t normalizedLen = unorm_normalize( s, length, UNORM_NFKD, 0, nfkdInput, USPOOF_STACK_BUFFER_SIZE, status); if (*status == U_BUFFER_OVERFLOW_ERROR) { @@ -597,47 +603,67 @@ uspoof_getSkeleton(const USpoofChecker *sc, nfkdInput, normalizedLen+1, status); } if (U_FAILURE(*status)) { + if (nfkdInput != nfkdStackBuf) { + uprv_free(nfkdInput); + } return 0; } - // buffer to hold the Unicode defined mappings for a single code point + // buffer to hold the Unicode defined skeleton mappings for a single code point UChar buf[USPOOF_MAX_SKELETON_EXPANSION]; - // Apply the mapping to the NFKD form string - + // Apply the skeleton mapping to the NFKD normalized input string + // Accumulate the skeleton, possibly unnormalized, in a UnicodeString. int32_t inputIndex = 0; - int32_t resultLen = 0; + UnicodeString skelStr; while (inputIndex < normalizedLen) { UChar32 c; U16_NEXT(nfkdInput, inputIndex, normalizedLen, c); int32_t replaceLen = This->confusableLookup(c, tableMask, buf); - if (resultLen + replaceLen < destCapacity) { - int i; - for (i=0; i(uprv_malloc((normalizedLen+1)*sizeof(UChar))); + if (normedResult == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + unorm_normalize(result, resultLen, UNORM_NFKD, 0, normedResult, normalizedLen+1, status); + result = normedResult; + resultLen = normalizedLen; + } + + // Copy the skeleton to the caller's buffer + if (U_SUCCESS(*status)) { + if (destCapacity == 0 || resultLen > destCapacity) { + *status = resultLen>destCapacity ? U_BUFFER_OVERFLOW_ERROR : U_STRING_NOT_TERMINATED_WARNING; + } else { + u_memcpy(dest, result, resultLen); + if (destCapacity > resultLen) { + dest[resultLen] = 0; + } else { + *status = U_STRING_NOT_TERMINATED_WARNING; + } + } + } + uprv_free(normedResult); + return resultLen; } + U_CAPI UnicodeString & U_EXPORT2 uspoof_getSkeletonUnicodeString(const USpoofChecker *sc, uint32_t type, diff --git a/icu4c/source/test/intltest/itspoof.cpp b/icu4c/source/test/intltest/itspoof.cpp index cb63dac091..230cf94468 100644 --- a/icu4c/source/test/intltest/itspoof.cpp +++ b/icu4c/source/test/intltest/itspoof.cpp @@ -67,6 +67,12 @@ void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name testSkeleton(); } break; + case 2: + name = "TestAreConfusable"; + if (exec) { + testAreConfusable(); + } + break; default: name=""; break; } } @@ -199,4 +205,18 @@ void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type, } } +void IntlTestSpoof::testAreConfusable() { + UErrorCode status = U_ZERO_ERROR; + TEST_SETUP + UnicodeString s1("A long string that will overflow stack buffers. A long string that will overflow stack buffers. " + "A long string that will overflow stack buffers. A long string that will overflow stack buffers. "); + UnicodeString s2("A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. " + "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. "); + TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, uspoof_areConfusableUnicodeString(sc, s1, s2, &status)); + TEST_ASSERT_SUCCESS(status); + + TEST_TEARDOWN; +} + + #endif /* #if !UCONFIG_NO_SPOOF_DETECT*/ diff --git a/icu4c/source/test/intltest/itspoof.h b/icu4c/source/test/intltest/itspoof.h index 9bbcad5740..2c013aec11 100644 --- a/icu4c/source/test/intltest/itspoof.h +++ b/icu4c/source/test/intltest/itspoof.h @@ -26,6 +26,8 @@ public: void testSpoofAPI(); void testSkeleton(); + + void testAreConfusable(); // Internal function to run a single skeleton test case. void checkSkeleton(const USpoofChecker *sc, uint32_t flags,