From 497e88ec1f8078b4bef7632427f3d32a46eb03cf Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Thu, 24 Feb 2011 22:55:07 +0000 Subject: [PATCH] ICU-8341 spoof skeleton computed using NFD, not NFKD X-SVN-Rev: 29475 --- icu4c/source/i18n/uspoof.cpp | 50 +++++++++++++------------- icu4c/source/i18n/uspoof_impl.cpp | 16 ++++----- icu4c/source/i18n/uspoof_impl.h | 8 ++--- icu4c/source/test/intltest/itspoof.cpp | 21 ++++++----- 4 files changed, 47 insertions(+), 48 deletions(-) diff --git a/icu4c/source/i18n/uspoof.cpp b/icu4c/source/i18n/uspoof.cpp index 2cb785b7b9..48e3e45dc4 100644 --- a/icu4c/source/i18n/uspoof.cpp +++ b/icu4c/source/i18n/uspoof.cpp @@ -1,6 +1,6 @@ /* *************************************************************************** -* Copyright (C) 2008-2009, International Business Machines Corporation +* Copyright (C) 2008-2011, International Business Machines Corporation * and others. All Rights Reserved. *************************************************************************** * file name: uspoof.cpp @@ -240,10 +240,10 @@ uspoof_check(const USpoofChecker *sc, if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) { - // These are the checks that need to be done on NFKD input - NFKDBuffer normalizedInput(text, length, *status); - const UChar *nfkdText = normalizedInput.getBuffer(); - int32_t nfkdLength = normalizedInput.getLength(); + // These are the checks that need to be done on NFD input + NFDBuffer normalizedInput(text, length, *status); + const UChar *nfdText = normalizedInput.getBuffer(); + int32_t nfdLength = normalizedInput.getLength(); if (This->fChecks & USPOOF_INVISIBLE) { @@ -256,7 +256,7 @@ uspoof_check(const USpoofChecker *sc, UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence. 
for (i=0; iwholeScriptCheck(nfkdText, nfkdLength, &scripts, *status); + This->wholeScriptCheck(nfdText, nfdLength, &scripts, *status); int32_t confusableScriptCount = scripts.countMembers(); //printf("confusableScriptCount = %d\n", confusableScriptCount); @@ -631,25 +631,25 @@ uspoof_getSkeleton(const USpoofChecker *sc, return 0; } - // NFKD transform of the user supplied input + // NFD transform of the user supplied input - UChar nfkdStackBuf[USPOOF_STACK_BUFFER_SIZE]; - UChar *nfkdInput = nfkdStackBuf; + UChar nfdStackBuf[USPOOF_STACK_BUFFER_SIZE]; + UChar *nfdInput = nfdStackBuf; int32_t normalizedLen = unorm_normalize( - s, length, UNORM_NFKD, 0, nfkdInput, USPOOF_STACK_BUFFER_SIZE, status); + s, length, UNORM_NFD, 0, nfdInput, USPOOF_STACK_BUFFER_SIZE, status); if (*status == U_BUFFER_OVERFLOW_ERROR) { - nfkdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar)); - if (nfkdInput == NULL) { + nfdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar)); + if (nfdInput == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; return 0; } *status = U_ZERO_ERROR; - normalizedLen = unorm_normalize(s, length, UNORM_NFKD, 0, - nfkdInput, normalizedLen+1, status); + normalizedLen = unorm_normalize(s, length, UNORM_NFD, 0, + nfdInput, normalizedLen+1, status); } if (U_FAILURE(*status)) { - if (nfkdInput != nfkdStackBuf) { - uprv_free(nfkdInput); + if (nfdInput != nfdStackBuf) { + uprv_free(nfdInput); } return 0; } @@ -657,36 +657,36 @@ uspoof_getSkeleton(const USpoofChecker *sc, // buffer to hold the Unicode defined skeleton mappings for a single code point UChar buf[USPOOF_MAX_SKELETON_EXPANSION]; - // Apply the skeleton mapping to the NFKD normalized input string + // Apply the skeleton mapping to the NFD normalized input string // Accumulate the skeleton, possibly unnormalized, in a UnicodeString. 
int32_t inputIndex = 0; UnicodeString skelStr; while (inputIndex < normalizedLen) { UChar32 c; - U16_NEXT(nfkdInput, inputIndex, normalizedLen, c); + U16_NEXT(nfdInput, inputIndex, normalizedLen, c); int32_t replaceLen = This->confusableLookup(c, tableMask, buf); skelStr.append(buf, replaceLen); } - if (nfkdInput != nfkdStackBuf) { - uprv_free(nfkdInput); + if (nfdInput != nfdStackBuf) { + uprv_free(nfdInput); } const UChar *result = skelStr.getBuffer(); int32_t resultLen = skelStr.length(); UChar *normedResult = NULL; - // Check the skeleton for NFKD, normalize it if needed. + // Check the skeleton for NFD, normalize it if needed. // Unnormalized results should be very rare. - if (!unorm_isNormalized(result, resultLen, UNORM_NFKD, status)) { - normalizedLen = unorm_normalize(result, resultLen, UNORM_NFKD, 0, NULL, 0, status); + if (!unorm_isNormalized(result, resultLen, UNORM_NFD, status)) { + normalizedLen = unorm_normalize(result, resultLen, UNORM_NFD, 0, NULL, 0, status); normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar))); if (normedResult == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; return 0; } *status = U_ZERO_ERROR; - unorm_normalize(result, resultLen, UNORM_NFKD, 0, normedResult, normalizedLen+1, status); + unorm_normalize(result, resultLen, UNORM_NFD, 0, normedResult, normalizedLen+1, status); result = normedResult; resultLen = normalizedLen; } diff --git a/icu4c/source/i18n/uspoof_impl.cpp b/icu4c/source/i18n/uspoof_impl.cpp index 322cc4da42..c6359a3515 100644 --- a/icu4c/source/i18n/uspoof_impl.cpp +++ b/icu4c/source/i18n/uspoof_impl.cpp @@ -222,7 +222,7 @@ int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *de // // wholeScriptCheck() // -// Input text is already normalized to NFKD +// Input text is already normalized to NFD // Return the set of scripts, each of which can represent something that is // confusable with the input text. 
The script of the input text // is included; input consisting of characters from a single script will @@ -769,11 +769,11 @@ int32_t ScriptSet::countMembers() { //----------------------------------------------------------------------------- // -// NFKDBuffer Implementation. +// NFDBuffer Implementation. // //----------------------------------------------------------------------------- -NFKDBuffer::NFKDBuffer(const UChar *text, int32_t length, UErrorCode &status) { +NFDBuffer::NFDBuffer(const UChar *text, int32_t length, UErrorCode &status) { fNormalizedText = NULL; fNormalizedTextLength = 0; fOriginalText = text; @@ -782,32 +782,32 @@ NFKDBuffer::NFKDBuffer(const UChar *text, int32_t length, UErrorCode &status) { } fNormalizedText = fSmallBuf; fNormalizedTextLength = unorm_normalize( - text, length, UNORM_NFKD, 0, fNormalizedText, USPOOF_STACK_BUFFER_SIZE, &status); + text, length, UNORM_NFD, 0, fNormalizedText, USPOOF_STACK_BUFFER_SIZE, &status); if (status == U_BUFFER_OVERFLOW_ERROR) { status = U_ZERO_ERROR; fNormalizedText = (UChar *)uprv_malloc((fNormalizedTextLength+1)*sizeof(UChar)); if (fNormalizedText == NULL) { status = U_MEMORY_ALLOCATION_ERROR; } else { - fNormalizedTextLength = unorm_normalize(text, length, UNORM_NFKD, 0, + fNormalizedTextLength = unorm_normalize(text, length, UNORM_NFD, 0, fNormalizedText, fNormalizedTextLength+1, &status); } } } -NFKDBuffer::~NFKDBuffer() { +NFDBuffer::~NFDBuffer() { if (fNormalizedText != fSmallBuf) { uprv_free(fNormalizedText); } fNormalizedText = 0; } -const UChar *NFKDBuffer::getBuffer() { +const UChar *NFDBuffer::getBuffer() { return fNormalizedText; } -int32_t NFKDBuffer::getLength() { +int32_t NFDBuffer::getLength() { return fNormalizedTextLength; } diff --git a/icu4c/source/i18n/uspoof_impl.h b/icu4c/source/i18n/uspoof_impl.h index 49b86e546e..fb125cefa6 100644 --- a/icu4c/source/i18n/uspoof_impl.h +++ b/icu4c/source/i18n/uspoof_impl.h @@ -214,7 +214,7 @@ class ScriptSet: public UMemory { 
//------------------------------------------------------------------------------- // -// NFKDBuffer A little class to handle the NFKD normalization that is +// NFDBuffer A little class to handle the NFD normalization that is // needed on incoming identifiers to be checked. // Takes care of buffer handling and normalization // @@ -223,10 +223,10 @@ class ScriptSet: public UMemory { // TODO: how to map position offsets back to user values? // //-------------------------------------------------------------------------------- -class NFKDBuffer: public UMemory { +class NFDBuffer: public UMemory { public: - NFKDBuffer(const UChar *text, int32_t length, UErrorCode &status); - ~NFKDBuffer(); + NFDBuffer(const UChar *text, int32_t length, UErrorCode &status); + ~NFDBuffer(); const UChar *getBuffer(); int32_t getLength(); diff --git a/icu4c/source/test/intltest/itspoof.cpp b/icu4c/source/test/intltest/itspoof.cpp index a48d3881e8..dd2c276399 100644 --- a/icu4c/source/test/intltest/itspoof.cpp +++ b/icu4c/source/test/intltest/itspoof.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 2010, International Business Machines Corporation +* Copyright (C) 2011, International Business Machines Corporation * and others. All Rights Reserved. ********************************************************************** */ @@ -153,12 +153,6 @@ void IntlTestSpoof::testSkeleton() { " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations." " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.") - // FC5F ; FE74 0651 ; ML #* ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM to - // ARABIC KASRATAN ISOLATED FORM, ARABIC SHADDA - // This character NFKD normalizes to \u0020 \u064d \u0651, so its confusable mapping - // is never used in creating a skeleton. 
- CHECK_SKELETON(SL, "\\uFC5F", " \\u064d\\u0651"); - CHECK_SKELETON(SL, "nochange", "nochange"); CHECK_SKELETON(MA, "love", "love"); CHECK_SKELETON(MA, "1ove", "love"); // Digit 1 to letter l @@ -199,6 +193,11 @@ void IntlTestSpoof::testSkeleton() { CHECK_SKELETON(ML, "\\u0022", "\\u0027\\u0027"); CHECK_SKELETON(MA, "\\u0022", "\\u0027\\u0027"); + // 017F ; 0066 ; + // This mapping exists in the SA and MA tables + CHECK_SKELETON(MA, "\\u017F", "f"); + CHECK_SKELETON(SA, "\\u017F", "f"); + TEST_TEARDOWN; } @@ -354,16 +353,16 @@ void IntlTestSpoof::testConfData() { TEST_ASSERT_SUCCESS(status); while (parseLine.find()) { UnicodeString from = parseHex(parseLine.group(1, status)); - if (!Normalizer::isNormalized(from, UNORM_NFKD, status)) { - // The source character was not NFKD. - // Skip this case; the first step in obtaining a skeleton is to NFKD the input, + if (!Normalizer::isNormalized(from, UNORM_NFD, status)) { + // The source character was not NFD. + // Skip this case; the first step in obtaining a skeleton is to NFD the input, // so the mapping in this line of confusables.txt will never be applied. continue; } UnicodeString rawExpected = parseHex(parseLine.group(2, status)); UnicodeString expected; - Normalizer::decompose(rawExpected, TRUE, 0, expected, status); + Normalizer::decompose(rawExpected, FALSE /*NFD*/, 0, expected, status); TEST_ASSERT_SUCCESS(status); int32_t skeletonType = 0;