From 497e88ec1f8078b4bef7632427f3d32a46eb03cf Mon Sep 17 00:00:00 2001
From: Andy Heninger <andy.heninger@gmail.com>
Date: Thu, 24 Feb 2011 22:55:07 +0000
Subject: [PATCH] ICU-8341 spoof skeleton computed using NFD, not NFKD

X-SVN-Rev: 29475
---
 icu4c/source/i18n/uspoof.cpp           | 50 +++++++++++++-------------
 icu4c/source/i18n/uspoof_impl.cpp      | 16 ++++-----
 icu4c/source/i18n/uspoof_impl.h        |  8 ++---
 icu4c/source/test/intltest/itspoof.cpp | 21 ++++++-----
 4 files changed, 47 insertions(+), 48 deletions(-)

diff --git a/icu4c/source/i18n/uspoof.cpp b/icu4c/source/i18n/uspoof.cpp
index 2cb785b7b9..48e3e45dc4 100644
--- a/icu4c/source/i18n/uspoof.cpp
+++ b/icu4c/source/i18n/uspoof.cpp
@@ -1,6 +1,6 @@
 /*
 ***************************************************************************
-* Copyright (C) 2008-2009, International Business Machines Corporation
+* Copyright (C) 2008-2011, International Business Machines Corporation
 * and others. All Rights Reserved.
 ***************************************************************************
 *   file name:  uspoof.cpp
@@ -240,10 +240,10 @@ uspoof_check(const USpoofChecker *sc,
 
     if (This->fChecks & 
         (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
-        // These are the checks that need to be done on NFKD input
-        NFKDBuffer   normalizedInput(text, length, *status);
-        const UChar  *nfkdText = normalizedInput.getBuffer();
-        int32_t      nfkdLength = normalizedInput.getLength();
+        // These are the checks that need to be done on NFD input
+        NFDBuffer   normalizedInput(text, length, *status);
+        const UChar  *nfdText = normalizedInput.getBuffer();
+        int32_t      nfdLength = normalizedInput.getLength();
 
         if (This->fChecks & USPOOF_INVISIBLE) {
            
@@ -256,7 +256,7 @@ uspoof_check(const USpoofChecker *sc,
             UnicodeSet  marksSeenSoFar;   // Set of combining marks in a single combining sequence.
             
             for (i=0; i<length ;) {
-                U16_NEXT(nfkdText, i, nfkdLength, c);
+                U16_NEXT(nfdText, i, nfdLength, c);
                 if (u_charType(c) != U_NON_SPACING_MARK) {
                     firstNonspacingMark = 0;
                     if (haveMultipleMarks) {
@@ -304,7 +304,7 @@ uspoof_check(const USpoofChecker *sc,
             }
             
             ScriptSet scripts;
-            This->wholeScriptCheck(nfkdText, nfkdLength, &scripts, *status);
+            This->wholeScriptCheck(nfdText, nfdLength, &scripts, *status);
             int32_t confusableScriptCount = scripts.countMembers();
             //printf("confusableScriptCount = %d\n", confusableScriptCount);
             
@@ -631,25 +631,25 @@ uspoof_getSkeleton(const USpoofChecker *sc,
         return 0;
     }
 
-    // NFKD transform of the user supplied input
+    // NFD transform of the user supplied input
     
-    UChar nfkdStackBuf[USPOOF_STACK_BUFFER_SIZE];
-    UChar *nfkdInput = nfkdStackBuf;
+    UChar nfdStackBuf[USPOOF_STACK_BUFFER_SIZE];
+    UChar *nfdInput = nfdStackBuf;
     int32_t normalizedLen = unorm_normalize(
-        s, length, UNORM_NFKD, 0, nfkdInput, USPOOF_STACK_BUFFER_SIZE, status);
+        s, length, UNORM_NFD, 0, nfdInput, USPOOF_STACK_BUFFER_SIZE, status);
     if (*status == U_BUFFER_OVERFLOW_ERROR) {
-        nfkdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar));
-        if (nfkdInput == NULL) {
+        nfdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar));
+        if (nfdInput == NULL) {
             *status = U_MEMORY_ALLOCATION_ERROR;
             return 0;
         }
         *status = U_ZERO_ERROR;
-        normalizedLen = unorm_normalize(s, length, UNORM_NFKD, 0,
-                                        nfkdInput, normalizedLen+1, status);
+        normalizedLen = unorm_normalize(s, length, UNORM_NFD, 0,
+                                        nfdInput, normalizedLen+1, status);
     }
     if (U_FAILURE(*status)) {
-        if (nfkdInput != nfkdStackBuf) {
-            uprv_free(nfkdInput);
+        if (nfdInput != nfdStackBuf) {
+            uprv_free(nfdInput);
         }
         return 0;
     }
@@ -657,36 +657,36 @@ uspoof_getSkeleton(const USpoofChecker *sc,
     // buffer to hold the Unicode defined skeleton mappings for a single code point
     UChar buf[USPOOF_MAX_SKELETON_EXPANSION];
 
-    // Apply the skeleton mapping to the NFKD normalized input string
+    // Apply the skeleton mapping to the NFD normalized input string
     // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
     int32_t inputIndex = 0;
     UnicodeString skelStr;
     while (inputIndex < normalizedLen) {
         UChar32 c;
-        U16_NEXT(nfkdInput, inputIndex, normalizedLen, c);
+        U16_NEXT(nfdInput, inputIndex, normalizedLen, c);
         int32_t replaceLen = This->confusableLookup(c, tableMask, buf);
         skelStr.append(buf, replaceLen);
     }
 
-    if (nfkdInput != nfkdStackBuf) {
-        uprv_free(nfkdInput);
+    if (nfdInput != nfdStackBuf) {
+        uprv_free(nfdInput);
     }
     
     const UChar *result = skelStr.getBuffer();
     int32_t  resultLen  = skelStr.length();
     UChar   *normedResult = NULL;
 
-    // Check the skeleton for NFKD, normalize it if needed.
+    // Check the skeleton for NFD, normalize it if needed.
     // Unnormalized results should be very rare.
-    if (!unorm_isNormalized(result, resultLen, UNORM_NFKD, status)) {
-        normalizedLen = unorm_normalize(result, resultLen, UNORM_NFKD, 0, NULL, 0, status);
+    if (!unorm_isNormalized(result, resultLen, UNORM_NFD, status)) {
+        normalizedLen = unorm_normalize(result, resultLen, UNORM_NFD, 0, NULL, 0, status);
         normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar)));
         if (normedResult == NULL) {
             *status = U_MEMORY_ALLOCATION_ERROR;
             return 0;
         }
         *status = U_ZERO_ERROR;
-        unorm_normalize(result, resultLen, UNORM_NFKD, 0, normedResult, normalizedLen+1, status);
+        unorm_normalize(result, resultLen, UNORM_NFD, 0, normedResult, normalizedLen+1, status);
         result = normedResult;
         resultLen = normalizedLen;
     }
diff --git a/icu4c/source/i18n/uspoof_impl.cpp b/icu4c/source/i18n/uspoof_impl.cpp
index 322cc4da42..c6359a3515 100644
--- a/icu4c/source/i18n/uspoof_impl.cpp
+++ b/icu4c/source/i18n/uspoof_impl.cpp
@@ -222,7 +222,7 @@ int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *de
 //
 //  wholeScriptCheck()
 //
-//      Input text is already normalized to NFKD
+//      Input text is already normalized to NFD
 //      Return the set of scripts, each of which can represent something that is
 //             confusable with the input text.  The script of the input text
 //             is included; input consisting of characters from a single script will
@@ -769,11 +769,11 @@ int32_t ScriptSet::countMembers() {
 
 //-----------------------------------------------------------------------------
 //
-//  NFKDBuffer Implementation.
+//  NFDBuffer Implementation.
 //
 //-----------------------------------------------------------------------------
 
-NFKDBuffer::NFKDBuffer(const UChar *text, int32_t length, UErrorCode &status) {
+NFDBuffer::NFDBuffer(const UChar *text, int32_t length, UErrorCode &status) {
     fNormalizedText = NULL;
     fNormalizedTextLength = 0;
     fOriginalText = text;
@@ -782,32 +782,32 @@ NFKDBuffer::NFKDBuffer(const UChar *text, int32_t length, UErrorCode &status) {
     }
     fNormalizedText = fSmallBuf;
     fNormalizedTextLength = unorm_normalize(
-        text, length, UNORM_NFKD, 0, fNormalizedText, USPOOF_STACK_BUFFER_SIZE, &status);
+        text, length, UNORM_NFD, 0, fNormalizedText, USPOOF_STACK_BUFFER_SIZE, &status);
     if (status == U_BUFFER_OVERFLOW_ERROR) {
         status = U_ZERO_ERROR;
         fNormalizedText = (UChar *)uprv_malloc((fNormalizedTextLength+1)*sizeof(UChar));
         if (fNormalizedText == NULL) {
             status = U_MEMORY_ALLOCATION_ERROR;
         } else {
-            fNormalizedTextLength = unorm_normalize(text, length, UNORM_NFKD, 0,
+            fNormalizedTextLength = unorm_normalize(text, length, UNORM_NFD, 0,
                                         fNormalizedText, fNormalizedTextLength+1, &status);
         }
     }
 }
 
 
-NFKDBuffer::~NFKDBuffer() {
+NFDBuffer::~NFDBuffer() {
     if (fNormalizedText != fSmallBuf) {
         uprv_free(fNormalizedText);
     }
     fNormalizedText = 0;
 }
 
-const UChar *NFKDBuffer::getBuffer() {
+const UChar *NFDBuffer::getBuffer() {
     return fNormalizedText;
 }
 
-int32_t NFKDBuffer::getLength() {
+int32_t NFDBuffer::getLength() {
     return fNormalizedTextLength;
 }
 
diff --git a/icu4c/source/i18n/uspoof_impl.h b/icu4c/source/i18n/uspoof_impl.h
index 49b86e546e..fb125cefa6 100644
--- a/icu4c/source/i18n/uspoof_impl.h
+++ b/icu4c/source/i18n/uspoof_impl.h
@@ -214,7 +214,7 @@ class ScriptSet: public UMemory {
 
 //-------------------------------------------------------------------------------
 //
-//  NFKDBuffer   A little class to handle the NFKD normalization that is
+//  NFDBuffer    A little class to handle the NFD normalization that is
 //               needed on incoming identifiers to be checked.
 //               Takes care of buffer handling and normalization
 //
@@ -223,10 +223,10 @@ class ScriptSet: public UMemory {
 //               TODO:  how to map position offsets back to user values?
 //
 //--------------------------------------------------------------------------------
-class NFKDBuffer: public UMemory {
+class NFDBuffer: public UMemory {
 public:
-    NFKDBuffer(const UChar *text, int32_t length, UErrorCode &status);
-    ~NFKDBuffer();
+    NFDBuffer(const UChar *text, int32_t length, UErrorCode &status);
+    ~NFDBuffer();
     const UChar *getBuffer();
     int32_t getLength();
 
diff --git a/icu4c/source/test/intltest/itspoof.cpp b/icu4c/source/test/intltest/itspoof.cpp
index a48d3881e8..dd2c276399 100644
--- a/icu4c/source/test/intltest/itspoof.cpp
+++ b/icu4c/source/test/intltest/itspoof.cpp
@@ -1,6 +1,6 @@
 /*
 **********************************************************************
-* Copyright (C) 2010, International Business Machines Corporation 
+* Copyright (C) 2011, International Business Machines Corporation 
 * and others.  All Rights Reserved.
 **********************************************************************
 */
@@ -153,12 +153,6 @@ void IntlTestSpoof::testSkeleton() {
                " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
                " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.")
 
-        // FC5F ;	FE74 0651 ;   ML  #* ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM to
-        //                                ARABIC KASRATAN ISOLATED FORM, ARABIC SHADDA	
-        //    This character NFKD normalizes to \u0020 \u064d \u0651, so its confusable mapping 
-        //    is never used in creating a skeleton.
-        CHECK_SKELETON(SL, "\\uFC5F", " \\u064d\\u0651");
-
         CHECK_SKELETON(SL, "nochange", "nochange");
         CHECK_SKELETON(MA, "love", "love"); 
         CHECK_SKELETON(MA, "1ove", "love");   // Digit 1 to letter l
@@ -199,6 +193,11 @@ void IntlTestSpoof::testSkeleton() {
         CHECK_SKELETON(ML, "\\u0022", "\\u0027\\u0027");
         CHECK_SKELETON(MA, "\\u0022", "\\u0027\\u0027");
 
+        // 017F ;  0066 ;   LATIN SMALL LETTER LONG S to LATIN SMALL LETTER F
+        // This mapping exists in the SA and MA tables; NFKD would turn 017F into 's' before lookup.
+        CHECK_SKELETON(MA, "\\u017F", "f");
+        CHECK_SKELETON(SA, "\\u017F", "f");
+
     TEST_TEARDOWN;
 }
 
@@ -354,16 +353,16 @@ void IntlTestSpoof::testConfData() {
     TEST_ASSERT_SUCCESS(status);
     while (parseLine.find()) {
         UnicodeString from = parseHex(parseLine.group(1, status));
-        if (!Normalizer::isNormalized(from, UNORM_NFKD, status)) {
-            // The source character was not NFKD.
-            // Skip this case; the first step in obtaining a skeleton is to NFKD the input,
+        if (!Normalizer::isNormalized(from, UNORM_NFD, status)) {
+            // The source string is not in NFD form.
+            // Skip this case; the first step in obtaining a skeleton is to normalize the input to NFD,
             //  so the mapping in this line of confusables.txt will never be applied.
             continue;
         }
 
         UnicodeString rawExpected = parseHex(parseLine.group(2, status));
         UnicodeString expected;
-        Normalizer::decompose(rawExpected, TRUE, 0, expected, status);
+        Normalizer::decompose(rawExpected, FALSE /*NFD*/, 0, expected, status);
         TEST_ASSERT_SUCCESS(status);
 
         int32_t skeletonType = 0;