ICU-8341 spoof skeleton computed using NFD, not NFKD

X-SVN-Rev: 29475
This commit is contained in:
Andy Heninger 2011-02-24 22:55:07 +00:00
parent d0670255e8
commit 497e88ec1f
4 changed files with 47 additions and 48 deletions

View File

@ -1,6 +1,6 @@
/* /*
*************************************************************************** ***************************************************************************
* Copyright (C) 2008-2009, International Business Machines Corporation * Copyright (C) 2008-2011, International Business Machines Corporation
* and others. All Rights Reserved. * and others. All Rights Reserved.
*************************************************************************** ***************************************************************************
* file name: uspoof.cpp * file name: uspoof.cpp
@ -240,10 +240,10 @@ uspoof_check(const USpoofChecker *sc,
if (This->fChecks & if (This->fChecks &
(USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) { (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
// These are the checks that need to be done on NFKD input // These are the checks that need to be done on NFD input
NFKDBuffer normalizedInput(text, length, *status); NFDBuffer normalizedInput(text, length, *status);
const UChar *nfkdText = normalizedInput.getBuffer(); const UChar *nfdText = normalizedInput.getBuffer();
int32_t nfkdLength = normalizedInput.getLength(); int32_t nfdLength = normalizedInput.getLength();
if (This->fChecks & USPOOF_INVISIBLE) { if (This->fChecks & USPOOF_INVISIBLE) {
@ -256,7 +256,7 @@ uspoof_check(const USpoofChecker *sc,
UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence. UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence.
for (i=0; i<length ;) { for (i=0; i<length ;) {
U16_NEXT(nfkdText, i, nfkdLength, c); U16_NEXT(nfdText, i, nfdLength, c);
if (u_charType(c) != U_NON_SPACING_MARK) { if (u_charType(c) != U_NON_SPACING_MARK) {
firstNonspacingMark = 0; firstNonspacingMark = 0;
if (haveMultipleMarks) { if (haveMultipleMarks) {
@ -304,7 +304,7 @@ uspoof_check(const USpoofChecker *sc,
} }
ScriptSet scripts; ScriptSet scripts;
This->wholeScriptCheck(nfkdText, nfkdLength, &scripts, *status); This->wholeScriptCheck(nfdText, nfdLength, &scripts, *status);
int32_t confusableScriptCount = scripts.countMembers(); int32_t confusableScriptCount = scripts.countMembers();
//printf("confusableScriptCount = %d\n", confusableScriptCount); //printf("confusableScriptCount = %d\n", confusableScriptCount);
@ -631,25 +631,25 @@ uspoof_getSkeleton(const USpoofChecker *sc,
return 0; return 0;
} }
// NFKD transform of the user supplied input // NFD transform of the user supplied input
UChar nfkdStackBuf[USPOOF_STACK_BUFFER_SIZE]; UChar nfdStackBuf[USPOOF_STACK_BUFFER_SIZE];
UChar *nfkdInput = nfkdStackBuf; UChar *nfdInput = nfdStackBuf;
int32_t normalizedLen = unorm_normalize( int32_t normalizedLen = unorm_normalize(
s, length, UNORM_NFKD, 0, nfkdInput, USPOOF_STACK_BUFFER_SIZE, status); s, length, UNORM_NFD, 0, nfdInput, USPOOF_STACK_BUFFER_SIZE, status);
if (*status == U_BUFFER_OVERFLOW_ERROR) { if (*status == U_BUFFER_OVERFLOW_ERROR) {
nfkdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar)); nfdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar));
if (nfkdInput == NULL) { if (nfdInput == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR; *status = U_MEMORY_ALLOCATION_ERROR;
return 0; return 0;
} }
*status = U_ZERO_ERROR; *status = U_ZERO_ERROR;
normalizedLen = unorm_normalize(s, length, UNORM_NFKD, 0, normalizedLen = unorm_normalize(s, length, UNORM_NFD, 0,
nfkdInput, normalizedLen+1, status); nfdInput, normalizedLen+1, status);
} }
if (U_FAILURE(*status)) { if (U_FAILURE(*status)) {
if (nfkdInput != nfkdStackBuf) { if (nfdInput != nfdStackBuf) {
uprv_free(nfkdInput); uprv_free(nfdInput);
} }
return 0; return 0;
} }
@ -657,36 +657,36 @@ uspoof_getSkeleton(const USpoofChecker *sc,
// buffer to hold the Unicode defined skeleton mappings for a single code point // buffer to hold the Unicode defined skeleton mappings for a single code point
UChar buf[USPOOF_MAX_SKELETON_EXPANSION]; UChar buf[USPOOF_MAX_SKELETON_EXPANSION];
// Apply the skeleton mapping to the NFKD normalized input string // Apply the skeleton mapping to the NFD normalized input string
// Accumulate the skeleton, possibly unnormalized, in a UnicodeString. // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
int32_t inputIndex = 0; int32_t inputIndex = 0;
UnicodeString skelStr; UnicodeString skelStr;
while (inputIndex < normalizedLen) { while (inputIndex < normalizedLen) {
UChar32 c; UChar32 c;
U16_NEXT(nfkdInput, inputIndex, normalizedLen, c); U16_NEXT(nfdInput, inputIndex, normalizedLen, c);
int32_t replaceLen = This->confusableLookup(c, tableMask, buf); int32_t replaceLen = This->confusableLookup(c, tableMask, buf);
skelStr.append(buf, replaceLen); skelStr.append(buf, replaceLen);
} }
if (nfkdInput != nfkdStackBuf) { if (nfdInput != nfdStackBuf) {
uprv_free(nfkdInput); uprv_free(nfdInput);
} }
const UChar *result = skelStr.getBuffer(); const UChar *result = skelStr.getBuffer();
int32_t resultLen = skelStr.length(); int32_t resultLen = skelStr.length();
UChar *normedResult = NULL; UChar *normedResult = NULL;
// Check the skeleton for NFKD, normalize it if needed. // Check the skeleton for NFD, normalize it if needed.
// Unnormalized results should be very rare. // Unnormalized results should be very rare.
if (!unorm_isNormalized(result, resultLen, UNORM_NFKD, status)) { if (!unorm_isNormalized(result, resultLen, UNORM_NFD, status)) {
normalizedLen = unorm_normalize(result, resultLen, UNORM_NFKD, 0, NULL, 0, status); normalizedLen = unorm_normalize(result, resultLen, UNORM_NFD, 0, NULL, 0, status);
normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar))); normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar)));
if (normedResult == NULL) { if (normedResult == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR; *status = U_MEMORY_ALLOCATION_ERROR;
return 0; return 0;
} }
*status = U_ZERO_ERROR; *status = U_ZERO_ERROR;
unorm_normalize(result, resultLen, UNORM_NFKD, 0, normedResult, normalizedLen+1, status); unorm_normalize(result, resultLen, UNORM_NFD, 0, normedResult, normalizedLen+1, status);
result = normedResult; result = normedResult;
resultLen = normalizedLen; resultLen = normalizedLen;
} }

View File

@ -222,7 +222,7 @@ int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *de
// //
// wholeScriptCheck() // wholeScriptCheck()
// //
// Input text is already normalized to NFKD // Input text is already normalized to NFD
// Return the set of scripts, each of which can represent something that is // Return the set of scripts, each of which can represent something that is
// confusable with the input text. The script of the input text // confusable with the input text. The script of the input text
// is included; input consisting of characters from a single script will // is included; input consisting of characters from a single script will
@ -769,11 +769,11 @@ int32_t ScriptSet::countMembers() {
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
// //
// NFKDBuffer Implementation. // NFDBuffer Implementation.
// //
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
NFKDBuffer::NFKDBuffer(const UChar *text, int32_t length, UErrorCode &status) { NFDBuffer::NFDBuffer(const UChar *text, int32_t length, UErrorCode &status) {
fNormalizedText = NULL; fNormalizedText = NULL;
fNormalizedTextLength = 0; fNormalizedTextLength = 0;
fOriginalText = text; fOriginalText = text;
@ -782,32 +782,32 @@ NFKDBuffer::NFKDBuffer(const UChar *text, int32_t length, UErrorCode &status) {
} }
fNormalizedText = fSmallBuf; fNormalizedText = fSmallBuf;
fNormalizedTextLength = unorm_normalize( fNormalizedTextLength = unorm_normalize(
text, length, UNORM_NFKD, 0, fNormalizedText, USPOOF_STACK_BUFFER_SIZE, &status); text, length, UNORM_NFD, 0, fNormalizedText, USPOOF_STACK_BUFFER_SIZE, &status);
if (status == U_BUFFER_OVERFLOW_ERROR) { if (status == U_BUFFER_OVERFLOW_ERROR) {
status = U_ZERO_ERROR; status = U_ZERO_ERROR;
fNormalizedText = (UChar *)uprv_malloc((fNormalizedTextLength+1)*sizeof(UChar)); fNormalizedText = (UChar *)uprv_malloc((fNormalizedTextLength+1)*sizeof(UChar));
if (fNormalizedText == NULL) { if (fNormalizedText == NULL) {
status = U_MEMORY_ALLOCATION_ERROR; status = U_MEMORY_ALLOCATION_ERROR;
} else { } else {
fNormalizedTextLength = unorm_normalize(text, length, UNORM_NFKD, 0, fNormalizedTextLength = unorm_normalize(text, length, UNORM_NFD, 0,
fNormalizedText, fNormalizedTextLength+1, &status); fNormalizedText, fNormalizedTextLength+1, &status);
} }
} }
} }
// Destructor. Frees the normalized-text buffer with uprv_free() only when it
// is not the object's own fSmallBuf (the constructor switches to a
// uprv_malloc'ed buffer on overflow), then clears the pointer.
NFKDBuffer::~NFKDBuffer() { NFDBuffer::~NFDBuffer() {
if (fNormalizedText != fSmallBuf) { if (fNormalizedText != fSmallBuf) {
uprv_free(fNormalizedText); uprv_free(fNormalizedText);
} }
fNormalizedText = 0; fNormalizedText = 0;
} }
// Returns the pointer to the normalized text set up by the constructor.
// The buffer stays owned by this object (the destructor releases it), and the
// pointer can be NULL when the constructor's heap allocation failed.
const UChar *NFKDBuffer::getBuffer() { const UChar *NFDBuffer::getBuffer() {
return fNormalizedText; return fNormalizedText;
} }
// Returns the stored normalized-text length, i.e. the value the constructor
// obtained from unorm_normalize() (0 if normalization was never performed).
int32_t NFKDBuffer::getLength() { int32_t NFDBuffer::getLength() {
return fNormalizedTextLength; return fNormalizedTextLength;
} }

View File

@ -214,7 +214,7 @@ class ScriptSet: public UMemory {
//------------------------------------------------------------------------------- //-------------------------------------------------------------------------------
// //
// NFKDBuffer A little class to handle the NFKD normalization that is // NFDBuffer A little class to handle the NFD normalization that is
// needed on incoming identifiers to be checked. // needed on incoming identifiers to be checked.
// Takes care of buffer handling and normalization // Takes care of buffer handling and normalization
// //
@ -223,10 +223,10 @@ class ScriptSet: public UMemory {
// TODO: how to map position offsets back to user values? // TODO: how to map position offsets back to user values?
// //
//-------------------------------------------------------------------------------- //--------------------------------------------------------------------------------
class NFKDBuffer: public UMemory { class NFDBuffer: public UMemory {
public: public:
NFKDBuffer(const UChar *text, int32_t length, UErrorCode &status); NFDBuffer(const UChar *text, int32_t length, UErrorCode &status);
~NFKDBuffer(); ~NFDBuffer();
const UChar *getBuffer(); const UChar *getBuffer();
int32_t getLength(); int32_t getLength();

View File

@ -1,6 +1,6 @@
/* /*
********************************************************************** **********************************************************************
* Copyright (C) 2010, International Business Machines Corporation * Copyright (C) 2011, International Business Machines Corporation
* and others. All Rights Reserved. * and others. All Rights Reserved.
********************************************************************** **********************************************************************
*/ */
@ -153,12 +153,6 @@ void IntlTestSpoof::testSkeleton() {
" A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations." " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
" A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.") " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.")
// FC5F ; FE74 0651 ; ML #* ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM to
// ARABIC KASRATAN ISOLATED FORM, ARABIC SHADDA
// This character NFKD normalizes to \u0020 \u064d \u0651, so its confusable mapping
// is never used in creating a skeleton.
CHECK_SKELETON(SL, "\\uFC5F", " \\u064d\\u0651");
CHECK_SKELETON(SL, "nochange", "nochange"); CHECK_SKELETON(SL, "nochange", "nochange");
CHECK_SKELETON(MA, "love", "love"); CHECK_SKELETON(MA, "love", "love");
CHECK_SKELETON(MA, "1ove", "love"); // Digit 1 to letter l CHECK_SKELETON(MA, "1ove", "love"); // Digit 1 to letter l
@ -199,6 +193,11 @@ void IntlTestSpoof::testSkeleton() {
CHECK_SKELETON(ML, "\\u0022", "\\u0027\\u0027"); CHECK_SKELETON(ML, "\\u0022", "\\u0027\\u0027");
CHECK_SKELETON(MA, "\\u0022", "\\u0027\\u0027"); CHECK_SKELETON(MA, "\\u0022", "\\u0027\\u0027");
// 017F ; 0066 ;
// This mapping exists in the SA and MA tables
CHECK_SKELETON(MA, "\\u017F", "f");
CHECK_SKELETON(SA, "\\u017F", "f");
TEST_TEARDOWN; TEST_TEARDOWN;
} }
@ -354,16 +353,16 @@ void IntlTestSpoof::testConfData() {
TEST_ASSERT_SUCCESS(status); TEST_ASSERT_SUCCESS(status);
while (parseLine.find()) { while (parseLine.find()) {
UnicodeString from = parseHex(parseLine.group(1, status)); UnicodeString from = parseHex(parseLine.group(1, status));
if (!Normalizer::isNormalized(from, UNORM_NFKD, status)) { if (!Normalizer::isNormalized(from, UNORM_NFD, status)) {
// The source character was not NFKD. // The source character was not NFD.
// Skip this case; the first step in obtaining a skeleton is to NFKD the input, // Skip this case; the first step in obtaining a skeleton is to NFD the input,
// so the mapping in this line of confusables.txt will never be applied. // so the mapping in this line of confusables.txt will never be applied.
continue; continue;
} }
UnicodeString rawExpected = parseHex(parseLine.group(2, status)); UnicodeString rawExpected = parseHex(parseLine.group(2, status));
UnicodeString expected; UnicodeString expected;
Normalizer::decompose(rawExpected, TRUE, 0, expected, status); Normalizer::decompose(rawExpected, FALSE /*NFD*/, 0, expected, status);
TEST_ASSERT_SUCCESS(status); TEST_ASSERT_SUCCESS(status);
int32_t skeletonType = 0; int32_t skeletonType = 0;