ICU-8341 spoof skeleton computed using NFD, not NFKD
X-SVN-Rev: 29475
This commit is contained in:
parent
d0670255e8
commit
497e88ec1f
@ -1,6 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
***************************************************************************
|
***************************************************************************
|
||||||
* Copyright (C) 2008-2009, International Business Machines Corporation
|
* Copyright (C) 2008-2011, International Business Machines Corporation
|
||||||
* and others. All Rights Reserved.
|
* and others. All Rights Reserved.
|
||||||
***************************************************************************
|
***************************************************************************
|
||||||
* file name: uspoof.cpp
|
* file name: uspoof.cpp
|
||||||
@ -240,10 +240,10 @@ uspoof_check(const USpoofChecker *sc,
|
|||||||
|
|
||||||
if (This->fChecks &
|
if (This->fChecks &
|
||||||
(USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
|
(USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
|
||||||
// These are the checks that need to be done on NFKD input
|
// These are the checks that need to be done on NFD input
|
||||||
NFKDBuffer normalizedInput(text, length, *status);
|
NFDBuffer normalizedInput(text, length, *status);
|
||||||
const UChar *nfkdText = normalizedInput.getBuffer();
|
const UChar *nfdText = normalizedInput.getBuffer();
|
||||||
int32_t nfkdLength = normalizedInput.getLength();
|
int32_t nfdLength = normalizedInput.getLength();
|
||||||
|
|
||||||
if (This->fChecks & USPOOF_INVISIBLE) {
|
if (This->fChecks & USPOOF_INVISIBLE) {
|
||||||
|
|
||||||
@ -256,7 +256,7 @@ uspoof_check(const USpoofChecker *sc,
|
|||||||
UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence.
|
UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence.
|
||||||
|
|
||||||
for (i=0; i<length ;) {
|
for (i=0; i<length ;) {
|
||||||
U16_NEXT(nfkdText, i, nfkdLength, c);
|
U16_NEXT(nfdText, i, nfdLength, c);
|
||||||
if (u_charType(c) != U_NON_SPACING_MARK) {
|
if (u_charType(c) != U_NON_SPACING_MARK) {
|
||||||
firstNonspacingMark = 0;
|
firstNonspacingMark = 0;
|
||||||
if (haveMultipleMarks) {
|
if (haveMultipleMarks) {
|
||||||
@ -304,7 +304,7 @@ uspoof_check(const USpoofChecker *sc,
|
|||||||
}
|
}
|
||||||
|
|
||||||
ScriptSet scripts;
|
ScriptSet scripts;
|
||||||
This->wholeScriptCheck(nfkdText, nfkdLength, &scripts, *status);
|
This->wholeScriptCheck(nfdText, nfdLength, &scripts, *status);
|
||||||
int32_t confusableScriptCount = scripts.countMembers();
|
int32_t confusableScriptCount = scripts.countMembers();
|
||||||
//printf("confusableScriptCount = %d\n", confusableScriptCount);
|
//printf("confusableScriptCount = %d\n", confusableScriptCount);
|
||||||
|
|
||||||
@ -631,25 +631,25 @@ uspoof_getSkeleton(const USpoofChecker *sc,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// NFKD transform of the user supplied input
|
// NFD transform of the user supplied input
|
||||||
|
|
||||||
UChar nfkdStackBuf[USPOOF_STACK_BUFFER_SIZE];
|
UChar nfdStackBuf[USPOOF_STACK_BUFFER_SIZE];
|
||||||
UChar *nfkdInput = nfkdStackBuf;
|
UChar *nfdInput = nfdStackBuf;
|
||||||
int32_t normalizedLen = unorm_normalize(
|
int32_t normalizedLen = unorm_normalize(
|
||||||
s, length, UNORM_NFKD, 0, nfkdInput, USPOOF_STACK_BUFFER_SIZE, status);
|
s, length, UNORM_NFD, 0, nfdInput, USPOOF_STACK_BUFFER_SIZE, status);
|
||||||
if (*status == U_BUFFER_OVERFLOW_ERROR) {
|
if (*status == U_BUFFER_OVERFLOW_ERROR) {
|
||||||
nfkdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar));
|
nfdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar));
|
||||||
if (nfkdInput == NULL) {
|
if (nfdInput == NULL) {
|
||||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
*status = U_ZERO_ERROR;
|
*status = U_ZERO_ERROR;
|
||||||
normalizedLen = unorm_normalize(s, length, UNORM_NFKD, 0,
|
normalizedLen = unorm_normalize(s, length, UNORM_NFD, 0,
|
||||||
nfkdInput, normalizedLen+1, status);
|
nfdInput, normalizedLen+1, status);
|
||||||
}
|
}
|
||||||
if (U_FAILURE(*status)) {
|
if (U_FAILURE(*status)) {
|
||||||
if (nfkdInput != nfkdStackBuf) {
|
if (nfdInput != nfdStackBuf) {
|
||||||
uprv_free(nfkdInput);
|
uprv_free(nfdInput);
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -657,36 +657,36 @@ uspoof_getSkeleton(const USpoofChecker *sc,
|
|||||||
// buffer to hold the Unicode defined skeleton mappings for a single code point
|
// buffer to hold the Unicode defined skeleton mappings for a single code point
|
||||||
UChar buf[USPOOF_MAX_SKELETON_EXPANSION];
|
UChar buf[USPOOF_MAX_SKELETON_EXPANSION];
|
||||||
|
|
||||||
// Apply the skeleton mapping to the NFKD normalized input string
|
// Apply the skeleton mapping to the NFD normalized input string
|
||||||
// Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
|
// Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
|
||||||
int32_t inputIndex = 0;
|
int32_t inputIndex = 0;
|
||||||
UnicodeString skelStr;
|
UnicodeString skelStr;
|
||||||
while (inputIndex < normalizedLen) {
|
while (inputIndex < normalizedLen) {
|
||||||
UChar32 c;
|
UChar32 c;
|
||||||
U16_NEXT(nfkdInput, inputIndex, normalizedLen, c);
|
U16_NEXT(nfdInput, inputIndex, normalizedLen, c);
|
||||||
int32_t replaceLen = This->confusableLookup(c, tableMask, buf);
|
int32_t replaceLen = This->confusableLookup(c, tableMask, buf);
|
||||||
skelStr.append(buf, replaceLen);
|
skelStr.append(buf, replaceLen);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (nfkdInput != nfkdStackBuf) {
|
if (nfdInput != nfdStackBuf) {
|
||||||
uprv_free(nfkdInput);
|
uprv_free(nfdInput);
|
||||||
}
|
}
|
||||||
|
|
||||||
const UChar *result = skelStr.getBuffer();
|
const UChar *result = skelStr.getBuffer();
|
||||||
int32_t resultLen = skelStr.length();
|
int32_t resultLen = skelStr.length();
|
||||||
UChar *normedResult = NULL;
|
UChar *normedResult = NULL;
|
||||||
|
|
||||||
// Check the skeleton for NFKD, normalize it if needed.
|
// Check the skeleton for NFD, normalize it if needed.
|
||||||
// Unnormalized results should be very rare.
|
// Unnormalized results should be very rare.
|
||||||
if (!unorm_isNormalized(result, resultLen, UNORM_NFKD, status)) {
|
if (!unorm_isNormalized(result, resultLen, UNORM_NFD, status)) {
|
||||||
normalizedLen = unorm_normalize(result, resultLen, UNORM_NFKD, 0, NULL, 0, status);
|
normalizedLen = unorm_normalize(result, resultLen, UNORM_NFD, 0, NULL, 0, status);
|
||||||
normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar)));
|
normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar)));
|
||||||
if (normedResult == NULL) {
|
if (normedResult == NULL) {
|
||||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
*status = U_ZERO_ERROR;
|
*status = U_ZERO_ERROR;
|
||||||
unorm_normalize(result, resultLen, UNORM_NFKD, 0, normedResult, normalizedLen+1, status);
|
unorm_normalize(result, resultLen, UNORM_NFD, 0, normedResult, normalizedLen+1, status);
|
||||||
result = normedResult;
|
result = normedResult;
|
||||||
resultLen = normalizedLen;
|
resultLen = normalizedLen;
|
||||||
}
|
}
|
||||||
|
@ -222,7 +222,7 @@ int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *de
|
|||||||
//
|
//
|
||||||
// wholeScriptCheck()
|
// wholeScriptCheck()
|
||||||
//
|
//
|
||||||
// Input text is already normalized to NFKD
|
// Input text is already normalized to NFD
|
||||||
// Return the set of scripts, each of which can represent something that is
|
// Return the set of scripts, each of which can represent something that is
|
||||||
// confusable with the input text. The script of the input text
|
// confusable with the input text. The script of the input text
|
||||||
// is included; input consisting of characters from a single script will
|
// is included; input consisting of characters from a single script will
|
||||||
@ -769,11 +769,11 @@ int32_t ScriptSet::countMembers() {
|
|||||||
|
|
||||||
//-----------------------------------------------------------------------------
|
//-----------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// NFKDBuffer Implementation.
|
// NFDBuffer Implementation.
|
||||||
//
|
//
|
||||||
//-----------------------------------------------------------------------------
|
//-----------------------------------------------------------------------------
|
||||||
|
|
||||||
NFKDBuffer::NFKDBuffer(const UChar *text, int32_t length, UErrorCode &status) {
|
NFDBuffer::NFDBuffer(const UChar *text, int32_t length, UErrorCode &status) {
|
||||||
fNormalizedText = NULL;
|
fNormalizedText = NULL;
|
||||||
fNormalizedTextLength = 0;
|
fNormalizedTextLength = 0;
|
||||||
fOriginalText = text;
|
fOriginalText = text;
|
||||||
@ -782,32 +782,32 @@ NFKDBuffer::NFKDBuffer(const UChar *text, int32_t length, UErrorCode &status) {
|
|||||||
}
|
}
|
||||||
fNormalizedText = fSmallBuf;
|
fNormalizedText = fSmallBuf;
|
||||||
fNormalizedTextLength = unorm_normalize(
|
fNormalizedTextLength = unorm_normalize(
|
||||||
text, length, UNORM_NFKD, 0, fNormalizedText, USPOOF_STACK_BUFFER_SIZE, &status);
|
text, length, UNORM_NFD, 0, fNormalizedText, USPOOF_STACK_BUFFER_SIZE, &status);
|
||||||
if (status == U_BUFFER_OVERFLOW_ERROR) {
|
if (status == U_BUFFER_OVERFLOW_ERROR) {
|
||||||
status = U_ZERO_ERROR;
|
status = U_ZERO_ERROR;
|
||||||
fNormalizedText = (UChar *)uprv_malloc((fNormalizedTextLength+1)*sizeof(UChar));
|
fNormalizedText = (UChar *)uprv_malloc((fNormalizedTextLength+1)*sizeof(UChar));
|
||||||
if (fNormalizedText == NULL) {
|
if (fNormalizedText == NULL) {
|
||||||
status = U_MEMORY_ALLOCATION_ERROR;
|
status = U_MEMORY_ALLOCATION_ERROR;
|
||||||
} else {
|
} else {
|
||||||
fNormalizedTextLength = unorm_normalize(text, length, UNORM_NFKD, 0,
|
fNormalizedTextLength = unorm_normalize(text, length, UNORM_NFD, 0,
|
||||||
fNormalizedText, fNormalizedTextLength+1, &status);
|
fNormalizedText, fNormalizedTextLength+1, &status);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
NFKDBuffer::~NFKDBuffer() {
|
NFDBuffer::~NFDBuffer() {
|
||||||
if (fNormalizedText != fSmallBuf) {
|
if (fNormalizedText != fSmallBuf) {
|
||||||
uprv_free(fNormalizedText);
|
uprv_free(fNormalizedText);
|
||||||
}
|
}
|
||||||
fNormalizedText = 0;
|
fNormalizedText = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
const UChar *NFKDBuffer::getBuffer() {
|
const UChar *NFDBuffer::getBuffer() {
|
||||||
return fNormalizedText;
|
return fNormalizedText;
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t NFKDBuffer::getLength() {
|
int32_t NFDBuffer::getLength() {
|
||||||
return fNormalizedTextLength;
|
return fNormalizedTextLength;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -214,7 +214,7 @@ class ScriptSet: public UMemory {
|
|||||||
|
|
||||||
//-------------------------------------------------------------------------------
|
//-------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// NFKDBuffer A little class to handle the NFKD normalization that is
|
// NFDBuffer A little class to handle the NFD normalization that is
|
||||||
// needed on incoming identifiers to be checked.
|
// needed on incoming identifiers to be checked.
|
||||||
// Takes care of buffer handling and normalization
|
// Takes care of buffer handling and normalization
|
||||||
//
|
//
|
||||||
@ -223,10 +223,10 @@ class ScriptSet: public UMemory {
|
|||||||
// TODO: how to map position offsets back to user values?
|
// TODO: how to map position offsets back to user values?
|
||||||
//
|
//
|
||||||
//--------------------------------------------------------------------------------
|
//--------------------------------------------------------------------------------
|
||||||
class NFKDBuffer: public UMemory {
|
class NFDBuffer: public UMemory {
|
||||||
public:
|
public:
|
||||||
NFKDBuffer(const UChar *text, int32_t length, UErrorCode &status);
|
NFDBuffer(const UChar *text, int32_t length, UErrorCode &status);
|
||||||
~NFKDBuffer();
|
~NFDBuffer();
|
||||||
const UChar *getBuffer();
|
const UChar *getBuffer();
|
||||||
int32_t getLength();
|
int32_t getLength();
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
**********************************************************************
|
**********************************************************************
|
||||||
* Copyright (C) 2010, International Business Machines Corporation
|
* Copyright (C) 2011, International Business Machines Corporation
|
||||||
* and others. All Rights Reserved.
|
* and others. All Rights Reserved.
|
||||||
**********************************************************************
|
**********************************************************************
|
||||||
*/
|
*/
|
||||||
@ -153,12 +153,6 @@ void IntlTestSpoof::testSkeleton() {
|
|||||||
" A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
|
" A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
|
||||||
" A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.")
|
" A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.")
|
||||||
|
|
||||||
// FC5F ; FE74 0651 ; ML #* ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM to
|
|
||||||
// ARABIC KASRATAN ISOLATED FORM, ARABIC SHADDA
|
|
||||||
// This character NFKD normalizes to \u0020 \u064d \u0651, so its confusable mapping
|
|
||||||
// is never used in creating a skeleton.
|
|
||||||
CHECK_SKELETON(SL, "\\uFC5F", " \\u064d\\u0651");
|
|
||||||
|
|
||||||
CHECK_SKELETON(SL, "nochange", "nochange");
|
CHECK_SKELETON(SL, "nochange", "nochange");
|
||||||
CHECK_SKELETON(MA, "love", "love");
|
CHECK_SKELETON(MA, "love", "love");
|
||||||
CHECK_SKELETON(MA, "1ove", "love"); // Digit 1 to letter l
|
CHECK_SKELETON(MA, "1ove", "love"); // Digit 1 to letter l
|
||||||
@ -199,6 +193,11 @@ void IntlTestSpoof::testSkeleton() {
|
|||||||
CHECK_SKELETON(ML, "\\u0022", "\\u0027\\u0027");
|
CHECK_SKELETON(ML, "\\u0022", "\\u0027\\u0027");
|
||||||
CHECK_SKELETON(MA, "\\u0022", "\\u0027\\u0027");
|
CHECK_SKELETON(MA, "\\u0022", "\\u0027\\u0027");
|
||||||
|
|
||||||
|
// 017F ; 0066 ;
|
||||||
|
// This mapping exists in the SA and MA tables
|
||||||
|
CHECK_SKELETON(MA, "\\u017F", "f");
|
||||||
|
CHECK_SKELETON(SA, "\\u017F", "f");
|
||||||
|
|
||||||
TEST_TEARDOWN;
|
TEST_TEARDOWN;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -354,16 +353,16 @@ void IntlTestSpoof::testConfData() {
|
|||||||
TEST_ASSERT_SUCCESS(status);
|
TEST_ASSERT_SUCCESS(status);
|
||||||
while (parseLine.find()) {
|
while (parseLine.find()) {
|
||||||
UnicodeString from = parseHex(parseLine.group(1, status));
|
UnicodeString from = parseHex(parseLine.group(1, status));
|
||||||
if (!Normalizer::isNormalized(from, UNORM_NFKD, status)) {
|
if (!Normalizer::isNormalized(from, UNORM_NFD, status)) {
|
||||||
// The source character was not NFKD.
|
// The source character was not NFD.
|
||||||
// Skip this case; the first step in obtaining a skeleton is to NFKD the input,
|
// Skip this case; the first step in obtaining a skeleton is to NFD the input,
|
||||||
// so the mapping in this line of confusables.txt will never be applied.
|
// so the mapping in this line of confusables.txt will never be applied.
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
UnicodeString rawExpected = parseHex(parseLine.group(2, status));
|
UnicodeString rawExpected = parseHex(parseLine.group(2, status));
|
||||||
UnicodeString expected;
|
UnicodeString expected;
|
||||||
Normalizer::decompose(rawExpected, TRUE, 0, expected, status);
|
Normalizer::decompose(rawExpected, FALSE /*NFD*/, 0, expected, status);
|
||||||
TEST_ASSERT_SUCCESS(status);
|
TEST_ASSERT_SUCCESS(status);
|
||||||
|
|
||||||
int32_t skeletonType = 0;
|
int32_t skeletonType = 0;
|
||||||
|
Loading…
Reference in New Issue
Block a user