diff --git a/icu4c/source/i18n/identifier_info.cpp b/icu4c/source/i18n/identifier_info.cpp deleted file mode 100644 index 6118dccc35..0000000000 --- a/icu4c/source/i18n/identifier_info.cpp +++ /dev/null @@ -1,313 +0,0 @@ -// Copyright (C) 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -********************************************************************** -* Copyright (C) 2012-2014, International Business Machines -* Corporation and others. All Rights Reserved. -********************************************************************** -*/ - -#include "unicode/utypes.h" - -#include "unicode/uchar.h" -#include "unicode/utf16.h" - -#include "identifier_info.h" -#include "mutex.h" -#include "scriptset.h" -#include "ucln_in.h" -#include "uvector.h" - -U_NAMESPACE_BEGIN - -static UnicodeSet *ASCII; -static ScriptSet *JAPANESE; -static ScriptSet *CHINESE; -static ScriptSet *KOREAN; -static ScriptSet *CONFUSABLE_WITH_LATIN; -static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER; - - -U_CDECL_BEGIN -static UBool U_CALLCONV -IdentifierInfo_cleanup(void) { - delete ASCII; - ASCII = NULL; - delete JAPANESE; - JAPANESE = NULL; - delete CHINESE; - CHINESE = NULL; - delete KOREAN; - KOREAN = NULL; - delete CONFUSABLE_WITH_LATIN; - CONFUSABLE_WITH_LATIN = NULL; - gIdentifierInfoInitOnce.reset(); - return TRUE; -} - -static void U_CALLCONV -IdentifierInfo_init(UErrorCode &status) { - ASCII = new UnicodeSet(0, 0x7f); - JAPANESE = new ScriptSet(); - CHINESE = new ScriptSet(); - KOREAN = new ScriptSet(); - CONFUSABLE_WITH_LATIN = new ScriptSet(); - if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL - || CONFUSABLE_WITH_LATIN == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - return; - } - ASCII->freeze(); - JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status) - .set(USCRIPT_KATAKANA, status); - CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, 
status).set(USCRIPT_BOPOMOFO, status); - KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status); - CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status) - .set(USCRIPT_CHEROKEE, status); - ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup); -} -U_CDECL_END - - -IdentifierInfo::IdentifierInfo(UErrorCode &status): - fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), - fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) { - umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status); - if (U_FAILURE(status)) { - return; - } - - fIdentifier = new UnicodeString(); - fRequiredScripts = new ScriptSet(); - fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status); - uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet); - fCommonAmongAlternates = new ScriptSet(); - fNumerics = new UnicodeSet(); - fIdentifierProfile = new UnicodeSet(0, 0x10FFFF); - - if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL || - fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) { - status = U_MEMORY_ALLOCATION_ERROR; - } -} - -IdentifierInfo::~IdentifierInfo() { - delete fIdentifier; - delete fRequiredScripts; - uhash_close(fScriptSetSet); - delete fCommonAmongAlternates; - delete fNumerics; - delete fIdentifierProfile; -} - - -IdentifierInfo &IdentifierInfo::clear() { - fRequiredScripts->resetAll(); - uhash_removeAll(fScriptSetSet); - fNumerics->clear(); - fCommonAmongAlternates->resetAll(); - return *this; -} - - -IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) { - *fIdentifierProfile = identifierProfile; - return *this; -} - - -const UnicodeSet &IdentifierInfo::getIdentifierProfile() const { - return *fIdentifierProfile; -} - - -IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, 
UErrorCode &status) { - if (U_FAILURE(status)) { - return *this; - } - *fIdentifier = identifier; - clear(); - ScriptSet scriptsForCP; - UChar32 cp; - for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) { - cp = identifier.char32At(i); - // Store a representative character for each kind of decimal digit - if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) { - // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value - fNumerics->add(cp - (UChar32)u_getNumericValue(cp)); - } - UScriptCode extensions[500]; - int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, UPRV_LENGTHOF(extensions), &status); - if (U_FAILURE(status)) { - return *this; - } - scriptsForCP.resetAll(); - for (int32_t j=0; jUnion(scriptsForCP); - break; - default: - if (!fRequiredScripts->intersects(scriptsForCP) - && !uhash_geti(fScriptSetSet, &scriptsForCP)) { - // If the set hasn't been added already, add it - // (Add a copy, fScriptSetSet takes ownership of the copy.) - uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status); - } - break; - } - } - // Now make a final pass through ScriptSetSet to remove alternates that came before singles. - // [Kana], [Kana Hira] => [Kana] - // This is relatively infrequent, so doesn't have to be optimized. - // We also compute any commonalities among the alternates. 
- if (uhash_count(fScriptSetSet) > 0) { - fCommonAmongAlternates->setAll(); - for (int32_t it = UHASH_FIRST;;) { - const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it); - if (nextHashEl == NULL) { - break; - } - ScriptSet *next = static_cast(nextHashEl->key.pointer); - // [Kana], [Kana Hira] => [Kana] - if (fRequiredScripts->intersects(*next)) { - uhash_removeElement(fScriptSetSet, nextHashEl); - } else { - fCommonAmongAlternates->intersect(*next); - // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]] - for (int32_t otherIt = UHASH_FIRST;;) { - const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt); - if (otherHashEl == NULL) { - break; - } - ScriptSet *other = static_cast(otherHashEl->key.pointer); - if (next != other && next->contains(*other)) { - uhash_removeElement(fScriptSetSet, nextHashEl); - break; - } - } - } - } - } - if (uhash_count(fScriptSetSet) == 0) { - fCommonAmongAlternates->resetAll(); - } - return *this; -} - - -const UnicodeString *IdentifierInfo::getIdentifier() const { - return fIdentifier; -} - -const ScriptSet *IdentifierInfo::getScripts() const { - return fRequiredScripts; -} - -const UHashtable *IdentifierInfo::getAlternates() const { - return fScriptSetSet; -} - - -const UnicodeSet *IdentifierInfo::getNumerics() const { - return fNumerics; -} - -const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const { - return fCommonAmongAlternates; -} - -#if !UCONFIG_NO_NORMALIZATION - -URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const { - if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) { - return USPOOF_UNRESTRICTIVE; - } - if (ASCII->containsAll(*fIdentifier)) { - return USPOOF_ASCII; - } - // This is a bit tricky. We look at a number of factors. - // The number of scripts in the text. 
- // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc]) - // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.) - - // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the - // time it is created, in setIdentifier(). - int32_t cardinalityPlus = fRequiredScripts->countMembers() + - (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); - if (cardinalityPlus < 2) { - return USPOOF_SINGLE_SCRIPT_RESTRICTIVE; - } - if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts) - || containsWithAlternates(*KOREAN, *fRequiredScripts)) { - return USPOOF_HIGHLY_RESTRICTIVE; - } - if (cardinalityPlus == 2 && - fRequiredScripts->test(USCRIPT_LATIN, status) && - !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) { - return USPOOF_MODERATELY_RESTRICTIVE; - } - return USPOOF_MINIMALLY_RESTRICTIVE; -} - -#endif /* !UCONFIG_NO_NORMALIZATION */ - -int32_t IdentifierInfo::getScriptCount() const { - // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts. - int32_t count = fRequiredScripts->countMembers() + - (fCommonAmongAlternates->countMembers() == 0 ? 
uhash_count(fScriptSetSet) : 1); - return count; -} - - - -UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const { - if (!container.contains(containee)) { - return FALSE; - } - for (int32_t iter = UHASH_FIRST; ;) { - const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter); - if (hashEl == NULL) { - break; - } - ScriptSet *alternatives = static_cast(hashEl->key.pointer); - if (!container.intersects(*alternatives)) { - return false; - } - } - return true; -} - -UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) { - UVector sorted(status); - if (U_FAILURE(status)) { - return dest; - } - for (int32_t pos = UHASH_FIRST; ;) { - const UHashElement *el = uhash_nextElement(alternates, &pos); - if (el == NULL) { - break; - } - ScriptSet *ss = static_cast(el->key.pointer); - sorted.addElement(ss, status); - } - sorted.sort(uhash_compareScriptSet, status); - UnicodeString separator = UNICODE_STRING_SIMPLE("; "); - for (int32_t i=0; i0) { - dest.append(separator); - } - ScriptSet *ss = static_cast(sorted.elementAt(i)); - ss->displayScripts(dest); - } - return dest; -} - -U_NAMESPACE_END - diff --git a/icu4c/source/i18n/identifier_info.h b/icu4c/source/i18n/identifier_info.h deleted file mode 100644 index 832220306e..0000000000 --- a/icu4c/source/i18n/identifier_info.h +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright (C) 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -********************************************************************** -* Copyright (C) 2014, International Business Machines -* Corporation and others. All Rights Reserved. 
-********************************************************************** -* -* indentifier_info.h -* -* created on: 2013 Jan 7 -* created by: Andy Heninger -*/ - -#ifndef __IDENTIFIER_INFO_H__ -#define __IDENTIFIER_INFO_H__ - -#include "unicode/utypes.h" - -#include "unicode/uniset.h" -#include "unicode/uspoof.h" -#include "uhash.h" - -U_NAMESPACE_BEGIN - -class ScriptSet; - -// TODO(andy): review consistency of reference vs pointer arguments to the funcions. - -/** - * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile - * then setIdentifier. Available methods include: - *
    - *
  1. call getScripts for the specific scripts in the identifier. The identifier contains at least one character in - * each of these. - *
  2. call getAlternates to get cases where a character is not limited to a single script. For example, it could be - * either Katakana or Hiragana. - *
  3. call getCommonAmongAlternates to find out if any scripts are common to all the alternates. - *
  4. call getNumerics to get a representative character (with value zero) for each of the decimal number systems in - * the identifier. - *
  5. call getRestrictionLevel to see what the UTS36 restriction level is. - *
- * - * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo - */ -class U_I18N_API IdentifierInfo : public UMemory { - - public: - /** - * Create an identifier info object. Subsequently, call setIdentifier(), etc. - * @internal - */ - IdentifierInfo(UErrorCode &status); - - /** - * Destructor - */ - virtual ~IdentifierInfo(); - - private: - /* Disallow copying for now. Can be added if there's a need. */ - IdentifierInfo(const IdentifierInfo &other); - - public: - - /** - * Set the identifier profile: the characters that are to be allowed in the identifier. - * - * @param identifierProfile the characters that are to be allowed in the identifier - * @return this - * @internal - */ - IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile); - - /** - * Get the identifier profile: the characters that are to be allowed in the identifier. - * - * @return The characters that are to be allowed in the identifier. - * @internal - */ - const UnicodeSet &getIdentifierProfile() const; - - - /** - * Set an identifier to analyze. Afterwards, call methods like getScripts() - * - * @param identifier the identifier to analyze - * @param status Errorcode, set if errors occur. - * @return this - * @internal - */ - IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status); - - - /** - * Get the identifier that was analyzed. The returned string is owned by the ICU library, - * and must not be deleted by the caller. - * - * @return the identifier that was analyzed. - * @internal - */ - const UnicodeString *getIdentifier() const; - - - /** - * Get the scripts found in the identifiers. - * - * @return the set of explicit scripts. - * @internal - */ - const ScriptSet *getScripts() const; - - /** - * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then - * the set consisting of those scripts will be returned. - * - * @return a uhash, with each key being of type (ScriptSet *). 
- * This is a set, not a map, so the value stored in the uhash is not relevant. - * (It is, in fact, 1). - * Ownership of the uhash and its contents remains with the IndetifierInfo object, - * and remains valid until a new identifer is set or until the object is deleted. - * @internal - */ - const UHashtable *getAlternates() const; - - /** - * Get the representative characters (zeros) for the numerics found in the identifier. - * - * @return the set of explicit scripts. - * @internal - */ - const UnicodeSet *getNumerics() const; - - /** - * Find out which scripts are in common among the alternates. - * - * @return the set of scripts that are in common among the alternates. - * @internal - */ - const ScriptSet *getCommonAmongAlternates() const; - - /** - * Get the number of scripts appearing in the identifier. - * Note: Common and Inherited scripts are omitted from the count. - * Note: Result may be high when the identifier contains characters - * with alternate scripts. The distinction between - * 0, 1 and > 1 will remain valid, however. - * @return the number of scripts. - */ - int32_t getScriptCount() const; - -#if !UCONFIG_NO_NORMALIZATION - - /** - * Find the "tightest" restriction level that the identifier satisfies. - * - * @return the restriction level. - * @internal - */ - URestrictionLevel getRestrictionLevel(UErrorCode &status) const; - -#endif /*!UCONFIG_NO_NORMALIZATION */ - - UnicodeString toString() const; - - /** - * Produce a readable string of alternates. - * - * @param alternates a UHashtable of UScriptSets. - * Keys only, no meaningful values in the UHash. 
- * @return display form - * @internal - */ - static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status); - - private: - - IdentifierInfo & clear(); - UBool containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const; - - UnicodeString *fIdentifier; - ScriptSet *fRequiredScripts; - UHashtable *fScriptSetSet; - ScriptSet *fCommonAmongAlternates; - UnicodeSet *fNumerics; - UnicodeSet *fIdentifierProfile; -}; - -U_NAMESPACE_END - -#endif // __IDENTIFIER_INFO_H__ - diff --git a/icu4c/source/i18n/unicode/uspoof.h b/icu4c/source/i18n/unicode/uspoof.h index 5151993ca9..996d98d952 100644 --- a/icu4c/source/i18n/unicode/uspoof.h +++ b/icu4c/source/i18n/unicode/uspoof.h @@ -42,10 +42,10 @@ * Unicode Technical Standard #39, has two main functions: * *
    - *
  1. Checking whether two strings are visually confusable with each other, such as "desordenado" and - * "ԁеѕогԁепаԁо".
  2. + *
  3. Checking whether two strings are visually confusable with each other, such as "Harvest" and + * "Ηarvest", where the second string starts with the Greek capital letter Eta.
  4. *
  5. Checking whether an individual string is likely to be an attempt at confusing the reader (spoof - * detection), such as "pаypаl" spelled with Cyrillic 'а' characters.
  6. + * detection), such as "paypal" with some Latin characters substituted with Cyrillic look-alikes. *
* *

@@ -63,19 +63,25 @@ * * \code{.c} * UErrorCode status = U_ZERO_ERROR; + * UChar* str1 = (UChar*) u"Harvest"; + * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA + * * USpoofChecker* sc = uspoof_open(&status); * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); - * int32_t bitmask = uspoof_areConfusable(sc, (UChar*) u"desordenado", -1, (UChar*) u"ԁеѕогԁепаԁо", -1, &status); - * UBool result = (bitmask & USPOOF_ALL_CHECKS) != 0; - * printf("areConfusable: %d (success: %d)\n", result, U_SUCCESS(status)); // areConfusable: 1 (success: 1) + * + * int32_t bitmask = uspoof_areConfusable(sc, str1, -1, str2, -1, &status); + * UBool result = bitmask != 0; + * // areConfusable: 1 (status: U_ZERO_ERROR) + * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status)); * uspoof_close(sc); * \endcode * *

- * The second line of the example creates a USpoofChecker object; the third line enables confusable - * checking and disables all other checks; the fourth line performs the confusability test; and the fifth line extracts - * the result out of the confusability test. For best performance, the instance should be created once (e.g., upon - * application startup), and the efficient {@link uspoof_areConfusable} method can be used at runtime. + * The call to {@link uspoof_open} creates a USpoofChecker object; the call to {@link uspoof_setChecks} + * enables confusable checking and disables all other checks; the call to {@link uspoof_areConfusable} performs the + * confusability test; and the following line extracts the result out of the return value. For best performance, + * the instance should be created once (e.g., upon application startup), and the efficient + * {@link uspoof_areConfusable} method can be used at runtime. * *

* The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers. It will automatically call @@ -95,27 +101,28 @@ * * \code{.c} * UErrorCode status = U_ZERO_ERROR; - * UChar* str1 = (UChar*) u"desordenado"; - * UChar* str2 = (UChar*) u"ԁеѕогԁепаԁо"; + * UChar* str1 = (UChar*) u"Harvest"; + * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA * * USpoofChecker* sc = uspoof_open(&status); * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); * * // Get skeleton 1 * int32_t skel1Len = uspoof_getSkeleton(sc, 0, str1, -1, NULL, 0, &status); - * UChar* skel1 = (UChar*) malloc(skel1Len * sizeof(UChar)); + * UChar* skel1 = (UChar*) malloc(++skel1Len * sizeof(UChar)); * status = U_ZERO_ERROR; * uspoof_getSkeleton(sc, 0, str1, -1, skel1, skel1Len, &status); * * // Get skeleton 2 * int32_t skel2Len = uspoof_getSkeleton(sc, 0, str2, -1, NULL, 0, &status); - * UChar* skel2 = (UChar*) malloc(skel2Len * sizeof(UChar)); + * UChar* skel2 = (UChar*) malloc(++skel2Len * sizeof(UChar)); * status = U_ZERO_ERROR; * uspoof_getSkeleton(sc, 0, str2, -1, skel2, skel2Len, &status); * * // Are the skeletons the same? 
- * UBool result = (skel1Len == skel2Len) && memcmp(skel1, skel2, skel1Len) == 0; - * printf("areConfusable: %d (success: %d)\n", result, U_SUCCESS(status)); // areConfusable: 1 (success: 1) + * UBool result = u_strCompare(skel1, -1, skel2, -1, FALSE) == 0; + * // areConfusable: 1 (status: U_ZERO_ERROR) + * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status)); * uspoof_close(sc); * free(skel1); * free(skel2); @@ -126,21 +133,19 @@ * {uspoof_areConfusable} many times in a loop, {uspoof_getSkeleton} can be used instead, as shown below: * * \code{.c} - * // Setup: * UErrorCode status = U_ZERO_ERROR; - * UChar* dictionary[2] = { (UChar*) u"lorem", (UChar*) u"ipsum" }; - * UChar* skeletons[sizeof(dictionary)/sizeof(UChar*)]; - * int32_t skeletonLengths[sizeof(dictionary)/sizeof(UChar*)]; + * #define DICTIONARY_LENGTH 2 + * UChar* dictionary[DICTIONARY_LENGTH] = { (UChar*) u"lorem", (UChar*) u"ipsum" }; + * UChar* skeletons[DICTIONARY_LENGTH]; * UChar* str = (UChar*) u"1orern"; * * // Setup: * USpoofChecker* sc = uspoof_open(&status); * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); - * for (size_t i=0; i - * The code '1342177280' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE. Since + * The code '0x50000000' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE. Since * USPOOF_MINIMALLY_RESTRICTIVE is weaker than USPOOF_MODERATELY_RESTRICTIVE, the string fails the check. * *

@@ -351,13 +356,13 @@ * A USpoofChecker instance may be used repeatedly to perform checks on any number of identifiers. * *

- * Thread Safety: Thread Safety: The test functions for checking a single identifier, or for testing whether + * Thread Safety: The test functions for checking a single identifier, or for testing whether * two identifiers are possible confusable, are thread safe. They may called concurrently, from multiple threads, * using the same USpoofChecker instance. * *

* More generally, the standard ICU thread safety rules apply: functions that take a const USpoofChecker parameter are - * thread safe. Those that take a non-const USpoofChecier are not thread safe.. + * thread safe. Those that take a non-const USpoofChecker are not thread safe. * * @stable ICU 4.6 */ @@ -419,13 +424,9 @@ typedef enum USpoofChecks { * the checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to * make {@link uspoof_areConfusable} return only those types of confusables. * - *

Note: if you wish to use {@link uspoof_getSkeleton}, it is required that you enable at least one of the - * CONFUSABLE flags. - * * @see uspoof_areConfusable * @see uspoof_getSkeleton * @draft ICU 58 - * @provisional This API might change or be removed in a future release. */ USPOOF_CONFUSABLE = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, @@ -471,7 +472,7 @@ typedef enum USpoofChecks { USPOOF_INVISIBLE = 32, /** Check that an identifier contains only characters from a specified set - * of acceptable characters. See {@link uspoof_setAllowedChars} + * of acceptable characters. See {@link uspoof_setAllowedChars} and * {@link uspoof_setAllowedLocales}. Note that a string that fails this check * will also fail the {@link USPOOF_RESTRICTION_LEVEL} check. */ @@ -750,14 +751,16 @@ U_STABLE int32_t U_EXPORT2 uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status); /** - * Set the loosest restriction level allowed for strings. The default if this is not called is - * {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and - * {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are - * to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}. - * @param restrictionLevel The loosest restriction level allowed. - * @see URestrictionLevel - * @stable ICU 51 - */ + * Set the loosest restriction level allowed for strings. The default if this is not called is + * {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and + * {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are + * to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}. + * + * @param sc The USpoofChecker + * @param restrictionLevel The loosest restriction level allowed. 
+ * @see URestrictionLevel + * @stable ICU 51 + */ U_STABLE void U_EXPORT2 uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel); @@ -1059,6 +1062,8 @@ uspoof_checkUnicodeString(const USpoofChecker *sc, * @param sc The USpoofChecker * @param id The identifier to be checked for possible security issues, * in UTF-16 format. + * @param length the length of the string to be checked, or -1 if the string is + * zero terminated. * @param checkResult An instance of USpoofCheckResult to be filled with * details about the identifier. Can be NULL. * @param status The error code, set if an error occurred while attempting to @@ -1259,7 +1264,7 @@ uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode * * *

    *
  • {@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}
  • - *
  • {@link USPOOF_MIXED_SCRIPT_CONFUSABLE
  • + *
  • {@link USPOOF_MIXED_SCRIPT_CONFUSABLE}
  • *
  • {@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}
  • *
* diff --git a/icu4c/source/i18n/uspoof_impl.cpp b/icu4c/source/i18n/uspoof_impl.cpp index 70026697f0..a7ce8ee260 100644 --- a/icu4c/source/i18n/uspoof_impl.cpp +++ b/icu4c/source/i18n/uspoof_impl.cpp @@ -62,13 +62,13 @@ void SpoofImpl::construct(UErrorCode& status) { if (U_FAILURE(status)) { return; } UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); - allowedCharsSet->freeze(); fAllowedCharsSet = allowedCharsSet; fAllowedLocales = uprv_strdup(""); if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) { status = U_MEMORY_ALLOCATION_ERROR; return; } + allowedCharsSet->freeze(); } @@ -85,10 +85,10 @@ SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) : fSpoofData = src.fSpoofData->addReference(); } fAllowedCharsSet = static_cast(src.fAllowedCharsSet->clone()); - if (fAllowedCharsSet == NULL) { + fAllowedLocales = uprv_strdup(src.fAllowedLocales); + if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) { status = U_MEMORY_ALLOCATION_ERROR; } - fAllowedLocales = uprv_strdup(src.fAllowedLocales); fRestrictionLevel = src.fRestrictionLevel; } diff --git a/icu4c/source/i18n/uspoof_impl.h b/icu4c/source/i18n/uspoof_impl.h index aa95dbcb54..c6e5c938e5 100644 --- a/icu4c/source/i18n/uspoof_impl.h +++ b/icu4c/source/i18n/uspoof_impl.h @@ -123,7 +123,7 @@ public: // Used to convert this CheckResult to the older int32_t return value API int32_t toCombinedBitmask(int32_t expectedChecks); - // Data Members (all stack-allocated) + // Data Members int32_t fMagic; // Internal sanity check. int32_t fChecks; // Bit vector of checks that were failed. UnicodeSet fNumerics; // Set of numerics found in the string. diff --git a/icu4c/source/i18n/uspoof_wsconf.cpp b/icu4c/source/i18n/uspoof_wsconf.cpp deleted file mode 100644 index 477a3b70b0..0000000000 --- a/icu4c/source/i18n/uspoof_wsconf.cpp +++ /dev/null @@ -1,438 +0,0 @@ -// Copyright (C) 2016 and later: Unicode, Inc. and others. 
-// License & terms of use: http://www.unicode.org/copyright.html -/* -****************************************************************************** -* -* Copyright (C) 2008-2013, International Business Machines -* Corporation and others. All Rights Reserved. -* -****************************************************************************** -* file name: uspoof_wsconf.cpp -* encoding: US-ASCII -* tab size: 8 (not used) -* indentation:4 -* -* created on: 2009Jan05 (refactoring earlier files) -* created by: Andy Heninger -* -* Internal functions for compililing Whole Script confusable source data -* into its binary (runtime) form. The binary data format is described -* in uspoof_impl.h -*/ - -#include "unicode/utypes.h" -#include "unicode/uspoof.h" - -#if !UCONFIG_NO_NORMALIZATION - -#if !UCONFIG_NO_REGULAR_EXPRESSIONS - -#include "unicode/unorm.h" -#include "unicode/uregex.h" -#include "unicode/ustring.h" -#include "cmemory.h" -#include "scriptset.h" -#include "uspoof_impl.h" -#include "uhash.h" -#include "uvector.h" -#include "uassert.h" -#include "uspoof_wsconf.h" - -U_NAMESPACE_USE - - -// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt -// Example Lines: -// 006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O -// 0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I -// | | | | -// | | | |---- Which table, Any Case or Lower Case (A or L) -// | | |----------Target script. We need this. -// | |----------------Src script. Should match the script of the source -// | code points. Beyond checking that, we don't keep it. -// |--------------------------------Source code points or range. -// -// The expression will match _all_ lines, including erroneous lines. -// The result of the parse is returned via the contents of the (match) groups. -static const char *parseExp = - "(?m)" // Multi-line mode - "^([ \\t]*(?:#.*?)?)$" // A blank or comment line. Matches Group 1. 
- "|^(?:" // OR - "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range. Groups 2 and 3. - "\\s*([A-Za-z]+)\\s*;" // The source script. Group 4. - "\\s*([A-Za-z]+)\\s*;" // The target script. Group 5. - "\\s*(?:(A)|(L))" // The table A or L. Group 6 or 7 - "[ \\t]*(?:#.*?)?" // Trailing commment - ")$|" // OR - "^(.*?)$"; // An error line. Group 8. - // Any line not matching the preceding - // parts of the expression.will match - // this, and thus be flagged as an error - - -// Extract a regular expression match group into a char * string. -// The group must contain only invariant characters. -// Used for script names -// -static void extractGroup( - URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) { - - UChar ubuf[50]; - ubuf[0] = 0; - destBuf[0] = 0; - int32_t len = uregex_group(e, group, ubuf, 50, &status); - if (U_FAILURE(status) || len == -1 || len >= destCapacity) { - return; - } - UnicodeString s(FALSE, ubuf, len); // Aliasing constructor - s.extract(0, len, destBuf, destCapacity, US_INV); -} - - - -U_NAMESPACE_BEGIN - -// Build the Whole Script Confusable data -// -// TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class, -// because everything is local to this one build function anyhow, -// OR -// break this function into more reasonably sized pieces, with -// state in WSConfusableDataBuilder. 
-// -void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, - int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) -{ - if (U_FAILURE(status)) { - return; - } - URegularExpression *parseRegexp = NULL; - int32_t inputLen = 0; - UChar *input = NULL; - int32_t lineNum = 0; - - UVector *scriptSets = NULL; - uint32_t rtScriptSetsCount = 2; - - UTrie2 *anyCaseTrie = NULL; - UTrie2 *lowerCaseTrie = NULL; - - anyCaseTrie = utrie2_open(0, 0, &status); - lowerCaseTrie = utrie2_open(0, 0, &status); - - UnicodeString pattern(parseExp, -1, US_INV); - - // The scriptSets vector provides a mapping from TRIE values to the set of scripts. - // - // Reserved TRIE values: - // 0: Code point has no whole script confusables. - // 1: Code point is of script Common or Inherited. - // These code points do not participate in whole script confusable detection. - // (This is logically equivalent to saying that they contain confusables in - // all scripts) - // - // Because Trie values are indexes into the ScriptSets vector, pre-fill - // vector positions 0 and 1 to avoid conflicts with the reserved values. - - scriptSets = new UVector(status); - if (scriptSets == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - goto cleanup; - } - scriptSets->addElement((void *)NULL, status); - scriptSets->addElement((void *)NULL, status); - - // Convert the user input data from UTF-8 to UChar (UTF-16) - u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status); - if (status != U_BUFFER_OVERFLOW_ERROR) { - goto cleanup; - } - status = U_ZERO_ERROR; - input = static_cast(uprv_malloc((inputLen+1) * sizeof(UChar))); - if (input == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - goto cleanup; - } - u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status); - - parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); - - // Zap any Byte Order Mark at the start of input. 
Changing it to a space is benign - // given the syntax of the input. - if (*input == 0xfeff) { - *input = 0x20; - } - - // Parse the input, one line per iteration of this loop. - uregex_setText(parseRegexp, input, inputLen, &status); - while (uregex_findNext(parseRegexp, &status)) { - lineNum++; - if (uregex_start(parseRegexp, 1, &status) >= 0) { - // this was a blank or comment line. - continue; - } - if (uregex_start(parseRegexp, 8, &status) >= 0) { - // input file syntax error. - status = U_PARSE_ERROR; - goto cleanup; - } - if (U_FAILURE(status)) { - goto cleanup; - } - - // Pick up the start and optional range end code points from the parsed line. - UChar32 startCodePoint = SpoofImpl::ScanHex( - input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status); - UChar32 endCodePoint = startCodePoint; - if (uregex_start(parseRegexp, 3, &status) >=0) { - endCodePoint = SpoofImpl::ScanHex( - input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status); - } - - // Extract the two script names from the source line. We need these in an 8 bit - // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on - // to the ICU u_getPropertyValueEnum() function. Ugh. 
- char srcScriptName[20]; - char targScriptName[20]; - extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status); - extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status); - UScriptCode srcScript = - static_cast(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName)); - UScriptCode targScript = - static_cast(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName)); - if (U_FAILURE(status)) { - goto cleanup; - } - if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) { - status = U_INVALID_FORMAT_ERROR; - goto cleanup; - } - - // select the table - (A) any case or (L) lower case only - UTrie2 *table = anyCaseTrie; - if (uregex_start(parseRegexp, 7, &status) >= 0) { - table = lowerCaseTrie; - } - - // Build the set of scripts containing confusable characters for - // the code point(s) specified in this input line. - // Sanity check that the script of the source code point is the same - // as the source script indicated in the input file. Failure of this check is - // an error in the input file. - // Include the source script in the set (needed for Mixed Script Confusable detection). 
- // - UChar32 cp; - for (cp=startCodePoint; cp<=endCodePoint; cp++) { - int32_t setIndex = utrie2_get32(table, cp); - BuilderScriptSet *bsset = NULL; - if (setIndex > 0) { - U_ASSERT(setIndex < scriptSets->size()); - bsset = static_cast(scriptSets->elementAt(setIndex)); - } else { - bsset = new BuilderScriptSet(); - if (bsset == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - goto cleanup; - } - bsset->codePoint = cp; - bsset->trie = table; - bsset->sset = new ScriptSet(); - setIndex = scriptSets->size(); - bsset->index = setIndex; - bsset->rindex = 0; - if (bsset->sset == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - goto cleanup; - } - scriptSets->addElement(bsset, status); - utrie2_set32(table, cp, setIndex, &status); - } - bsset->sset->set(targScript, status); - bsset->sset->set(srcScript, status); - - if (U_FAILURE(status)) { - goto cleanup; - } - UScriptCode cpScript = uscript_getScript(cp, &status); - if (cpScript != srcScript) { - status = U_INVALID_FORMAT_ERROR; - goto cleanup; - } - } - } - - // Eliminate duplicate script sets. At this point we have a separate - // script set for every code point that had data in the input file. - // - // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them - // - // printf("Number of scriptSets: %d\n", scriptSets->size()); - { - int32_t duplicateCount = 0; - rtScriptSetsCount = 2; - for (int32_t outeri=2; outerisize(); outeri++) { - BuilderScriptSet *outerSet = static_cast(scriptSets->elementAt(outeri)); - if (outerSet->index != static_cast(outeri)) { - // This set was already identified as a duplicate. - // It will not be allocated a position in the runtime array of ScriptSets. 
- continue; - } - outerSet->rindex = rtScriptSetsCount++; - for (int32_t inneri=outeri+1; innerisize(); inneri++) { - BuilderScriptSet *innerSet = static_cast(scriptSets->elementAt(inneri)); - if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) { - delete innerSet->sset; - innerSet->scriptSetOwned = FALSE; - innerSet->sset = outerSet->sset; - innerSet->index = outeri; - innerSet->rindex = outerSet->rindex; - duplicateCount++; - } - // But this doesn't get all. We need to fix the TRIE. - } - } - // printf("Number of distinct script sets: %d\n", rtScriptSetsCount); - } - - - - // Update the Trie values to be reflect the run time script indexes (after duplicate merging). - // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets - // are unused, which is why the loop index starts at 2.) - { - for (int32_t i=2; isize(); i++) { - BuilderScriptSet *bSet = static_cast(scriptSets->elementAt(i)); - if (bSet->rindex != (uint32_t)i) { - utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status); - } - } - } - - // For code points with script==Common or script==Inherited, - // Set the reserved value of 1 into both Tries. These characters do not participate - // in Whole Script Confusable detection; this reserved value is the means - // by which they are detected. 
- { - UnicodeSet ignoreSet; - ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); - UnicodeSet inheritedSet; - inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); - ignoreSet.addAll(inheritedSet); - for (int32_t rn=0; rnfSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit; - spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size; - spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie; - void *where = spImpl->fSpoofData->reserveSpace(size, status); - utrie2_serialize(anyCaseTrie, where, size, &status); - - utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status); - size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status); - // printf("Lower case Trie size: %d\n", size); - if (status != U_BUFFER_OVERFLOW_ERROR) { - goto cleanup; - } - status = U_ZERO_ERROR; - spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit; - spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size; - spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie; - where = spImpl->fSpoofData->reserveSpace(size, status); - utrie2_serialize(lowerCaseTrie, where, size, &status); - - spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit; - spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount; - ScriptSet *rtScriptSets = static_cast - (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status)); - uint32_t rindex = 2; - for (int32_t i=2; isize(); i++) { - BuilderScriptSet *bSet = static_cast(scriptSets->elementAt(i)); - if (bSet->rindex < rindex) { - // We have already copied this script set to the serialized data. - continue; - } - U_ASSERT(rindex == bSet->rindex); - rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits. - rindex++; - } - } - - // Open new utrie2s from the serialized data. 
We don't want to keep the ones - // we just built because we would then have two copies of the data, one internal to - // the utries that we have already constructed, and one in the serialized data area. - // An alternative would be to not pre-serialize the Trie data, but that makes the - // spoof detector data different, depending on how the detector was constructed. - // It's simpler to keep the data always the same. - - spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized( - UTRIE2_16_VALUE_BITS, - (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie, - spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, - NULL, - &status); - - spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized( - UTRIE2_16_VALUE_BITS, - (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie, - spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, - NULL, - &status); - - - -cleanup: - if (U_FAILURE(status)) { - pe->line = lineNum; - } - uregex_close(parseRegexp); - uprv_free(input); - - int32_t i; - if (scriptSets != NULL) { - for (i=0; isize(); i++) { - BuilderScriptSet *bsset = static_cast(scriptSets->elementAt(i)); - delete bsset; - } - delete scriptSets; - } - utrie2_close(anyCaseTrie); - utrie2_close(lowerCaseTrie); - return; -} - -U_NAMESPACE_END - - - -BuilderScriptSet::BuilderScriptSet() { - codePoint = -1; - trie = NULL; - sset = NULL; - index = 0; - rindex = 0; - scriptSetOwned = TRUE; -} - -BuilderScriptSet::~BuilderScriptSet() { - if (scriptSetOwned) { - delete sset; - } -} - -#endif -#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS - diff --git a/icu4c/source/i18n/uspoof_wsconf.h b/icu4c/source/i18n/uspoof_wsconf.h deleted file mode 100644 index 4ef0c0f5f1..0000000000 --- a/icu4c/source/i18n/uspoof_wsconf.h +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (C) 2016 and later: Unicode, Inc. and others. 
-// License & terms of use: http://www.unicode.org/copyright.html -/* -****************************************************************************** -* -* Copyright (C) 2008-2012, International Business Machines -* Corporation and others. All Rights Reserved. -* -****************************************************************************** -* file name: uspoof_buildwsconf.h -* encoding: US-ASCII -* tab size: 8 (not used) -* indentation:4 -* -* created on: 2009Jan19 -* created by: Andy Heninger -* -* Internal classes and functions -* for compiling whole script confusable data into its binary (runtime) form. -*/ - -#ifndef __USPOOF_BUILDWSCONF_H__ -#define __USPOOF_BUILDWSCONF_H__ - -#include "unicode/utypes.h" - -#if !UCONFIG_NO_NORMALIZATION - -#if !UCONFIG_NO_REGULAR_EXPRESSIONS - -#include "uspoof_impl.h" -#include "utrie2.h" - - -U_NAMESPACE_BEGIN - -// -// class BuilderScriptSet. Represents the set of scripts (Script Codes) -// containing characters that are confusable with one specific -// code point. -// - -class BuilderScriptSet: public UMemory { - public: - UChar32 codePoint; // The source code point. - UTrie2 *trie; // Any-case or Lower-case Trie. - // These Trie tables are the final result of the - // build. This flag indicates which of the two - // this set of data is for. - ScriptSet *sset; // The set of scripts itself. - - // Vectors of all B - uint32_t index; // Index of this set in the Build Time vector - // of script sets. - uint32_t rindex; // Index of this set in the final (runtime) - // array of sets. - UBool scriptSetOwned; // True if this BuilderScriptSet owns (should delete) - // its underlying sset. 
- - BuilderScriptSet(); - ~BuilderScriptSet(); -}; - - -void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, - int32_t confusablesWSLen, UParseError *pe, UErrorCode &status); - -U_NAMESPACE_END - -#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS -#endif // !UCONFIG_NO_NORMALIZATION -#endif diff --git a/icu4c/source/test/cintltst/spooftest.c b/icu4c/source/test/cintltst/spooftest.c index 6a82372ab7..f012db0f91 100644 --- a/icu4c/source/test/cintltst/spooftest.c +++ b/icu4c/source/test/cintltst/spooftest.c @@ -478,7 +478,7 @@ static void TestUSpoofCAPI(void) { const UChar* tests[] = { goodLatin, scMixed, scLatin, goodCyrl, goodGreek, lll_Latin_a, lll_Latin_b, han_Hiragana }; - for (int32_t i=0; i