ICU-12549 Revisions to uspoof.h documentation. Actually removing identifier_info.h and other obsolete files from r39218.

X-SVN-Rev: 39297
This commit is contained in:
Shane Carr 2016-09-20 21:06:55 +00:00
parent 3a8a02cae1
commit d5d266654b
8 changed files with 73 additions and 1083 deletions

View File

@ -1,313 +0,0 @@
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/utf16.h"
#include "identifier_info.h"
#include "mutex.h"
#include "scriptset.h"
#include "ucln_in.h"
#include "uvector.h"
U_NAMESPACE_BEGIN
// File-scope constants shared by every IdentifierInfo instance. They are
// created by IdentifierInfo_init() and deleted by IdentifierInfo_cleanup().

// The code points U+0000..U+007F; frozen once constructed.
static UnicodeSet *ASCII;
// Scripts Latin + Han + Hiragana + Katakana.
static ScriptSet *JAPANESE;
// Scripts Latin + Han + Bopomofo.
static ScriptSet *CHINESE;
// Scripts Latin + Han + Hangul.
static ScriptSet *KOREAN;
// Scripts Cyrillic + Greek + Cherokee.
static ScriptSet *CONFUSABLE_WITH_LATIN;
// Passed to umtx_initOnce() by the IdentifierInfo constructor so that
// IdentifierInfo_init() runs once; reset again by IdentifierInfo_cleanup().
static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER;
U_CDECL_BEGIN
/**
 * Library cleanup hook for this file.
 * Deletes every file-scope constant, clears the pointers so nothing dangles,
 * and resets the init-once guard so the constants can be rebuilt later.
 *
 * @return always TRUE
 */
static UBool U_CALLCONV
IdentifierInfo_cleanup(void) {
    // The five objects are independent of each other, so the order of
    // destruction does not matter.
    delete CONFUSABLE_WITH_LATIN;
    delete KOREAN;
    delete CHINESE;
    delete JAPANESE;
    delete ASCII;

    CONFUSABLE_WITH_LATIN = NULL;
    KOREAN = NULL;
    CHINESE = NULL;
    JAPANESE = NULL;
    ASCII = NULL;

    gIdentifierInfoInitOnce.reset();
    return TRUE;
}
/**
 * One-time initializer for the file-scope constants (ASCII, JAPANESE, CHINESE,
 * KOREAN, CONFUSABLE_WITH_LATIN). Invoked through umtx_initOnce() from the
 * IdentifierInfo constructor.
 *
 * @param status set to U_MEMORY_ALLOCATION_ERROR if any constant cannot be
 *               allocated; may also be set by ScriptSet::set().
 */
static void U_CALLCONV
IdentifierInfo_init(UErrorCode &status) {
    // Register the cleanup hook before allocating anything. If only some of
    // the allocations below succeed, the ones that did are still released by
    // IdentifierInfo_cleanup() instead of being leaked.
    ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);

    ASCII = new UnicodeSet(0, 0x7f);
    JAPANESE = new ScriptSet();
    CHINESE = new ScriptSet();
    KOREAN = new ScriptSet();
    CONFUSABLE_WITH_LATIN = new ScriptSet();
    if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL
            || CONFUSABLE_WITH_LATIN == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    ASCII->freeze();
    JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
        .set(USCRIPT_KATAKANA, status);
    CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
    KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
    CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
        .set(USCRIPT_CHEROKEE, status);
}
U_CDECL_END
/**
 * Construct an empty IdentifierInfo.
 * Makes sure the file-scope constants exist, then allocates the per-object
 * containers. The identifier profile starts out as the full code point range
 * U+0000..U+10FFFF.
 *
 * @param status set to U_MEMORY_ALLOCATION_ERROR if any member cannot be
 *               allocated, or to whatever error the one-time initialization
 *               or uhash_open() reports. On failure the object must not be
 *               used for anything except destruction.
 */
IdentifierInfo::IdentifierInfo(UErrorCode &status):
        fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL),
        fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
    umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status);
    if (U_FAILURE(status)) {
        return;
    }

    fIdentifier = new UnicodeString();
    fRequiredScripts = new ScriptSet();
    fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
    // uhash_open() yields NULL when it fails; only install the key deleter on
    // a real table so that a failed open cannot lead to a NULL dereference.
    if (fScriptSetSet != NULL) {
        uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
    }
    fCommonAmongAlternates = new ScriptSet();
    fNumerics = new UnicodeSet();
    fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);

    // Do not overwrite a more specific error already reported by uhash_open().
    if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
            fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}
/**
 * Destructor. Releases every container allocated by the constructor.
 * Members left NULL by a failed construction are passed through unchanged.
 */
IdentifierInfo::~IdentifierInfo() {
    // Released in reverse order of construction; the members do not depend
    // on one another.
    delete fIdentifierProfile;
    delete fNumerics;
    delete fCommonAmongAlternates;
    uhash_close(fScriptSetSet);
    delete fRequiredScripts;
    delete fIdentifier;
}
/**
 * Discard the results of the previous analysis: required scripts, alternate
 * script sets, numerics and the common-among-alternates set.
 * The stored identifier and the identifier profile are left untouched.
 *
 * @return this
 */
IdentifierInfo &IdentifierInfo::clear() {
    // Script sets first, then the two collections.
    fRequiredScripts->resetAll();
    fCommonAmongAlternates->resetAll();
    fNumerics->clear();
    uhash_removeAll(fScriptSetSet);
    return *this;
}
/**
 * Replace the identifier profile with a copy of the given set.
 *
 * @param identifierProfile the characters that are to be allowed in the identifier
 * @return this
 */
IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
    UnicodeSet &profile = *fIdentifierProfile;
    profile = identifierProfile;   // copies the contents; the caller keeps its own set
    return *this;
}
/**
 * @return the current identifier profile; the set remains owned by this object.
 */
const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
    const UnicodeSet &profile = *fIdentifierProfile;
    return profile;
}
/**
 * Store and analyze an identifier.
 *
 * For every code point the script extensions are looked up, with Common and
 * Inherited removed. A code point with exactly one remaining script adds that
 * script to the required scripts. A code point with several candidate scripts
 * is recorded as an "alternate" script set, unless it already overlaps the
 * required scripts or an equal set is already recorded. A second pass prunes
 * alternates that have become redundant and computes the scripts common to
 * all remaining alternates. Decimal digits contribute the zero digit of their
 * number system to the numerics set.
 *
 * @param identifier the identifier to analyze
 * @param status     in/out error code. Nothing is done if it already indicates
 *                   failure. Set to U_MEMORY_ALLOCATION_ERROR if an alternate
 *                   script set cannot be allocated; in any failure case the
 *                   analysis results are incomplete.
 * @return this
 */
IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return *this;
    }
    *fIdentifier = identifier;
    clear();
    ScriptSet scriptsForCP;
    UChar32 cp;
    for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
        cp = identifier.char32At(i);
        // Store a representative character for each kind of decimal digit
        if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
            // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
            fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
        }
        UScriptCode extensions[500];
        int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, UPRV_LENGTHOF(extensions), &status);
        if (U_FAILURE(status)) {
            return *this;
        }
        scriptsForCP.resetAll();
        for (int32_t j=0; j<extensionsCount; j++) {
            scriptsForCP.set(extensions[j], status);
        }
        scriptsForCP.reset(USCRIPT_COMMON, status);
        scriptsForCP.reset(USCRIPT_INHERITED, status);
        switch (scriptsForCP.countMembers()) {
          case 0: break;
          case 1:
            // Single script, record it.
            fRequiredScripts->Union(scriptsForCP);
            break;
          default:
            if (!fRequiredScripts->intersects(scriptsForCP)
                    && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
                // If the set hasn't been added already, add it
                // (Add a copy, fScriptSetSet takes ownership of the copy.)
                ScriptSet *ownedCopy = new ScriptSet(scriptsForCP);
                if (ownedCopy == NULL) {
                    // Never hand a NULL key to the hash table.
                    status = U_MEMORY_ALLOCATION_ERROR;
                    return *this;
                }
                uhash_puti(fScriptSetSet, ownedCopy, 1, &status);
                if (U_FAILURE(status)) {
                    return *this;
                }
            }
            break;
        }
    }
    // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
    // [Kana], [Kana Hira] => [Kana]
    // This is relatively infrequent, so doesn't have to be optimized.
    // We also compute any commonalities among the alternates.
    if (uhash_count(fScriptSetSet) > 0) {
        fCommonAmongAlternates->setAll();
        for (int32_t it = UHASH_FIRST;;) {
            const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
            if (nextHashEl == NULL) {
                break;
            }
            ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
            // [Kana], [Kana Hira] => [Kana]
            if (fRequiredScripts->intersects(*next)) {
                uhash_removeElement(fScriptSetSet, nextHashEl);
            } else {
                fCommonAmongAlternates->intersect(*next);
                // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
                for (int32_t otherIt = UHASH_FIRST;;) {
                    const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
                    if (otherHashEl == NULL) {
                        break;
                    }
                    ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
                    if (next != other && next->contains(*other)) {
                        // 'next' is a superset of another alternate; drop it
                        // and stop scanning, since 'next' is no longer valid.
                        uhash_removeElement(fScriptSetSet, nextHashEl);
                        break;
                    }
                }
            }
        }
    }
    // With no alternates left there is nothing in common among them.
    if (uhash_count(fScriptSetSet) == 0) {
        fCommonAmongAlternates->resetAll();
    }
    return *this;
}
/**
 * @return the identifier stored by the last setIdentifier() call. The string
 *         is owned by this object and must not be deleted by the caller.
 */
const UnicodeString *IdentifierInfo::getIdentifier() const {
    const UnicodeString *identifier = fIdentifier;
    return identifier;
}
/**
 * @return the scripts recorded as required by setIdentifier(), i.e. those
 *         contributed by code points that map to exactly one script.
 *         Owned by this object.
 */
const ScriptSet *IdentifierInfo::getScripts() const {
    const ScriptSet *required = fRequiredScripts;
    return required;
}
/**
 * @return the hash table whose keys are the alternate ScriptSets recorded by
 *         setIdentifier(). The stored values are not meaningful.
 *         Owned by this object.
 */
const UHashtable *IdentifierInfo::getAlternates() const {
    const UHashtable *alternates = fScriptSetSet;
    return alternates;
}
/**
 * @return the set of representative zero digits, one per decimal number
 *         system seen by setIdentifier(). Owned by this object.
 */
const UnicodeSet *IdentifierInfo::getNumerics() const {
    const UnicodeSet *numerics = fNumerics;
    return numerics;
}
/**
 * @return the scripts shared by every alternate script set; empty when there
 *         are no alternates. Owned by this object.
 */
const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
    const ScriptSet *common = fCommonAmongAlternates;
    return common;
}
#if !UCONFIG_NO_NORMALIZATION
/**
 * Find the tightest restriction level that the analyzed identifier satisfies.
 *
 * The levels are tried from most to least restrictive:
 *   - USPOOF_UNRESTRICTIVE if the identifier has characters outside the
 *     profile or mixes more than one decimal number system;
 *   - USPOOF_ASCII if every character is ASCII;
 *   - USPOOF_SINGLE_SCRIPT_RESTRICTIVE if the script count is below 2;
 *   - USPOOF_HIGHLY_RESTRICTIVE if the scripts fit the Japanese, Chinese or
 *     Korean combination;
 *   - USPOOF_MODERATELY_RESTRICTIVE for Latin plus one other script that is
 *     not Cyrillic, Greek or Cherokee;
 *   - USPOOF_MINIMALLY_RESTRICTIVE otherwise.
 *
 * @param status error code, passed to ScriptSet::test().
 * @return the restriction level.
 */
URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
    const UBool outsideProfile = !fIdentifierProfile->containsAll(*fIdentifier);
    if (outsideProfile || getNumerics()->size() > 1) {
        return USPOOF_UNRESTRICTIVE;
    }
    if (ASCII->containsAll(*fIdentifier)) {
        return USPOOF_ASCII;
    }

    // This is a bit tricky. We look at a number of factors:
    //   the number of scripts in the text,
    //   plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc]),
    //   plus the number of alternates otherwise (this only works because we
    //   only test cardinality up to 2).
    // The required-scripts set omits COMMON and INHERITED; setIdentifier()
    // removes them. getScriptCount() computes exactly this quantity.
    const int32_t cardinalityPlus = getScriptCount();
    if (cardinalityPlus < 2) {
        return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
    }

    if (containsWithAlternates(*JAPANESE, *fRequiredScripts)) {
        return USPOOF_HIGHLY_RESTRICTIVE;
    }
    if (containsWithAlternates(*CHINESE, *fRequiredScripts)) {
        return USPOOF_HIGHLY_RESTRICTIVE;
    }
    if (containsWithAlternates(*KOREAN, *fRequiredScripts)) {
        return USPOOF_HIGHLY_RESTRICTIVE;
    }

    // Evaluation order is kept: the Latin test (which takes status) only runs
    // when the cardinality is exactly 2.
    if (cardinalityPlus == 2 &&
            fRequiredScripts->test(USCRIPT_LATIN, status) &&
            !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) {
        return USPOOF_MODERATELY_RESTRICTIVE;
    }
    return USPOOF_MINIMALLY_RESTRICTIVE;
}
#endif /* !UCONFIG_NO_NORMALIZATION */
/**
 * Count the scripts appearing in the analyzed identifier.
 * Common and Inherited are not counted; setIdentifier() removes them before
 * anything is recorded in fRequiredScripts.
 *
 * @return the number of required scripts, plus 1 if the alternates share at
 *         least one script, or plus the number of alternates if they share none.
 */
int32_t IdentifierInfo::getScriptCount() const {
    int32_t alternatesContribution;
    if (fCommonAmongAlternates->countMembers() == 0) {
        // No script is shared by all alternates: each alternate counts.
        alternatesContribution = uhash_count(fScriptSetSet);
    } else {
        // One shared script can satisfy every alternate.
        alternatesContribution = 1;
    }
    return fRequiredScripts->countMembers() + alternatesContribution;
}
/**
 * Test whether a set of scripts, together with the recorded alternates, fits
 * inside a container set.
 *
 * @param container the allowed scripts
 * @param containee the scripts that must all be in the container
 * @return TRUE if the container contains every script of containee and
 *         intersects every alternate script set of this object; FALSE otherwise.
 */
UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
    if (!container.contains(containee)) {
        return FALSE;
    }
    // Every alternate needs at least one script that the container allows.
    int32_t iter = UHASH_FIRST;
    const UHashElement *hashEl;
    while ((hashEl = uhash_nextElement(fScriptSetSet, &iter)) != NULL) {
        const ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer);
        if (!container.intersects(*alternatives)) {
            return FALSE;
        }
    }
    return TRUE;
}
/**
 * Append a readable, sorted list of alternate script sets to a string.
 * The sets are ordered with uhash_compareScriptSet and separated by "; ".
 *
 * @param dest       the string to append to
 * @param alternates a UHashtable whose keys are ScriptSet pointers
 * @param status     error code; dest is returned unchanged if the temporary
 *                   vector cannot be created.
 * @return dest
 */
UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
    UVector sorted(status);
    if (U_FAILURE(status)) {
        return dest;
    }

    // Collect the keys; the vector only borrows them from the hash table.
    int32_t pos = UHASH_FIRST;
    const UHashElement *el;
    while ((el = uhash_nextElement(alternates, &pos)) != NULL) {
        ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer);
        sorted.addElement(ss, status);
    }
    sorted.sort(uhash_compareScriptSet, status);

    UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
    const int32_t count = sorted.size();
    for (int32_t i = 0; i < count; i++) {
        if (i != 0) {
            dest.append(separator);
        }
        ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i));
        ss->displayScripts(dest);
    }
    return dest;
}
U_NAMESPACE_END

View File

@ -1,192 +0,0 @@
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2014, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
* identifier_info.h
*
* created on: 2013 Jan 7
* created by: Andy Heninger
*/
#ifndef __IDENTIFIER_INFO_H__
#define __IDENTIFIER_INFO_H__
#include "unicode/utypes.h"
#include "unicode/uniset.h"
#include "unicode/uspoof.h"
#include "uhash.h"
U_NAMESPACE_BEGIN
class ScriptSet;
// TODO(andy): review consistency of reference vs pointer arguments to the functions.
/**
* This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
* then setIdentifier. Available methods include:
* <ol>
* <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
* each of these.
* <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
* either Katakana or Hiragana.
* <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
* <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
* the identifier.
* <li>call getRestrictionLevel to see what the UTS36 restriction level is.
* </ol>
*
* This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo
*/
class U_I18N_API IdentifierInfo : public UMemory {

  public:
    /**
     * Create an identifier info object. Subsequently, call setIdentifier(), etc.
     * @param status Errorcode, set if errors occur.
     * @internal
     */
    IdentifierInfo(UErrorCode &status);

    /**
      * Destructor
      */
    virtual ~IdentifierInfo();

  private:
    /* Disallow copying for now. Can be added if there's a need.
     * Both members are declared but intentionally not implemented: the object
     * owns its members through raw pointers, so a compiler-generated copy
     * would lead to the same objects being deleted twice.
     */
    IdentifierInfo(const IdentifierInfo &other);
    IdentifierInfo &operator=(const IdentifierInfo &other);

  public:

    /**
     * Set the identifier profile: the characters that are to be allowed in the identifier.
     *
     * @param identifierProfile the characters that are to be allowed in the identifier
     * @return this
     * @internal
     */
    IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile);

    /**
     * Get the identifier profile: the characters that are to be allowed in the identifier.
     *
     * @return The characters that are to be allowed in the identifier.
     * @internal
     */
    const UnicodeSet &getIdentifierProfile() const;

    /**
     * Set an identifier to analyze. Afterwards, call methods like getScripts()
     *
     * @param identifier the identifier to analyze
     * @param status Errorcode, set if errors occur.
     * @return this
     * @internal
     */
    IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status);

    /**
     * Get the identifier that was analyzed. The returned string is owned by the ICU library,
     * and must not be deleted by the caller.
     *
     * @return the identifier that was analyzed.
     * @internal
     */
    const UnicodeString *getIdentifier() const;

    /**
     * Get the scripts found in the identifiers.
     *
     * @return the set of explicit scripts.
     * @internal
     */
    const ScriptSet *getScripts() const;

    /**
     * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then
     * the set consisting of those scripts will be returned.
     *
     * @return a uhash, with each key being of type (ScriptSet *).
     *         This is a set, not a map, so the value stored in the uhash is not relevant.
     *         (It is, in fact, 1).
     *         Ownership of the uhash and its contents remains with the IdentifierInfo object,
     *         and remains valid until a new identifier is set or until the object is deleted.
     * @internal
     */
    const UHashtable *getAlternates() const;

    /**
     * Get the representative characters (zeros) for the numerics found in the identifier.
     *
     * @return the set of representative zero digits, one for each decimal number system found.
     * @internal
     */
    const UnicodeSet *getNumerics() const;

    /**
     * Find out which scripts are in common among the alternates.
     *
     * @return the set of scripts that are in common among the alternates.
     * @internal
     */
    const ScriptSet *getCommonAmongAlternates() const;

    /**
      * Get the number of scripts appearing in the identifier.
      *   Note: Common and Inherited scripts are omitted from the count.
      *   Note: Result may be high when the identifier contains characters
      *         with alternate scripts. The distinction between
      *         0, 1 and > 1 will remain valid, however.
      * @return the number of scripts.
      */
    int32_t getScriptCount() const;

#if !UCONFIG_NO_NORMALIZATION

    /**
     * Find the "tightest" restriction level that the identifier satisfies.
     *
     * @param status Errorcode, set if errors occur.
     * @return the restriction level.
     * @internal
     */
    URestrictionLevel getRestrictionLevel(UErrorCode &status) const;

#endif /*!UCONFIG_NO_NORMALIZATION */

    // NOTE(review): no definition of toString() appears in the
    // identifier_info.cpp implementation reviewed with this header - confirm
    // whether it is defined elsewhere before calling it.
    UnicodeString toString() const;

    /**
     * Produce a readable string of alternates.
     *
     * @param dest the string to which the display form is appended.
     * @param alternates a UHashtable of ScriptSets.
     *        Keys only, no meaningful values in the UHash.
     * @param status Errorcode, set if errors occur.
     * @return display form (dest)
     * @internal
     */
    static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status);

  private:

    IdentifierInfo  & clear();
    UBool             containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const;

    UnicodeString     *fIdentifier;             // the identifier last passed to setIdentifier()
    ScriptSet         *fRequiredScripts;        // scripts from code points that have exactly one script
    UHashtable        *fScriptSetSet;           // set of alternate ScriptSets (keys only)
    ScriptSet         *fCommonAmongAlternates;  // scripts shared by all alternates
    UnicodeSet        *fNumerics;               // one zero digit per decimal number system seen
    UnicodeSet        *fIdentifierProfile;      // characters allowed in the identifier
};
U_NAMESPACE_END
#endif // __IDENTIFIER_INFO_H__

View File

@ -42,10 +42,10 @@
* <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions: * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions:
* *
* <ol> * <ol>
* <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "desordenado" and * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "Harvest" and
* "ԁеѕогԁепаԁо".</li> * &quot;&Eta;arvest&quot;, where the second string starts with the Greek capital letter Eta.</li>
* <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof
* detection</em>), such as "pаypаl" spelled with Cyrillic 'а' characters.</li> * detection</em>), such as "paypal" with some Latin characters substituted with Cyrillic look-alikes.</li>
* </ol> * </ol>
* *
* <p> * <p>
@ -63,19 +63,25 @@
* *
* \code{.c} * \code{.c}
* UErrorCode status = U_ZERO_ERROR; * UErrorCode status = U_ZERO_ERROR;
* UChar* str1 = (UChar*) u"Harvest";
* UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA
*
* USpoofChecker* sc = uspoof_open(&status); * USpoofChecker* sc = uspoof_open(&status);
* uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
* int32_t bitmask = uspoof_areConfusable(sc, (UChar*) u"desordenado", -1, (UChar*) u"ԁеѕогԁепаԁо", -1, &status); *
* UBool result = (bitmask & USPOOF_ALL_CHECKS) != 0; * int32_t bitmask = uspoof_areConfusable(sc, str1, -1, str2, -1, &status);
* printf("areConfusable: %d (success: %d)\n", result, U_SUCCESS(status)); // areConfusable: 1 (success: 1) * UBool result = bitmask != 0;
* // areConfusable: 1 (status: U_ZERO_ERROR)
* printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
* uspoof_close(sc); * uspoof_close(sc);
* \endcode * \endcode
* *
* <p> * <p>
* The second line of the example creates a <code>USpoofChecker</code> object; the third line enables confusable * The call to {@link uspoof_open} creates a <code>USpoofChecker</code> object; the call to {@link uspoof_setChecks}
* checking and disables all other checks; the fourth line performs the confusability test; and the fifth line extracts * enables confusable checking and disables all other checks; the call to {@link uspoof_areConfusable} performs the
* the result out of the confusability test. For best performance, the instance should be created once (e.g., upon * confusability test; and the following line extracts the result out of the return value. For best performance,
* application startup), and the efficient {@link uspoof_areConfusable} method can be used at runtime. * the instance should be created once (e.g., upon application startup), and the efficient
* {@link uspoof_areConfusable} method can be used at runtime.
* *
* <p> * <p>
* The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers. It will automatically call * The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers. It will automatically call
@ -95,27 +101,28 @@
* *
* \code{.c} * \code{.c}
* UErrorCode status = U_ZERO_ERROR; * UErrorCode status = U_ZERO_ERROR;
* UChar* str1 = (UChar*) u"desordenado"; * UChar* str1 = (UChar*) u"Harvest";
* UChar* str2 = (UChar*) u"ԁеѕогԁепаԁо"; * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA
* *
* USpoofChecker* sc = uspoof_open(&status); * USpoofChecker* sc = uspoof_open(&status);
* uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
* *
* // Get skeleton 1 * // Get skeleton 1
* int32_t skel1Len = uspoof_getSkeleton(sc, 0, str1, -1, NULL, 0, &status); * int32_t skel1Len = uspoof_getSkeleton(sc, 0, str1, -1, NULL, 0, &status);
* UChar* skel1 = (UChar*) malloc(skel1Len * sizeof(UChar)); * UChar* skel1 = (UChar*) malloc(++skel1Len * sizeof(UChar));
* status = U_ZERO_ERROR; * status = U_ZERO_ERROR;
* uspoof_getSkeleton(sc, 0, str1, -1, skel1, skel1Len, &status); * uspoof_getSkeleton(sc, 0, str1, -1, skel1, skel1Len, &status);
* *
* // Get skeleton 2 * // Get skeleton 2
* int32_t skel2Len = uspoof_getSkeleton(sc, 0, str2, -1, NULL, 0, &status); * int32_t skel2Len = uspoof_getSkeleton(sc, 0, str2, -1, NULL, 0, &status);
* UChar* skel2 = (UChar*) malloc(skel2Len * sizeof(UChar)); * UChar* skel2 = (UChar*) malloc(++skel2Len * sizeof(UChar));
* status = U_ZERO_ERROR; * status = U_ZERO_ERROR;
* uspoof_getSkeleton(sc, 0, str2, -1, skel2, skel2Len, &status); * uspoof_getSkeleton(sc, 0, str2, -1, skel2, skel2Len, &status);
* *
* // Are the skeletons the same? * // Are the skeletons the same?
* UBool result = (skel1Len == skel2Len) && memcmp(skel1, skel2, skel1Len) == 0; * UBool result = u_strCompare(skel1, -1, skel2, -1, FALSE) == 0;
* printf("areConfusable: %d (success: %d)\n", result, U_SUCCESS(status)); // areConfusable: 1 (success: 1) * // areConfusable: 1 (status: U_ZERO_ERROR)
* printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
* uspoof_close(sc); * uspoof_close(sc);
* free(skel1); * free(skel1);
* free(skel2); * free(skel2);
@ -126,21 +133,19 @@
* {uspoof_areConfusable} many times in a loop, {uspoof_getSkeleton} can be used instead, as shown below: * {uspoof_areConfusable} many times in a loop, {uspoof_getSkeleton} can be used instead, as shown below:
* *
* \code{.c} * \code{.c}
* // Setup:
* UErrorCode status = U_ZERO_ERROR; * UErrorCode status = U_ZERO_ERROR;
* UChar* dictionary[2] = { (UChar*) u"lorem", (UChar*) u"ipsum" }; * #define DICTIONARY_LENGTH 2
* UChar* skeletons[sizeof(dictionary)/sizeof(UChar*)]; * UChar* dictionary[DICTIONARY_LENGTH] = { (UChar*) u"lorem", (UChar*) u"ipsum" };
* int32_t skeletonLengths[sizeof(dictionary)/sizeof(UChar*)]; * UChar* skeletons[DICTIONARY_LENGTH];
* UChar* str = (UChar*) u"1orern"; * UChar* str = (UChar*) u"1orern";
* *
* // Setup: * // Setup:
* USpoofChecker* sc = uspoof_open(&status); * USpoofChecker* sc = uspoof_open(&status);
* uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
* for (size_t i=0; i<sizeof(dictionary)/sizeof(UChar*); i++) { * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
* UChar* word = dictionary[i]; * UChar* word = dictionary[i];
* int32_t len = uspoof_getSkeleton(sc, 0, word, -1, NULL, 0, &status); * int32_t len = uspoof_getSkeleton(sc, 0, word, -1, NULL, 0, &status);
* skeletons[i] = (UChar*) malloc(len * sizeof(UChar)); * skeletons[i] = (UChar*) malloc(++len * sizeof(UChar));
* skeletonLengths[i] = len;
* status = U_ZERO_ERROR; * status = U_ZERO_ERROR;
* uspoof_getSkeleton(sc, 0, word, -1, skeletons[i], len, &status); * uspoof_getSkeleton(sc, 0, word, -1, skeletons[i], len, &status);
* } * }
@ -148,22 +153,20 @@
* // Live Check: * // Live Check:
* { * {
* int32_t len = uspoof_getSkeleton(sc, 0, str, -1, NULL, 0, &status); * int32_t len = uspoof_getSkeleton(sc, 0, str, -1, NULL, 0, &status);
* UChar* skel = (UChar*) malloc(len * sizeof(UChar)); * UChar* skel = (UChar*) malloc(++len * sizeof(UChar));
* status = U_ZERO_ERROR; * status = U_ZERO_ERROR;
* uspoof_getSkeleton(sc, 0, str, -1, skel, len, &status); * uspoof_getSkeleton(sc, 0, str, -1, skel, len, &status);
* UBool result = FALSE; * UBool result = FALSE;
* for (size_t i=0; i<sizeof(dictionary)/sizeof(UChar*); i++) { * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
* if (len == skeletonLengths[i] && memcmp(skel, skeletons[i], len) == 0) { * result = u_strCompare(skel, -1, skeletons[i], -1, FALSE) == 0;
* result = TRUE; * if (result == TRUE) { break; }
* }
* } * }
* // Has confusable in dictionary: 1 (success: 1) * // Has confusable in dictionary: 1 (status: U_ZERO_ERROR)
* printf("Has confusable in dictionary: %d (success: %d)\n", result, U_SUCCESS(status)); * printf("Has confusable in dictionary: %d (status: %s)\n", result, u_errorName(status));
* free(skel); * free(skel);
* } * }
* *
* // Cleanup: * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
* for (size_t i=0; i<sizeof(dictionary)/sizeof(UChar*); i++) {
* free(skeletons[i]); * free(skeletons[i]);
* } * }
* uspoof_close(sc); * uspoof_close(sc);
@ -182,7 +185,7 @@
* *
* \code{.c} * \code{.c}
* UErrorCode status = U_ZERO_ERROR; * UErrorCode status = U_ZERO_ERROR;
* UChar* str = (UChar*) u"pаypаl"; // with Cyrillic 'а' characters * UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A
* *
* // Get the default set of allowable characters: * // Get the default set of allowable characters:
* USet* allowed = uset_openEmpty(); * USet* allowed = uset_openEmpty();
@ -195,7 +198,8 @@
* *
* int32_t bitmask = uspoof_check(sc, str, -1, NULL, &status); * int32_t bitmask = uspoof_check(sc, str, -1, NULL, &status);
* UBool result = bitmask != 0; * UBool result = bitmask != 0;
* printf("fails checks: %d (success: %d)\n", result, U_SUCCESS(status)); // fails checks: 1 (success: 1) * // fails checks: 1 (status: U_ZERO_ERROR)
* printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
* uspoof_close(sc); * uspoof_close(sc);
* uset_close(allowed); * uset_close(allowed);
* \endcode * \endcode
@ -216,7 +220,7 @@
* *
* \code{.c} * \code{.c}
* UErrorCode status = U_ZERO_ERROR; * UErrorCode status = U_ZERO_ERROR;
* UChar* str = (UChar*) u"pаypаl"; // with Cyrillic 'а' characters * UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A
* *
* // Get the default set of allowable characters: * // Get the default set of allowable characters:
* USet* allowed = uset_openEmpty(); * USet* allowed = uset_openEmpty();
@ -233,8 +237,8 @@
* int32_t failures1 = bitmask; * int32_t failures1 = bitmask;
* int32_t failures2 = uspoof_getCheckResultChecks(checkResult, &status); * int32_t failures2 = uspoof_getCheckResultChecks(checkResult, &status);
* assert(failures1 == failures2); * assert(failures1 == failures2);
* // checks that failed: 16 (success: 1) * // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
* printf("checks that failed: %d (success: %d)\n", failures1, U_SUCCESS(status)); * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
* *
* // Cleanup: * // Cleanup:
* uspoof_close(sc); * uspoof_close(sc);
@ -247,7 +251,7 @@
* *
* \code{.cpp} * \code{.cpp}
* UErrorCode status = U_ZERO_ERROR; * UErrorCode status = U_ZERO_ERROR;
* UnicodeString str((UChar*) u"pаypаl"); // with Cyrillic 'а' characters * UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A
* *
* // Get the default set of allowable characters: * // Get the default set of allowable characters:
* UnicodeSet allowed; * UnicodeSet allowed;
@ -264,8 +268,8 @@
* int32_t failures1 = bitmask; * int32_t failures1 = bitmask;
* int32_t failures2 = uspoof_getCheckResultChecks(checkResult.getAlias(), &status); * int32_t failures2 = uspoof_getCheckResultChecks(checkResult.getAlias(), &status);
* assert(failures1 == failures2); * assert(failures1 == failures2);
* // checks that failed: 16 (success: 1) * // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
* printf("checks that failed: %d (success: %d)\n", failures1, U_SUCCESS(status)); * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
* *
* // Explicit cleanup not necessary. * // Explicit cleanup not necessary.
* \endcode * \endcode
@ -291,14 +295,15 @@
* *
* \code{.c} * \code{.c}
* UErrorCode status = U_ZERO_ERROR; * UErrorCode status = U_ZERO_ERROR;
* UChar* str = (UChar*) u"8"; * UChar* str = (UChar*) u"8\u09EA"; // 8 mixed with U+09EA BENGALI DIGIT FOUR
* *
* USpoofChecker* sc = uspoof_open(&status); * USpoofChecker* sc = uspoof_open(&status);
* uspoof_setChecks(sc, USPOOF_INVISIBLE | USPOOF_MIXED_NUMBERS, &status); * uspoof_setChecks(sc, USPOOF_INVISIBLE | USPOOF_MIXED_NUMBERS, &status);
* *
* int32_t bitmask = uspoof_check2(sc, str, -1, NULL, &status); * int32_t bitmask = uspoof_check2(sc, str, -1, NULL, &status);
* UBool result = bitmask != 0; * UBool result = bitmask != 0;
* printf("fails checks: %d (success: %d)\n", result, U_SUCCESS(status)); // fails checks: 1 (success: 1) * // fails checks: 1 (status: U_ZERO_ERROR)
* printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
* uspoof_close(sc); * uspoof_close(sc);
* \endcode * \endcode
* *
@ -307,7 +312,7 @@
* *
* \code{.cpp} * \code{.cpp}
* UErrorCode status = U_ZERO_ERROR; * UErrorCode status = U_ZERO_ERROR;
* UnicodeString str((UChar*) u"pаypаl"); // with Cyrillic 'а' characters * UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A
* *
* // Get the default set of allowable characters: * // Get the default set of allowable characters:
* UnicodeSet allowed; * UnicodeSet allowed;
@ -323,14 +328,14 @@
* int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status); * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status);
* *
* URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult.getAlias(), &status); * URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult.getAlias(), &status);
* // Since USPOOF_AUX_INFO was enabled, the restriction level is also available via the bitmask: * // Since USPOOF_AUX_INFO was enabled, the restriction level is also available in the upper bits of the bitmask:
* assert((restrictionLevel & bitmask) == restrictionLevel); * assert((restrictionLevel & bitmask) == restrictionLevel);
* // Restriction level: 1342177280 (success: 1) * // Restriction level: 0x50000000 (status: U_ZERO_ERROR)
* printf("Restriction level: %d (success: %d)\n", restrictionLevel, U_SUCCESS(status)); * printf("Restriction level: %#010x (status: %s)\n", restrictionLevel, u_errorName(status));
* \endcode * \endcode
* *
* <p> * <p>
* The code '1342177280' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE. Since * The code '0x50000000' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE. Since
* USPOOF_MINIMALLY_RESTRICTIVE is weaker than USPOOF_MODERATELY_RESTRICTIVE, the string fails the check. * USPOOF_MINIMALLY_RESTRICTIVE is weaker than USPOOF_MODERATELY_RESTRICTIVE, the string fails the check.
* *
* <p> * <p>
@ -351,13 +356,13 @@
* A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers. * A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
* *
* <p> * <p>
* <b>Thread Safety:</b> Thread Safety: The test functions for checking a single identifier, or for testing whether * <b>Thread Safety:</b> The test functions for checking a single identifier, or for testing whether
* two identifiers are possibly confusable, are thread safe. They may be called concurrently, from multiple threads, * two identifiers are possibly confusable, are thread safe. They may be called concurrently, from multiple threads,
* using the same USpoofChecker instance. * using the same USpoofChecker instance.
* *
* <p> * <p>
* More generally, the standard ICU thread safety rules apply: functions that take a const USpoofChecker parameter are * More generally, the standard ICU thread safety rules apply: functions that take a const USpoofChecker parameter are
* thread safe. Those that take a non-const USpoofChecier are not thread safe.. * thread safe. Those that take a non-const USpoofChecker are not thread safe..
* *
* @stable ICU 4.6 * @stable ICU 4.6
*/ */
@ -419,13 +424,9 @@ typedef enum USpoofChecks {
* the checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to * the checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to
* make {@link uspoof_areConfusable} return only those types of confusables. * make {@link uspoof_areConfusable} return only those types of confusables.
* *
* <p>Note: if you wish to use {@link uspoof_getSkeleton}, it is required that you enable at least one of the
* CONFUSABLE flags.
*
* @see uspoof_areConfusable * @see uspoof_areConfusable
* @see uspoof_getSkeleton * @see uspoof_getSkeleton
* @draft ICU 58 * @draft ICU 58
* @provisional This API might change or be removed in a future release.
*/ */
USPOOF_CONFUSABLE = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, USPOOF_CONFUSABLE = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE,
@ -471,7 +472,7 @@ typedef enum USpoofChecks {
USPOOF_INVISIBLE = 32, USPOOF_INVISIBLE = 32,
/** Check that an identifier contains only characters from a specified set /** Check that an identifier contains only characters from a specified set
* of acceptable characters. See {@link uspoof_setAllowedChars} * of acceptable characters. See {@link uspoof_setAllowedChars} and
* {@link uspoof_setAllowedLocales}. Note that a string that fails this check * {@link uspoof_setAllowedLocales}. Note that a string that fails this check
* will also fail the {@link USPOOF_RESTRICTION_LEVEL} check. * will also fail the {@link USPOOF_RESTRICTION_LEVEL} check.
*/ */
@ -750,14 +751,16 @@ U_STABLE int32_t U_EXPORT2
uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status); uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status);
/** /**
* Set the loosest restriction level allowed for strings. The default if this is not called is * Set the loosest restriction level allowed for strings. The default if this is not called is
* {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and * {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and
* {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are * {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are
* to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}. * to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}.
* @param restrictionLevel The loosest restriction level allowed. *
* @see URestrictionLevel * @param sc The USpoofChecker
* @stable ICU 51 * @param restrictionLevel The loosest restriction level allowed.
*/ * @see URestrictionLevel
* @stable ICU 51
*/
U_STABLE void U_EXPORT2 U_STABLE void U_EXPORT2
uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel); uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel);
@ -1059,6 +1062,8 @@ uspoof_checkUnicodeString(const USpoofChecker *sc,
* @param sc The USpoofChecker * @param sc The USpoofChecker
* @param id The identifier to be checked for possible security issues, * @param id The identifier to be checked for possible security issues,
* in UTF-16 format. * in UTF-16 format.
* @param length the length of the string to be checked, or -1 if the string is
* zero terminated.
* @param checkResult An instance of USpoofCheckResult to be filled with * @param checkResult An instance of USpoofCheckResult to be filled with
* details about the identifier. Can be NULL. * details about the identifier. Can be NULL.
* @param status The error code, set if an error occurred while attempting to * @param status The error code, set if an error occurred while attempting to
@ -1259,7 +1264,7 @@ uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *
* *
* <ul> * <ul>
* <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li> * <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
* <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE</li> * <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
* <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li> * <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
* </ul> * </ul>
* *

View File

@ -62,13 +62,13 @@ void SpoofImpl::construct(UErrorCode& status) {
if (U_FAILURE(status)) { return; } if (U_FAILURE(status)) { return; }
UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
allowedCharsSet->freeze();
fAllowedCharsSet = allowedCharsSet; fAllowedCharsSet = allowedCharsSet;
fAllowedLocales = uprv_strdup(""); fAllowedLocales = uprv_strdup("");
if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) { if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
status = U_MEMORY_ALLOCATION_ERROR; status = U_MEMORY_ALLOCATION_ERROR;
return; return;
} }
allowedCharsSet->freeze();
} }
@ -85,10 +85,10 @@ SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
fSpoofData = src.fSpoofData->addReference(); fSpoofData = src.fSpoofData->addReference();
} }
fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone()); fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
if (fAllowedCharsSet == NULL) { fAllowedLocales = uprv_strdup(src.fAllowedLocales);
if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
status = U_MEMORY_ALLOCATION_ERROR; status = U_MEMORY_ALLOCATION_ERROR;
} }
fAllowedLocales = uprv_strdup(src.fAllowedLocales);
fRestrictionLevel = src.fRestrictionLevel; fRestrictionLevel = src.fRestrictionLevel;
} }

View File

@ -123,7 +123,7 @@ public:
// Used to convert this CheckResult to the older int32_t return value API // Used to convert this CheckResult to the older int32_t return value API
int32_t toCombinedBitmask(int32_t expectedChecks); int32_t toCombinedBitmask(int32_t expectedChecks);
// Data Members (all stack-allocated) // Data Members
int32_t fMagic; // Internal sanity check. int32_t fMagic; // Internal sanity check.
int32_t fChecks; // Bit vector of checks that were failed. int32_t fChecks; // Bit vector of checks that were failed.
UnicodeSet fNumerics; // Set of numerics found in the string. UnicodeSet fNumerics; // Set of numerics found in the string.

View File

@ -1,438 +0,0 @@
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 2008-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: uspoof_wsconf.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009Jan05 (refactoring earlier files)
* created by: Andy Heninger
*
 *   Internal functions for compiling Whole Script confusable source data
* into its binary (runtime) form. The binary data format is described
* in uspoof_impl.h
*/
#include "unicode/utypes.h"
#include "unicode/uspoof.h"
#if !UCONFIG_NO_NORMALIZATION
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
#include "unicode/unorm.h"
#include "unicode/uregex.h"
#include "unicode/ustring.h"
#include "cmemory.h"
#include "scriptset.h"
#include "uspoof_impl.h"
#include "uhash.h"
#include "uvector.h"
#include "uassert.h"
#include "uspoof_wsconf.h"
U_NAMESPACE_USE
// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
// Example Lines:
// 006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O
// 0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
// | | | |
// | | | |---- Which table, Any Case or Lower Case (A or L)
// | | |----------Target script. We need this.
// | |----------------Src script. Should match the script of the source
// | code points. Beyond checking that, we don't keep it.
// |--------------------------------Source code points or range.
//
// The expression will match _all_ lines, including erroneous lines.
// The result of the parse is returned via the contents of the (match) groups.
// Pattern applied (in multi-line mode) to every line of the input.  Exactly one
// of the following groups participates in each match:
//   Group 1      - the line is blank or a comment.
//   Groups 2..7  - the line is a well formed data line.
//   Group 8      - anything else, i.e. a syntax error.
//
// The range separator is matched as two literal dots ("\\.\\." in the C string).
// An unescaped ".." would accept any two characters between the code points and
// silently treat a malformed line as a valid range instead of an error line.
static const char *parseExp =
        "(?m)"                                             // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                             // A blank or comment line.  Matches Group 1.
        "|^(?:"                                            //   OR
        "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
        "\\s*([A-Za-z]+)\\s*;"                             // The source script.  Group 4.
        "\\s*([A-Za-z]+)\\s*;"                             // The target script.  Group 5.
        "\\s*(?:(A)|(L))"                                  // The table A or L.   Group 6 or 7
        "[ \\t]*(?:#.*?)?"                                 // Trailing comment
        ")$|"                                              //   OR
        "^(.*?)$";                                         // An error line.      Group 8.
                                                           //    Any line not matching the preceding
                                                           //    parts of the expression will match
                                                           //    this, and thus be flagged as an error
// Copy the text of one regular expression match group into a char buffer.
//
//   e            - the regular expression holding the current match.
//   group        - number of the capture group to copy.
//   destBuf      - receives the group text, converted with the invariant
//                  converter (US_INV); the group must therefore contain only
//                  invariant characters.  Used for script names.
//   destCapacity - size of destBuf, in chars.
//   status       - ICU error code; a failure reported by uregex_group() is
//                  left in place.
//
// destBuf is set to the empty string up front, and stays empty when the group
// cannot be fetched, is reported with length -1, or is too long to fit in
// destBuf together with its terminator.
//
static void extractGroup(
    URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {

    const int32_t kGroupTextCapacity = 50;
    UChar groupText[kGroupTextCapacity];

    groupText[0] = 0;
    destBuf[0] = 0;

    const int32_t groupLen = uregex_group(e, group, groupText, kGroupTextCapacity, &status);
    if (U_FAILURE(status)) {
        return;
    }
    if (groupLen == -1 || groupLen >= destCapacity) {
        return;
    }

    // Read-only alias of the UTF-16 text; no copy is made.
    UnicodeString aliasedText(FALSE, groupText, groupLen);
    aliasedText.extract(0, groupLen, destBuf, destCapacity, US_INV);
}
U_NAMESPACE_BEGIN
// Build the Whole Script Confusable data
//
// Parses the whole-script confusables source text and appends the resulting
// binary data (two serialized tries plus an array of ScriptSets) to
// spImpl->fSpoofData.
//
//   spImpl           - spoof checker implementation whose fSpoofData receives
//                      the serialized data.
//   confusablesWS    - the source text, in UTF-8.
//   confusablesWSLen - length of confusablesWS, passed on to u_strFromUTF8().
//   pe               - on failure, pe->line is set to the number of input
//                      lines matched so far.
//   status           - ICU error code.  Nothing is done if it already indicates
//                      a failure on entry.  Set to U_MEMORY_ALLOCATION_ERROR on
//                      allocation failure, U_PARSE_ERROR for a line with bad
//                      syntax, and U_INVALID_FORMAT_ERROR for an unknown script
//                      name or a code point whose script differs from the
//                      source script given on its line.
//
// TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
//                     because everything is local to this one build function anyhow,
//                           OR
//                     break this function into more reasonably sized pieces, with
//                     state in WSConfusableDataBuilder.
//
void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
          int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
{
    if (U_FAILURE(status)) {
        return;
    }
    URegularExpression *parseRegexp = NULL;
    int32_t             inputLen    = 0;
    UChar              *input       = NULL;
    int32_t             lineNum     = 0;

    UVector            *scriptSets        = NULL;
    uint32_t            rtScriptSetsCount = 2;

    UTrie2             *anyCaseTrie   = NULL;
    UTrie2             *lowerCaseTrie = NULL;

    anyCaseTrie = utrie2_open(0, 0, &status);
    lowerCaseTrie = utrie2_open(0, 0, &status);

    UnicodeString pattern(parseExp, -1, US_INV);

    // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
    //
    // Reserved TRIE values:
    //   0:  Code point has no whole script confusables.
    //   1:  Code point is of script Common or Inherited.
    //       These code points do not participate in whole script confusable detection.
    //       (This is logically equivalent to saying that they contain confusables in
    //        all scripts)
    //
    // Because Trie values are indexes into the ScriptSets vector, pre-fill
    // vector positions 0 and 1 to avoid conflicts with the reserved values.

    scriptSets = new UVector(status);
    if (scriptSets == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        goto cleanup;
    }
    scriptSets->addElement((void *)NULL, status);
    scriptSets->addElement((void *)NULL, status);

    // Convert the user input data from UTF-8 to UChar (UTF-16)
    // The first call only computes the required length, so a buffer overflow
    // "error" is the expected outcome; anything else ends the build.
    u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        goto cleanup;
    }
    status = U_ZERO_ERROR;
    input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
    if (input == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        goto cleanup;
    }
    u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);

    parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);

    // Stop here if the conversion or the pattern compilation failed, rather than
    // inspecting a buffer that may not have been filled in.
    if (U_FAILURE(status)) {
        goto cleanup;
    }

    // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
    //   given the syntax of the input.
    if (*input == 0xfeff) {
        *input = 0x20;
    }

    // Parse the input, one line per iteration of this loop.
    uregex_setText(parseRegexp, input, inputLen, &status);
    while (uregex_findNext(parseRegexp, &status)) {
        lineNum++;
        if (uregex_start(parseRegexp, 1, &status) >= 0) {
            // this was a blank or comment line.
            continue;
        }
        if (uregex_start(parseRegexp, 8, &status) >= 0) {
            // input file syntax error.
            status = U_PARSE_ERROR;
            goto cleanup;
        }
        if (U_FAILURE(status)) {
            goto cleanup;
        }

        // Pick up the start and optional range end code points from the parsed line.
        UChar32  startCodePoint = SpoofImpl::ScanHex(
            input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
        UChar32  endCodePoint = startCodePoint;
        if (uregex_start(parseRegexp, 3, &status) >=0) {
            endCodePoint = SpoofImpl::ScanHex(
                input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
        }

        // Extract the two script names from the source line.  We need these in an 8 bit
        //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
        //   to the ICU u_getPropertyValueEnum() function.  Ugh.
        char  srcScriptName[20];
        char  targScriptName[20];
        extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
        extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
        UScriptCode srcScript  =
            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
        UScriptCode targScript =
            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
        if (U_FAILURE(status)) {
            goto cleanup;
        }
        if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
            status = U_INVALID_FORMAT_ERROR;
            goto cleanup;
        }

        // select the table - (A) any case or (L) lower case only
        UTrie2 *table = anyCaseTrie;
        if (uregex_start(parseRegexp, 7, &status) >= 0) {
            table = lowerCaseTrie;
        }

        // Build the set of scripts containing confusable characters for
        //   the code point(s) specified in this input line.
        // Sanity check that the script of the source code point is the same
        //   as the source script indicated in the input file.  Failure of this check is
        //   an error in the input file.
        // Include the source script in the set (needed for Mixed Script Confusable detection).
        //
        UChar32 cp;
        for (cp=startCodePoint; cp<=endCodePoint; cp++) {
            int32_t setIndex = utrie2_get32(table, cp);
            BuilderScriptSet *bsset = NULL;
            if (setIndex > 0) {
                U_ASSERT(setIndex < scriptSets->size());
                bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
            } else {
                bsset = new BuilderScriptSet();
                if (bsset == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    goto cleanup;
                }
                bsset->codePoint = cp;
                bsset->trie = table;
                bsset->sset = new ScriptSet();
                setIndex = scriptSets->size();
                bsset->index = setIndex;
                bsset->rindex = 0;
                if (bsset->sset == NULL) {
                    // bsset has not been added to scriptSets yet, so the cleanup
                    // code below would not find it; release it here.
                    delete bsset;
                    status = U_MEMORY_ALLOCATION_ERROR;
                    goto cleanup;
                }
                scriptSets->addElement(bsset, status);
                utrie2_set32(table, cp, setIndex, &status);
            }
            bsset->sset->set(targScript, status);
            bsset->sset->set(srcScript, status);

            if (U_FAILURE(status)) {
                goto cleanup;
            }
            UScriptCode cpScript = uscript_getScript(cp, &status);
            if (cpScript != srcScript) {
                status = U_INVALID_FORMAT_ERROR;
                goto cleanup;
            }
        }
    }

    // Eliminate duplicate script sets.  At this point we have a separate
    // script set for every code point that had data in the input file.
    //
    // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
    //
    // printf("Number of scriptSets: %d\n", scriptSets->size());
    {
        int32_t duplicateCount = 0;
        rtScriptSetsCount = 2;
        for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
            BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
            if (outerSet->index != static_cast<uint32_t>(outeri)) {
                // This set was already identified as a duplicate.
                //   It will not be allocated a position in the runtime array of ScriptSets.
                continue;
            }
            outerSet->rindex = rtScriptSetsCount++;
            for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
                BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
                if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
                    // Share the outer set's ScriptSet; the inner wrapper no longer owns one.
                    delete innerSet->sset;
                    innerSet->scriptSetOwned = FALSE;
                    innerSet->sset = outerSet->sset;
                    innerSet->index = outeri;
                    innerSet->rindex = outerSet->rindex;
                    duplicateCount++;
                }
                // But this doesn't get all.  We need to fix the TRIE.
            }
        }
        // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
    }

    // Update the Trie values to reflect the run time script indexes (after duplicate merging).
    //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
    //     are unused, which is why the loop index starts at 2.)
    {
        for (int32_t i=2; i<scriptSets->size(); i++) {
            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            if (bSet->rindex != (uint32_t)i) {
                utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
            }
        }
    }

    // For code points with script==Common or script==Inherited,
    //   Set the reserved value of 1 into both Tries.  These characters do not participate
    //   in Whole Script Confusable detection; this reserved value is the means
    //   by which they are detected.
    {
        UnicodeSet ignoreSet;
        ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
        UnicodeSet inheritedSet;
        inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
        ignoreSet.addAll(inheritedSet);
        for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
            UChar32 rangeStart = ignoreSet.getRangeStart(rn);
            UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
            utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
            utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
        }
    }

    // Serialize the data to the Spoof Detector
    // Each trie is first serialized with a zero-capacity buffer to learn its
    // size (buffer overflow is the expected result), then space is reserved in
    // the spoof data and the trie is serialized into it.  The offset of each
    // section is the value of fMemLimit read before space for it is reserved.
    {
        utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
        int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
        // printf("Any case Trie size: %d\n", size);
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            goto cleanup;
        }
        status = U_ZERO_ERROR;
        spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
        // NOTE(review): if a later step jumps to cleanup, anyCaseTrie is closed there
        // while this pointer still refers to it -- confirm how SpoofData copes with that.
        spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
        void *where = spImpl->fSpoofData->reserveSpace(size, status);
        utrie2_serialize(anyCaseTrie, where, size, &status);

        utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
        size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
        // printf("Lower case Trie size: %d\n", size);
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            goto cleanup;
        }
        status = U_ZERO_ERROR;
        spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
        // NOTE(review): same concern as for fAnyCaseTrie above.
        spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
        where = spImpl->fSpoofData->reserveSpace(size, status);
        utrie2_serialize(lowerCaseTrie, where, size, &status);

        spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
        ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
            (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
        // Do not write through the reserved pointer if any of the steps above failed.
        if (U_FAILURE(status)) {
            goto cleanup;
        }
        uint32_t rindex = 2;
        for (int32_t i=2; i<scriptSets->size(); i++) {
            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            if (bSet->rindex < rindex) {
                // We have already copied this script set to the serialized data.
                continue;
            }
            U_ASSERT(rindex == bSet->rindex);
            rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
            rindex++;
        }
    }

    // Open new utrie2s from the serialized data.  We don't want to keep the ones
    //   we just built because we would then have two copies of the data, one internal to
    //   the utries that we have already constructed, and one in the serialized data area.
    //   An alternative would be to not pre-serialize the Trie data, but that makes the
    //   spoof detector data different, depending on how the detector was constructed.
    //   It's simpler to keep the data always the same.

    spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
        UTRIE2_16_VALUE_BITS,
        (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
        NULL,
        &status);

    // The lower case trie must be opened with its own serialized length, not
    // the length of the any-case trie.
    spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
        UTRIE2_16_VALUE_BITS,
        (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength,
        NULL,
        &status);

cleanup:
    if (U_FAILURE(status)) {
        pe->line = lineNum;
    }
    uregex_close(parseRegexp);
    uprv_free(input);

    int32_t i;
    if (scriptSets != NULL) {
        // The vector does not delete its elements here; release each wrapper by hand.
        // (Slots 0 and 1 hold NULL.)
        for (i=0; i<scriptSets->size(); i++) {
            BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            delete bsset;
        }
        delete scriptSets;
    }
    utrie2_close(anyCaseTrie);
    utrie2_close(lowerCaseTrie);
    return;
}
U_NAMESPACE_END
// Create an empty wrapper: no code point (-1), no trie, no ScriptSet yet,
// both indexes zero.  The wrapper starts out as the owner of whatever
// ScriptSet is later stored in sset.
BuilderScriptSet::BuilderScriptSet()
    : codePoint(-1),
      trie(NULL),
      sset(NULL),
      index(0),
      rindex(0),
      scriptSetOwned(TRUE) {
}
// Release the underlying ScriptSet, but only when this wrapper still owns it.
BuilderScriptSet::~BuilderScriptSet() {
    if (!scriptSetOwned) {
        return;
    }
    delete sset;
}
#endif
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS

View File

@ -1,72 +0,0 @@
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 2008-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: uspoof_buildwsconf.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009Jan19
* created by: Andy Heninger
*
* Internal classes and functions
* for compiling whole script confusable data into its binary (runtime) form.
*/
#ifndef __USPOOF_BUILDWSCONF_H__
#define __USPOOF_BUILDWSCONF_H__
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
#include "uspoof_impl.h"
#include "utrie2.h"
U_NAMESPACE_BEGIN
//
// class BuilderScriptSet. Represents the set of scripts (Script Codes)
// containing characters that are confusable with one specific
// code point.
//
class BuilderScriptSet: public UMemory {
  public:
    UChar32      codePoint;       // The source code point.
    UTrie2      *trie;            // Any-case or Lower-case Trie.
                                  //   These Trie tables are the final result of the
                                  //   build.  This pointer indicates which of the two
                                  //   this set of data is for.
    ScriptSet   *sset;            // The set of scripts itself.
                                  //   May be shared with another BuilderScriptSet
                                  //   once duplicate sets have been merged.

    uint32_t     index;           // Index of this set in the Build Time vector
                                  //   of script sets.
    uint32_t     rindex;          // Index of this set in the final (runtime)
                                  //   array of sets.
    UBool        scriptSetOwned;  // True if this BuilderScriptSet owns (should delete)
                                  //   its underlying sset.

    // Initializes an empty wrapper that owns its (not yet allocated) sset.
    BuilderScriptSet();
    // Deletes sset only when scriptSetOwned is true.
    ~BuilderScriptSet();
};
// Compile whole script confusable source text (UTF-8, confusablesWSLen long)
// into its binary form, stored in spImpl's spoof data.  On failure, status is set
// and pe->line receives the number of input lines processed.
void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
          int32_t confusablesWSLen, UParseError *pe, UErrorCode &status);
U_NAMESPACE_END
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
#endif // !UCONFIG_NO_NORMALIZATION
#endif

View File

@ -478,7 +478,7 @@ static void TestUSpoofCAPI(void) {
const UChar* tests[] = { goodLatin, scMixed, scLatin, const UChar* tests[] = { goodLatin, scMixed, scLatin,
goodCyrl, goodGreek, lll_Latin_a, lll_Latin_b, han_Hiragana }; goodCyrl, goodGreek, lll_Latin_a, lll_Latin_b, han_Hiragana };
for (int32_t i=0; i<sizeof(tests)/sizeof(UChar*); i++) { for (int32_t i=0; i<UPRV_LENGTHOF(tests); i++) {
const UChar* str = tests[i]; const UChar* str = tests[i];
// Basic test // Basic test