ICU-12549 Revisions to uspoof.h documentation. Actually removing identifier_info.h and other obsolete files from r39218.
X-SVN-Rev: 39297
This commit is contained in:
parent
3a8a02cae1
commit
d5d266654b
@ -1,313 +0,0 @@
|
||||
// Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2012-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/utf16.h"
|
||||
|
||||
#include "identifier_info.h"
|
||||
#include "mutex.h"
|
||||
#include "scriptset.h"
|
||||
#include "ucln_in.h"
|
||||
#include "uvector.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
static UnicodeSet *ASCII;
|
||||
static ScriptSet *JAPANESE;
|
||||
static ScriptSet *CHINESE;
|
||||
static ScriptSet *KOREAN;
|
||||
static ScriptSet *CONFUSABLE_WITH_LATIN;
|
||||
static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER;
|
||||
|
||||
|
||||
U_CDECL_BEGIN
|
||||
static UBool U_CALLCONV
|
||||
IdentifierInfo_cleanup(void) {
|
||||
delete ASCII;
|
||||
ASCII = NULL;
|
||||
delete JAPANESE;
|
||||
JAPANESE = NULL;
|
||||
delete CHINESE;
|
||||
CHINESE = NULL;
|
||||
delete KOREAN;
|
||||
KOREAN = NULL;
|
||||
delete CONFUSABLE_WITH_LATIN;
|
||||
CONFUSABLE_WITH_LATIN = NULL;
|
||||
gIdentifierInfoInitOnce.reset();
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
static void U_CALLCONV
|
||||
IdentifierInfo_init(UErrorCode &status) {
|
||||
ASCII = new UnicodeSet(0, 0x7f);
|
||||
JAPANESE = new ScriptSet();
|
||||
CHINESE = new ScriptSet();
|
||||
KOREAN = new ScriptSet();
|
||||
CONFUSABLE_WITH_LATIN = new ScriptSet();
|
||||
if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL
|
||||
|| CONFUSABLE_WITH_LATIN == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
ASCII->freeze();
|
||||
JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
|
||||
.set(USCRIPT_KATAKANA, status);
|
||||
CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
|
||||
KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
|
||||
CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
|
||||
.set(USCRIPT_CHEROKEE, status);
|
||||
ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
|
||||
}
|
||||
U_CDECL_END
|
||||
|
||||
|
||||
IdentifierInfo::IdentifierInfo(UErrorCode &status):
|
||||
fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL),
|
||||
fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
|
||||
umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
fIdentifier = new UnicodeString();
|
||||
fRequiredScripts = new ScriptSet();
|
||||
fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
|
||||
uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
|
||||
fCommonAmongAlternates = new ScriptSet();
|
||||
fNumerics = new UnicodeSet();
|
||||
fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
|
||||
|
||||
if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
|
||||
fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
IdentifierInfo::~IdentifierInfo() {
|
||||
delete fIdentifier;
|
||||
delete fRequiredScripts;
|
||||
uhash_close(fScriptSetSet);
|
||||
delete fCommonAmongAlternates;
|
||||
delete fNumerics;
|
||||
delete fIdentifierProfile;
|
||||
}
|
||||
|
||||
|
||||
IdentifierInfo &IdentifierInfo::clear() {
|
||||
fRequiredScripts->resetAll();
|
||||
uhash_removeAll(fScriptSetSet);
|
||||
fNumerics->clear();
|
||||
fCommonAmongAlternates->resetAll();
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
|
||||
*fIdentifierProfile = identifierProfile;
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
|
||||
return *fIdentifierProfile;
|
||||
}
|
||||
|
||||
|
||||
IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return *this;
|
||||
}
|
||||
*fIdentifier = identifier;
|
||||
clear();
|
||||
ScriptSet scriptsForCP;
|
||||
UChar32 cp;
|
||||
for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
|
||||
cp = identifier.char32At(i);
|
||||
// Store a representative character for each kind of decimal digit
|
||||
if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
|
||||
// Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
|
||||
fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
|
||||
}
|
||||
UScriptCode extensions[500];
|
||||
int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, UPRV_LENGTHOF(extensions), &status);
|
||||
if (U_FAILURE(status)) {
|
||||
return *this;
|
||||
}
|
||||
scriptsForCP.resetAll();
|
||||
for (int32_t j=0; j<extensionsCount; j++) {
|
||||
scriptsForCP.set(extensions[j], status);
|
||||
}
|
||||
scriptsForCP.reset(USCRIPT_COMMON, status);
|
||||
scriptsForCP.reset(USCRIPT_INHERITED, status);
|
||||
switch (scriptsForCP.countMembers()) {
|
||||
case 0: break;
|
||||
case 1:
|
||||
// Single script, record it.
|
||||
fRequiredScripts->Union(scriptsForCP);
|
||||
break;
|
||||
default:
|
||||
if (!fRequiredScripts->intersects(scriptsForCP)
|
||||
&& !uhash_geti(fScriptSetSet, &scriptsForCP)) {
|
||||
// If the set hasn't been added already, add it
|
||||
// (Add a copy, fScriptSetSet takes ownership of the copy.)
|
||||
uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Now make a final pass through ScriptSetSet to remove alternates that came before singles.
|
||||
// [Kana], [Kana Hira] => [Kana]
|
||||
// This is relatively infrequent, so doesn't have to be optimized.
|
||||
// We also compute any commonalities among the alternates.
|
||||
if (uhash_count(fScriptSetSet) > 0) {
|
||||
fCommonAmongAlternates->setAll();
|
||||
for (int32_t it = UHASH_FIRST;;) {
|
||||
const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
|
||||
if (nextHashEl == NULL) {
|
||||
break;
|
||||
}
|
||||
ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
|
||||
// [Kana], [Kana Hira] => [Kana]
|
||||
if (fRequiredScripts->intersects(*next)) {
|
||||
uhash_removeElement(fScriptSetSet, nextHashEl);
|
||||
} else {
|
||||
fCommonAmongAlternates->intersect(*next);
|
||||
// [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
|
||||
for (int32_t otherIt = UHASH_FIRST;;) {
|
||||
const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
|
||||
if (otherHashEl == NULL) {
|
||||
break;
|
||||
}
|
||||
ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
|
||||
if (next != other && next->contains(*other)) {
|
||||
uhash_removeElement(fScriptSetSet, nextHashEl);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (uhash_count(fScriptSetSet) == 0) {
|
||||
fCommonAmongAlternates->resetAll();
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
const UnicodeString *IdentifierInfo::getIdentifier() const {
|
||||
return fIdentifier;
|
||||
}
|
||||
|
||||
const ScriptSet *IdentifierInfo::getScripts() const {
|
||||
return fRequiredScripts;
|
||||
}
|
||||
|
||||
const UHashtable *IdentifierInfo::getAlternates() const {
|
||||
return fScriptSetSet;
|
||||
}
|
||||
|
||||
|
||||
const UnicodeSet *IdentifierInfo::getNumerics() const {
|
||||
return fNumerics;
|
||||
}
|
||||
|
||||
const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
|
||||
return fCommonAmongAlternates;
|
||||
}
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
|
||||
if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) {
|
||||
return USPOOF_UNRESTRICTIVE;
|
||||
}
|
||||
if (ASCII->containsAll(*fIdentifier)) {
|
||||
return USPOOF_ASCII;
|
||||
}
|
||||
// This is a bit tricky. We look at a number of factors.
|
||||
// The number of scripts in the text.
|
||||
// Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
|
||||
// Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
|
||||
|
||||
// Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
|
||||
// time it is created, in setIdentifier().
|
||||
int32_t cardinalityPlus = fRequiredScripts->countMembers() +
|
||||
(fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
|
||||
if (cardinalityPlus < 2) {
|
||||
return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
|
||||
}
|
||||
if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts)
|
||||
|| containsWithAlternates(*KOREAN, *fRequiredScripts)) {
|
||||
return USPOOF_HIGHLY_RESTRICTIVE;
|
||||
}
|
||||
if (cardinalityPlus == 2 &&
|
||||
fRequiredScripts->test(USCRIPT_LATIN, status) &&
|
||||
!fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) {
|
||||
return USPOOF_MODERATELY_RESTRICTIVE;
|
||||
}
|
||||
return USPOOF_MINIMALLY_RESTRICTIVE;
|
||||
}
|
||||
|
||||
#endif /* !UCONFIG_NO_NORMALIZATION */
|
||||
|
||||
int32_t IdentifierInfo::getScriptCount() const {
|
||||
// Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
|
||||
int32_t count = fRequiredScripts->countMembers() +
|
||||
(fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
|
||||
UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
|
||||
if (!container.contains(containee)) {
|
||||
return FALSE;
|
||||
}
|
||||
for (int32_t iter = UHASH_FIRST; ;) {
|
||||
const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter);
|
||||
if (hashEl == NULL) {
|
||||
break;
|
||||
}
|
||||
ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer);
|
||||
if (!container.intersects(*alternatives)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
|
||||
UVector sorted(status);
|
||||
if (U_FAILURE(status)) {
|
||||
return dest;
|
||||
}
|
||||
for (int32_t pos = UHASH_FIRST; ;) {
|
||||
const UHashElement *el = uhash_nextElement(alternates, &pos);
|
||||
if (el == NULL) {
|
||||
break;
|
||||
}
|
||||
ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer);
|
||||
sorted.addElement(ss, status);
|
||||
}
|
||||
sorted.sort(uhash_compareScriptSet, status);
|
||||
UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
|
||||
for (int32_t i=0; i<sorted.size(); i++) {
|
||||
if (i>0) {
|
||||
dest.append(separator);
|
||||
}
|
||||
ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i));
|
||||
ss->displayScripts(dest);
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
@ -1,192 +0,0 @@
|
||||
// Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*
|
||||
* identifier_info.h
|
||||
*
|
||||
* created on: 2013 Jan 7
|
||||
* created by: Andy Heninger
|
||||
*/
|
||||
|
||||
#ifndef __IDENTIFIER_INFO_H__
|
||||
#define __IDENTIFIER_INFO_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/uspoof.h"
|
||||
#include "uhash.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class ScriptSet;
|
||||
|
||||
// TODO(andy): review consistency of reference vs pointer arguments to the functions.
|
||||
|
||||
/**
|
||||
* This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
|
||||
* then setIdentifier. Available methods include:
|
||||
* <ol>
|
||||
* <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
|
||||
* each of these.
|
||||
* <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
|
||||
* either Katakana or Hiragana.
|
||||
* <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
|
||||
* <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
|
||||
* the identifier.
|
||||
* <li>call getRestrictionLevel to see what the UTS36 restriction level is.
|
||||
* </ol>
|
||||
*
|
||||
* This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo
|
||||
*/
|
||||
class U_I18N_API IdentifierInfo : public UMemory {
|
||||
|
||||
public:
|
||||
/**
|
||||
* Create an identifier info object. Subsequently, call setIdentifier(), etc.
|
||||
* @internal
|
||||
*/
|
||||
IdentifierInfo(UErrorCode &status);
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
virtual ~IdentifierInfo();
|
||||
|
||||
private:
|
||||
/* Disallow copying for now. Can be added if there's a need. */
|
||||
IdentifierInfo(const IdentifierInfo &other);
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Set the identifier profile: the characters that are to be allowed in the identifier.
|
||||
*
|
||||
* @param identifierProfile the characters that are to be allowed in the identifier
|
||||
* @return this
|
||||
* @internal
|
||||
*/
|
||||
IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile);
|
||||
|
||||
/**
|
||||
* Get the identifier profile: the characters that are to be allowed in the identifier.
|
||||
*
|
||||
* @return The characters that are to be allowed in the identifier.
|
||||
* @internal
|
||||
*/
|
||||
const UnicodeSet &getIdentifierProfile() const;
|
||||
|
||||
|
||||
/**
|
||||
* Set an identifier to analyze. Afterwards, call methods like getScripts()
|
||||
*
|
||||
* @param identifier the identifier to analyze
|
||||
* @param status Errorcode, set if errors occur.
|
||||
* @return this
|
||||
* @internal
|
||||
*/
|
||||
IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Get the identifier that was analyzed. The returned string is owned by the ICU library,
|
||||
* and must not be deleted by the caller.
|
||||
*
|
||||
* @return the identifier that was analyzed.
|
||||
* @internal
|
||||
*/
|
||||
const UnicodeString *getIdentifier() const;
|
||||
|
||||
|
||||
/**
|
||||
* Get the scripts found in the identifier.
|
||||
*
|
||||
* @return the set of explicit scripts.
|
||||
* @internal
|
||||
*/
|
||||
const ScriptSet *getScripts() const;
|
||||
|
||||
/**
|
||||
* Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then
|
||||
* the set consisting of those scripts will be returned.
|
||||
*
|
||||
* @return a uhash, with each key being of type (ScriptSet *).
|
||||
* This is a set, not a map, so the value stored in the uhash is not relevant.
|
||||
* (It is, in fact, 1).
|
||||
* Ownership of the uhash and its contents remains with the IdentifierInfo object,
|
||||
* and remains valid until a new identifier is set or until the object is deleted.
|
||||
* @internal
|
||||
*/
|
||||
const UHashtable *getAlternates() const;
|
||||
|
||||
/**
|
||||
* Get the representative characters (zeros) for the numerics found in the identifier.
|
||||
*
|
||||
* @return the set of representative zero digits, one for each decimal number system found in the identifier.
|
||||
* @internal
|
||||
*/
|
||||
const UnicodeSet *getNumerics() const;
|
||||
|
||||
/**
|
||||
* Find out which scripts are in common among the alternates.
|
||||
*
|
||||
* @return the set of scripts that are in common among the alternates.
|
||||
* @internal
|
||||
*/
|
||||
const ScriptSet *getCommonAmongAlternates() const;
|
||||
|
||||
/**
|
||||
* Get the number of scripts appearing in the identifier.
|
||||
* Note: Common and Inherited scripts are omitted from the count.
|
||||
* Note: Result may be high when the identifier contains characters
|
||||
* with alternate scripts. The distinction between
|
||||
* 0, 1 and > 1 will remain valid, however.
|
||||
* @return the number of scripts.
|
||||
*/
|
||||
int32_t getScriptCount() const;
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
/**
|
||||
* Find the "tightest" restriction level that the identifier satisfies.
|
||||
*
|
||||
* @return the restriction level.
|
||||
* @internal
|
||||
*/
|
||||
URestrictionLevel getRestrictionLevel(UErrorCode &status) const;
|
||||
|
||||
#endif /*!UCONFIG_NO_NORMALIZATION */
|
||||
|
||||
UnicodeString toString() const;
|
||||
|
||||
/**
|
||||
* Produce a readable string of alternates.
|
||||
*
|
||||
* @param alternates a UHashtable of UScriptSets.
|
||||
* Keys only, no meaningful values in the UHash.
|
||||
* @return display form
|
||||
* @internal
|
||||
*/
|
||||
static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status);
|
||||
|
||||
private:
|
||||
|
||||
IdentifierInfo & clear();
|
||||
UBool containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const;
|
||||
|
||||
UnicodeString *fIdentifier;
|
||||
ScriptSet *fRequiredScripts;
|
||||
UHashtable *fScriptSetSet;
|
||||
ScriptSet *fCommonAmongAlternates;
|
||||
UnicodeSet *fNumerics;
|
||||
UnicodeSet *fIdentifierProfile;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __IDENTIFIER_INFO_H__
|
||||
|
@ -42,10 +42,10 @@
|
||||
* <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions:
|
||||
*
|
||||
* <ol>
|
||||
* <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "desordenado" and
|
||||
* "ԁеѕогԁепаԁо".</li>
|
||||
* <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "Harvest" and
|
||||
* "Ηarvest", where the second string starts with the Greek capital letter Eta.</li>
|
||||
* <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof
|
||||
* detection</em>), such as "pаypаl" spelled with Cyrillic 'а' characters.</li>
|
||||
* detection</em>), such as "paypal" with some Latin characters substituted with Cyrillic look-alikes.</li>
|
||||
* </ol>
|
||||
*
|
||||
* <p>
|
||||
@ -63,19 +63,25 @@
|
||||
*
|
||||
* \code{.c}
|
||||
* UErrorCode status = U_ZERO_ERROR;
|
||||
* UChar* str1 = (UChar*) u"Harvest";
|
||||
* UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA
|
||||
*
|
||||
* USpoofChecker* sc = uspoof_open(&status);
|
||||
* uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
|
||||
* int32_t bitmask = uspoof_areConfusable(sc, (UChar*) u"desordenado", -1, (UChar*) u"ԁеѕогԁепаԁо", -1, &status);
|
||||
* UBool result = (bitmask & USPOOF_ALL_CHECKS) != 0;
|
||||
* printf("areConfusable: %d (success: %d)\n", result, U_SUCCESS(status)); // areConfusable: 1 (success: 1)
|
||||
*
|
||||
* int32_t bitmask = uspoof_areConfusable(sc, str1, -1, str2, -1, &status);
|
||||
* UBool result = bitmask != 0;
|
||||
* // areConfusable: 1 (status: U_ZERO_ERROR)
|
||||
* printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
|
||||
* uspoof_close(sc);
|
||||
* \endcode
|
||||
*
|
||||
* <p>
|
||||
* The second line of the example creates a <code>USpoofChecker</code> object; the third line enables confusable
|
||||
* checking and disables all other checks; the fourth line performs the confusability test; and the fifth line extracts
|
||||
* the result out of the confusability test. For best performance, the instance should be created once (e.g., upon
|
||||
* application startup), and the efficient {@link uspoof_areConfusable} method can be used at runtime.
|
||||
* The call to {@link uspoof_open} creates a <code>USpoofChecker</code> object; the call to {@link uspoof_setChecks}
|
||||
* enables confusable checking and disables all other checks; the call to {@link uspoof_areConfusable} performs the
|
||||
* confusability test; and the following line extracts the result out of the return value. For best performance,
|
||||
* the instance should be created once (e.g., upon application startup), and the efficient
|
||||
* {@link uspoof_areConfusable} method can be used at runtime.
|
||||
*
|
||||
* <p>
|
||||
* The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers. It will automatically call
|
||||
@ -95,27 +101,28 @@
|
||||
*
|
||||
* \code{.c}
|
||||
* UErrorCode status = U_ZERO_ERROR;
|
||||
* UChar* str1 = (UChar*) u"desordenado";
|
||||
* UChar* str2 = (UChar*) u"ԁеѕогԁепаԁо";
|
||||
* UChar* str1 = (UChar*) u"Harvest";
|
||||
* UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA
|
||||
*
|
||||
* USpoofChecker* sc = uspoof_open(&status);
|
||||
* uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
|
||||
*
|
||||
* // Get skeleton 1
|
||||
* int32_t skel1Len = uspoof_getSkeleton(sc, 0, str1, -1, NULL, 0, &status);
|
||||
* UChar* skel1 = (UChar*) malloc(skel1Len * sizeof(UChar));
|
||||
* UChar* skel1 = (UChar*) malloc(++skel1Len * sizeof(UChar));
|
||||
* status = U_ZERO_ERROR;
|
||||
* uspoof_getSkeleton(sc, 0, str1, -1, skel1, skel1Len, &status);
|
||||
*
|
||||
* // Get skeleton 2
|
||||
* int32_t skel2Len = uspoof_getSkeleton(sc, 0, str2, -1, NULL, 0, &status);
|
||||
* UChar* skel2 = (UChar*) malloc(skel2Len * sizeof(UChar));
|
||||
* UChar* skel2 = (UChar*) malloc(++skel2Len * sizeof(UChar));
|
||||
* status = U_ZERO_ERROR;
|
||||
* uspoof_getSkeleton(sc, 0, str2, -1, skel2, skel2Len, &status);
|
||||
*
|
||||
* // Are the skeletons the same?
|
||||
* UBool result = (skel1Len == skel2Len) && memcmp(skel1, skel2, skel1Len) == 0;
|
||||
* printf("areConfusable: %d (success: %d)\n", result, U_SUCCESS(status)); // areConfusable: 1 (success: 1)
|
||||
* UBool result = u_strCompare(skel1, -1, skel2, -1, FALSE) == 0;
|
||||
* // areConfusable: 1 (status: U_ZERO_ERROR)
|
||||
* printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
|
||||
* uspoof_close(sc);
|
||||
* free(skel1);
|
||||
* free(skel2);
|
||||
@ -126,21 +133,19 @@
|
||||
* {uspoof_areConfusable} many times in a loop, {uspoof_getSkeleton} can be used instead, as shown below:
|
||||
*
|
||||
* \code{.c}
|
||||
* // Setup:
|
||||
* UErrorCode status = U_ZERO_ERROR;
|
||||
* UChar* dictionary[2] = { (UChar*) u"lorem", (UChar*) u"ipsum" };
|
||||
* UChar* skeletons[sizeof(dictionary)/sizeof(UChar*)];
|
||||
* int32_t skeletonLengths[sizeof(dictionary)/sizeof(UChar*)];
|
||||
* #define DICTIONARY_LENGTH 2
|
||||
* UChar* dictionary[DICTIONARY_LENGTH] = { (UChar*) u"lorem", (UChar*) u"ipsum" };
|
||||
* UChar* skeletons[DICTIONARY_LENGTH];
|
||||
* UChar* str = (UChar*) u"1orern";
|
||||
*
|
||||
* // Setup:
|
||||
* USpoofChecker* sc = uspoof_open(&status);
|
||||
* uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
|
||||
* for (size_t i=0; i<sizeof(dictionary)/sizeof(UChar*); i++) {
|
||||
* for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
|
||||
* UChar* word = dictionary[i];
|
||||
* int32_t len = uspoof_getSkeleton(sc, 0, word, -1, NULL, 0, &status);
|
||||
* skeletons[i] = (UChar*) malloc(len * sizeof(UChar));
|
||||
* skeletonLengths[i] = len;
|
||||
* skeletons[i] = (UChar*) malloc(++len * sizeof(UChar));
|
||||
* status = U_ZERO_ERROR;
|
||||
* uspoof_getSkeleton(sc, 0, word, -1, skeletons[i], len, &status);
|
||||
* }
|
||||
@ -148,22 +153,20 @@
|
||||
* // Live Check:
|
||||
* {
|
||||
* int32_t len = uspoof_getSkeleton(sc, 0, str, -1, NULL, 0, &status);
|
||||
* UChar* skel = (UChar*) malloc(len * sizeof(UChar));
|
||||
* UChar* skel = (UChar*) malloc(++len * sizeof(UChar));
|
||||
* status = U_ZERO_ERROR;
|
||||
* uspoof_getSkeleton(sc, 0, str, -1, skel, len, &status);
|
||||
* UBool result = FALSE;
|
||||
* for (size_t i=0; i<sizeof(dictionary)/sizeof(UChar*); i++) {
|
||||
* if (len == skeletonLengths[i] && memcmp(skel, skeletons[i], len) == 0) {
|
||||
* result = TRUE;
|
||||
* }
|
||||
* for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
|
||||
* result = u_strCompare(skel, -1, skeletons[i], -1, FALSE) == 0;
|
||||
* if (result == TRUE) { break; }
|
||||
* }
|
||||
* // Has confusable in dictionary: 1 (success: 1)
|
||||
* printf("Has confusable in dictionary: %d (success: %d)\n", result, U_SUCCESS(status));
|
||||
* // Has confusable in dictionary: 1 (status: U_ZERO_ERROR)
|
||||
* printf("Has confusable in dictionary: %d (status: %s)\n", result, u_errorName(status));
|
||||
* free(skel);
|
||||
* }
|
||||
*
|
||||
* // Cleanup:
|
||||
* for (size_t i=0; i<sizeof(dictionary)/sizeof(UChar*); i++) {
|
||||
* for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
|
||||
* free(skeletons[i]);
|
||||
* }
|
||||
* uspoof_close(sc);
|
||||
@ -182,7 +185,7 @@
|
||||
*
|
||||
* \code{.c}
|
||||
* UErrorCode status = U_ZERO_ERROR;
|
||||
* UChar* str = (UChar*) u"pаypаl"; // with Cyrillic 'а' characters
|
||||
* UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A
|
||||
*
|
||||
* // Get the default set of allowable characters:
|
||||
* USet* allowed = uset_openEmpty();
|
||||
@ -195,7 +198,8 @@
|
||||
*
|
||||
* int32_t bitmask = uspoof_check(sc, str, -1, NULL, &status);
|
||||
* UBool result = bitmask != 0;
|
||||
* printf("fails checks: %d (success: %d)\n", result, U_SUCCESS(status)); // fails checks: 1 (success: 1)
|
||||
* // fails checks: 1 (status: U_ZERO_ERROR)
|
||||
* printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
|
||||
* uspoof_close(sc);
|
||||
* uset_close(allowed);
|
||||
* \endcode
|
||||
@ -216,7 +220,7 @@
|
||||
*
|
||||
* \code{.c}
|
||||
* UErrorCode status = U_ZERO_ERROR;
|
||||
* UChar* str = (UChar*) u"pаypаl"; // with Cyrillic 'а' characters
|
||||
* UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A
|
||||
*
|
||||
* // Get the default set of allowable characters:
|
||||
* USet* allowed = uset_openEmpty();
|
||||
@ -233,8 +237,8 @@
|
||||
* int32_t failures1 = bitmask;
|
||||
* int32_t failures2 = uspoof_getCheckResultChecks(checkResult, &status);
|
||||
* assert(failures1 == failures2);
|
||||
* // checks that failed: 16 (success: 1)
|
||||
* printf("checks that failed: %d (success: %d)\n", failures1, U_SUCCESS(status));
|
||||
* // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
|
||||
* printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
|
||||
*
|
||||
* // Cleanup:
|
||||
* uspoof_close(sc);
|
||||
@ -247,7 +251,7 @@
|
||||
*
|
||||
* \code{.cpp}
|
||||
* UErrorCode status = U_ZERO_ERROR;
|
||||
* UnicodeString str((UChar*) u"pаypаl"); // with Cyrillic 'а' characters
|
||||
* UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A
|
||||
*
|
||||
* // Get the default set of allowable characters:
|
||||
* UnicodeSet allowed;
|
||||
@ -264,8 +268,8 @@
|
||||
* int32_t failures1 = bitmask;
|
||||
* int32_t failures2 = uspoof_getCheckResultChecks(checkResult.getAlias(), &status);
|
||||
* assert(failures1 == failures2);
|
||||
* // checks that failed: 16 (success: 1)
|
||||
* printf("checks that failed: %d (success: %d)\n", failures1, U_SUCCESS(status));
|
||||
* // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
|
||||
* printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
|
||||
*
|
||||
* // Explicit cleanup not necessary.
|
||||
* \endcode
|
||||
@ -291,14 +295,15 @@
|
||||
*
|
||||
* \code{.c}
|
||||
* UErrorCode status = U_ZERO_ERROR;
|
||||
* UChar* str = (UChar*) u"৪8";
|
||||
* UChar* str = (UChar*) u"8\u09EA"; // 8 mixed with U+09EA BENGALI DIGIT FOUR
|
||||
*
|
||||
* USpoofChecker* sc = uspoof_open(&status);
|
||||
* uspoof_setChecks(sc, USPOOF_INVISIBLE | USPOOF_MIXED_NUMBERS, &status);
|
||||
*
|
||||
* int32_t bitmask = uspoof_check2(sc, str, -1, NULL, &status);
|
||||
* UBool result = bitmask != 0;
|
||||
* printf("fails checks: %d (success: %d)\n", result, U_SUCCESS(status)); // fails checks: 1 (success: 1)
|
||||
* // fails checks: 1 (status: U_ZERO_ERROR)
|
||||
* printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
|
||||
* uspoof_close(sc);
|
||||
* \endcode
|
||||
*
|
||||
@ -307,7 +312,7 @@
|
||||
*
|
||||
* \code{.cpp}
|
||||
* UErrorCode status = U_ZERO_ERROR;
|
||||
* UnicodeString str((UChar*) u"pаypаl"); // with Cyrillic 'а' characters
|
||||
* UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A
|
||||
*
|
||||
* // Get the default set of allowable characters:
|
||||
* UnicodeSet allowed;
|
||||
@ -323,14 +328,14 @@
|
||||
* int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status);
|
||||
*
|
||||
* URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult.getAlias(), &status);
|
||||
* // Since USPOOF_AUX_INFO was enabled, the restriction level is also available via the bitmask:
|
||||
* // Since USPOOF_AUX_INFO was enabled, the restriction level is also available in the upper bits of the bitmask:
|
||||
* assert((restrictionLevel & bitmask) == restrictionLevel);
|
||||
* // Restriction level: 1342177280 (success: 1)
|
||||
* printf("Restriction level: %d (success: %d)\n", restrictionLevel, U_SUCCESS(status));
|
||||
* // Restriction level: 0x50000000 (status: U_ZERO_ERROR)
|
||||
* printf("Restriction level: %#010x (status: %s)\n", restrictionLevel, u_errorName(status));
|
||||
* \endcode
|
||||
*
|
||||
* <p>
|
||||
* The code '1342177280' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE. Since
|
||||
* The code '0x50000000' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE. Since
|
||||
* USPOOF_MINIMALLY_RESTRICTIVE is weaker than USPOOF_MODERATELY_RESTRICTIVE, the string fails the check.
|
||||
*
|
||||
* <p>
|
||||
@ -351,13 +356,13 @@
|
||||
* A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
|
||||
*
|
||||
* <p>
|
||||
* <b>Thread Safety:</b> Thread Safety: The test functions for checking a single identifier, or for testing whether
|
||||
* <b>Thread Safety:</b> The test functions for checking a single identifier, or for testing whether
|
||||
* two identifiers are possibly confusable, are thread safe. They may be called concurrently, from multiple threads,
|
||||
* using the same USpoofChecker instance.
|
||||
*
|
||||
* <p>
|
||||
* More generally, the standard ICU thread safety rules apply: functions that take a const USpoofChecker parameter are
|
||||
* thread safe. Those that take a non-const USpoofChecier are not thread safe..
|
||||
* thread safe. Those that take a non-const USpoofChecker are not thread safe.
|
||||
*
|
||||
* @stable ICU 4.6
|
||||
*/
|
||||
@ -419,13 +424,9 @@ typedef enum USpoofChecks {
|
||||
* the checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to
|
||||
* make {@link uspoof_areConfusable} return only those types of confusables.
|
||||
*
|
||||
* <p>Note: if you wish to use {@link uspoof_getSkeleton}, it is required that you enable at least one of the
|
||||
* CONFUSABLE flags.
|
||||
*
|
||||
* @see uspoof_areConfusable
|
||||
* @see uspoof_getSkeleton
|
||||
* @draft ICU 58
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
USPOOF_CONFUSABLE = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE,
|
||||
|
||||
@ -471,7 +472,7 @@ typedef enum USpoofChecks {
|
||||
USPOOF_INVISIBLE = 32,
|
||||
|
||||
/** Check that an identifier contains only characters from a specified set
|
||||
* of acceptable characters. See {@link uspoof_setAllowedChars}
|
||||
* of acceptable characters. See {@link uspoof_setAllowedChars} and
|
||||
* {@link uspoof_setAllowedLocales}. Note that a string that fails this check
|
||||
* will also fail the {@link USPOOF_RESTRICTION_LEVEL} check.
|
||||
*/
|
||||
@ -750,14 +751,16 @@ U_STABLE int32_t U_EXPORT2
|
||||
uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Set the loosest restriction level allowed for strings. The default if this is not called is
|
||||
* {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and
|
||||
* {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are
|
||||
* to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}.
|
||||
* @param restrictionLevel The loosest restriction level allowed.
|
||||
* @see URestrictionLevel
|
||||
* @stable ICU 51
|
||||
*/
|
||||
* Set the loosest restriction level allowed for strings. The default if this is not called is
|
||||
* {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and
|
||||
* {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are
|
||||
* to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}.
|
||||
*
|
||||
* @param sc The USpoofChecker
|
||||
* @param restrictionLevel The loosest restriction level allowed.
|
||||
* @see URestrictionLevel
|
||||
* @stable ICU 51
|
||||
*/
|
||||
U_STABLE void U_EXPORT2
|
||||
uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel);
|
||||
|
||||
@ -1059,6 +1062,8 @@ uspoof_checkUnicodeString(const USpoofChecker *sc,
|
||||
* @param sc The USpoofChecker
|
||||
* @param id The identifier to be checked for possible security issues,
|
||||
* in UTF-16 format.
|
||||
* @param length the length of the string to be checked, or -1 if the string is
|
||||
* zero terminated.
|
||||
* @param checkResult An instance of USpoofCheckResult to be filled with
|
||||
* details about the identifier. Can be NULL.
|
||||
* @param status The error code, set if an error occurred while attempting to
|
||||
@ -1259,7 +1264,7 @@ uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *
|
||||
*
|
||||
* <ul>
|
||||
* <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
|
||||
* <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE</li>
|
||||
* <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
|
||||
* <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
|
||||
* </ul>
|
||||
*
|
||||
|
@ -62,13 +62,13 @@ void SpoofImpl::construct(UErrorCode& status) {
|
||||
if (U_FAILURE(status)) { return; }
|
||||
|
||||
UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
|
||||
allowedCharsSet->freeze();
|
||||
fAllowedCharsSet = allowedCharsSet;
|
||||
fAllowedLocales = uprv_strdup("");
|
||||
if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
allowedCharsSet->freeze();
|
||||
}
|
||||
|
||||
|
||||
@ -85,10 +85,10 @@ SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
|
||||
fSpoofData = src.fSpoofData->addReference();
|
||||
}
|
||||
fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
|
||||
if (fAllowedCharsSet == NULL) {
|
||||
fAllowedLocales = uprv_strdup(src.fAllowedLocales);
|
||||
if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
fAllowedLocales = uprv_strdup(src.fAllowedLocales);
|
||||
fRestrictionLevel = src.fRestrictionLevel;
|
||||
}
|
||||
|
||||
|
@ -123,7 +123,7 @@ public:
|
||||
// Used to convert this CheckResult to the older int32_t return value API
|
||||
int32_t toCombinedBitmask(int32_t expectedChecks);
|
||||
|
||||
// Data Members (all stack-allocated)
|
||||
// Data Members
|
||||
int32_t fMagic; // Internal sanity check.
|
||||
int32_t fChecks; // Bit vector of checks that were failed.
|
||||
UnicodeSet fNumerics; // Set of numerics found in the string.
|
||||
|
@ -1,438 +0,0 @@
|
||||
// Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2008-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: uspoof_wsconf.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2009Jan05 (refactoring earlier files)
|
||||
* created by: Andy Heninger
|
||||
*
|
||||
* Internal functions for compiling Whole Script confusable source data
|
||||
* into its binary (runtime) form. The binary data format is described
|
||||
* in uspoof_impl.h
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uspoof.h"
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
||||
#include "unicode/unorm.h"
|
||||
#include "unicode/uregex.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "cmemory.h"
|
||||
#include "scriptset.h"
|
||||
#include "uspoof_impl.h"
|
||||
#include "uhash.h"
|
||||
#include "uvector.h"
|
||||
#include "uassert.h"
|
||||
#include "uspoof_wsconf.h"
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
|
||||
// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
|
||||
// Example Lines:
|
||||
// 006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O
|
||||
// 0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
|
||||
// | | | |
|
||||
// | | | |---- Which table, Any Case or Lower Case (A or L)
|
||||
// | | |----------Target script. We need this.
|
||||
// | |----------------Src script. Should match the script of the source
|
||||
// | code points. Beyond checking that, we don't keep it.
|
||||
// |--------------------------------Source code points or range.
|
||||
//
|
||||
// The expression will match _all_ lines, including erroneous lines.
|
||||
// The result of the parse is returned via the contents of the (match) groups.
|
||||
// Regular expression used to parse one line of confusablesWholeScript.txt.
// Exactly one of groups 1, (2..7) or 8 participates in any given match:
//   Group 1      - blank or comment-only line.
//   Groups 2, 3  - start and optional end of the source code point range.
//   Group 4      - source script name.
//   Group 5      - target script name.
//   Group 6 / 7  - table selector, A (any case) or L (lower case).
//   Group 8      - anything else, i.e. a syntax error.
//
// The range separator is written as two escaped dots. An unescaped ".."
// would accept any two characters between the code points.
static const char *parseExp =
        "(?m)"                                              // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                              // A blank or comment line.  Matches Group 1.
        "|^(?:"                                             //   OR
        "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;"  // Code point range.  Groups 2 and 3.
        "\\s*([A-Za-z]+)\\s*;"                              // The source script.  Group 4.
        "\\s*([A-Za-z]+)\\s*;"                              // The target script.  Group 5.
        "\\s*(?:(A)|(L))"                                   // The table A or L.   Group 6 or 7
        "[ \\t]*(?:#.*?)?"                                  // Trailing comment
        ")$|"                                               //   OR
        "^(.*?)$";                                          // An error line.      Group 8.
                                                            //  Any line not matching the preceding
                                                            //  parts of the expression will match
                                                            //  this, and thus be flagged as an error
|
||||
|
||||
|
||||
// Extract a regular expression match group into a char * string.
|
||||
// The group must contain only invariant characters.
|
||||
// Used for script names
|
||||
//
|
||||
static void extractGroup(
|
||||
URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
|
||||
|
||||
UChar ubuf[50];
|
||||
ubuf[0] = 0;
|
||||
destBuf[0] = 0;
|
||||
int32_t len = uregex_group(e, group, ubuf, 50, &status);
|
||||
if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
|
||||
return;
|
||||
}
|
||||
UnicodeString s(FALSE, ubuf, len); // Aliasing constructor
|
||||
s.extract(0, len, destBuf, destCapacity, US_INV);
|
||||
}
|
||||
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Build the Whole Script Confusable data
|
||||
//
|
||||
// TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class,
|
||||
// because everything is local to this one build function anyhow,
|
||||
// OR
|
||||
// break this function into more reasonably sized pieces, with
|
||||
// state in WSConfusableDataBuilder.
|
||||
//
|
||||
void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
|
||||
int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
|
||||
{
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
URegularExpression *parseRegexp = NULL;
|
||||
int32_t inputLen = 0;
|
||||
UChar *input = NULL;
|
||||
int32_t lineNum = 0;
|
||||
|
||||
UVector *scriptSets = NULL;
|
||||
uint32_t rtScriptSetsCount = 2;
|
||||
|
||||
UTrie2 *anyCaseTrie = NULL;
|
||||
UTrie2 *lowerCaseTrie = NULL;
|
||||
|
||||
anyCaseTrie = utrie2_open(0, 0, &status);
|
||||
lowerCaseTrie = utrie2_open(0, 0, &status);
|
||||
|
||||
UnicodeString pattern(parseExp, -1, US_INV);
|
||||
|
||||
// The scriptSets vector provides a mapping from TRIE values to the set of scripts.
|
||||
//
|
||||
// Reserved TRIE values:
|
||||
// 0: Code point has no whole script confusables.
|
||||
// 1: Code point is of script Common or Inherited.
|
||||
// These code points do not participate in whole script confusable detection.
|
||||
// (This is logically equivalent to saying that they contain confusables in
|
||||
// all scripts)
|
||||
//
|
||||
// Because Trie values are indexes into the ScriptSets vector, pre-fill
|
||||
// vector positions 0 and 1 to avoid conflicts with the reserved values.
|
||||
|
||||
scriptSets = new UVector(status);
|
||||
if (scriptSets == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
scriptSets->addElement((void *)NULL, status);
|
||||
scriptSets->addElement((void *)NULL, status);
|
||||
|
||||
// Convert the user input data from UTF-8 to UChar (UTF-16)
|
||||
u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
|
||||
if (status != U_BUFFER_OVERFLOW_ERROR) {
|
||||
goto cleanup;
|
||||
}
|
||||
status = U_ZERO_ERROR;
|
||||
input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
|
||||
if (input == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
|
||||
|
||||
parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
|
||||
|
||||
// Zap any Byte Order Mark at the start of input. Changing it to a space is benign
|
||||
// given the syntax of the input.
|
||||
if (*input == 0xfeff) {
|
||||
*input = 0x20;
|
||||
}
|
||||
|
||||
// Parse the input, one line per iteration of this loop.
|
||||
uregex_setText(parseRegexp, input, inputLen, &status);
|
||||
while (uregex_findNext(parseRegexp, &status)) {
|
||||
lineNum++;
|
||||
if (uregex_start(parseRegexp, 1, &status) >= 0) {
|
||||
// this was a blank or comment line.
|
||||
continue;
|
||||
}
|
||||
if (uregex_start(parseRegexp, 8, &status) >= 0) {
|
||||
// input file syntax error.
|
||||
status = U_PARSE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
if (U_FAILURE(status)) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// Pick up the start and optional range end code points from the parsed line.
|
||||
UChar32 startCodePoint = SpoofImpl::ScanHex(
|
||||
input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
|
||||
UChar32 endCodePoint = startCodePoint;
|
||||
if (uregex_start(parseRegexp, 3, &status) >=0) {
|
||||
endCodePoint = SpoofImpl::ScanHex(
|
||||
input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
|
||||
}
|
||||
|
||||
// Extract the two script names from the source line. We need these in an 8 bit
|
||||
// default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
|
||||
// to the ICU u_getPropertyValueEnum() function. Ugh.
|
||||
char srcScriptName[20];
|
||||
char targScriptName[20];
|
||||
extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
|
||||
extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
|
||||
UScriptCode srcScript =
|
||||
static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
|
||||
UScriptCode targScript =
|
||||
static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
|
||||
if (U_FAILURE(status)) {
|
||||
goto cleanup;
|
||||
}
|
||||
if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
|
||||
status = U_INVALID_FORMAT_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// select the table - (A) any case or (L) lower case only
|
||||
UTrie2 *table = anyCaseTrie;
|
||||
if (uregex_start(parseRegexp, 7, &status) >= 0) {
|
||||
table = lowerCaseTrie;
|
||||
}
|
||||
|
||||
// Build the set of scripts containing confusable characters for
|
||||
// the code point(s) specified in this input line.
|
||||
// Sanity check that the script of the source code point is the same
|
||||
// as the source script indicated in the input file. Failure of this check is
|
||||
// an error in the input file.
|
||||
// Include the source script in the set (needed for Mixed Script Confusable detection).
|
||||
//
|
||||
UChar32 cp;
|
||||
for (cp=startCodePoint; cp<=endCodePoint; cp++) {
|
||||
int32_t setIndex = utrie2_get32(table, cp);
|
||||
BuilderScriptSet *bsset = NULL;
|
||||
if (setIndex > 0) {
|
||||
U_ASSERT(setIndex < scriptSets->size());
|
||||
bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
|
||||
} else {
|
||||
bsset = new BuilderScriptSet();
|
||||
if (bsset == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
bsset->codePoint = cp;
|
||||
bsset->trie = table;
|
||||
bsset->sset = new ScriptSet();
|
||||
setIndex = scriptSets->size();
|
||||
bsset->index = setIndex;
|
||||
bsset->rindex = 0;
|
||||
if (bsset->sset == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
scriptSets->addElement(bsset, status);
|
||||
utrie2_set32(table, cp, setIndex, &status);
|
||||
}
|
||||
bsset->sset->set(targScript, status);
|
||||
bsset->sset->set(srcScript, status);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
goto cleanup;
|
||||
}
|
||||
UScriptCode cpScript = uscript_getScript(cp, &status);
|
||||
if (cpScript != srcScript) {
|
||||
status = U_INVALID_FORMAT_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Eliminate duplicate script sets. At this point we have a separate
|
||||
// script set for every code point that had data in the input file.
|
||||
//
|
||||
// We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
|
||||
//
|
||||
// printf("Number of scriptSets: %d\n", scriptSets->size());
|
||||
{
|
||||
int32_t duplicateCount = 0;
|
||||
rtScriptSetsCount = 2;
|
||||
for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
|
||||
BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
|
||||
if (outerSet->index != static_cast<uint32_t>(outeri)) {
|
||||
// This set was already identified as a duplicate.
|
||||
// It will not be allocated a position in the runtime array of ScriptSets.
|
||||
continue;
|
||||
}
|
||||
outerSet->rindex = rtScriptSetsCount++;
|
||||
for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
|
||||
BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
|
||||
if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
|
||||
delete innerSet->sset;
|
||||
innerSet->scriptSetOwned = FALSE;
|
||||
innerSet->sset = outerSet->sset;
|
||||
innerSet->index = outeri;
|
||||
innerSet->rindex = outerSet->rindex;
|
||||
duplicateCount++;
|
||||
}
|
||||
// But this doesn't get all. We need to fix the TRIE.
|
||||
}
|
||||
}
|
||||
// printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Update the Trie values to be reflect the run time script indexes (after duplicate merging).
|
||||
// (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
|
||||
// are unused, which is why the loop index starts at 2.)
|
||||
{
|
||||
for (int32_t i=2; i<scriptSets->size(); i++) {
|
||||
BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
|
||||
if (bSet->rindex != (uint32_t)i) {
|
||||
utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// For code points with script==Common or script==Inherited,
|
||||
// Set the reserved value of 1 into both Tries. These characters do not participate
|
||||
// in Whole Script Confusable detection; this reserved value is the means
|
||||
// by which they are detected.
|
||||
{
|
||||
UnicodeSet ignoreSet;
|
||||
ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
|
||||
UnicodeSet inheritedSet;
|
||||
inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
|
||||
ignoreSet.addAll(inheritedSet);
|
||||
for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
|
||||
UChar32 rangeStart = ignoreSet.getRangeStart(rn);
|
||||
UChar32 rangeEnd = ignoreSet.getRangeEnd(rn);
|
||||
utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
|
||||
utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
|
||||
}
|
||||
}
|
||||
|
||||
// Serialize the data to the Spoof Detector
|
||||
{
|
||||
utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status);
|
||||
int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
|
||||
// printf("Any case Trie size: %d\n", size);
|
||||
if (status != U_BUFFER_OVERFLOW_ERROR) {
|
||||
goto cleanup;
|
||||
}
|
||||
status = U_ZERO_ERROR;
|
||||
spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
|
||||
spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
|
||||
spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
|
||||
void *where = spImpl->fSpoofData->reserveSpace(size, status);
|
||||
utrie2_serialize(anyCaseTrie, where, size, &status);
|
||||
|
||||
utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
|
||||
size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
|
||||
// printf("Lower case Trie size: %d\n", size);
|
||||
if (status != U_BUFFER_OVERFLOW_ERROR) {
|
||||
goto cleanup;
|
||||
}
|
||||
status = U_ZERO_ERROR;
|
||||
spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
|
||||
spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
|
||||
spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
|
||||
where = spImpl->fSpoofData->reserveSpace(size, status);
|
||||
utrie2_serialize(lowerCaseTrie, where, size, &status);
|
||||
|
||||
spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
|
||||
spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
|
||||
ScriptSet *rtScriptSets = static_cast<ScriptSet *>
|
||||
(spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
|
||||
uint32_t rindex = 2;
|
||||
for (int32_t i=2; i<scriptSets->size(); i++) {
|
||||
BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
|
||||
if (bSet->rindex < rindex) {
|
||||
// We have already copied this script set to the serialized data.
|
||||
continue;
|
||||
}
|
||||
U_ASSERT(rindex == bSet->rindex);
|
||||
rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits.
|
||||
rindex++;
|
||||
}
|
||||
}
|
||||
|
||||
// Open new utrie2s from the serialized data. We don't want to keep the ones
|
||||
// we just built because we would then have two copies of the data, one internal to
|
||||
// the utries that we have already constructed, and one in the serialized data area.
|
||||
// An alternative would be to not pre-serialize the Trie data, but that makes the
|
||||
// spoof detector data different, depending on how the detector was constructed.
|
||||
// It's simpler to keep the data always the same.
|
||||
|
||||
spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
|
||||
UTRIE2_16_VALUE_BITS,
|
||||
(const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
|
||||
spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
|
||||
NULL,
|
||||
&status);
|
||||
|
||||
spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
|
||||
UTRIE2_16_VALUE_BITS,
|
||||
(const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
|
||||
spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
|
||||
NULL,
|
||||
&status);
|
||||
|
||||
|
||||
|
||||
cleanup:
|
||||
if (U_FAILURE(status)) {
|
||||
pe->line = lineNum;
|
||||
}
|
||||
uregex_close(parseRegexp);
|
||||
uprv_free(input);
|
||||
|
||||
int32_t i;
|
||||
if (scriptSets != NULL) {
|
||||
for (i=0; i<scriptSets->size(); i++) {
|
||||
BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
|
||||
delete bsset;
|
||||
}
|
||||
delete scriptSets;
|
||||
}
|
||||
utrie2_close(anyCaseTrie);
|
||||
utrie2_close(lowerCaseTrie);
|
||||
return;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
|
||||
|
||||
// Construct an empty BuilderScriptSet: no code point (-1), no trie, no
// script set, both indexes zero.  A new object is marked as owning its
// script set.
BuilderScriptSet::BuilderScriptSet()
    : codePoint(-1),
      trie(NULL),
      sset(NULL),
      index(0),
      rindex(0),
      scriptSetOwned(TRUE) {
}
|
||||
|
||||
// Release the underlying ScriptSet, but only when this object owns it.
BuilderScriptSet::~BuilderScriptSet() {
    if (!scriptSetOwned) {
        return;
    }
    delete sset;
}
|
||||
|
||||
#endif
|
||||
#endif // !UCONFIG_NO_NORMALIZATION
|
||||
|
@ -1,72 +0,0 @@
|
||||
// Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2008-2012, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: uspoof_buildwsconf.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2009Jan19
|
||||
* created by: Andy Heninger
|
||||
*
|
||||
* Internal classes and functions
|
||||
* for compiling whole script confusable data into its binary (runtime) form.
|
||||
*/
|
||||
|
||||
#ifndef __USPOOF_BUILDWSCONF_H__
|
||||
#define __USPOOF_BUILDWSCONF_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
||||
#include "uspoof_impl.h"
|
||||
#include "utrie2.h"
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
//
|
||||
// class BuilderScriptSet. Represents the set of scripts (Script Codes)
|
||||
// containing characters that are confusable with one specific
|
||||
// code point.
|
||||
//
|
||||
|
||||
class BuilderScriptSet: public UMemory {
|
||||
public:
|
||||
UChar32 codePoint; // The source code point.
|
||||
UTrie2 *trie; // Any-case or Lower-case Trie.
|
||||
// These Trie tables are the final result of the
|
||||
// build. This flag indicates which of the two
|
||||
// this set of data is for.
|
||||
ScriptSet *sset; // The set of scripts itself.
|
||||
|
||||
// Vectors of all B
|
||||
uint32_t index; // Index of this set in the Build Time vector
|
||||
// of script sets.
|
||||
uint32_t rindex; // Index of this set in the final (runtime)
|
||||
// array of sets.
|
||||
UBool scriptSetOwned; // True if this BuilderScriptSet owns (should delete)
|
||||
// its underlying sset.
|
||||
|
||||
BuilderScriptSet();
|
||||
~BuilderScriptSet();
|
||||
};
|
||||
|
||||
|
||||
void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
|
||||
int32_t confusablesWSLen, UParseError *pe, UErrorCode &status);
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
#endif // !UCONFIG_NO_NORMALIZATION
|
||||
#endif
|
@ -478,7 +478,7 @@ static void TestUSpoofCAPI(void) {
|
||||
const UChar* tests[] = { goodLatin, scMixed, scLatin,
|
||||
goodCyrl, goodGreek, lll_Latin_a, lll_Latin_b, han_Hiragana };
|
||||
|
||||
for (int32_t i=0; i<sizeof(tests)/sizeof(UChar*); i++) {
|
||||
for (int32_t i=0; i<UPRV_LENGTHOF(tests); i++) {
|
||||
const UChar* str = tests[i];
|
||||
|
||||
// Basic test
|
||||
|
Loading…
Reference in New Issue
Block a user