7594250cc5
X-SVN-Rev: 36275
312 lines
11 KiB
C++
312 lines
11 KiB
C++
/*
**********************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#include "unicode/uchar.h"
|
|
#include "unicode/utf16.h"
|
|
|
|
#include "identifier_info.h"
|
|
#include "mutex.h"
|
|
#include "scriptset.h"
|
|
#include "ucln_in.h"
|
|
#include "uvector.h"
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
static UnicodeSet *ASCII;
|
|
static ScriptSet *JAPANESE;
|
|
static ScriptSet *CHINESE;
|
|
static ScriptSet *KOREAN;
|
|
static ScriptSet *CONFUSABLE_WITH_LATIN;
|
|
static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER;
|
|
|
|
|
|
U_CDECL_BEGIN
|
|
/**
 * Cleanup callback registered with ucln_i18n_registerCleanup().
 * Deletes the file-level constant sets, nulls their pointers, and resets the
 * init-once guard.
 *
 * @return always TRUE.
 */
static UBool U_CALLCONV
IdentifierInfo_cleanup(void) {
    delete ASCII;
    ASCII = NULL;

    // The four script-set constants all get the same delete-and-null treatment.
    ScriptSet **const scriptConstants[] = {
        &JAPANESE, &CHINESE, &KOREAN, &CONFUSABLE_WITH_LATIN
    };
    for (int32_t idx = 0; idx < UPRV_LENGTHOF(scriptConstants); ++idx) {
        delete *scriptConstants[idx];
        *scriptConstants[idx] = NULL;
    }

    gIdentifierInfoInitOnce.reset();
    return TRUE;
}
|
|
|
|
static void U_CALLCONV
|
|
IdentifierInfo_init(UErrorCode &status) {
|
|
ASCII = new UnicodeSet(0, 0x7f);
|
|
JAPANESE = new ScriptSet();
|
|
CHINESE = new ScriptSet();
|
|
KOREAN = new ScriptSet();
|
|
CONFUSABLE_WITH_LATIN = new ScriptSet();
|
|
if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL
|
|
|| CONFUSABLE_WITH_LATIN == NULL) {
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
return;
|
|
}
|
|
ASCII->freeze();
|
|
JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
|
|
.set(USCRIPT_KATAKANA, status);
|
|
CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
|
|
KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
|
|
CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
|
|
.set(USCRIPT_CHEROKEE, status);
|
|
ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
|
|
}
|
|
U_CDECL_END
|
|
|
|
|
|
/**
 * Constructs an empty IdentifierInfo.
 *
 * Ensures the shared file-level constants are initialized, then allocates the
 * per-object containers. The identifier profile starts out as the full code
 * point range 0..0x10FFFF.
 *
 * @param status Receives any error from the one-time initialization or from
 *               uhash_open(); set to U_MEMORY_ALLOCATION_ERROR when one of the
 *               member objects could not be allocated. When the one-time
 *               initialization fails, all members are left NULL.
 */
IdentifierInfo::IdentifierInfo(UErrorCode &status):
        fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL),
        fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
    umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status);
    if (U_FAILURE(status)) {
        return;
    }

    fIdentifier = new UnicodeString();
    fRequiredScripts = new ScriptSet();
    fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
    // Only install the key deleter on a table that was actually created;
    // a failed uhash_open() leaves fScriptSetSet NULL.
    if (fScriptSetSet != NULL) {
        uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
    }
    fCommonAmongAlternates = new ScriptSet();
    fNumerics = new UnicodeSet();
    fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);

    // Do not overwrite an error already reported by uhash_open().
    if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
                              fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}
|
|
|
|
/**
 * Releases every object owned by this IdentifierInfo.
 */
IdentifierInfo::~IdentifierInfo() {
    // The table of alternates was given uhash_deleteScriptSet as its key
    // deleter by the constructor.
    uhash_close(fScriptSetSet);

    // The remaining members are independent heap objects.
    delete fIdentifierProfile;
    delete fNumerics;
    delete fCommonAmongAlternates;
    delete fRequiredScripts;
    delete fIdentifier;
}
|
|
|
|
|
|
/**
 * Empties the per-identifier results: required scripts, alternate script
 * sets, numerics, and the scripts common among alternates. The stored
 * identifier string and the identifier profile are not touched.
 *
 * @return *this
 */
IdentifierInfo &IdentifierInfo::clear() {
    // Script-set members.
    fRequiredScripts->resetAll();
    fCommonAmongAlternates->resetAll();

    // Containers.
    uhash_removeAll(fScriptSetSet);
    fNumerics->clear();

    return *this;
}
|
|
|
|
|
|
/**
 * Replaces the identifier profile with a copy of the given set.
 *
 * @param identifierProfile The set to copy into this object's profile.
 * @return *this
 */
IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
    UnicodeSet &profile = *fIdentifierProfile;
    profile = identifierProfile;
    return *this;
}
|
|
|
|
|
|
/**
 * @return A reference to the identifier profile held by this object.
 */
const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
    const UnicodeSet &profile = *fIdentifierProfile;
    return profile;
}
|
|
|
|
|
|
/**
 * Stores a copy of the identifier and recomputes everything derived from it:
 * the required scripts, the alternate script sets, the representative zero
 * digits, and the scripts common among all alternates.
 *
 * @param identifier The identifier to analyze.
 * @param status     In/out error code. Nothing is done if it already indicates
 *                   failure. Set to U_MEMORY_ALLOCATION_ERROR if a script set
 *                   copy cannot be allocated; also receives errors from the
 *                   script and hash table calls. On an error return the
 *                   computed state may be incomplete.
 * @return *this
 */
IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return *this;
    }
    *fIdentifier = identifier;
    clear();
    ScriptSet scriptsForCP;
    UChar32 cp;
    for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
        cp = identifier.char32At(i);
        // Store a representative character for each kind of decimal digit
        if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
            // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
            fNumerics->add(cp - static_cast<UChar32>(u_getNumericValue(cp)));
        }
        UScriptCode extensions[500];
        int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, UPRV_LENGTHOF(extensions), &status);
        if (U_FAILURE(status)) {
            return *this;
        }
        // Build the set of scripts for this code point, minus Common and Inherited.
        scriptsForCP.resetAll();
        for (int32_t j = 0; j < extensionsCount; j++) {
            scriptsForCP.set(extensions[j], status);
        }
        scriptsForCP.reset(USCRIPT_COMMON, status);
        scriptsForCP.reset(USCRIPT_INHERITED, status);
        switch (scriptsForCP.countMembers()) {
          case 0: break;
          case 1:
            // Single script, record it.
            fRequiredScripts->Union(scriptsForCP);
            break;
          default:
            if (!fRequiredScripts->intersects(scriptsForCP)
                    && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
                // If the set hasn't been added already, add it
                // (Add a copy, fScriptSetSet takes ownership of the copy.)
                ScriptSet *scriptsCopy = new ScriptSet(scriptsForCP);
                if (scriptsCopy == NULL) {
                    // Never hand a NULL key to the hash table: the key is
                    // hashed and compared as a ScriptSet.
                    status = U_MEMORY_ALLOCATION_ERROR;
                    return *this;
                }
                uhash_puti(fScriptSetSet, scriptsCopy, 1, &status);
            }
            break;
        }
    }
    // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
    // [Kana], [Kana Hira] => [Kana]
    // This is relatively infrequent, so doesn't have to be optimized.
    // We also compute any commonalities among the alternates.
    if (uhash_count(fScriptSetSet) > 0) {
        fCommonAmongAlternates->setAll();
        for (int32_t it = -1;;) {
            const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
            if (nextHashEl == NULL) {
                break;
            }
            ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
            // [Kana], [Kana Hira] => [Kana]
            if (fRequiredScripts->intersects(*next)) {
                uhash_removeElement(fScriptSetSet, nextHashEl);
            } else {
                fCommonAmongAlternates->intersect(*next);
                // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
                for (int32_t otherIt = -1;;) {
                    const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
                    if (otherHashEl == NULL) {
                        break;
                    }
                    ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
                    if (next != other && next->contains(*other)) {
                        // 'next' is a superset of another alternate; drop it and
                        // stop scanning, since 'next' is no longer in the table.
                        uhash_removeElement(fScriptSetSet, nextHashEl);
                        break;
                    }
                }
            }
        }
    }
    // With no alternates left there is nothing in common among them.
    if (uhash_count(fScriptSetSet) == 0) {
        fCommonAmongAlternates->resetAll();
    }
    return *this;
}
|
|
|
|
|
|
/**
 * @return Pointer to the identifier string held by this object. The object
 *         keeps ownership (it deletes the string in its destructor).
 */
const UnicodeString *IdentifierInfo::getIdentifier() const {
    return this->fIdentifier;
}
|
|
|
|
/**
 * @return Pointer to the required-scripts set held by this object. The object
 *         keeps ownership.
 */
const ScriptSet *IdentifierInfo::getScripts() const {
    return this->fRequiredScripts;
}
|
|
|
|
/**
 * @return Pointer to the hash table whose keys are the alternate script sets.
 *         The object keeps ownership.
 */
const UHashtable *IdentifierInfo::getAlternates() const {
    return this->fScriptSetSet;
}
|
|
|
|
|
|
/**
 * @return Pointer to the set of representative zero digits collected by
 *         setIdentifier(). The object keeps ownership.
 */
const UnicodeSet *IdentifierInfo::getNumerics() const {
    return this->fNumerics;
}
|
|
|
|
/**
 * @return Pointer to the set of scripts shared by all alternate script sets.
 *         The object keeps ownership.
 */
const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
    return this->fCommonAmongAlternates;
}
|
|
|
|
#if !UCONFIG_NO_NORMALIZATION
|
|
|
|
/**
 * Classifies the current identifier into a spoof-check restriction level.
 *
 * @param status Passed through to ScriptSet::test() when checking for Latin.
 * @return USPOOF_UNRESTRICTIVE if the identifier has characters outside the
 *         profile or more than one kind of digit; USPOOF_ASCII if it is pure
 *         ASCII; otherwise one of the SINGLE_SCRIPT / HIGHLY / MODERATELY /
 *         MINIMALLY restrictive levels as determined below.
 */
URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
    const UBool outsideProfile = !fIdentifierProfile->containsAll(*fIdentifier);
    if (outsideProfile || getNumerics()->size() > 1) {
        return USPOOF_UNRESTRICTIVE;
    }
    if (ASCII->containsAll(*fIdentifier)) {
        return USPOOF_ASCII;
    }

    // This is a bit tricky. We look at a number of factors.
    // The number of scripts in the text.
    // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
    // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)

    // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
    //       time it is created, in setIdentifier().
    const int32_t requiredCount = fRequiredScripts->countMembers();
    int32_t alternatesWeight = 1;
    if (fCommonAmongAlternates->countMembers() == 0) {
        alternatesWeight = uhash_count(fScriptSetSet);
    }
    const int32_t cardinalityPlus = requiredCount + alternatesWeight;
    if (cardinalityPlus < 2) {
        return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
    }

    // Japanese, Chinese, Korean script combinations, tested in that order.
    const ScriptSet *const cjkCombinations[] = { JAPANESE, CHINESE, KOREAN };
    for (int32_t k = 0; k < UPRV_LENGTHOF(cjkCombinations); ++k) {
        if (containsWithAlternates(*cjkCombinations[k], *fRequiredScripts)) {
            return USPOOF_HIGHLY_RESTRICTIVE;
        }
    }

    // Latin plus exactly one other script that is not confusable with Latin.
    if (cardinalityPlus == 2 &&
            fRequiredScripts->test(USCRIPT_LATIN, status) &&
            !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) {
        return USPOOF_MODERATELY_RESTRICTIVE;
    }
    return USPOOF_MINIMALLY_RESTRICTIVE;
}
|
|
|
|
#endif /* !UCONFIG_NO_NORMALIZATION */
|
|
|
|
/**
 * Counts the scripts of the current identifier: the number of required
 * scripts, plus 1 if the alternates share at least one script, or plus the
 * number of alternate sets if they share none.
 *
 * @return the script count described above.
 */
int32_t IdentifierInfo::getScriptCount() const {
    // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
    const int32_t requiredCount = fRequiredScripts->countMembers();
    if (fCommonAmongAlternates->countMembers() == 0) {
        return requiredCount + uhash_count(fScriptSetSet);
    }
    return requiredCount + 1;
}
|
|
|
|
|
|
|
|
/**
 * Tests whether 'container' covers 'containee' and also shares at least one
 * script with every alternate script set of this object.
 *
 * @param container The candidate enclosing script set.
 * @param containee The script set that must be fully contained.
 * @return TRUE if both conditions hold, FALSE otherwise.
 */
UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
    if (!container.contains(containee)) {
        return FALSE;
    }
    int32_t pos = -1;
    for (const UHashElement *entry = uhash_nextElement(fScriptSetSet, &pos);
            entry != NULL;
            entry = uhash_nextElement(fScriptSetSet, &pos)) {
        const ScriptSet *alternateSet = static_cast<ScriptSet *>(entry->key.pointer);
        if (!container.intersects(*alternateSet)) {
            // One alternate has no script in common with the container.
            return FALSE;
        }
    }
    return TRUE;
}
|
|
|
|
/**
 * Appends a textual form of the alternate script sets to 'dest': the sets are
 * sorted with uhash_compareScriptSet and written with "; " between them.
 *
 * @param dest       String to append to.
 * @param alternates Hash table whose keys are ScriptSet pointers.
 * @param status     Error code used for the temporary vector and the sort. If
 *                   the vector cannot be constructed, dest is returned as is.
 * @return dest
 */
UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
    UVector ordered(status);
    if (U_FAILURE(status)) {
        return dest;
    }

    // Gather the keys; the vector only borrows the pointers.
    int32_t cursor = -1;
    for (const UHashElement *entry = uhash_nextElement(alternates, &cursor);
            entry != NULL;
            entry = uhash_nextElement(alternates, &cursor)) {
        ScriptSet *scripts = static_cast<ScriptSet *>(entry->key.pointer);
        ordered.addElement(scripts, status);
    }
    ordered.sort(uhash_compareScriptSet, status);

    UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
    for (int32_t idx = 0; idx < ordered.size(); idx++) {
        if (idx != 0) {
            dest.append(separator);
        }
        ScriptSet *scripts = static_cast<ScriptSet *>(ordered.elementAt(idx));
        scripts->displayScripts(dest);
    }
    return dest;
}
|
|
|
|
U_NAMESPACE_END
|
|
|