scuffed-code/icu4c/source/i18n/uspoof_impl.cpp

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2008-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*/

#include "unicode/utypes.h"
#include "unicode/uspoof.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/utf16.h"
#include "utrie2.h"
#include "cmemory.h"
#include "cstring.h"
#include "scriptset.h"
#include "umutex.h"
#include "udataswp.h"
#include "uassert.h"
#include "ucln_in.h"
#include "uspoof_impl.h"

#if !UCONFIG_NO_NORMALIZATION


U_NAMESPACE_BEGIN

// Emit the RTTI support definitions for SpoofImpl through ICU's UObject helper macro.
// NOTE(review): presumably this supplies getStaticClassID()/getDynamicClassID() -- confirm
// against the macro definition.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)

// Build a checker around caller-supplied confusable data.
// The pointer is stored as given; this constructor does not add a reference to it.
SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
    // construct() puts every field (fSpoofData included) into its default state,
    // so the data pointer has to be installed afterwards.
    this->construct(status);
    this->fSpoofData = data;
}

// Build a checker backed by the shared default confusable data.
// Errors from field setup or from loading the default data are reported via status.
SpoofImpl::SpoofImpl(UErrorCode& status) {
    this->construct(status);

    // TODO: Call this method where it is actually needed, instead of in the
    // constructor, to allow for lazy data loading.  See #12696.
    this->fSpoofData = SpoofData::getDefault(status);
}

// Default constructor: same setup as the UErrorCode& overload, but the error code
// is a local that is discarded, so failures are not reported to the caller.
SpoofImpl::SpoofImpl() {
    UErrorCode localStatus = U_ZERO_ERROR;
    this->construct(localStatus);

    // TODO: Call this method where it is actually needed, instead of in the
    // constructor, to allow for lazy data loading.  See #12696.
    this->fSpoofData = SpoofData::getDefault(localStatus);
}

// Shared constructor body.
//
// Always resets every field to its default.  When status is not already a failure,
// also allocates the default allowed-character set (all code points, frozen) and an
// empty allowed-locales string; sets U_MEMORY_ALLOCATION_ERROR if either allocation fails.
void SpoofImpl::construct(UErrorCode& status) {
    fMagic = USPOOF_MAGIC;
    fChecks = USPOOF_ALL_CHECKS;
    fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
    fSpoofData = nullptr;
    fAllowedCharsSet = nullptr;
    fAllowedLocales = nullptr;

    if (!U_FAILURE(status)) {
        // Keep a non-const handle so the set can be frozen after the checks below.
        UnicodeSet *everyCodePoint = new UnicodeSet(0, 0x10ffff);
        fAllowedCharsSet = everyCodePoint;
        fAllowedLocales  = uprv_strdup("");
        if (fAllowedCharsSet != nullptr && fAllowedLocales != nullptr) {
            everyCodePoint->freeze();
        } else {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }
}


// Copy constructor, used by the user level clone() function.
//
// Shares the source's SpoofData (taking an additional reference) and makes private
// copies of the allowed-character set and the allowed-locales string.
//
// status: in/out error code.  If it is already a failure on entry the new object is
//         left empty (all pointers NULL, fMagic 0) but is still safe to destroy.
//         Set to U_MEMORY_ALLOCATION_ERROR if the set or the locale string could not
//         be copied, including the case where the source itself holds none.
SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status)  :
        fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
        fAllowedLocales(NULL) {
    // Give the restriction level a defined value on every path, including the
    // early return below; this is the same default that construct() uses.
    fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
    if (U_FAILURE(status)) {
        return;
    }
    fMagic = src.fMagic;
    fChecks = src.fChecks;
    fRestrictionLevel = src.fRestrictionLevel;
    if (src.fSpoofData != NULL) {
        fSpoofData = src.fSpoofData->addReference();
    }
    // construct() leaves these two members NULL when it is entered with a failure
    // status or when its allocations fail, so they must not be dereferenced blindly.
    if (src.fAllowedCharsSet != NULL) {
        fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
    }
    if (src.fAllowedLocales != NULL) {
        fAllowedLocales = uprv_strdup(src.fAllowedLocales);
    }
    if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}

// Destructor: drops this object's reference on the spoof data and frees the
// allowed-character set and the allowed-locales string.
SpoofImpl::~SpoofImpl() {
    // Head off application errors by preventing use of deleted objects:
    // validateThis() refuses any object whose magic number does not match.
    fMagic = 0;

    if (fSpoofData != nullptr) {
        // Will delete the data if the reference count goes to zero.
        fSpoofData->removeReference();
    }
    delete fAllowedCharsSet;
    uprv_free(const_cast<char *>(fAllowedLocales));
}

//  View this instance as the opaque USpoofChecker handle used by the C API.
//  No conversion takes place; the handle is this object's address.
USpoofChecker *SpoofImpl::asUSpoofChecker() {
    USpoofChecker *handle = reinterpret_cast<USpoofChecker*>(this);
    return handle;
}

//
//  Incoming parameter check on Status and the SpoofChecker object
//    received from the C API.
//
//  Returns the handle viewed as a SpoofImpl, or NULL when:
//    - status is already a failure (status is left unchanged),
//    - sc is NULL                      -> U_ILLEGAL_ARGUMENT_ERROR,
//    - the magic number does not match -> U_INVALID_FORMAT_ERROR,
//    - attached spoof data fails validateDataVersion() (which sets status).
//
const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }
    if (sc == nullptr) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }

    const SpoofImpl *impl = reinterpret_cast<const SpoofImpl *>(sc);
    if (impl->fMagic != USPOOF_MAGIC) {
        status = U_INVALID_FORMAT_ERROR;
        return nullptr;
    }

    // A checker without data is accepted; data that is present must validate.
    const SpoofData *attachedData = impl->fSpoofData;
    if (attachedData != nullptr && !attachedData->validateDataVersion(status)) {
        return nullptr;
    }
    return impl;
}

// Non-const overload: forwards to the const version and strips the const from the result.
SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
    const USpoofChecker *constHandle = sc;
    const SpoofImpl *checked = SpoofImpl::validateThis(constHandle, status);
    return const_cast<SpoofImpl *>(checked);
}


void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
    UnicodeSet    allowedChars;
    UnicodeSet    *tmpSet = NULL;
    const char    *locStart = localesList;
    const char    *locEnd = NULL;
    const char    *localesListEnd = localesList + uprv_strlen(localesList);
    int32_t        localeListCount = 0;   // Number of locales provided by caller.

    // Loop runs once per locale from the localesList, a comma separated list of locales.
    do {
        locEnd = uprv_strchr(locStart, ',');
        if (locEnd == NULL) {
            locEnd = localesListEnd;
        }
        while (*locStart == ' ') {
            locStart++;
        }
        const char *trimmedEnd = locEnd-1;
        while (trimmedEnd > locStart && *trimmedEnd == ' ') {
            trimmedEnd--;
        }
        if (trimmedEnd <= locStart) {
            break;
        }
        const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart));
        localeListCount++;

        // We have one locale from the locales list.
        // Add the script chars for this locale to the accumulating set of allowed chars.
        // If the locale is no good, we will be notified back via status.
        addScriptChars(locale, &allowedChars, status);
        uprv_free((void *)locale);
        if (U_FAILURE(status)) {
            break;
        }
        locStart = locEnd + 1;
    } while (locStart < localesListEnd);

    // If our caller provided an empty list of locales, we disable the allowed characters checking
    if (localeListCount == 0) {
        uprv_free((void *)fAllowedLocales);
        fAllowedLocales = uprv_strdup("");
        tmpSet = new UnicodeSet(0, 0x10ffff);
        if (fAllowedLocales == NULL || tmpSet == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        } 
        tmpSet->freeze();
        delete fAllowedCharsSet;
        fAllowedCharsSet = tmpSet;
        fChecks &= ~USPOOF_CHAR_LIMIT;
        return;
    }

        
    // Add all common and inherited characters to the set of allowed chars.
    UnicodeSet tempSet;
    tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
    allowedChars.addAll(tempSet);
    tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
    allowedChars.addAll(tempSet);
    
    // If anything went wrong, we bail out without changing
    // the state of the spoof checker.
    if (U_FAILURE(status)) {
        return;
    }

    // Store the updated spoof checker state.
    tmpSet = static_cast<UnicodeSet *>(allowedChars.clone());
    const char *tmpLocalesList = uprv_strdup(localesList);
    if (tmpSet == NULL || tmpLocalesList == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    uprv_free((void *)fAllowedLocales);
    fAllowedLocales = tmpLocalesList;
    tmpSet->freeze();
    delete fAllowedCharsSet;
    fAllowedCharsSet = tmpSet;
    fChecks |= USPOOF_CHAR_LIMIT;
}


// Return the stored allowed-locales string.  The pointer refers to storage held by
// this object; the error code parameter is not used.
const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
    const char *stored = fAllowedLocales;
    return stored;
}


// Given a locale (a language), add all the characters from all of the scripts used with that language
// to the allowedChars UnicodeSet

void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
    UScriptCode scripts[30];

    int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status);
    if (U_FAILURE(status)) {
        return;
    }
    if (status == U_USING_DEFAULT_WARNING) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    UnicodeSet tmpSet;
    int32_t    i;
    for (i=0; i<numScripts; i++) {
        tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
        allowedChars->addAll(tmpSet);
    }
}

// Computes the augmented script set for a code point, according to UTS 39 section 5.1.
// result is cleared first and then filled in; it is left holding only the
// script extensions if those could not be obtained without error.
void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
    result.resetAll();
    result.setScriptExtensions(codePoint, status);
    if (U_FAILURE(status)) {
        return;
    }

    // Section 5.1 step 1: add the combined writing systems implied by each script.
    if (result.test(USCRIPT_HAN, status)) {
        result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
        result.set(USCRIPT_JAPANESE, status);
        result.set(USCRIPT_KOREAN, status);
    }
    if (result.test(USCRIPT_HIRAGANA, status) || result.test(USCRIPT_KATAKANA, status)) {
        result.set(USCRIPT_JAPANESE, status);
    }
    if (result.test(USCRIPT_HANGUL, status)) {
        result.set(USCRIPT_KOREAN, status);
    }
    if (result.test(USCRIPT_BOPOMOFO, status)) {
        result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
    }

    // Section 5.1 step 2: Common and Inherited characters match every script.
    const bool matchesEverything =
        result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status);
    if (matchesEverything) {
        result.setAll();
    }
}

// Computes the resolved script set for a string, according to UTS 39 section 5.1.
void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
    getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
}

// Computes the resolved script set for a string, omitting characters having the specified script.
// If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
    result.setAll();

    ScriptSet temp;
    UChar32 codePoint;
    for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
        codePoint = input.char32At(i);

        // Compute the augmented script set for the character
        getAugmentedScriptSet(codePoint, temp, status);
        if (U_FAILURE(status)) { return; }

        // Intersect the augmented script set with the resolved script set, but only if the character doesn't
        // have the script specified in the function call
        if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {
            result.intersect(temp);
        }
    }
}

// Computes the set of numerics for a string, according to UTS 39 section 5.3.
void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
    result.clear();

    UChar32 codePoint;
    for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
        codePoint = input.char32At(i);

        // Store a representative character for each kind of decimal digit
        if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
            // Store the zero character as a representative for comparison.
            // Unicode guarantees it is codePoint - value
            result.add(codePoint - (UChar32)u_getNumericValue(codePoint));
        }
    }
}

// Computes the restriction level of a string, according to UTS 39 section 5.2.
// Returns USPOOF_UNRESTRICTIVE if resolving the script sets reports an error.
URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
    // Section 5.2 step 1: anything outside the allowed characters is unrestricted.
    if (!fAllowedCharsSet->containsAll(input)) {
        return USPOOF_UNRESTRICTIVE;
    }

    // Section 5.2 step 2
    // Java uses a static UnicodeSet for this test.  In C++, avoid the static variable
    // and just scan the code units until the first non-ASCII one.
    const int32_t unitCount = input.length();
    int32_t firstNonAscii = 0;
    while (firstNonAscii < unitCount && input.charAt(firstNonAscii) <= 0x7f) {
        ++firstNonAscii;
    }
    if (firstNonAscii == unitCount) {
        return USPOOF_ASCII;
    }

    // Section 5.2 step 3:
    ScriptSet wholeString;
    getResolvedScriptSet(input, wholeString, status);
    if (U_FAILURE(status)) {
        return USPOOF_UNRESTRICTIVE;
    }

    // Section 5.2 step 4: a non-empty resolved set means a single script covers it all.
    if (!wholeString.isEmpty()) {
        return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
    }

    // Section 5.2 step 5: resolve again, this time ignoring Latin characters.
    ScriptSet withoutLatin;
    getResolvedScriptSetWithout(input, USCRIPT_LATIN, withoutLatin, status);
    if (U_FAILURE(status)) {
        return USPOOF_UNRESTRICTIVE;
    }

    // Section 5.2 step 6:
    if (withoutLatin.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
            || withoutLatin.test(USCRIPT_JAPANESE, status)
            || withoutLatin.test(USCRIPT_KOREAN, status)) {
        return USPOOF_HIGHLY_RESTRICTIVE;
    }

    // Section 5.2 step 7:
    if (!withoutLatin.isEmpty()
            && !(withoutLatin.test(USCRIPT_CYRILLIC, status)
                 || withoutLatin.test(USCRIPT_GREEK, status)
                 || withoutLatin.test(USCRIPT_CHEROKEE, status))) {
        return USPOOF_MODERATELY_RESTRICTIVE;
    }

    // Section 5.2 step 8:
    return USPOOF_MINIMALLY_RESTRICTIVE;
}

// Look for a U+0307 (combining dot above) that follows a character on which such a
// dot is flagged by isIllegalCombiningDotLeadCharacter().
// Returns the code unit index of the first such U+0307, or -1 if there is none.
// The error code parameter is not used.
int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const {
    bool afterDotLead = false;
    const int32_t unitCount = input.length();
    int32_t pos = 0;
    while (pos < unitCount) {
        const UChar32 cp = input.char32At(pos);
        if (afterDotLead && cp == 0x0307) {
            return pos;
        }

        // Only characters with combining class 0 (non-combining characters) or with
        // combining class 230 (same class as U+0307) update the lead state; all other
        // characters are skipped over.
        const uint8_t ccc = u_getCombiningClass(cp);
        U_ASSERT(u_getCombiningClass(0x0307) == 230);
        if (ccc == 0 || ccc == 230) {
            afterDotLead = isIllegalCombiningDotLeadCharacter(cp);
        }
        pos += U16_LENGTH(cp);
    }
    return -1;
}

// True if cp is one of a fixed list of lead characters (i, j, dotless i, dotless j, l)
// or has the Unicode Soft_Dotted property.  Does not consult the confusables data.
static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp) {
    switch (cp) {
    case 0x0069:    // LATIN SMALL LETTER I
    case 0x006A:    // LATIN SMALL LETTER J
    case 0x0131:    // LATIN SMALL LETTER DOTLESS I
    case 0x0237:    // LATIN SMALL LETTER DOTLESS J
    case 0x006C:    // LATIN SMALL LETTER L
        return true;
    default:
        return u_hasBinaryProperty(cp, UCHAR_SOFT_DOTTED);
    }
}

// True if cp itself passes the no-lookup test, or if the last code point of its
// confusable skeleton differs from cp and passes that test.
bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const {
    if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
        return true;
    }

    // Map the character through the confusables table and inspect the final
    // code point of whatever it maps to.
    UnicodeString skeleton;
    fSpoofData->confusableLookup(cp, skeleton);
    const int32_t lastIndex = skeleton.moveIndex32(skeleton.length(), -1);
    const UChar32 lastCp = skeleton.char32At(lastIndex);
    return lastCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(lastCp);
}


// Convert a text format hex number.  Utility function used by builder code.  Static.
//   s:            UChar text containing the number.
//   start, limit: the digits occupy s[start] .. s[limit-1]; at least one digit is expected.
//   status:       set to U_PARSE_ERROR (and 0 is returned) if the value exceeds 0x10ffff.
// Input has been pre-checked, and will have no non-hex chars.
// Returns the value as a UChar32, or 0 if status is already a failure on entry.
UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return 0;
    }
    U_ASSERT(limit-start > 0);

    uint32_t value = 0;
    for (int pos = start; pos < limit; ++pos) {
        const int unit = s[pos];
        // Try the character as a decimal digit first, then as an upper case and
        // finally as a lower case hex letter.
        int nibble = unit - 0x30;
        if (nibble > 9) {
            nibble = 0xa + (unit - 0x41);  // Upper Case 'A'
        }
        if (nibble > 15) {
            nibble = 0xa + (unit - 0x61);  // Lower Case 'a'
        }
        U_ASSERT(nibble <= 0xf);
        value = (value << 4) + nibble;
    }

    if (value > 0x10ffff) {
        status = U_PARSE_ERROR;
        return 0;
    }
    return (UChar32)value;
}


//-----------------------------------------
//
//   class CheckResult Implementation
//
//-----------------------------------------

// Stamp the object with its magic number and start from the cleared state.
CheckResult::CheckResult() : fMagic(USPOOF_CHECK_MAGIC) {
    this->clear();
}

// View this instance as the opaque USpoofCheckResult handle used by the C API.
USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
    USpoofCheckResult *handle = reinterpret_cast<USpoofCheckResult*>(this);
    return handle;
}

//
//  Incoming parameter check on Status and the CheckResult object
//    received from the C API.
//
//  Returns the handle viewed as a CheckResult, or NULL when status is already a
//  failure, when ptr is NULL (U_ILLEGAL_ARGUMENT_ERROR), or when the magic number
//  does not match (U_INVALID_FORMAT_ERROR).
//
const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }
    if (ptr == nullptr) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }

    const CheckResult *checkResult = reinterpret_cast<const CheckResult *>(ptr);
    if (checkResult->fMagic == USPOOF_CHECK_MAGIC) {
        return checkResult;
    }
    status = U_INVALID_FORMAT_ERROR;
    return nullptr;
}

// Non-const overload: forwards to the const version and strips the const from the result.
CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
    const USpoofCheckResult *constHandle = ptr;
    const CheckResult *checked = CheckResult::validateThis(constHandle, status);
    return const_cast<CheckResult *>(checked);
}

// Return to the empty state: no check bits, no numerics, restriction level undefined.
void CheckResult::clear() {
    fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
    fNumerics.clear();
    fChecks = 0;
}

// Return the check bits.  The restriction level is OR-ed in only when
// USPOOF_AUX_INFO is among enabledChecks and a restriction level has been recorded.
int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
    const bool auxInfoRequested = (enabledChecks & USPOOF_AUX_INFO) != 0;
    const bool levelRecorded = fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE;
    if (!(auxInfoRequested && levelRecorded)) {
        return fChecks;
    }
    return fChecks | fRestrictionLevel;
}

// No explicit cleanup is performed here.
CheckResult::~CheckResult() {}

//----------------------------------------------------------------------------------------------
//
//   class SpoofData Implementation
//
//----------------------------------------------------------------------------------------------


// Check that raw data is present, carries the spoof magic number, and has format
// version USPOOF_CONFUSABLE_DATA_FORMAT_VERSION.0.0.0.
// Returns TRUE if so.  Otherwise -- including when status is already a failure on
// entry -- sets status to U_INVALID_FORMAT_ERROR and returns FALSE.
UBool SpoofData::validateDataVersion(UErrorCode &status) const {
    bool acceptable = !U_FAILURE(status)
                      && fRawData != NULL
                      && fRawData->fMagic == USPOOF_MAGIC;
    if (acceptable) {
        const auto &version = fRawData->fFormatVersion;
        acceptable = version[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
                     && version[1] == 0
                     && version[2] == 0
                     && version[3] == 0;
    }
    if (!acceptable) {
        status = U_INVALID_FORMAT_ERROR;
        return FALSE;
    }
    return TRUE;
}

// Acceptance callback handed to udata_openChoice().
// Accepts a data item whose UDataInfo matches this platform's endianness and charset
// family, has data format "Cfu ", and has the expected major format version.
// context: optional UVersionInfo*; when non-NULL it receives the item's data version.
static UBool U_CALLCONV
spoofDataIsAcceptable(void *context,
                        const char * /* type */, const char * /*name*/,
                        const UDataInfo *pInfo) {
    const bool headerMatches =
        pInfo->size >= 20 &&
        pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
        pInfo->charsetFamily == U_CHARSET_FAMILY &&
        pInfo->dataFormat[0] == 0x43 &&  // dataFormat="Cfu "
        pInfo->dataFormat[1] == 0x66 &&
        pInfo->dataFormat[2] == 0x75 &&
        pInfo->dataFormat[3] == 0x20 &&
        pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
    if (!headerMatches) {
        return FALSE;
    }

    UVersionInfo *versionOut = static_cast<UVersionInfo *>(context);
    if (versionOut != NULL) {
        uprv_memcpy(versionOut, pInfo->dataVersion, 4);
    }
    return TRUE;
}

//  Methods for the loading of the default confusables data file.  The confusable
//  data is loaded only when it is needed.
//
//  SpoofData::getDefault() - Return the default confusables data, and call the
//                            initOnce() if it is not available.  Adds a reference
//                            to the SpoofData that the caller is responsible for
//                            decrementing when they are done with the data.
//
//  uspoof_loadDefaultData - Called once, from initOnce().  The resulting SpoofData
//                           is shared by all spoof checkers using the default data.
//
//  uspoof_cleanupDefaultData - Called during cleanup.
//

// Init-once guard used by SpoofData::getDefault() to run uspoof_loadDefaultData().
static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER;
// The shared default confusables data; assigned by uspoof_loadDefaultData() and
// released and reset to nullptr by uspoof_cleanupDefaultData().
static SpoofData* gDefaultSpoofData;

// Cleanup hook registered with ucln_i18n_registerCleanup().
// Drops the library's reference on the default spoof data and re-arms the init-once
// guard.  Always returns TRUE.
static UBool U_CALLCONV
uspoof_cleanupDefaultData(void) {
    if (gDefaultSpoofData == nullptr) {
        return TRUE;
    }
    // Will delete, assuming all user-level spoof checkers were closed.
    gDefaultSpoofData->removeReference();
    gDefaultSpoofData = nullptr;
    gSpoofInitDefaultOnce.reset();
    return TRUE;
}

// Init-once callback that loads the default confusables data ("confusables.cfu").
//
// On success gDefaultSpoofData points at the new SpoofData and the cleanup hook is
// registered.  On failure gDefaultSpoofData is left nullptr, status holds the error,
// and the opened data memory has been released.
//
// status: in/out error code.  Set to U_MEMORY_ALLOCATION_ERROR if the SpoofData
//         object cannot be allocated; otherwise whatever udata_openChoice() or the
//         SpoofData constructor reports.
static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
    UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables",
                                        spoofDataIsAcceptable,
                                        nullptr,       // context, would receive dataVersion if supplied.
                                        &status);
    if (U_FAILURE(status)) { return; }
    gDefaultSpoofData = new SpoofData(udm, status);
    if (gDefaultSpoofData == nullptr) {
        // No SpoofData object exists to take ownership of udm, so it has to be
        // closed here or it would leak.
        udata_close(udm);
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    if (U_FAILURE(status)) {
        // The constructor stored udm before failing; the destructor closes it.
        delete gDefaultSpoofData;
        gDefaultSpoofData = nullptr;
        return;
    }
    ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
}

// Return the shared default confusables data, loading it on first use.
// The returned object carries one added reference, which the caller must drop with
// removeReference().  Returns NULL if status is, or becomes, a failure.
SpoofData* SpoofData::getDefault(UErrorCode& status) {
    umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);
    if (U_FAILURE(status)) {
        return nullptr;
    }
    // addReference() returns the object it was called on.
    return gDefaultSpoofData->addReference();
}


// Construct from an opened ICU data item.  The UDataMemory is stored in fUDM, which
// the destructor closes.  If status is already a failure on entry, udm is not stored.
SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
{
    reset();
    if (U_FAILURE(status)) {
        return;
    }
    fUDM = udm;

    // fRawData is non-const because it may be constructed by the data builder.
    const void *mapped = udata_getMemory(udm);
    fRawData = reinterpret_cast<SpoofDataHeader *>(const_cast<void *>(mapped));

    validateDataVersion(status);
    initPtrs(status);
}


// Construct over a caller-provided buffer of serialized spoof data.
// The buffer is referenced in place, not copied (fDataOwned stays FALSE).
//   data:   start of the serialized data      (NULL -> U_ILLEGAL_ARGUMENT_ERROR)
//   length: number of bytes available at data (too short -> U_INVALID_FORMAT_ERROR)
SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
{
    reset();
    if (U_FAILURE(status)) {
        return;
    }

    // The size test comes before the NULL test, so it decides the error code when
    // both conditions hold.
    if ((size_t)length < sizeof(SpoofDataHeader)) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }
    if (data == NULL) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    fRawData = static_cast<SpoofDataHeader *>(const_cast<void *>(data));
    // The header's own length field must fit inside the supplied buffer.
    if (fRawData->fLength > length) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }

    validateDataVersion(status);
    initPtrs(status);
}


// Spoof Data constructor for use from data builder.
//   Allocates a zero-filled header-only data area, owned by this object, that will
//   be populated later.  Sets U_MEMORY_ALLOCATION_ERROR if the allocation fails.
SpoofData::SpoofData(UErrorCode &status) {
    reset();
    if (U_FAILURE(status)) {
        return;
    }
    fDataOwned = true;

    // The spoof header should already be sized to be a multiple of 16 bytes.
    // Just in case it's not, round it up.
    uint32_t headerBytes = (sizeof(SpoofDataHeader) + 15) & ~15;
    U_ASSERT(headerBytes == sizeof(SpoofDataHeader));

    fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(headerBytes));
    fMemLimit = headerBytes;
    if (fRawData == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    uprv_memset(fRawData, 0, headerBytes);

    // Stamp the header: magic number, then format version <current>.0.0.0.
    fRawData->fMagic = USPOOF_MAGIC;
    fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
    for (int32_t idx = 1; idx < 4; ++idx) {
        fRawData->fFormatVersion[idx] = 0;
    }
    initPtrs(status);
}

// reset() - put every field into its initial state.
//           Must be extended whenever a new field is added.
//           Each constructor calls this first, so the reference count starts at 1.
void SpoofData::reset() {
   // Ownership / bookkeeping.
   fRefCount = 1;
   fDataOwned = FALSE;
   fMemLimit = 0;
   fUDM      = NULL;

   // Raw data and the section pointers into it.
   fRawData = NULL;
   fCFUKeys = NULL;
   fCFUValues = NULL;
   fCFUStrings = NULL;
}


//  SpoofData::initPtrs()
//            Initialize the pointers to the various sections of the raw data.
//
//            This function is used both during the Trie building process (multiple
//            times, as the individual data sections are added), and
//            during the opening of a Spoof Checker from prebuilt data.
//
//            The pointers for non-existent data sections (identified by an offset of 0)
//            are set to NULL.
//
//            Note:  During building the data, adding each new data section
//            reallocs the raw data area, which likely relocates it, which
//            in turn requires reinitializing all of the pointers into it, hence
//            multiple calls to this function during building.
//
void SpoofData::initPtrs(UErrorCode &status) {
    // Start from "no sections"; these stay NULL on failure and for absent sections.
    fCFUKeys = NULL;
    fCFUValues = NULL;
    fCFUStrings = NULL;
    if (U_FAILURE(status)) {
        return;
    }

    // Section offsets in the header are byte offsets from the start of the raw data.
    char *base = reinterpret_cast<char *>(fRawData);
    if (fRawData->fCFUKeys != 0) {
        fCFUKeys = reinterpret_cast<int32_t *>(base + fRawData->fCFUKeys);
    }
    if (fRawData->fCFUStringIndex != 0) {
        fCFUValues = reinterpret_cast<uint16_t *>(base + fRawData->fCFUStringIndex);
    }
    if (fRawData->fCFUStringTable != 0) {
        fCFUStrings = reinterpret_cast<UChar *>(base + fRawData->fCFUStringTable);
    }
}


// Free the raw data if this object owns it, and close the UDataMemory if one is attached.
SpoofData::~SpoofData() {
    if (fDataOwned) {
        uprv_free(fRawData);
    }
    if (fUDM != NULL) {
        udata_close(fUDM);
    }
    fRawData = NULL;
    fUDM = NULL;
}


void SpoofData::removeReference() {
    if (umtx_atomic_dec(&fRefCount) == 0) {
        delete this;
    }
}


// Take one more reference and return this object, so that calls can be chained.
SpoofData *SpoofData::addReference() {
    (void)umtx_atomic_inc(&fRefCount);
    return this;
}


// Grow the builder-owned raw data area by numBytes (rounded up to a multiple of 16)
// and return a pointer to the start of the new, zero-filled region.
//
// Growing may move the data, so the section pointers are re-derived via initPtrs()
// and any pointer obtained from an earlier call must be considered stale.
//
// Returns NULL and leaves the existing data untouched when:
//   - status is already a failure,
//   - the data is not owned by this object (U_INTERNAL_PROGRAM_ERROR),
//   - the reallocation fails               (U_MEMORY_ALLOCATION_ERROR).
void *SpoofData::reserveSpace(int32_t numBytes,  UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }
    if (!fDataOwned) {
        U_ASSERT(FALSE);
        status = U_INTERNAL_PROGRAM_ERROR;
        return NULL;
    }

    numBytes = (numBytes + 15) & ~15;   // Round up to a multiple of 16
    uint32_t returnOffset = fMemLimit;
    uint32_t newLimit = fMemLimit + numBytes;

    // Reallocate into a temporary: on failure the original block is still valid and
    // must stay reachable through fRawData so that the destructor can free it.
    SpoofDataHeader *grown = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, newLimit));
    if (grown == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    fRawData = grown;
    fMemLimit = newLimit;
    fRawData->fLength = fMemLimit;
    uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
    initPtrs(status);
    return (char *)fRawData + returnOffset;
}

// Copy the raw spoof data into buf.
// Returns the number of bytes the data occupies in either case; if capacity is
// smaller than that, nothing is copied and status is set to U_BUFFER_OVERFLOW_ERROR.
int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {
    const int32_t bytesNeeded = fRawData->fLength;
    if (bytesNeeded <= capacity) {
        uprv_memcpy(buf, fRawData, bytesNeeded);
    } else {
        status = U_BUFFER_OVERFLOW_ERROR;
    }
    return bytesNeeded;
}

// Length of the raw data, as recorded in its header.
int32_t SpoofData::size() const {
    const int32_t recordedLength = fRawData->fLength;
    return recordedLength;
}

//-------------------------------
//
// Front-end APIs for SpoofData
//
//-------------------------------

// Append the confusable mapping of inChar to dest.
//
// The key table is searched by binary search.  A character with no entry maps to
// itself, so inChar is appended unchanged; this includes the case of an empty key
// table.
//
// Returns the value computed by appendValueTo() for a found entry, or 1 when
// inChar was appended as-is.
int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {
    // Perform a binary search.
    // [lo, hi), i.e lo is inclusive, hi is exclusive.
    // The result after the loop will be in lo.
    int32_t lo = 0;
    int32_t hi = length();

    // With no keys there is nothing to probe; reading entry 0 would be out of bounds.
    if (hi <= 0) {
        dest.append(inChar);
        return 1;
    }

    do {
        int32_t mid = lo + (hi - lo) / 2;
        UChar32 midChar = codePointAt(mid);
        if (midChar > inChar) {
            hi = mid;
        } else if (midChar < inChar) {
            lo = mid;
        } else {
            // Found result.  Break early.
            lo = mid;
            break;
        }
    } while (hi - lo > 1);

    // Did we find an entry?  If not, the char maps to itself.
    if (codePointAt(lo) != inChar) {
        dest.append(inChar);
        return 1;
    }

    // Add the element to the string builder and return.
    return appendValueTo(lo, dest);
}

// Number of entries in the confusable key table, as recorded in the header.
int32_t SpoofData::length() const {
    const int32_t keyCount = fRawData->fCFUKeysSize;
    return keyCount;
}

// Code point stored in the key at the given index.  The index is not range-checked.
UChar32 SpoofData::codePointAt(int32_t index) const {
    const int32_t key = fCFUKeys[index];
    return ConfusableDataUtils::keyToCodePoint(key);
}

// Append the mapping for the key at the given index to dest.
// Returns the length of the mapping as encoded in the key.
int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {
    const int32_t mappedLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);
    const uint16_t value = fCFUValues[index];

    // Value is either a char (for strings of length 1) or
    // an index into the string table (for longer strings)
    if (mappedLength != 1) {
        dest.append(fCFUStrings + value, mappedLength);
    } else {
        dest.append((UChar)value);
    }
    return mappedLength;
}


U_NAMESPACE_END

U_NAMESPACE_USE

//-----------------------------------------------------------------------------
//
//  uspoof_swap   -  byte swap and char encoding swap of spoof data
//
//-----------------------------------------------------------------------------
// Byte-swap a spoof data item, ICU data header included.
//
//   ds:      the data swapper supplying the read/write/swap operations.
//   inData:  start of the input item (ICU data header followed by the spoof data).
//   length:  number of input bytes, or a negative value to preflight, in which case
//            only the total size is computed and nothing is written.
//   outData: destination buffer; may be the same as inData for an in-place swap.
//   status:  in/out error code.
//
// Returns the total size of the item (ICU data header plus spoof data), or 0 when
// an error is reported through status.
U_CAPI int32_t U_EXPORT2
uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
           UErrorCode *status) {

    if (status == NULL || U_FAILURE(*status)) {
        return 0;
    }
    if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
        *status=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    //
    //  Check that the data header is for spoof data.
    //    (Header contents are defined in gencfu.cpp)
    //
    const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
    if(!(  pInfo->dataFormat[0]==0x43 &&   /* dataFormat="Cfu " */
           pInfo->dataFormat[1]==0x66 &&
           pInfo->dataFormat[2]==0x75 &&
           pInfo->dataFormat[3]==0x20 &&
           pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
           pInfo->formatVersion[1]==0 &&
           pInfo->formatVersion[2]==0 &&
           pInfo->formatVersion[3]==0  )) {
        udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
                             "(format version %02x %02x %02x %02x) is not recognized\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1],
                         pInfo->dataFormat[2], pInfo->dataFormat[3],
                         pInfo->formatVersion[0], pInfo->formatVersion[1],
                         pInfo->formatVersion[2], pInfo->formatVersion[3]);
        *status=U_UNSUPPORTED_ERROR;
        return 0;
    }

    //
    // Swap the data header.  (This is the generic ICU Data Header, not the uspoof Specific
    //                         header).  This swap also conveniently gets us
    //                         the size of the ICU d.h., which lets us locate the start
    //                         of the uspoof specific data.
    //
    int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
    if (U_FAILURE(*status)) {
        // Without a valid header size the spoof data cannot be located; do not go on
        // to interpret bytes at a bogus offset.
        return 0;
    }


    //
    // Get the Spoof Data Header, and check that it appears to be OK.
    //
    //
    const uint8_t   *inBytes =(const uint8_t *)inData+headerSize;
    SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
    if (ds->readUInt32(spoofDH->fMagic)   != USPOOF_MAGIC ||
        ds->readUInt32(spoofDH->fLength)  <  sizeof(SpoofDataHeader))
    {
        udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
        *status=U_UNSUPPORTED_ERROR;
        return 0;
    }

    //
    // Preflight operation?  Just return the size
    //
    int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
    int32_t totalSize = headerSize + spoofDataLength;
    if (length < 0) {
        return totalSize;
    }

    //
    // Check that length passed in is consistent with length from Spoof data header.
    //
    if (length < totalSize) {
        udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
                            spoofDataLength);
        *status=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }


    //
    // Swap the Data.  Do the data itself first, then the Spoof Data Header, because
    //                 we need to reference the header to locate the data, and an
    //                 inplace swap of the header leaves it unusable.
    //
    uint8_t          *outBytes = (uint8_t *)outData + headerSize;
    SpoofDataHeader  *outputDH = (SpoofDataHeader *)outBytes;

    int32_t   sectionStart;
    int32_t   sectionLength;

    //
    // If not swapping in place, zero out the output buffer before starting.
    //    Gaps may exist between the individual sections, and these must be zeroed in
    //    the output buffer.  The simplest way to do that is to just zero the whole thing.
    //
    if (inBytes != outBytes) {
        uprv_memset(outBytes, 0, spoofDataLength);
    }

    // Confusables Keys Section   (fCFUKeys)
    sectionStart  = ds->readUInt32(spoofDH->fCFUKeys);
    sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
    ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // String Index Section
    sectionStart  = ds->readUInt32(spoofDH->fCFUStringIndex);
    sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // String Table Section
    sectionStart  = ds->readUInt32(spoofDH->fCFUStringTable);
    sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // And, last, swap the header itself.
    //   int32_t   fMagic             // swap this
    //   uint8_t   fFormatVersion[4]  // Do not swap this, just copy
    //   int32_t   fLength and all the rest       // Swap the rest, all is 32 bit stuff.
    //
    uint32_t magic = ds->readUInt32(spoofDH->fMagic);
    ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);

    // When swapping in place the two headers are the same memory; skip the copy.
    if (outputDH->fFormatVersion != spoofDH->fFormatVersion) {
        uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
    }
    // swap starting at fLength
    ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);

    return totalSize;
}

#endif
-												ICU-12764 icu4c utf-8 source files, update Copyright notices.

X-SVN-Rev: 39583
											
										
										
											2017-01-20 00:20:31 +00:00
+								// © 2016 and later: Unicode, Inc. and others.
-												ICU-12564 Update copyright notice in trunk

X-SVN-Rev: 38848
											
										
										
											2016-06-15 18:58:17 +00:00
+								// License & terms of use: http://www.unicode.org/copyright.html
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								/*
 								**********************************************************************
-												ICU-12564 Reverted r38761 and r38762, because we want to prepend the Unicode copyright for existing source files, instead of replacing copyright comments.

X-SVN-Rev: 38776
											
										
										
											2016-05-31 21:45:07 +00:00
+								*   Copyright (C) 2008-2016, International Business Machines
 								*   Corporation and others.  All Rights Reserved.
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								**********************************************************************
 								*/
 								#include "unicode/utypes.h"
 								#include "unicode/uspoof.h"
-												ICU-4790 spoof work in progress

X-SVN-Rev: 25674
											
										
										
											2009-03-30 05:08:00 +00:00
+								#include "unicode/uchar.h"
 								#include "unicode/uniset.h"
-												ICU-8575 option for not including utf headers by default; replace uses of deprecated utf_old.h macros

X-SVN-Rev: 30430
											
										
										
											2011-07-27 05:53:56 +00:00
+								#include "unicode/utf16.h"
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								#include "utrie2.h"
 								#include "cmemory.h"
-												ICU-4790 spoof work in progress

X-SVN-Rev: 25674
											
										
										
											2009-03-30 05:08:00 +00:00
+								#include "cstring.h"
-												ICU-9440 spoof checker, merge updates from branch.

X-SVN-Rev: 33162
											
										
										
											2013-02-11 04:51:14 +00:00
+								#include "scriptset.h"
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								#include "umutex.h"
 								#include "udataswp.h"
 								#include "uassert.h"
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								#include "ucln_in.h"
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								#include "uspoof_impl.h"
-												ICU-4790 Fix uconfig error.

X-SVN-Rev: 25985
											
										
										
											2009-05-05 02:03:27 +00:00
+								#if !UCONFIG_NO_NORMALIZATION
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
 								U_NAMESPACE_BEGIN
 								UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
 								    construct(status);
 								    fSpoofData = data;
 								}
 								// Constructor that attaches the default spoof data.
 								//   status: in/out error code; it is passed to construct() and then to
 								//           SpoofData::getDefault().
 								SpoofImpl::SpoofImpl(UErrorCode& status) {
 								    // construct() resets fSpoofData to NULL, so it must run before the assignment below.
 								    construct(status);
 								    // TODO: Call this method where it is actually needed, instead of in the
 								    // constructor, to allow for lazy data loading.  See #12696.
 								    fSpoofData = SpoofData::getDefault(status);
 								}
 								// Default constructor.  Performs the same two steps as the UErrorCode& constructor,
 								// but with a local status variable: any failure recorded by construct() or
 								// SpoofData::getDefault() is not reported back to the caller.
 								SpoofImpl::SpoofImpl() {
 								    UErrorCode status = U_ZERO_ERROR;
 								    // construct() resets fSpoofData to NULL, so it must run before the assignment below.
 								    construct(status);
 								    // TODO: Call this method where it is actually needed, instead of in the
 								    // constructor, to allow for lazy data loading.  See #12696.
 								    fSpoofData = SpoofData::getDefault(status);
 								}
 								void SpoofImpl::construct(UErrorCode& status) {
 								    fMagic = USPOOF_MAGIC;
 								    fChecks = USPOOF_ALL_CHECKS;
 								    fSpoofData = NULL;
 								    fAllowedCharsSet = NULL;
 								    fAllowedLocales = NULL;
-												ICU-9440 spoof checker, merge updates from branch.

X-SVN-Rev: 33162
											
										
										
											2013-02-11 04:51:14 +00:00
+								    fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								    if (U_FAILURE(status)) { return; }
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								    UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
-												ICU-9440 spoof checker, merge updates from branch.

X-SVN-Rev: 33162
											
										
										
											2013-02-11 04:51:14 +00:00
+								    fAllowedCharsSet = allowedCharsSet;
 								    fAllowedLocales  = uprv_strdup("");
 								    if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								        status = U_MEMORY_ALLOCATION_ERROR;
-												ICU-8578 Apply patch to fix some compiler warnings and related issues

X-SVN-Rev: 30205
											
										
										
											2011-06-10 18:56:08 +00:00
+								        return;
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								    }
-												ICU-12549 Revisions to uspoof.h documentation.  Actually removing identifier_info.h and other obsolete files from r39218.

X-SVN-Rev: 39297
											
										
										
											2016-09-20 21:06:55 +00:00
+								    allowedCharsSet->freeze();
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								}
 								// Copy Constructor, used by the user level clone() function.
 								SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status)  :
-												ICU-9440 spoof checker, merge updates from branch.

X-SVN-Rev: 33162
											
										
										
											2013-02-11 04:51:14 +00:00
+								        fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								        fAllowedLocales(NULL) {
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								    if (U_FAILURE(status)) {
 								        return;
 								    }
 								    fMagic = src.fMagic;
 								    fChecks = src.fChecks;
 								    if (src.fSpoofData != NULL) {
 								        fSpoofData = src.fSpoofData->addReference();
 								    }
 								    fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
-												ICU-12549 Revisions to uspoof.h documentation.  Actually removing identifier_info.h and other obsolete files from r39218.

X-SVN-Rev: 39297
											
										
										
											2016-09-20 21:06:55 +00:00
+								    fAllowedLocales = uprv_strdup(src.fAllowedLocales);
 								    if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								        status = U_MEMORY_ALLOCATION_ERROR;
 								    }
-												ICU-9440 spoof checker, merge updates from branch.

X-SVN-Rev: 33162
											
										
										
											2013-02-11 04:51:14 +00:00
+								    fRestrictionLevel = src.fRestrictionLevel;
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								}
 								SpoofImpl::~SpoofImpl() {
-												ICU-4790 spoof work in progress

X-SVN-Rev: 25674
											
										
										
											2009-03-30 05:08:00 +00:00
+								    fMagic = 0;                // head off application errors by preventing use of
 								                               //    of deleted objects.
 								    if (fSpoofData != NULL) {
 								        fSpoofData->removeReference();   // Will delete if refCount goes to zero.
 								    }
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								    delete fAllowedCharsSet;
-												ICU-4790 spoof work in progress

X-SVN-Rev: 25674
											
										
										
											2009-03-30 05:08:00 +00:00
+								    uprv_free((void *)fAllowedLocales);
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								}
 								//  Cast this instance as a USpoofChecker for the C API.
 								USpoofChecker *SpoofImpl::asUSpoofChecker() {
 								    return reinterpret_cast<USpoofChecker*>(this);
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								}
 								//
 								//  Incoming parameter check on Status and the SpoofChecker object
 								//    received from the C API.
 								//
 								const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
 								    if (U_FAILURE(status)) {
 								        return NULL;
 								    }
 								    if (sc == NULL) {
 								        status = U_ILLEGAL_ARGUMENT_ERROR;
 								        return NULL;
-												ICU-9440 gcc compiler warning cleanup.

X-SVN-Rev: 33279
											
										
										
											2013-02-20 21:22:03 +00:00
+								    }
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								    SpoofImpl *This = (SpoofImpl *)sc;
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								    if (This->fMagic != USPOOF_MAGIC) {
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								        status = U_INVALID_FORMAT_ERROR;
 								        return NULL;
 								    }
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								    if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) {
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								        return NULL;
 								    }
 								    return This;
 								}
 								// Non-const overload of validateThis().
 								// Forwards to the const overload, which performs all of the checking, and then
 								// casts the const qualifier away from the returned pointer.
 								SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
 								    return const_cast<SpoofImpl *>
 								        (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));
 								}
-												ICU-4790 spoof work in progress

X-SVN-Rev: 25674
											
										
										
											2009-03-30 05:08:00 +00:00
+								void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
 								    UnicodeSet    allowedChars;
-												ICU-4790 spoof detection, more testing, fix leak in tests

X-SVN-Rev: 25682
											
										
										
											2009-03-30 23:05:55 +00:00
+								    UnicodeSet    *tmpSet = NULL;
-												ICU-4790 spoof work in progress

X-SVN-Rev: 25674
											
										
										
											2009-03-30 05:08:00 +00:00
+								    const char    *locStart = localesList;
 								    const char    *locEnd = NULL;
 								    const char    *localesListEnd = localesList + uprv_strlen(localesList);
 								    int32_t        localeListCount = 0;   // Number of locales provided by caller.
 								    // Loop runs once per locale from the localesList, a comma separated list of locales.
-												ICU-4790 Spoof detection code cleanup

X-SVN-Rev: 25886
											
										
										
											2009-04-23 06:23:11 +00:00
+								    do {
-												ICU-4790 spoof work in progress

X-SVN-Rev: 25674
											
										
										
											2009-03-30 05:08:00 +00:00
+								        locEnd = uprv_strchr(locStart, ',');
 								        if (locEnd == NULL) {
 								            locEnd = localesListEnd;
 								        }
 								        while (*locStart == ' ') {
 								            locStart++;
 								        }
 								        const char *trimmedEnd = locEnd-1;
-												ICU-4790 spoof detection, more testing, fix leak in tests

X-SVN-Rev: 25682
											
										
										
											2009-03-30 23:05:55 +00:00
+								        while (trimmedEnd > locStart && *trimmedEnd == ' ') {
-												ICU-4790 spoof work in progress

X-SVN-Rev: 25674
											
										
										
											2009-03-30 05:08:00 +00:00
+								            trimmedEnd--;
 								        }
 								        if (trimmedEnd <= locStart) {
 								            break;
 								        }
-												ICU-7222 Remove Windows x64 warnings by casting explicitly.

X-SVN-Rev: 26870
											
										
										
											2009-11-11 15:47:22 +00:00
+								        const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart));
-												ICU-4790 spoof work in progress

X-SVN-Rev: 25674
											
										
										
											2009-03-30 05:08:00 +00:00
+								        localeListCount++;
 								        // We have one locale from the locales list.
-												ICU-4790 Spoof detection code cleanup

X-SVN-Rev: 25886
											
										
										
											2009-04-23 06:23:11 +00:00
+								        // Add the script chars for this locale to the accumulating set of allowed chars.
-												ICU-4790 spoof work in progress

X-SVN-Rev: 25674
											
										
										
											2009-03-30 05:08:00 +00:00
+								        // If the locale is no good, we will be notified back via status.
 								        addScriptChars(locale, &allowedChars, status);
 								        uprv_free((void *)locale);
 								        if (U_FAILURE(status)) {
 								            break;
 								        }
 								        locStart = locEnd + 1;
 								    } while (locStart < localesListEnd);
 								    // If our caller provided an empty list of locales, we disable the allowed characters checking
 								    if (localeListCount == 0) {
 								        uprv_free((void *)fAllowedLocales);
 								        fAllowedLocales = uprv_strdup("");
-												ICU-4790 spoof detection, more testing, fix leak in tests

X-SVN-Rev: 25682
											
										
										
											2009-03-30 23:05:55 +00:00
+								        tmpSet = new UnicodeSet(0, 0x10ffff);
-												ICU-4790 spoof work in progress

X-SVN-Rev: 25674
											
										
										
											2009-03-30 05:08:00 +00:00
+								        if (fAllowedLocales == NULL || tmpSet == NULL) {
 								            status = U_MEMORY_ALLOCATION_ERROR;
 								            return;
 								        }
 								        tmpSet->freeze();
 								        delete fAllowedCharsSet;
 								        fAllowedCharsSet = tmpSet;
-												ICU-7399 various spoof detector cleanups

X-SVN-Rev: 29362
											
										
										
											2011-01-25 23:38:42 +00:00
+								        fChecks &= ~USPOOF_CHAR_LIMIT;
-												ICU-4790 spoof work in progress

X-SVN-Rev: 25674
											
										
										
											2009-03-30 05:08:00 +00:00
+								        return;
 								    }
 								    // Add all common and inherited characters to the set of allowed chars.
 								    UnicodeSet tempSet;
 								    tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
 								    allowedChars.addAll(tempSet);
 								    tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
 								    allowedChars.addAll(tempSet);
 								    // If anything went wrong, we bail out without changing
 								    // the state of the spoof checker.
 								    if (U_FAILURE(status)) {
 								        return;
 								    }
 								    // Store the updated spoof checker state.
-												ICU-4790 spoof detection, more testing, fix leak in tests

X-SVN-Rev: 25682
											
										
										
											2009-03-30 23:05:55 +00:00
+								    tmpSet = static_cast<UnicodeSet *>(allowedChars.clone());
 								    const char *tmpLocalesList = uprv_strdup(localesList);
 								    if (tmpSet == NULL || tmpLocalesList == NULL) {
-												ICU-4790 spoof work in progress

X-SVN-Rev: 25674
											
										
										
											2009-03-30 05:08:00 +00:00
+								        status = U_MEMORY_ALLOCATION_ERROR;
 								        return;
 								    }
-												ICU-4790 spoof detection, more testing, fix leak in tests

X-SVN-Rev: 25682
											
										
										
											2009-03-30 23:05:55 +00:00
+								    uprv_free((void *)fAllowedLocales);
 								    fAllowedLocales = tmpLocalesList;
-												ICU-4790 spoof work in progress

X-SVN-Rev: 25674
											
										
										
											2009-03-30 05:08:00 +00:00
+								    tmpSet->freeze();
 								    delete fAllowedCharsSet;
 								    fAllowedCharsSet = tmpSet;
-												ICU-7399 various spoof detector cleanups

X-SVN-Rev: 29362
											
										
										
											2011-01-25 23:38:42 +00:00
+								    fChecks |= USPOOF_CHAR_LIMIT;
-												ICU-4790 spoof work in progress

X-SVN-Rev: 25674
											
										
										
											2009-03-30 05:08:00 +00:00
+								}
 								// Returns the stored fAllowedLocales pointer as is; no copy is made.
 								// The UErrorCode parameter is unnamed and is neither read nor written here.
 								const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
 								    return fAllowedLocales;
 								}
 								// Given a locale (a language), add all the characters from all of the scripts used with that language
 								// to the allowedChars UnicodeSet
 								void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
 								    UScriptCode scripts[30];
-												ICU-12012 Replace all sizeof p / sizeof T with UPRV_LENGTHOF().

R=markus.icu@gmail.com

Review URL: https://codereview.appspot.com/288320043 .

X-SVN-Rev: 38347
											
										
										
											2016-02-24 21:48:56 +00:00
+								    int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status);
-												ICU-4790 spoof work in progress

X-SVN-Rev: 25674
											
										
										
											2009-03-30 05:08:00 +00:00
+								    if (U_FAILURE(status)) {
 								        return;
 								    }
 								    if (status == U_USING_DEFAULT_WARNING) {
 								        status = U_ILLEGAL_ARGUMENT_ERROR;
 								        return;
 								    }
 								    UnicodeSet tmpSet;
 								    int32_t    i;
 								    for (i=0; i<numScripts; i++) {
 								        tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
 								        allowedChars->addAll(tmpSet);
 								    }
 								}
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								// Computes the augmented script set for a code point, according to UTS 39 section 5.1.
 								// Builds the augmented script set of a single code point (UTS 39 section 5.1).
 								//   codePoint: the character to examine.
 								//   result:    cleared, then filled with the character's script extensions plus
 								//              the augmentation scripts added below.
 								//   status:    in/out error code; if setScriptExtensions() fails, result is left
 								//              holding whatever that call produced and nothing is augmented.
 								void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
 								    result.resetAll();
 								    result.setScriptExtensions(codePoint, status);
 								    if (U_FAILURE(status)) {
 								        return;
 								    }
 								    // Section 5.1 step 1, grouped by the augmentation script being added.
 								    // Han contributes to all three of the CJK augmentation scripts.
 								    const UBool hasHan = result.test(USCRIPT_HAN, status);
 								    if (hasHan || result.test(USCRIPT_BOPOMOFO, status)) {
 								        result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
 								    }
 								    if (hasHan
 								            || result.test(USCRIPT_HIRAGANA, status)
 								            || result.test(USCRIPT_KATAKANA, status)) {
 								        result.set(USCRIPT_JAPANESE, status);
 								    }
 								    if (hasHan || result.test(USCRIPT_HANGUL, status)) {
 								        result.set(USCRIPT_KOREAN, status);
 								    }
 								    // Section 5.1 step 2: Common and Inherited characters match every script.
 								    const UBool matchesEverything =
 								        result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status);
 								    if (matchesEverything) {
 								        result.setAll();
 								    }
 								}
 								// Computes the resolved script set for a string, according to UTS 39 section 5.1.
 								// Delegates to getResolvedScriptSetWithout(), passing USCRIPT_CODE_LIMIT as the
 								// script to omit, which that function treats as "include every character".
 								void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
 								    getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
 								}
 								// Computes the resolved script set for a string, omitting characters having the specified script.
 								// If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
 								void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
 								    result.setAll();
 								    ScriptSet temp;
 								    UChar32 codePoint;
 								    for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
 								        codePoint = input.char32At(i);
 								        // Compute the augmented script set for the character
 								        getAugmentedScriptSet(codePoint, temp, status);
 								        if (U_FAILURE(status)) { return; }
 								        // Intersect the augmented script set with the resolved script set, but only if the character doesn't
 								        // have the script specified in the function call
 								        if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {
 								            result.intersect(temp);
 								        }
 								    }
 								}
 								// Computes the set of numerics for a string, according to UTS 39 section 5.3.
 								void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
 								    result.clear();
 								    UChar32 codePoint;
 								    for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
 								        codePoint = input.char32At(i);
 								        // Store a representative character for each kind of decimal digit
 								        if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
 								            // Store the zero character as a representative for comparison.
 								            // Unicode guarantees it is codePoint - value
 								            result.add(codePoint - (UChar32)u_getNumericValue(codePoint));
 								        }
 								    }
 								}
 								// Determines the restriction level of a string, following UTS 39 section 5.2.
 								//   input:  the identifier to classify.
 								//   status: in/out error code.  If resolving the script sets fails, the function
 								//           returns USPOOF_UNRESTRICTIVE.
 								//   Returns the first level, from most to least restrictive, whose test passes.
 								URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
 								    // Step 1: any character outside the allowed set makes the string unrestricted.
 								    if (!fAllowedCharsSet->containsAll(input)) {
 								        return USPOOF_UNRESTRICTIVE;
 								    }
 								    // Step 2: pure ASCII.  Java uses a static UnicodeSet for this test; here a plain
 								    // scan over the code units avoids the static variable.
 								    const int32_t length = input.length();
 								    int32_t asciiPrefix = 0;
 								    while (asciiPrefix < length && input.charAt(asciiPrefix) <= 0x7f) {
 								        ++asciiPrefix;
 								    }
 								    if (asciiPrefix == length) {
 								        return USPOOF_ASCII;
 								    }
 								    // Steps 3 and 4: a non-empty resolved script set means a single script.
 								    ScriptSet allScripts;
 								    getResolvedScriptSet(input, allScripts, status);
 								    if (U_FAILURE(status)) {
 								        return USPOOF_UNRESTRICTIVE;
 								    }
 								    if (!allScripts.isEmpty()) {
 								        return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
 								    }
 								    // Step 5: resolve again, this time leaving Latin characters out.
 								    ScriptSet withoutLatin;
 								    getResolvedScriptSetWithout(input, USCRIPT_LATIN, withoutLatin, status);
 								    if (U_FAILURE(status)) {
 								        return USPOOF_UNRESTRICTIVE;
 								    }
 								    // Step 6: Latin combined with one of the CJK augmentation scripts.
 								    const UBool latinPlusCjk =
 								        withoutLatin.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
 								            || withoutLatin.test(USCRIPT_JAPANESE, status)
 								            || withoutLatin.test(USCRIPT_KOREAN, status);
 								    if (latinPlusCjk) {
 								        return USPOOF_HIGHLY_RESTRICTIVE;
 								    }
 								    // Steps 7 and 8: Latin plus one other script is moderately restrictive, unless
 								    // that script is Cyrillic, Greek or Cherokee; everything else is minimal.
 								    const UBool onlyMinimal =
 								        withoutLatin.isEmpty()
 								            || withoutLatin.test(USCRIPT_CYRILLIC, status)
 								            || withoutLatin.test(USCRIPT_GREEK, status)
 								            || withoutLatin.test(USCRIPT_CHEROKEE, status);
 								    return onlyMinimal ? USPOOF_MINIMALLY_RESTRICTIVE : USPOOF_MODERATELY_RESTRICTIVE;
 								}
-												ICU-13333 Adding combining dot spoof check.

X-SVN-Rev: 41428
											
										
										
											2018-05-22 02:47:31 +00:00
+								int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const {
 								    bool sawLeadCharacter = false;
 								    for (int32_t i=0; i<input.length();) {
 								        UChar32 cp = input.char32At(i);
 								        if (sawLeadCharacter && cp == 0x0307) {
 								            return i;
 								        }
 								        uint8_t combiningClass = u_getCombiningClass(cp);
 								        // Skip over characters except for those with combining class 0 (non-combining characters) or with
 								        // combining class 230 (same class as U+0307)
 								        U_ASSERT(u_getCombiningClass(0x0307) == 230);
 								        if (combiningClass == 0 || combiningClass == 230) {
 								            sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp);
 								        }
 								        i += U16_LENGTH(cp);
 								    }
 								    return -1;
 								}
 								// Direct test of a single code point, without consulting the confusables data:
 								// true for i, j, dotless i, dotless j, l, and anything with the Soft_Dotted property.
 								static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp) {
 								    switch (cp) {
 								    case u'i':
 								    case u'j':
 								    case u'ı':
 								    case u'ȷ':
 								    case u'l':
 								        return true;
 								    default:
 								        return u_hasBinaryProperty(cp, UCHAR_SOFT_DOTTED);
 								    }
 								}
 								// Full test of a code point: first the character itself, then the last code point
 								// of its confusable skeleton, whenever that differs from the character.
 								bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const {
 								    if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
 								        return true;
 								    }
 								    UnicodeString skeleton;
 								    fSpoofData->confusableLookup(cp, skeleton);
 								    // Step back one code point from the end to reach the start of the final character.
 								    const int32_t lastIndex = skeleton.moveIndex32(skeleton.length(), -1);
 								    const UChar32 lastCp = skeleton.char32At(lastIndex);
 								    return lastCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(lastCp);
 								}
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
 								// Convert a text format hex number.  Utility function used by builder code.  Static.
 								//   s:      UChar text containing the number.
 								//   start:  index of the first hex digit in s.
 								//   limit:  index one past the last hex digit in s; must be greater than start.
 								//   status: in/out error code.  Set to U_PARSE_ERROR when the value does not fit in
 								//           the code point range 0..0x10ffff.  If already a failure on entry, the
 								//           function returns 0 without reading s.
 								//   Returns the parsed code point, or 0 on any error.
 								// Input has been pre-checked, and will have no non-hex chars.
 								UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
 								    if (U_FAILURE(status)) {
 								        return 0;
 								    }
 								    U_ASSERT(limit-start > 0);
 								    uint32_t val = 0;
 								    for (int32_t i=start; i<limit; i++) {
 								        int digitVal = s[i] - 0x30;
 								        if (digitVal>9) {
 								            digitVal = 0xa + (s[i] - 0x41);  // Upper Case 'A'
 								        }
 								        if (digitVal>15) {
 								            digitVal = 0xa + (s[i] - 0x61);  // Lower Case 'a'
 								        }
 								        U_ASSERT(digitVal <= 0xf);
 								        val <<= 4;
 								        val += digitVal;
 								        // Range check after every digit.  Checking only once after the loop would let
 								        // a number with more than 8 hex digits wrap the 32 bit accumulator and come
 								        // back out as a small, seemingly valid code point.
 								        if (val > 0x10ffff) {
 								            status = U_PARSE_ERROR;
 								            return 0;
 								        }
 								    }
 								    return (UChar32)val;
 								}
-												ICU-9440 spoof checker, merge updates from branch.

X-SVN-Rev: 33162
											
										
										
											2013-02-11 04:51:14 +00:00
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								//-----------------------------------------
 								//
 								//   class CheckResult Implementation
 								//
 								//-----------------------------------------
 								CheckResult::CheckResult() : fMagic(USPOOF_CHECK_MAGIC) {
 								    clear();
-												ICU-9440 spoof checker, merge updates from branch.

X-SVN-Rev: 33162
											
										
										
											2013-02-11 04:51:14 +00:00
+								}
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
 								    return reinterpret_cast<USpoofCheckResult*>(this);
 								}
-												ICU-9440 spoof checker, merge updates from branch.

X-SVN-Rev: 33162
											
										
										
											2013-02-11 04:51:14 +00:00
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								//
 								//  Incoming parameter check on Status and the CheckResult object
 								//    received from the C API.
 								//
 								const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
 								    if (U_FAILURE(status)) { return NULL; }
 								    if (ptr == NULL) {
 								        status = U_ILLEGAL_ARGUMENT_ERROR;
 								        return NULL;
-												ICU-9440 spoof checker, merge updates from branch.

X-SVN-Rev: 33162
											
										
										
											2013-02-11 04:51:14 +00:00
+								    }
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								    CheckResult *This = (CheckResult*) ptr;
 								    if (This->fMagic != USPOOF_CHECK_MAGIC) {
 								        status = U_INVALID_FORMAT_ERROR;
 								        return NULL;
 								    }
 								    return This;
-												ICU-9440 Delete misplaced comma causing build failure

X-SVN-Rev: 33164
											
										
										
											2013-02-11 06:57:31 +00:00
+								}
-												ICU-9440 spoof checker, merge updates from branch.

X-SVN-Rev: 33162
											
										
										
											2013-02-11 04:51:14 +00:00
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
 								    return const_cast<CheckResult *>
 								        (CheckResult::validateThis(const_cast<const USpoofCheckResult*>(ptr), status));
 								}
 								// Return this result object to its empty state: no check bits set,
 								// no numerics recorded, and an undefined restriction level.
 								void CheckResult::clear() {
 								    fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
 								    fNumerics.clear();
 								    fChecks = 0;
 								}
-												ICU-9440 spoof checker, merge updates from branch.

X-SVN-Rev: 33162
											
										
										
											2013-02-11 04:51:14 +00:00
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
 								    if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) {
 								        return fChecks | fRestrictionLevel;
 								    } else {
 								        return fChecks;
 								    }
 								}
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								CheckResult::~CheckResult() {
 								}
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
 								//----------------------------------------------------------------------------------------------
 								//
 								//   class SpoofData Implementation
 								//
 								//----------------------------------------------------------------------------------------------
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								UBool SpoofData::validateDataVersion(UErrorCode &status) const {
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								    if (U_FAILURE(status) ||
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								        fRawData == NULL ||
 								        fRawData->fMagic != USPOOF_MAGIC ||
 								        fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION ||
 								        fRawData->fFormatVersion[1] != 0 ||
 								        fRawData->fFormatVersion[2] != 0 ||
 								        fRawData->fFormatVersion[3] != 0) {
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								            status = U_INVALID_FORMAT_ERROR;
 								            return FALSE;
 								    }
 								    return TRUE;
 								}
-												ICU-11032 Incorporate review comments, use isAcceptable call-back with udata_open

X-SVN-Rev: 36466
											
										
										
											2014-09-11 18:28:05 +00:00
+								static UBool U_CALLCONV
 								spoofDataIsAcceptable(void *context,
 								                        const char * /* type */, const char * /*name*/,
 								                        const UDataInfo *pInfo) {
 								    if(
 								        pInfo->size >= 20 &&
 								        pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
 								        pInfo->charsetFamily == U_CHARSET_FAMILY &&
 								        pInfo->dataFormat[0] == 0x43 &&  // dataFormat="Cfu "
 								        pInfo->dataFormat[1] == 0x66 &&
 								        pInfo->dataFormat[2] == 0x75 &&
 								        pInfo->dataFormat[3] == 0x20 &&
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								        pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
-												ICU-11032 Incorporate review comments, use isAcceptable call-back with udata_open

X-SVN-Rev: 36466
											
										
										
											2014-09-11 18:28:05 +00:00
+								    ) {
 								        UVersionInfo *version = static_cast<UVersionInfo *>(context);
 								        if(version != NULL) {
 								            uprv_memcpy(version, pInfo->dataVersion, 4);
 								        }
 								        return TRUE;
 								    } else {
 								        return FALSE;
 								    }
 								}
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								//  Methods for the loading of the default confusables data file.  The confusable
 								//  data is loaded only when it is needed.
 								//
 								//  SpoofData::getDefault() - Return the default confusables data, and call the
 								//                            initOnce() if it is not available.  Adds a reference
 								//                            to the SpoofData that the caller is responsible for
 								//                            decrementing when they are done with the data.
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								//
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								//  uspoof_loadDefaultData - Called once, from initOnce().  The resulting SpoofData
 								//                           is shared by all spoof checkers using the default data.
-												ICU-11031 cache spoof check data (ICU4C).

X-SVN-Rev: 37793
											
										
										
											2015-08-21 01:23:29 +00:00
+								//
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								//  uspoof_cleanupDefaultData - Called during cleanup.
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								//
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
 								// One-time-initialization guard passed to umtx_initOnce() by SpoofData::getDefault();
 								// reset again by uspoof_cleanupDefaultData().
 								static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER;
 								// The shared default confusables data.  Assigned by uspoof_loadDefaultData(),
 								// released and set back to nullptr by uspoof_cleanupDefaultData().
 								static SpoofData* gDefaultSpoofData;
 								// Cleanup function registered with ucln_i18n_registerCleanup().
 								// Drops the global's reference on the default spoof data (if any), clears the
 								// global pointer and re-arms the init-once guard.  Always returns TRUE.
 								static UBool U_CALLCONV
 								uspoof_cleanupDefaultData(void) {
 								    SpoofData *defaultData = gDefaultSpoofData;
 								    if (defaultData == nullptr) {
 								        return TRUE;
 								    }
 								    // Will delete, assuming all user-level spoof checkers were closed.
 								    defaultData->removeReference();
 								    gDefaultSpoofData = nullptr;
 								    gSpoofInitDefaultOnce.reset();
 								    return TRUE;
 								}
-												ICU-12752 commit some IBM z fixes. Library code building

X-SVN-Rev: 39368
											
										
										
											2016-09-27 23:39:01 +00:00
+								static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
-												ICU-13333 Adding combining dot spoof check.

X-SVN-Rev: 41428
											
										
										
											2018-05-22 02:47:31 +00:00
+								    UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables",
-												ICU-11032 Incorporate review comments, use isAcceptable call-back with udata_open

X-SVN-Rev: 36466
											
										
										
											2014-09-11 18:28:05 +00:00
+								                                        spoofDataIsAcceptable,
-												ICU-13333 Adding combining dot spoof check.

X-SVN-Rev: 41428
											
										
										
											2018-05-22 02:47:31 +00:00
+								                                        nullptr,       // context, would receive dataVersion if supplied.
-												ICU-11032 Incorporate review comments, use isAcceptable call-back with udata_open

X-SVN-Rev: 36466
											
										
										
											2014-09-11 18:28:05 +00:00
+								                                        &status);
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								    if (U_FAILURE(status)) { return; }
 								    gDefaultSpoofData = new SpoofData(udm, status);
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								    if (U_FAILURE(status)) {
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								        delete gDefaultSpoofData;
-												ICU-13333 Adding combining dot spoof check.

X-SVN-Rev: 41428
											
										
										
											2018-05-22 02:47:31 +00:00
+								        gDefaultSpoofData = nullptr;
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								        return;
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								    }
-												ICU-13333 Adding combining dot spoof check.

X-SVN-Rev: 41428
											
										
										
											2018-05-22 02:47:31 +00:00
+								    if (gDefaultSpoofData == nullptr) {
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								        status = U_MEMORY_ALLOCATION_ERROR;
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								        return;
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								    }
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								    ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
 								}
 								// Return the shared default spoof data, loading it on first use.
 								// The returned object has had a reference added on behalf of the caller.
 								// Returns NULL if status indicates failure after the load attempt.
 								SpoofData* SpoofData::getDefault(UErrorCode& status) {
 								    umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);
 								    if (U_SUCCESS(status)) {
 								        // addReference() bumps the count and hands back the same object.
 								        return gDefaultSpoofData->addReference();
 								    }
 								    return NULL;
 								}
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
 								{
 								    reset();
 								    if (U_FAILURE(status)) {
 								        return;
 								    }
-												ICU-11032 Incorporate review comments, use isAcceptable call-back with udata_open

X-SVN-Rev: 36466
											
										
										
											2014-09-11 18:28:05 +00:00
+								    fUDM = udm;
-												ICU-11032 call udata_getMemory() rather than reading internal structures

X-SVN-Rev: 36473
											
										
										
											2014-09-11 19:52:26 +00:00
+								    // fRawData is non-const because it may be constructed by the data builder.
 								    fRawData = reinterpret_cast<SpoofDataHeader *>(
 								            const_cast<void *>(udata_getMemory(udm)));
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								    validateDataVersion(status);
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
+								    initPtrs(status);
 								}
 								// Construct a SpoofData over caller-supplied serialized data.
 								//   data   - start of the serialized SpoofDataHeader; not copied and not owned.
 								//   length - number of bytes available at data.
 								//   status - U_ILLEGAL_ARGUMENT_ERROR for a negative length or NULL data;
 								//            U_INVALID_FORMAT_ERROR if the buffer is smaller than the header,
 								//            smaller than the length recorded in the header, or fails
 								//            validateDataVersion().
 								SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
 								{
 								    reset();
 								    if (U_FAILURE(status)) {
 								        return;
 								    }
 								    // A negative length would wrap to a huge value in the size_t comparison
 								    // below and slip past the minimum-size check, so reject it up front.
 								    if (length < 0) {
 								        status = U_ILLEGAL_ARGUMENT_ERROR;
 								        return;
 								    }
 								    if ((size_t)length < sizeof(SpoofDataHeader)) {
 								        status = U_INVALID_FORMAT_ERROR;
 								        return;
 								    }
 								    if (data == NULL) {
 								        status = U_ILLEGAL_ARGUMENT_ERROR;
 								        return;
 								    }
 								    void *ncData = const_cast<void *>(data);
 								    fRawData = static_cast<SpoofDataHeader *>(ncData);
 								    // The header's own length must fit inside the supplied buffer.
 								    if (length < fRawData->fLength) {
 								        status = U_INVALID_FORMAT_ERROR;
 								        return;
 								    }
 								    validateDataVersion(status);
 								    initPtrs(status);
 								}
 								// Spoof Data constructor for use from data builder.
 								//   Allocates a zero-filled header-sized data area, owned by this object,
 								//   stamps it with the magic number and format version, and leaves the
 								//   rest to be populated later.
 								//   Sets U_MEMORY_ALLOCATION_ERROR if the header cannot be allocated.
 								SpoofData::SpoofData(UErrorCode &status) {
 								    reset();
 								    if (U_FAILURE(status)) {
 								        return;
 								    }
 								    fDataOwned = true;
 								    // The spoof header should already be sized to be a multiple of 16 bytes.
 								    // Just in case it's not, round it up.
 								    uint32_t headerBytes = (sizeof(SpoofDataHeader) + 15) & ~15;
 								    U_ASSERT(headerBytes == sizeof(SpoofDataHeader));
 								    void *storage = uprv_malloc(headerBytes);
 								    fRawData = static_cast<SpoofDataHeader *>(storage);
 								    fMemLimit = headerBytes;
 								    if (fRawData == NULL) {
 								        status = U_MEMORY_ALLOCATION_ERROR;
 								        return;
 								    }
 								    uprv_memset(fRawData, 0, headerBytes);
 								    fRawData->fMagic = USPOOF_MAGIC;
 								    fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
 								    for (int32_t i = 1; i < 4; ++i) {
 								        fRawData->fFormatVersion[i] = 0;
 								    }
 								    initPtrs(status);
 								}
 								// reset() - put every field into its known initial state.
 								//           Called first thing by each constructor.
 								//           Must be extended whenever a new field is added to the class.
 								void SpoofData::reset() {
 								    // Raw data area and its bookkeeping.
 								    fRawData   = NULL;
 								    fDataOwned = FALSE;
 								    fUDM       = NULL;
 								    fMemLimit  = 0;
 								    // A new object starts with a single reference.
 								    fRefCount  = 1;
 								    // Section pointers into the raw data.
 								    fCFUKeys    = NULL;
 								    fCFUValues  = NULL;
 								    fCFUStrings = NULL;
 								}
 								//  SpoofData::initPtrs()
 								//            Set up the pointers to the individual sections of the raw data.
 								//
 								//            Used both while building data (repeatedly, as each section is added)
 								//            and when opening a Spoof Checker from prebuilt data.
 								//
 								//            A section whose offset in the header is 0 does not exist; its
 								//            pointer is left as NULL.  All three pointers are also left NULL
 								//            when status is a failure on entry.
 								//
 								//            Note:  During building, each added section reallocs the raw data
 								//            area, which likely moves it, so every pointer into it has to be
 								//            recomputed; hence the repeated calls.
 								//
 								void SpoofData::initPtrs(UErrorCode &status) {
 								    fCFUKeys = NULL;
 								    fCFUValues = NULL;
 								    fCFUStrings = NULL;
 								    if (U_FAILURE(status)) {
 								        return;
 								    }
 								    // All section offsets are byte offsets from the start of the header.
 								    char *base = reinterpret_cast<char *>(fRawData);
 								    if (fRawData->fCFUKeys != 0) {
 								        fCFUKeys = reinterpret_cast<int32_t *>(base + fRawData->fCFUKeys);
 								    }
 								    if (fRawData->fCFUStringIndex != 0) {
 								        fCFUValues = reinterpret_cast<uint16_t *>(base + fRawData->fCFUStringIndex);
 								    }
 								    if (fRawData->fCFUStringTable != 0) {
 								        fCFUStrings = reinterpret_cast<UChar *>(base + fRawData->fCFUStringTable);
 								    }
 								}
 								// Destructor: frees the raw data if this object owns it, and closes the
 								// UDataMemory if one was attached.
 								SpoofData::~SpoofData() {
 								    if (fDataOwned) {
 								        uprv_free(fRawData);
 								    }
 								    if (fUDM != NULL) {
 								        udata_close(fUDM);
 								    }
 								    fRawData = NULL;
 								    fUDM = NULL;
 								}
 								void SpoofData::removeReference() {
 								    if (umtx_atomic_dec(&fRefCount) == 0) {
 								        delete this;
 								    }
 								}
 								// Add one reference and return this object, so calls can be chained.
 								SpoofData *SpoofData::addReference() {
 								    SpoofData *self = this;
 								    umtx_atomic_inc(&self->fRefCount);
 								    return self;
 								}
 								// Grow the owned raw data area by numBytes (rounded up to a multiple of 16)
 								// and return a pointer to the newly added, zero-filled region.
 								//
 								//   numBytes - number of additional bytes wanted.
 								//   status   - U_INTERNAL_PROGRAM_ERROR if this object does not own its data;
 								//              U_MEMORY_ALLOCATION_ERROR if the area cannot be grown.
 								//
 								// Returns NULL on any failure.  On success the header's fLength is updated and
 								// the section pointers are recomputed, because the realloc may move the data.
 								void *SpoofData::reserveSpace(int32_t numBytes,  UErrorCode &status) {
 								    if (U_FAILURE(status)) {
 								        return NULL;
 								    }
 								    if (!fDataOwned) {
 								        U_ASSERT(FALSE);
 								        status = U_INTERNAL_PROGRAM_ERROR;
 								        return NULL;
 								    }
 								    numBytes = (numBytes + 15) & ~15;   // Round up to a multiple of 16
 								    uint32_t returnOffset = fMemLimit;
 								    uint32_t newLimit = fMemLimit + numBytes;
 								    SpoofDataHeader *grown =
 								        static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, newLimit));
 								    if (grown == NULL) {
 								        // Keep fRawData and fMemLimit untouched: the old block is still valid
 								        // and will be released by the destructor.
 								        status = U_MEMORY_ALLOCATION_ERROR;
 								        return NULL;
 								    }
 								    fRawData = grown;
 								    fMemLimit = newLimit;
 								    fRawData->fLength = fMemLimit;
 								    uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
 								    initPtrs(status);
 								    return (char *)fRawData + returnOffset;
 								}
-												ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
											
										
										
											2016-09-13 22:15:13 +00:00
+								int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {
 								    int32_t dataSize = fRawData->fLength;
 								    if (capacity < dataSize) {
 								        status = U_BUFFER_OVERFLOW_ERROR;
 								        return dataSize;
 								    }
 								    uprv_memcpy(buf, fRawData, dataSize);
 								    return dataSize;
 								}
 								// Size of the raw data, as recorded in the header's fLength field.
 								int32_t SpoofData::size() const {
 								    const SpoofDataHeader *header = fRawData;
 								    return header->fLength;
 								}
 								//-------------------------------
 								//
 								// Front-end APIs for SpoofData
 								//
 								//-------------------------------
 								// Look up the confusable mapping for a code point and append it to dest.
 								//   inChar - the code point to look up.
 								//   dest   - receives the mapping; inChar itself if there is no entry.
 								// Returns the length reported for the appended mapping (1 when inChar
 								// maps to itself).
 								int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {
 								    // Perform a binary search.
 								    // [lo, hi), i.e lo is inclusive, hi is exclusive.
 								    // The result after the loop will be in lo.
 								    int32_t lo = 0;
 								    int32_t hi = length();
 								    if (hi <= 0) {
 								        // Empty key table: nothing to search, and index 0 must not be read.
 								        // Every character maps to itself.
 								        dest.append(inChar);
 								        return 1;
 								    }
 								    do {
 								        int32_t mid = (lo + hi) / 2;
 								        UChar32 midChar = codePointAt(mid);
 								        if (midChar > inChar) {
 								            hi = mid;
 								        } else if (midChar < inChar) {
 								            lo = mid;
 								        } else {
 								            // Found result.  Break early.
 								            lo = mid;
 								            break;
 								        }
 								    } while (hi - lo > 1);
 								    // Did we find an entry?  If not, the char maps to itself.
 								    if (codePointAt(lo) != inChar) {
 								        dest.append(inChar);
 								        return 1;
 								    }
 								    // Add the element to the string builder and return.
 								    return appendValueTo(lo, dest);
 								}
 								// Number of entries in the confusables key table (header field fCFUKeysSize).
 								int32_t SpoofData::length() const {
 								    const SpoofDataHeader *header = fRawData;
 								    return header->fCFUKeysSize;
 								}
 								// Code point stored in the key at the given index of the key table.
 								// No bounds checking is done on index.
 								UChar32 SpoofData::codePointAt(int32_t index) const {
 								    const int32_t key = fCFUKeys[index];
 								    return ConfusableDataUtils::keyToCodePoint(key);
 								}
 								// Append the mapping for the entry at index to dest and return its length.
 								// The value slot holds the character itself for mappings of length 1;
 								// for longer mappings it is an index into the string table.
 								int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {
 								    const int32_t mappingLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);
 								    const uint16_t payload = fCFUValues[index];
 								    if (mappingLength != 1) {
 								        dest.append(fCFUStrings + payload, mappingLength);
 								    } else {
 								        dest.append(static_cast<UChar>(payload));
 								    }
 								    return mappingLength;
 								}
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00
 								U_NAMESPACE_END
 								U_NAMESPACE_USE
 								//-----------------------------------------------------------------------------
 								//
 								//  uspoof_swap   -  byte swap and char encoding swap of spoof data
 								//
 								//    ds      - the swapper describing the source and target formats.
 								//    inData  - the ICU data item: ICU data header followed by the spoof data.
 								//    length  - number of bytes at inData, or a negative value to preflight
 								//              (only compute and return the total size).
 								//    outData - destination buffer; may be the same as inData.
 								//    status  - receives an error code on failure.
 								//
 								//    Returns the total size (ICU data header plus spoof data), or 0 on error.
 								//
 								//-----------------------------------------------------------------------------
 								U_CAPI int32_t U_EXPORT2
 								uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
 								           UErrorCode *status) {
 								    if (status == NULL || U_FAILURE(*status)) {
 								        return 0;
 								    }
 								    if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
 								        *status=U_ILLEGAL_ARGUMENT_ERROR;
 								        return 0;
 								    }
 								    //
 								    //  Check that the data header is for spoof data.
 								    //    (Header contents are defined in gencfu.cpp)
 								    //
 								    const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
 								    if(!(  pInfo->dataFormat[0]==0x43 &&   /* dataFormat="Cfu " */
 								           pInfo->dataFormat[1]==0x66 &&
 								           pInfo->dataFormat[2]==0x75 &&
 								           pInfo->dataFormat[3]==0x20 &&
 								           pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
 								           pInfo->formatVersion[1]==0 &&
 								           pInfo->formatVersion[2]==0 &&
 								           pInfo->formatVersion[3]==0  )) {
 								        udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
 								                             "(format version %02x %02x %02x %02x) is not recognized\n",
 								                         pInfo->dataFormat[0], pInfo->dataFormat[1],
 								                         pInfo->dataFormat[2], pInfo->dataFormat[3],
 								                         pInfo->formatVersion[0], pInfo->formatVersion[1],
 								                         pInfo->formatVersion[2], pInfo->formatVersion[3]);
 								        *status=U_UNSUPPORTED_ERROR;
 								        return 0;
 								    }
 								    //
 								    // Swap the data header.  (This is the generic ICU Data Header, not the uspoof Specific
 								    //                         header).  This swap also conveniently gets us
 								    //                         the size of the ICU d.h., which lets us locate the start
 								    //                         of the uspoof specific data.
 								    //
 								    int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
 								    if (U_FAILURE(*status)) {
 								        // The header swap failed, so headerSize is not usable for locating
 								        // the spoof data.  Stop here and keep the error code it reported.
 								        return 0;
 								    }
 								    //
 								    // Get the Spoof Data Header, and check that it appears to be OK.
 								    //
 								    const uint8_t   *inBytes =(const uint8_t *)inData+headerSize;
 								    SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
 								    if (ds->readUInt32(spoofDH->fMagic)   != USPOOF_MAGIC ||
 								        ds->readUInt32(spoofDH->fLength)  <  sizeof(SpoofDataHeader))
 								    {
 								        udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
 								        *status=U_UNSUPPORTED_ERROR;
 								        return 0;
 								    }
 								    //
 								    // Preflight operation?  Just return the size
 								    //
 								    int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
 								    int32_t totalSize = headerSize + spoofDataLength;
 								    if (length < 0) {
 								        return totalSize;
 								    }
 								    //
 								    // Check that length passed in is consistent with length from Spoof data header.
 								    //
 								    if (length < totalSize) {
 								        udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
 								                            spoofDataLength);
 								        *status=U_INDEX_OUTOFBOUNDS_ERROR;
 								        return 0;
 								    }
 								    //
 								    // Swap the Data.  Do the data itself first, then the Spoof Data Header, because
 								    //                 we need to reference the header to locate the data, and an
 								    //                 inplace swap of the header leaves it unusable.
 								    //
 								    uint8_t          *outBytes = (uint8_t *)outData + headerSize;
 								    SpoofDataHeader  *outputDH = (SpoofDataHeader *)outBytes;
 								    int32_t   sectionStart;
 								    int32_t   sectionLength;
 								    //
 								    // If not swapping in place, zero out the output buffer before starting.
 								    //    Gaps may exist between the individual sections, and these must be zeroed in
 								    //    the output buffer.  The simplest way to do that is to just zero the whole thing.
 								    //
 								    if (inBytes != outBytes) {
 								        uprv_memset(outBytes, 0, spoofDataLength);
 								    }
 								    // Confusables Keys Section   (fCFUKeys)
 								    sectionStart  = ds->readUInt32(spoofDH->fCFUKeys);
 								    sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
 								    ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
 								    // String Index Section
 								    sectionStart  = ds->readUInt32(spoofDH->fCFUStringIndex);
 								    sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
 								    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
 								    // String Table Section
 								    sectionStart  = ds->readUInt32(spoofDH->fCFUStringTable);
 								    sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
 								    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
 								    // And, last, swap the header itself.
 								    //   int32_t   fMagic             // swap this
 								    //   uint8_t   fFormatVersion[4]  // Do not swap this, just copy
 								    //   int32_t   fLength and all the rest       // Swap the rest, all is 32 bit stuff.
 								    //
 								    uint32_t magic = ds->readUInt32(spoofDH->fMagic);
 								    ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);
 								    // Skip the copy when swapping in place: source and destination are the same.
 								    if (outputDH->fFormatVersion != spoofDH->fFormatVersion) {
 								        uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
 								    }
 								    // swap starting at fLength
 								    ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);
 								    return totalSize;
 								}
-												ICU-4790 Fix uconfig error.

X-SVN-Rev: 25985
											
										
										
											2009-05-05 02:03:27 +00:00
+								#endif
-												ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
											
										
										
											2009-03-09 23:40:15 +00:00