scuffed-code/icu4c/source/common/unorm.cpp

/*
******************************************************************************
* Copyright (c) 1996-2010, International Business Machines
* Corporation and others. All Rights Reserved.
******************************************************************************
* File unorm.cpp
*
* Created by: Vladimir Weinstein 12052000
*
* Modification history :
*
* Date        Name        Description
* 02/01/01    synwee      Added normalization quickcheck enum and method.
* 02/12/01    synwee      Commented out quickcheck util api has been approved
*                         Added private method for doing FCD checks
* 02/23/01    synwee      Modified quickcheck and checkFCE to run through 
*                         string for codepoints < 0x300 for the normalization 
*                         mode NFC.
* 05/25/01+   Markus Scherer total rewrite, implement all normalization here
*                         instead of just wrappers around normlzr.cpp,
*                         load unorm.dat, support Unicode 3.1 with
*                         supplementary code points, etc.
* 2009-nov..2010-jan  Markus Scherer  total rewrite, new Normalizer2 API & code
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_NORMALIZATION

#include "unicode/udata.h"
#include "unicode/uchar.h"
#include "unicode/ustring.h"
#include "unicode/uiter.h"
#include "unicode/unorm.h"
#include "normalizer2impl.h"
#include "ucln_cmn.h"
#include "unormimp.h"
#include "uprops.h"
#include "cmemory.h"
#include "umutex.h"
#include "utrie2.h"
#include "unicode/uset.h"
#include "putilimp.h"

#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

U_NAMESPACE_USE

/*
 * This new implementation of the normalization code loads its data from
 * unorm.dat, which is generated with the gennorm tool.
 * The format of that file is described in unormimp.h .
 */

/* load unorm.dat ----------------------------------------------------------- */

#define UNORM_HARDCODE_DATA 1

#if UNORM_HARDCODE_DATA

/* unorm_props_data.c is machine-generated by gennorm --csource */
#include "unorm_props_data.c"

static const UBool formatVersion_2_2=TRUE;

#else

#define DATA_NAME "unorm"
#define DATA_TYPE "icu"

static UDataMemory *normData=NULL;
static UErrorCode dataErrorCode=U_ZERO_ERROR;
static int8_t haveNormData=0;

static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
static UTrie normTrie={ 0,0,0,0,0,0,0 }, fcdTrie={ 0,0,0,0,0,0,0 }, auxTrie={ 0,0,0,0,0,0,0 };

/*
 * pointers into the memory-mapped unorm.icu
 */
static const uint16_t *extraData=NULL,
                      *combiningTable=NULL,
                      *canonStartSets=NULL;

static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
static UBool formatVersion_2_1=FALSE, formatVersion_2_2=FALSE;

/* the Unicode version of the normalization data */
static UVersionInfo dataVersion={ 0, 0, 0, 0 };

#endif

U_CDECL_BEGIN

static UBool U_CALLCONV
unorm_cleanup(void) {
#if !UNORM_HARDCODE_DATA
    if(normData!=NULL) {
        udata_close(normData);
        normData=NULL;
    }
    dataErrorCode=U_ZERO_ERROR;
    haveNormData=0;
#endif

    return TRUE;
}

#if !UNORM_HARDCODE_DATA

static UBool U_CALLCONV
isAcceptable(void * /* context */,
             const char * /* type */, const char * /* name */,
             const UDataInfo *pInfo) {
    if(
        pInfo->size>=20 &&
        pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
        pInfo->charsetFamily==U_CHARSET_FAMILY &&
        pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Norm" */
        pInfo->dataFormat[1]==0x6f &&
        pInfo->dataFormat[2]==0x72 &&
        pInfo->dataFormat[3]==0x6d &&
        pInfo->formatVersion[0]==2 &&
        pInfo->formatVersion[2]==UTRIE_SHIFT &&
        pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
    ) {
        uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
        uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
        return TRUE;
    } else {
        return FALSE;
    }
}

#endif

static UBool U_CALLCONV
_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
    /* add the start code point to the USet */
    const USetAdder *sa=(const USetAdder *)context;
    sa->add(sa->set, start);
    return TRUE;
}

U_CDECL_END

#if !UNORM_HARDCODE_DATA

static int8_t
loadNormData(UErrorCode &errorCode) {
    /* load Unicode normalization data from file */

    /*
     * This lazy intialization with double-checked locking (without mutex protection for
     * haveNormData==0) is transiently unsafe under certain circumstances.
     * Check the readme and use u_init() if necessary.
     *
     * While u_init() initializes the main normalization data via this functions,
     * it does not do so for exclusion sets (which are fully mutexed).
     * This is because
     * - there can be many exclusion sets
     * - they are rarely used
     * - they are not usually used in execution paths that are
     *   as performance-sensitive as others
     *   (e.g., IDNA takes more time than unorm_quickCheck() anyway)
     *
     *  TODO:  Remove code in support for non-hardcoded data.  u_init() is now advertised
     *         as not being required for thread safety, and we can't reasonably
     *         revert to requiring it.
     */
    if(haveNormData==0) {
        UTrie _normTrie={ 0,0,0,0,0,0,0 }, _fcdTrie={ 0,0,0,0,0,0,0 }, _auxTrie={ 0,0,0,0,0,0,0 };
        UDataMemory *data;

        const int32_t *p=NULL;
        const uint8_t *pb;

        if(&errorCode==NULL || U_FAILURE(errorCode)) {
            return 0;
        }

        /* open the data outside the mutex block */
        data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
        dataErrorCode=errorCode;
        if(U_FAILURE(errorCode)) {
            return haveNormData=-1;
        }

        p=(const int32_t *)udata_getMemory(data);
        pb=(const uint8_t *)(p+_NORM_INDEX_TOP);
        utrie_unserialize(&_normTrie, pb, p[_NORM_INDEX_TRIE_SIZE], &errorCode);
        _normTrie.getFoldingOffset=getFoldingNormOffset;

        pb+=p[_NORM_INDEX_TRIE_SIZE]+p[_NORM_INDEX_UCHAR_COUNT]*2+p[_NORM_INDEX_COMBINE_DATA_COUNT]*2;
        if(p[_NORM_INDEX_FCD_TRIE_SIZE]!=0) {
            utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode);
        }
        pb+=p[_NORM_INDEX_FCD_TRIE_SIZE];

        if(p[_NORM_INDEX_AUX_TRIE_SIZE]!=0) {
            utrie_unserialize(&_auxTrie, pb, p[_NORM_INDEX_AUX_TRIE_SIZE], &errorCode);
            _auxTrie.getFoldingOffset=getFoldingAuxOffset;
        }

        if(U_FAILURE(errorCode)) {
            dataErrorCode=errorCode;
            udata_close(data);
            return haveNormData=-1;
        }

        /* in the mutex block, set the data for this process */
        umtx_lock(NULL);
        if(normData==NULL) {
            normData=data;
            data=NULL;

            uprv_memcpy(&indexes, p, sizeof(indexes));
            uprv_memcpy(&normTrie, &_normTrie, sizeof(UTrie));
            uprv_memcpy(&fcdTrie, &_fcdTrie, sizeof(UTrie));
            uprv_memcpy(&auxTrie, &_auxTrie, sizeof(UTrie));
        } else {
            p=(const int32_t *)udata_getMemory(normData);
        }

        /* initialize some variables */
        extraData=(uint16_t *)((uint8_t *)(p+_NORM_INDEX_TOP)+indexes[_NORM_INDEX_TRIE_SIZE]);
        combiningTable=extraData+indexes[_NORM_INDEX_UCHAR_COUNT];
        formatVersion_2_1=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=1);
        formatVersion_2_2=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=2);
        if(formatVersion_2_1) {
            canonStartSets=combiningTable+
                indexes[_NORM_INDEX_COMBINE_DATA_COUNT]+
                (indexes[_NORM_INDEX_FCD_TRIE_SIZE]+indexes[_NORM_INDEX_AUX_TRIE_SIZE])/2;
        }
        haveNormData=1;
        ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
        umtx_unlock(NULL);

        /* if a different thread set it first, then close the extra data */
        if(data!=NULL) {
            udata_close(data); /* NULL if it was set correctly */
        }
    }

    return haveNormData;
}

#endif

static inline UBool
_haveData(UErrorCode &errorCode) {
#if UNORM_HARDCODE_DATA
    return U_SUCCESS(errorCode);
#else
    if(U_FAILURE(errorCode)) {
        return FALSE;
    } else if(haveNormData>0) {
        return TRUE;
    } else if(haveNormData<0) {
        errorCode=dataErrorCode;
        return FALSE;
    } else /* haveNormData==0 */ {
        return (UBool)(loadNormData(errorCode)>0);
    }
#endif
}

U_CAPI UBool U_EXPORT2
unorm_haveData(UErrorCode *pErrorCode) {
    return _haveData(*pErrorCode);
}

/* normalization properties ------------------------------------------------- */

U_CFUNC UBool U_EXPORT2
unorm_isCanonSafeStart(UChar32 c) {
#if UNORM_HARDCODE_DATA
    if(auxTrie.index!=NULL) {
#else
    UErrorCode errorCode=U_ZERO_ERROR;
    if(_haveData(errorCode) && auxTrie.index!=NULL) {
#endif
        uint16_t aux=UTRIE2_GET16(&auxTrie, c);
        return (UBool)((aux&_NORM_AUX_UNSAFE_MASK)==0);
    } else {
        return FALSE;
    }
}

U_CAPI void U_EXPORT2
unorm_getUnicodeVersion(UVersionInfo *versionInfo, UErrorCode *pErrorCode){
    if(unorm_haveData(pErrorCode)){
        uprv_memcpy(*versionInfo, dataVersion, 4);
    }
}


U_CAPI UBool U_EXPORT2
unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) {
#if !UNORM_HARDCODE_DATA
    UErrorCode errorCode=U_ZERO_ERROR;
#endif
    if( fillSet!=NULL && (uint32_t)c<=0x10ffff &&
#if !UNORM_HARDCODE_DATA
        _haveData(errorCode) &&
#endif
        canonStartSets!=NULL
    ) {
        const uint16_t *table;
        int32_t i, start, limit;

        /*
         * binary search for c
         *
         * There are two search tables,
         * one for BMP code points and one for supplementary ones.
         * See unormimp.h for details.
         */
        if(c<=0xffff) {
            table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH];
            start=0;
            limit=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];

            /* each entry is a pair { c, result } */
            while(start<limit-2) {
                i=(uint16_t)(((start+limit)/4)*2); /* (start+limit)/2 and address pairs */
                if(c<table[i]) {
                    limit=i;
                } else {
                    start=i;
                }
            }

            /* found? */
            if(c==table[start]) {
                i=table[start+1];
                if((i&_NORM_CANON_SET_BMP_MASK)==_NORM_CANON_SET_BMP_IS_INDEX) {
                    /* result 01xxxxxx xxxxxx contains index x to a USerializedSet */
                    i&=(_NORM_MAX_CANON_SETS-1);
                    return uset_getSerializedSet(fillSet,
                                            canonStartSets+i,
                                            canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
                } else {
                    /* other result values are BMP code points for single-code point sets */
                    uset_setSerializedToOne(fillSet, (UChar32)i);
                    return TRUE;
                }
            }
        } else {
            uint16_t high, low, h;

            table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]+
                                 canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
            start=0;
            limit=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];

            high=(uint16_t)(c>>16);
            low=(uint16_t)c;

            /* each entry is a triplet { high(c), low(c), result } */
            while(start<limit-3) {
                i=(uint16_t)(((start+limit)/6)*3); /* (start+limit)/2 and address triplets */
                h=table[i]&0x1f; /* high word */
                if(high<h || (high==h && low<table[i+1])) {
                    limit=i;
                } else {
                    start=i;
                }
            }

            /* found? */
            h=table[start];
            if(high==(h&0x1f) && low==table[start+1]) {
                i=table[start+2];
                if((h&0x8000)==0) {
                    /* the result is an index to a USerializedSet */
                    return uset_getSerializedSet(fillSet,
                                            canonStartSets+i,
                                            canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
                } else {
                    /*
                     * single-code point set {x} in
                     * triplet { 100xxxxx 000hhhhh  llllllll llllllll  xxxxxxxx xxxxxxxx }
                     */
                    i|=((int32_t)h&0x1f00)<<8; /* add high bits from high(c) */
                    uset_setSerializedToOne(fillSet, (UChar32)i);
                    return TRUE;
                }
            }
        }
    }

    return FALSE; /* not found */
}

U_CAPI int32_t U_EXPORT2
u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) {
    uint16_t aux;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    if(_haveData(*pErrorCode) && auxTrie.index!=NULL) {
        aux=UTRIE2_GET16(&auxTrie, c);
        aux&=_NORM_AUX_FNC_MASK;
    } else {
        aux=0;
    }
    if(aux!=0) {
        const UChar *s;
        int32_t length;

        s=(const UChar *)(extraData+aux);
        if(*s<0xff00) {
            /* s points to the single-unit string */
            length=1;
        } else {
            length=*s&0xff;
            ++s;
        }
        if(0<length && length<=destCapacity) {
            uprv_memcpy(dest, s, length*U_SIZEOF_UCHAR);
        }
        return u_terminateUChars(dest, destCapacity, length, pErrorCode);
    } else {
        return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
    }
}

U_CAPI void U_EXPORT2
unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
    UChar c;

    if(!_haveData(*pErrorCode)) {
        return;
    }

    /* add the start code point of each same-value range of each trie */
    if(auxTrie.index!=NULL) {
        utrie2_enum(&auxTrie, NULL, _enumPropertyStartsRange, sa);
    }

    /* add Hangul LV syllables and LV+1 because of skippables */
    for(c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
        sa->add(sa->set, c);
        sa->add(sa->set, c+1);
    }
    sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
}

/* quick check functions ---------------------------------------------------- */

U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheck(const UChar *src,
                 int32_t srcLength, 
                 UNormalizationMode mode,
                 UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
}

U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength, 
                            UNormalizationMode mode, int32_t options,
                            UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if(options&UNORM_UNICODE_3_2) {
        FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
        return unorm2_quickCheck((const UNormalizer2 *)&fn2, src, srcLength, pErrorCode);
    } else {
        return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
    }
}

U_CAPI UBool U_EXPORT2
unorm_isNormalized(const UChar *src, int32_t srcLength,
                   UNormalizationMode mode,
                   UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
}

U_CAPI UBool U_EXPORT2
unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
                              UNormalizationMode mode, int32_t options,
                              UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if(options&UNORM_UNICODE_3_2) {
        FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
        return unorm2_isNormalized((const UNormalizer2 *)&fn2, src, srcLength, pErrorCode);
    } else {
        return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
    }
}

/* normalize() API ---------------------------------------------------------- */

/** Public API for normalizing. */
U_CAPI int32_t U_EXPORT2
unorm_normalize(const UChar *src, int32_t srcLength,
                UNormalizationMode mode, int32_t options,
                UChar *dest, int32_t destCapacity,
                UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if(options&UNORM_UNICODE_3_2) {
        FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
        return unorm2_normalize((const UNormalizer2 *)&fn2,
            src, srcLength, dest, destCapacity, pErrorCode);
    } else {
        return unorm2_normalize((const UNormalizer2 *)n2,
            src, srcLength, dest, destCapacity, pErrorCode);
    }
}


/* iteration functions ------------------------------------------------------ */

static int32_t
unorm_iterate(UCharIterator *src, UBool forward,
              UChar *dest, int32_t destCapacity,
              UNormalizationMode mode, int32_t options,
              UBool doNormalize, UBool *pNeededToNormalize,
              UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    const UnicodeSet *uni32;
    if(options&UNORM_UNICODE_3_2) {
        uni32=uniset_getUnicode32Instance(*pErrorCode);
    } else {
        uni32=NULL;  // unused
    }
    FilteredNormalizer2 fn2(*n2, *uni32);
    if(options&UNORM_UNICODE_3_2) {
        n2=&fn2;
    }
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
        src==NULL
    ) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if(pNeededToNormalize!=NULL) {
        *pNeededToNormalize=FALSE;
    }
    if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) {
        return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
    }

    UnicodeString buffer;
    UChar32 c;
    if(forward) {
        /* get one character and ignore its properties */
        buffer.append(uiter_next32(src));
        /* get all following characters until we see a boundary */
        while((c=uiter_next32(src))>=0) {
            if(n2->hasBoundaryBefore(c)) {
                /* back out the latest movement to stop at the boundary */
                src->move(src, -U16_LENGTH(c), UITER_CURRENT);
                break;
            } else {
                buffer.append(c);
            }
        }
    } else {
        while((c=uiter_previous32(src))>=0) {
            /* always write this character to the front of the buffer */
            buffer.insert(0, c);
            /* stop if this just-copied character is a boundary */
            if(n2->hasBoundaryBefore(c)) {
                break;
            }
        }
    }

    UnicodeString destString(dest, 0, destCapacity);
    if(buffer.length()>0 && doNormalize) {
        n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode);
        if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) {
            *pNeededToNormalize= destString!=buffer;
        }
        return destString.length();
    } else {
        /* just copy the source characters */
        return buffer.extract(dest, destCapacity, *pErrorCode);
    }
}

U_CAPI int32_t U_EXPORT2
unorm_previous(UCharIterator *src,
               UChar *dest, int32_t destCapacity,
               UNormalizationMode mode, int32_t options,
               UBool doNormalize, UBool *pNeededToNormalize,
               UErrorCode *pErrorCode) {
    return unorm_iterate(src, FALSE,
                         dest, destCapacity,
                         mode, options,
                         doNormalize, pNeededToNormalize,
                         pErrorCode);
}

U_CAPI int32_t U_EXPORT2
unorm_next(UCharIterator *src,
           UChar *dest, int32_t destCapacity,
           UNormalizationMode mode, int32_t options,
           UBool doNormalize, UBool *pNeededToNormalize,
           UErrorCode *pErrorCode) {
    return unorm_iterate(src, TRUE,
                         dest, destCapacity,
                         mode, options,
                         doNormalize, pNeededToNormalize,
                         pErrorCode);
}

/* Concatenation of normalized strings -------------------------------------- */

U_CAPI int32_t U_EXPORT2
unorm_concatenate(const UChar *left, int32_t leftLength,
                  const UChar *right, int32_t rightLength,
                  UChar *dest, int32_t destCapacity,
                  UNormalizationMode mode, int32_t options,
                  UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    const UnicodeSet *uni32;
    if(options&UNORM_UNICODE_3_2) {
        uni32=uniset_getUnicode32Instance(*pErrorCode);
    } else {
        uni32=NULL;  // unused
    }
    FilteredNormalizer2 fn2(*n2, *uni32);
    if(options&UNORM_UNICODE_3_2) {
        n2=&fn2;
    }
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
        left==NULL || leftLength<-1 ||
        right==NULL || rightLength<-1
    ) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* check for overlapping right and destination */
    if( dest!=NULL &&
        ((right>=dest && right<(dest+destCapacity)) ||
         (rightLength>0 && dest>=right && dest<(right+rightLength)))
    ) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* allow left==dest */
    UnicodeString destString;
    if(left==dest) {
        destString.setTo(dest, leftLength, destCapacity);
    } else {
        destString.setTo(dest, 0, destCapacity);
        destString.append(left, leftLength);
    }
    return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode).
           extract(dest, destCapacity, *pErrorCode);
}

#endif /* #if !UCONFIG_NO_NORMALIZATION */