scuffed-code/icu4c/source/common/unorm.cpp

673 lines
22 KiB
C++
Raw Normal View History

/*
******************************************************************************
* Copyright (c) 1996-2010, International Business Machines
* Corporation and others. All Rights Reserved.
******************************************************************************
* File unorm.cpp
*
* Created by: Vladimir Weinstein 12052000
*
* Modification history :
*
* Date Name Description
* 02/01/01 synwee Added normalization quickcheck enum and method.
* 02/12/01 synwee Commented out quickcheck util api has been approved
* Added private method for doing FCD checks
* 02/23/01 synwee Modified quickcheck and checkFCE to run through
* string for codepoints < 0x300 for the normalization
* mode NFC.
* 05/25/01+ Markus Scherer total rewrite, implement all normalization here
* instead of just wrappers around normlzr.cpp,
* load unorm.dat, support Unicode 3.1 with
* supplementary code points, etc.
* 2009-nov..2010-jan Markus Scherer total rewrite, new Normalizer2 API & code
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/udata.h"
#include "unicode/uchar.h"
#include "unicode/ustring.h"
#include "unicode/uiter.h"
#include "unicode/unorm.h"
#include "normalizer2impl.h"
#include "ucln_cmn.h"
#include "unormimp.h"
#include "uprops.h"
#include "cmemory.h"
#include "umutex.h"
#include "utrie2.h"
#include "unicode/uset.h"
#include "putilimp.h"
/* Element count of a true array (not a pointer), converted to int32_t. */
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
U_NAMESPACE_USE
/*
* This new implementation of the normalization code loads its data from
* unorm.dat, which is generated with the gennorm tool.
* The format of that file is described in unormimp.h .
*/
/* load unorm.dat ----------------------------------------------------------- */
/*
 * Compile-time switch for the data source:
 * 1 = use the generated source file #included below,
 * 0 = load the data item at runtime with udata_openChoice() (see loadNormData()).
 */
#define UNORM_HARDCODE_DATA 1
#if UNORM_HARDCODE_DATA
/* unorm_props_data.c is machine-generated by gennorm --csource */
#include "unorm_props_data.c"
/* with hardcoded data the "format version >= 2.2" flag is a compile-time TRUE */
static const UBool formatVersion_2_2=TRUE;
#else
/* name and type arguments passed to udata_openChoice() */
#define DATA_NAME "unorm"
#define DATA_TYPE "icu"
/* handle of the opened data item; NULL while nothing is loaded */
static UDataMemory *normData=NULL;
/* error code recorded by the load attempt, handed back by _haveData() after a failure */
static UErrorCode dataErrorCode=U_ZERO_ERROR;
/* load state: 0=not attempted yet, 1=loaded, -1=load failed */
static int8_t haveNormData=0;
/* copy of the first _NORM_INDEX_TOP int32_t values of the loaded data */
static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
/* tries unserialized from the loaded data by loadNormData() */
static UTrie normTrie={ 0,0,0,0,0,0,0 }, fcdTrie={ 0,0,0,0,0,0,0 }, auxTrie={ 0,0,0,0,0,0,0 };
/*
* pointers into the memory-mapped unorm.icu
*/
static const uint16_t *extraData=NULL,
*combiningTable=NULL,
*canonStartSets=NULL;
/* format version of the loaded data, copied from UDataInfo in isAcceptable() */
static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
/* derived flags: format version is at least 2.1 / at least 2.2 */
static UBool formatVersion_2_1=FALSE, formatVersion_2_2=FALSE;
/* the Unicode version of the normalization data */
static UVersionInfo dataVersion={ 0, 0, 0, 0 };
#endif
U_CDECL_BEGIN
/*
 * Library cleanup callback (registered via ucln_common_registerCleanup()
 * when the data is loaded at runtime).
 * Closes the runtime-loaded data item, if any, and resets the load state
 * so that a later call can load the data again. With hardcoded data there
 * is nothing to release.
 * Always returns TRUE.
 */
static UBool U_CALLCONV
unorm_cleanup(void) {
#if !UNORM_HARDCODE_DATA
    if(normData!=NULL) {
        udata_close(normData);
    }
    normData=NULL;

    /* back to the "not attempted yet" state */
    haveNormData=0;
    dataErrorCode=U_ZERO_ERROR;
#endif
    return TRUE;
}
#if !UNORM_HARDCODE_DATA
/*
 * udata_openChoice() callback: decides whether a candidate data item is
 * usable as normalization data.
 *
 * Accepts the item only if its UDataInfo is large enough, matches this
 * platform's endianness and charset family, carries the data format
 * signature "Norm", has major format version 2, and was built with the
 * same UTRIE_SHIFT/UTRIE_INDEX_SHIFT as this code.
 * On acceptance, the item's format version and data version are copied
 * into the file-scope formatVersion and dataVersion.
 *
 * The context, type and name parameters are unused.
 * Returns TRUE if the item is acceptable, FALSE otherwise.
 */
static UBool U_CALLCONV
isAcceptable(void * /* context */,
             const char * /* type */, const char * /* name */,
             const UDataInfo *pInfo) {
    /* dataFormat="Norm" */
    static const uint8_t expectedFormat[4]={ 0x4e, 0x6f, 0x72, 0x6d };
    int32_t k;

    if(pInfo->size<20) {
        return FALSE;
    }
    if(pInfo->isBigEndian!=U_IS_BIG_ENDIAN) {
        return FALSE;
    }
    if(pInfo->charsetFamily!=U_CHARSET_FAMILY) {
        return FALSE;
    }
    for(k=0; k<4; ++k) {
        if(pInfo->dataFormat[k]!=expectedFormat[k]) {
            return FALSE;
        }
    }
    if(pInfo->formatVersion[0]!=2) {
        return FALSE;
    }
    if(pInfo->formatVersion[2]!=UTRIE_SHIFT) {
        return FALSE;
    }
    if(pInfo->formatVersion[3]!=UTRIE_INDEX_SHIFT) {
        return FALSE;
    }

    /* remember the versions of the data that is being accepted */
    uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
    uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
    return TRUE;
}
#endif
/*
 * Trie enumeration callback used by unorm_addPropertyStarts().
 * context is the USetAdder to report to; only the start of each range is
 * added, the range end and the range value are ignored.
 * Always returns TRUE.
 */
static UBool U_CALLCONV
_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
    const USetAdder *adder=(const USetAdder *)context;

    /* record where this same-value range begins */
    adder->add(adder->set, start);
    return TRUE;
}
U_CDECL_END
#if !UNORM_HARDCODE_DATA
/*
 * Loads the normalization data item at runtime and publishes it in the
 * file-scope variables (normData, indexes, the three tries, extraData,
 * combiningTable, canonStartSets, formatVersion_2_1/_2_2).
 *
 * errorCode: in/out ICU error code. If it already indicates a failure on
 *            entry, nothing is loaded and 0 is returned.
 * Returns the resulting haveNormData value: 1 after a successful load,
 * -1 if opening or unserializing the data failed (the error is also stored
 * in dataErrorCode).
 */
static int8_t
loadNormData(UErrorCode &errorCode) {
/* load Unicode normalization data from file */
/*
* This lazy initialization with double-checked locking (without mutex protection for
* haveNormData==0) is transiently unsafe under certain circumstances.
* Check the readme and use u_init() if necessary.
*
* While u_init() initializes the main normalization data via this function,
* it does not do so for exclusion sets (which are fully mutexed).
* This is because
* - there can be many exclusion sets
* - they are rarely used
* - they are not usually used in execution paths that are
* as performance-sensitive as others
* (e.g., IDNA takes more time than unorm_quickCheck() anyway)
*
* TODO: Remove code in support for non-hardcoded data. u_init() is now advertised
* as not being required for thread safety, and we can't reasonably
* revert to requiring it.
*/
if(haveNormData==0) {
/* local tries are filled first and only copied to the globals inside the mutex block */
UTrie _normTrie={ 0,0,0,0,0,0,0 }, _fcdTrie={ 0,0,0,0,0,0,0 }, _auxTrie={ 0,0,0,0,0,0,0 };
UDataMemory *data;
const int32_t *p=NULL;
const uint8_t *pb;
/* NOTE(review): errorCode is a reference, so the &errorCode==NULL test looks redundant - confirm */
if(&errorCode==NULL || U_FAILURE(errorCode)) {
return 0;
}
/* open the data outside the mutex block */
data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
dataErrorCode=errorCode;
if(U_FAILURE(errorCode)) {
/* NOTE(review): this failure state is written without holding the mutex */
return haveNormData=-1;
}
/* p points at the int32_t indexes; the serialized tries and tables follow them */
p=(const int32_t *)udata_getMemory(data);
pb=(const uint8_t *)(p+_NORM_INDEX_TOP);
utrie_unserialize(&_normTrie, pb, p[_NORM_INDEX_TRIE_SIZE], &errorCode);
_normTrie.getFoldingOffset=getFoldingNormOffset;
/* skip the main trie and the two 16-bit tables (counts are in 16-bit units, hence *2) */
pb+=p[_NORM_INDEX_TRIE_SIZE]+p[_NORM_INDEX_UCHAR_COUNT]*2+p[_NORM_INDEX_COMBINE_DATA_COUNT]*2;
/* the FCD and auxiliary tries are only unserialized if their size index is non-zero */
if(p[_NORM_INDEX_FCD_TRIE_SIZE]!=0) {
utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode);
}
pb+=p[_NORM_INDEX_FCD_TRIE_SIZE];
if(p[_NORM_INDEX_AUX_TRIE_SIZE]!=0) {
utrie_unserialize(&_auxTrie, pb, p[_NORM_INDEX_AUX_TRIE_SIZE], &errorCode);
_auxTrie.getFoldingOffset=getFoldingAuxOffset;
}
/* a single check covers all of the utrie_unserialize() calls above */
if(U_FAILURE(errorCode)) {
dataErrorCode=errorCode;
udata_close(data);
return haveNormData=-1;
}
/* in the mutex block, set the data for this process */
umtx_lock(NULL);
if(normData==NULL) {
/* this call's copy becomes the process-wide data; data=NULL marks it as handed over */
normData=data;
data=NULL;
uprv_memcpy(&indexes, p, sizeof(indexes));
uprv_memcpy(&normTrie, &_normTrie, sizeof(UTrie));
uprv_memcpy(&fcdTrie, &_fcdTrie, sizeof(UTrie));
uprv_memcpy(&auxTrie, &_auxTrie, sizeof(UTrie));
} else {
/* normData was already set: derive the pointers below from that data, not from ours */
p=(const int32_t *)udata_getMemory(normData);
}
/* initialize some variables */
extraData=(uint16_t *)((uint8_t *)(p+_NORM_INDEX_TOP)+indexes[_NORM_INDEX_TRIE_SIZE]);
combiningTable=extraData+indexes[_NORM_INDEX_UCHAR_COUNT];
formatVersion_2_1=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=1);
formatVersion_2_2=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=2);
/* canonStartSets is only set for format version 2.1 or later; otherwise it stays NULL */
if(formatVersion_2_1) {
/* the trie sizes are in bytes; /2 converts them to uint16_t units */
canonStartSets=combiningTable+
indexes[_NORM_INDEX_COMBINE_DATA_COUNT]+
(indexes[_NORM_INDEX_FCD_TRIE_SIZE]+indexes[_NORM_INDEX_AUX_TRIE_SIZE])/2;
}
haveNormData=1;
ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
umtx_unlock(NULL);
/* if a different thread set it first, then close the extra data */
if(data!=NULL) {
udata_close(data); /* NULL if it was set correctly */
}
}
return haveNormData;
}
#endif
/*
 * Reports whether the normalization data is available.
 *
 * With hardcoded data this is simply "errorCode is not a failure".
 * With runtime-loaded data, a previously recorded load failure is copied
 * into errorCode, and the first call triggers loadNormData().
 *
 * errorCode: in/out ICU error code; a failure on entry yields FALSE.
 * Returns TRUE if the data can be used.
 */
static inline UBool
_haveData(UErrorCode &errorCode) {
#if UNORM_HARDCODE_DATA
    return U_SUCCESS(errorCode);
#else
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    if(haveNormData>0) {
        /* already loaded */
        return TRUE;
    }
    if(haveNormData<0) {
        /* an earlier load attempt failed; report its error again */
        errorCode=dataErrorCode;
        return FALSE;
    }
    /* haveNormData==0: not attempted yet */
    return (UBool)(loadNormData(errorCode)>0);
#endif
}
/*
 * C-callable wrapper around _haveData().
 *
 * pErrorCode: in/out ICU error code. A NULL pointer is rejected with FALSE
 *             instead of being dereferenced, consistent with the
 *             pErrorCode==NULL check in u_getFC_NFKC_Closure().
 * Returns TRUE if the normalization data is available.
 */
U_CAPI UBool U_EXPORT2
unorm_haveData(UErrorCode *pErrorCode) {
    if(pErrorCode==NULL) {
        return FALSE;
    }
    return _haveData(*pErrorCode);
}
/* normalization properties ------------------------------------------------- */
/*
 * Looks up c in the auxiliary trie and reports whether its
 * _NORM_AUX_UNSAFE_MASK bits are all clear.
 * Returns FALSE if the data (when loaded at runtime) or the auxiliary trie
 * is not available.
 */
U_CFUNC UBool U_EXPORT2
unorm_isCanonSafeStart(UChar32 c) {
#if !UNORM_HARDCODE_DATA
    UErrorCode errorCode=U_ZERO_ERROR;
    if(!_haveData(errorCode)) {
        return FALSE;
    }
#endif
    if(auxTrie.index==NULL) {
        return FALSE;
    }

    const uint16_t auxValue=UTRIE2_GET16(&auxTrie, c);
    return (UBool)((auxValue&_NORM_AUX_UNSAFE_MASK)==0);
}
/*
 * Copies the 4-byte Unicode version of the normalization data into
 * *versionInfo.
 * If unorm_haveData() reports that the data is unavailable, *versionInfo
 * is left untouched; pErrorCode is passed on to unorm_haveData().
 */
U_CAPI void U_EXPORT2
unorm_getUnicodeVersion(UVersionInfo *versionInfo, UErrorCode *pErrorCode){
    if(!unorm_haveData(pErrorCode)) {
        return;
    }
    uprv_memcpy(*versionInfo, dataVersion, 4);
}
/*
 * Looks up the canonical start set for code point c and stores it in *fillSet.
 *
 * The data behind canonStartSets holds the serialized sets followed by two
 * sorted search tables, one for BMP code points (pairs { c, result }) and
 * one for supplementary code points (triplets { high(c), low(c), result }).
 * See unormimp.h for details.
 *
 * c:       code point to look up; values above 0x10ffff (or negative) yield FALSE
 * fillSet: receives the set; NULL yields FALSE
 * Returns TRUE if a single-code point set was stored, the result of
 * uset_getSerializedSet() if the entry refers to a serialized set, and
 * FALSE if c has no entry or the data is not available.
 */
U_CAPI UBool U_EXPORT2
unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) {
    if(fillSet==NULL || (uint32_t)c>0x10ffff) {
        return FALSE;
    }
#if !UNORM_HARDCODE_DATA
    UErrorCode errorCode=U_ZERO_ERROR;
    if(!_haveData(errorCode)) {
        return FALSE;
    }
#endif
    if(canonStartSets==NULL) {
        return FALSE;
    }

    /* length of the serialized-sets area; the search tables start right after it */
    const int32_t setsLength=canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH];
    const int32_t bmpTableLength=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];

    if(c<=0xffff) {
        const uint16_t *pairs=canonStartSets+setsLength;
        int32_t lo=0;
        int32_t hi=bmpTableLength;

        /* binary search; each entry is a pair { c, result } */
        while(lo<hi-2) {
            /* (lo+hi)/2 and address pairs */
            const int32_t mid=(uint16_t)(((lo+hi)/4)*2);
            if(c<pairs[mid]) {
                hi=mid;
            } else {
                lo=mid;
            }
        }

        if(c!=pairs[lo]) {
            return FALSE; /* not found */
        }

        int32_t result=pairs[lo+1];
        if((result&_NORM_CANON_SET_BMP_MASK)==_NORM_CANON_SET_BMP_IS_INDEX) {
            /* result 01xxxxxx xxxxxx contains index x to a USerializedSet */
            result&=(_NORM_MAX_CANON_SETS-1);
            return uset_getSerializedSet(fillSet,
                                         canonStartSets+result,
                                         setsLength-result);
        }
        /* other result values are BMP code points for single-code point sets */
        uset_setSerializedToOne(fillSet, (UChar32)result);
        return TRUE;
    }

    /* supplementary code point: the triplet table follows the BMP table */
    const uint16_t *triplets=canonStartSets+setsLength+bmpTableLength;
    const uint16_t highWord=(uint16_t)(c>>16);
    const uint16_t lowWord=(uint16_t)c;
    int32_t lo=0;
    int32_t hi=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];

    /* binary search; each entry is a triplet { high(c), low(c), result } */
    while(lo<hi-3) {
        /* (lo+hi)/2 and address triplets */
        const int32_t mid=(uint16_t)(((lo+hi)/6)*3);
        const uint16_t entryHigh=triplets[mid]&0x1f; /* high word */
        if(highWord<entryHigh || (highWord==entryHigh && lowWord<triplets[mid+1])) {
            hi=mid;
        } else {
            lo=mid;
        }
    }

    const uint16_t first=triplets[lo];
    if(highWord!=(first&0x1f) || lowWord!=triplets[lo+1]) {
        return FALSE; /* not found */
    }

    int32_t result=triplets[lo+2];
    if((first&0x8000)==0) {
        /* the result is an index to a USerializedSet */
        return uset_getSerializedSet(fillSet,
                                     canonStartSets+result,
                                     setsLength-result);
    }
    /*
     * single-code point set {x} in
     * triplet { 100xxxxx 000hhhhh  llllllll llllllll  xxxxxxxx xxxxxxxx }
     */
    result|=((int32_t)first&0x1f00)<<8; /* add high bits from high(c) */
    uset_setSerializedToOne(fillSet, (UChar32)result);
    return TRUE;
}
/*
 * Writes the FC_NFKC_Closure string for code point c into dest.
 *
 * The string is located through the _NORM_AUX_FNC_MASK bits of the
 * auxiliary trie value for c, which index into extraData. A zero index
 * (or unavailable data) means an empty string.
 *
 * c:            code point to look up
 * dest:         output buffer; may be NULL only if destCapacity is 0
 * destCapacity: capacity of dest in UChars; must not be negative
 * pErrorCode:   in/out ICU error code; NULL or a failure on entry returns 0.
 *               Set to U_ILLEGAL_ARGUMENT_ERROR for a bad dest/destCapacity.
 * Returns the value of u_terminateUChars() for the string length; the
 * string is copied only if it fits into destCapacity.
 */
U_CAPI int32_t U_EXPORT2
u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) {
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    uint16_t fncIndex=0;
    if(_haveData(*pErrorCode) && auxTrie.index!=NULL) {
        fncIndex=UTRIE2_GET16(&auxTrie, c);
        fncIndex&=_NORM_AUX_FNC_MASK;
    }
    if(fncIndex==0) {
        /* no closure string for c */
        return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
    }

    const UChar *closure=(const UChar *)(extraData+fncIndex);
    int32_t closureLength=1; /* default: closure points to the single-unit string */
    if(*closure>=0xff00) {
        /* the first unit carries the length in its low byte; the text follows it */
        closureLength=*closure&0xff;
        ++closure;
    }
    if(0<closureLength && closureLength<=destCapacity) {
        uprv_memcpy(dest, closure, closureLength*U_SIZEOF_UCHAR);
    }
    return u_terminateUChars(dest, destCapacity, closureLength, pErrorCode);
}
/*
 * Reports to sa every code point at which a property value taken from this
 * file's data may change.
 *
 * sa:         receives the code points through sa->add(sa->set, ...)
 * pErrorCode: in/out ICU error code; if the data is unavailable, nothing is added
 */
U_CAPI void U_EXPORT2
unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
    if(!_haveData(*pErrorCode)) {
        return;
    }

    /* add the start code point of each same-value range of each trie */
    if(auxTrie.index!=NULL) {
        utrie2_enum(&auxTrie, NULL, _enumPropertyStartsRange, sa);
    }

    /* add Hangul LV syllables and LV+1 because of skippables */
    UChar lvSyllable=Hangul::HANGUL_BASE;
    while(lvSyllable<Hangul::HANGUL_LIMIT) {
        sa->add(sa->set, lvSyllable);
        sa->add(sa->set, lvSyllable+1);
        lvSyllable+=Hangul::JAMO_T_COUNT;
    }

    /* add Hangul+1 to continue with other properties */
    sa->add(sa->set, Hangul::HANGUL_LIMIT);
}
/* quick check functions ---------------------------------------------------- */
/*
 * Old-API quick check: selects the Normalizer2 instance for mode and
 * forwards to unorm2_quickCheck().
 *
 * pErrorCode is dereferenced and therefore must not be NULL.
 * Returns whatever unorm2_quickCheck() returns.
 */
U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheck(const UChar *src,
                 int32_t srcLength,
                 UNormalizationMode mode,
                 UErrorCode *pErrorCode) {
    const Normalizer2 *normalizer=Normalizer2Factory::getInstance(mode, *pErrorCode);
    const UNormalizer2 *cNormalizer=(const UNormalizer2 *)normalizer;
    return unorm2_quickCheck(cNormalizer, src, srcLength, pErrorCode);
}
/*
 * Old-API quick check with options: selects the Normalizer2 instance for
 * mode and forwards to unorm2_quickCheck(). If UNORM_UNICODE_3_2 is set in
 * options, the normalizer is wrapped in a FilteredNormalizer2 that uses the
 * set from uniset_getUnicode32Instance().
 *
 * pErrorCode is dereferenced and therefore must not be NULL.
 * Returns whatever unorm2_quickCheck() returns.
 */
U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
                            UNormalizationMode mode, int32_t options,
                            UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if(options&UNORM_UNICODE_3_2) {
        const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode);
        /*
         * Only dereference n2 and uni32 after both lookups succeeded;
         * neither pointer is assumed to be valid after a failure.
         */
        if(U_SUCCESS(*pErrorCode)) {
            FilteredNormalizer2 fn2(*n2, *uni32);
            return unorm2_quickCheck((const UNormalizer2 *)&fn2, src, srcLength, pErrorCode);
        }
        /* on failure fall through: unorm2_quickCheck() sees the failed error code */
    }
    return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
}
/*
 * Old-API normalization test: selects the Normalizer2 instance for mode
 * and forwards to unorm2_isNormalized().
 *
 * pErrorCode is dereferenced and therefore must not be NULL.
 * Returns whatever unorm2_isNormalized() returns.
 */
U_CAPI UBool U_EXPORT2
unorm_isNormalized(const UChar *src, int32_t srcLength,
                   UNormalizationMode mode,
                   UErrorCode *pErrorCode) {
    const Normalizer2 *normalizer=Normalizer2Factory::getInstance(mode, *pErrorCode);
    const UNormalizer2 *cNormalizer=(const UNormalizer2 *)normalizer;
    return unorm2_isNormalized(cNormalizer, src, srcLength, pErrorCode);
}
/*
 * Old-API normalization test with options: selects the Normalizer2
 * instance for mode and forwards to unorm2_isNormalized(). If
 * UNORM_UNICODE_3_2 is set in options, the normalizer is wrapped in a
 * FilteredNormalizer2 that uses the set from uniset_getUnicode32Instance().
 *
 * pErrorCode is dereferenced and therefore must not be NULL.
 * Returns whatever unorm2_isNormalized() returns.
 */
U_CAPI UBool U_EXPORT2
unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
                              UNormalizationMode mode, int32_t options,
                              UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if(options&UNORM_UNICODE_3_2) {
        const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode);
        /*
         * Only dereference n2 and uni32 after both lookups succeeded;
         * neither pointer is assumed to be valid after a failure.
         */
        if(U_SUCCESS(*pErrorCode)) {
            FilteredNormalizer2 fn2(*n2, *uni32);
            return unorm2_isNormalized((const UNormalizer2 *)&fn2, src, srcLength, pErrorCode);
        }
        /* on failure fall through: unorm2_isNormalized() sees the failed error code */
    }
    return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
}
/* normalize() API ---------------------------------------------------------- */
/**
 * Public API for normalizing.
 *
 * Selects the Normalizer2 instance for mode and forwards to
 * unorm2_normalize(). If UNORM_UNICODE_3_2 is set in options, the
 * normalizer is wrapped in a FilteredNormalizer2 that uses the set from
 * uniset_getUnicode32Instance().
 *
 * pErrorCode is dereferenced and therefore must not be NULL.
 * Returns whatever unorm2_normalize() returns.
 */
U_CAPI int32_t U_EXPORT2
unorm_normalize(const UChar *src, int32_t srcLength,
                UNormalizationMode mode, int32_t options,
                UChar *dest, int32_t destCapacity,
                UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if(options&UNORM_UNICODE_3_2) {
        const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode);
        /*
         * Only dereference n2 and uni32 after both lookups succeeded;
         * neither pointer is assumed to be valid after a failure.
         */
        if(U_SUCCESS(*pErrorCode)) {
            FilteredNormalizer2 fn2(*n2, *uni32);
            return unorm2_normalize((const UNormalizer2 *)&fn2,
                                    src, srcLength, dest, destCapacity, pErrorCode);
        }
        /* on failure fall through: unorm2_normalize() sees the failed error code */
    }
    return unorm2_normalize((const UNormalizer2 *)n2,
                            src, srcLength, dest, destCapacity, pErrorCode);
}
/* iteration functions ------------------------------------------------------ */
/*
 * Worker for unorm_iterate(): reads one boundary-delimited segment from the
 * iterator and writes it, optionally normalized, to dest.
 *
 * src:                iterator to read from; NULL is an illegal argument
 * forward:            TRUE to read forward, FALSE to read backward
 * dest, destCapacity: output buffer; dest may be NULL only if destCapacity is 0
 * n2:                 normalizer that defines the boundaries and performs the
 *                     normalization; must be valid when *pErrorCode is a success
 * doNormalize:        if FALSE, the segment is copied without normalizing
 * pNeededToNormalize: if not NULL, set to FALSE first and, after a successful
 *                     normalization, to whether the output differs from the input
 * pErrorCode:         in/out ICU error code; a failure on entry returns 0
 * Returns the length of the output string.
 */
static int32_t
_iterate(UCharIterator *src, UBool forward,
         UChar *dest, int32_t destCapacity,
         const Normalizer2 *n2,
         UBool doNormalize, UBool *pNeededToNormalize,
         UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
        src==NULL
    ) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if(pNeededToNormalize!=NULL) {
        *pNeededToNormalize=FALSE;
    }
    if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) {
        /* nothing left to read in this direction */
        return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
    }

    UnicodeString buffer;
    UChar32 c;
    if(forward) {
        /* get one character and ignore its properties */
        buffer.append(uiter_next32(src));
        /* get all following characters until we see a boundary */
        while((c=uiter_next32(src))>=0) {
            if(n2->hasBoundaryBefore(c)) {
                /* back out the latest movement to stop at the boundary */
                src->move(src, -U16_LENGTH(c), UITER_CURRENT);
                break;
            } else {
                buffer.append(c);
            }
        }
    } else {
        while((c=uiter_previous32(src))>=0) {
            /* always write this character to the front of the buffer */
            buffer.insert(0, c);
            /* stop if this just-copied character is a boundary */
            if(n2->hasBoundaryBefore(c)) {
                break;
            }
        }
    }

    UnicodeString destString(dest, 0, destCapacity);
    if(buffer.length()>0 && doNormalize) {
        n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode);
        if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) {
            *pNeededToNormalize= destString!=buffer;
        }
        return destString.length();
    } else {
        /* just copy the source characters */
        return buffer.extract(dest, destCapacity, *pErrorCode);
    }
}

/*
 * Shared implementation of unorm_next() and unorm_previous().
 *
 * Selects the Normalizer2 instance for mode and, if UNORM_UNICODE_3_2 is set
 * in options, wraps it in a FilteredNormalizer2 that uses the set from
 * uniset_getUnicode32Instance(). The FilteredNormalizer2 is only constructed
 * in that case and only after both lookups succeeded, so that no reference
 * is ever bound through a NULL pointer.
 * All remaining parameters are passed on to _iterate().
 *
 * pErrorCode is dereferenced and therefore must not be NULL.
 * Returns the length of the output string, or 0 on failure.
 */
static int32_t
unorm_iterate(UCharIterator *src, UBool forward,
              UChar *dest, int32_t destCapacity,
              UNormalizationMode mode, int32_t options,
              UBool doNormalize, UBool *pNeededToNormalize,
              UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if(options&UNORM_UNICODE_3_2) {
        const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            return 0;
        }
        FilteredNormalizer2 fn2(*n2, *uni32);
        return _iterate(src, forward, dest, destCapacity,
                        &fn2, doNormalize, pNeededToNormalize, pErrorCode);
    }
    return _iterate(src, forward, dest, destCapacity,
                    n2, doNormalize, pNeededToNormalize, pErrorCode);
}
/*
 * Backward iteration entry point: calls unorm_iterate() with forward=FALSE
 * and passes every other argument through unchanged.
 * Returns whatever unorm_iterate() returns.
 */
U_CAPI int32_t U_EXPORT2
unorm_previous(UCharIterator *src,
               UChar *dest, int32_t destCapacity,
               UNormalizationMode mode, int32_t options,
               UBool doNormalize, UBool *pNeededToNormalize,
               UErrorCode *pErrorCode) {
    const UBool readForward=FALSE;
    return unorm_iterate(src, readForward, dest, destCapacity, mode, options,
                         doNormalize, pNeededToNormalize, pErrorCode);
}
/*
 * Forward iteration entry point: calls unorm_iterate() with forward=TRUE
 * and passes every other argument through unchanged.
 * Returns whatever unorm_iterate() returns.
 */
U_CAPI int32_t U_EXPORT2
unorm_next(UCharIterator *src,
           UChar *dest, int32_t destCapacity,
           UNormalizationMode mode, int32_t options,
           UBool doNormalize, UBool *pNeededToNormalize,
           UErrorCode *pErrorCode) {
    const UBool readForward=TRUE;
    return unorm_iterate(src, readForward, dest, destCapacity, mode, options,
                         doNormalize, pNeededToNormalize, pErrorCode);
}
/* Concatenation of normalized strings -------------------------------------- */
U_CAPI int32_t U_EXPORT2
unorm_concatenate(const UChar *left, int32_t leftLength,
const UChar *right, int32_t rightLength,
UChar *dest, int32_t destCapacity,
UNormalizationMode mode, int32_t options,
UErrorCode *pErrorCode) {
const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
const UnicodeSet *uni32;
if(options&UNORM_UNICODE_3_2) {
uni32=uniset_getUnicode32Instance(*pErrorCode);
} else {
uni32=NULL; // unused
}
FilteredNormalizer2 fn2(*n2, *uni32);
if(options&UNORM_UNICODE_3_2) {
n2=&fn2;
}
if(U_FAILURE(*pErrorCode)) {
return 0;
}
if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
left==NULL || leftLength<-1 ||
right==NULL || rightLength<-1
) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
/* check for overlapping right and destination */
if( dest!=NULL &&
((right>=dest && right<(dest+destCapacity)) ||
(rightLength>0 && dest>=right && dest<(right+rightLength)))
) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
/* allow left==dest */
UnicodeString destString;
if(left==dest) {
destString.setTo(dest, leftLength, destCapacity);
} else {
destString.setTo(dest, 0, destCapacity);
destString.append(left, leftLength);
}
return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode).
extract(dest, destCapacity, *pErrorCode);
}
#endif /* #if !UCONFIG_NO_NORMALIZATION */