/* ****************************************************************************** * Copyright (c) 1996-2001, International Business Machines * Corporation and others. All Rights Reserved. ****************************************************************************** * File unorm.cpp * * Created by: Vladimir Weinstein 12052000 * * Modification history : * * Date Name Description * 02/01/01 synwee Added normalization quickcheck enum and method. * 02/12/01 synwee Commented out quickcheck util api has been approved * Added private method for doing FCD checks * 02/23/01 synwee Modified quickcheck and checkFCE to run through * string for codepoints < 0x300 for the normalization * mode NFC. */ #include "unicode/utypes.h" #include "unicode/unorm.h" #include "unicode/normlzr.h" #include "unicode/ustring.h" #include "unicode/udata.h" #include "cpputils.h" #include "ustr_imp.h" #include "umutex.h" #include "unormimp.h" /* added by synwee ### TODO: remove once the new implementation is finished */ #include "unicode/uchar.h" #include "unicode/utf16.h" /* ### TODO: remove this once the new implementation is finished */ static UBool useNewImplementation=FALSE; U_CAPI void U_EXPORT2 unorm_setNewImplementation(UBool useNew) { useNewImplementation=useNew; } U_CAPI UBool U_EXPORT2 unorm_usesNewImplementation() { return useNewImplementation; } /* new implementation ------------------------------------------------------- */ /* Korean Hangul and Jamo constants */ enum { JAMO_L_BASE=0x1100, /* "lead" jamo */ JAMO_V_BASE=0x1161, /* "vowel" jamo */ JAMO_T_BASE=0x11a7, /* "trail" jamo */ HANGUL_BASE=0xac00, JAMO_L_COUNT=19, JAMO_V_COUNT=21, JAMO_T_COUNT=28, HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT }; inline UBool isHangulWithoutJamo3(UChar c) { c-=HANGUL_BASE; return csize>=20 && pInfo->isBigEndian==U_IS_BIG_ENDIAN && pInfo->charsetFamily==U_CHARSET_FAMILY && pInfo->dataFormat[0]==0x4e && /* dataFormat="Norm" */ pInfo->dataFormat[1]==0x6f && pInfo->dataFormat[2]==0x72 && pInfo->dataFormat[3]==0x6d && pInfo->formatVersion[0]==1 && pInfo->formatVersion[3]==_NORM_TRIE_SHIFT ) { uprv_memcpy(dataVersion, pInfo->dataVersion, 4); return TRUE; } else { return FALSE; } } static int8_t loadNormData(UErrorCode &errorCode) { /* load Unicode normalization data from file */ if(haveNormData==0) { UDataMemory *data; const uint16_t *p=NULL; if(&errorCode==NULL || U_FAILURE(errorCode)) { return 0; } /* open the data outside the mutex block */ data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode); dataErrorCode=errorCode; if(U_FAILURE(errorCode)) { return haveNormData=-1; } p=(const uint16_t *)udata_getMemory(data); /* in the mutex block, set the data for this process */ umtx_lock(NULL); if(normData==NULL) { normData=data; data=NULL; indexes=p; p=NULL; } umtx_unlock(NULL); /* initialize some variables */ normTrieIndex=indexes+indexes[_NORM_INDEX_COUNT]; extraData=normTrieIndex+indexes[_NORM_INDEX_TRIE_INDEX_COUNT]+2*indexes[_NORM_INDEX_TRIE_DATA_COUNT]; combiningTable=extraData+indexes[_NORM_INDEX_UCHAR_COUNT]; fcdTrieIndex=combiningTable+indexes[_NORM_INDEX_COMBINE_DATA_COUNT]; haveNormData=1; /* if a different thread set it first, then close the extra data */ if(data!=NULL) { udata_close(data); /* NULL if it was set correctly */ } } return haveNormData; } inline UBool _haveData(UErrorCode &errorCode) { if(haveNormData!=0) { errorCode=dataErrorCode; return (UBool)(haveNormData>0); } else { return (UBool)(loadNormData(errorCode)>0); } } U_CAPI UBool U_EXPORT2 unorm_haveData(UErrorCode *pErrorCode) { return _haveData(*pErrorCode); } /* data access primitives --------------------------------------------------- */ inline uint32_t _getNorm32(UChar c) { return normTrieData[ normTrieIndex[ c>>_NORM_TRIE_SHIFT ]+ (c&_NORM_STAGE_2_MASK) ]; } inline uint32_t _getNorm32FromSurrogatePair(uint32_t norm32, UChar c2) { /* the surrogate index in norm32 is an offset over the BMP top of stage 1 */ uint32_t c= ((norm32>>(_NORM_EXTRA_SHIFT-10))&0xffc00)| (c2&0x3ff); return normTrieData[ normTrieIndex[ _NORM_STAGE_1_BMP_COUNT+ (c>>_NORM_TRIE_SHIFT) ]+ (c&_NORM_STAGE_2_MASK) ]; } inline uint16_t _getFCD16(UChar c) { return fcdTrieIndex[ fcdTrieIndex[ c>>_NORM_TRIE_SHIFT ]+ (c&_NORM_STAGE_2_MASK) ]; } inline uint16_t _getFCD16FromSurrogatePair(uint16_t fcd16, UChar c2) { /* the surrogate index in fcd16 is an absolute offset over the start of stage 1 */ uint32_t c= ((uint32_t)fcd16<<10)| (c2&0x3ff); return fcdTrieIndex[ fcdTrieIndex[ c>>_NORM_TRIE_SHIFT ]+ (c&_NORM_STAGE_2_MASK) ]; } inline const uint16_t * _getExtraData(uint32_t norm32) { return extraData+(norm32>>_NORM_EXTRA_SHIFT); } /* get the canonical or compatibility decomposition for one character */ inline const UChar * _decompose(uint32_t norm32, uint32_t qcMask, int32_t &length, uint8_t &cc, uint8_t &trailCC) { const UChar *p=(const UChar *)_getExtraData(norm32); length=*p++; if((norm32&qcMask&_NORM_QC_NFKD)!=0 && length>=0x100) { /* use compatibility decomposition, skip canonical data */ p+=((length>>7)&1)+(length&0x7f); length>>=8; } if(length&0x80) { /* get the lead and trail cc's */ UChar bothCCs=*p++; cc=(uint8_t)(bothCCs>>8); trailCC=(uint8_t)bothCCs; } else { /* lead and trail cc's are both 0 */ cc=trailCC=0; } length&=0x7f; return p; } /* get the canonical decomposition for one character */ inline const UChar * _decompose(uint32_t norm32, int32_t &length, uint8_t &cc, uint8_t &trailCC) { const UChar *p=(const UChar *)_getExtraData(norm32); length=*p++; if(length&0x80) { /* get the lead and trail cc's */ UChar bothCCs=*p++; cc=(uint8_t)(bothCCs>>8); trailCC=(uint8_t)bothCCs; } else { /* lead and trail cc's are both 0 */ cc=trailCC=0; } length&=0x7f; return p; } /* * get the combining class of (c, c2)=*p++ * before: p>_NORM_CC_SHIFT); } } /* * get the combining class of (c, c2)=*--p * before: start

>_NORM_CC_SHIFT); } else if(UTF_IS_SURROGATE_FIRST(c)) { /* unpaired first surrogate */ return 0; } else if(p!=start && UTF_IS_FIRST_SURROGATE(c2=*(p-1))) { --p; norm32=_getNorm32(c2); if((norm32&_NORM_CC_MASK)==0) { /* all surrogate pairs with this lead surrogate have cc==0 */ return 0; } else { /* norm32 must be a surrogate special */ return (uint8_t)(_getNorm32FromSurrogatePair(norm32, c)>>_NORM_CC_SHIFT); } } else { /* unpaired second surrogate */ return 0; } } /* reorder UTF-16 in-place -------------------------------------------------- */ /* * merge two UTF-16 string parts together * to canonically order (order by combining classes) their concatenation * * the two strings may already be adjacent, so that the merging is done in-place * if the two strings are not adjacent, then the buffer holding the first one * must be large enough * the second string may or may not be ordered in itself * * before: [start..current[ is already ordered, and * [next..limit[ may be ordered in itself, but * is not in relation to [start..current[ * after: [start..current+(limit-next)[ is ordered * * the algorithm is a simple bubble-sort that takes the characters from *next++ * and inserts them in correct combining class order into the preceding part * of the string * * returns the trailing combining class */ static uint8_t _mergeOrdered(UChar *start, UChar *current, const UChar *next, const UChar *limit, UBool isOrdered=TRUE) { const UChar *pBack, *pPreBack; UChar *q, *r; UChar c, c2; uint8_t cc, prevCC, trailCC=0; UBool adjacent; adjacent= current==next; if(start!=current || !isOrdered) { while(next=prevCC */ pPreBack=pBack=current; prevCC=_getPrevCC(start, pPreBack); if(cc>=prevCC) { /* does not bubble back */ trailCC=cc; if(adjacent) { current=(UChar *)next; } else { *current++=c; if(c2!=0) { *current++=c2; } } if(isOrdered) { break; } } else { /* this will be the last code point, so keep its cc */ trailCC=prevCC; pBack=pPreBack; while(start=prevCC) { break; } pBack=pPreBack; } /* * this is where we are right now with all these pointers: * [start..pPreBack[ 0..? code points that we can ignore * [pPreBack..pBack[ 0..1 code points with prevCC<=cc * [pBack..current[ 0..n code points with >cc, move up to insert (c, c2) * [current..next[ 1 code point (c, c2) with cc * [next..limit[ 0..? code points yet to be bubbled in * * note that current and next may be unrelated (if not adjacent)! */ /* move the code units in between up (q moves left of r) */ q=current; r=current= c2==0 ? current+1 : current+2; do { *--r=*--q; } while(pBack!=q); /* insert (c, c2) */ *q=c; if(c2!=0) { *(q+1)=c2; } if(isOrdered) { /* we know that the new part is ordered in itself, so we can move start up */ start=r; /* set it to after where (c, c2) were inserted */ } } } } } if(next==limit) { /* we know the cc of the last code point */ return trailCC; } else { if(!adjacent) { /* copy the second string part */ do { *current++=*next++; } while(next!=limit); limit=current; } return _getPrevCC(start, limit); } } /* * simpler, more efficient version of _mergeOrdered() - * inserts only one code point into the preceding string * assume that (c, c2) has not yet been inserted at [current..p[ */ static uint8_t _insertOrdered(const UChar *start, UChar *current, UChar *p, UChar c, UChar c2, uint8_t cc) { const UChar *pBack, *pPreBack; UChar *r; uint8_t prevCC, trailCC=cc; if(start=prevCC */ pPreBack=pBack=current; prevCC=_getPrevCC(start, pPreBack); if(cc=prevCC) { break; } pBack=pPreBack; } /* * this is where we are right now with all these pointers: * [start..pPreBack[ 0..? code points that we can ignore * [pPreBack..pBack[ 0..1 code points with prevCC<=cc * [pBack..current[ 0..n code points with >cc, move up to insert (c, c2) * [current..p[ 1 code point (c, c2) with cc */ /* move the code units in between up */ r=p; do { *--r=*--current; } while(pBack!=current); } } /* insert (c, c2) */ *current=c; if(c2!=0) { *(current+1)=c2; } /* we know the cc of the last code point */ return trailCC; } /* quick check functions ---------------------------------------------------- */ static UBool unorm_checkFCD(const UChar *src, int32_t srcLength) { const UChar *limit; UChar c, c2; uint16_t fcd16; int16_t prevCC, cc; /* initialize */ prevCC=0; if(srcLength>=0) { /* string with length */ limit=src+srcLength; } else /* srcLength==-1 */ { /* zero-terminated string */ limit=NULL; } U_ALIGN_CODE(16); for(;;) { /* skip a run of code units below the minimum or with irrelevant data for the FCD check */ if(limit==NULL) { for(;;) { c=*src++; if(c<_NORM_MIN_WITH_LEAD_CC) { if(c==0) { return TRUE; } prevCC=-(int16_t)c; } else if((fcd16=_getFCD16(c))==0) { prevCC=0; } else { break; } } } else { for(;;) { if(src==limit) { return TRUE; } else if((c=*src++)<_NORM_MIN_WITH_LEAD_CC) { prevCC=-(int16_t)c; } else if((fcd16=_getFCD16(c))==0) { prevCC=0; } else { break; } } } /* check one above-minimum, relevant code unit */ if(UTF_IS_FIRST_SURROGATE(c)) { /* c is a lead surrogate, get the real fcd16 */ if((limit==NULL || src!=limit) && UTF_IS_SECOND_SURROGATE(c2=*src)) { ++src; fcd16=_getFCD16FromSurrogatePair(fcd16, c2); } else { fcd16=0; } } /* * prevCC has values from the following ranges: * 0..0xff - the previous trail combining class * <0 - the negative value of the previous code unit; * that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16() * was deferred so that average text is checked faster */ /* check the combining order */ cc=(int16_t)(fcd16>>8); if(cc!=0) { if(prevCC<0) { /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */ prevCC=(int16_t)_getFCD16((UChar)-prevCC)&0xff; } if(cc=0) { /* string with length */ limit=src+srcLength; } else /* srcLength==-1 */ { /* zero-terminated string */ limit=NULL; } U_ALIGN_CODE(16); for(;;) { /* skip a run of code units below the minimum or with irrelevant data for the quick check */ if(limit==NULL) { for(;;) { c=*src++; if(c=minNoMaybe && ((norm32=_getNorm32(c))&ccOrQCMask)!=0) { break; } prevCC=0; } } /* check one above-minimum, relevant code unit */ if(_NORM_MIN_SPECIAL<=norm32 && norm32<_NORM_SURROGATES_TOP) { /* c is a lead surrogate, get the real norm32 */ if((limit==NULL || src!=limit) && UTF_IS_SECOND_SURROGATE(c2=*src)) { ++src; norm32=_getNorm32FromSurrogatePair(norm32, c2); } else { norm32=0; } } /* check the combining order */ cc=(uint8_t)(norm32>>_NORM_CC_SHIFT); if(cc!=0 && cc=0) { /* string with length */ limit=src+srcLength; } else /* srcLength==-1 */ { /* zero-terminated string */ limit=NULL; } U_ALIGN_CODE(16); for(;;) { /* count code units below the minimum or with irrelevant data for the quick check */ prevSrc=src; if(limit==NULL) { while((c=*src)=_NORM_MIN_HANGUL) { if(ignoreHangul) { c2=0; p=NULL; length=1; } else { /* Hangul syllable: decompose algorithmically */ p=buffer; cc=trailCC=0; c-=HANGUL_BASE; c2=(UChar)(c%JAMO_T_COUNT); c/=JAMO_T_COUNT; if(c2>0) { buffer[2]=(UChar)(JAMO_T_BASE+c2); length=3; } else { length=2; } buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT); buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT); } } else { if(norm32<_NORM_MIN_SPECIAL) { c2=0; length=1; } else { /* c is a lead surrogate, get the real norm32 */ if((limit==NULL || src!=limit) && UTF_IS_SECOND_SURROGATE(c2=*src)) { ++src; length=2; norm32=_getNorm32FromSurrogatePair(norm32, c2); } else { c2=0; length=1; norm32=0; } } /* get the decomposition and the lead and trail cc's */ if((norm32&qcMask)==0) { /* c does not decompose */ cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT); p=NULL; } else { /* c decomposes, get everything from the variable-length extra data */ p=_decompose(norm32, qcMask, length, cc, trailCC); if(length==1) { /* fastpath a single code unit from decomposition */ c=*p; c2=0; p=NULL; } } } /* append the decomposition to the destination buffer, assume length>0 */ if( (destIndex+length)<=destCapacity || /* attempt to grow the buffer */ (canGrow && (canGrow=growBuffer(context, &dest, &destCapacity, limit==NULL ? 2*destCapacity+length+20 : destCapacity+length+2*(limit-src)+20, destIndex))!=FALSE) ) { UChar *reorderSplit=dest+destIndex; if(p==NULL) { /* fastpath: single code point */ if(cc!=0 && cc0); } } } else { /* buffer overflow */ /* keep incrementing the destIndex for preflighting */ destIndex+=length; } prevCC=trailCC; if(prevCC==0) { reorderStart=dest+destIndex; } } #if 1 /* ### TODO: this passes the tests but seems weird */ /* we may NUL-terminate if it fits as a convenience */ if(destIndexdestCapacity) { *pErrorCode=U_BUFFER_OVERFLOW_ERROR; } #else /* ### TODO: this looks slightly to much more reasonable but fails some tests, esp. /tscoll/cmsccoll/TestIncrementalNormalize */ if(limit==NULL) { /* assume that we must NUL-terminate */ if(destIndexdestCapacity) { *pErrorCode=U_BUFFER_OVERFLOW_ERROR; } } #endif return destIndex; } /* make FCD ----------------------------------------------------------------- */ static const UChar * _findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) { UChar c, c2; /* * find the first position in [src..limit[ after some cc==0 according to FCD data * * at the beginning of the loop, we have fcd16 from before src * * stop at positions: * - after trail cc==0 * - at the end of the source * - before lead cc==0 */ for(;;) { /* stop if trail cc==0 for the previous character */ if((fcd16&0xff)==0) { break; } /* get c=*src - stop at end of string */ if(limit==NULL) { c=*src; if(c==0) { break; } } else { if(src==limit) { break; } c=*src; } /* stop if lead cc==0 for this character */ if(c<_NORM_MIN_WITH_LEAD_CC || (fcd16=_getFCD16(c))==0) { break; } if(!UTF_IS_FIRST_SURROGATE(c)) { if(fcd16<=0xff) { break; } ++src; } else if((limit==NULL || (src+1)!=limit) && (c2=*(src+1), UTF_IS_SECOND_SURROGATE(c2))) { /* c is a lead surrogate, get the real fcd16 */ fcd16=_getFCD16FromSurrogatePair(fcd16, c2); if(fcd16<=0xff) { break; } src+=2; } else { /* c is an unpaired first surrogate, lead cc==0 */ break; } } return src; } static uint8_t _decomposeFCD(const UChar *src, const UChar *decompLimit, const UChar *limit, UChar *dest, int32_t &destIndex, int32_t &destCapacity, UBool canGrow, GrowBuffer *growBuffer, void *context) { UChar *reorderStart; const UChar *p; uint32_t norm32; int32_t length; UChar c, c2; uint8_t cc, prevCC, trailCC; /* * canonically decompose [src..decompLimit[ * * all characters in this range have some non-zero cc, * directly or in decomposition, * so that we do not need to check in the following for quick-check limits etc. * * there _are_ _no_ Hangul syllables or Jamos in here because they are FCD-safe (cc==0)! * * we also do not need to check for c==0 because we have an established decompLimit */ reorderStart=dest+destIndex; prevCC=0; while(src>_NORM_CC_SHIFT); p=NULL; } else { /* c decomposes, get everything from the variable-length extra data */ p=_decompose(norm32, length, cc, trailCC); if(length==1) { /* fastpath a single code unit from decomposition */ c=*p; c2=0; p=NULL; } } /* append the decomposition to the destination buffer, assume length>0 */ if( (destIndex+length)<=destCapacity || /* attempt to grow the buffer */ (canGrow && (canGrow=growBuffer(context, &dest, &destCapacity, limit==NULL ? 2*destCapacity+length+20 : destCapacity+length+2*(limit-src)+20, destIndex))!=FALSE) ) { UChar *reorderSplit=dest+destIndex; if(p==NULL) { /* fastpath: single code point */ if(cc!=0 && cc0); } } } else { /* buffer overflow */ /* keep incrementing the destIndex for preflighting */ destIndex+=length; } prevCC=trailCC; if(prevCC==0) { reorderStart=dest+destIndex; } } return prevCC; } /* * ### TODO: * try to use the previous two functions in incremental FCD in collation */ static int32_t unorm_makeFCD(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, GrowBuffer *growBuffer, void *context, UErrorCode *pErrorCode) { const UChar *limit, *prevSrc, *decompStart; int32_t destIndex, length; UChar c, c2; uint16_t fcd16; int16_t prevCC, cc; UBool canGrow; if(!_haveData(*pErrorCode)) { return 0; } /* initialize */ decompStart=src; destIndex=0; prevCC=0; /* do not attempt to grow if there is no growBuffer function or if it has failed before */ canGrow=(UBool)(growBuffer!=NULL); if(srcLength>=0) { /* string with length */ limit=src+srcLength; } else /* srcLength==-1 */ { /* zero-terminated string */ limit=NULL; } U_ALIGN_CODE(16); for(;;) { /* skip a run of code units below the minimum or with irrelevant data for the FCD check */ prevSrc=src; if(limit==NULL) { for(;;) { c=*src; if(c<_NORM_MIN_WITH_LEAD_CC) { if(c==0) { break; } prevCC=-(int16_t)c; } else if((fcd16=_getFCD16(c))==0) { prevCC=0; } else { break; } ++src; } } else { for(;;) { if(src==limit) { break; } else if((c=*src)<_NORM_MIN_WITH_LEAD_CC) { prevCC=-(int16_t)c; } else if((fcd16=_getFCD16(c))==0) { prevCC=0; } else { break; } ++src; } } /* copy these code units all at once */ if(src!=prevSrc) { length=(int32_t)(src-prevSrc); if( (destIndex+length)<=destCapacity || /* attempt to grow the buffer */ (canGrow && (canGrow=growBuffer(context, &dest, &destCapacity, limit==NULL ? 2*destCapacity+length+20 : destCapacity+length+2*(limit-src)+20, destIndex))!=FALSE) ) { do { dest[destIndex++]=*prevSrc++; } while(src!=prevSrc); } else { /* buffer overflow */ /* keep incrementing the destIndex for preflighting */ destIndex+=length; prevSrc=src; } } /* now prevSrc==src - used later to adjust destIndex before decomposition */ /* end of source reached? */ if(limit==NULL ? c==0 : src==limit) { break; } /* * prevCC has values from the following ranges: * 0..0xff - the previous trail combining class * <0 - the negative value of the previous code unit; * that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16() * was deferred so that average text is checked faster */ /* set a pointer to after the last source position where prevCC==0 */ if(prevCC<0) { /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */ prevCC=(int16_t)_getFCD16((UChar)-prevCC)&0xff; decompStart= prevCC==0 ? src : src-1; } else if(prevCC==0) { decompStart=src; /* else do not change decompStart */ } /* c already contains *src and fcd16 is set for it, increment src */ ++src; /* check one above-minimum, relevant code unit */ if(UTF_IS_FIRST_SURROGATE(c)) { /* c is a lead surrogate, get the real fcd16 */ if((limit==NULL || src!=limit) && UTF_IS_SECOND_SURROGATE(c2=*src)) { ++src; fcd16=_getFCD16FromSurrogatePair(fcd16, c2); } else { fcd16=0; } } /* we are looking at the character at [prevSrc..src[ */ /* check the combining order */ cc=(int16_t)(fcd16>>8); if(cc==0 || cc>=prevCC) { /* the order is ok */ prevCC=(int16_t)fcd16&0xff; } else { /* * back out the part of the source that we copied already but * is now going to be decomposed; * prevSrc is set to after what was copied */ destIndex-=(int32_t)(prevSrc-decompStart); /* * find the part of the source that needs to be decomposed; * to be safe and simple, decompose to before the next character with lead cc==0 */ src=_findSafeFCD(src, limit, fcd16); /* * the source text does not fulfill the conditions for FCD; * decompose and reorder a limited piece of the text */ prevCC=_decomposeFCD(decompStart, src, limit, dest, destIndex, destCapacity, canGrow, growBuffer, context); decompStart=src; } } #if 1 /* ### TODO: this passes the tests but seems weird */ /* we may NUL-terminate if it fits as a convenience */ if(destIndexdestCapacity) { *pErrorCode=U_BUFFER_OVERFLOW_ERROR; } #else /* ### TODO: this looks slightly to much more reasonable but fails some tests, esp. /tscoll/cmsccoll/TestIncrementalNormalize */ if(limit==NULL) { /* assume that we must NUL-terminate */ if(destIndexdestCapacity) { *pErrorCode=U_BUFFER_OVERFLOW_ERROR; } } #endif return destIndex; } /* make NFC & NFKC ---------------------------------------------------------- */ enum { _STACK_BUFFER_CAPACITY=100 }; /* get the composition properties of the next character */ inline uint32_t _getNextCombining(UChar *&p, const UChar *limit, UChar &c, UChar &c2, uint16_t &combiningIndex, uint8_t &cc) { uint32_t norm32, combineFlags; c=*p++; norm32=_getNorm32(c); if((norm32&(_NORM_CC_MASK|_NORM_COMBINES_ANY))==0) { c2=0; combiningIndex=0; cc=0; return 0; } else { if(norm32<_NORM_MIN_SPECIAL) { c2=0; } else if(norm32>=_NORM_MIN_HANGUL) { /* a compatibility decomposition contained Jamos */ c2=0; combiningIndex=(uint16_t)(0xfff0|(norm32>>_NORM_EXTRA_SHIFT)); cc=0; return norm32&_NORM_COMBINES_ANY; } else { /* c is a lead surrogate, get the real norm32 */ if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) { ++p; norm32=_getNorm32FromSurrogatePair(norm32, c2); } else { c2=0; combiningIndex=0; cc=0; return 0; } } combineFlags=norm32&_NORM_COMBINES_ANY; if(combineFlags!=0) { combiningIndex=*(_getExtraData(norm32)-1); } cc=(uint8_t)(norm32>>_NORM_CC_SHIFT); return combineFlags; } } /* * given a composition-result starter (c, c2) - which means its cc==0, * it combines forward, it has extra data, its norm32!=0, * it is not a Hangul or Jamo, * get just its combineFwdIndex * * norm32(c) is special if and only if c2!=0 */ inline uint16_t _getCombiningIndexFromStarter(UChar c, UChar c2) { uint32_t norm32; norm32=_getNorm32(c); if(c2!=0) { norm32=_getNorm32FromSurrogatePair(norm32, c2); } return *(_getExtraData(norm32)-1); } /* * recompose the characters in [p..limit[ (which is canonically ordered), * adjust limit, and return the trailing cc * * since for NFKC we may get Jamos in decompositions, we need to * recompose those too * * note that recomposition never lengthens the text: * any character consists of either one or two code units; * a composition may contain at most one more code unit than the original starter, * while the combining mark that is removed has at least one code unit */ static uint8_t _recompose(UChar *p, UChar *&limit) { UChar *starter, *pRemove, *q, *r; const uint16_t *table; uint32_t combineFlags; UChar c, c2; uint16_t combineFwdIndex, combineBackIndex; uint8_t cc, prevCC; UBool starterIsSupplementary; starter=NULL; /* no starter */ combineFwdIndex=0; /* will not be used until starter!=NULL - avoid compiler warnings */ starterIsSupplementary=FALSE; /* will not be used until starter!=NULL - avoid compiler warnings */ prevCC=0; for(;;) { combineFlags=_getNextCombining(p, limit, c, c2, combineBackIndex, cc); if((combineFlags&_NORM_COMBINES_BACK) && starter!=NULL) { if(combineBackIndex&0x8000) { /* c is a Jamo 2 or 3, see if we can compose it with the previous character */ pRemove=NULL; /* NULL while no Hangul composition */ c2=*starter; if(combineBackIndex==0xfff2) { /* Jamo 2, compose with previous Jamo 1 and following Jamo 3 */ c2=(UChar)(c2-JAMO_L_BASE); if(c2 * the rest of the loop body will reset starter; * technically, a composed Hangul syllable is a starter, but it * does not combine forward now that we have consumed all eligible Jamos */ } else if(!(combineFwdIndex&0x8000) && (prevCC=combineBackIndex) { break; } table+= *table&0x8000 ? 2 : 1; } if((key&0x7fff)==combineBackIndex) { /* found! combine! */ value=*table; /* is the composition a starter that combines forward? */ key=value&0x2000; /* get the composition result code point from the variable-length result value */ if(value&0x8000) { if(value&0x4000) { /* surrogate pair composition result */ value=(value&0x3ff)|0xd800; value2=*(table+1); } else { /* BMP composition result U+2000..U+ffff */ value=*(table+1); value2=0; } } else { /* BMP composition result U+0000..U+1fff */ value&=0x1fff; value2=0; } /* replace the starter with the composition, remove the combining mark */ *starter=(UChar)value; pRemove= c2==0 ? p-1 : p-2; if(starterIsSupplementary) { if(value2!=0) { *(starter+1)=(UChar)value2; } else { /* the composition is shorter than the starter, move the intermediate characters forward one */ starterIsSupplementary=FALSE; q=starter+1; r=q+1; while(r=_NORM_MIN_HANGUL) { if(norm32<_NORM_MIN_JAMO2) { /* Hangul decomposes but is all starters, Jamo 1 are starters */ return NULL; } /* Jamo 2/3 are not starters but cc==0 */ cc=trailCC=0; length=1; return src++; } if(norm32<_NORM_MIN_SPECIAL) { c2=0; length=1; } else { /* c is a lead surrogate, get the real norm32 */ if((limit==NULL || (src+1)!=limit) && UTF_IS_SECOND_SURROGATE(c2=*(src+1))) { length=2; norm32=_getNorm32FromSurrogatePair(norm32, c2); } else { return NULL; } } /* get the decomposition and the lead and trail cc's */ if((norm32&decompQCMask)==0) { /* c does not decompose */ cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT); p=src; } else { /* c decomposes, get everything from the variable-length extra data */ p=_decompose(norm32, decompQCMask, length, cc, trailCC); if(cc==0) { /* get the first character's norm32 to check if it is a starter with qc "no" or "maybe" */ norm32=_getNorm32(p, qcMask); } } if(cc==0 && !(norm32&qcMask)) { return NULL; } else { src+= c2==0 ? 1 : 2; return p; } } /* * decompose the previous code point (needs start=0 to the last starter in the decomposition * that has NF*C "yes" * starterIndex==-1 if there is no starter */ static const UChar * _decomposeBackFindStarter(const UChar *start, const UChar *&src, uint32_t qcMask, uint32_t decompQCMask, UChar minNoMaybe, int32_t &starterIndex, int32_t &length) { const UChar *p; uint32_t norm32; UChar c, c2; uint8_t cc, trailCC; c=*--src; length=1; starterIndex=0; /* many characters are themselves starters */ /* check for a surrogate before getting norm32 to see if we need to predecrement further */ if(c=_NORM_MIN_HANGUL) { /* Hangul decomposes but is all starters, Jamo 1 are starters */ #if 0 /* we should never get Jamo 2/3 here because we go back through quick check "yes" text */ if(norm32>=_NORM_MIN_JAMO2) { /* Jamo 2/3 are not starters */ starterIndex=-1; } #endif return src; } } else if(UTF_IS_SURROGATE_FIRST(c)) { /* unpaired first surrogate */ return src; } else if(src!=start && UTF_IS_FIRST_SURROGATE(c2=*(src-1))) { --src; length=2; norm32=_getNorm32(c2); if((norm32&(_NORM_CC_MASK|decompQCMask))==0) { /* all surrogate pairs with this lead surrogate have cc==0 and no decomposition */ return src; } else { /* norm32 must be a surrogate special */ norm32=_getNorm32FromSurrogatePair(norm32, c); } } else { /* unpaired second surrogate */ return src; } /* get the decomposition and the lead and trail cc's */ if((norm32&decompQCMask)==0) { /* c does not decompose */ if((norm32&(_NORM_CC_MASK|qcMask))!=0) { starterIndex=-1; } p=src; } else { /* c decomposes, get everything from the variable-length extra data */ p=_decompose(norm32, decompQCMask, length, cc, trailCC); /* find the starterIndex (the decomposition is canonically ordered!) */ /* assume that the decomposition contains complete code points */ if(UTF_IS_SECOND_SURROGATE(p[length-1])) { starterIndex=length-2; } else { starterIndex=length-1; } if(trailCC!=0 || (_getNorm32(p+starterIndex, qcMask)&qcMask)) { /* search backwards */ for(;;) { if(starterIndex==0) { starterIndex=-1; break; } c=p[--starterIndex]; if(UTF_IS_SECOND_SURROGATE(c)) { c2=p[--starterIndex]; norm32=_getNorm32(c2); if((norm32&(_NORM_CC_MASK|qcMask))==0) { /* all surrogate pairs with this lead surrogate have cc==0 */ break; } else { /* norm32 must be a surrogate special */ norm32=_getNorm32FromSurrogatePair(norm32, c); } } else { norm32=_getNorm32(c); } if((norm32&(_NORM_CC_MASK|qcMask))==0) { break; } } } } return p; } /* * recompose around the current character: * this function is called when unorm_decompose() finds a quick check "no" or "maybe" * after some text (with quick check "yes") has been copied already * * decompose this character as well as parts of the source surrounding it, * find the previous and the next starter, * and then recompose between these two starters */ static const UChar * _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length, const UChar *&prevStarter, const UChar *prevSrc, const UChar *src, const UChar *limit, uint32_t norm32, uint32_t qcMask, uint8_t &prevCC, int32_t &destIndex, UErrorCode *pErrorCode) { const UChar *p, *starter; UChar *reorderSplit, *recomposeLimit; uint32_t decompQCMask; int32_t startIndex, limitIndex, firstStarterIndex, starterIndex; UChar minNoMaybe; uint8_t cc, trailCC; decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */ if(!(decompQCMask&_NORM_QC_NFKD)) { minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]; } else { minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]; } /* get the decomposition and the lead and trail cc's */ if((norm32&decompQCMask)==0) { /* c does not decompose */ cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT); p=prevSrc; } else { /* c decomposes, get everything from the variable-length extra data */ p=_decompose(norm32, decompQCMask, length, cc, trailCC); if(cc==0) { /* get the first character's norm32 to check if it is a starter with qc "no" or "maybe" */ norm32=_getNorm32(p, qcMask); } } /* copy the decomposition into the buffer, assume that it fits */ startIndex=limitIndex=bufferCapacity/2; do { buffer[limitIndex++]=*p++; } while(--length>0); /* find the last starter in [prevStarter..src[ including this new decomposition */ if((cc==0 && !(norm32&qcMask)) || prevStarter==prevSrc) { prevCC=trailCC; starter=prevSrc; firstStarterIndex=startIndex; } else { /* decompose backwards and look for a starter */ firstStarterIndex=0; starter=prevSrc; for(;;) { p=_decomposeBackFindStarter(prevStarter, starter, qcMask, decompQCMask, minNoMaybe, starterIndex, length); /* make sure there is enough space in the buffer */ if(startIndex0); /* stop if we found a starter */ if(starterIndex>=0) { firstStarterIndex=startIndex+starterIndex; break; } /* stop if we are at the beginning of the text */ if(prevStarter>=starter) { firstStarterIndex=startIndex; break; } } /* reorder the backwards decomposition, set prevCC */ reorderSplit=buffer+firstStarterIndex; prevCC=_mergeOrdered(reorderSplit, reorderSplit, reorderSplit, buffer+limitIndex, FALSE); /* adjust destIndex: back out what had been copied with qc "yes" */ destIndex-=(int32_t)(prevSrc-starter); } /* find the next starter in [src..limit[ */ for(;;) { p=_decomposeBeforeNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe, cc, trailCC, length); if(p==NULL) { break; /* reached a starter */ } /* make sure there is enough space in the buffer */ if((limitIndex+length)>bufferCapacity) { if(startIndex>=length) { /* it fits if we move the buffer contents up */ uprv_memmove(buffer, buffer+startIndex, (limitIndex-startIndex)*U_SIZEOF_UCHAR); firstStarterIndex-=startIndex; limitIndex-=startIndex; startIndex=0; } else if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferCapacity, limitIndex)) { *pErrorCode=U_MEMORY_ALLOCATION_ERROR; return NULL; } } if(cc!=0 && cc0); prevCC=trailCC; } } /* recompose between the two starters */ recomposeLimit=buffer+limitIndex; if((limitIndex-firstStarterIndex)>=2) { prevCC=_recompose(buffer+firstStarterIndex, recomposeLimit); } /* set output parameters and return with a pointer to the recomposition */ prevStarter=src; p=buffer+startIndex; length=recomposeLimit-p; return p; } U_CFUNC int32_t unorm_compose(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UBool compat, UBool /* ### TODO: need to do this? -- ignoreHangul -- ### */, GrowBuffer *growBuffer, void *context, UErrorCode *pErrorCode) { UChar stackBuffer[_STACK_BUFFER_CAPACITY]; UChar *buffer; int32_t bufferCapacity; const UChar *limit, *prevSrc, *reorderStart, *prevStarter; uint32_t norm32, ccOrQCMask, qcMask; int32_t destIndex, length; UChar c, c2, minNoMaybe; uint8_t cc, prevCC; UBool canGrow; if(!_haveData(*pErrorCode)) { return 0; } if(!compat) { minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]; qcMask=_NORM_QC_NFC; } else { minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]; qcMask=_NORM_QC_NFKC; } /* initialize */ buffer=stackBuffer; bufferCapacity=_STACK_BUFFER_CAPACITY; prevStarter=src; reorderStart=dest; ccOrQCMask=_NORM_CC_MASK|qcMask; destIndex=0; prevCC=0; /* do not attempt to grow if there is no growBuffer function or if it has failed before */ canGrow=(UBool)(growBuffer!=NULL); if(srcLength>=0) { /* string with length */ limit=src+srcLength; } else /* srcLength==-1 */ { /* zero-terminated string */ limit=NULL; } U_ALIGN_CODE(16); for(;;) { /* count code units below the minimum or with irrelevant data for the quick check */ prevSrc=src; if(limit==NULL) { while((c=*src)=_NORM_MIN_HANGUL) { /* * Jamo 2 or 3: * try to compose with the previous character, Jamo 2 also with a following Jamo 3, * and set values here right now in case we just continue with the main loop */ length=1; prevCC=cc=0; prevStarter=prevSrc; reorderStart=dest+destIndex; if(/* ### TODO: do we need to do this? !ignoreHangul && ### */ destIndex>0) { /* c is a Jamo 2 or 3, see if we can compose it with the previous character */ c2=*(prevSrc-1); if(norm32<_NORM_JAMO2_TOP) { /* Jamo 2, compose with previous Jamo 1 and following Jamo 3 */ c2=(UChar)(c2-JAMO_L_BASE); if(c2>_NORM_CC_SHIFT); } else { const UChar *p; p=_composePart(stackBuffer, buffer, bufferCapacity, length, prevStarter, prevSrc, src, limit, norm32, qcMask, prevCC, /* output */ destIndex, /* will be adjusted */ pErrorCode); if(p==NULL) { return 0; /* an error occurred (out of memory) */ } /* append the recomposed buffer contents to the destination buffer */ if( (destIndex+length)<=destCapacity || /* attempt to grow the buffer */ (canGrow && (canGrow=growBuffer(context, &dest, &destCapacity, limit==NULL ? 2*destCapacity+length+20 : destCapacity+length+2*(limit-src)+20, destIndex))!=FALSE) ) { while(length>0) { dest[destIndex++]=*p++; --length; } } else { /* buffer overflow */ /* keep incrementing the destIndex for preflighting */ destIndex+=length; } src=prevStarter; continue; } } /* append the single code point (c, c2) to the destination buffer */ if( (destIndex+length)<=destCapacity || /* attempt to grow the buffer */ (canGrow && (canGrow=growBuffer(context, &dest, &destCapacity, limit==NULL ? 2*destCapacity+length+20 : destCapacity+length+2*(limit-src)+20, destIndex))!=FALSE) ) { if(cc!=0 && ccdestCapacity) { *pErrorCode=U_BUFFER_OVERFLOW_ERROR; } #else /* ### TODO: this looks slightly to much more reasonable but fails some tests, esp. /tscoll/cmsccoll/TestIncrementalNormalize */ if(limit==NULL) { /* assume that we must NUL-terminate */ if(destIndexdestCapacity) { *pErrorCode=U_BUFFER_OVERFLOW_ERROR; } } #endif if(buffer!=stackBuffer) { uprv_free(buffer); } return destIndex; } /* normalize() API ---------------------------------------------------------- */ /** * Internal API for normalizing. * Does not check for bad input and uses growBuffer. * @internal */ U_CFUNC int32_t unorm_internalNormalize(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UNormalizationMode mode, UBool ignoreHangul, GrowBuffer *growBuffer, void *context, UErrorCode *pErrorCode) { switch(mode) { case UNORM_NFD: return unorm_decompose(dest, destCapacity, src, srcLength, FALSE, ignoreHangul, growBuffer, context, pErrorCode); case UNORM_NFKD: return unorm_decompose(dest, destCapacity, src, srcLength, TRUE, ignoreHangul, growBuffer, context, pErrorCode); case UNORM_NFC: return unorm_compose(dest, destCapacity, src, srcLength, FALSE, ignoreHangul, growBuffer, context, pErrorCode); case UNORM_NFKC: return unorm_compose(dest, destCapacity, src, srcLength, TRUE, ignoreHangul, growBuffer, context, pErrorCode); case UNORM_FCD: return unorm_makeFCD(dest, destCapacity, src, srcLength, growBuffer, context, pErrorCode); case UNORM_NONE: /* just copy the string */ if(srcLength==-1) { srcLength=u_strlen(src); } if( srcLength<=destCapacity || /* attempt to grow the buffer */ (growBuffer!=NULL && growBuffer(context, &dest, &destCapacity, srcLength+1, 0)) ) { uprv_memcpy(dest, src, srcLength*U_SIZEOF_UCHAR); /* ### TODO: revise NUL-termination parallel to rest of API */ /* we may NUL-terminate if it fits as a convenience */ if(srcLengthdestCapacity) { *pErrorCode=U_BUFFER_OVERFLOW_ERROR; } } /* ### TODO: revise NUL-termination parallel to rest of API */ return srcLength; default: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } } /* old implementation ------------------------------------------------------- */ /* added by synwee for trie manipulation*/ #define STAGE_1_SHIFT_ 10 #define STAGE_2_SHIFT_ 4 #define STAGE_2_MASK_AFTER_SHIFT_ 0x3F #define STAGE_3_MASK_ 0xF #define LAST_BYTE_MASK_ 0xFF #define SECOND_LAST_BYTE_SHIFT_ 8 /* added by synwee for fast route in quickcheck and fcd */ #define NFC_ZERO_CC_BLOCK_LIMIT_ 0x300 /* * for a description of the file format, * see icu/source/tools/genqchk/genqchk.c */ #define QCHK_DATA_NAME "qchk" #define FCHK_DATA_NAME "fchk" #define DATA_TYPE "dat" static UDataMemory *quickcheckData = NULL; static UDataMemory *fcdcheckData = NULL; /** * Authentication values */ static const uint8_t QCHK_DATA_FORMAT_[] = {0x71, 0x63, 0x68, 0x6b}; static const uint8_t FCHK_DATA_FORMAT_[] = {0x66, 0x63, 0x68, 0x6b}; static const uint8_t QCHK_FORMAT_VERSION_[] = {1, 0, 0, 0}; static const uint8_t FCHK_FORMAT_VERSION_[] = {1, 0, 0, 0}; /** * index values loaded from qchk.dat. * static uint16_t indexes[8]; */ enum { QCHK_INDEX_STAGE_2_BITS, QCHK_INDEX_STAGE_3_BITS, QCHK_INDEX_MIN_VALUES_SIZE, QCHK_INDEX_STAGE_1_INDEX, QCHK_INDEX_STAGE_2_INDEX, QCHK_INDEX_STAGE_3_INDEX }; /** * index values loaded from qchk.dat. * static uint16_t indexes[8]; */ enum { FCHK_INDEX_STAGE_2_BITS, FCHK_INDEX_STAGE_3_BITS, FCHK_INDEX_STAGE_1_INDEX, FCHK_INDEX_STAGE_2_INDEX, FCHK_INDEX_STAGE_3_INDEX }; /** * Array of mask for determining normalization quick check values. * Indexes follows the values in UNormalizationMode */ static const uint8_t QCHK_MASK_[] = {0, 0, 0x11, 0x22, 0x44, 0x88}; /** * Array of minimum codepoints that has UNORM_MAYBE or UNORM_NO quick check * values. Indexes follows the values in UNormalizationMode. * Generated values! Edit at your own risk. */ static const UChar32 *QCHK_MIN_VALUES_; /** * Flag to indicate if data has been loaded */ static UBool isQuickCheckLoaded = FALSE; static UBool isFCDCheckLoaded = FALSE; /** * Minimum value to determine if quickcheck value contains a MAYBE */ static const uint8_t MIN_UNORM_MAYBE_ = 0x10; /** * Array of normalization form corresponding to the index code point. * Hence codepoint 0xABCD will have normalization form QUICK_CHECK_DATA[0xABCD]. * UQUICK_CHECK_DATA[0xABCD] is a byte containing 2 sets of 4 bits information * representing UNORM_MAYBE and UNORM_YES.
* bits 1 2 3 4 5678
* NFKC NFC NFKD NFD MAYBES NFKC NFC NFKD NFD YES
* ie if UQUICK_CHECK_DATA[0xABCD] = 10000001, this means that 0xABCD is in * NFD form and maybe in NFKC form */ static const uint16_t *QCHK_STAGE_1_; static const uint16_t *QCHK_STAGE_2_; static const uint8_t *QCHK_STAGE_3_; /** * Trie data for FCD. * Each index corresponds to each code point. * Trie value is the combining class of the first and the last character of the * NFD of the codepoint. * size uint16_t for the first 2 stages instead of uint32_t to reduce size. */ static const uint16_t *FCHK_STAGE_1_; static const uint16_t *FCHK_STAGE_2_; static const uint16_t *FCHK_STAGE_3_; U_CAPI int32_t unorm_normalize(const UChar* src, int32_t srcLength, UNormalizationMode mode, int32_t option, UChar* dest, int32_t destCapacity, UErrorCode* pErrorCode) { if(useNewImplementation) { /* check argument values */ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { return 0; } if( destCapacity<0 || (dest==NULL && destCapacity>0) || src==NULL || srcLength<-1 ) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } /* check for overlapping src and destination */ /* ### TODO: real API may provide a temp buffer */ if( (src>=dest && src<(dest+destCapacity)) || (srcLength>0 && dest>=src && dest<(src+srcLength)) ) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } return unorm_internalNormalize(dest, destCapacity, src, srcLength, mode, (UBool)((option&UNORM_IGNORE_HANGUL)!=0), NULL, NULL, pErrorCode); } if(U_FAILURE(*pErrorCode)) return -1; /* synwee : removed hard coded conversion */ Normalizer::EMode normMode = Normalizer::getNormalizerEMode(mode, *pErrorCode); if (U_FAILURE(*pErrorCode)) return -1; int32_t len = (srcLength == -1 ? u_strlen(src) : srcLength); const UnicodeString source(srcLength == -1, src, len); UnicodeString dst(dest, 0, destCapacity); /* synwee : note quickcheck is added in C ++ normalize method */ if ((option & UNORM_IGNORE_HANGUL) != 0) option = Normalizer::IGNORE_HANGUL; Normalizer::normalize(source, normMode, option, dst, *pErrorCode); return uprv_fillOutputString(dst, dest, destCapacity, pErrorCode); } static UBool U_CALLCONV isQuickCheckAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo) { if (pInfo->size >= 20 && pInfo->isBigEndian == U_IS_BIG_ENDIAN && pInfo->charsetFamily == U_CHARSET_FAMILY && (uprv_memcmp(pInfo->dataFormat, QCHK_DATA_FORMAT_, sizeof(QCHK_DATA_FORMAT_)) == 0) && /* pInfo->dataFormat[0] == 0x71 && pInfo->dataFormat[1] == 0x63 && pInfo->dataFormat[2] == 0x68 && pInfo->dataFormat[3] == 0x6b && pInfo->formatVersion[0] == 1 */ (uprv_memcmp(pInfo->formatVersion, QCHK_FORMAT_VERSION_, sizeof(QCHK_FORMAT_VERSION_)) == 0)) { return TRUE; } else { context = NULL; type = NULL; name = NULL; return FALSE; } } static UBool loadQuickCheckData(UErrorCode *error) { /* load quickcheck data from file if necessary */ if (!isQuickCheckLoaded && U_SUCCESS(*error)) { UDataMemory *data; /* open the data outside the mutex block */ data = udata_openChoice(NULL, DATA_TYPE, QCHK_DATA_NAME, isQuickCheckAcceptable, NULL, error); if (U_FAILURE(*error)) { return isQuickCheckLoaded = FALSE; } /* in the mutex block, set the data for this process */ umtx_lock(NULL); if (quickcheckData == NULL) { const uint16_t *temp = (const uint16_t *)udata_getMemory(data); const uint16_t *indexes = temp; quickcheckData = data; temp += 8; QCHK_MIN_VALUES_ = (const UChar32 *)temp; QCHK_STAGE_1_ = temp + indexes[QCHK_INDEX_STAGE_1_INDEX]; QCHK_STAGE_2_ = temp + indexes[QCHK_INDEX_STAGE_2_INDEX]; QCHK_STAGE_3_ = (const uint8_t *)(temp + indexes[QCHK_INDEX_STAGE_3_INDEX]); data = NULL; } umtx_unlock(NULL); isQuickCheckLoaded = TRUE; /* if a different thread set it first, then close the extra data */ if (data != NULL) { udata_close(data); /* NULL if it was set correctly */ } } return isQuickCheckLoaded; } /** * Performing quick check on a string, to quickly determine if the string is * in a particular normalization format. * Three types of result can be returned UNORM_YES, UNORM_NO or * UNORM_MAYBE. Result UNORM_YES indicates that the argument * string is in the desired normalized format, UNORM_NO determines that * argument string is not in the desired normalized format. A * UNORM_MAYBE result indicates that a more thorough check is required, * the user may have to put the string in its normalized form and compare the * results. * @param source string for determining if it is in a normalized format * @param sourcelength length of source to test * @param mode normalization format from the enum UNormalizationMode * @param status A pointer to an UErrorCode to receive any errors * @return UNORM_YES, UNORM_NO or UNORM_MAYBE */ U_CAPI UNormalizationCheckResult unorm_quickCheck(const UChar *source, int32_t sourcelength, UNormalizationMode mode, UErrorCode* status) { uint8_t oldcombiningclass = 0; uint8_t combiningclass; uint8_t quickcheckvalue; uint8_t mask = QCHK_MASK_[mode]; UChar32 min; UChar32 codepoint; UNormalizationCheckResult result = UNORM_YES; const UChar *psource; const UChar *pend = 0; if(useNewImplementation) { return _unorm_quickCheck(source, sourcelength, mode, status); } if (!loadQuickCheckData(status) || U_FAILURE(*status)) { return UNORM_MAYBE; } min = QCHK_MIN_VALUES_[mode]; /* checking argument*/ if (mode >= UNORM_MODE_COUNT || mode < UNORM_NONE) { *status = U_ILLEGAL_ARGUMENT_ERROR; return UNORM_MAYBE; } if (sourcelength >= 0) { psource = source; pend = source + sourcelength; for (;;) { if (psource >= pend) { return UNORM_YES; } /* fast route : since codepoints < min has combining class 0 and YES looking at the minimum values, surrogates are not a problem */ if (*psource >= min) { break; } psource ++; } } else { psource = source; for (;;) { if (*psource == 0) { return UNORM_YES; } /* fast route : since codepoints < min has combining class 0 and YES looking at the minimum values, surrogates are not a problem */ if (*psource >= min) { break; } psource ++; } } if (sourcelength >= 0) { for (;;) { int count = 0; if (psource >= pend) { break; } UTF_NEXT_CHAR(psource, count, pend - psource, codepoint); combiningclass = u_getCombiningClass(codepoint); /* not in canonical order */ if (oldcombiningclass > combiningclass && combiningclass != 0) { return UNORM_NO; } oldcombiningclass = combiningclass; /* trie access */ quickcheckvalue = (uint8_t)(QCHK_STAGE_3_[ QCHK_STAGE_2_[QCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + (codepoint & STAGE_3_MASK_)] & mask); /* value is a byte containing 2 sets of 4 bits information. bits 1 2 3 4 5678
NFKC NFC NFKD NFD MAYBES NFKC NFC NFKD NFD YES
ie if quick[0xABCD] = 10000001, this means that 0xABCD is in NFD form and maybe in NFKC form. */ if (quickcheckvalue == 0) { return UNORM_NO; } if (quickcheckvalue >= MIN_UNORM_MAYBE_) { result = UNORM_MAYBE; } psource += count; } } else { for (;;) { int count = 0; UTF_NEXT_CHAR(psource, count, pend - psource, codepoint); if (codepoint == 0) { break; } combiningclass = u_getCombiningClass(codepoint); /* not in canonical order */ if (oldcombiningclass > combiningclass && combiningclass != 0) { return UNORM_NO; } oldcombiningclass = combiningclass; /* trie access */ quickcheckvalue = (uint8_t)(QCHK_STAGE_3_[ QCHK_STAGE_2_[QCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + (codepoint & STAGE_3_MASK_)] & mask); /* value is a byte containing 2 sets of 4 bits information. bits 1 2 3 4 5678
NFKC NFC NFKD NFD MAYBES NFKC NFC NFKD NFD YES
ie if quick[0xABCD] = 10000001, this means that 0xABCD is in NFD form and maybe in NFKC form. */ if (quickcheckvalue == 0) { return UNORM_NO; } if (quickcheckvalue >= MIN_UNORM_MAYBE_) { result = UNORM_MAYBE; } psource += count; } } return result; } /* private methods ---------------------------------------------------------- */ static UBool U_CALLCONV isFCDCheckAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo) { if( pInfo->size >= 20 && pInfo->isBigEndian == U_IS_BIG_ENDIAN && pInfo->charsetFamily == U_CHARSET_FAMILY && (uprv_memcmp(pInfo->dataFormat, FCHK_DATA_FORMAT_, sizeof(FCHK_DATA_FORMAT_)) == 0) && /* pInfo->dataFormat[0] == 0x71 && pInfo->dataFormat[1] == 0x63 && pInfo->dataFormat[2] == 0x68 && pInfo->dataFormat[3] == 0x6b && pInfo->formatVersion[0] == 1 */ (uprv_memcmp(pInfo->formatVersion, FCHK_FORMAT_VERSION_, sizeof(FCHK_FORMAT_VERSION_)) == 0)) { return TRUE; } else { context = NULL; type = NULL; name = NULL; return FALSE; } } static UBool loadFCDCheckData(UErrorCode *error) { /* load fcdcheck data from file if necessary */ if (!isFCDCheckLoaded && U_SUCCESS(*error)) { UDataMemory *data; /* open the data outside the mutex block */ data = udata_openChoice(NULL, DATA_TYPE, FCHK_DATA_NAME, isFCDCheckAcceptable, NULL, error); if (U_FAILURE(*error)) { return isFCDCheckLoaded = FALSE; } /* in the mutex block, set the data for this process */ umtx_lock(NULL); if (fcdcheckData == NULL) { const uint16_t *temp = (const uint16_t *)udata_getMemory(data); const uint16_t *indexes = temp; fcdcheckData = data; temp += 8; FCHK_STAGE_1_ = temp + indexes[FCHK_INDEX_STAGE_1_INDEX]; FCHK_STAGE_2_ = temp + indexes[FCHK_INDEX_STAGE_2_INDEX]; FCHK_STAGE_3_ = (const uint16_t *)(temp + indexes[FCHK_INDEX_STAGE_3_INDEX]); data = NULL; } umtx_unlock(NULL); isFCDCheckLoaded = TRUE; /* if a different thread set it first, then close the extra data */ if (data != NULL) { udata_close(data); /* NULL if it was set correctly */ } } return isFCDCheckLoaded; } /** * Gets the stage 1 data for checkFCD. * @param error status * @return checkFCD data stage 1, null if data can not be loaded */ U_CAPI const uint16_t * getFCHK_STAGE_1_(UErrorCode *error) { if (loadFCDCheckData(error)) { return FCHK_STAGE_1_; } return NULL; } /** * Gets the stage 2 data for checkFCD. * @param error status * @return checkFCD data stage 2, null if data can not be loaded */ U_CAPI const uint16_t * getFCHK_STAGE_2_(UErrorCode *error) { if (loadFCDCheckData(error)) { return FCHK_STAGE_2_; } return NULL; } /** * Gets the stage 3 data for checkFCD. * @param error status * @return checkFCD data stage 3, null if data can not be loaded */ U_CAPI const uint16_t * getFCHK_STAGE_3_(UErrorCode *error) { if (loadFCDCheckData(error)) { return FCHK_STAGE_3_; } return NULL; } /** * Private method which performs a quick FCD check on a string, to quickly * determine if a string is in a required FCD format. * FCD is the set of strings such that for each character in the string, * decomposition without any canonical reordering will produce a NFD. * @param source string for determining if it is in a normalized format * @param sourcelength length of source to test * @paran mode normalization format from the enum UNormalizationMode * @param status A pointer to an UErrorCode to receive any errors * @return TRUE if source is in FCD format, FALSE otherwise */ U_CAPI UBool checkFCD(const UChar* source, int32_t sourcelength, UErrorCode* status) { if(useNewImplementation) { return UNORM_YES==unorm_quickCheck(source, sourcelength, UNORM_FCD, status); } UChar32 codepoint; const UChar *psource; const UChar *pend = 0; uint8_t oldfcdtrail = 0; uint16_t fcd = 0; if (!loadFCDCheckData(status) || U_FAILURE(*status)) { return FALSE; } if (sourcelength >= 0) { psource = source; pend = source + sourcelength; for (;;) { if (psource >= pend) { return TRUE; } /* fast route : since codepoints < NFC_ZER_CC_BLOCK_LIMIT_ has combining class 0. looking at the minimum values, surrogates are not a problem */ if (*psource >= NFC_ZERO_CC_BLOCK_LIMIT_) { break; } psource ++; } } else { psource = source; for (;;) { if (*psource == 0) { return TRUE; } /* fast route : since codepoints < min has combining class 0 and YES looking at the minimum values, surrogates are not a problem */ if (*psource >= NFC_ZERO_CC_BLOCK_LIMIT_) { break; } psource ++; } } /* not end of string and yet failed simple compare safe to shift back one char because the previous char has to be < 0x300 or the start of a string */ if (psource == source) { oldfcdtrail = 0; } else { codepoint = *(psource - 1); oldfcdtrail = (uint8_t)(FCHK_STAGE_3_[ FCHK_STAGE_2_[FCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + (codepoint & STAGE_3_MASK_)] & LAST_BYTE_MASK_); } if (sourcelength >= 0) { for (;;) { int count = 0; uint8_t lead; if (psource >= pend) { return TRUE; } UTF_NEXT_CHAR(psource, count, pend - psource, codepoint); /* trie access */ fcd = FCHK_STAGE_3_[ FCHK_STAGE_2_[FCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + (codepoint & STAGE_3_MASK_)]; lead = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); if (lead != 0 && oldfcdtrail > lead) { return FALSE; } oldfcdtrail = (uint8_t)(fcd & LAST_BYTE_MASK_); psource += count; } } else { for (;;) { int count = 0; uint8_t lead; UTF_NEXT_CHAR(psource, count, pend - psource, codepoint); if (codepoint == 0) { return TRUE; } /* trie access */ fcd = FCHK_STAGE_3_[ FCHK_STAGE_2_[FCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + (codepoint & STAGE_3_MASK_)]; lead = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); if (lead != 0 && oldfcdtrail > lead) { return FALSE; } oldfcdtrail = (uint8_t)(fcd & LAST_BYTE_MASK_); psource += count; } } return TRUE; }