/* ******************************************************************************* * * Copyright (C) 2009-2013, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: normalizer2impl.cpp * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2009nov22 * created by: Markus W. Scherer */ #include "unicode/utypes.h" #if !UCONFIG_NO_NORMALIZATION #include "unicode/normalizer2.h" #include "unicode/udata.h" #include "unicode/ustring.h" #include "unicode/utf16.h" #include "cmemory.h" #include "mutex.h" #include "normalizer2impl.h" #include "putilimp.h" #include "uassert.h" #include "uset_imp.h" #include "utrie2.h" #include "uvector.h" U_NAMESPACE_BEGIN // ReorderingBuffer -------------------------------------------------------- *** UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) { int32_t length=str.length(); start=str.getBuffer(destCapacity); if(start==NULL) { // getBuffer() already did str.setToBogus() errorCode=U_MEMORY_ALLOCATION_ERROR; return FALSE; } limit=start+length; remainingCapacity=str.getCapacity()-length; reorderStart=start; if(start==limit) { lastCC=0; } else { setIterator(); lastCC=previousCC(); // Set reorderStart after the last code point with cc<=1 if there is one. if(lastCC>1) { while(previousCC()>1) {} } reorderStart=codePointLimit; } return TRUE; } UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const { int32_t length=(int32_t)(limit-start); return length==(int32_t)(otherLimit-otherStart) && 0==u_memcmp(start, otherStart, length); } UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) { if(remainingCapacity<2 && !resize(2, errorCode)) { return FALSE; } if(lastCC<=cc || cc==0) { limit[0]=U16_LEAD(c); limit[1]=U16_TRAIL(c); limit+=2; lastCC=cc; if(cc<=1) { reorderStart=limit; } } else { insert(c, cc); } remainingCapacity-=2; return TRUE; } UBool ReorderingBuffer::append(const UChar *s, int32_t length, uint8_t leadCC, uint8_t trailCC, UErrorCode &errorCode) { if(length==0) { return TRUE; } if(remainingCapacity=codePointStart) { return 0; } UChar32 c=*--codePointStart; if(ccc;) {} // insert c at codePointLimit, after the character with prevCC<=cc UChar *q=limit; UChar *r=limit+=U16_LENGTH(c); do { *--r=*--q; } while(codePointLimit!=q); writeCodePoint(q, c); if(cc<=1) { reorderStart=r; } } // Normalizer2Impl --------------------------------------------------------- *** struct CanonIterData : public UMemory { CanonIterData(UErrorCode &errorCode); ~CanonIterData(); void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode); UTrie2 *trie; UVector canonStartSets; // contains UnicodeSet * }; Normalizer2Impl::~Normalizer2Impl() { udata_close(memory); utrie2_close(normTrie); delete (CanonIterData *)canonIterDataSingleton.fInstance; } UBool U_CALLCONV Normalizer2Impl::isAcceptable(void *context, const char * /* type */, const char * /*name*/, const UDataInfo *pInfo) { if( pInfo->size>=20 && pInfo->isBigEndian==U_IS_BIG_ENDIAN && pInfo->charsetFamily==U_CHARSET_FAMILY && pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ pInfo->dataFormat[1]==0x72 && pInfo->dataFormat[2]==0x6d && pInfo->dataFormat[3]==0x32 && pInfo->formatVersion[0]==2 ) { Normalizer2Impl *me=(Normalizer2Impl *)context; uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4); return TRUE; } else { return FALSE; } } void Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return; } memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode); if(U_FAILURE(errorCode)) { return; } const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory); const int32_t *inIndexes=(const int32_t *)inBytes; int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4; if(indexesLength<=IX_MIN_MAYBE_YES) { errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes. return; } minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; minYesNo=inIndexes[IX_MIN_YES_NO]; minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; minNoNo=inIndexes[IX_MIN_NO_NO]; limitNoNo=inIndexes[IX_LIMIT_NO_NO]; minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET]; int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, inBytes+offset, nextOffset-offset, NULL, &errorCode); if(U_FAILURE(errorCode)) { return; } offset=nextOffset; nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; maybeYesCompositions=(const uint16_t *)(inBytes+offset); extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes); // smallFCD: new in formatVersion 2 offset=nextOffset; smallFCD=inBytes+offset; // Build tccc180[]. // gennorm2 enforces lccc=0 for c>=1) { if((c&0xff)==0) { bits=smallFCD[c>>8]; // one byte per 0x100 code points } if(bits&1) { for(int i=0; i<0x20; ++i, ++c) { tccc180[c]=(uint8_t)getFCD16FromNormData(c); } } else { uprv_memset(tccc180+c, 0, 0x20); c+=0x20; } } } uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const { UChar32 c; if(cpStart==(cpLimit-1)) { c=*cpStart; } else { c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]); } uint16_t prevNorm16=getNorm16(c); if(prevNorm16<=minYesNo) { return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 } else { return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo } } U_CDECL_BEGIN static UBool U_CALLCONV enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) { /* add the start code point to the USet */ const USetAdder *sa=(const USetAdder *)context; sa->add(sa->set, start); return TRUE; } static uint32_t U_CALLCONV segmentStarterMapper(const void * /*context*/, uint32_t value) { return value&CANON_NOT_SEGMENT_STARTER; } U_CDECL_END void Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const { /* add the start code point of each same-value range of each trie */ utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa); /* add Hangul LV syllables and LV+1 because of skippables */ for(UChar c=Hangul::HANGUL_BASE; cadd(sa->set, c); sa->add(sa->set, c+1); } sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */ } void Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const { /* add the start code point of each same-value range of the canonical iterator data trie */ if(ensureCanonIterData(errorCode)) { // currently only used for the SEGMENT_STARTER property utrie2_enum(((CanonIterData *)canonIterDataSingleton.fInstance)->trie, segmentStarterMapper, enumPropertyStartsRange, sa); } } const UChar * Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src, UChar32 minNeedDataCP, ReorderingBuffer *buffer, UErrorCode &errorCode) const { // Make some effort to support NUL-terminated strings reasonably. // Take the part of the fast quick check loop that does not look up // data and check the first part of the string. // After this prefix, determine the string length to simplify the rest // of the code. const UChar *prevSrc=src; UChar c; while((c=*src++)appendZeroCC(prevSrc, src, errorCode); } } return src; } // Dual functionality: // buffer!=NULL: normalize // buffer==NULL: isNormalized/spanQuickCheckYes const UChar * Normalizer2Impl::decompose(const UChar *src, const UChar *limit, ReorderingBuffer *buffer, UErrorCode &errorCode) const { UChar32 minNoCP=minDecompNoCP; if(limit==NULL) { src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode); if(U_FAILURE(errorCode)) { return src; } limit=u_strchr(src, 0); } const UChar *prevSrc; UChar32 c=0; uint16_t norm16=0; // only for quick check const UChar *prevBoundary=src; uint8_t prevCC=0; for(;;) { // count code units below the minimum or with irrelevant data for the quick check for(prevSrc=src; src!=limit;) { if( (c=*src)appendZeroCC(prevSrc, src, errorCode)) { break; } } else { prevCC=0; prevBoundary=src; } } if(src==limit) { break; } // Check one above-minimum, relevant code point. src+=U16_LENGTH(c); if(buffer!=NULL) { if(!decompose(c, norm16, *buffer, errorCode)) { break; } } else { if(isDecompYes(norm16)) { uint8_t cc=getCCFromYesOrMaybe(norm16); if(prevCC<=cc || cc==0) { prevCC=cc; if(cc<=1) { prevBoundary=src; } continue; } } return prevBoundary; // "no" or cc out of order } } return src; } // Decompose a short piece of text which is likely to contain characters that // fail the quick check loop and/or where the quick check loop's overhead // is unlikely to be amortized. // Called by the compose() and makeFCD() implementations. UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit, ReorderingBuffer &buffer, UErrorCode &errorCode) const { while(src>8); if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { leadCC=(uint8_t)(*(mapping-1)>>8); } else { leadCC=0; } return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode); } } } const UChar * Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const { const UChar *decomp=NULL; uint16_t norm16; for(;;) { if(c>7)&1)-1; uint16_t rm0=*rawMapping; if(rm0<=MAPPING_LENGTH_MASK) { length=rm0; return (const UChar *)rawMapping-rm0; } else { // Copy the normal mapping and replace its first two code units with rm0. buffer[0]=(UChar)rm0; u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2); length=mLength-1; return buffer; } } else { length=mLength; return (const UChar *)mapping+1; } } } void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit, UBool doDecompose, UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const { buffer.copyReorderableSuffixTo(safeMiddle); if(doDecompose) { decompose(src, limit, &buffer, errorCode); return; } // Just merge the strings at the boundary. ForwardUTrie2StringIterator iter(normTrie, src, limit); uint8_t firstCC, prevCC, cc; firstCC=prevCC=cc=getCC(iter.next16()); while(cc!=0) { prevCC=cc; cc=getCC(iter.next16()); }; if(limit==NULL) { // appendZeroCC() needs limit!=NULL limit=u_strchr(iter.codePointStart, 0); } if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) { buffer.appendZeroCC(iter.codePointStart, limit, errorCode); } } // Note: hasDecompBoundary() could be implemented as aliases to // hasFCDBoundaryBefore() and hasFCDBoundaryAfter() // at the cost of building the FCD trie for a decomposition normalizer. UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const { for(;;) { if(cMIN_NORMAL_MAYBE_YES) { return FALSE; // ccc!=0 } else if(isDecompNoAlgorithmic(norm16)) { c=mapAlgorithmic(c, norm16); } else { // c decomposes, get everything from the variable-length extra data const uint16_t *mapping=getMapping(norm16); uint16_t firstUnit=*mapping; if((firstUnit&MAPPING_LENGTH_MASK)==0) { return FALSE; } if(!before) { // decomp after-boundary: same as hasFCDBoundaryAfter(), // fcd16<=1 || trailCC==0 if(firstUnit>0x1ff) { return FALSE; // trailCC>1 } if(firstUnit<=0xff) { return TRUE; // trailCC==0 } // if(trailCC==1) test leadCC==0, same as checking for before-boundary } // TRUE if leadCC==0 (hasFCDBoundaryBefore()) return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0; } } } /* * Finds the recomposition result for * a forward-combining "lead" character, * specified with a pointer to its compositions list, * and a backward-combining "trail" character. * * If the lead and trail characters combine, then this function returns * the following "compositeAndFwd" value: * Bits 21..1 composite character * Bit 0 set if the composite is a forward-combining starter * otherwise it returns -1. * * The compositions list has (trail, compositeAndFwd) pair entries, * encoded as either pairs or triples of 16-bit units. * The last entry has the high bit of its first unit set. * * The list is sorted by ascending trail characters (there are no duplicates). * A linear search is used. * * See normalizer2impl.h for a more detailed description * of the compositions list format. */ int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) { uint16_t key1, firstUnit; if(trail(firstUnit=*list)) { list+=2+(firstUnit&COMP_1_TRIPLE); } if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { if(firstUnit&COMP_1_TRIPLE) { return ((int32_t)list[1]<<16)|list[2]; } else { return list[1]; } } } else { // trail character is 3400..10FFFF // result entry has 3 units key1=(uint16_t)(COMP_1_TRAIL_LIMIT+ (((trail>>COMP_1_TRAIL_SHIFT))& ~COMP_1_TRIPLE)); uint16_t key2=(uint16_t)(trail<(firstUnit=*list)) { list+=2+(firstUnit&COMP_1_TRIPLE); } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { if(key2>(secondUnit=list[1])) { if(firstUnit&COMP_1_LAST_TUPLE) { break; } else { list+=3; } } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2]; } else { break; } } else { break; } } } return -1; } /** * @param list some character's compositions list * @param set recursively receives the composites from these compositions */ void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const { uint16_t firstUnit; int32_t compositeAndFwd; do { firstUnit=*list; if((firstUnit&COMP_1_TRIPLE)==0) { compositeAndFwd=list[1]; list+=2; } else { compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2]; list+=3; } UChar32 composite=compositeAndFwd>>1; if((compositeAndFwd&1)!=0) { addComposites(getCompositionsListForComposite(getNorm16(composite)), set); } set.add(composite); } while((firstUnit&COMP_1_LAST_TUPLE)==0); } /* * Recomposes the buffer text starting at recomposeStartIndex * (which is in NFD - decomposed and canonically ordered), * and truncates the buffer contents. * * Note that recomposition never lengthens the text: * Any character consists of either one or two code units; * a composition may contain at most one more code unit than the original starter, * while the combining mark that is removed has at least one code unit. */ void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex, UBool onlyContiguous) const { UChar *p=buffer.getStart()+recomposeStartIndex; UChar *limit=buffer.getLimit(); if(p==limit) { return; } UChar *starter, *pRemove, *q, *r; const uint16_t *compositionsList; UChar32 c, compositeAndFwd; uint16_t norm16; uint8_t cc, prevCC; UBool starterIsSupplementary; // Some of the following variables are not used until we have a forward-combining starter // and are only initialized now to avoid compiler warnings. compositionsList=NULL; // used as indicator for whether we have a forward-combining starter starter=NULL; starterIsSupplementary=FALSE; prevCC=0; for(;;) { UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16); cc=getCCFromYesOrMaybe(norm16); if( // this character combines backward and isMaybe(norm16) && // we have seen a starter that combines forward and compositionsList!=NULL && // the backward-combining character is not blocked (prevCC=0) { // The starter and the combining mark (c) do combine. UChar32 composite=compositeAndFwd>>1; // Replace the starter with the composite, remove the combining mark. pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark if(starterIsSupplementary) { if(U_IS_SUPPLEMENTARY(composite)) { // both are supplementary starter[0]=U16_LEAD(composite); starter[1]=U16_TRAIL(composite); } else { *starter=(UChar)composite; // The composite is shorter than the starter, // move the intermediate characters forward one. starterIsSupplementary=FALSE; q=starter+1; r=q+1; while(rminYesNo) { // composite 'a' has both mapping & compositions list list+= // mapping pointer 1+ // +1 to skip the first unit with the mapping lenth (*list&MAPPING_LENGTH_MASK); // + mapping length } } } else if(norm16>1; #else int32_t compositeAndFwd=combine(list, b); return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL; #endif } // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. // doCompose: normalize // !doCompose: isNormalized (buffer must be empty and initialized) UBool Normalizer2Impl::compose(const UChar *src, const UChar *limit, UBool onlyContiguous, UBool doCompose, ReorderingBuffer &buffer, UErrorCode &errorCode) const { /* * prevBoundary points to the last character before the current one * that has a composition boundary before it with ccc==0 and quick check "yes". * Keeping track of prevBoundary saves us looking for a composition boundary * when we find a "no" or "maybe". * * When we back out from prevSrc back to prevBoundary, * then we also remove those same characters (which had been simply copied * or canonically-order-inserted) from the ReorderingBuffer. * Therefore, at all times, the [prevBoundary..prevSrc[ source units * must correspond 1:1 to destination units at the end of the destination buffer. */ const UChar *prevBoundary=src; UChar32 minNoMaybeCP=minCompNoMaybeCP; if(limit==NULL) { src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, doCompose ? &buffer : NULL, errorCode); if(U_FAILURE(errorCode)) { return FALSE; } if(prevBoundary=minNoNo. * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) * or has ccc!=0. * Check for Jamo V/T, then for regular characters. * c is not a Hangul syllable or Jamo L because those have "yes" properties. */ if(isJamoVT(norm16) && prevBoundary!=prevSrc) { UChar prev=*(prevSrc-1); UBool needToDecompose=FALSE; if(c=MIN_YES_YES_WITH_CC) { uint8_t cc=(uint8_t)norm16; // cc!=0 if( onlyContiguous && // FCC (doCompose ? buffer.getLastCC() : prevCC)==0 && prevBoundarycc ) { // Fails FCD test, need to decompose and contiguously recompose. if(!doCompose) { return FALSE; } } else if(doCompose) { if(!buffer.append(c, cc, errorCode)) { break; } continue; } else if(prevCC<=cc) { prevCC=cc; continue; } else { return FALSE; } } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) { return FALSE; } /* * Find appropriate boundaries around this character, * decompose the source text from between the boundaries, * and recompose it. * * We may need to remove the last few characters from the ReorderingBuffer * to account for source text that was copied or appended * but needs to take part in the recomposition. */ /* * Find the last composition boundary in [prevBoundary..src[. * It is either the decomposition of the current character (at prevSrc), * or prevBoundary. */ if(hasCompBoundaryBefore(c, norm16)) { prevBoundary=prevSrc; } else if(doCompose) { buffer.removeSuffix((int32_t)(prevSrc-prevBoundary)); } // Find the next composition boundary in [src..limit[ - // modifies src to point to the next starter. src=(UChar *)findNextCompBoundary(src, limit); // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it. int32_t recomposeStartIndex=buffer.length(); if(!decomposeShort(prevBoundary, src, buffer, errorCode)) { break; } recompose(buffer, recomposeStartIndex, onlyContiguous); if(!doCompose) { if(!buffer.equals(prevBoundary, src)) { return FALSE; } buffer.remove(); prevCC=0; } // Move to the next starter. We never need to look back before this point again. prevBoundary=src; } return TRUE; } // Very similar to compose(): Make the same changes in both places if relevant. // pQCResult==NULL: spanQuickCheckYes // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES) const UChar * Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit, UBool onlyContiguous, UNormalizationCheckResult *pQCResult) const { /* * prevBoundary points to the last character before the current one * that has a composition boundary before it with ccc==0 and quick check "yes". */ const UChar *prevBoundary=src; UChar32 minNoMaybeCP=minCompNoMaybeCP; if(limit==NULL) { UErrorCode errorCode=U_ZERO_ERROR; src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode); if(prevBoundary=minNoNo. * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) * or has ccc!=0. */ if(isMaybeOrNonZeroCC(norm16)) { uint8_t cc=getCCFromYesOrMaybe(norm16); if( onlyContiguous && // FCC cc!=0 && prevCC==0 && prevBoundarycc ) { // Fails FCD test. } else if(prevCC<=cc || cc==0) { prevCC=cc; if(norm16= (testInert ? minNoNo : minMaybeYes)) { return FALSE; } else if(isDecompNoAlgorithmic(norm16)) { c=mapAlgorithmic(c, norm16); } else { // c decomposes, get everything from the variable-length extra data. // If testInert, then c must be a yesNo character which has lccc=0, // otherwise it could be a noNo. const uint16_t *mapping=getMapping(norm16); uint16_t firstUnit=*mapping; // TRUE if // not MAPPING_NO_COMP_BOUNDARY_AFTER // (which is set if // c is not deleted, and // it and its decomposition do not combine forward, and it has a starter) // and if FCC then trailCC<=1 return (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 && (!onlyContiguous || firstUnit<=0x1ff); } } } const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const { BackwardUTrie2StringIterator iter(normTrie, start, p); uint16_t norm16; do { norm16=iter.previous16(); } while(!hasCompBoundaryBefore(iter.codePoint, norm16)); // We could also test hasCompBoundaryAfter() and return iter.codePointLimit, // but that's probably not worth the extra cost. return iter.codePointStart; } const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const { ForwardUTrie2StringIterator iter(normTrie, p, limit); uint16_t norm16; do { norm16=iter.next16(); } while(!hasCompBoundaryBefore(iter.codePoint, norm16)); return iter.codePointStart; } // Note: normalizer2impl.cpp r30982 (2011-nov-27) // still had getFCDTrie() which built and cached an FCD trie. // That provided faster access to FCD data than getFCD16FromNormData() // but required synchronization and consumed some 10kB of heap memory // in any process that uses FCD (e.g., via collation). // tccc180[] and smallFCD[] are intended to help with any loss of performance, // at least for Latin & CJK. // Gets the FCD value from the regular normalization data. uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const { // Only loops for 1:1 algorithmic mappings. for(;;) { uint16_t norm16=getNorm16(c); if(norm16<=minYesNo) { // no decomposition or Hangul syllable, all zeros return 0; } else if(norm16>=MIN_NORMAL_MAYBE_YES) { // combining mark norm16&=0xff; return norm16|(norm16<<8); } else if(norm16>=minMaybeYes) { return 0; } else if(isDecompNoAlgorithmic(norm16)) { c=mapAlgorithmic(c, norm16); } else { // c decomposes, get everything from the variable-length extra data const uint16_t *mapping=getMapping(norm16); uint16_t firstUnit=*mapping; if((firstUnit&MAPPING_LENGTH_MASK)==0) { // A character that is deleted (maps to an empty string) must // get the worst-case lccc and tccc values because arbitrary // characters on both sides will become adjacent. return 0x1ff; } else { norm16=firstUnit>>8; // tccc if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { norm16|=*(mapping-1)&0xff00; // lccc } return norm16; } } } } // Dual functionality: // buffer!=NULL: normalize // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes const UChar * Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit, ReorderingBuffer *buffer, UErrorCode &errorCode) const { // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. // Similar to the prevBoundary in the compose() implementation. const UChar *prevBoundary=src; int32_t prevFCD16=0; if(limit==NULL) { src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode); if(U_FAILURE(errorCode)) { return src; } if(prevBoundary1) { --prevBoundary; } } limit=u_strchr(src, 0); } // Note: In this function we use buffer->appendZeroCC() because we track // the lead and trail combining classes here, rather than leaving it to // the ReorderingBuffer. // The exception is the call to decomposeShort() which uses the buffer // in the normal way. const UChar *prevSrc; UChar32 c=0; uint16_t fcd16=0; for(;;) { // count code units with lccc==0 for(prevSrc=src; src!=limit;) { if((c=*src)appendZeroCC(prevSrc, src, errorCode)) { break; } if(src==limit) { break; } prevBoundary=src; // We know that the previous character's lccc==0. if(prevFCD16<0) { // Fetching the fcd16 value was deferred for this below-U+0300 code point. UChar32 prev=~prevFCD16; prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev); if(prevFCD16>1) { --prevBoundary; } } else { const UChar *p=src-1; if(U16_IS_TRAIL(*p) && prevSrc

1) { prevBoundary=p; } } // The start of the current character (c). prevSrc=src; } else if(src==limit) { break; } src+=U16_LENGTH(c); // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. // Check for proper order, and decompose locally if necessary. if((prevFCD16&0xff)<=(fcd16>>8)) { // proper order: prev tccc <= current lccc if((fcd16&0xff)<=1) { prevBoundary=src; } if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) { break; } prevFCD16=fcd16; continue; } else if(buffer==NULL) { return prevBoundary; // quick check "no" } else { /* * Back out the part of the source that we copied or appended * already but is now going to be decomposed. * prevSrc is set to after what was copied/appended. */ buffer->removeSuffix((int32_t)(prevSrc-prevBoundary)); /* * Find the part of the source that needs to be decomposed, * up to the next safe boundary. */ src=findNextFCDBoundary(src, limit); /* * The source text does not fulfill the conditions for FCD. * Decompose and reorder a limited piece of the text. */ if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) { break; } prevBoundary=src; prevFCD16=0; } } return src; } void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit, UBool doMakeFCD, UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const { if(!buffer.isEmpty()) { const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit); if(src!=firstBoundaryInSrc) { const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(), buffer.getLimit()); int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest); UnicodeString middle(lastBoundaryInDest, destSuffixLength); buffer.removeSuffix(destSuffixLength); safeMiddle=middle; middle.append(src, (int32_t)(firstBoundaryInSrc-src)); const UChar *middleStart=middle.getBuffer(); makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode); if(U_FAILURE(errorCode)) { return; } src=firstBoundaryInSrc; } } if(doMakeFCD) { makeFCD(src, limit, &buffer, errorCode); } else { if(limit==NULL) { // appendZeroCC() needs limit!=NULL limit=u_strchr(src, 0); } buffer.appendZeroCC(src, limit, errorCode); } } const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const { while(start

0xff) {} return p; } const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const { while(padd(firstOrigin); } } else { set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)]; } set->add(origin); } } class CanonIterDataSingleton { public: CanonIterDataSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) : singleton(s), impl(ni), errorCode(ec) {} CanonIterData *getInstance(UErrorCode &errorCode) { CanonIterData *instance= (CanonIterData *)singleton.getInstance(createInstance, this, errorCode); return instance; } static void *createInstance(const void *context, UErrorCode &errorCode); UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { if(value!=0) { impl.makeCanonIterDataFromNorm16(start, end, (uint16_t)value, *newData, errorCode); } return U_SUCCESS(errorCode); } private: SimpleSingleton &singleton; Normalizer2Impl &impl; CanonIterData *newData; UErrorCode &errorCode; }; U_CDECL_BEGIN // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters. static UBool U_CALLCONV enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { return ((CanonIterDataSingleton *)context)->rangeHandler(start, end, value); } U_CDECL_END void *CanonIterDataSingleton::createInstance(const void *context, UErrorCode &errorCode) { CanonIterDataSingleton *me=(CanonIterDataSingleton *)context; me->newData=new CanonIterData(errorCode); if(me->newData==NULL) { errorCode=U_MEMORY_ALLOCATION_ERROR; return NULL; } if(U_SUCCESS(errorCode)) { utrie2_enum(me->impl.getNormTrie(), NULL, enumCIDRangeHandler, me); utrie2_freeze(me->newData->trie, UTRIE2_32_VALUE_BITS, &errorCode); if(U_SUCCESS(errorCode)) { return me->newData; } } delete me->newData; return NULL; } void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, CanonIterData &newData, UErrorCode &errorCode) const { if(norm16==0 || (minYesNo<=norm16 && norm16=minMaybeYes) { // not a segment starter if it occurs in a decomposition or has cc!=0 newValue|=CANON_NOT_SEGMENT_STARTER; if(norm16=minNoNo) { while(i(this); CanonIterDataSingleton(me->canonIterDataSingleton, *me, errorCode).getInstance(errorCode); return U_SUCCESS(errorCode); } int32_t Normalizer2Impl::getCanonValue(UChar32 c) const { return (int32_t)utrie2_get32(((CanonIterData *)canonIterDataSingleton.fInstance)->trie, c); } const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const { return *(const UnicodeSet *)( ((CanonIterData *)canonIterDataSingleton.fInstance)->canonStartSets[n]); } UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const { return getCanonValue(c)>=0; } UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const { int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER; if(canonValue==0) { return FALSE; } set.clear(); int32_t value=canonValue&CANON_VALUE_MASK; if((canonValue&CANON_HAS_SET)!=0) { set.addAll(getCanonStartSet(value)); } else if(value!=0) { set.add(value); } if((canonValue&CANON_HAS_COMPOSITIONS)!=0) { uint16_t norm16=getNorm16(c); if(norm16==JAMO_L) { UChar32 syllable= (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT); set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1); } else { addComposites(getCompositionsList(norm16), set); } } return TRUE; } U_NAMESPACE_END // Normalizer2 data swapping ----------------------------------------------- *** U_NAMESPACE_USE U_CAPI int32_t U_EXPORT2 unorm2_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, UErrorCode *pErrorCode) { const UDataInfo *pInfo; int32_t headerSize; const uint8_t *inBytes; uint8_t *outBytes; const int32_t *inIndexes; int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1]; int32_t i, offset, nextOffset, size; /* udata_swapDataHeader checks the arguments */ headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { return 0; } /* check data format and format version */ pInfo=(const UDataInfo *)((const char *)inData+4); if(!( pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ pInfo->dataFormat[1]==0x72 && pInfo->dataFormat[2]==0x6d && pInfo->dataFormat[3]==0x32 && (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2) )) { udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n", pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]); *pErrorCode=U_UNSUPPORTED_ERROR; return 0; } inBytes=(const uint8_t *)inData+headerSize; outBytes=(uint8_t *)outData+headerSize; inIndexes=(const int32_t *)inBytes; if(length>=0) { length-=headerSize; if(length<(int32_t)sizeof(indexes)) { udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n", length); *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; return 0; } } /* read the first few indexes */ for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) { indexes[i]=udata_readInt32(ds, inIndexes[i]); } /* get the total length of the data */ size=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; if(length>=0) { if(lengthswapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode); offset=nextOffset; /* swap the UTrie2 */ nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]; utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); offset=nextOffset; /* swap the uint16_t extraData[] */ nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]; ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); offset=nextOffset; /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */ nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1]; offset=nextOffset; U_ASSERT(offset==size); } return headerSize+size; } #endif // !UCONFIG_NO_NORMALIZATION