/* ******************************************************************************* * * Copyright (C) 1999-2001, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: store.c * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2001may25 * created by: Markus W. Scherer * * Store Unicode normalization data in a memory-mappable file. */ #include #include #include "unicode/utypes.h" #include "unicode/uchar.h" #include "cmemory.h" #include "cstring.h" #include "filestrm.h" #include "unicode/udata.h" #include "unewdata.h" #include "unormimp.h" #include "gennorm.h" #ifdef WIN32 # pragma warning(disable: 4100) #endif #define DO_DEBUG_OUT 0 /* * The new implementation of the normalization code loads its data from * unorm.dat, which is generated with this gennorm tool. * The format of that file is described in unormimp.h . */ /* file data ---------------------------------------------------------------- */ /* UDataInfo cf. udata.h */ static UDataInfo dataInfo={ sizeof(UDataInfo), 0, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, U_SIZEOF_UCHAR, 0, { 0x4e, 0x6f, 0x72, 0x6d }, /* dataFormat="Norm" */ {1, 0, 0, _NORM_TRIE_SHIFT}, /* formatVersion - [3] contains the trie shift! */ {3, 1, 0, 0} /* dataVersion (Unicode version) */ }; extern void setUnicodeVersion(const char *v) { UVersionInfo version; u_versionFromString(version, v); uprv_memcpy(dataInfo.dataVersion, version, 4); } static uint16_t indexes[_NORM_INDEX_TOP]={ 0 }; /* tool memory helper ------------------------------------------------------- */ /* * UToolMemory is used for generic, custom memory management. * It is allocated with enough space for count*size bytes starting * at array. * The array is declared with a union of large data types so * that its base address is aligned for any types. 
* If size is a multiple of a data type size, then such items * can be safely allocated inside the array, at offsets that * are themselves multiples of size. */ typedef struct UToolMemory { char name[64]; uint32_t count, size, index; union { uint32_t u; double d; void *p; } array[1]; } UToolMemory; static UToolMemory * utm_open(const char *name, uint32_t count, uint32_t size) { UToolMemory *mem=(UToolMemory *)uprv_malloc(sizeof(UToolMemory)+count*size); if(mem==NULL) { fprintf(stderr, "error: %s - out of memory\n", name); exit(U_MEMORY_ALLOCATION_ERROR); } uprv_strcpy(mem->name, name); mem->count=count; mem->size=size; mem->index=0; return mem; } /* we don't use this - we don't clean up memory here... */ static void utm_close(UToolMemory *mem) { if(mem!=NULL) { uprv_free(mem); } } static void * utm_getStart(UToolMemory *mem) { return (char *)mem->array; } static void * utm_alloc(UToolMemory *mem) { char *p=(char *)mem->array+mem->index*mem->size; if(++mem->index<=mem->count) { uprv_memset(p, 0, mem->size); return p; } else { fprintf(stderr, "error: %s - trying to use more than %ld preallocated units\n", mem->name, (long)mem->count); exit(U_MEMORY_ALLOCATION_ERROR); } } static void * utm_allocN(UToolMemory *mem, int32_t n) { char *p=(char *)mem->array+mem->index*mem->size; if((mem->index+=(uint32_t)n)<=mem->count) { uprv_memset(p, 0, n*mem->size); return p; } else { fprintf(stderr, "error: %s - trying to use more than %ld preallocated units\n", mem->name, (long)mem->count); exit(U_MEMORY_ALLOCATION_ERROR); } } /* builder data ------------------------------------------------------------- */ typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm); static UToolMemory *stage2Mem, *normMem, *utf32Mem, *extraMem, *combiningTriplesMem; static uint16_t stage1[_NORM_STAGE_1_MAX_COUNT], fcdStage1[_NORM_STAGE_1_MAX_COUNT]; static uint16_t *stage2; static Norm *norms; /* * set a flag for each code point that was seen in decompositions - * avoid to decompose ones that 
have not been used before */ static uint32_t haveSeenFlags[256]; static uint32_t combiningCPs[2000]; static uint16_t combiningIndexes[2000]; static uint16_t combineFwdTop=0, combineBothTop=0, combineBackTop=0; typedef struct CombiningTriple { uint16_t leadIndex, trailIndex; uint32_t lead, trail, combined; } CombiningTriple; /* 15b in the combining index -> <=0x8000 uint16_t values in the combining table */ static uint16_t combiningTable[0x8000]; static uint16_t combiningTableTop=0; /* stage 2 table after turning Norm structs into 32-bit words */ static uint32_t *norm32Table=NULL, *fcdTable=NULL; /* number of units used in stage 1 and norm32Table, and same for FCD */ static uint16_t stage1Top, fcdStage1Top, norm32TableTop, fcdTableTop; extern void init() { /* reset stage 1 of the trie */ uprv_memset(stage1, 0, sizeof(stage1)); /* allocate stage 2 of the trie and reset the first block */ stage2Mem=utm_open("gennorm trie stage 2", 30000, sizeof(*stage2)); stage2=utm_allocN(stage2Mem, _NORM_STAGE_2_BLOCK_COUNT); /* allocate Norm structures and reset the first one */ normMem=utm_open("gennorm normalization structs", 20000, sizeof(Norm)); norms=utm_alloc(normMem); /* allocate UTF-32 string memory */ utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 4); /* reset all "have seen" flags */ uprv_memset(haveSeenFlags, 0, sizeof(haveSeenFlags)); /* allocate extra data memory for UTF-16 decomposition strings and other values */ extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, 2); /* allocate temporary memory for combining triples */ combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, sizeof(CombiningTriple)); /* set the minimum code points for no/maybe quick check values to the end of the BMP */ indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=0xffff; indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=0xffff; indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=0xffff; indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=0xffff; } /* get or create a block in stage 2 of the trie */ static 
uint16_t createStage2Block(uint32_t code) { uint32_t i; uint16_t j; i=code>>_NORM_TRIE_SHIFT; j=stage1[i]; if(j==0) { /* allocate a stage 2 block */ uint16_t *p; p=(uint16_t *)utm_allocN(stage2Mem, _NORM_STAGE_2_BLOCK_COUNT); stage1[i]=j=(uint16_t)(p-stage2); } return j; } /* * get or create a Norm unit; * get or create the intermediate trie entries for it as well */ static Norm * createNorm(uint32_t code) { Norm *p; uint16_t stage2Block, k; stage2Block=createStage2Block(code); k=(uint16_t)(stage2Block+(code&_NORM_STAGE_2_MASK)); if(stage2[k]==0) { /* allocate Norm */ p=(Norm *)utm_alloc(normMem); stage2[k]=(uint16_t)(p-norms); } else { p=norms+stage2[k]; } return p; } /* get an existing Norm unit */ static Norm * getNorm(uint32_t code) { uint32_t i; uint16_t j; /* access stage 1 and get the stage 2 block start index */ i=code>>_NORM_TRIE_SHIFT; j=stage1[i]; if(j==0) { return NULL; } /* access stage 2 and get the Norm unit */ i=(uint16_t)(j+(code&_NORM_STAGE_2_MASK)); j=stage2[i]; if(j==0) { return NULL; } else { return norms+j; } } /* get the canonical combining class of a character */ static uint8_t getCCFromCP(uint32_t code) { Norm *norm=getNorm(code); if(norm==NULL) { return 0; } else { return norm->udataCC; } } /* * enumerate all code points with their Norm structs and call a function for each * return the number of code points with data */ static uint32_t enumTrie(EnumTrieFn *fn, void *context) { uint32_t code, count, i; uint16_t j, k, l; code=0; count=0; for(i=0; i<_NORM_STAGE_1_MAX_COUNT; ++i) { j=stage1[i]; if(j!=0) { for(k=0; k<_NORM_STAGE_2_BLOCK_COUNT; ++k) { l=stage2[j+k]; if(l!=0) { fn(context, code, norms+l); ++count; } ++code; } } else { code+=_NORM_STAGE_2_BLOCK_COUNT; } } return count; } static void setHaveSeenString(const uint32_t *s, int32_t length) { uint32_t c; while(length>0) { c=*s++; haveSeenFlags[(c>>5)&0xff]|=(1<<(c&0x1f)); --length; } } #define HAVE_SEEN(c) (haveSeenFlags[((c)>>5)&0xff]&(1<<((c)&0x1f))) /* handle combining data 
---------------------------------------------------- */

/*
 * NOTE(review): several statements in this part of the file are not valid C
 * as they stand (for example loop headers that run from the index variable
 * straight into an unrelated expression). It looks as if the text between a
 * '<' and a following '>' was lost in those places - TODO restore them from a
 * pristine copy. The affected spots are marked "damaged" below; their code is
 * left exactly as found.
 */

/*
 * Enter a code point into combiningCPs[].
 * The entry combines the code point with the flags in bits 31..24;
 * the visible callers pass flags 1 ("combines forward") or 2 ("combines backward").
 */
static void
addCombiningCP(uint32_t code, uint8_t flags) {
    uint32_t newEntry;
    uint16_t i;

    newEntry=code|((uint32_t)flags<<24);

    /* search for this code point */
    /* NOTE(review): damaged - the search loop and the start of the capacity check are missing */
    for(i=0; i=sizeof(combiningCPs)/4) {
        fprintf(stderr, "error: gennorm combining code points - trying to use more than %ld units\n",
                (long)(sizeof(combiningCPs)/4));
        exit(U_MEMORY_ALLOCATION_ERROR);
    }

    /* set i to the insertion point */
    flags=(uint8_t)(newEntry>>24);
    if(flags==1) {
        i=combineFwdTop++;
        ++combineBothTop;
    } else if(flags==3) {
        i=combineBothTop++;
    } else /* flags==2 */ {
        i=combineBackTop;
    }

    /* move the following code points up one and insert newEntry at i */
    /*
     * NOTE(review): damaged - the end of addCombiningCP() and the header of the
     * following function are missing. The statements below use lead, trail,
     * combined and triple, and decompStoreNewNF() calls
     * addCombiningTriple(lead, trail, code), so this is presumably the body of
     * addCombiningTriple() - confirm.
     */
    if(icombiningFlags|=1; /* combines forward */
    createNorm(trail)->combiningFlags|=2; /* combines backward */

    addCombiningCP(lead, 1);
    addCombiningCP(trail, 2);

    /* remember the triple; the leadIndex/trailIndex fields are not set here */
    triple=(CombiningTriple *)utm_alloc(combiningTriplesMem);
    triple->lead=lead;
    triple->trail=trail;
    triple->combined=combined;
}

/*
 * Compare two CombiningTriple structs by leadIndex, then by trailIndex.
 * Returns a negative, zero or positive value; the signature is the one that
 * qsort() expects (the call itself is not visible here).
 */
static int
compareTriples(const void *l, const void *r) {
    int diff;
    diff=(int)((CombiningTriple *)l)->leadIndex-
         (int)((CombiningTriple *)r)->leadIndex;
    if(diff==0) {
        diff=(int)((CombiningTriple *)l)->trailIndex-
             (int)((CombiningTriple *)r)->trailIndex;
    }
    return diff;
}

/*
 * Turn the collected combining triples into combiningTable[] and set
 * combiningTableTop; also sets Norm.combiningIndex/combiningIndexes[].
 */
static void
processCombining() {
    CombiningTriple *triples;
    uint16_t *p;
    uint32_t combined;
    uint16_t i, j, count, tableTop, finalIndex, combinesFwd;

    triples=utm_getStart(combiningTriplesMem);

    /* add lead and trail indexes to the triples for sorting */
    count=(uint16_t)combiningTriplesMem->index;
    /* NOTE(review): damaged - most of the first pass (index assignment, sorting, table length calculation) is missing */
    for(i=0; icombiningIndex=combiningIndexes[i]=tableTop;

        /* calculate the length of the combining data for this lead code point in the combiningTable */
        while(jcombiningIndex=combiningIndexes[i]=finalIndex++;
    }

    /* it must be finalIndex<=0x8000 because bit 15 is used in combiningTable as an end-for-this-lead marker */
    /*
     * NOTE(review): the message prints tableTop although finalIndex is tested, and
     * divides sizeof(combiningTable) by 4 although the elements are uint16_t - confirm intent
     */
    if(finalIndex>0x8000) {
        fprintf(stderr, "error: gennorm combining table - trying to use %u units, more than the %ld units available\n",
                tableTop, (long)(sizeof(combiningTable)/4));
        exit(U_MEMORY_ALLOCATION_ERROR);
    }

    combiningTableTop=tableTop;

    /* store the combining data in the combiningTable, with the final indexes from above */
    p=combiningTable;
    j=0; /* triples counter */

    /*
     * this is essentially the same loop as above, but
     * it writes the table data instead of calculating and setting the final indexes;
     * it is necessary to have two passes so that all the final indexes are known before
     * they are written into the table
     */
    /* NOTE(review): damaged - the loop headers and the start of the combinesFwd computation are missing */
    for(i=0; icombiningFlags&1)<<13);

            *p++=finalIndex;
            /* the composite is stored in 1 or 2 units after the trail's final index */
            if(combined<=0x1fff) {
                *p++=(uint16_t)(combinesFwd|combined);
            } else if(combined<=0xffff) {
                *p++=(uint16_t)(0x8000|combinesFwd);
                *p++=(uint16_t)combined;
            } else {
                *p++=(uint16_t)(0xc000|combinesFwd|((combined-0x10000)>>10));
                *p++=(uint16_t)(0xdc00|(combined&0x3ff));
            }
        }

        /* set a marker on the last final trail index in this lead's table */
        /*
         * NOTE(review): entries are 2 units long for combined<=0x1fff and 3 units
         * otherwise (see above), but the threshold tested here is 0x1ffff - confirm
         */
        if(combined<=0x1ffff) {
            *(p-2)|=0x8000;
        } else {
            *(p-3)|=0x8000;
        }
    }

    /* post condition: tableTop==(p-combiningTable) */
}

/* processing incoming normalization data ----------------------------------- */

/*
 * decompose the one decomposition further, may generate two decompositions
 * apply all previous characters' decompositions to this one
 *
 * The results are built in the local 40-element buffers nfd[] and nfkd[]
 * and then copied into newly allocated utf32Mem strings.
 */
static void
decompStoreNewNF(uint32_t code, Norm *norm) {
    uint32_t nfd[40], nfkd[40];
    uint32_t *s32;
    Norm *p;
    uint32_t c;
    int32_t i, length;
    uint8_t lenNFD=0, lenNFKD=0;
    UBool changedNFD=FALSE, changedNFKD=FALSE;

    if((length=norm->lenNFD)!=0) {
        /* always allocate the original string */
        changedNFD=TRUE;
        s32=norm->nfd;
    } else if((length=norm->lenNFKD)!=0) {
        /* always allocate the original string */
        changedNFKD=TRUE;
        s32=norm->nfkd;
    } else {
        /* no decomposition here, nothing to do */
        return;
    }

    /* decompose each code point */
    /* NOTE(review): damaged - the loop header, the lookup of p for c and the start of the canonical branch are missing */
    for(i=0; ilenNFD!=0) {
                uprv_memcpy(nfd+lenNFD, p->nfd, p->lenNFD*4);
                lenNFD+=p->lenNFD;
            } else {
                nfd[lenNFD++]=c;
            }
        }

        /* compatibility-decompose c */
        if(p->lenNFKD!=0) {
            uprv_memcpy(nfkd+lenNFKD, p->nfkd, p->lenNFKD*4);
            lenNFKD+=p->lenNFKD;
            changedNFKD=TRUE;
        } else if(p->lenNFD!=0) {
            uprv_memcpy(nfkd+lenNFKD, p->nfd, p->lenNFD*4);
            lenNFKD+=p->lenNFD;
            changedNFKD=TRUE;
        } else {
            nfkd[lenNFKD++]=c;
        }
    }

    /* assume that norm->lenNFD==1 or ==2 */
    /* a two-code point canonical decomposition is a composition pair unless flag 0x80 is set on norm */
    if(norm->lenNFD==2 && !(norm->combiningFlags&0x80)) {
        addCombiningTriple(s32[0], s32[1], code);
    }

    if(changedNFD) {
        if(lenNFD!=0) {
            s32=utm_allocN(utf32Mem, lenNFD);
            uprv_memcpy(s32, nfd, lenNFD*4);
        } else {
            s32=NULL;
        }
        norm->lenNFD=lenNFD;
        norm->nfd=s32;
        setHaveSeenString(nfd, lenNFD);
    }
    if(changedNFKD) {
        if(lenNFKD!=0) {
            s32=utm_allocN(utf32Mem, lenNFKD);
            uprv_memcpy(s32, nfkd, lenNFKD*4);
        } else {
            s32=NULL;
        }
        norm->lenNFKD=lenNFKD;
        norm->nfkd=s32;
        setHaveSeenString(nfkd, lenNFKD);
    }
}

/* context for decompWithSingleFn(): the newly stored code point and its data */
typedef struct DecompSingle {
    uint32_t c;
    Norm *norm;
} DecompSingle;

/*
 * apply this one character's decompositions (there is at least one!) to
 * all previous characters' decompositions to decompose them further
 *
 * EnumTrieFn callback; context points to a DecompSingle.
 */
static void
decompWithSingleFn(void *context, uint32_t code, Norm *norm) {
    uint32_t nfd[40], nfkd[40];
    uint32_t *s32;
    DecompSingle *me=(DecompSingle *)context;
    uint32_t c, myC;
    int32_t i, length;
    uint8_t lenNFD=0, lenNFKD=0, myLenNFD, myLenNFKD;
    UBool changedNFD=FALSE, changedNFKD=FALSE;

    /* get the new character's data */
    myC=me->c;
    myLenNFD=me->norm->lenNFD;
    myLenNFKD=me->norm->lenNFKD;
    /* assume that myC has at least one decomposition */

    if((length=norm->lenNFD)!=0 && myLenNFD!=0) {
        /* apply NFD(myC) to norm->nfd */
        s32=norm->nfd;
        /* NOTE(review): damaged - the loop header and the comparison of c with myC are missing */
        for(i=0; inorm->nfd, myLenNFD*4);
                lenNFD+=myLenNFD;
                changedNFD=TRUE;
            } else {
                nfd[lenNFD++]=c;
            }
        }
    }

    if((length=norm->lenNFKD)!=0) {
        /* apply NFD(myC) and NFKD(myC) to norm->nfkd */
        s32=norm->nfkd;
        /* NOTE(review): damaged - the loop header and the start of the replacement branch are missing */
        for(i=0; inorm->nfkd, myLenNFKD*4);
                    lenNFKD+=myLenNFKD;
                } else /* assume myLenNFD!=0 */ {
                    uprv_memcpy(nfkd+lenNFKD, me->norm->nfd, myLenNFD*4);
                    lenNFKD+=myLenNFD;
                }
                changedNFKD=TRUE;
            } else {
                nfkd[lenNFKD++]=c;
            }
        }
    } else if((length=norm->lenNFD)!=0 && myLenNFKD!=0) {
        /* apply NFKD(myC) to norm->nfd, forming a new norm->nfkd */
        s32=norm->nfd;
        /* NOTE(review): damaged - the loop header and the comparison of c with myC are missing */
        for(i=0; inorm->nfkd, myLenNFKD*4);
                lenNFKD+=myLenNFKD;
                changedNFKD=TRUE;
            } else {
                nfkd[lenNFKD++]=c;
            }
        }
    }

    /* set the new decompositions, forget the old ones */
    if(changedNFD) {
        if(lenNFD!=0) {
            /* reuse the old string memory unless the new string is longer */
            if(lenNFD>norm->lenNFD) {
                s32=utm_allocN(utf32Mem, lenNFD);
            } else {
                s32=norm->nfd;
            }
            uprv_memcpy(s32, nfd, lenNFD*4);
        } else {
            s32=NULL;
        }
        norm->lenNFD=lenNFD;
        norm->nfd=s32;
    }
    if(changedNFKD) {
        if(lenNFKD!=0) {
            /* reuse the old string memory unless the new string is longer */
            if(lenNFKD>norm->lenNFKD) {
                s32=utm_allocN(utf32Mem, lenNFKD);
            } else {
                s32=norm->nfkd;
            }
            uprv_memcpy(s32, nfkd, lenNFKD*4);
        } else {
            s32=NULL;
        }
        norm->lenNFKD=lenNFKD;
        norm->nfkd=s32;
    }
}

/*
 * process the data for one code point listed in UnicodeData;
 * UnicodeData itself never maps a code point to both NFD and NFKD
 *
 * norm is modified (flags copied in, decompositions expanded) and then
 * copied over the trie's Norm entry for code.
 */
extern void
storeNorm(uint32_t code, Norm *norm) {
    DecompSingle decompSingle;
    Norm *p;

    /* copy existing derived normalization properties */
    p=createNorm(code);
    norm->qcFlags=p->qcFlags;
    norm->combiningFlags=p->combiningFlags;

    /* process the decomposition if there is at least one here */
    if((norm->lenNFD|norm->lenNFKD)!=0) {
        /* decompose this one decomposition further, may generate two decompositions */
        decompStoreNewNF(code, norm);

        /* has this code point been used in previous decompositions? */
        if(HAVE_SEEN(code)) {
            /* use this decomposition to decompose other decompositions further */
            decompSingle.c=code;
            decompSingle.norm=norm;
            enumTrie(decompWithSingleFn, &decompSingle);
        }
    }

    /* store the data */
    uprv_memcpy(p, norm, sizeof(Norm));
}

/* add quick check flags for a code point */
extern void
setQCFlags(uint32_t code, uint8_t qcFlags) {
    createNorm(code)->qcFlags|=qcFlags;

    /* adjust the minimum code point for quick check no/maybe */
    if(code<0xffff) {
        /*
         * NOTE(review): damaged - the rest of setQCFlags() and the header of the
         * following function are missing. The surviving tail sets flag 0x80 in
         * combiningFlags, the flag that decompStoreNewNF() tests before adding a
         * combining triple - presumably a composition exclusion setter; confirm.
         */
        if((qcFlags&_NORM_QC_NFC) && (uint16_t)codecombiningFlags|=0x80;
}

/* set the special tags for Jamo and the precompacted data for Hangul syllables */
static void
setHangulJamoSpecials() {
    Norm *norm;
    uint16_t *pStage2Block;
    uint32_t c;
    uint16_t i;

    /*
     * Hangul syllables are algorithmically decomposed into Jamos,
     * and Jamos are algorithmically composed into Hangul syllables.
     * The quick check flags are parsed, except for Hangul.
     */

    /* set Jamo L specials */
    for(c=0x1100; c<=0x1112; ++c) {
        norm=createNorm(c);
        norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_L;
        norm->combiningFlags=1;
    }

    /* set Jamo V specials */
    for(c=0x1161; c<=0x1175; ++c) {
        norm=createNorm(c);
        norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_V;
        norm->combiningFlags=2;
    }

    /* set Jamo T specials */
    for(c=0x11a8; c<=0x11c2; ++c) {
        norm=createNorm(c);
        norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_T;
        norm->combiningFlags=2;
    }

    /* set Hangul specials, precompacted */
    /* one Norm struct is shared by all Hangul syllables */
    norm=(Norm *)utm_alloc(normMem);
    norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL;
    norm->qcFlags=_NORM_QC_NFD|_NORM_QC_NFKD;

    /* set one complete stage 2 block with this Hangul information */
    pStage2Block=(uint16_t *)utm_allocN(stage2Mem, _NORM_STAGE_2_BLOCK_COUNT);
    for(i=0; i<_NORM_STAGE_2_BLOCK_COUNT; ++i) {
        pStage2Block[i]=(uint16_t)(norm-norms);
    }

    /* set these data for U+ac00..U+d7a3 */
    c=0xac00;

    /* set a partial stage 2 block before pStage2Block can be repeated */
    if(c&_NORM_STAGE_2_MASK) {
        i=(uint16_t)(createStage2Block(c)+(c&_NORM_STAGE_2_MASK));
        do {
            stage2[i++]=(uint16_t)(norm-norms);
        } while(++c&_NORM_STAGE_2_MASK);
    }

    /* set full stage 1 blocks to the common stage 2 block */
    while(c<(0xd7a3&~_NORM_STAGE_2_MASK)) {
        stage1[c>>_NORM_TRIE_SHIFT]=(uint16_t)(pStage2Block-stage2);
        c+=_NORM_STAGE_2_BLOCK_COUNT;
    }

    /* set a partial stage 2 block after the repetition */
    i=createStage2Block(c);
    while(c<=0xd7a3) {
        stage2[i++]=(uint16_t)(norm-norms);
        ++c;
    }
}

/* build runtime structures ------------------------------------------------- */

/* canonically reorder a UTF-32 string; return { leadCC, trailCC } */
static uint16_t
reorderString(uint32_t *s, int32_t length) {
    uint8_t ccs[40];
    uint32_t c;
    int32_t i, j;
    uint8_t cc, prevCC;

    if(length<=0) {
        return 0;
    }

    /*
     * NOTE(review): damaged - the body of reorderString() and the header of the
     * following function are missing. processData() passes postParseFn to
     * enumTrie(), and the code below uses code and norm, so this is presumably
     * the body of postParseFn(); the declaration of combineAndQC used below is
     * also not visible - confirm.
     */
    for(i=0; ilenNFD;
    if(length>0) {
        norm->canonBothCCs=reorderString(norm->nfd, length);
    }

    /* canonically reorder the NFKD */
    length=norm->lenNFKD;
    if(length>0) {
        norm->compatBothCCs=reorderString(norm->nfkd, length);
    }

    /* verify that code has a decomposition if and only if the quick check flags say "no" on NF(K)D */
    if((norm->lenNFD!=0) != ((norm->qcFlags&_NORM_QC_NFD)!=0)) {
        printf("U+%04lx has NFD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->qcFlags);
    }
    if(((norm->lenNFD|norm->lenNFKD)!=0) != ((norm->qcFlags&(_NORM_QC_NFD|_NORM_QC_NFKD))!=0)) {
        printf("U+%04lx has NFD[%d] NFKD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->lenNFKD, norm->qcFlags);
    }

    /* ### see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */
    combineAndQC[(norm->qcFlags&0x33)|((norm->combiningFlags&3)<<2)]=1;

    /* the checks below only print diagnostics, they do not stop the tool */
    if(norm->combiningFlags&1) {
        if(norm->udataCC!=0) {
            /* illegal - data-derivable composition exclusion */
            printf("U+%04lx combines forward but udataCC==%u\n", (long)code, norm->udataCC);
        }
    }
    if(norm->combiningFlags&2) {
        if((norm->qcFlags&0x11)==0) {
            printf("U+%04lx combines backward but qcNF?C==0\n", (long)code);
        }
#if 0
        /* occurs sometimes */
        if(norm->udataCC==0) {
            printf("U+%04lx combines backward but udataCC==0\n", (long)code);
        }
#endif
    }
    if((norm->combiningFlags&3)==3) {
        printf("U+%04lx combines both ways\n", (long)code);
    }
}

/* ### debug */
static uint32_t countCCSame=0, countCCTrail=0, countCCTwo=0;

/*
 * Build the 32-bit data word for one Norm struct.
 * The word carries the quick check flags, the combining class, the combining
 * flags, and either an index to data copied into extraMem or the special tag.
 * Exits with an error if one of the starter assumptions checked below is
 * violated, or if a Norm has both extra data and a special tag.
 */
static uint32_t
make32BitNorm(Norm *norm) {
    UChar extra[100];
    const Norm *other;
    uint32_t word;
    int32_t i, length, beforeZero=0, count, start;

    /*
     * Check for assumptions:
     *
     * Test that if a "true starter" (cc==0 && NF*C_YES) decomposes,
     * then the decomposition also begins with a true starter.
     */
    if(norm->udataCC==0) {
        /* this is a starter */
        if((norm->qcFlags&_NORM_QC_NFC)==0 && norm->lenNFD>0) {
            /* a "true" NFC starter with a canonical decomposition */
            if( norm->canonBothCCs>=0x100 || /* lead cc!=0 or */
                ((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFC)!=0) /* nfd[0] not NFC_YES */
            ) {
                fprintf(stderr,
                    "error: true NFC starter canonical decomposition[%u] does not begin\n"
                    " with a true NFC starter: U+%04lx U+%04lx%s\n",
                    norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1],
                    norm->lenNFD<=2 ? "" : " ...");
                exit(U_INVALID_TABLE_FILE);
            }
        }

        if((norm->qcFlags&_NORM_QC_NFKC)==0) {
            if(norm->lenNFKD>0) {
                /* a "true" NFKC starter with a compatibility decomposition */
                if( norm->compatBothCCs>=0x100 || /* lead cc!=0 or */
                    ((other=getNorm(norm->nfkd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfkd[0] not NFC_YES */
                ) {
                    fprintf(stderr,
                        "error: true NFKC starter compatibility decomposition[%u] does not begin\n"
                        " with a true NFKC starter: U+%04lx U+%04lx%s\n",
                        norm->lenNFKD, (long)norm->nfkd[0], (long)norm->nfkd[1],
                        norm->lenNFKD<=2 ? "" : " ...");
                    exit(U_INVALID_TABLE_FILE);
                }
            } else if(norm->lenNFD>0) {
                /* a "true" NFKC starter with only a canonical decomposition */
                if( norm->canonBothCCs>=0x100 || /* lead cc!=0 or */
                    ((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfd[0] not NFC_YES */
                ) {
                    fprintf(stderr,
                        "error: true NFKC starter canonical decomposition[%u] does not begin\n"
                        " with a true NFKC starter: U+%04lx U+%04lx%s\n",
                        norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1],
                        norm->lenNFD<=2 ? "" : " ...");
                    exit(U_INVALID_TABLE_FILE);
                }
            }
        }
    }

    /* reset the 32-bit word and set the quick check flags */
    word=norm->qcFlags;

    /* set the UnicodeData combining class */
    word|=(uint32_t)norm->udataCC<<_NORM_CC_SHIFT;

    /* set the combining flag and index */
    if(norm->combiningFlags&3) {
        word|=(uint32_t)(norm->combiningFlags&3)<<6;
    }

    /* set the combining index value into the extra data */
    if(norm->combiningIndex!=0) {
        extra[0]=norm->combiningIndex;
        beforeZero=1;
    }

    count=beforeZero;

    /* write the decompositions */
    if((norm->lenNFD|norm->lenNFKD)!=0) {
        extra[count++]=0; /* set the pieces when available, into extra[beforeZero] */

        length=norm->lenNFD;
        if(length>0) {
            if(norm->canonBothCCs!=0) {
                extra[beforeZero]|=0x80;
                extra[count++]=norm->canonBothCCs;
            }
            start=count;
            /* NOTE(review): damaged - the loop header and the start of the statement that appends nfd[i] to extra[] are missing */
            for(i=0; infd[i]);
            }
            extra[beforeZero]|=(UChar)(count-start); /* set the decomp length as the number of UTF-16 code units */
        }

        length=norm->lenNFKD;
        if(length>0) {
            if(norm->compatBothCCs!=0) {
                extra[beforeZero]|=0x8000;
                extra[count++]=norm->compatBothCCs;
            }
            start=count;
            /* NOTE(review): damaged - the loop header and the start of the statement that appends nfkd[i] to extra[] are missing */
            for(i=0; infkd[i]);
            }
            extra[beforeZero]|=(UChar)((count-start)<<8); /* set the decomp length as the number of UTF-16 code units */
        }
    }

    /* allocate and copy the extra data */
    if(count!=0) {
        UChar *p;

        if(norm->specialTag!=0) {
            fprintf(stderr, "error: gennorm - illegal to have both extra data and a special tag (0x%x)\n", norm->specialTag);
            exit(U_ILLEGAL_ARGUMENT_ERROR);
        }

        p=(UChar *)utm_allocN(extraMem, count);
        uprv_memcpy(p, extra, count*2);

        /* set the extra index, offset by beforeZero */
        word|=(uint32_t)(beforeZero+(p-(UChar *)utm_getStart(extraMem)))<<_NORM_EXTRA_SHIFT;
    } else if(norm->specialTag!=0) {
        /* set a special tag instead of an extra index */
        word|=(uint32_t)norm->specialTag<<_NORM_EXTRA_SHIFT;
    }

    return word;
}

/* turn all Norm structs into corresponding 32-bit norm values */
static void
makeAll32() {
    uint16_t i, count;

    /*
     * allocate and fill the table of 32-bit normalization data
     * leave space for data for the up to 1024 lead surrogates
     */
    norm32TableTop=(uint16_t)stage2Mem->index;
    norm32Table=(uint32_t *)uprv_malloc((norm32TableTop+1024)*4);
    if(norm32Table==NULL) {
        fprintf(stderr, "error: gennorm - unable to allocate %ld 32-bit words for norm32Table\n", (long)(norm32TableTop+1024));
        exit(U_MEMORY_ALLOCATION_ERROR);
    }

    /* reset all entries */
    uprv_memset(norm32Table, 0, (norm32TableTop+1024)*4);

    count=0;

    /* skip the first, all-empty block */
    /* NOTE(review): damaged - the loop that fills norm32Table and the start of the first printf() are missing */
    for(i=_NORM_STAGE_2_BLOCK_COUNT; iindex);
    printf("count of (uncompacted) non-zero 32-bit words: %lu\n", (long)count);
    printf("count CC frequencies: same %lu trail %lu two %lu\n", (long)countCCSame, (long)countCCTrail, (long)countCCTwo);
}

/*
 * extract all Norm.canonBothCCs into the FCD table
 * set 32-bit values to use the common fold and compact functions
 */
static void
makeFCD() {
    static uint16_t map[0x10000>>_NORM_TRIE_SHIFT];
    Norm *norm;
    uint32_t i, oredValues;
    uint16_t bothCCs, delta;

    /*
     * allocate and fill the table of 32-bit normalization data
     * leave space for data for the up to 1024 lead surrogates
     */
    fcdTableTop=(uint16_t)stage2Mem->index;
    fcdTable=(uint32_t *)uprv_malloc((fcdTableTop+1024)*4);
    if(fcdTable==NULL) {
        fprintf(stderr, "error: gennorm - unable to allocate %ld 32-bit words for fcdTable\n", (long)(fcdTableTop+1024));
        exit(U_MEMORY_ALLOCATION_ERROR);
    }

    /* reset all entries */
    uprv_memset(fcdTable, 0, (fcdTableTop+1024)*4);

    /* compact out the all-zero stage 2 blocks */
    map[0]=0;
    delta=0; /* number of stage 2 entries omitted so far */

    /* oredValues detects all-zero stage 2 blocks that will be removed from fcdStage1 */
    oredValues=0;

    /* skip the first, all-empty block */
    /* NOTE(review): damaged - the rest of the loop header and the lookup of norm for entry i are missing */
    for(i=_NORM_STAGE_2_BLOCK_COUNT; icanonBothCCs;
            if(bothCCs==0) {
                /* if there are no decomposition cc's then use the udataCC twice */
                bothCCs=norm->udataCC;
                bothCCs|=bothCCs<<8;
            }
            oredValues|=fcdTable[i-delta]=bothCCs;
        }

        if((i&_NORM_STAGE_2_MASK)==_NORM_STAGE_2_MASK) {
            /* at the end of a stage 2 block, check if there are any non-zero entries */
            if(oredValues==0) {
                /* all zero: skip this block */
                delta+=_NORM_STAGE_2_BLOCK_COUNT;
                map[i>>_NORM_TRIE_SHIFT]=(uint16_t)0;
            } else {
                /* keep this block */
                map[i>>_NORM_TRIE_SHIFT]=(uint16_t)((i&~_NORM_STAGE_2_MASK)-delta);
                oredValues=0;
            }
        }
    }

    /* now adjust stage 1 */
    for(i=0; i<_NORM_STAGE_1_MAX_COUNT; ++i) {
        fcdStage1[i]=map[fcdStage1[i]>>_NORM_TRIE_SHIFT];
    }

    printf("FCD: omitted %u stage 2 entries in all-zero blocks\n", delta);

    /* adjust the table top */
    fcdTableTop-=delta;
}

/*
 * Fold the supplementary code point data for one lead surrogate.
 *
 * parent/parentCount: the stage 1 index array and its current top
 * stage/pStageCount:  the 32-bit stage 2 table and its current top (may be increased)
 * base:               first supplementary code point of the lead surrogate's block
 * isNorm32:           TRUE for the norm32 table, FALSE for the FCD table
 * Returns the number of stage 1 entries added at parent+parentCount:
 * 0 if there is nothing to store, else _NORM_SURROGATE_BLOCK_COUNT.
 */
static uint16_t
foldLeadSurrogate(uint16_t *parent, uint16_t parentCount,
                  uint32_t *stage, uint16_t *pStageCount,
                  uint32_t base,
                  UBool isNorm32) {
    uint32_t leadNorm32=0;
    uint32_t i, j, s2;
    uint32_t leadSurrogate=0xd7c0+(base>>10);

    printf("supplementary data for lead surrogate U+%04lx\n", (long)leadSurrogate);

    /* calculate the 32-bit data word for the lead surrogate */
    for(i=0; i<_NORM_SURROGATE_BLOCK_COUNT; ++i) {
        s2=parent[(base>>_NORM_TRIE_SHIFT)+i];
        if(s2!=0) {
            for(j=0; j<_NORM_STAGE_2_BLOCK_COUNT; ++j) {
                /* basically, or all 32-bit data into the one for the lead surrogate */
                leadNorm32|=stage[s2+j];
            }
        }
    }

    if(isNorm32) {
        /* turn multi-bit fields into the worst-case value */
        if(leadNorm32&_NORM_CC_MASK) {
            leadNorm32|=_NORM_CC_MASK;
        }

        /* clean up unnecessarily ored bit fields */
        leadNorm32&=~((uint32_t)0xffffffff<<_NORM_EXTRA_SHIFT);

        if(leadNorm32==0) {
            /* nothing to do (only composition exclusions?) */
            return 0;
        }

        /* add the extra surrogate index, offset by the BMP top, for the new stage 1 location */
        leadNorm32|=(
            (uint32_t)_NORM_EXTRA_INDEX_TOP+
            (uint32_t)((parentCount-_NORM_STAGE_1_BMP_COUNT)>>_NORM_SURROGATE_BLOCK_BITS)
        )<<_NORM_EXTRA_SHIFT;
    } else {
        if(leadNorm32==0) {
            /* FCD: nothing to do */
            return 0;
        }

        /*
         * For FCD, replace the entire combined value by the surrogate index
         * and make sure that it is not 0 (by not offsetting it by the BMP top,
         * since here we have enough bits for this);
         * lead surrogates are tested at runtime on the character code itself
         * instead on special values of the trie data -
         * this is because 16 bits in the FCD trie data do not allow for anything
         * but the two leading and trailing combining classes of the canonical decomposition.
         */
        leadNorm32=parentCount>>_NORM_SURROGATE_BLOCK_BITS;
    }

    /* enter the lead surrogate's data */
    s2=parent[leadSurrogate>>_NORM_TRIE_SHIFT];
    if(s2==0) {
        /* allocate a new stage 2 block in stage (the memory is there from makeAll32()/makeFCD()) */
        s2=parent[leadSurrogate>>_NORM_TRIE_SHIFT]=*pStageCount;
        *pStageCount+=_NORM_STAGE_2_BLOCK_COUNT;
    }
    stage[s2+(leadSurrogate&_NORM_STAGE_2_MASK)]=leadNorm32;

    /* move the actual stage 1 indexes from the supplementary position to the new one */
    uprv_memmove(parent+parentCount, parent+(base>>_NORM_TRIE_SHIFT), _NORM_SURROGATE_BLOCK_COUNT*2);

    /* increment stage 1 top */
    return _NORM_SURROGATE_BLOCK_COUNT;
}

/*
 * Fold the normalization data for supplementary code points into
 * a compact area on top of the BMP-part of the trie index,
 * with the lead surrogates indexing this compact area.
 *
 * Use after makeAll32().
*/
/*
 * parent/parentCount, stage/pStageCount and isNorm32 are passed through to
 * foldLeadSurrogate(); returns the new stage 1 top.
 */
static uint16_t
foldSupplementary(uint16_t *parent, uint16_t parentCount,
                  uint32_t *stage, uint16_t *pStageCount,
                  UBool isNorm32) {
    uint32_t c;
    uint16_t i;

    /* search for any stage 1 entries for supplementary code points */
    for(c=0x10000; c<0x110000;) {
        i=parent[c>>_NORM_TRIE_SHIFT];
        if(i!=0) {
            /* there is data, treat the full block for a lead surrogate */
            c&=~0x3ff;
            parentCount+=foldLeadSurrogate(parent, parentCount, stage, pStageCount, c, isNorm32);
            c+=0x400;
        } else {
            c+=_NORM_STAGE_2_BLOCK_COUNT;
        }
    }

    printf("trie index count: BMP %u all Unicode %lu folded %u\n",
           _NORM_STAGE_1_BMP_COUNT, (long)_NORM_STAGE_1_MAX_COUNT, parentCount);
    return parentCount;
}

/*
 * Compact a 32-bit stage 2 table in place and adjust the parent (stage 1)
 * indexes accordingly; returns the new number of 32-bit words in stage.
 *
 * NOTE(review): parts of this function are not valid C as they stand; it looks
 * as if the text between a '<' and a following '>' was lost in the places
 * marked "damaged" below - TODO restore them from a pristine copy.
 */
static uint16_t
compact(uint16_t *parent, uint16_t parentCount,
        uint32_t *stage, uint16_t stageCount) {
    /*
     * This function is the common implementation for compacting
     * the stage 2 tables of 32-bit values.
     * It is a copy of genprops/store.c's compactStage() adapted for the 32-bit stage 2 tables.
     */
    /* maps the old start of each stage 2 block (shifted by _NORM_TRIE_SHIFT) to its new start */
    static uint16_t map[0x10000>>_NORM_TRIE_SHIFT];
    uint32_t x;
    uint16_t i, start, prevEnd, newStart;

    map[0]=0;
    newStart=_NORM_STAGE_2_BLOCK_COUNT;
    /* NOTE(review): damaged - the loop header and the overlap search are missing */
    for(start=newStart; start>_NORM_TRIE_SHIFT]=(uint16_t)(newStart-i);

            /* move the non-overlapping indexes to their new positions */
            start+=i;
            for(i=(uint16_t)(_NORM_STAGE_2_BLOCK_COUNT-i); i>0; --i) {
                stage[newStart++]=stage[start++];
            }
        /* NOTE(review): damaged - the condition of this branch and the left side of the map assignment are missing */
        } else if(newStart>_NORM_TRIE_SHIFT]=newStart;
            for(i=_NORM_STAGE_2_BLOCK_COUNT; i>0; --i) {
                stage[newStart++]=stage[start++];
            }
        } else /* no overlap && newStart==start */ {
            map[start>>_NORM_TRIE_SHIFT]=start;
            newStart+=_NORM_STAGE_2_BLOCK_COUNT;
            start=newStart;
        }
    }

    /* now adjust the parent table */
    /* NOTE(review): damaged - the loop header and the start of the assignment are missing */
    for(i=0; i>_NORM_TRIE_SHIFT];
    }

    /* we saved some space */
    printf("compacting trie: count of 32-bit words %lu->%lu\n",
            (long)stageCount, (long)newStart);
    return newStart;
}

/*
 * Turn the collected data into the runtime tables:
 * the combining table, the folded and compacted stage1/norm32Table trie,
 * and the folded and compacted fcdStage1/fcdTable trie.
 */
extern void
processData() {
#if 0
    uint16_t i;
#endif

    processCombining();

    /* canonically reorder decompositions and assign combining classes for decompositions */
    enumTrie(postParseFn, NULL);

#if 0
    for(i=1; i<64; ++i) {
        if(combineAndQC[i]) {
            printf("combiningFlags==0x%02x qcFlags(NF?C)==0x%02x\n", (i&0xc)>>2, i&0x33);
        }
    }
#endif

    /* add hangul/jamo specials */
    setHangulJamoSpecials();

    /* copy stage 1 for the FCD trie */
    uprv_memcpy(fcdStage1, stage1, sizeof(stage1));

    /* --- finalize data for quick checks & normalization: stage1/norm32Table --- */

    /* turn the Norm structs (stage2, norms) into 32-bit data words (norm32Table) */
    makeAll32();

    /* fold supplementary code points into lead surrogates */
    stage1Top=foldSupplementary(stage1, _NORM_STAGE_1_BMP_COUNT, norm32Table, &norm32TableTop, TRUE);

    /* compact stage 2 */
    norm32TableTop=compact(stage1, stage1Top, norm32Table, norm32TableTop);

    /* --- finalize data for FCD checks: fcdStage1/fcdTable --- */

    /* FCD data: take Norm.canonBothCCs and store them in the FCD table */
    makeFCD();

    /* FCD: fold supplementary code points into lead surrogates */
    fcdStage1Top=foldSupplementary(fcdStage1, _NORM_STAGE_1_BMP_COUNT, fcdTable, &fcdTableTop, FALSE);

    /* FCD: compact stage 2 */
    fcdTableTop=compact(fcdStage1, fcdStage1Top, fcdTable, fcdTableTop);

    /* ### debug output */
#if 0
    printf("number of stage 2 entries: %ld\n", stage2Mem->index);
    printf("size of stage 1 (BMP) & 2 (uncompacted) + extra data: %ld bytes\n", _NORM_STAGE_1_BMP_COUNT*2+stage2Mem->index*4+extraMem->index*2);
#endif
    printf("combining CPs tops: fwd %u both %u back %u\n", combineFwdTop, combineBothTop, combineBackTop);
    printf("combining table count: %u\n", combiningTableTop);
}

/*
 * Write the generated tables to the output data file.
 * The expected size is calculated up front and compared with the length that
 * udata_finish() reports; a write error or a size mismatch ends the tool with
 * an error message and exit().
 *
 * NOTE(review): the part between filling indexes[] and the udata_writeBlock()
 * calls is damaged (text lost between a '<' and a following '>'); the use of
 * dataDir and p16 and the creation of pData are not visible here.
 */
extern void
generateData(const char *dataDir) {
    UNewDataMemory *pData;
    uint16_t *p16;
    UErrorCode errorCode=U_ZERO_ERROR;
    uint32_t size, dataLength;
    uint16_t i;

    /* byte size of all blocks that are written below */
    size=
        _NORM_INDEX_TOP*2+
        stage1Top*2+
        norm32TableTop*4+
        extraMem->index*2+
        combiningTableTop*2+
        fcdStage1Top*2+
        fcdTableTop*2;

    printf("size of " DATA_NAME "." DATA_TYPE " contents: %lu bytes\n", (long)size);

    indexes[_NORM_INDEX_COUNT]=_NORM_INDEX_TOP;
    indexes[_NORM_INDEX_TRIE_SHIFT]=_NORM_TRIE_SHIFT;
    indexes[_NORM_INDEX_TRIE_INDEX_COUNT]=stage1Top;
    indexes[_NORM_INDEX_TRIE_DATA_COUNT]=norm32TableTop;
    indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)extraMem->index;

    indexes[_NORM_INDEX_COMBINE_DATA_COUNT]=combiningTableTop;
    indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop;
    indexes[_NORM_INDEX_COMBINE_BOTH_COUNT]=(uint16_t)(combineBothTop-combineFwdTop);
    indexes[_NORM_INDEX_COMBINE_BACK_COUNT]=(uint16_t)(combineBackTop-combineBothTop);

    indexes[_NORM_INDEX_FCD_TRIE_INDEX_COUNT]=fcdStage1Top;
    indexes[_NORM_INDEX_FCD_TRIE_DATA_COUNT]=fcdTableTop;

    /* adjust the stage 1 indexes to offset stage 2 from the beginning of stage 1 */

    /* stage1/norm32Table */
    /* NOTE(review): damaged - the index adjustment loops, the creation of pData and the first udata_writeBlock() calls are missing */
    for(i=0; iindex*2);
    udata_writeBlock(pData, combiningTable, combiningTableTop*2);
    udata_writeBlock(pData, fcdStage1, fcdStage1Top*2);
    udata_writeBlock(pData, fcdTable, fcdTableTop*2);

    /* finish up */
    dataLength=udata_finish(pData, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "gennorm: error %d writing the output file\n", errorCode);
        exit(errorCode);
    }

    /* the written length must match the size calculated above */
    if(dataLength!=size) {
        fprintf(stderr, "gennorm: data length %lu != calculated size %lu\n",
            (long)dataLength, (long)size);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }
}

/* free the two 32-bit tables and close all memory pools opened in init() */
extern void
cleanUpData(void) {
    uprv_free(norm32Table);
    uprv_free(fcdTable);

    utm_close(stage2Mem);
    utm_close(normMem);
    utm_close(utf32Mem);
    utm_close(extraMem);
    utm_close(combiningTriplesMem);
}

/*
 * Hey, Emacs, please set the following:
 *
 * Local Variables:
 * indent-tabs-mode: nil
 * End:
 *
 */