scuffed-code/icu4c/source/tools/gennorm/store.c

1434 lines
43 KiB
C
Raw Normal View History

/*
*******************************************************************************
*
* Copyright (C) 1999-2001, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: store.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2001may25
* created by: Markus W. Scherer
*
* Store Unicode normalization data in a memory-mappable file.
*/
#include <stdio.h>
#include <stdlib.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "cmemory.h"
#include "cstring.h"
#include "filestrm.h"
#include "unicode/udata.h"
#include "unewdata.h"
#include "unormimp.h"
#include "gennorm.h"
#ifdef WIN32
# pragma warning(disable: 4100)
#endif
#define DO_DEBUG_OUT 0
/* file data ---------------------------------------------------------------- */
/* UDataInfo cf. udata.h */
static UDataInfo dataInfo={
sizeof(UDataInfo),
0,
U_IS_BIG_ENDIAN,
U_CHARSET_FAMILY,
U_SIZEOF_UCHAR,
0,
{ 0x4e, 0x6f, 0x72, 0x6d }, /* dataFormat="Norm" */
{1, 0, 0, _NORM_TRIE_SHIFT}, /* formatVersion - [3] contains the trie shift! */
{3, 1, 0, 0} /* dataVersion (Unicode version) */
};
extern void
setUnicodeVersion(const char *v) {
UVersionInfo version;
u_versionFromString(version, v);
uprv_memcpy(dataInfo.dataVersion, version, 4);
}
static uint16_t indexes[_NORM_INDEX_TOP]={ 0 };
/* tool memory helper ------------------------------------------------------- */
typedef struct UToolMemory {
char name[64];
uint32_t count, size, index;
uint32_t array[1];
} UToolMemory;
static UToolMemory *
utm_open(const char *name, uint32_t count, uint32_t size) {
UToolMemory *mem=(UToolMemory *)uprv_malloc(sizeof(UToolMemory)+count*size);
if(mem==NULL) {
fprintf(stderr, "error: %s - out of memory\n", name);
exit(U_MEMORY_ALLOCATION_ERROR);
}
uprv_strcpy(mem->name, name);
mem->count=count;
mem->size=size;
mem->index=0;
return mem;
}
#if 0
/* we don't use this - we don't clean up memory here... */
static void
utm_close(UToolMemory *mem) {
if(mem!=NULL) {
uprv_free(mem);
}
}
#endif
static void *
utm_getStart(UToolMemory *mem) {
return (char *)mem->array;
}
static void *
utm_alloc(UToolMemory *mem) {
char *p=(char *)mem->array+mem->index*mem->size;
if(++mem->index<=mem->count) {
uprv_memset(p, 0, mem->size);
return p;
} else {
fprintf(stderr, "error: %s - trying to use more than %ld preallocated units\n",
mem->name, mem->count);
exit(U_MEMORY_ALLOCATION_ERROR);
}
}
static void *
utm_allocN(UToolMemory *mem, int32_t n) {
char *p=(char *)mem->array+mem->index*mem->size;
if((mem->index+=(uint32_t)n)<=mem->count) {
uprv_memset(p, 0, n*mem->size);
return p;
} else {
fprintf(stderr, "error: %s - trying to use more than %ld preallocated units\n",
mem->name, mem->count);
exit(U_MEMORY_ALLOCATION_ERROR);
}
}
/* builder data ------------------------------------------------------------- */
typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm);
static UToolMemory *stage2Mem, *normMem, *utf32Mem, *extraMem, *combiningTriplesMem;
static uint16_t stage1[_NORM_STAGE_1_MAX_COUNT], fcdStage1[_NORM_STAGE_1_MAX_COUNT];
static uint16_t *stage2;
static Norm *norms;
/*
* set a flag for each code point that was seen in decompositions -
* avoid to decompose ones that have not been used before
*/
static uint32_t haveSeenFlags[256];
static uint32_t combiningCPs[2000];
static uint16_t combiningIndexes[2000];
static uint16_t combineFwdTop=0, combineBothTop=0, combineBackTop=0;
typedef struct CombiningTriple {
uint16_t leadIndex, trailIndex;
uint32_t lead, trail, combined;
} CombiningTriple;
/* 15b in the combining index -> <=0x8000 pairs of uint16_t in the combining table */
static uint16_t combiningTable[2*0x8000];
static uint16_t combiningTableTop=0;
/* stage 2 table after turning Norm structs into 32-bit words */
static uint32_t *norm32Table=NULL, *fcdTable=NULL;
/* number of units used in stage 1 and norm32Table, and same for FCD */
static uint16_t stage1Top, fcdStage1Top,
norm32TableTop, fcdTableTop;
extern void
init() {
/* reset stage 1 of the trie */
uprv_memset(stage1, 0, sizeof(stage1));
/* allocate stage 2 of the trie and reset the first block */
stage2Mem=utm_open("gennorm trie stage 2", 30000, sizeof(*stage2));
stage2=utm_allocN(stage2Mem, _NORM_STAGE_2_BLOCK_COUNT);
/* allocate Norm structures and reset the first one */
normMem=utm_open("gennorm normalization structs", 20000, sizeof(Norm));
norms=utm_alloc(normMem);
/* allocate UTF-32 string memory */
utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 4);
/* reset all "have seen" flags */
uprv_memset(haveSeenFlags, 0, sizeof(haveSeenFlags));
/* allocate extra data memory for UTF-16 decomposition strings and other values */
extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, 2);
/* allocate temporary memory for combining triples */
combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, sizeof(CombiningTriple));
/* set the minimum code points for no/maybe quick check values to the end of the BMP */
indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=0xffff;
indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=0xffff;
indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=0xffff;
indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=0xffff;
}
/* get or create a block in stage 2 of the trie */
static uint16_t
createStage2Block(uint32_t code) {
uint32_t i;
uint16_t j;
i=code>>_NORM_TRIE_SHIFT;
j=stage1[i];
if(j==0) {
/* allocate a stage 2 block */
uint16_t *p;
p=(uint16_t *)utm_allocN(stage2Mem, _NORM_STAGE_2_BLOCK_COUNT);
stage1[i]=j=(uint16_t)(p-stage2);
}
return j;
}
/*
* get or create a Norm unit;
* get or create the intermediate trie entries for it as well
*/
static Norm *
createNorm(uint32_t code) {
Norm *p;
uint16_t stage2Block, k;
stage2Block=createStage2Block(code);
k=(uint16_t)(stage2Block+(code&_NORM_STAGE_2_MASK));
if(stage2[k]==0) {
/* allocate Norm */
p=(Norm *)utm_alloc(normMem);
stage2[k]=(uint16_t)(p-norms);
} else {
p=norms+stage2[k];
}
return p;
}
/* get an existing Norm unit */
static Norm *
getNorm(uint32_t code) {
uint32_t i;
uint16_t j;
/* access stage 1 and get the stage 2 block start index */
i=code>>_NORM_TRIE_SHIFT;
j=stage1[i];
if(j==0) {
return NULL;
}
/* access stage 2 and get the Norm unit */
i=(uint16_t)(j+(code&_NORM_STAGE_2_MASK));
j=stage2[i];
if(j==0) {
return NULL;
} else {
return norms+j;
}
}
/* get the canonical combining class of a character */
static uint8_t
getCCFromCP(uint32_t code) {
Norm *norm=getNorm(code);
if(norm==NULL) {
return 0;
} else {
return norm->udataCC;
}
}
/*
* enumerate all code points with their Norm structs and call a function for each
* return the number of code points with data
*/
static uint32_t
enumTrie(EnumTrieFn *fn, void *context) {
uint32_t code, count, i;
uint16_t j, k, l;
code=0;
count=0;
for(i=0; i<_NORM_STAGE_1_MAX_COUNT; ++i) {
j=stage1[i];
if(j!=0) {
for(k=0; k<_NORM_STAGE_2_BLOCK_COUNT; ++k) {
l=stage2[j+k];
if(l!=0) {
fn(context, code, norms+l);
++count;
}
++code;
}
} else {
code+=_NORM_STAGE_2_BLOCK_COUNT;
}
}
return count;
}
static void
setHaveSeenString(const uint32_t *s, int32_t length) {
uint32_t c;
while(length>0) {
c=*s++;
haveSeenFlags[(c>>5)&0xff]|=(1<<(c&0x1f));
--length;
}
}
#define HAVE_SEEN(c) (haveSeenFlags[((c)>>5)&0xff]&(1<<((c)&0x1f)))
/* handle combining data ---------------------------------------------------- */
static void
addCombiningCP(uint32_t code, uint8_t flags) {
uint32_t newEntry;
uint16_t i;
newEntry=code|((uint32_t)flags<<24);
/* search for this code point */
for(i=0; i<combineBackTop; ++i) {
if(code==(combiningCPs[i]&0xffffff)) {
/* found it */
if(newEntry==combiningCPs[i]) {
return; /* no change */
}
/* combine the flags, remove the old entry from the old place, and insert the new one */
newEntry|=combiningCPs[i];
if(i!=--combineBackTop) {
uprv_memmove(combiningCPs+i, combiningCPs+i+1, (combineBackTop-i)*4);
}
if(i<combineBothTop) {
--combineBothTop;
}
if(i<combineFwdTop) {
--combineFwdTop;
}
break;
}
}
/* not found or modified, insert it */
if(combineBackTop>=sizeof(combiningCPs)/4) {
fprintf(stderr, "error: gennorm combining code points - trying to use more than %ld units\n",
sizeof(combiningCPs)/4);
exit(U_MEMORY_ALLOCATION_ERROR);
}
/* set i to the insertion point */
flags=(uint8_t)(newEntry>>24);
if(flags==1) {
i=combineFwdTop++;
++combineBothTop;
} else if(flags==3) {
i=combineBothTop++;
} else /* flags==2 */ {
i=combineBackTop;
}
/* move the following code points up one and insert newEntry at i */
if(i<combineBackTop) {
uprv_memmove(combiningCPs+i+1, combiningCPs+i, (combineBackTop-i)*4);
}
combiningCPs[i]=newEntry;
/* finally increment the total counter */
++combineBackTop;
}
static uint16_t
findCombiningCP(uint32_t code, UBool isLead) {
uint16_t i, limit;
if(isLead) {
i=0;
limit=combineBothTop;
} else {
i=combineFwdTop;
limit=combineBackTop;
}
/* search for this code point */
for(; i<limit; ++i) {
if(code==(combiningCPs[i]&0xffffff)) {
/* found it */
return i;
}
}
/* not found */
return 0xffff;
}
static void
addCombiningTriple(uint32_t lead, uint32_t trail, uint32_t combined) {
CombiningTriple *triple;
/*
* set combiningFlags for the two code points
* do this after decomposition so that getNorm() above returns NULL
* if we do not have actual sub-decomposition data for the initial NFD here
*/
createNorm(lead)->combiningFlags|=1; /* combines forward */
createNorm(trail)->combiningFlags|=2; /* combines backward */
addCombiningCP(lead, 1);
addCombiningCP(trail, 2);
triple=(CombiningTriple *)utm_alloc(combiningTriplesMem);
triple->lead=lead;
triple->trail=trail;
triple->combined=combined;
}
static int
compareTriples(const void *l, const void *r) {
int diff;
diff=(int)((CombiningTriple *)l)->leadIndex-
(int)((CombiningTriple *)r)->leadIndex;
if(diff==0) {
diff=(int)((CombiningTriple *)l)->trailIndex-
(int)((CombiningTriple *)r)->trailIndex;
}
return diff;
}
static void
processCombining() {
CombiningTriple *triples;
uint16_t *p;
uint32_t combined;
uint16_t i, j, count, tableTop, finalIndex;
triples=utm_getStart(combiningTriplesMem);
/* add lead and trail indexes to the triples for sorting */
count=(uint16_t)combiningTriplesMem->index;
for(i=0; i<count; ++i) {
/* findCombiningCP() must always find the code point */
triples[i].leadIndex=findCombiningCP(triples[i].lead, TRUE);
triples[i].trailIndex=findCombiningCP(triples[i].trail, FALSE);
}
/* sort them by leadIndex, trailIndex */
qsort(triples, count, sizeof(CombiningTriple), compareTriples);
/* calculate final combining indexes and store them in the Norm entries */
tableTop=0;
j=0; /* triples counter */
/* first, combining indexes of fwd/both characters are indexes into the combiningTable */
for(i=0; i<combineBothTop; ++i) {
/* start a new table */
/* assign combining index */
createNorm(combiningCPs[i]&0xffffff)->combiningIndex=combiningIndexes[i]=tableTop;
/* calculate the length of the combining data for this lead code point in the combiningTable */
while(j<count && i==triples[j].leadIndex) {
/* count 2 16-bit units per composition code unit */
tableTop+=(uint16_t)(2*UTF16_CHAR_LENGTH(combined=triples[j++].combined));
}
}
/* second, combining indexes of back-only characters are simply incremented from here to be unique */
finalIndex=tableTop;
for(; i<combineBackTop; ++i) {
createNorm(combiningCPs[i]&0xffffff)->combiningIndex=combiningIndexes[i]=finalIndex++;
}
/* it must be tableTop<0x7fff because bit 15 is used in combiningTable as an end-for-this-lead marker */
if(tableTop>=sizeof(combiningTable)/4) {
fprintf(stderr, "error: gennorm combining table - trying to use %u units, more than the %ld units available\n",
tableTop, sizeof(combiningTable)/4);
exit(U_MEMORY_ALLOCATION_ERROR);
}
combiningTableTop=tableTop;
/* store the combining data in the combiningTable, with the final indexes from above */
p=combiningTable;
j=0; /* triples counter */
/*
* this is essentially the same loop as above, but
* it writes the table data instead of calculating and setting the final indexes;
* it is necessary to have two passes so that all the final indexes are known before
* they are written into the table
*/
for(i=0; i<combineBothTop; ++i) {
/* start a new table */
/* store the combining data for this lead code point in the combiningTable */
while(j<count && i==triples[j].leadIndex) {
finalIndex=combiningIndexes[triples[j].trailIndex];
combined=triples[j].combined;
if(combined<=0xffff) {
*p++=finalIndex;
*p++=(uint16_t)combined;
} else {
*p++=finalIndex;
*p++=(uint16_t)(0xd7c0+(combined>>10));
*p++=finalIndex;
*p++=(uint16_t)(0xdc00|(combined&0x3ff));
}
++j;
}
/* set a marker on the last final trail index in this lead's table */
*(p-2)|=0x8000;
}
/* post condition: tableTop==(p-combiningTable) */
}
/* processing incoming normalization data ----------------------------------- */
/*
* decompose the one decomposition further, may generate two decompositions
* apply all previous characters' decompositions to this one
*/
static void
decompStoreNewNF(uint32_t code, Norm *norm) {
uint32_t nfd[40], nfkd[40];
uint32_t *s32;
Norm *p;
uint32_t c;
int32_t i, length;
uint8_t lenNFD=0, lenNFKD=0;
UBool changedNFD=FALSE, changedNFKD=FALSE;
if((length=norm->lenNFD)!=0) {
/* always allocate the original string */
changedNFD=TRUE;
s32=norm->nfd;
} else if((length=norm->lenNFKD)!=0) {
/* always allocate the original string */
changedNFKD=TRUE;
s32=norm->nfkd;
} else {
/* no decomposition here, nothing to do */
return;
}
/* decompose each code point */
for(i=0; i<length; ++i) {
c=s32[i];
p=getNorm(c);
if(p==NULL) {
/* no data, no decomposition */
nfd[lenNFD++]=c;
nfkd[lenNFKD++]=c;
continue;
}
/* canonically decompose c */
if(changedNFD) {
if(p->lenNFD!=0) {
uprv_memcpy(nfd+lenNFD, p->nfd, p->lenNFD*4);
lenNFD+=p->lenNFD;
} else {
nfd[lenNFD++]=c;
}
}
/* compatibility-decompose c */
if(p->lenNFKD!=0) {
uprv_memcpy(nfkd+lenNFKD, p->nfkd, p->lenNFKD*4);
lenNFKD+=p->lenNFKD;
changedNFKD=TRUE;
} else if(p->lenNFD!=0) {
uprv_memcpy(nfkd+lenNFKD, p->nfd, p->lenNFD*4);
lenNFKD+=p->lenNFD;
changedNFKD=TRUE;
} else {
nfkd[lenNFKD++]=c;
}
}
/* assume that norm->lenNFD==1 or ==2 */
if(norm->lenNFD==2 && !(norm->combiningFlags&0x80)) {
addCombiningTriple(s32[0], s32[1], code);
}
if(changedNFD) {
if(lenNFD!=0) {
s32=utm_allocN(utf32Mem, lenNFD);
uprv_memcpy(s32, nfd, lenNFD*4);
} else {
s32=NULL;
}
norm->lenNFD=lenNFD;
norm->nfd=s32;
setHaveSeenString(nfd, lenNFD);
}
if(changedNFKD) {
if(lenNFKD!=0) {
s32=utm_allocN(utf32Mem, lenNFKD);
uprv_memcpy(s32, nfkd, lenNFKD*4);
} else {
s32=NULL;
}
norm->lenNFKD=lenNFKD;
norm->nfkd=s32;
setHaveSeenString(nfkd, lenNFKD);
}
}
typedef struct DecompSingle {
uint32_t c;
Norm *norm;
} DecompSingle;
/*
* apply this one character's decompositions (there is at least one!) to
* all previous characters' decompositions to decompose them further
*/
static void
decompWithSingleFn(void *context, uint32_t code, Norm *norm) {
uint32_t nfd[40], nfkd[40];
uint32_t *s32;
DecompSingle *me=(DecompSingle *)context;
uint32_t c, myC;
int32_t i, length;
uint8_t lenNFD=0, lenNFKD=0, myLenNFD, myLenNFKD;
UBool changedNFD=FALSE, changedNFKD=FALSE;
/* get the new character's data */
myC=me->c;
myLenNFD=me->norm->lenNFD;
myLenNFKD=me->norm->lenNFKD;
/* assume that myC has at least one decomposition */
if((length=norm->lenNFD)!=0 && myLenNFD!=0) {
/* apply NFD(myC) to norm->nfd */
s32=norm->nfd;
for(i=0; i<length; ++i) {
c=s32[i];
if(c==myC) {
uprv_memcpy(nfd+lenNFD, me->norm->nfd, myLenNFD*4);
lenNFD+=myLenNFD;
changedNFD=TRUE;
} else {
nfd[lenNFD++]=c;
}
}
}
if((length=norm->lenNFKD)!=0) {
/* apply NFD(myC) and NFKD(myC) to norm->nfkd */
s32=norm->nfkd;
for(i=0; i<length; ++i) {
c=s32[i];
if(c==myC) {
if(myLenNFKD!=0) {
uprv_memcpy(nfkd+lenNFKD, me->norm->nfkd, myLenNFKD*4);
lenNFKD+=myLenNFKD;
} else /* assume myLenNFD!=0 */ {
uprv_memcpy(nfkd+lenNFKD, me->norm->nfd, myLenNFD*4);
lenNFKD+=myLenNFD;
}
changedNFKD=TRUE;
} else {
nfkd[lenNFKD++]=c;
}
}
} else if((length=norm->lenNFD)!=0 && myLenNFKD!=0) {
/* apply NFKD(myC) to norm->nfd, forming a new norm->nfkd */
s32=norm->nfd;
for(i=0; i<length; ++i) {
c=s32[i];
if(c==myC) {
uprv_memcpy(nfkd+lenNFKD, me->norm->nfkd, myLenNFKD*4);
lenNFKD+=myLenNFKD;
changedNFKD=TRUE;
} else {
nfkd[lenNFKD++]=c;
}
}
}
/* set the new decompositions, forget the old ones */
if(changedNFD) {
if(lenNFD!=0) {
if(lenNFD>norm->lenNFD) {
s32=utm_allocN(utf32Mem, lenNFD);
} else {
s32=norm->nfd;
}
uprv_memcpy(s32, nfd, lenNFD*4);
} else {
s32=NULL;
}
norm->lenNFD=lenNFD;
norm->nfd=s32;
}
if(changedNFKD) {
if(lenNFKD!=0) {
if(lenNFKD>norm->lenNFKD) {
s32=utm_allocN(utf32Mem, lenNFKD);
} else {
s32=norm->nfkd;
}
uprv_memcpy(s32, nfkd, lenNFKD*4);
} else {
s32=NULL;
}
norm->lenNFKD=lenNFKD;
norm->nfkd=s32;
}
}
/*
* process the data for one code point listed in UnicodeData;
* UnicodeData itself never maps a code point to both NFD and NFKD
*/
extern void
storeNorm(uint32_t code, Norm *norm) {
DecompSingle decompSingle;
Norm *p;
/* copy existing derived normalization properties */
p=createNorm(code);
norm->qcFlags=p->qcFlags;
norm->combiningFlags=p->combiningFlags;
/* process the decomposition if if there is at one here */
if((norm->lenNFD|norm->lenNFKD)!=0) {
/* decompose this one decomposition further, may generate two decompositions */
decompStoreNewNF(code, norm);
/* has this code point been used in previous decompositions? */
if(HAVE_SEEN(code)) {
/* use this decomposition to decompose other decompositions further */
decompSingle.c=code;
decompSingle.norm=norm;
enumTrie(decompWithSingleFn, &decompSingle);
}
}
/* store the data */
uprv_memcpy(p, norm, sizeof(Norm));
}
extern void
setQCFlags(uint32_t code, uint8_t qcFlags) {
createNorm(code)->qcFlags|=qcFlags;
/* adjust the minimum code point for quick check no/maybe */
if(code<0xffff) {
if((qcFlags&_NORM_QC_NFC) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]) {
indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=(uint16_t)code;
}
if((qcFlags&_NORM_QC_NFKC) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]) {
indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=(uint16_t)code;
}
if((qcFlags&_NORM_QC_NFD) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]) {
indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=(uint16_t)code;
}
if((qcFlags&_NORM_QC_NFKD) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]) {
indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=(uint16_t)code;
}
}
}
extern void
setCompositionExclusion(uint32_t code) {
createNorm(code)->combiningFlags|=0x80;
}
static void
setHangulJamoSpecials() {
Norm *norm;
uint16_t *pStage2Block;
uint32_t c;
uint16_t i;
/*
* Hangul syllables are algorithmically decomposed into Jamos,
* and Jamos are algorithmically composed into Hangul syllables.
* The quick check flags are parsed, except for Hangul.
*/
#if 0
/* set Jamo 1 specials */
for(c=0x1100; c<=0x1112; ++c) {
norm=createNorm(c);
norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_1;
norm->combiningFlags=1;
}
#endif
/* set Jamo 2 specials */
for(c=0x1161; c<=0x1175; ++c) {
norm=createNorm(c);
norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_2;
norm->combiningFlags=3;
}
/* set Jamo 3 specials */
for(c=0x11a8; c<=0x11c2; ++c) {
norm=createNorm(c);
norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_3;
norm->combiningFlags=2;
}
/* set Hangul specials, precompacted */
norm=(Norm *)utm_alloc(normMem);
norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL;
norm->qcFlags=_NORM_QC_NFD|_NORM_QC_NFKD;
/* set one complete stage 2 block with this Hangul information */
pStage2Block=(uint16_t *)utm_allocN(stage2Mem, _NORM_STAGE_2_BLOCK_COUNT);
for(i=0; i<_NORM_STAGE_2_BLOCK_COUNT; ++i) {
pStage2Block[i]=(uint16_t)(norm-norms);
}
/* set these data for U+ac00..U+d7a3 */
c=0xac00;
/* set a partial stage 2 block before pStage2Block can be repeated */
if(c&_NORM_STAGE_2_MASK) {
i=(uint16_t)(createStage2Block(c)+(c&_NORM_STAGE_2_MASK));
do {
stage2[i++]=(uint16_t)(norm-norms);
} while(++c&_NORM_STAGE_2_MASK);
}
/* set full stage 1 blocks to the common stage 2 block */
while(c<(0xd7a3&~_NORM_STAGE_2_MASK)) {
stage1[c>>_NORM_TRIE_SHIFT]=(uint16_t)(pStage2Block-stage2);
c+=_NORM_STAGE_2_BLOCK_COUNT;
}
/* set a partial stage 2 block after the repetition */
i=createStage2Block(c);
while(c<=0xd7a3) {
stage2[i++]=(uint16_t)(norm-norms);
++c;
}
}
/* build runtime structures ------------------------------------------------- */
/* canonically reorder a UTF-32 string; return { leadCC, trailCC } */
static uint16_t
reorderString(uint32_t *s, int32_t length) {
uint8_t ccs[40];
uint32_t c;
int32_t i, j;
uint8_t cc, prevCC;
if(length<=0) {
return 0;
}
for(i=0; i<length; ++i) {
/* get the i-th code point and its combining class */
c=s[i];
cc=getCCFromCP(c);
if(cc!=0 && i!=0) {
/* it is a combining mark, see if it needs to be moved back */
j=i;
do {
prevCC=ccs[j-1];
if(prevCC<=cc) {
break; /* found the right place */
}
/* move the previous code point here and go back */
s[j]=s[j-1];
ccs[j]=prevCC;
} while(--j!=0);
s[j]=c;
ccs[j]=cc;
} else {
/* just store the combining class */
ccs[i]=cc;
}
}
return (uint16_t)(((uint16_t)ccs[0]<<8)|ccs[length-1]);
}
static UBool combineAndQC[64]={ 0 };
/*
* canonically reorder the up to two decompositions
* and store the leading and trailing combining classes accordingly
*/
static void
postParseFn(void *context, uint32_t code, Norm *norm) {
int32_t length;
/* canonically order the NFD */
length=norm->lenNFD;
if(length>0) {
norm->canonBothCCs=reorderString(norm->nfd, length);
}
/* canonically reorder the NFKD */
length=norm->lenNFKD;
if(length>0) {
norm->compatBothCCs=reorderString(norm->nfkd, length);
}
/* verify that code has a decomposition if and only if the quick check flags say "no" on NF(K)D */
if((norm->lenNFD!=0) != ((norm->qcFlags&_NORM_QC_NFD)!=0)) {
printf("U+%04lx has NFD[%d] but quick check 0x%02x\n", code, norm->lenNFD, norm->qcFlags);
}
if(((norm->lenNFD|norm->lenNFKD)!=0) != ((norm->qcFlags&(_NORM_QC_NFD|_NORM_QC_NFKD))!=0)) {
printf("U+%04lx has NFD[%d] NFKD[%d] but quick check 0x%02x\n", code, norm->lenNFD, norm->lenNFKD, norm->qcFlags);
}
/* ### see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */
combineAndQC[(norm->qcFlags&0x33)|((norm->combiningFlags&3)<<2)]=1;
if(norm->combiningFlags&1) {
if(norm->udataCC!=0) {
/* illegal - data-derivable composition exclusion */
printf("U+%04lx combines forward but udataCC==%u\n", code, norm->udataCC);
}
}
if(norm->combiningFlags&2) {
if((norm->qcFlags&0x11)==0) {
printf("U+%04lx combines backward but qcNF?C==0\n", code);
}
#if 0
/* occurs sometimes */
if(norm->udataCC==0) {
printf("U+%04lx combines backward but udataCC==0\n", code);
}
#endif
}
if((norm->combiningFlags&3)==3) {
printf("U+%04lx combines both ways\n", code);
}
}
/* ### debug */
static uint32_t countCCSame=0, countCCTrail=0, countCCTwo=0;
static uint32_t
make32BitNorm(Norm *norm) {
UChar extra[100];
uint32_t word;
int32_t i, length, beforeZero=0, count, start;
/* reset the 32-bit word and set the quick check flags */
word=norm->qcFlags;
/* set the UnicodeData combining class */
word|=(uint32_t)norm->udataCC<<_NORM_CC_SHIFT;
/* set the combining flag and index */
if(norm->combiningFlags&3) {
word|=(uint32_t)(norm->combiningFlags&3)<<6;
}
/* set the combining index value into the extra data */
if(norm->combiningIndex!=0) {
extra[0]=norm->combiningIndex;
beforeZero=1;
}
count=beforeZero;
/* write the decompositions */
if((norm->lenNFD|norm->lenNFKD)!=0) {
extra[count++]=0; /* set the pieces when available, into extra[beforeZero] */
length=norm->lenNFD;
if(length>0) {
if(norm->canonBothCCs!=0) {
extra[beforeZero]|=0x80;
extra[count++]=norm->canonBothCCs;
}
start=count;
for(i=0; i<length; ++i) {
UTF_APPEND_CHAR_UNSAFE(extra, count, norm->nfd[i]);
}
extra[beforeZero]|=(UChar)(count-start); /* set the decomp length as the number of UTF-16 code units */
}
length=norm->lenNFKD;
if(length>0) {
if(norm->compatBothCCs!=0) {
extra[beforeZero]|=0x8000;
extra[count++]=norm->compatBothCCs;
}
start=count;
for(i=0; i<length; ++i) {
UTF_APPEND_CHAR_UNSAFE(extra, count, norm->nfkd[i]);
}
extra[beforeZero]|=(UChar)((count-start)<<8); /* set the decomp length as the number of UTF-16 code units */
}
}
/* allocate and copy the extra data */
if(count!=0) {
UChar *p;
if(norm->specialTag!=0) {
fprintf(stderr, "error: gennorm - illegal to have both extra data and a special tag (0x%x)\n", norm->specialTag);
exit(U_ILLEGAL_ARGUMENT_ERROR);
}
p=(UChar *)utm_allocN(extraMem, count);
uprv_memcpy(p, extra, count*2);
/* set the extra index, offset by beforeZero */
word|=(uint32_t)(beforeZero+(p-(UChar *)utm_getStart(extraMem)))<<_NORM_EXTRA_SHIFT;
} else if(norm->specialTag!=0) {
/* set a special tag instead of an extra index */
word|=(uint32_t)norm->specialTag<<_NORM_EXTRA_SHIFT;
}
return word;
}
/* turn all Norm structs into corresponding 32-bit norm values */
static void
makeAll32() {
uint16_t i, count;
/*
* allocate and fill the table of 32-bit normalization data
* leave space for data for the up to 1024 lead surrogates
*/
norm32TableTop=(uint16_t)stage2Mem->index;
norm32Table=(uint32_t *)uprv_malloc((norm32TableTop+1024)*4);
if(norm32Table==NULL) {
fprintf(stderr, "error: gennorm - unable to allocate %ld 32-bit words for norm32Table\n",
norm32TableTop+1024);
exit(U_MEMORY_ALLOCATION_ERROR);
}
/* reset all entries */
uprv_memset(norm32Table, 0, (norm32TableTop+1024)*4);
count=0;
/* skip the first, all-empty block */
for(i=_NORM_STAGE_2_BLOCK_COUNT; i<norm32TableTop; ++i) {
if(stage2[i]!=0) {
if(0!=(norm32Table[i]=make32BitNorm(norms+stage2[i]))) {
++count;
}
}
}
printf("count of 16-bit extra data: %lu\n", extraMem->index);
printf("count of (uncompacted) non-zero 32-bit words: %lu\n", count);
printf("count CC frequencies: same %lu trail %lu two %lu\n", countCCSame, countCCTrail, countCCTwo);
}
/*
* extract all Norm.canonBothCCs into the FCD table
* set 32-bit values to use the common fold and compact functions
*/
static void
makeFCD() {
static uint16_t map[0x10000>>_NORM_TRIE_SHIFT];
Norm *norm;
uint32_t i, oredValues;
uint16_t bothCCs, delta;
/*
* allocate and fill the table of 32-bit normalization data
* leave space for data for the up to 1024 lead surrogates
*/
fcdTableTop=(uint16_t)stage2Mem->index;
fcdTable=(uint32_t *)uprv_malloc((fcdTableTop+1024)*4);
if(fcdTable==NULL) {
fprintf(stderr, "error: gennorm - unable to allocate %ld 32-bit words for fcdTable\n",
fcdTableTop+1024);
exit(U_MEMORY_ALLOCATION_ERROR);
}
/* reset all entries */
uprv_memset(fcdTable, 0, (fcdTableTop+1024)*4);
/* compact out the all-zero stage 2 blocks */
map[0]=0;
delta=0;
/* oredValues detects all-zero stage 2 blocks that will be removed from fcdStage1 */
oredValues=0;
/* skip the first, all-empty block */
for(i=_NORM_STAGE_2_BLOCK_COUNT; i<fcdTableTop; ++i) {
if(stage2[i]!=0) {
norm=norms+stage2[i];
bothCCs=norm->canonBothCCs;
if(bothCCs==0) {
/* if there are no decomposition cc's then use the udataCC twice */
bothCCs=norm->udataCC;
bothCCs|=bothCCs<<8;
}
oredValues|=fcdTable[i-delta]=bothCCs;
}
if((i&_NORM_STAGE_2_MASK)==_NORM_STAGE_2_MASK) {
/* at the end of a stage 2 block, check if there are any non-zero entries */
if(oredValues==0) {
/* all zero: skip this block */
delta+=_NORM_STAGE_2_BLOCK_COUNT;
map[i>>_NORM_TRIE_SHIFT]=(uint16_t)0;
} else {
/* keep this block */
map[i>>_NORM_TRIE_SHIFT]=(uint16_t)((i&~_NORM_STAGE_2_MASK)-delta);
oredValues=0;
}
}
}
/* now adjust stage 1 */
for(i=0; i<_NORM_STAGE_1_MAX_COUNT; ++i) {
fcdStage1[i]=map[fcdStage1[i]>>_NORM_TRIE_SHIFT];
}
printf("FCD: omitted %u stage 2 entries in all-zero blocks\n", delta);
/* adjust the table top */
fcdTableTop-=delta;
}
/*
* Fold the supplementary code point data for one lead surrogate.
*/
static uint16_t
foldLeadSurrogate(uint16_t *parent, uint16_t parentCount,
uint32_t *stage, uint16_t *pStageCount,
uint32_t base,
UBool isNorm32) {
uint32_t leadNorm32=0;
uint32_t i, j, s2;
uint32_t leadSurrogate=0xd7c0+(base>>10);
printf("supplementary data for lead surrogate U+%04lx\n", leadSurrogate);
/* calculate the 32-bit data word for the lead surrogate */
for(i=0; i<_NORM_SURROGATE_BLOCK_COUNT; ++i) {
s2=parent[(base>>_NORM_TRIE_SHIFT)+i];
if(s2!=0) {
for(j=0; j<_NORM_STAGE_2_BLOCK_COUNT; ++j) {
/* basically, or all 32-bit data into the one for the lead surrogate */
leadNorm32|=stage[s2+j];
}
}
}
if(isNorm32) {
/* turn multi-bit fields into the worst-case value */
if(leadNorm32&_NORM_CC_MASK) {
leadNorm32|=_NORM_CC_MASK;
}
/* clean up unnecessarily ored bit fields */
leadNorm32&=~((uint32_t)0xffffffff<<_NORM_EXTRA_SHIFT);
if(leadNorm32==0) {
/* nothing to do (only composition exclusions?) */
return 0;
}
/* add the extra surrogate index, offset by the BMP top, for the new stage 1 location */
leadNorm32|=(
(uint32_t)_NORM_EXTRA_INDEX_TOP+
(uint32_t)((parentCount-_NORM_STAGE_1_BMP_COUNT)>>_NORM_SURROGATE_BLOCK_BITS)
)<<_NORM_EXTRA_SHIFT;
} else {
if(leadNorm32==0) {
/* FCD: nothing to do */
return 0;
}
/*
* For FCD, replace the entire combined value by the surrogate index
* and make sure that it is not 0 (by not offsetting it by the BMP top,
* since here we have enough bits for this);
* lead surrogates are tested at runtime on the character code itself
* instead on special values of the trie data -
* this is because 16 bits in the FCD trie data do not allow for anything
* but the two leading and trailing combining classes of the canonical decomposition.
*/
leadNorm32=parentCount>>_NORM_SURROGATE_BLOCK_BITS;
}
/* enter the lead surrogate's data */
s2=parent[leadSurrogate>>_NORM_TRIE_SHIFT];
if(s2==0) {
/* allocate a new stage 2 block in stage (the memory is there from makeAll32()/makeFCD()) */
s2=parent[leadSurrogate>>_NORM_TRIE_SHIFT]=*pStageCount;
*pStageCount+=_NORM_STAGE_2_BLOCK_COUNT;
}
stage[s2+(leadSurrogate&_NORM_STAGE_2_MASK)]=leadNorm32;
/* move the actual stage 1 indexes from the supplementary position to the new one */
uprv_memmove(parent+parentCount, parent+(base>>_NORM_TRIE_SHIFT), _NORM_SURROGATE_BLOCK_COUNT*2);
/* increment stage 1 top */
return _NORM_SURROGATE_BLOCK_COUNT;
}
/*
* Fold the normalization data for supplementary code points into
* a compact area on top of the BMP-part of the trie index,
* with the lead surrogates indexing this compact area.
*
* Use after makeAll32().
*/
static uint16_t
foldSupplementary(uint16_t *parent, uint16_t parentCount,
uint32_t *stage, uint16_t *pStageCount,
UBool isNorm32) {
uint32_t c;
uint16_t i;
/* search for any stage 1 entries for supplementary code points */
for(c=0x10000; c<0x110000;) {
i=parent[c>>_NORM_TRIE_SHIFT];
if(i!=0) {
/* there is data, treat the full block for a lead surrogate */
c&=~0x3ff;
parentCount+=foldLeadSurrogate(parent, parentCount, stage, pStageCount, c, isNorm32);
c+=0x400;
} else {
c+=_NORM_STAGE_2_BLOCK_COUNT;
}
}
printf("trie index count: BMP %u all Unicode %lu folded %u\n",
_NORM_STAGE_1_BMP_COUNT, _NORM_STAGE_1_MAX_COUNT, parentCount);
return parentCount;
}
static uint16_t
compact(uint16_t *parent, uint16_t parentCount,
uint32_t *stage, uint16_t stageCount) {
/*
* This function is the common implementation for compacting
* the stage 2 tables of 32-bit values.
* It is a copy of genprops/store.c's compactStage() adapted for the 32-bit stage 2 tables.
*/
static uint16_t map[0x10000>>_NORM_TRIE_SHIFT];
uint32_t x;
uint16_t i, start, prevEnd, newStart;
map[0]=0;
newStart=_NORM_STAGE_2_BLOCK_COUNT;
for(start=newStart; start<stageCount;) {
prevEnd=(uint16_t)(newStart-1);
x=stage[start];
if(x==stage[prevEnd]) {
/* overlap by at least one */
for(i=1; i<_NORM_STAGE_2_BLOCK_COUNT && x==stage[start+i] && x==stage[prevEnd-i]; ++i) {}
/* overlap by i */
map[start>>_NORM_TRIE_SHIFT]=(uint16_t)(newStart-i);
/* move the non-overlapping indexes to their new positions */
start+=i;
for(i=(uint16_t)(_NORM_STAGE_2_BLOCK_COUNT-i); i>0; --i) {
stage[newStart++]=stage[start++];
}
} else if(newStart<start) {
/* move the indexes to their new positions */
map[start>>_NORM_TRIE_SHIFT]=newStart;
for(i=_NORM_STAGE_2_BLOCK_COUNT; i>0; --i) {
stage[newStart++]=stage[start++];
}
} else /* no overlap && newStart==start */ {
map[start>>_NORM_TRIE_SHIFT]=start;
newStart+=_NORM_STAGE_2_BLOCK_COUNT;
start=newStart;
}
}
/* now adjust the parent table */
for(i=0; i<parentCount; ++i) {
parent[i]=map[parent[i]>>_NORM_TRIE_SHIFT];
}
/* we saved some space */
printf("compacting trie: count of 32-bit words %lu->%lu\n", stageCount, newStart);
return newStart;
}
extern void
processData() {
#if 0
uint16_t i;
#endif
processCombining();
/* canonically reorder decompositions and assign combining classes for decompositions */
enumTrie(postParseFn, NULL);
#if 0
for(i=1; i<64; ++i) {
if(combineAndQC[i]) {
printf("combiningFlags==0x%02x qcFlags(NF?C)==0x%02x\n", (i&0xc)>>2, i&0x33);
}
}
#endif
/* add hangul/jamo specials */
setHangulJamoSpecials();
/* copy stage 1 for the FCD trie */
uprv_memcpy(fcdStage1, stage1, sizeof(stage1));
/* --- finalize data for quick checks & normalization: stage1/norm32Table --- */
/* turn the Norm structs (stage2, norms) into 32-bit data words (norm32Table) */
makeAll32();
/* fold supplementary code points into lead surrogates */
stage1Top=foldSupplementary(stage1, _NORM_STAGE_1_BMP_COUNT, norm32Table, &norm32TableTop, TRUE);
/* compact stage 2 */
norm32TableTop=compact(stage1, stage1Top, norm32Table, norm32TableTop);
/* --- finalize data for FCD checks: fcdStage1/fcdTable --- */
/* FCD data: take Norm.canonBothCCs and store them in the FCD table */
makeFCD();
/* FCD: fold supplementary code points into lead surrogates */
fcdStage1Top=foldSupplementary(fcdStage1, _NORM_STAGE_1_BMP_COUNT, fcdTable, &fcdTableTop, FALSE);
/* FCD: compact stage 2 */
fcdTableTop=compact(fcdStage1, fcdStage1Top, fcdTable, fcdTableTop);
/* ### debug output */
#if 0
printf("number of stage 2 entries: %ld\n", stage2Mem->index);
printf("size of stage 1 (BMP) & 2 (uncompacted) + extra data: %ld bytes\n", _NORM_STAGE_1_BMP_COUNT*2+stage2Mem->index*4+extraMem->index*2);
#endif
printf("combining CPs tops: fwd %u both %u back %u\n", combineFwdTop, combineBothTop, combineBackTop);
printf("combining table count: %u\n", combiningTableTop);
}
extern void
generateData(const char *dataDir) {
UNewDataMemory *pData;
uint16_t *p16;
UErrorCode errorCode=U_ZERO_ERROR;
uint32_t size, dataLength;
uint16_t i;
size=
_NORM_INDEX_TOP*2+
stage1Top*2+
norm32TableTop*4+
extraMem->index*2+
combiningTableTop*2+
fcdStage1Top*2+
fcdTableTop*2;
printf("size of " DATA_NAME "." DATA_TYPE " contents: %lu bytes\n", size);
indexes[_NORM_INDEX_COUNT]=_NORM_INDEX_TOP;
indexes[_NORM_INDEX_TRIE_SHIFT]=_NORM_TRIE_SHIFT;
indexes[_NORM_INDEX_TRIE_INDEX_COUNT]=stage1Top;
indexes[_NORM_INDEX_TRIE_DATA_COUNT]=norm32TableTop;
indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)extraMem->index;
indexes[_NORM_INDEX_COMBINE_DATA_COUNT]=combiningTableTop;
indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop;
indexes[_NORM_INDEX_COMBINE_BOTH_COUNT]=(uint16_t)(combineBothTop-combineFwdTop);
indexes[_NORM_INDEX_COMBINE_BACK_COUNT]=(uint16_t)(combineBackTop-combineBothTop);
indexes[_NORM_INDEX_FCD_TRIE_INDEX_COUNT]=fcdStage1Top;
indexes[_NORM_INDEX_FCD_TRIE_DATA_COUNT]=fcdTableTop;
/* adjust the stage 1 indexes to offset stage 2 from the beginning of stage 1 */
/* stage1/norm32Table */
for(i=0; i<stage1Top; ++i) {
stage1[i]+=stage1Top/2; /* stage 2 is 32-bit indexed */
}
/* fcdStage1/fcdTable */
for(i=0; i<fcdStage1Top; ++i) {
fcdStage1[i]+=fcdStage1Top; /* FCD stage 2 is 16-bit indexed */
}
/* reduce the contents of fcdTable from 32-bit values to 16-bit values, in-place (destructive!) */
p16=(uint16_t *)fcdTable;
for(i=0; i<fcdTableTop; ++i) {
p16[i]=(uint16_t)fcdTable[i];
}
/* write the data */
pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "gennorm: unable to create the output file, error %d\n", errorCode);
exit(errorCode);
}
udata_writeBlock(pData, indexes, sizeof(indexes));
udata_writeBlock(pData, stage1, stage1Top*2);
udata_writeBlock(pData, norm32Table, norm32TableTop*4);
udata_writeBlock(pData, utm_getStart(extraMem), extraMem->index*2);
udata_writeBlock(pData, combiningTable, combiningTableTop*2);
udata_writeBlock(pData, fcdStage1, fcdStage1Top*2);
udata_writeBlock(pData, fcdTable, fcdTableTop*2);
/* finish up */
dataLength=udata_finish(pData, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "gennorm: error %d writing the output file\n", errorCode);
exit(errorCode);
}
if(dataLength!=size) {
fprintf(stderr, "gennorm: data length %lu != calculated size %lu\n",
dataLength, size);
exit(U_INTERNAL_PROGRAM_ERROR);
}
}
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/